View Javadoc
1   /*
2    * ObjectLab, http://www.objectlab.co.uk/open is supporting FlatPack.
3    *
4    * Based in London, we are world leaders in the design and development
5    * of bespoke applications for the securities financing markets.
6    *
7    * <a href="http://www.objectlab.co.uk/open">Click here to learn more</a>
8    *           ___  _     _           _   _          _
9    *          / _ \| |__ (_) ___  ___| |_| |    __ _| |__
10   *         | | | | '_ \| |/ _ \/ __| __| |   / _` | '_ \
11   *         | |_| | |_) | |  __/ (__| |_| |__| (_| | |_) |
12   *          \___/|_.__// |\___|\___|\__|_____\__,_|_.__/
13   *                   |__/
14   *
15   *                     www.ObjectLab.co.uk
16   *
17   * $Id: ColorProvider.java 74 2006-10-24 22:19:05Z benoitx $
18   *
19   * Copyright 2006 the original author or authors.
20   *
21   * Licensed under the Apache License, Version 2.0 (the "License"); you may not
22   * use this file except in compliance with the License. You may obtain a copy of
23   * the License at
24   *
25   * http://www.apache.org/licenses/LICENSE-2.0
26   *
27   * Unless required by applicable law or agreed to in writing, software
28   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
29   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
30   * License for the specific language governing permissions and limitations under
31   * the License.
32   */
33  package net.sf.flatpack;
34  
35  import java.io.BufferedReader;
36  import java.io.IOException;
37  import java.io.Reader;
38  import java.util.List;
39  
40  import org.slf4j.Logger;
41  import org.slf4j.LoggerFactory;
42  
43  import net.sf.flatpack.structure.ColumnMetaData;
44  import net.sf.flatpack.structure.Row;
45  import net.sf.flatpack.util.FPConstants;
46  import net.sf.flatpack.util.ParserUtils;
47  
48  /**
49   * @author Benoit Xhenseval
50   * @author Paul Zepernick
51   *
52   */
53  public abstract class AbstractDelimiterParser extends AbstractParser {
54      private static final Logger LOGGER = LoggerFactory.getLogger(AbstractDelimiterParser.class);
55      private static final String LINE_BREAK = System.lineSeparator();
56  
57      private char delimiter = 0;
58      private char qualifier = 0;
59      private boolean ignoreFirstRecord = false;
60  
61      private int lineCount = 0;
62  
63      public AbstractDelimiterParser(final Reader dataSourceReader, final String dataDefinition, final char delimiter, final char qualifier,
64              final boolean ignoreFirstRecord) {
65          super(dataSourceReader, dataDefinition);
66          this.delimiter = delimiter;
67          this.qualifier = qualifier;
68          this.ignoreFirstRecord = ignoreFirstRecord;
69      }
70  
71      public AbstractDelimiterParser(final Reader dataSourceReader, final char delimiter, final char qualifier, final boolean ignoreFirstRecord) {
72          super(dataSourceReader);
73          this.delimiter = delimiter;
74          this.qualifier = qualifier;
75          this.ignoreFirstRecord = ignoreFirstRecord;
76      }
77  
78      @Override
79      protected DataSet doParse() {
80          try {
81              lineCount = 0;
82              return doDelimitedFile(getDataSourceReader(), shouldCreateMDFromFile());
83          } catch (final IOException e) {
84              LOGGER.error("error accessing/creating inputstream", e);
85          }
86          return null;
87      }
88  
89      protected abstract boolean shouldCreateMDFromFile();
90  
91      protected char getDelimiter() {
92          return delimiter;
93      }
94  
95      protected void setDelimiter(final char delimiter) {
96          this.delimiter = delimiter;
97      }
98  
99      protected boolean isIgnoreFirstRecord() {
100         return ignoreFirstRecord;
101     }
102 
103     protected void setIgnoreFirstRecord(final boolean ignoreFirstRecord) {
104         this.ignoreFirstRecord = ignoreFirstRecord;
105     }
106 
107     protected char getQualifier() {
108         return qualifier;
109     }
110 
111     protected void setQualifier(final char qualifier) {
112         this.qualifier = qualifier;
113     }
114 
115     protected int getLineCount() {
116         return lineCount;
117     }
118 
119     /*
120      * This is the new version of doDelimitedFile using InputStream instead of
121      * File. This is more flexible especially it is working with WebStart.
122      *
123      * puts together the dataset for a DELIMITED file. This is used for PZ XML
124      * mappings, and SQL table mappings
125      */
126     private DataSet doDelimitedFile(final Reader dataSource, final boolean createMDFromFile) throws IOException {
127         if (dataSource == null) {
128             throw new IllegalArgumentException("dataSource is null");
129         }
130         final DefaultDataSet ds = new DefaultDataSet(getPzMetaData(), this);
131         try (BufferedReader br = new BufferedReader(dataSource)) {
132             // gather the conversion properties
133             ds.setPZConvertProps(ParserUtils.loadConvertProperties());
134 
135             boolean processedFirst = false;
136             /** loop through each line in the file */
137             String line = null;
138             int estimatedColCount = FPConstants.SPLITLINE_SIZE_INIT;
139             while ((line = fetchNextRecord(br, getQualifier(), getDelimiter())) != null) {
140                 // check to see if the user has elected to skip the first record
141                 if (!processedFirst && isIgnoreFirstRecord()) {
142                     processedFirst = true;
143                     continue;
144                 } else if (!processedFirst && createMDFromFile) {
145                     processedFirst = true;
146                     setPzMetaData(ParserUtils.getPZMetaDataFromFile(line, delimiter, qualifier, this, isAddSuffixToDuplicateColumnNames()));
147                     ds.setMetaData(getPzMetaData());
148                     continue;
149                 }
150                 // column values
151 
152                 // check number of Qualifier, if ODD number --> Incorrect!!!
153                 if (oddNumberOfQualifier(line, getQualifier())) {
154                     addError(ds, "Odd number of Qualifier characters", lineCount, 1, isStoreRawDataToDataError() ? line : null);
155                     continue;
156                 }
157 
158                 List<String> columns = ParserUtils.splitLine(line, getDelimiter(), getQualifier(), estimatedColCount, isPreserveLeadingWhitespace(),
159                         isPreserveTrailingWhitespace());
160                 final String mdkey = ParserUtils.getCMDKeyForDelimitedFile(getPzMetaData(), columns);
161                 final List<ColumnMetaData> metaData = ParserUtils.getColumnMetaData(mdkey, getPzMetaData());
162                 final int columnCount = metaData.size();
163                 estimatedColCount = columnCount;
164 
165                 if (columns.size() > columnCount) {
166                     // Incorrect record length on line log the error. Line
167                     // will not be included in the dataset log the error
168                     if (isIgnoreExtraColumns()) {
169                         // user has chosen to ignore the fact that we have too many columns in the data from
170                         // what the mapping has described. sublist the array to remove un-needed columns
171                         columns = columns.subList(0, columnCount);
172                         addError(ds, "Flatpack truncated line to correct number of columns", lineCount, 1, isStoreRawDataToDataError() ? line : null);
173                     } else {
174                         addError(ds, "Too many columns expected: " + columnCount + " Flatpack got: " + columns.size(), lineCount, 2,
175                                 isStoreRawDataToDataError() ? line : null);
176                         continue;
177                     }
178                 } else if (columns.size() < columnCount) {
179                     if (isHandlingShortLines()) {
180                         // We can pad this line out
181                         while (columns.size() < columnCount) {
182                             columns.add("");
183                         }
184 
185                         // log a warning
186                         addError(ds, "Flatpack padded line to correct number of columns", lineCount, 1, isStoreRawDataToDataError() ? line : null);
187 
188                     } else {
189                         addError(ds, "Too few columns expected: " + columnCount + " only got: " + columns.size(), lineCount, 2,
190                                 isStoreRawDataToDataError() ? line : null);
191                         continue;
192                     }
193                 }
194 
195                 final Row row = new Row();
196                 row.setMdkey(mdkey.equals(FPConstants.DETAIL_ID) ? null : mdkey); // try
197                 // to limit the memory use
198                 row.setCols(columns);
199                 row.setRowNumber(lineCount);
200                 if (isFlagEmptyRows()) {
201                     // user has elected to have the parser flag rows that are empty
202                     row.setEmpty(ParserUtils.isListElementsEmpty(columns));
203                 }
204                 if (isStoreRawDataToDataSet()) {
205                     // user told the parser to keep a copy of the raw data in the row
206                     // WARNING potential for high memory usage here
207                     row.setRawData(line);
208                 }
209 
210                 // add the row to the array
211                 ds.addRow(row);
212 
213             }
214         } finally {
215             closeReaders();
216         }
217         return ds;
218     }
219 
220     private boolean oddNumberOfQualifier(final String line, final char q) {
221         if (line == null || line.isEmpty()) {
222             return false;
223         }
224         int count = 0;
225         int idx = 0;
226         while ((idx = line.indexOf(q, idx)) != -1) {
227             count++;
228             idx++;
229         }
230 
231         return count % 2 != 0;
232     }
233 
234     /**
235      * Reads a record from a delimited file.  This will account for records which
236      * could span multiple lines.
237      * NULL will be returned when the end of the file is reached
238      *
239      * @param aContentReader
240      *          Open reader being used to read through the file
241      * @param aQualifier
242      *          Qualifier being used for parse
243      * @param aDelimiter
244      *          Delimiter being used for parse
245      * @return String
246      *          Record from delimited file
247      * @throws IOException if any problem with the stream of data (e.g. file reader)
248      *
249      * Improved version of line fetching that solves some of the issues of flatpack parser.
250      */
251     protected String fetchNextRecord(final BufferedReader aContentReader, final char aQualifier, final char aDelimiter) throws IOException {
252         if (aQualifier == FPConstants.NO_QUALIFIER) {
253             // no qualifier defined, then there can't be line breaks in the line
254             return aContentReader.readLine();
255         }
256 
257         StringBuilder lineData = null;
258         String line = null;
259         boolean multiline = false;
260 
261         // consuming lines until we find end of the data row
262         while ((line = aContentReader.readLine()) != null) {
263             if (lineData == null) {
264                 lineData = new StringBuilder(line);
265             } else {
266                 lineData.append(LINE_BREAK).append(line);
267             }
268 
269             multiline = isMultiline(line.toCharArray(), multiline, aQualifier, aDelimiter);
270             if (!multiline) {
271                 // data row ended
272                 break;
273             }
274         }
275 
276         if (lineData != null) {
277             lineCount++;
278 
279             final String result = lineData.toString();
280             // no line break character at the end of data row
281             return result.endsWith(LINE_BREAK) ? result.substring(0, result.length() - LINE_BREAK.length()) : result;
282         }
283 
284         return null;
285     }
286 
287     /**
288      * Checks if we need to consume one more line because data row was splitted to multiple lines.
289      * @param aСhrArray
290      * @param aMultiline
291      * @param aQualifier
292      * @param aDelimiter
293      * @return
294      */
295     protected boolean isMultiline(final char[] aСhrArray, boolean aMultiline, final char aQualifier, final char aDelimiter) {
296         // do not trim the line, according to rfc4180:
297         // Spaces are considered part of a field and should not be ignored
298         int position = 0;
299 
300         if (aСhrArray == null || aСhrArray.length == 0) {
301             return aMultiline;
302         }
303         do {
304             // field processing here
305             if (!aMultiline && aСhrArray[position] == aDelimiter) {
306                 // empty field
307                 position++;
308             } else if (!aMultiline && aСhrArray[position] != aQualifier) {
309                 // if the first char of the line is NOT a qualifier, then the field should not
310                 // contain CRLF, double quotes, and commas
311                 // therefore find the end of the field by looking for the first delimiter
312 
313                 while (++position < aСhrArray.length) {
314                     if (aСhrArray[position] == aDelimiter) {
315                         position++;
316                         break;
317                     }
318                 }
319 
320                 if (position >= aСhrArray.length) {
321                     // end of the line without any delimiters so it's safe to say its the end of the line
322                     // and not multiline
323                     return false;
324                 }
325             } else {
326                 // the first char is a qualifier, the field may contain CRLF, double quotes, and commas
327                 // double quotes must be escaped with a double quote (i.e. "some ""data"" here").
328                 // newline won't be present in the line because it's removed by the reader during
329                 // readLine() call. so look for dangling "
330 
331                 if (aMultiline && position == 0 && aСhrArray[0] == aQualifier && aСhrArray.length > 1 && aСhrArray[1] == aQualifier) {
332                     position++;
333                 } else if (aMultiline && position == 0 && aСhrArray[0] == aQualifier && aСhrArray.length > 1 && aСhrArray[1] != aQualifier) {
334                     // if we start the new line with the qualifier AND we are already in multiline we therefore have
335                     // found the end of the multiline
336                     aMultiline = false;
337                 } else {
338                     aMultiline = true;
339                 }
340 
341                 if (aСhrArray[position] == aQualifier) {
342                     // if we have just now found a qualifier we need to move cursor to the next char
343                     position++;
344                 }
345 
346                 // looking for the end of the text field
347                 while (position < aСhrArray.length) {
348                     if (aСhrArray[position] == aQualifier) {
349                         if (position == aСhrArray.length - 1 || aСhrArray[position + 1] != aQualifier) {
350                             // end of text found
351                             position++;
352                             aMultiline = false;
353                             break;
354                         } else {
355                             // skipping escaped qualified like ""
356                             position += 2;
357                         }
358                     } else {
359                         position++;
360                     }
361                 }
362             }
363         } while (position <= aСhrArray.length - 1);
364 
365         return aMultiline;
366     }
367 }