View Javadoc
1   package com.opencsv;
2   
3   /*
4    Copyright 2005 Bytecode Pty Ltd.
5   
6    Licensed under the Apache License, Version 2.0 (the "License");
7    you may not use this file except in compliance with the License.
8    You may obtain a copy of the License at
9   
10   http://www.apache.org/licenses/LICENSE-2.0
11  
12   Unless required by applicable law or agreed to in writing, software
13   distributed under the License is distributed on an "AS IS" BASIS,
14   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   See the License for the specific language governing permissions and
16   limitations under the License.
17   */
18  
19  import com.opencsv.enums.CSVReaderNullFieldIndicator;
20  import org.apache.commons.lang3.ArrayUtils;
21  import org.apache.commons.lang3.ObjectUtils;
22  import org.apache.commons.lang3.StringUtils;
23  
24  import java.io.IOException;
25  import java.util.ArrayList;
26  import java.util.List;
27  import java.util.Locale;
28  import java.util.ResourceBundle;
29  
30  /**
31   * <p>A very simple CSV parser released under a commercial-friendly license.
32   * This just implements splitting a single line into fields.</p>
33   *
34   * <p>The purpose of the CSVParser is to take a single string and parse it into
35   * its elements based on the delimiter, quote and escape characters.</p>
36   *
37   * <p>The CSVParser has grown organically based on user requests and does not truly match
38   * any current requirements (though it can be configured to match or come close).  There
39   * are no plans to change this as it will break existing requirements.  Consider using
40   * the RFC4180Parser for less configurability but closer match to the RFC4180 requirements.</p>
41   *
42   * @author Glen Smith
43   * @author Rainer Pruy
44   */
45  public class CSVParser extends AbstractCSVParser {
46  
47      private static final int BEGINNING_OF_LINE = 3;
48      /**
49       * This is the character that the CSVParser will treat as the escape character.
50       */
51      private final char escape;
52  
53      /**
54       * String of escape character - optimization for replaceAll
55       */
56      private final String escapeAsString;
57  
58      /**
59       * String escapeAsString+escapeAsString - optimization for replaceAll
60       */
61      private final String escapeDoubleAsString;
62  
63      /**
64       * Determines if the field is between quotes (true) or between separators (false).
65       */
66      private final boolean strictQuotes;
67      /**
68       * Ignore any leading white space at the start of the field.
69       */
70      private final boolean ignoreLeadingWhiteSpace;
71      /**
72       * Skip over quotation characters when parsing.
73       */
74      private final boolean ignoreQuotations;
75      private int tokensOnLastCompleteLine = -1;
76      private boolean inField = false;
77      
78      /** Locale for all translations. */
79      private Locale errorLocale;
80  
/**
 * Constructs a CSVParser in which every setting takes its default value
 * and error messages use the JVM's default locale.
 */
public CSVParser() {
    this(
            DEFAULT_SEPARATOR,
            DEFAULT_QUOTE_CHARACTER,
            DEFAULT_ESCAPE_CHARACTER,
            DEFAULT_STRICT_QUOTES,
            DEFAULT_IGNORE_LEADING_WHITESPACE,
            DEFAULT_IGNORE_QUOTATIONS,
            DEFAULT_NULL_FIELD_INDICATOR,
            Locale.getDefault());
}
91      
92      /**
93       * Constructs CSVParser.
94       * <p>This constructor sets all necessary parameters for CSVParser, and
95       * intentionally has package access so only the builder can use it.</p>
96       * 
97       * @param separator               The delimiter to use for separating entries
98       * @param quotechar               The character to use for quoted elements
99       * @param escape                  The character to use for escaping a separator or quote
100      * @param strictQuotes            If true, characters outside the quotes are ignored
101      * @param ignoreLeadingWhiteSpace If true, white space in front of a quote in a field is ignored
102      * @param ignoreQuotations        If true, treat quotations like any other character.
103      * @param nullFieldIndicator      Which field content will be returned as null: EMPTY_SEPARATORS, EMPTY_QUOTES,
104      *                                BOTH, NEITHER (default)
105      * @param errorLocale             Locale for error messages.
106      */
107     CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace,
108               boolean ignoreQuotations, CSVReaderNullFieldIndicator nullFieldIndicator, Locale errorLocale) {
109         super(separator, quotechar, nullFieldIndicator);
110         this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
111         if (anyCharactersAreTheSame(separator, quotechar, escape)) {
112             throw new UnsupportedOperationException(ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, this.errorLocale).getString("special.characters.must.differ"));
113         }
114         if (separator == NULL_CHARACTER) {
115             throw new UnsupportedOperationException(ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, this.errorLocale).getString("define.separator"));
116         }
117         this.escape = escape;
118         this.escapeAsString = Character.toString(escape);
119         this.escapeDoubleAsString = escapeAsString + escapeAsString;
120         this.strictQuotes = strictQuotes;
121         this.ignoreLeadingWhiteSpace = ignoreLeadingWhiteSpace;
122         this.ignoreQuotations = ignoreQuotations;
123     }
124 
125     /**
126      * @return The default escape character for this parser.
127      */
128     public char getEscape() {
129         return escape;
130     }
131     
132     /**
133      * @return The default strictQuotes setting for this parser.
134      */
135     public boolean isStrictQuotes() {
136         return strictQuotes;
137     }
138 
139     /**
140      * @return The default ignoreLeadingWhiteSpace setting for this parser.
141      */
142     public boolean isIgnoreLeadingWhiteSpace() {
143         return ignoreLeadingWhiteSpace;
144     }
145 
146     /**
147      * @return The default ignoreQuotation setting for this parser.
148      */
149     public boolean isIgnoreQuotations() {
150         return ignoreQuotations;
151     }
152 
153     /**
154      * Checks to see if any two of the three characters are the same.
155      * This is because in opencsv the separator, quote, and escape characters
156      * must the different.
157      *
158      * @param separator The defined separator character
159      * @param quotechar The defined quotation cahracter
160      * @param escape    The defined escape character
161      * @return True if any two of the three are the same.
162      */
163     private boolean anyCharactersAreTheSame(char separator, char quotechar, char escape) {
164         return isSameCharacter(separator, quotechar) || isSameCharacter(separator, escape) || isSameCharacter(quotechar, escape);
165     }
166 
167     /**
168      * Checks that the two characters are the same and are not the defined NULL_CHARACTER.
169      * @param c1 First character
170      * @param c2 Second character
171      * @return True if both characters are the same and are not the defined NULL_CHARACTER
172      */
173     private boolean isSameCharacter(char c1, char c2) {
174         return c1 != NULL_CHARACTER && c1 == c2;
175     }
176 
177     @Override
178     protected String convertToCsvValue(String value, boolean applyQuotestoAll) {
179         String testValue = (value == null && !nullFieldIndicator.equals(CSVReaderNullFieldIndicator.NEITHER)) ? "" : value;
180         StringBuilder builder = new StringBuilder(testValue == null ? MAX_SIZE_FOR_EMPTY_FIELD : (testValue.length() * 2));
181         boolean containsQuoteChar = StringUtils.contains(testValue, getQuotechar());
182         boolean containsEscapeChar = StringUtils.contains(testValue, getEscape());
183         boolean containsSeparatorChar = StringUtils.contains(testValue, getSeparator());
184         boolean surroundWithQuotes = applyQuotestoAll || isSurroundWithQuotes(value, containsSeparatorChar);
185 
186         String convertedString = !containsQuoteChar ? testValue : quoteMatcherPattern.matcher(testValue).replaceAll(quoteDoubledAsString);
187         convertedString = !containsEscapeChar ? convertedString : convertedString.replace(escapeAsString, escapeDoubleAsString);
188 
189         if (surroundWithQuotes) {
190             builder.append(getQuotechar());
191         }
192 
193         builder.append(convertedString);
194 
195         if (surroundWithQuotes) {
196             builder.append(getQuotechar());
197         }
198 
199         return builder.toString();
200     }
201 
202     @Override
203     protected String[] parseLine(String nextLine, boolean multi) throws IOException {
204 
205         if (!multi && pending != null) {
206             pending = null;
207         }
208 
209         if (nextLine == null) {
210             if (pending != null) {
211                 String s = pending;
212                 pending = null;
213                 return new String[]{s};
214             }
215             return null;
216         }
217         final List<String> tokensOnThisLine = tokensOnLastCompleteLine <= 0 ? new ArrayList<>() : new ArrayList<>((tokensOnLastCompleteLine + 1) * 2);
218         final StringFragmentCopier sfc = new StringFragmentCopier(nextLine);
219         boolean inQuotes = false;
220         boolean fromQuotedField = false;
221         if (pending != null) {
222             sfc.append(pending);
223             pending = null;
224             inQuotes = !this.ignoreQuotations;
225         }
226 
227         while (!sfc.isEmptyInput()) {
228             final char c = sfc.takeInput();
229             if (c == this.escape) {
230                 if (!strictQuotes) {
231                     inField = true; // For the unusual case of escaping the first character
232                 }
233                 handleEscapeCharacter(nextLine, sfc, inQuotes);
234             } else if (c == quotechar) {
235                 if (isNextCharacterEscapedQuote(nextLine, inQuotes(inQuotes), sfc.i - 1)) {
236                     sfc.takeInput();
237                     sfc.appendPrev();
238                 } else {
239 
240                     inQuotes = !inQuotes;
241                     if (sfc.isEmptyOutput()) {
242                         fromQuotedField = true;
243                     }
244 
245                     // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
246                     handleQuoteCharButNotStrictQuotes(nextLine, sfc);
247                 }
248                 inField = !inField;
249             } else if (c == separator && !(inQuotes && !ignoreQuotations)) {
250                 tokensOnThisLine.add(convertEmptyToNullIfNeeded(sfc.takeOutput(), fromQuotedField));
251                 fromQuotedField = false;
252                 inField = false;
253             } else {
254                 if (!strictQuotes || (inQuotes && !ignoreQuotations)) {
255                     sfc.appendPrev();
256                     inField = true;
257                     fromQuotedField = true;
258                 }
259             }
260 
261         }
262         // line is done - check status
263         line_done: {
264             if (inQuotes && !ignoreQuotations) {
265                 if (multi) {
266                     // continuing a quoted section, re-append newline
267                     sfc.append('\n');
268                     pending = sfc.peekOutput();
269                     break line_done; // this partial content is not to be added to field list yet
270                 } else {
271                     throw new IOException(String.format(
272                             ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, errorLocale).getString("unterminated.quote"),
273                             sfc.peekOutput()));
274                 }
275             } else {
276                 inField = false;
277             }
278 
279             tokensOnThisLine.add(convertEmptyToNullIfNeeded(sfc.takeOutput(), fromQuotedField));
280         }
281 
282         tokensOnLastCompleteLine = tokensOnThisLine.size();
283         return tokensOnThisLine.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
284 
285     }
286 
287     private void handleQuoteCharButNotStrictQuotes(String nextLine, StringFragmentCopier sfc) {
288         if (!strictQuotes) {
289             final int i = sfc.i;
290             if (i > BEGINNING_OF_LINE //not on the beginning of the line
291                     && nextLine.charAt(i - 2) != this.separator //not at the beginning of an escape sequence
292                     && nextLine.length() > (i) &&
293                     nextLine.charAt(i) != this.separator //not at the	end of an escape sequence
294             ) {
295 
296                 if (ignoreLeadingWhiteSpace && !sfc.isEmptyOutput() && StringUtils.isWhitespace(sfc.peekOutput())) {
297                     sfc.clearOutput();
298                 } else {
299                     sfc.appendPrev();
300                 }
301             }
302         }
303     }
304 
305     private void handleEscapeCharacter(String nextLine, StringFragmentCopier sfc, boolean inQuotes) {
306         if (isNextCharacterEscapable(nextLine, inQuotes(inQuotes), sfc.i - 1)) {
307             sfc.takeInput();
308             sfc.appendPrev();
309         }
310     }
311 
312     private String convertEmptyToNullIfNeeded(String s, boolean fromQuotedField) {
313         if (s.isEmpty() && shouldConvertEmptyToNull(fromQuotedField)) {
314             return null;
315         }
316         return s;
317     }
318 
319     private boolean shouldConvertEmptyToNull(boolean fromQuotedField) {
320         switch (nullFieldIndicator) {
321             case BOTH:
322                 return true;
323             case EMPTY_SEPARATORS:
324                 return !fromQuotedField;
325             case EMPTY_QUOTES:
326                 return fromQuotedField;
327             default:
328                 return false;
329         }
330     }
331 
332     /**
333      * Determines if we can process as if we were in quotes.
334      *
335      * @param inQuotes Are we currently in quotes?
336      * @return True if we should process as if we are inside quotes.
337      */
338     private boolean inQuotes(boolean inQuotes) {
339         return (inQuotes && !ignoreQuotations) || inField;
340     }
341 
342     /**
343      * Checks to see if the character after the index is a quotation character.
344      *
345      * Precondition: the current character is a quote or an escape.
346      *
347      * @param nextLine The current line
348      * @param inQuotes True if the current context is quoted
349      * @param i        Current index in line
350      * @return True if the following character is a quote
351      */
352     private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
353         return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
354                 && nextLine.length() > (i + 1)  // there is indeed another character to check.
355                 && isCharacterQuoteCharacter(nextLine.charAt(i + 1));
356     }
357 
358     /**
359      * Checks to see if the passed in character is the defined quotation character.
360      *
361      * @param c Source character
362      * @return True if c is the defined quotation character
363      */
364     private boolean isCharacterQuoteCharacter(char c) {
365         return c == quotechar;
366     }
367 
368     /**
369      * Checks to see if the character is the defined escape character.
370      *
371      * @param c Source character
372      * @return True if the character is the defined escape character
373      */
374     private boolean isCharacterEscapeCharacter(char c) {
375         return c == escape;
376     }
377 
378     /**
379      * Checks to see if the character is the defined separator.
380      *
381      * @param c Source character
382      * @return True if the character is the defined separator
383      */
384     private boolean isCharacterSeparator(char c) {
385         return c == separator;
386     }
387 
388     /**
389      * Checks to see if the character passed in could be escapable.
390      * Escapable characters for opencsv are the quotation character, the
391      * escape character, and the separator.
392      *
393      * @param c Source character
394      * @return True if the character could be escapable.
395      */
396     private boolean isCharacterEscapable(char c) {
397         return isCharacterQuoteCharacter(c) || isCharacterEscapeCharacter(c) || isCharacterSeparator(c);
398     }
399 
400     /**
401      * Checks to see if the character after the current index in a String is an
402      * escapable character.
403      * <p>Meaning the next character is a quotation character, the escape
404      * char, or the separator and you are inside quotes.</p>
405      * <p>"Inside quotes" in this context is interpreted liberally. For
406      * instance, if quotes are not expected but we are inside a field, that
407      * still counts for the purposes of this method as being "in quotes".</p>
408      *
409      * Precondition: the current character is an escape.
410      *
411      * @param nextLine The current line
412      * @param inQuotes True if the current context is quoted
413      * @param i        Current index in line
414      * @return True if the following character is a quote
415      */
416     protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
417         return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
418                 && nextLine.length() > (i + 1)  // there is indeed another character to check.
419                 && isCharacterEscapable(nextLine.charAt(i + 1));
420     }
421 
422     @Override
423     public void setErrorLocale(Locale errorLocale) {
424         this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
425     }
426     
427     /**
428      * This class serves to optimize {@link CSVParser#parseLine(java.lang.String)},
429      * which is the hot inner loop of opencsv.
430      */
431     private static class StringFragmentCopier {
432         private final String input;
433         // Index of the next character in input to consume
434         private int i = 0;
435 
436         // This holds what is known of the next token to be output so far. We initialize this lazily because for
437         // CSVs where there are no escaped characters we can actually avoid creating this entirely.
438         private StringBuilder sb;
439         // Indexes of a substring of nextLine that is logically already appended to the sb buffer. If possible,
440         // we just fiddle these indices rather than actually appending anything to sb.
441         private int pendingSubstrFrom = 0;
442         private int pendingSubstrTo = 0;
443 
444         StringFragmentCopier(String input) {
445             this.input = input;
446         }
447 
448         public boolean isEmptyInput() {
449             return i >= input.length();
450         }
451 
452         public char takeInput() {
453             return input.charAt(i++);
454         }
455 
456         private StringBuilder materializeBuilder() {
457             if (sb == null) {
458                 sb = new StringBuilder(input.length() + READ_BUFFER_SIZE);
459             }
460 
461             if (pendingSubstrFrom < pendingSubstrTo) {
462                 sb.append(input, pendingSubstrFrom, pendingSubstrTo);
463                 pendingSubstrFrom = pendingSubstrTo = i;
464             }
465 
466             return sb;
467         }
468 
469         public void append(String pending) {
470             materializeBuilder().append(pending);
471         }
472 
473         public void append(char pending) {
474             materializeBuilder().append(pending);
475         }
476 
477         public void appendPrev() {
478             if (pendingSubstrTo == pendingSubstrFrom) {
479                 pendingSubstrFrom = i - 1;
480                 pendingSubstrTo = i;
481             } else if (pendingSubstrTo == i - 1) {
482                 pendingSubstrTo++;
483             } else {
484                 materializeBuilder().append(input.charAt(i - 1));
485             }
486         }
487 
488         public boolean isEmptyOutput() {
489             return pendingSubstrFrom >= pendingSubstrTo && (sb == null || sb.length() == 0);
490         }
491 
492         public void clearOutput() {
493             if (sb != null) {
494                 sb.setLength(0);
495             }
496 
497             pendingSubstrFrom = pendingSubstrTo = i;
498         }
499 
500         public String peekOutput() {
501             if (sb == null || sb.length() == 0) {
502                 return input.substring(pendingSubstrFrom, pendingSubstrTo);
503             } else {
504                 return materializeBuilder().toString();
505             }
506         }
507 
508         public String takeOutput() {
509             final String result = peekOutput();
510             clearOutput();
511             return result;
512         }
513     }
514 }