View Javadoc
1   package com.opencsv;
2   
3   /*
4    Copyright 2005 Bytecode Pty Ltd.
5   
6    Licensed under the Apache License, Version 2.0 (the "License");
7    you may not use this file except in compliance with the License.
8    You may obtain a copy of the License at
9   
10   http://www.apache.org/licenses/LICENSE-2.0
11  
12   Unless required by applicable law or agreed to in writing, software
13   distributed under the License is distributed on an "AS IS" BASIS,
14   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   See the License for the specific language governing permissions and
16   limitations under the License.
17   */
18  
19  import com.opencsv.enums.CSVReaderNullFieldIndicator;
20  import org.apache.commons.lang3.ArrayUtils;
21  import org.apache.commons.lang3.ObjectUtils;
22  import org.apache.commons.lang3.StringUtils;
23  
24  import java.io.IOException;
25  import java.util.ArrayList;
26  import java.util.List;
27  import java.util.Locale;
28  import java.util.ResourceBundle;
29  
30  /**
31   * <p>A very simple CSV parser released under a commercial-friendly license.
32   * This just implements splitting a single line into fields.</p>
33   *
34   * <p>The purpose of the CSVParser is to take a single string and parse it into
35   * its elements based on the delimiter, quote and escape characters.</p>
36   *
37   * <p>The CSVParser has grown organically based on user requests and does not truly match
38   * any current requirements (though it can be configured to match or come close).  There
39   * are no plans to change this as it will break existing requirements.  Consider using
40   * the RFC4180Parser for all standard csv data.   It may be less configurable, but that is not
41   * needed for data matching the RFC4180 requirements.</p>
42   *
43   * @author Glen Smith
44   * @author Rainer Pruy
45   */
46  public class CSVParser extends AbstractCSVParser {
47  
48      private static final int BEGINNING_OF_LINE = 3;
49      /**
50       * This is the character that the CSVParser will treat as the escape character.
51       */
52      private final char escape;
53  
54      /**
55       * String of escape character - optimization for replaceAll
56       */
57      private final String escapeAsString;
58  
59      /**
60       * String escapeAsString+escapeAsString - optimization for replaceAll
61       */
62      private final String escapeDoubleAsString;
63  
64      /**
65       * Determines if the field is between quotes (true) or between separators (false).
66       */
67      private final boolean strictQuotes;
68      /**
69       * Ignore any leading white space at the start of the field.
70       */
71      private final boolean ignoreLeadingWhiteSpace;
72      /**
73       * Skip over quotation characters when parsing.
74       */
75      private final boolean ignoreQuotations;
76      private int tokensOnLastCompleteLine = -1;
77      private boolean inField = false;
78      
79      /** Locale for all translations. */
80      private Locale errorLocale;
81  
82      /**
83       * Constructs CSVParser using default values for everything.
84       */
85      public CSVParser() {
86          this(DEFAULT_SEPARATOR, DEFAULT_QUOTE_CHARACTER,
87                  DEFAULT_ESCAPE_CHARACTER, DEFAULT_STRICT_QUOTES,
88                  DEFAULT_IGNORE_LEADING_WHITESPACE,
89                  DEFAULT_IGNORE_QUOTATIONS,
90                  DEFAULT_NULL_FIELD_INDICATOR, Locale.getDefault());
91      }
92      
93      /**
94       * Constructs CSVParser.
95       * <p>This constructor sets all necessary parameters for CSVParser, and
96       * intentionally has package access so only the builder can use it.</p>
97       * 
98       * @param separator               The delimiter to use for separating entries
99       * @param quotechar               The character to use for quoted elements
100      * @param escape                  The character to use for escaping a separator or quote
101      * @param strictQuotes            If true, characters outside the quotes are ignored
102      * @param ignoreLeadingWhiteSpace If true, white space in front of a quote in a field is ignored
103      * @param ignoreQuotations        If true, treat quotations like any other character.
104      * @param nullFieldIndicator      Which field content will be returned as null: EMPTY_SEPARATORS, EMPTY_QUOTES,
105      *                                BOTH, NEITHER (default)
106      * @param errorLocale             Locale for error messages.
107      */
108     protected CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace,
109               boolean ignoreQuotations, CSVReaderNullFieldIndicator nullFieldIndicator, Locale errorLocale) {
110         super(separator, quotechar, nullFieldIndicator);
111         this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
112         if (anyCharactersAreTheSame(separator, quotechar, escape)) {
113             throw new UnsupportedOperationException(ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, this.errorLocale).getString("special.characters.must.differ"));
114         }
115         if (separator == NULL_CHARACTER) {
116             throw new UnsupportedOperationException(ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, this.errorLocale).getString("define.separator"));
117         }
118         this.escape = escape;
119         this.escapeAsString = Character.toString(escape);
120         this.escapeDoubleAsString = escapeAsString + escapeAsString;
121         this.strictQuotes = strictQuotes;
122         this.ignoreLeadingWhiteSpace = ignoreLeadingWhiteSpace;
123         this.ignoreQuotations = ignoreQuotations;
124     }
125 
126     /**
127      * @return The default escape character for this parser.
128      */
129     public char getEscape() {
130         return escape;
131     }
132     
133     /**
134      * @return The default strictQuotes setting for this parser.
135      */
136     public boolean isStrictQuotes() {
137         return strictQuotes;
138     }
139 
140     /**
141      * @return The default ignoreLeadingWhiteSpace setting for this parser.
142      */
143     public boolean isIgnoreLeadingWhiteSpace() {
144         return ignoreLeadingWhiteSpace;
145     }
146 
147     /**
148      * @return The default ignoreQuotation setting for this parser.
149      */
150     public boolean isIgnoreQuotations() {
151         return ignoreQuotations;
152     }
153 
154     /**
155      * Checks to see if any two of the three characters are the same.
156      * This is because in opencsv the separator, quote, and escape characters
157      * must the different.
158      *
159      * @param separator The defined separator character
160      * @param quotechar The defined quotation cahracter
161      * @param escape    The defined escape character
162      * @return True if any two of the three are the same.
163      */
164     private boolean anyCharactersAreTheSame(char separator, char quotechar, char escape) {
165         return isSameCharacter(separator, quotechar) || isSameCharacter(separator, escape) || isSameCharacter(quotechar, escape);
166     }
167 
168     /**
169      * Checks that the two characters are the same and are not the defined NULL_CHARACTER.
170      * @param c1 First character
171      * @param c2 Second character
172      * @return True if both characters are the same and are not the defined NULL_CHARACTER
173      */
174     private boolean isSameCharacter(char c1, char c2) {
175         return c1 != NULL_CHARACTER && c1 == c2;
176     }
177 
178     @Override
179     protected String convertToCsvValue(String value, boolean applyQuotestoAll) {
180         String testValue = (value == null && !nullFieldIndicator.equals(CSVReaderNullFieldIndicator.NEITHER)) ? "" : value;
181         StringBuilder builder = new StringBuilder(testValue == null ? MAX_SIZE_FOR_EMPTY_FIELD : (testValue.length() * 2));
182         boolean containsQuoteChar = StringUtils.contains(testValue, getQuotechar());
183         boolean containsEscapeChar = StringUtils.contains(testValue, getEscape());
184         boolean containsSeparatorChar = StringUtils.contains(testValue, getSeparator());
185         boolean surroundWithQuotes = applyQuotestoAll || isSurroundWithQuotes(value, containsSeparatorChar);
186 
187         String convertedString = !containsQuoteChar ? testValue : getQuoteMatcherPattern().matcher(testValue).replaceAll(getQuoteDoubledAsString());
188         convertedString = !containsEscapeChar ? convertedString : convertedString.replace(escapeAsString, escapeDoubleAsString);
189 
190         if (surroundWithQuotes) {
191             builder.append(getQuotechar());
192         }
193 
194         builder.append(convertedString);
195 
196         if (surroundWithQuotes) {
197             builder.append(getQuotechar());
198         }
199 
200         return builder.toString();
201     }
202 
203     @Override
204     protected String[] parseLine(String nextLine, boolean multi) throws IOException {
205 
206         if (!multi && pending != null) {
207             pending = null;
208         }
209 
210         if (nextLine == null) {
211             if (pending != null) {
212                 String s = pending;
213                 pending = null;
214                 return new String[]{s};
215             }
216             return null;
217         }
218         final List<String> tokensOnThisLine = tokensOnLastCompleteLine <= 0 ? new ArrayList<>() : new ArrayList<>((tokensOnLastCompleteLine + 1) * 2);
219         final StringFragmentCopier sfc = new StringFragmentCopier(nextLine);
220         boolean inQuotes = false;
221         boolean fromQuotedField = false;
222         if (pending != null) {
223             sfc.append(pending);
224             pending = null;
225             inQuotes = !this.ignoreQuotations;
226         }
227 
228         while (!sfc.isEmptyInput()) {
229             final char c = sfc.takeInput();
230             if (c == this.escape) {
231                 if (!strictQuotes) {
232                     inField = true; // For the unusual case of escaping the first character
233                 }
234                 handleEscapeCharacter(nextLine, sfc, inQuotes);
235             } else if (c == quotechar) {
236                 if (isNextCharacterEscapedQuote(nextLine, inQuotes(inQuotes), sfc.i - 1)) {
237                     sfc.takeInput();
238                     sfc.appendPrev();
239                 } else {
240 
241                     inQuotes = !inQuotes;
242                     if (sfc.isEmptyOutput()) {
243                         fromQuotedField = true;
244                     }
245 
246                     // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
247                     handleQuoteCharButNotStrictQuotes(nextLine, sfc);
248                 }
249                 inField = !inField;
250             } else if (c == separator && !(inQuotes && !ignoreQuotations)) {
251                 tokensOnThisLine.add(convertEmptyToNullIfNeeded(sfc.takeOutput(), fromQuotedField));
252                 fromQuotedField = false;
253                 inField = false;
254             } else {
255                 if (!strictQuotes || (inQuotes && !ignoreQuotations)) {
256                     sfc.appendPrev();
257                     inField = true;
258                     fromQuotedField = true;
259                 }
260             }
261 
262         }
263         // line is done - check status
264         line_done: {
265             if (inQuotes && !ignoreQuotations) {
266                 if (multi) {
267                     // continuing a quoted section, re-append newline
268                     sfc.append('\n');
269                     pending = sfc.peekOutput();
270                     break line_done; // this partial content is not to be added to field list yet
271                 } else {
272                     throw new IOException(String.format(
273                             ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, errorLocale).getString("unterminated.quote"),
274                             sfc.peekOutput()));
275                 }
276             } else {
277                 inField = false;
278             }
279 
280             tokensOnThisLine.add(convertEmptyToNullIfNeeded(sfc.takeOutput(), fromQuotedField));
281         }
282 
283         tokensOnLastCompleteLine = tokensOnThisLine.size();
284         return tokensOnThisLine.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
285 
286     }
287 
288     private void handleQuoteCharButNotStrictQuotes(String nextLine, StringFragmentCopier sfc) {
289         if (!strictQuotes) {
290             final int i = sfc.i;
291             if (i > BEGINNING_OF_LINE //not on the beginning of the line
292                     && nextLine.charAt(i - 2) != this.separator //not at the beginning of an escape sequence
293                     && nextLine.length() > (i) &&
294                     nextLine.charAt(i) != this.separator //not at the	end of an escape sequence
295             ) {
296 
297                 if (ignoreLeadingWhiteSpace && !sfc.isEmptyOutput() && StringUtils.isWhitespace(sfc.peekOutput())) {
298                     sfc.clearOutput();
299                 } else {
300                     sfc.appendPrev();
301                 }
302             }
303         }
304     }
305 
306     private void handleEscapeCharacter(String nextLine, StringFragmentCopier sfc, boolean inQuotes) {
307         if (isNextCharacterEscapable(nextLine, inQuotes(inQuotes), sfc.i - 1)) {
308             sfc.takeInput();
309             sfc.appendPrev();
310         }
311     }
312 
313     private String convertEmptyToNullIfNeeded(String s, boolean fromQuotedField) {
314         if (s.isEmpty() && shouldConvertEmptyToNull(fromQuotedField)) {
315             return null;
316         }
317         return s;
318     }
319 
320     private boolean shouldConvertEmptyToNull(boolean fromQuotedField) {
321         switch (nullFieldIndicator) {
322             case BOTH:
323                 return true;
324             case EMPTY_SEPARATORS:
325                 return !fromQuotedField;
326             case EMPTY_QUOTES:
327                 return fromQuotedField;
328             default:
329                 return false;
330         }
331     }
332 
333     /**
334      * Determines if we can process as if we were in quotes.
335      *
336      * @param inQuotes Are we currently in quotes?
337      * @return True if we should process as if we are inside quotes.
338      */
339     private boolean inQuotes(boolean inQuotes) {
340         return (inQuotes && !ignoreQuotations) || inField;
341     }
342 
343     /**
344      * Checks to see if the character after the index is a quotation character.
345      *
346      * Precondition: the current character is a quote or an escape.
347      *
348      * @param nextLine The current line
349      * @param inQuotes True if the current context is quoted
350      * @param i        Current index in line
351      * @return True if the following character is a quote
352      */
353     private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
354         return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
355                 && nextLine.length() > (i + 1)  // there is indeed another character to check.
356                 && isCharacterQuoteCharacter(nextLine.charAt(i + 1));
357     }
358 
359     /**
360      * Checks to see if the passed in character is the defined quotation character.
361      *
362      * @param c Source character
363      * @return True if c is the defined quotation character
364      */
365     private boolean isCharacterQuoteCharacter(char c) {
366         return c == quotechar;
367     }
368 
369     /**
370      * Checks to see if the character is the defined escape character.
371      *
372      * @param c Source character
373      * @return True if the character is the defined escape character
374      */
375     private boolean isCharacterEscapeCharacter(char c) {
376         return c == escape;
377     }
378 
379     /**
380      * Checks to see if the character is the defined separator.
381      *
382      * @param c Source character
383      * @return True if the character is the defined separator
384      */
385     private boolean isCharacterSeparator(char c) {
386         return c == separator;
387     }
388 
389     /**
390      * Checks to see if the character passed in could be escapable.
391      * Escapable characters for opencsv are the quotation character, the
392      * escape character, and the separator.
393      *
394      * @param c Source character
395      * @return True if the character could be escapable.
396      */
397     private boolean isCharacterEscapable(char c) {
398         return isCharacterQuoteCharacter(c) || isCharacterEscapeCharacter(c) || isCharacterSeparator(c);
399     }
400 
401     /**
402      * Checks to see if the character after the current index in a String is an
403      * escapable character.
404      * <p>Meaning the next character is a quotation character, the escape
405      * char, or the separator and you are inside quotes.</p>
406      * <p>"Inside quotes" in this context is interpreted liberally. For
407      * instance, if quotes are not expected but we are inside a field, that
408      * still counts for the purposes of this method as being "in quotes".</p>
409      *
410      * Precondition: the current character is an escape.
411      *
412      * @param nextLine The current line
413      * @param inQuotes True if the current context is quoted
414      * @param i        Current index in line
415      * @return True if the following character is a quote
416      */
417     protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
418         return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
419                 && nextLine.length() > (i + 1)  // there is indeed another character to check.
420                 && isCharacterEscapable(nextLine.charAt(i + 1));
421     }
422 
423     @Override
424     public void setErrorLocale(Locale errorLocale) {
425         this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
426     }
427     
428     /**
429      * This class serves to optimize {@link CSVParser#parseLine(java.lang.String)},
430      * which is the hot inner loop of opencsv.
431      */
432     private static class StringFragmentCopier {
433         private final String input;
434         // Index of the next character in input to consume
435         private int i = 0;
436 
437         // This holds what is known of the next token to be output so far. We initialize this lazily because for
438         // CSVs where there are no escaped characters we can actually avoid creating this entirely.
439         private StringBuilder sb;
440         // Indexes of a substring of nextLine that is logically already appended to the sb buffer. If possible,
441         // we just fiddle these indices rather than actually appending anything to sb.
442         private int pendingSubstrFrom = 0;
443         private int pendingSubstrTo = 0;
444 
445         StringFragmentCopier(String input) {
446             this.input = input;
447         }
448 
449         public boolean isEmptyInput() {
450             return i >= input.length();
451         }
452 
453         public char takeInput() {
454             return input.charAt(i++);
455         }
456 
457         private StringBuilder materializeBuilder() {
458             if (sb == null) {
459                 sb = new StringBuilder(input.length() + READ_BUFFER_SIZE);
460             }
461 
462             if (pendingSubstrFrom < pendingSubstrTo) {
463                 sb.append(input, pendingSubstrFrom, pendingSubstrTo);
464                 pendingSubstrFrom = pendingSubstrTo = i;
465             }
466 
467             return sb;
468         }
469 
470         public void append(String pending) {
471             materializeBuilder().append(pending);
472         }
473 
474         public void append(char pending) {
475             materializeBuilder().append(pending);
476         }
477 
478         public void appendPrev() {
479             if (pendingSubstrTo == pendingSubstrFrom) {
480                 pendingSubstrFrom = i - 1;
481                 pendingSubstrTo = i;
482             } else if (pendingSubstrTo == i - 1) {
483                 pendingSubstrTo++;
484             } else {
485                 materializeBuilder().append(input.charAt(i - 1));
486             }
487         }
488 
489         /**
490          * Determines whether the current output is empty.
491          * <p>
492          * The output is considered empty if the pending substring indices
493          * indicate there is no substring to process (pendingSubstrFrom >= pendingSubstrTo),
494          * and the StringBuilder object (sb) is either null or has a length of zero.
495          *
496          * @return true if the output is empty, false otherwise.
497          */
498         public boolean isEmptyOutput() {
499             return pendingSubstrFrom >= pendingSubstrTo && (sb == null || sb.length() == 0);
500         }
501 
502         /**
503          * Clears the current output buffer and resets the indices used to track
504          * substrings of the input.
505          *
506          * Specifically, this method sets the internal StringBuilder's length to zero,
507          * effectively clearing its content. Additionally, it resets the range of
508          * pending substring indices (pendingSubstrFrom and pendingSubstrTo) to the
509          * current position (i) in the input.
510          */
511         public void clearOutput() {
512             if (sb != null) {
513                 sb.setLength(0);
514             }
515 
516             pendingSubstrFrom = pendingSubstrTo = i;
517         }
518 
519         /**
520          * Retrieves the current accumulated output as a string without modifying or clearing the underlying buffers or state.
521          *
522          * @return The current output. If no output has been accumulated, returns a substring of the input
523          *         between the indices specified by the internal state or an empty string if no such substring exists.
524          */
525         public String peekOutput() {
526             if (sb == null || sb.length() == 0) {
527                 return input.substring(pendingSubstrFrom, pendingSubstrTo);
528             } else {
529                 return materializeBuilder().toString();
530             }
531         }
532 
533         /**
534          * Retrieves the current output and clears it. This method combines the operations of
535          * peeking at the current output and resetting any internal buffers or indexes to prepare
536          * for new content while ensuring the current output is returned.
537          *
538          * @return The current output as a string before it is cleared. If there is no output,
539          *         an empty string is returned.
540          */
541         public String takeOutput() {
542             final String result = peekOutput();
543             clearOutput();
544             return result;
545         }
546     }
547 }