1 package com.opencsv;
2
3 /*
4 Copyright 2005 Bytecode Pty Ltd.
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 */
18
19 import com.opencsv.enums.CSVReaderNullFieldIndicator;
20 import org.apache.commons.lang3.ArrayUtils;
21 import org.apache.commons.lang3.ObjectUtils;
22 import org.apache.commons.lang3.StringUtils;
23
24 import java.io.IOException;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.Locale;
28 import java.util.ResourceBundle;
29
30 /**
31 * <p>A very simple CSV parser released under a commercial-friendly license.
32 * This just implements splitting a single line into fields.</p>
33 *
34 * <p>The purpose of the CSVParser is to take a single string and parse it into
35 * its elements based on the delimiter, quote and escape characters.</p>
36 *
37 * <p>The CSVParser has grown organically based on user requests and does not truly match
38 * any current requirements (though it can be configured to match or come close). There
39 * are no plans to change this as it will break existing requirements. Consider using
40 * the RFC4180Parser for all standard csv data. It may be less configurable, but that is not
41 * needed for data matching the RFC4180 requirements.</p>
42 *
43 * @author Glen Smith
44 * @author Rainer Pruy
45 */
46 public class CSVParser extends AbstractCSVParser {
47
/**
 * Threshold that the copier's read index must exceed before
 * {@code handleQuoteCharButNotStrictQuotes} treats a quote as embedded
 * inside a field.
 */
private static final int BEGINNING_OF_LINE = 3;
/**
 * The character that this parser treats as the escape character.
 */
private final char escape;

/**
 * The escape character as a one-character String, cached so that
 * {@code convertToCsvValue} does not have to rebuild it for every value.
 */
private final String escapeAsString;

/**
 * {@code escapeAsString + escapeAsString}: the replacement text used by
 * {@code convertToCsvValue} when doubling escape characters.
 */
private final String escapeDoubleAsString;

/**
 * If true, characters outside of quotes are not copied into the field;
 * if false, unquoted characters are kept as field content.
 */
private final boolean strictQuotes;
/**
 * Ignore any leading white space at the start of the field.
 */
private final boolean ignoreLeadingWhiteSpace;
/**
 * If true, the quoted/unquoted state is disregarded when deciding whether
 * a separator ends a field or a character belongs to a field.
 */
private final boolean ignoreQuotations;
/**
 * Size of the token list built by the previous {@code parseLine} call that
 * received a non-null line; used only as a sizing hint for the next token
 * list. Starts at -1, meaning no hint is available.
 */
private int tokensOnLastCompleteLine = -1;
/**
 * Parser state consulted by {@code inQuotes(boolean)}: whether the parser
 * currently considers itself inside a field. It is an instance field, so its
 * value carries over between {@code parseLine} calls.
 */
private boolean inField = false;

/** Locale used to look up error messages; replaceable through {@code setErrorLocale}. */
private Locale errorLocale;
81
/**
 * Creates a parser that uses the library defaults for every setting.
 * <p>Delegates to the full constructor, passing the {@code DEFAULT_*}
 * constants and the JVM's default locale for error messages.</p>
 */
public CSVParser() {
    this(
            DEFAULT_SEPARATOR,
            DEFAULT_QUOTE_CHARACTER,
            DEFAULT_ESCAPE_CHARACTER,
            DEFAULT_STRICT_QUOTES,
            DEFAULT_IGNORE_LEADING_WHITESPACE,
            DEFAULT_IGNORE_QUOTATIONS,
            DEFAULT_NULL_FIELD_INDICATOR,
            Locale.getDefault());
}
92
93 /**
94 * Constructs CSVParser.
95 * <p>This constructor sets all necessary parameters for CSVParser, and
96 * intentionally has package access so only the builder can use it.</p>
97 *
98 * @param separator The delimiter to use for separating entries
99 * @param quotechar The character to use for quoted elements
100 * @param escape The character to use for escaping a separator or quote
101 * @param strictQuotes If true, characters outside the quotes are ignored
102 * @param ignoreLeadingWhiteSpace If true, white space in front of a quote in a field is ignored
103 * @param ignoreQuotations If true, treat quotations like any other character.
104 * @param nullFieldIndicator Which field content will be returned as null: EMPTY_SEPARATORS, EMPTY_QUOTES,
105 * BOTH, NEITHER (default)
106 * @param errorLocale Locale for error messages.
107 */
108 protected CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace,
109 boolean ignoreQuotations, CSVReaderNullFieldIndicator nullFieldIndicator, Locale errorLocale) {
110 super(separator, quotechar, nullFieldIndicator);
111 this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
112 if (anyCharactersAreTheSame(separator, quotechar, escape)) {
113 throw new UnsupportedOperationException(ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, this.errorLocale).getString("special.characters.must.differ"));
114 }
115 if (separator == NULL_CHARACTER) {
116 throw new UnsupportedOperationException(ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, this.errorLocale).getString("define.separator"));
117 }
118 this.escape = escape;
119 this.escapeAsString = Character.toString(escape);
120 this.escapeDoubleAsString = escapeAsString + escapeAsString;
121 this.strictQuotes = strictQuotes;
122 this.ignoreLeadingWhiteSpace = ignoreLeadingWhiteSpace;
123 this.ignoreQuotations = ignoreQuotations;
124 }
125
126 /**
127 * @return The default escape character for this parser.
128 */
129 public char getEscape() {
130 return escape;
131 }
132
133 /**
134 * @return The default strictQuotes setting for this parser.
135 */
136 public boolean isStrictQuotes() {
137 return strictQuotes;
138 }
139
140 /**
141 * @return The default ignoreLeadingWhiteSpace setting for this parser.
142 */
143 public boolean isIgnoreLeadingWhiteSpace() {
144 return ignoreLeadingWhiteSpace;
145 }
146
147 /**
148 * @return The default ignoreQuotation setting for this parser.
149 */
150 public boolean isIgnoreQuotations() {
151 return ignoreQuotations;
152 }
153
154 /**
155 * Checks to see if any two of the three characters are the same.
156 * This is because in opencsv the separator, quote, and escape characters
157 * must the different.
158 *
159 * @param separator The defined separator character
160 * @param quotechar The defined quotation cahracter
161 * @param escape The defined escape character
162 * @return True if any two of the three are the same.
163 */
164 private boolean anyCharactersAreTheSame(char separator, char quotechar, char escape) {
165 return isSameCharacter(separator, quotechar) || isSameCharacter(separator, escape) || isSameCharacter(quotechar, escape);
166 }
167
168 /**
169 * Checks that the two characters are the same and are not the defined NULL_CHARACTER.
170 * @param c1 First character
171 * @param c2 Second character
172 * @return True if both characters are the same and are not the defined NULL_CHARACTER
173 */
174 private boolean isSameCharacter(char c1, char c2) {
175 return c1 != NULL_CHARACTER && c1 == c2;
176 }
177
178 @Override
179 protected String convertToCsvValue(String value, boolean applyQuotestoAll) {
180 String testValue = (value == null && !nullFieldIndicator.equals(CSVReaderNullFieldIndicator.NEITHER)) ? "" : value;
181 StringBuilder builder = new StringBuilder(testValue == null ? MAX_SIZE_FOR_EMPTY_FIELD : (testValue.length() * 2));
182 boolean containsQuoteChar = StringUtils.contains(testValue, getQuotechar());
183 boolean containsEscapeChar = StringUtils.contains(testValue, getEscape());
184 boolean containsSeparatorChar = StringUtils.contains(testValue, getSeparator());
185 boolean surroundWithQuotes = applyQuotestoAll || isSurroundWithQuotes(value, containsSeparatorChar);
186
187 String convertedString = !containsQuoteChar ? testValue : getQuoteMatcherPattern().matcher(testValue).replaceAll(getQuoteDoubledAsString());
188 convertedString = !containsEscapeChar ? convertedString : convertedString.replace(escapeAsString, escapeDoubleAsString);
189
190 if (surroundWithQuotes) {
191 builder.append(getQuotechar());
192 }
193
194 builder.append(convertedString);
195
196 if (surroundWithQuotes) {
197 builder.append(getQuotechar());
198 }
199
200 return builder.toString();
201 }
202
203 @Override
204 protected String[] parseLine(String nextLine, boolean multi) throws IOException {
205
206 if (!multi && pending != null) {
207 pending = null;
208 }
209
210 if (nextLine == null) {
211 if (pending != null) {
212 String s = pending;
213 pending = null;
214 return new String[]{s};
215 }
216 return null;
217 }
218 final List<String> tokensOnThisLine = tokensOnLastCompleteLine <= 0 ? new ArrayList<>() : new ArrayList<>((tokensOnLastCompleteLine + 1) * 2);
219 final StringFragmentCopier sfc = new StringFragmentCopier(nextLine);
220 boolean inQuotes = false;
221 boolean fromQuotedField = false;
222 if (pending != null) {
223 sfc.append(pending);
224 pending = null;
225 inQuotes = !this.ignoreQuotations;
226 }
227
228 while (!sfc.isEmptyInput()) {
229 final char c = sfc.takeInput();
230 if (c == this.escape) {
231 if (!strictQuotes) {
232 inField = true; // For the unusual case of escaping the first character
233 }
234 handleEscapeCharacter(nextLine, sfc, inQuotes);
235 } else if (c == quotechar) {
236 if (isNextCharacterEscapedQuote(nextLine, inQuotes(inQuotes), sfc.i - 1)) {
237 sfc.takeInput();
238 sfc.appendPrev();
239 } else {
240
241 inQuotes = !inQuotes;
242 if (sfc.isEmptyOutput()) {
243 fromQuotedField = true;
244 }
245
246 // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
247 handleQuoteCharButNotStrictQuotes(nextLine, sfc);
248 }
249 inField = !inField;
250 } else if (c == separator && !(inQuotes && !ignoreQuotations)) {
251 tokensOnThisLine.add(convertEmptyToNullIfNeeded(sfc.takeOutput(), fromQuotedField));
252 fromQuotedField = false;
253 inField = false;
254 } else {
255 if (!strictQuotes || (inQuotes && !ignoreQuotations)) {
256 sfc.appendPrev();
257 inField = true;
258 fromQuotedField = true;
259 }
260 }
261
262 }
263 // line is done - check status
264 line_done: {
265 if (inQuotes && !ignoreQuotations) {
266 if (multi) {
267 // continuing a quoted section, re-append newline
268 sfc.append('\n');
269 pending = sfc.peekOutput();
270 break line_done; // this partial content is not to be added to field list yet
271 } else {
272 throw new IOException(String.format(
273 ResourceBundle.getBundle(DEFAULT_BUNDLE_NAME, errorLocale).getString("unterminated.quote"),
274 sfc.peekOutput()));
275 }
276 } else {
277 inField = false;
278 }
279
280 tokensOnThisLine.add(convertEmptyToNullIfNeeded(sfc.takeOutput(), fromQuotedField));
281 }
282
283 tokensOnLastCompleteLine = tokensOnThisLine.size();
284 return tokensOnThisLine.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
285
286 }
287
288 private void handleQuoteCharButNotStrictQuotes(String nextLine, StringFragmentCopier sfc) {
289 if (!strictQuotes) {
290 final int i = sfc.i;
291 if (i > BEGINNING_OF_LINE //not on the beginning of the line
292 && nextLine.charAt(i - 2) != this.separator //not at the beginning of an escape sequence
293 && nextLine.length() > (i) &&
294 nextLine.charAt(i) != this.separator //not at the end of an escape sequence
295 ) {
296
297 if (ignoreLeadingWhiteSpace && !sfc.isEmptyOutput() && StringUtils.isWhitespace(sfc.peekOutput())) {
298 sfc.clearOutput();
299 } else {
300 sfc.appendPrev();
301 }
302 }
303 }
304 }
305
306 private void handleEscapeCharacter(String nextLine, StringFragmentCopier sfc, boolean inQuotes) {
307 if (isNextCharacterEscapable(nextLine, inQuotes(inQuotes), sfc.i - 1)) {
308 sfc.takeInput();
309 sfc.appendPrev();
310 }
311 }
312
313 private String convertEmptyToNullIfNeeded(String s, boolean fromQuotedField) {
314 if (s.isEmpty() && shouldConvertEmptyToNull(fromQuotedField)) {
315 return null;
316 }
317 return s;
318 }
319
320 private boolean shouldConvertEmptyToNull(boolean fromQuotedField) {
321 switch (nullFieldIndicator) {
322 case BOTH:
323 return true;
324 case EMPTY_SEPARATORS:
325 return !fromQuotedField;
326 case EMPTY_QUOTES:
327 return fromQuotedField;
328 default:
329 return false;
330 }
331 }
332
333 /**
334 * Determines if we can process as if we were in quotes.
335 *
336 * @param inQuotes Are we currently in quotes?
337 * @return True if we should process as if we are inside quotes.
338 */
339 private boolean inQuotes(boolean inQuotes) {
340 return (inQuotes && !ignoreQuotations) || inField;
341 }
342
343 /**
344 * Checks to see if the character after the index is a quotation character.
345 *
346 * Precondition: the current character is a quote or an escape.
347 *
348 * @param nextLine The current line
349 * @param inQuotes True if the current context is quoted
350 * @param i Current index in line
351 * @return True if the following character is a quote
352 */
353 private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
354 return inQuotes // we are in quotes, therefore there can be escaped quotes in here.
355 && nextLine.length() > (i + 1) // there is indeed another character to check.
356 && isCharacterQuoteCharacter(nextLine.charAt(i + 1));
357 }
358
359 /**
360 * Checks to see if the passed in character is the defined quotation character.
361 *
362 * @param c Source character
363 * @return True if c is the defined quotation character
364 */
365 private boolean isCharacterQuoteCharacter(char c) {
366 return c == quotechar;
367 }
368
369 /**
370 * Checks to see if the character is the defined escape character.
371 *
372 * @param c Source character
373 * @return True if the character is the defined escape character
374 */
375 private boolean isCharacterEscapeCharacter(char c) {
376 return c == escape;
377 }
378
379 /**
380 * Checks to see if the character is the defined separator.
381 *
382 * @param c Source character
383 * @return True if the character is the defined separator
384 */
385 private boolean isCharacterSeparator(char c) {
386 return c == separator;
387 }
388
389 /**
390 * Checks to see if the character passed in could be escapable.
391 * Escapable characters for opencsv are the quotation character, the
392 * escape character, and the separator.
393 *
394 * @param c Source character
395 * @return True if the character could be escapable.
396 */
397 private boolean isCharacterEscapable(char c) {
398 return isCharacterQuoteCharacter(c) || isCharacterEscapeCharacter(c) || isCharacterSeparator(c);
399 }
400
401 /**
402 * Checks to see if the character after the current index in a String is an
403 * escapable character.
404 * <p>Meaning the next character is a quotation character, the escape
405 * char, or the separator and you are inside quotes.</p>
406 * <p>"Inside quotes" in this context is interpreted liberally. For
407 * instance, if quotes are not expected but we are inside a field, that
408 * still counts for the purposes of this method as being "in quotes".</p>
409 *
410 * Precondition: the current character is an escape.
411 *
412 * @param nextLine The current line
413 * @param inQuotes True if the current context is quoted
414 * @param i Current index in line
415 * @return True if the following character is a quote
416 */
417 protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
418 return inQuotes // we are in quotes, therefore there can be escaped quotes in here.
419 && nextLine.length() > (i + 1) // there is indeed another character to check.
420 && isCharacterEscapable(nextLine.charAt(i + 1));
421 }
422
423 @Override
424 public void setErrorLocale(Locale errorLocale) {
425 this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
426 }
427
428 /**
429 * This class serves to optimize {@link CSVParser#parseLine(java.lang.String)},
430 * which is the hot inner loop of opencsv.
431 */
432 private static class StringFragmentCopier {
433 private final String input;
434 // Index of the next character in input to consume
435 private int i = 0;
436
437 // This holds what is known of the next token to be output so far. We initialize this lazily because for
438 // CSVs where there are no escaped characters we can actually avoid creating this entirely.
439 private StringBuilder sb;
440 // Indexes of a substring of nextLine that is logically already appended to the sb buffer. If possible,
441 // we just fiddle these indices rather than actually appending anything to sb.
442 private int pendingSubstrFrom = 0;
443 private int pendingSubstrTo = 0;
444
445 StringFragmentCopier(String input) {
446 this.input = input;
447 }
448
449 public boolean isEmptyInput() {
450 return i >= input.length();
451 }
452
453 public char takeInput() {
454 return input.charAt(i++);
455 }
456
457 private StringBuilder materializeBuilder() {
458 if (sb == null) {
459 sb = new StringBuilder(input.length() + READ_BUFFER_SIZE);
460 }
461
462 if (pendingSubstrFrom < pendingSubstrTo) {
463 sb.append(input, pendingSubstrFrom, pendingSubstrTo);
464 pendingSubstrFrom = pendingSubstrTo = i;
465 }
466
467 return sb;
468 }
469
470 public void append(String pending) {
471 materializeBuilder().append(pending);
472 }
473
474 public void append(char pending) {
475 materializeBuilder().append(pending);
476 }
477
478 public void appendPrev() {
479 if (pendingSubstrTo == pendingSubstrFrom) {
480 pendingSubstrFrom = i - 1;
481 pendingSubstrTo = i;
482 } else if (pendingSubstrTo == i - 1) {
483 pendingSubstrTo++;
484 } else {
485 materializeBuilder().append(input.charAt(i - 1));
486 }
487 }
488
489 /**
490 * Determines whether the current output is empty.
491 * <p>
492 * The output is considered empty if the pending substring indices
493 * indicate there is no substring to process (pendingSubstrFrom >= pendingSubstrTo),
494 * and the StringBuilder object (sb) is either null or has a length of zero.
495 *
496 * @return true if the output is empty, false otherwise.
497 */
498 public boolean isEmptyOutput() {
499 return pendingSubstrFrom >= pendingSubstrTo && (sb == null || sb.length() == 0);
500 }
501
502 /**
503 * Clears the current output buffer and resets the indices used to track
504 * substrings of the input.
505 *
506 * Specifically, this method sets the internal StringBuilder's length to zero,
507 * effectively clearing its content. Additionally, it resets the range of
508 * pending substring indices (pendingSubstrFrom and pendingSubstrTo) to the
509 * current position (i) in the input.
510 */
511 public void clearOutput() {
512 if (sb != null) {
513 sb.setLength(0);
514 }
515
516 pendingSubstrFrom = pendingSubstrTo = i;
517 }
518
519 /**
520 * Retrieves the current accumulated output as a string without modifying or clearing the underlying buffers or state.
521 *
522 * @return The current output. If no output has been accumulated, returns a substring of the input
523 * between the indices specified by the internal state or an empty string if no such substring exists.
524 */
525 public String peekOutput() {
526 if (sb == null || sb.length() == 0) {
527 return input.substring(pendingSubstrFrom, pendingSubstrTo);
528 } else {
529 return materializeBuilder().toString();
530 }
531 }
532
533 /**
534 * Retrieves the current output and clears it. This method combines the operations of
535 * peeking at the current output and resetting any internal buffers or indexes to prepare
536 * for new content while ensuring the current output is returned.
537 *
538 * @return The current output as a string before it is cleared. If there is no output,
539 * an empty string is returned.
540 */
541 public String takeOutput() {
542 final String result = peekOutput();
543 clearOutput();
544 return result;
545 }
546 }
547 }