1 package com.opencsv;
2
3 /*
4 Copyright 2005 Bytecode Pty Ltd.
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 */
18
19 import com.opencsv.bean.util.OrderedObject;
20 import com.opencsv.exceptions.*;
21 import com.opencsv.processor.RowProcessor;
22 import com.opencsv.stream.reader.LineReader;
23 import com.opencsv.validators.LineValidatorAggregator;
24 import com.opencsv.validators.RowValidatorAggregator;
25 import org.apache.commons.lang3.ObjectUtils;
26 import org.apache.commons.lang3.StringUtils;
27
28 import java.io.*;
29 import java.nio.charset.CharacterCodingException;
30 import java.nio.charset.MalformedInputException;
31 import java.util.*;
32 import java.util.zip.ZipException;
33
34 /**
35 * A very simple CSV reader released under a commercial-friendly license.
36 *
37 * @author Glen Smith
38 */
39 public class CSVReader implements Closeable, Iterable<String[]> {
40
41 public static final boolean DEFAULT_KEEP_CR = false;
42 public static final boolean DEFAULT_VERIFY_READER = true;
43 // context size in the exception message
44 static final int CONTEXT_MULTILINE_EXCEPTION_MESSAGE_SIZE = 100;
45
46 /**
47 * The default line to start reading.
48 */
49 public static final int DEFAULT_SKIP_LINES = 0;
50
51 /**
52 * The default limit for the number of lines in a multiline record.
53 * Less than one means no limit.
54 */
55 public static final int DEFAULT_MULTILINE_LIMIT = 0;
56
57 protected static final List<Class<? extends IOException>> PASSTHROUGH_EXCEPTIONS =
58 Collections.unmodifiableList(
59 Arrays.asList(CharacterCodingException.class, CharConversionException.class,
60 UnsupportedEncodingException.class, UTFDataFormatException.class,
61 ZipException.class, FileNotFoundException.class, MalformedInputException.class));
62
63 public static final int READ_AHEAD_LIMIT = Character.SIZE / Byte.SIZE;
64 private static final int MAX_WIDTH = 100;
65 protected ICSVParser parser;
66 protected int skipLines;
67 protected BufferedReader br;
68 protected LineReader lineReader;
69 protected boolean hasNext = true;
70 protected boolean linesSkipped;
71 protected boolean keepCR;
72 protected boolean verifyReader;
73 protected int multilineLimit = DEFAULT_MULTILINE_LIMIT;
74 protected Locale errorLocale;
75
76 protected long linesRead = 0;
77 protected long recordsRead = 0;
78 protected String[] peekedLine = null;
79 final protected Queue<OrderedObject<String>> peekedLines = new LinkedList<>();
80
81 private final LineValidatorAggregator lineValidatorAggregator;
82 private final RowValidatorAggregator rowValidatorAggregator;
83 private final RowProcessor rowProcessor;
84
/**
 * Constructs CSVReader using defaults for all parameters: no skipped
 * lines, a {@link CSVParser} configured with the {@link ICSVParser}
 * defaults, the default locale, empty validator aggregators and no row
 * processor.
 *
 * @param reader The reader to an underlying CSV source.
 */
public CSVReader(Reader reader) {
    this(
            reader,
            DEFAULT_SKIP_LINES,
            new CSVParser(
                    ICSVParser.DEFAULT_SEPARATOR,
                    ICSVParser.DEFAULT_QUOTE_CHARACTER,
                    ICSVParser.DEFAULT_ESCAPE_CHARACTER,
                    ICSVParser.DEFAULT_STRICT_QUOTES,
                    ICSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
                    ICSVParser.DEFAULT_IGNORE_QUOTATIONS,
                    ICSVParser.DEFAULT_NULL_FIELD_INDICATOR,
                    Locale.getDefault()),
            DEFAULT_KEEP_CR,
            DEFAULT_VERIFY_READER,
            DEFAULT_MULTILINE_LIMIT,
            Locale.getDefault(),
            new LineValidatorAggregator(),
            new RowValidatorAggregator(),
            null);
}

/**
 * Constructs CSVReader with supplied CSVParser.
 * <p>This constructor sets all necessary parameters for CSVReader, and
 * intentionally has package access so only the builder can use it.</p>
 * <p>If {@code reader} already is a {@link BufferedReader} it is used
 * directly; otherwise it is wrapped in one.</p>
 *
 * @param reader                  The reader to an underlying CSV source
 * @param line                    The number of lines to skip before reading
 * @param icsvParser              The parser to use to parse input
 * @param keepCR                  True to keep carriage returns in data read, false otherwise
 * @param verifyReader            True to verify reader before each read, false otherwise
 * @param multilineLimit          The limit to the number of lines in a multiline record.
 *                                Less than one means no limit.
 * @param errorLocale             The locale for error messages. If null, the default locale is used.
 * @param lineValidatorAggregator Contains all the custom defined line validators.
 * @param rowValidatorAggregator  Contains all the custom defined row validators.
 * @param rowProcessor            Custom row processor to run on all columns of a CSV record.
 */
CSVReader(Reader reader, int line, ICSVParser icsvParser, boolean keepCR, boolean verifyReader, int multilineLimit,
          Locale errorLocale, LineValidatorAggregator lineValidatorAggregator, RowValidatorAggregator rowValidatorAggregator,
          RowProcessor rowProcessor) {
    // Reuse an existing buffer rather than stacking a second one on top.
    if (reader instanceof BufferedReader) {
        this.br = (BufferedReader) reader;
    } else {
        this.br = new BufferedReader(reader);
    }
    this.lineReader = new LineReader(this.br, keepCR);

    this.parser = icsvParser;
    this.skipLines = line;
    this.multilineLimit = multilineLimit;
    this.keepCR = keepCR;
    this.verifyReader = verifyReader;
    this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());

    this.rowProcessor = rowProcessor;
    this.rowValidatorAggregator = rowValidatorAggregator;
    this.lineValidatorAggregator = lineValidatorAggregator;
}
143
144 /**
145 * @return The CSVParser used by the reader.
146 */
147 public ICSVParser getParser() {
148 return parser;
149 }
150
151 /**
152 * Returns the number of lines in the CSV file to skip before processing.
153 * This is useful when there are miscellaneous data at the beginning of a file.
154 *
155 * @return The number of lines in the CSV file to skip before processing.
156 */
157 public int getSkipLines() {
158 return skipLines;
159 }
160
161 /**
162 * Returns if the reader will keep carriage returns found in data or remove them.
163 *
164 * @return True if reader will keep carriage returns, false otherwise.
165 */
166 public boolean keepCarriageReturns() {
167 return keepCR;
168 }
169
170 /**
171 * Reads the entire file into a List with each element being a String[] of
172 * tokens.
173 * Since the current implementation returns a {@link java.util.LinkedList},
174 * you are strongly discouraged from using index-based access methods to
175 * get at items in the list. Instead, iterate over the list.
176 *
177 * @return A List of String[], with each String[] representing a line of the
178 * file.
179 * @throws IOException If bad things happen during the read
180 * @throws CsvException If there is a failed validator
181 */
182 public List<String[]> readAll() throws IOException, CsvException {
183
184 List<String[]> allElements = new LinkedList<>();
185 while (hasNext) {
186 String[] nextLineAsTokens = readNext();
187 if (nextLineAsTokens != null) {
188 allElements.add(nextLineAsTokens);
189 }
190 }
191 return allElements;
192
193 }
194
195 /**
196 * Reads the next line from the buffer and converts to a string array.
197 *
198 * @return A string array with each comma-separated element as a separate
199 * entry, or null if there is no more input.
200 * @throws IOException If bad things happen during the read
201 * @throws CsvValidationException If a user-defined validator fails
202 */
203 public String[] readNext() throws IOException, CsvValidationException {
204 return flexibleRead(true, true);
205 }
206
207 /**
208 * Reads the next line from the buffer and converts to a string array without
209 * running the custom defined validators. This is called by the bean readers when
210 * reading the header.
211 *
212 * @return A string array with each comma-separated element as a separate
213 * entry, or null if there is no more input.
214 * @throws IOException If bad things happen during the read.
215 */
216 public String[] readNextSilently() throws IOException {
217 try {
218 return flexibleRead(true, false);
219 } catch (CsvValidationException e) {
220 throw new CsvRuntimeException("A CSValidationException was thrown from the runNextSilently method which should not happen", e);
221 }
222 }
223
224 /**
225 * Reads the next line from the buffer and converts to a string array.
226 * The results are stored in {@link #peekedLines} and {@link #peekedLine}.
227 *
228 * @throws IOException If bad things happen during the read
229 */
230 private void primeNextRecord() throws IOException {
231
232 int linesInThisRecord = 0;
233 long lastSuccessfulLineRead = linesRead+1;
234 do {
235 String nextLine = getNextLine();
236 peekedLines.add(new OrderedObject<>(lastSuccessfulLineRead, nextLine));
237 linesInThisRecord++;
238
239 // If no more input is available, check if the record is finished
240 // or simply incomplete.
241 if (!hasNext) {
242 if (parser.isPending()) {
243 throw new CsvMalformedLineException(String.format(
244 ResourceBundle.getBundle(ICSVParser.DEFAULT_BUNDLE_NAME, errorLocale).getString("unterminated.quote"),
245 StringUtils.abbreviate(parser.getPendingText(), MAX_WIDTH)), lastSuccessfulLineRead, parser.getPendingText());
246 }
247 return;
248 }
249
250
251 // If we've crossed the multiline limit, signal an error.
252 if (multilineLimit > 0 && linesInThisRecord > multilineLimit) {
253
254 // get current row records Read +1
255 long row = this.recordsRead + 1L;
256
257 String context = parser.getPendingText();
258
259 // just to avoid out of index
260 // to get the whole context use CsvMultilineLimitBrokenException::getContext()
261 if (context.length() > CONTEXT_MULTILINE_EXCEPTION_MESSAGE_SIZE) {
262 context = context.substring(0, CONTEXT_MULTILINE_EXCEPTION_MESSAGE_SIZE);
263 }
264
265 String messageFormat = ResourceBundle.getBundle(ICSVParser.DEFAULT_BUNDLE_NAME, errorLocale).getString("multiline.limit.broken");
266 String message = String.format(errorLocale, messageFormat, multilineLimit, row, context);
267 throw new CsvMultilineLimitBrokenException(message, row, parser.getPendingText(), multilineLimit);
268 }
269
270 // Combine multiple lines into one result
271 String[] r = parser.parseLineMulti(nextLine);
272 if (r.length > 0) {
273 if (peekedLine == null) {
274 peekedLine = r;
275 } else {
276 peekedLine = combineResultsFromMultipleReads(peekedLine, r);
277 }
278 }
279
280 } while (parser.isPending());
281
282 /*
283 for bug #233 (https://sourceforge.net/p/opencsv/bugs/233/) if we want to keep carriage returns we ONLY
284 want to keep the carriage returns in the data and not from the end of lines if we were in a Windows system.
285 */
286
287 if (keepCR) {
288 int lastItemIndex = peekedLine.length - 1;
289 if (peekedLine[lastItemIndex] != null && peekedLine[lastItemIndex].endsWith("\r")) {
290 peekedLine[lastItemIndex] = peekedLine[lastItemIndex].substring(0, peekedLine[lastItemIndex].length() - 1);
291 }
292 }
293 }
294
295 /**
296 * Runs all line validators on the input.
297 *
298 * @param lastSuccessfulLineRead The line number for error messages
299 * @param nextLine The input to be validated
300 * @throws CsvValidationException Only thrown if a user-supplied validator
301 * throws it
302 */
303 private void validateLine(long lastSuccessfulLineRead, String nextLine) throws CsvValidationException {
304 try {
305 lineValidatorAggregator.validate(nextLine);
306 } catch (CsvValidationException cve) {
307 cve.setLineNumber(lastSuccessfulLineRead);
308 throw cve;
309 }
310 }
311
312 /**
313 * Increments the number of records read if the result passed in is not null.
314 *
315 * @param result The result of the read operation
316 * @param lineStartOfRow Line number that the row started on
317 * @throws CsvValidationException if there is a validation error caught by a custom RowValidator.
318 */
319 protected void validateResult(String[] result, long lineStartOfRow) throws CsvValidationException {
320 if (result != null) {
321 if (rowProcessor != null) {
322 rowProcessor.processRow(result);
323 }
324 try {
325 rowValidatorAggregator.validate(result);
326 } catch (CsvValidationException cve) {
327 cve.setLineNumber(lineStartOfRow);
328 throw cve;
329 }
330 }
331 }
332
333 /**
334 * For multi-line records this method combines the current result with the result from previous read(s).
335 *
336 * @param buffer Previous data read for this record
337 * @param lastRead Latest data read for this record.
338 * @return String array with union of the buffer and lastRead arrays.
339 */
340 protected String[] combineResultsFromMultipleReads(String[] buffer, String[] lastRead) {
341 String[] t = new String[buffer.length + lastRead.length];
342 System.arraycopy(buffer, 0, t, 0, buffer.length);
343 System.arraycopy(lastRead, 0, t, buffer.length, lastRead.length);
344 return t;
345 }
346
347 /**
348 * Reads the next line from the file.
349 *
350 * @return The next line from the file without trailing newline, or null if
351 * there is no more input.
352 * @throws IOException If bad things happen during the read
353 */
354 protected String getNextLine() throws IOException {
355 if (isClosed()) {
356 hasNext = false;
357 return null;
358 }
359
360 if (!this.linesSkipped) {
361 for (int i = 0; i < skipLines; i++) {
362 lineReader.readLine();
363 linesRead++;
364 }
365 this.linesSkipped = true;
366 }
367 String nextLine = lineReader.readLine();
368 if (nextLine == null) {
369 hasNext = false;
370 } else {
371 linesRead++;
372 }
373
374 return hasNext ? nextLine : null;
375 }
376
377 /**
378 * Only useful for tests.
379 *
380 * @return The maximum number of lines allowed in a multiline record.
381 */
382 public int getMultilineLimit() {
383 return multilineLimit;
384 }
385
386 /**
387 * Checks to see if the file is closed.
388 * <p>Certain {@link IOException}s will be passed out, as they are
389 * indicative of a real problem, not that the file has already been closed.
390 * These exceptions are:<ul>
391 * <li>CharacterCodingException</li>
392 * <li>CharConversionException</li>
393 * <li>FileNotFoundException</li>
394 * <li>UnsupportedEncodingException</li>
395 * <li>UTFDataFormatException</li>
396 * <li>ZipException</li>
397 * <li>MalformedInputException</li>
398 * </ul></p>
399 *
400 * @return {@code true} if the reader can no longer be read from
401 * @throws IOException If {@link #verifyReader()} was set to {@code true}
402 * certain {@link IOException}s will still be passed out as they are
403 * indicative of a problem, not end of file.
404 */
405 protected boolean isClosed() throws IOException {
406 if (!verifyReader) {
407 return false;
408 }
409 try {
410 br.mark(READ_AHEAD_LIMIT);
411 int nextByte = br.read();
412 br.reset(); // resets stream position, possible because its buffered
413 return nextByte == -1; // read() returns -1 at end of stream
414 } catch (IOException e) {
415 if (PASSTHROUGH_EXCEPTIONS.contains(e.getClass())) {
416 throw e;
417 }
418
419 return true;
420 }
421 }
422
423 /**
424 * Closes the underlying reader.
425 *
426 * @throws IOException If the close fails
427 */
428 @Override
429 public void close() throws IOException {
430 br.close();
431 }
432
433 /**
434 * Creates an Iterator for processing the CSV data.
435 *
436 * @return A String[] iterator.
437 */
438 @Override
439 public Iterator<String[]> iterator() {
440 try {
441 CSVIterator it = new CSVIterator(this);
442 it.setErrorLocale(errorLocale);
443 return it;
444 } catch (IOException | CsvValidationException e) {
445 throw new RuntimeException(e);
446 }
447 }
448
449 /**
450 * Returns if the CSVReader will verify the reader before each read.
451 * <p>
452 * By default the value is true, which is the functionality for version 3.0.
453 * If set to false the reader is always assumed ready to read - this is the functionality
454 * for version 2.4 and before.
455 * </p>
456 * <p>
457 * The reason this method was needed was that certain types of readers would return
458 * false for their ready() methods until a read was done (namely readers created using Channels).
459 * This caused opencsv not to read from those readers.
460 * </p>
461 *
462 * @return True if CSVReader will verify the reader before reads. False otherwise.
463 * @see <a href="https://sourceforge.net/p/opencsv/bugs/108/">Bug 108</a>
464 * @since 3.3
465 */
466 public boolean verifyReader() {
467 return this.verifyReader;
468 }
469
470 /**
471 * This method returns the number of lines that
472 * has been read from the reader passed into the CSVReader.
473 * <p>
474 * Given the following data:</p>
475 * <pre>
476 * First line in the file
477 * some other descriptive line
478 * a,b,c
479 *
480 * a,"b\nb",c
481 * </pre>
482 * <p>
483 * With a CSVReader constructed like so:<br>
484 * <code>
485 * CSVReader c = builder.withCSVParser(new CSVParser())<br>
486 * .withSkipLines(2)<br>
487 * .build();<br>
488 * </code><br>
489 * The initial call to getLinesRead() will be 0. After the first call to
490 * readNext() then getLinesRead() will return 3 (because the header was read).
491 * After the second call to read the blank line then getLinesRead() will
492 * return 4 (still a read). After the third call to readNext(), getLinesRead()
493 * will return 6 because it took two line reads to retrieve this record.
494 * Subsequent calls to readNext() (since we are out of data) will not
495 * increment the number of lines read.</p>
496 *
497 * @return The number of lines read by the reader (including skipped lines).
498 * @since 3.6
499 */
500 public long getLinesRead() {
501 return linesRead;
502 }
503
504 /**
505 * Used for debugging purposes, this method returns the number of records
506 * that has been read from the CSVReader.
507 * <p>
508 * Given the following data:</p>
509 * <pre>
510 * First line in the file
511 * some other descriptive line
512 * a,b,c
513 * a,"b\nb",c
514 * </pre><p>
515 * With a CSVReader constructed like so:<br>
516 * <code>
517 * CSVReader c = builder.withCSVParser(new CSVParser())<br>
518 * .withSkipLines(2)<br>
519 * .build();<br>
520 * </code><br>
521 * The initial call to getRecordsRead() will be 0. After the first call to
522 * readNext() then getRecordsRead() will return 1. After the second call to
523 * read the blank line then getRecordsRead() will return 2 (a blank line is
524 * considered a record with one empty field). After third call to readNext()
525 * getRecordsRead() will return 3 because even though it reads to retrieve
526 * this record, it is still a single record read. Subsequent calls to
527 * readNext() (since we are out of data) will not increment the number of
528 * records read.
529 * </p>
530 * <p>
531 * An example of this is in the linesAndRecordsRead() test in CSVReaderTest.
532 * </p>
533 *
534 * @return The number of records (array of Strings[]) read by the reader.
535 * @see <a href="https://sourceforge.net/p/opencsv/feature-requests/73/">Feature Request 73</a>
536 * @since 3.6
537 */
538 public long getRecordsRead() {
539 return recordsRead;
540 }
541
542 /**
543 * Skip a given number of lines.
544 *
545 * @param numberOfLinesToSkip The number of lines to skip
546 * @throws IOException If anything bad happens when reading the file
547 * @since 4.2
548 */
549 public void skip(int numberOfLinesToSkip) throws IOException {
550 for (int j = 0; j < numberOfLinesToSkip; j++) {
551 readNextSilently();
552 }
553 }
554
555 /**
556 * Sets the locale for all error messages.
557 *
558 * @param errorLocale Locale for error messages. If null, the default locale
559 * is used.
560 * @since 4.2
561 */
562 public void setErrorLocale(Locale errorLocale) {
563 this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
564 if (parser != null) {
565 parser.setErrorLocale(this.errorLocale);
566 }
567 }
568
569 /**
570 * Returns the next line from the input without removing it from the
571 * CSVReader and not running any validators.
572 * Subsequent calls to this method will continue to return the same line
573 * until a call is made to {@link #readNext()} or any other method that
574 * advances the cursor position in the input. The first call to
575 * {@link #readNext()} after calling this method will return the same line
576 * this method does.
577 *
578 * @return The next line from the input, or {@code null} if there are no
579 * more lines
580 * @throws IOException If bad things happen during the read operation
581 * @since 4.2
582 */
583 public String[] peek() throws IOException {
584 String[] result = null;
585 try {
586 result = flexibleRead(false, false);
587 } catch (CsvValidationException e) {
588 // Do nothing. We asked for no validation, so it can't really happen.
589 }
590 return result;
591 }
592
593 /**
594 * Reads a line of input, popping or validating as desired.
595 *
596 * @param popLine Whether the line returned should be popped off the queue
597 * of input. If this is {@code true}, this method consumes
598 * the line and further calls will return the next line of
599 * input. If {@code false}, the line returned stays in the
600 * queue and further calls to this method will return the
601 * same line again.
602 * @param validate Whether all user-supplied validators should be run.
603 * @return The next line of input
604 * @throws IOException If this exception is thrown while reading
605 * @throws CsvValidationException If a user-supplied validator throws it
606 */
607 private String[] flexibleRead(boolean popLine, boolean validate) throws IOException, CsvValidationException {
608
609 if(peekedLines.isEmpty()) {
610 primeNextRecord();
611 }
612
613 if(validate) {
614 for(OrderedObject<String> orderedObject : peekedLines) {
615 validateLine(orderedObject.getOrdinal(), orderedObject.getElement());
616 }
617 validateResult(peekedLine, linesRead);
618 }
619
620 String[] result = peekedLine;
621
622 if(popLine) {
623 peekedLines.clear();
624 peekedLine = null;
625 if(result != null) {
626 recordsRead++;
627 }
628 }
629
630 return result;
631 }
632 }