View Javadoc
1   package com.opencsv;
2   
3   /*
4    Copyright 2005 Bytecode Pty Ltd.
5   
6    Licensed under the Apache License, Version 2.0 (the "License");
7    you may not use this file except in compliance with the License.
8    You may obtain a copy of the License at
9   
10   http://www.apache.org/licenses/LICENSE-2.0
11  
12   Unless required by applicable law or agreed to in writing, software
13   distributed under the License is distributed on an "AS IS" BASIS,
14   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   See the License for the specific language governing permissions and
16   limitations under the License.
17   */
18  
19  import com.opencsv.bean.util.OrderedObject;
20  import com.opencsv.exceptions.*;
21  import com.opencsv.processor.RowProcessor;
22  import com.opencsv.stream.reader.LineReader;
23  import com.opencsv.validators.LineValidatorAggregator;
24  import com.opencsv.validators.RowValidatorAggregator;
25  import org.apache.commons.lang3.ObjectUtils;
26  import org.apache.commons.lang3.StringUtils;
27  
28  import java.io.*;
29  import java.nio.charset.CharacterCodingException;
30  import java.nio.charset.MalformedInputException;
31  import java.util.*;
32  import java.util.zip.ZipException;
33  
34  /**
35   * A very simple CSV reader released under a commercial-friendly license.
36   *
37   * @author Glen Smith
38   */
39  public class CSVReader implements Closeable, Iterable<String[]> {
40  
41      public static final boolean DEFAULT_KEEP_CR = false;
42      public static final boolean DEFAULT_VERIFY_READER = true;
43      // context size in the exception message
44      static final int CONTEXT_MULTILINE_EXCEPTION_MESSAGE_SIZE = 100;
45  
46      /**
47       * The default line to start reading.
48       */
49      public static final int DEFAULT_SKIP_LINES = 0;
50  
51      /**
52       * The default limit for the number of lines in a multiline record.
53       * Less than one means no limit.
54       */
55      public static final int DEFAULT_MULTILINE_LIMIT = 0;
56  
57      protected static final List<Class<? extends IOException>> PASSTHROUGH_EXCEPTIONS =
58              Collections.unmodifiableList(
59                      Arrays.asList(CharacterCodingException.class, CharConversionException.class,
60                              UnsupportedEncodingException.class, UTFDataFormatException.class,
61                              ZipException.class, FileNotFoundException.class, MalformedInputException.class));
62  
63      public static final int READ_AHEAD_LIMIT = Character.SIZE / Byte.SIZE;
64      private static final int MAX_WIDTH = 100;
65      protected ICSVParser parser;
66      protected int skipLines;
67      protected BufferedReader br;
68      protected LineReader lineReader;
69      protected boolean hasNext = true;
70      protected boolean linesSkipped;
71      protected boolean keepCR;
72      protected boolean verifyReader;
73      protected int multilineLimit = DEFAULT_MULTILINE_LIMIT;
74      protected Locale errorLocale;
75  
76      protected long linesRead = 0;
77      protected long recordsRead = 0;
78      protected String[] peekedLine = null;
79      final protected Queue<OrderedObject<String>> peekedLines = new LinkedList<>();
80  
81      private final LineValidatorAggregator lineValidatorAggregator;
82      private final RowValidatorAggregator rowValidatorAggregator;
83      private final RowProcessor rowProcessor;
84  
85      /**
86       * Constructs CSVReader using defaults for all parameters.
87       *
88       * @param reader The reader to an underlying CSV source.
89       */
90      public CSVReader(Reader reader) {
91          this(reader, DEFAULT_SKIP_LINES,
92                  new CSVParser(ICSVParser.DEFAULT_SEPARATOR,
93                          ICSVParser.DEFAULT_QUOTE_CHARACTER,
94                          ICSVParser.DEFAULT_ESCAPE_CHARACTER,
95                          ICSVParser.DEFAULT_STRICT_QUOTES,
96                          ICSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
97                          ICSVParser.DEFAULT_IGNORE_QUOTATIONS,
98                          ICSVParser.DEFAULT_NULL_FIELD_INDICATOR,
99                          Locale.getDefault()),
100                 DEFAULT_KEEP_CR,
101                 DEFAULT_VERIFY_READER,
102                 DEFAULT_MULTILINE_LIMIT,
103                 Locale.getDefault(),
104                 new LineValidatorAggregator(),
105                 new RowValidatorAggregator(),
106                 null);
107     }
108 
109     /**
110      * Constructs CSVReader with supplied CSVParser.
111      * <p>This constructor sets all necessary parameters for CSVReader, and
112      * intentionally has package access so only the builder can use it.</p>
113      *
114      * @param reader                  The reader to an underlying CSV source
115      * @param line                    The number of lines to skip before reading
116      * @param icsvParser              The parser to use to parse input
117      * @param keepCR                  True to keep carriage returns in data read, false otherwise
118      * @param verifyReader            True to verify reader before each read, false otherwise
119      * @param multilineLimit          Allow the user to define the limit to the number of lines in a multiline record. Less than one means no limit.
120      * @param errorLocale             Set the locale for error messages. If null, the default locale is used.
121      * @param lineValidatorAggregator contains all the custom defined line validators.
122      * @param rowValidatorAggregator  contains all the custom defined row validators.
123      * @param rowProcessor            Custom row processor to run on all columns on a csv record.
124      */
125     CSVReader(Reader reader, int line, ICSVParser icsvParser, boolean keepCR, boolean verifyReader, int multilineLimit,
126               Locale errorLocale, LineValidatorAggregator lineValidatorAggregator, RowValidatorAggregator rowValidatorAggregator,
127               RowProcessor rowProcessor) {
128         this.br =
129                 (reader instanceof BufferedReader ?
130                         (BufferedReader) reader :
131                         new BufferedReader(reader));
132         this.lineReader = new LineReader(br, keepCR);
133         this.skipLines = line;
134         this.parser = icsvParser;
135         this.keepCR = keepCR;
136         this.verifyReader = verifyReader;
137         this.multilineLimit = multilineLimit;
138         this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
139         this.lineValidatorAggregator = lineValidatorAggregator;
140         this.rowValidatorAggregator = rowValidatorAggregator;
141         this.rowProcessor = rowProcessor;
142     }
143 
144     /**
145      * @return The CSVParser used by the reader.
146      */
147     public ICSVParser getParser() {
148         return parser;
149     }
150 
151     /**
152      * Returns the number of lines in the CSV file to skip before processing.
153      * This is useful when there are miscellaneous data at the beginning of a file.
154      *
155      * @return The number of lines in the CSV file to skip before processing.
156      */
157     public int getSkipLines() {
158         return skipLines;
159     }
160 
161     /**
162      * Returns if the reader will keep carriage returns found in data or remove them.
163      *
164      * @return True if reader will keep carriage returns, false otherwise.
165      */
166     public boolean keepCarriageReturns() {
167         return keepCR;
168     }
169 
170     /**
171      * Reads the entire file into a List with each element being a String[] of
172      * tokens.
173      * Since the current implementation returns a {@link java.util.LinkedList},
174      * you are strongly discouraged from using index-based access methods to
175      * get at items in the list. Instead, iterate over the list.
176      *
177      * @return A List of String[], with each String[] representing a line of the
178      * file.
179      * @throws IOException  If bad things happen during the read
180      * @throws CsvException If there is a failed validator
181      */
182     public List<String[]> readAll() throws IOException, CsvException {
183 
184         List<String[]> allElements = new LinkedList<>();
185         while (hasNext) {
186             String[] nextLineAsTokens = readNext();
187             if (nextLineAsTokens != null) {
188                 allElements.add(nextLineAsTokens);
189             }
190         }
191         return allElements;
192 
193     }
194 
195     /**
196      * Reads the next line from the buffer and converts to a string array.
197      *
198      * @return A string array with each comma-separated element as a separate
199      * entry, or null if there is no more input.
200      * @throws IOException            If bad things happen during the read
201      * @throws CsvValidationException If a user-defined validator fails
202      */
203     public String[] readNext() throws IOException, CsvValidationException {
204         return flexibleRead(true, true);
205     }
206 
207     /**
208      * Reads the next line from the buffer and converts to a string array without
209      * running the custom defined validators.  This is called by the bean readers when
210      * reading the header.
211      *
212      * @return A string array with each comma-separated element as a separate
213      * entry, or null if there is no more input.
214      * @throws IOException If bad things happen during the read.
215      */
216     public String[] readNextSilently() throws IOException {
217         try {
218             return flexibleRead(true, false);
219         } catch (CsvValidationException e) {
220             throw new CsvRuntimeException("A CSValidationException was thrown from the runNextSilently method which should not happen", e);
221         }
222     }
223 
224     /**
225      * Reads the next line from the buffer and converts to a string array.
226      * The results are stored in {@link #peekedLines} and {@link #peekedLine}.
227      *
228      * @throws IOException If bad things happen during the read
229      */
230     private void primeNextRecord() throws IOException {
231 
232         int linesInThisRecord = 0;
233         long lastSuccessfulLineRead = linesRead+1;
234         do {
235             String nextLine = getNextLine();
236             peekedLines.add(new OrderedObject<>(lastSuccessfulLineRead, nextLine));
237             linesInThisRecord++;
238 
239             // If no more input is available, check if the record is finished
240             // or simply incomplete.
241             if (!hasNext) {
242                 if (parser.isPending()) {
243                     throw new CsvMalformedLineException(String.format(
244                             ResourceBundle.getBundle(ICSVParser.DEFAULT_BUNDLE_NAME, errorLocale).getString("unterminated.quote"),
245                             StringUtils.abbreviate(parser.getPendingText(), MAX_WIDTH)), lastSuccessfulLineRead, parser.getPendingText());
246                 }
247                 return;
248             }
249 
250 
251             // If we've crossed the multiline limit, signal an error.
252             if (multilineLimit > 0 && linesInThisRecord > multilineLimit) {
253 
254                 // get current row records Read +1
255                 long row = this.recordsRead + 1L;
256 
257                 String context = parser.getPendingText();
258 
259                 // just to avoid out of index
260                 // to get the whole context use CsvMultilineLimitBrokenException::getContext()
261                 if (context.length() > CONTEXT_MULTILINE_EXCEPTION_MESSAGE_SIZE) {
262                     context = context.substring(0, CONTEXT_MULTILINE_EXCEPTION_MESSAGE_SIZE);
263                 }
264 
265                 String messageFormat = ResourceBundle.getBundle(ICSVParser.DEFAULT_BUNDLE_NAME, errorLocale).getString("multiline.limit.broken");
266                 String message = String.format(errorLocale, messageFormat, multilineLimit, row, context);
267                 throw new CsvMultilineLimitBrokenException(message, row, parser.getPendingText(), multilineLimit);
268             }
269 
270             // Combine multiple lines into one result
271             String[] r = parser.parseLineMulti(nextLine);
272             if (r.length > 0) {
273                 if (peekedLine == null) {
274                     peekedLine = r;
275                 } else {
276                     peekedLine = combineResultsFromMultipleReads(peekedLine, r);
277                 }
278             }
279 
280         } while (parser.isPending());
281 
282         /*
283          for bug #233 (https://sourceforge.net/p/opencsv/bugs/233/) if we want to keep carriage returns we ONLY
284          want to keep the carriage returns in the data and not from the end of lines if we were in a Windows system.
285          */
286 
287         if (keepCR) {
288             int lastItemIndex = peekedLine.length - 1;
289             if (peekedLine[lastItemIndex] != null && peekedLine[lastItemIndex].endsWith("\r")) {
290                 peekedLine[lastItemIndex] = peekedLine[lastItemIndex].substring(0, peekedLine[lastItemIndex].length() - 1);
291             }
292         }
293     }
294 
295     /**
296      * Runs all line validators on the input.
297      *
298      * @param lastSuccessfulLineRead The line number for error messages
299      * @param nextLine The input to be validated
300      * @throws CsvValidationException Only thrown if a user-supplied validator
301      *   throws it
302      */
303     private void validateLine(long lastSuccessfulLineRead, String nextLine) throws CsvValidationException {
304         try {
305             lineValidatorAggregator.validate(nextLine);
306         } catch (CsvValidationException cve) {
307             cve.setLineNumber(lastSuccessfulLineRead);
308             throw cve;
309         }
310     }
311 
312     /**
313      * Increments the number of records read if the result passed in is not null.
314      *
315      * @param result           The result of the read operation
316      * @param lineStartOfRow   Line number that the row started on
317      * @throws CsvValidationException if there is a validation error caught by a custom RowValidator.
318      */
319     protected void validateResult(String[] result, long lineStartOfRow) throws CsvValidationException {
320         if (result != null) {
321             if (rowProcessor != null) {
322                 rowProcessor.processRow(result);
323             }
324             try {
325                 rowValidatorAggregator.validate(result);
326             } catch (CsvValidationException cve) {
327                 cve.setLineNumber(lineStartOfRow);
328                 throw cve;
329             }
330         }
331     }
332 
333     /**
334      * For multi-line records this method combines the current result with the result from previous read(s).
335      *
336      * @param buffer   Previous data read for this record
337      * @param lastRead Latest data read for this record.
338      * @return String array with union of the buffer and lastRead arrays.
339      */
340     protected String[] combineResultsFromMultipleReads(String[] buffer, String[] lastRead) {
341         String[] t = new String[buffer.length + lastRead.length];
342         System.arraycopy(buffer, 0, t, 0, buffer.length);
343         System.arraycopy(lastRead, 0, t, buffer.length, lastRead.length);
344         return t;
345     }
346 
347     /**
348      * Reads the next line from the file.
349      *
350      * @return The next line from the file without trailing newline, or null if
351      * there is no more input.
352      * @throws IOException If bad things happen during the read
353      */
354     protected String getNextLine() throws IOException {
355         if (isClosed()) {
356             hasNext = false;
357             return null;
358         }
359 
360         if (!this.linesSkipped) {
361             for (int i = 0; i < skipLines; i++) {
362                 lineReader.readLine();
363                 linesRead++;
364             }
365             this.linesSkipped = true;
366         }
367         String nextLine = lineReader.readLine();
368         if (nextLine == null) {
369             hasNext = false;
370         } else {
371             linesRead++;
372         }
373 
374         return hasNext ? nextLine : null;
375     }
376 
377     /**
378      * Only useful for tests.
379      *
380      * @return The maximum number of lines allowed in a multiline record.
381      */
382     public int getMultilineLimit() {
383         return multilineLimit;
384     }
385 
386     /**
387      * Checks to see if the file is closed.
388      * <p>Certain {@link IOException}s will be passed out, as they are
389      * indicative of a real problem, not that the file has already been closed.
390      * These exceptions are:<ul>
391      *     <li>CharacterCodingException</li>
392      *     <li>CharConversionException</li>
393      *     <li>FileNotFoundException</li>
394      *     <li>UnsupportedEncodingException</li>
395      *     <li>UTFDataFormatException</li>
396      *     <li>ZipException</li>
397      *     <li>MalformedInputException</li>
398      * </ul></p>
399      *
400      * @return {@code true} if the reader can no longer be read from
401      * @throws IOException If {@link #verifyReader()} was set to {@code true}
402      *   certain {@link IOException}s will still be passed out as they are
403      *   indicative of a problem, not end of file.
404      */
405     protected boolean isClosed() throws IOException {
406         if (!verifyReader) {
407             return false;
408         }
409         try {
410             br.mark(READ_AHEAD_LIMIT);
411             int nextByte = br.read();
412             br.reset(); // resets stream position, possible because its buffered
413             return nextByte == -1; // read() returns -1 at end of stream
414         } catch (IOException e) {
415             if (PASSTHROUGH_EXCEPTIONS.contains(e.getClass())) {
416                 throw e;
417             }
418 
419             return true;
420         }
421     }
422 
423     /**
424      * Closes the underlying reader.
425      *
426      * @throws IOException If the close fails
427      */
428     @Override
429     public void close() throws IOException {
430         br.close();
431     }
432 
433     /**
434      * Creates an Iterator for processing the CSV data.
435      *
436      * @return A String[] iterator.
437      */
438     @Override
439     public Iterator<String[]> iterator() {
440         try {
441             CSVIterator it = new CSVIterator(this);
442             it.setErrorLocale(errorLocale);
443             return it;
444         } catch (IOException | CsvValidationException e) {
445             throw new RuntimeException(e);
446         }
447     }
448 
449     /**
450      * Returns if the CSVReader will verify the reader before each read.
451      * <p>
452      * By default the value is true, which is the functionality for version 3.0.
453      * If set to false the reader is always assumed ready to read - this is the functionality
454      * for version 2.4 and before.
455      * </p>
456      * <p>
457      * The reason this method was needed was that certain types of readers would return
458      * false for their ready() methods until a read was done (namely readers created using Channels).
459      * This caused opencsv not to read from those readers.
460      * </p>
461      *
462      * @return True if CSVReader will verify the reader before reads.  False otherwise.
463      * @see <a href="https://sourceforge.net/p/opencsv/bugs/108/">Bug 108</a>
464      * @since 3.3
465      */
466     public boolean verifyReader() {
467         return this.verifyReader;
468     }
469 
470     /**
471      * This method returns the number of lines that
472      * has been read from the reader passed into the CSVReader.
473      * <p>
474      * Given the following data:</p>
475      * <pre>
476      * First line in the file
477      * some other descriptive line
478      * a,b,c
479      *
480      * a,"b\nb",c
481      * </pre>
482      * <p>
483      * With a CSVReader constructed like so:<br>
484      * <code>
485      * CSVReader c = builder.withCSVParser(new CSVParser())<br>
486      * .withSkipLines(2)<br>
487      * .build();<br>
488      * </code><br>
489      * The initial call to getLinesRead() will be 0. After the first call to
490      * readNext() then getLinesRead() will return 3 (because the header was read).
491      * After the second call to read the blank line then getLinesRead() will
492      * return 4 (still a read). After the third call to readNext(), getLinesRead()
493      * will return 6 because it took two line reads to retrieve this record.
494      * Subsequent calls to readNext() (since we are out of data) will not
495      * increment the number of lines read.</p>
496      *
497      * @return The number of lines read by the reader (including skipped lines).
498      * @since 3.6
499      */
500     public long getLinesRead() {
501         return linesRead;
502     }
503 
504     /**
505      * Used for debugging purposes, this method returns the number of records
506      * that has been read from the CSVReader.
507      * <p>
508      * Given the following data:</p>
509      * <pre>
510      * First line in the file
511      * some other descriptive line
512      * a,b,c
513      * a,"b\nb",c
514      * </pre><p>
515      * With a CSVReader constructed like so:<br>
516      * <code>
517      * CSVReader c = builder.withCSVParser(new CSVParser())<br>
518      * .withSkipLines(2)<br>
519      * .build();<br>
520      * </code><br>
521      * The initial call to getRecordsRead() will be 0. After the first call to
522      * readNext() then getRecordsRead() will return 1. After the second call to
523      * read the blank line then getRecordsRead() will return 2 (a blank line is
524      * considered a record with one empty field). After third call to readNext()
525      * getRecordsRead() will return 3 because even though it reads to retrieve
526      * this record, it is still a single record read. Subsequent calls to
527      * readNext() (since we are out of data) will not increment the number of
528      * records read.
529      * </p>
530      * <p>
531      * An example of this is in the linesAndRecordsRead() test in CSVReaderTest.
532      * </p>
533      *
534      * @return The number of records (array of Strings[]) read by the reader.
535      * @see <a href="https://sourceforge.net/p/opencsv/feature-requests/73/">Feature Request 73</a>
536      * @since 3.6
537      */
538     public long getRecordsRead() {
539         return recordsRead;
540     }
541 
542     /**
543      * Skip a given number of lines.
544      *
545      * @param numberOfLinesToSkip The number of lines to skip
546      * @throws IOException If anything bad happens when reading the file
547      * @since 4.2
548      */
549     public void skip(int numberOfLinesToSkip) throws IOException {
550         for (int j = 0; j < numberOfLinesToSkip; j++) {
551             readNextSilently();
552         }
553     }
554 
555     /**
556      * Sets the locale for all error messages.
557      *
558      * @param errorLocale Locale for error messages. If null, the default locale
559      *                    is used.
560      * @since 4.2
561      */
562     public void setErrorLocale(Locale errorLocale) {
563         this.errorLocale = ObjectUtils.defaultIfNull(errorLocale, Locale.getDefault());
564         if (parser != null) {
565             parser.setErrorLocale(this.errorLocale);
566         }
567     }
568 
569     /**
570      * Returns the next line from the input without removing it from the
571      * CSVReader and not running any validators.
572      * Subsequent calls to this method will continue to return the same line
573      * until a call is made to {@link #readNext()} or any other method that
574      * advances the cursor position in the input. The first call to
575      * {@link #readNext()} after calling this method will return the same line
576      * this method does.
577      *
578      * @return The next line from the input, or {@code null} if there are no
579      *   more lines
580      * @throws IOException If bad things happen during the read operation
581      * @since 4.2
582      */
583     public String[] peek() throws IOException {
584         String[] result = null;
585         try {
586             result = flexibleRead(false, false);
587         } catch (CsvValidationException e) {
588             // Do nothing. We asked for no validation, so it can't really happen.
589         }
590         return result;
591     }
592 
593     /**
594      * Reads a line of input, popping or validating as desired.
595      *
596      * @param popLine Whether the line returned should be popped off the queue
597      *                of input. If this is {@code true}, this method consumes
598      *                the line and further calls will return the next line of
599      *                input. If {@code false}, the line returned stays in the
600      *                queue and further calls to this method will return the
601      *                same line again.
602      * @param validate Whether all user-supplied validators should be run.
603      * @return The next line of input
604      * @throws IOException If this exception is thrown while reading
605      * @throws CsvValidationException If a user-supplied validator throws it
606      */
607     private String[] flexibleRead(boolean popLine, boolean validate) throws IOException, CsvValidationException {
608 
609         if(peekedLines.isEmpty()) {
610             primeNextRecord();
611         }
612 
613         if(validate) {
614             for(OrderedObject<String> orderedObject : peekedLines) {
615                 validateLine(orderedObject.getOrdinal(), orderedObject.getElement());
616             }
617             validateResult(peekedLine, linesRead);
618         }
619 
620         String[] result = peekedLine;
621 
622         if(popLine) {
623             peekedLines.clear();
624             peekedLine = null;
625             if(result != null) {
626                 recordsRead++;
627             }
628         }
629 
630         return result;
631     }
632 }