View Javadoc

1   package au.com.bytecode.opencsv;
2   
3   /**
4    Copyright 2005 Bytecode Pty Ltd.
5   
6    Licensed under the Apache License, Version 2.0 (the "License");
7    you may not use this file except in compliance with the License.
8    You may obtain a copy of the License at
9   
10   http://www.apache.org/licenses/LICENSE-2.0
11  
12   Unless required by applicable law or agreed to in writing, software
13   distributed under the License is distributed on an "AS IS" BASIS,
14   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   See the License for the specific language governing permissions and
16   limitations under the License.
17   */
18  
19  import java.io.IOException;
20  import java.util.ArrayList;
21  import java.util.List;
22  
/**
 * A very simple CSV parser released under a commercial-friendly license.
 * This just implements splitting a single line into fields.
 * <p>
 * A parser instance holds mutable state between calls (the pending content of
 * a quoted field that spans several lines) and performs no synchronization, so
 * one instance must not be shared between threads without external locking.
 *
 * @author Glen Smith
 * @author Rainer Pruy
 */
public class CSVParser {

    private final char separator;

    private final char quotechar;

    private final char escape;

    private final boolean strictQuotes;

    /** Content of a quoted field left open by the previous multi-line call, or null. */
    private String pending;

    /** True while the scanner is inside field content; only carried over together with {@link #pending}. */
    private boolean inField = false;

    private final boolean ignoreLeadingWhiteSpace;

    private final boolean ignoreQuotations;

    /**
     * The default separator to use if none is supplied to the constructor.
     */
    public static final char DEFAULT_SEPARATOR = ',';

    /**
     * Initial capacity of the buffer used to accumulate the characters of a field.
     */
    public static final int INITIAL_READ_SIZE = 128;

    /**
     * The default quote character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_QUOTE_CHARACTER = '"';


    /**
     * The default escape character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_ESCAPE_CHARACTER = '\\';

    /**
     * The default strict quote behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_STRICT_QUOTES = false;

    /**
     * The default leading whitespace behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_IGNORE_LEADING_WHITESPACE = true;

    /**
     * The default ignore-quotations behavior to use if none is supplied to the
     * constructor. When quotations are ignored, a quote character no longer
     * protects separators: a separator always ends the current field, and an
     * unbalanced quote never causes a multi-line continuation or an error.
     */
    public static final boolean DEFAULT_IGNORE_QUOTATIONS = false;

    /**
     * This is the "null" character - if a value is set to this then it is ignored.
     */
    static final char NULL_CHARACTER = '\0';

    /**
     * Constructs CSVParser using a comma for the separator.
     */
    public CSVParser() {
        this(DEFAULT_SEPARATOR, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator.
     *
     * @param separator the delimiter to use for separating entries.
     */
    public CSVParser(char separator) {
        this(separator, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }


    /**
     * Constructs CSVParser with supplied separator and quote char.
     *
     * @param separator the delimiter to use for separating entries
     * @param quotechar the character to use for quoted elements
     */
    public CSVParser(char separator, char quotechar) {
        this(separator, quotechar, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     *
     * @param separator the delimiter to use for separating entries
     * @param quotechar the character to use for quoted elements
     * @param escape    the character to use for escaping a separator or quote
     */
    public CSVParser(char separator, char quotechar, char escape) {
        this(separator, quotechar, escape, DEFAULT_STRICT_QUOTES);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes" flag.
     *
     * @param separator    the delimiter to use for separating entries
     * @param quotechar    the character to use for quoted elements
     * @param escape       the character to use for escaping a separator or quote
     * @param strictQuotes if true, characters outside the quotes are ignored
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes) {
        this(separator, quotechar, escape, strictQuotes, DEFAULT_IGNORE_LEADING_WHITESPACE);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes" and "ignore leading whitespace" flags.
     *
     * @param separator               the delimiter to use for separating entries
     * @param quotechar               the character to use for quoted elements
     * @param escape                  the character to use for escaping a separator or quote
     * @param strictQuotes            if true, characters outside the quotes are ignored
     * @param ignoreLeadingWhiteSpace if true, white space in front of a quote in a field is ignored
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace) {
        this(separator, quotechar, escape, strictQuotes, ignoreLeadingWhiteSpace, DEFAULT_IGNORE_QUOTATIONS);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes", "ignore leading whitespace" and
     * "ignore quotations" flags.
     *
     * @param separator               the delimiter to use for separating entries
     * @param quotechar               the character to use for quoted elements
     * @param escape                  the character to use for escaping a separator or quote
     * @param strictQuotes            if true, characters outside the quotes are ignored
     * @param ignoreLeadingWhiteSpace if true, white space in front of a quote in a field is ignored
     * @param ignoreQuotations        if true, quotes do not protect separators and never leave a field open
     * @throws UnsupportedOperationException if two of the separator, quote and escape characters are the
     *                                       same (and not the null character), or if the separator is the
     *                                       null character
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace,
                     boolean ignoreQuotations) {
        if (anyCharactersAreTheSame(separator, quotechar, escape)) {
            throw new UnsupportedOperationException("The separator, quote, and escape characters must be different!");
        }
        if (separator == NULL_CHARACTER) {
            throw new UnsupportedOperationException("The separator character must be defined!");
        }
        this.separator = separator;
        this.quotechar = quotechar;
        this.escape = escape;
        this.strictQuotes = strictQuotes;
        this.ignoreLeadingWhiteSpace = ignoreLeadingWhiteSpace;
        this.ignoreQuotations = ignoreQuotations;
    }

    /**
     * @return true if any two of the three characters collide (null characters never collide)
     */
    private boolean anyCharactersAreTheSame(char separator, char quotechar, char escape) {
        return isSameCharacter(separator, quotechar) || isSameCharacter(separator, escape) || isSameCharacter(quotechar, escape);
    }

    /**
     * @return true if both characters are equal and are not the null character
     */
    private boolean isSameCharacter(char c1, char c2) {
        return c1 != NULL_CHARACTER && c1 == c2;
    }

    /**
     * @return true if something was left over from last call(s)
     */
    public boolean isPending() {
        return pending != null;
    }

    /**
     * Parses one physical line of a record that may span several lines.
     * If the line ends inside a quoted field, the partial field is kept as
     * pending content and is completed by the following call(s).
     *
     * @param nextLine the string to parse; null flushes any pending content
     * @return the fields completed by this line, or null if nextLine is null and nothing is pending
     * @throws IOException declared for symmetry with {@link #parseLine(String)}
     */
    public String[] parseLineMulti(String nextLine) throws IOException {
        return parseLine(nextLine, true);
    }

    /**
     * Parses a single, self-contained line. Any content left pending by an
     * earlier multi-line call is discarded.
     *
     * @param nextLine the string to parse
     * @return the tokenized list of elements, or null if nextLine is null
     * @throws IOException if the line ends inside an unterminated quoted field
     */
    public String[] parseLine(String nextLine) throws IOException {
        return parseLine(nextLine, false);
    }

    /**
     * Parses an incoming String and returns an array of elements.
     *
     * @param nextLine the string to parse
     * @param multi    true if a quoted field may continue on the next line; false if an
     *                 unterminated quoted field is an error
     * @return the tokenized list of elements, or null if nextLine is null and nothing is pending
     * @throws IOException if the line ends inside a quoted field and multi is false
     */
    private String[] parseLine(String nextLine, boolean multi) throws IOException {

        if (!multi && pending != null) {
            pending = null;
        }

        if (pending == null) {
            // Nothing is being continued, so this call starts a fresh record.
            // Without this reset the flag left by the previous line (or by a
            // call that ended in an exception) leaks into this one and makes
            // a leading "" look like an escaped quote.
            inField = false;
        }

        if (nextLine == null) {
            if (pending != null) {
                String s = pending;
                pending = null;
                inField = false;
                return new String[]{s};
            } else {
                return null;
            }
        }

        List<String> tokensOnThisLine = new ArrayList<String>();
        StringBuilder sb = new StringBuilder(INITIAL_READ_SIZE);
        boolean inQuotes = false;
        if (pending != null) {
            // resume the quoted field that the previous line left open
            sb.append(pending);
            pending = null;
            inQuotes = !this.ignoreQuotations;
        }
        for (int i = 0; i < nextLine.length(); i++) {

            char c = nextLine.charAt(i);
            if (c == this.escape) {
                // an escape that is not followed by an escapable character is dropped
                if (isNextCharacterEscapable(nextLine, inQuotes(inQuotes) || inField, i)) {
                    sb.append(nextLine.charAt(i + 1));
                    i++;
                }
            } else if (c == quotechar) {
                if (isNextCharacterEscapedQuote(nextLine, inQuotes(inQuotes) || inField, i)) {
                    // doubled quote: emit a single literal quote
                    sb.append(nextLine.charAt(i + 1));
                    i++;
                } else {
                    inQuotes = !inQuotes;

                    // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
                    // NOTE(review): the guard is i > 2, so a quote at index 1 or 2 is never
                    // treated as embedded; kept as-is for backward compatibility.
                    if (!strictQuotes) {
                        if (i > 2 //not on the beginning of the line
                                && nextLine.charAt(i - 1) != this.separator //not at the beginning of an escape sequence
                                && nextLine.length() > (i + 1) &&
                                nextLine.charAt(i + 1) != this.separator //not at the end of an escape sequence
                                ) {

                            if (ignoreLeadingWhiteSpace && sb.length() > 0 && isAllWhiteSpace(sb)) {
                                sb = new StringBuilder(INITIAL_READ_SIZE);  //discard white space leading up to quote
                            } else {
                                sb.append(c);
                            }

                        }
                    }
                }
                inField = !inField;
            } else if (c == separator && !inQuotes(inQuotes)) {
                tokensOnThisLine.add(sb.toString());
                sb = new StringBuilder(INITIAL_READ_SIZE); // start work on next token
                inField = false;
            } else {
                // with strict quotes, characters outside a quoted section are skipped
                if (!strictQuotes || inQuotes(inQuotes)) {
                    sb.append(c);
                    inField = true;
                }
            }
        }
        // line is done - check status
        if (inQuotes(inQuotes)) {
            if (multi) {
                // continuing a quoted section, re-append newline
                sb.append('\n');
                pending = sb.toString();
                sb = null; // this partial content is not to be added to field list yet
            } else {
                throw new IOException("Un-terminated quoted field at end of CSV line");
            }
        }
        if (sb != null) {
            tokensOnThisLine.add(sb.toString());
        }
        return tokensOnThisLine.toArray(new String[tokensOnThisLine.size()]);

    }

    /**
     * Combines the scanner's raw quote state with the ignore-quotations setting.
     *
     * @param inQuotes true if an odd number of unescaped quotes has been seen so far
     * @return true if the scanner is inside a quoted section that must be honored
     */
    private boolean inQuotes(boolean inQuotes) {
        return inQuotes && !ignoreQuotations;
    }

    /**
     * precondition: the current character is a quote or an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i        current index in line
     * @return true if the following character is a quote
     */
    private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
                && nextLine.length() > (i + 1)  // there is indeed another character to check.
                && nextLine.charAt(i + 1) == quotechar;
    }

    /**
     * precondition: the current character is an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i        current index in line
     * @return true if the following character is a quote or an escape character
     */
    protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
                && nextLine.length() > (i + 1)  // there is indeed another character to check.
                && (nextLine.charAt(i + 1) == quotechar || nextLine.charAt(i + 1) == this.escape);
    }

    /**
     * precondition: sb.length() > 0 (an empty sequence yields true)
     *
     * @param sb A sequence of characters to examine
     * @return true if every character in the sequence is whitespace
     */
    protected boolean isAllWhiteSpace(CharSequence sb) {
        for (int i = 0; i < sb.length(); i++) {
            if (!Character.isWhitespace(sb.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}