1 package au.com.bytecode.opencsv;
2
3 /**
4 Copyright 2005 Bytecode Pty Ltd.
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 */
18
19 import java.io.IOException;
20 import java.util.ArrayList;
21 import java.util.List;
22
/**
 * A very simple CSV parser released under a commercial-friendly license.
 * This just implements splitting a single line into fields.
 * <p>
 * An instance keeps mutable state between calls (the pending content of an
 * unfinished quoted field and an "inside a field" flag) and performs no
 * synchronization of its own.
 *
 * @author Glen Smith
 * @author Rainer Pruy
 */
public class CSVParser {

    /** Character that separates two fields. */
    private final char separator;

    /** Character that opens and closes a quoted field. */
    private final char quotechar;

    /** Character that escapes a following quote or escape character. */
    private final char escape;

    /** If true, characters outside of quotes are dropped. */
    private final boolean strictQuotes;

    /** Content of a quoted field that was still open at the end of the last multi-line call. */
    private String pending;

    /** True while the scanner is inside a field of the record currently being parsed. */
    private boolean inField = false;

    /** If true, white space in front of a quote in a field is discarded. */
    private final boolean ignoreLeadingWhiteSpace;

    /** If true, the quote state never protects separators and never causes a continuation. */
    private final boolean ignoreQuotations;

    /**
     * The default separator to use if none is supplied to the constructor.
     */
    public static final char DEFAULT_SEPARATOR = ',';

    /**
     * Initial capacity of the buffer used to collect the characters of a field.
     */
    public static final int INITIAL_READ_SIZE = 128;

    /**
     * The default quote character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_QUOTE_CHARACTER = '"';

    /**
     * The default escape character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_ESCAPE_CHARACTER = '\\';

    /**
     * The default strict quote behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_STRICT_QUOTES = false;

    /**
     * The default leading whitespace behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_IGNORE_LEADING_WHITESPACE = true;

    /**
     * The default "ignore quotations" behavior to use if none is supplied to the
     * constructor. When quotations are ignored, a separator always ends the
     * current field, even between two quote characters.
     */
    public static final boolean DEFAULT_IGNORE_QUOTATIONS = false;

    /**
     * This is the "null" character - if a value is set to this then it is ignored.
     */
    static final char NULL_CHARACTER = '\0';

    /**
     * Constructs CSVParser using a comma for the separator.
     */
    public CSVParser() {
        this(DEFAULT_SEPARATOR, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator.
     *
     * @param separator the delimiter to use for separating entries.
     */
    public CSVParser(char separator) {
        this(separator, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator and quote char.
     *
     * @param separator the delimiter to use for separating entries
     * @param quotechar the character to use for quoted elements
     */
    public CSVParser(char separator, char quotechar) {
        this(separator, quotechar, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     *
     * @param separator the delimiter to use for separating entries
     * @param quotechar the character to use for quoted elements
     * @param escape    the character to use for escaping a separator or quote
     */
    public CSVParser(char separator, char quotechar, char escape) {
        this(separator, quotechar, escape, DEFAULT_STRICT_QUOTES);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes" flag.
     *
     * @param separator    the delimiter to use for separating entries
     * @param quotechar    the character to use for quoted elements
     * @param escape       the character to use for escaping a separator or quote
     * @param strictQuotes if true, characters outside the quotes are ignored
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes) {
        this(separator, quotechar, escape, strictQuotes, DEFAULT_IGNORE_LEADING_WHITESPACE);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes" and "ignore leading whitespace" flags.
     *
     * @param separator               the delimiter to use for separating entries
     * @param quotechar               the character to use for quoted elements
     * @param escape                  the character to use for escaping a separator or quote
     * @param strictQuotes            if true, characters outside the quotes are ignored
     * @param ignoreLeadingWhiteSpace if true, white space in front of a quote in a field is ignored
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace) {
        this(separator, quotechar, escape, strictQuotes, ignoreLeadingWhiteSpace, DEFAULT_IGNORE_QUOTATIONS);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes", "ignore leading whitespace" and
     * "ignore quotations" flags.
     *
     * @param separator               the delimiter to use for separating entries
     * @param quotechar               the character to use for quoted elements
     * @param escape                  the character to use for escaping a separator or quote
     * @param strictQuotes            if true, characters outside the quotes are ignored
     * @param ignoreLeadingWhiteSpace if true, white space in front of a quote in a field is ignored
     * @param ignoreQuotations        if true, separators always split fields, even between quote
     *                                characters, and an open quote at the end of a line is not an error
     * @throws UnsupportedOperationException if two of separator, quote and escape are the same
     *                                       (non-null) character, or if the separator is the
     *                                       null character
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace,
                     boolean ignoreQuotations) {
        if (anyCharactersAreTheSame(separator, quotechar, escape)) {
            throw new UnsupportedOperationException("The separator, quote, and escape characters must be different!");
        }
        if (separator == NULL_CHARACTER) {
            throw new UnsupportedOperationException("The separator character must be defined!");
        }
        this.separator = separator;
        this.quotechar = quotechar;
        this.escape = escape;
        this.strictQuotes = strictQuotes;
        this.ignoreLeadingWhiteSpace = ignoreLeadingWhiteSpace;
        this.ignoreQuotations = ignoreQuotations;
    }

    /**
     * @return true if any two of the three characters are equal and not the null character
     */
    private boolean anyCharactersAreTheSame(char separator, char quotechar, char escape) {
        return isSameCharacter(separator, quotechar) || isSameCharacter(separator, escape) || isSameCharacter(quotechar, escape);
    }

    /**
     * @return true if both characters are equal and are not the null character
     */
    private boolean isSameCharacter(char c1, char c2) {
        return c1 != NULL_CHARACTER && c1 == c2;
    }

    /**
     * @return true if something was left over from last call(s)
     */
    public boolean isPending() {
        return pending != null;
    }

    /**
     * Parses one physical line of a record that may span several lines.
     * If the line ends inside a quoted field, the partial field is kept as
     * pending content and prepended to the next line passed to this method.
     *
     * @param nextLine the string to parse, or null to flush pending content
     * @return the fields completed by this line; a single-element array with the
     *         pending content if nextLine is null and something is pending;
     *         null if nextLine is null and nothing is pending
     * @throws IOException declared for interface compatibility
     */
    public String[] parseLineMulti(String nextLine) throws IOException {
        return parseLine(nextLine, true);
    }

    /**
     * Parses one self-contained line. Any pending content from an earlier
     * multi-line call is discarded first.
     *
     * @param nextLine the string to parse
     * @return the fields of the line, or null if nextLine is null
     * @throws IOException if the line ends inside a quoted field
     */
    public String[] parseLine(String nextLine) throws IOException {
        return parseLine(nextLine, false);
    }

    /**
     * Parses an incoming String and returns an array of elements.
     *
     * @param nextLine the string to parse
     * @param multi    true if a quoted field left open at the end of the line is to be
     *                 continued by the next call; false if that situation is an error
     * @return the separator-tokenized list of elements, or null if nextLine is null
     *         and nothing is pending
     * @throws IOException if the line ends inside a quoted field and multi is false
     */
    private String[] parseLine(String nextLine, boolean multi) throws IOException {

        if (!multi && pending != null) {
            // single-line mode never continues an earlier record
            pending = null;
        }

        if (nextLine == null) {
            if (pending != null) {
                // flush what was collected so far as one final field
                String s = pending;
                pending = null;
                return new String[]{s};
            }
            return null;
        }

        List<String> tokensOnThisLine = new ArrayList<String>();
        StringBuilder sb = new StringBuilder(INITIAL_READ_SIZE);
        boolean inQuotes = false;
        if (pending != null) {
            // continue the quoted field that was left open by the previous line
            sb.append(pending);
            pending = null;
            inQuotes = !this.ignoreQuotations;
        } else {
            // A new record starts here. The in-field flag of the previous record
            // must not leak into it, otherwise a leading quote that is followed by
            // another quote would be taken for an escaped quote.
            inField = false;
        }

        for (int i = 0; i < nextLine.length(); i++) {

            char c = nextLine.charAt(i);
            if (c == this.escape) {
                // an escape that does not precede a quote or escape character is dropped
                if (isNextCharacterEscapable(nextLine, (inQuotes && !ignoreQuotations) || inField, i)) {
                    sb.append(nextLine.charAt(i + 1));
                    i++;
                }
            } else if (c == quotechar) {
                if (isNextCharacterEscapedQuote(nextLine, (inQuotes && !ignoreQuotations) || inField, i)) {
                    // doubled quote: emit one quote and skip the second one
                    sb.append(nextLine.charAt(i + 1));
                    i++;
                } else {
                    inQuotes = !inQuotes;

                    // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
                    if (!strictQuotes) {
                        if (i > 2 //not on the beginning of the line
                                && nextLine.charAt(i - 1) != this.separator //not at the beginning of an escape sequence
                                && nextLine.length() > (i + 1)
                                && nextLine.charAt(i + 1) != this.separator //not at the end of an escape sequence
                                ) {

                            if (ignoreLeadingWhiteSpace && sb.length() > 0 && isAllWhiteSpace(sb)) {
                                sb.setLength(0); //discard white space leading up to quote
                            } else {
                                sb.append(c);
                            }

                        }
                    }
                }
                inField = !inField;
            } else if (c == separator && !(inQuotes && !ignoreQuotations)) {
                tokensOnThisLine.add(sb.toString());
                sb.setLength(0); // start work on next token
                inField = false;
            } else {
                // with strict quotes only characters inside quotes are kept
                if (!strictQuotes || (inQuotes && !ignoreQuotations)) {
                    sb.append(c);
                    inField = true;
                }
            }
        }

        // line is done - check status
        if (inQuotes && !ignoreQuotations) {
            if (multi) {
                // continuing a quoted section, re-append newline
                sb.append('\n');
                pending = sb.toString();
                sb = null; // this partial content is not to be added to field list yet
            } else {
                throw new IOException("Un-terminated quoted field at end of CSV line");
            }
        }
        if (sb != null) {
            tokensOnThisLine.add(sb.toString());
        }
        return tokensOnThisLine.toArray(new String[tokensOnThisLine.size()]);

    }

    /**
     * precondition: the current character is a quote or an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i        current index in line
     * @return true if the following character is a quote
     */
    private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
                && nextLine.length() > (i + 1)  // there is indeed another character to check.
                && nextLine.charAt(i + 1) == quotechar;
    }

    /**
     * precondition: the current character is an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i        current index in line
     * @return true if the following character is a quote or an escape character
     */
    protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
                && nextLine.length() > (i + 1)  // there is indeed another character to check.
                && (nextLine.charAt(i + 1) == quotechar || nextLine.charAt(i + 1) == this.escape);
    }

    /**
     * precondition: sb.length() > 0
     *
     * @param sb A sequence of characters to examine
     * @return true if every character in the sequence is whitespace
     */
    protected boolean isAllWhiteSpace(CharSequence sb) {
        for (int i = 0; i < sb.length(); i++) {
            if (!Character.isWhitespace(sb.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}