View Javadoc
1   package com.opencsv;
2   
3   import com.opencsv.enums.CSVReaderNullFieldIndicator;
4   import org.apache.commons.lang3.ArrayUtils;
5   import org.apache.commons.lang3.StringUtils;
6   
7   import java.util.ArrayList;
8   import java.util.List;
9   import java.util.Locale;
10  
11  /**
12   * This Parser is meant to parse csv data according to the RFC4180 specification.
13   * <p>Since it shares the same interface with the CSVParser there are methods here that will do nothing.
14   * For example, the RFC4180 specification does not have a concept of an escape character, so the getEscape method
15   * will return char 0.  The methods that are not supported are noted in the Javadocs.</p>
16   * <p>Another departure from the CSVParser is that there are only two constructors and only one is available publicly.
17   * The intent is that if you want to create anything other than a default RFC4180Parser you should use the
18   * CSVParserBuilder.  This way the code will not become cluttered with constructors as the CSVParser did.</p>
19   * <p>You can view the RFC4180 specification at <a href="https://tools.ietf.org/html/rfc4180">the Internet Engineering
20   * Task Force (IETF) website</a>.</p>
21   * <p>Examples:</p>
22   * {@code
23   * ICSVParser parser = new RFC4180Parser();
24   * }
25   * <p>or</p>
26   * {@code
27   * CSVParserBuilder builder = new CSVParserBuilder();
28   * ICSVParser parser = builder.withParserType(ParserType.RFC4180Parser).build();
29   * }
30   *
31   * @author Scott Conway
32   * @since 3.9
33   */
34  
35  public class RFC4180Parser extends AbstractCSVParser {
36  
37      /**
38       * Default constructor for the RFC4180Parser.  Uses values from the ICSVParser.
39       */
40      public RFC4180Parser() {
41          this(ICSVParser.DEFAULT_QUOTE_CHARACTER, ICSVParser.DEFAULT_SEPARATOR, CSVReaderNullFieldIndicator.NEITHER);
42      }
43  
44      /**
45       * Constructor used by the CSVParserBuilder.
46       *
47       * @param separator The delimiter to use for separating entries
48       * @param quoteChar The character to use for quoted elements
49       * @param nullFieldIndicator Indicate what should be considered null
50       */
51      protected RFC4180Parser(char quoteChar, char separator, CSVReaderNullFieldIndicator nullFieldIndicator) {
52          super(separator, quoteChar, nullFieldIndicator);
53      }
54  
55      @Override
56      protected String convertToCsvValue(String value, boolean applyQuotesToAll) {
57          String testValue = (value == null && !nullFieldIndicator.equals(CSVReaderNullFieldIndicator.NEITHER)) ? "" : value;
58          StringBuilder builder = new StringBuilder(testValue == null ? MAX_SIZE_FOR_EMPTY_FIELD : (testValue.length() * 2));
59          boolean containsQuoteChar = testValue != null && testValue.contains(getQuotecharAsString());
60          boolean surroundWithQuotes = applyQuotesToAll || isSurroundWithQuotes(value, containsQuoteChar);
61  
62          String convertedString = !containsQuoteChar ? testValue : getQuoteMatcherPattern().matcher(testValue).replaceAll(getQuoteDoubledAsString());
63  
64          if (surroundWithQuotes) {
65              builder.append(getQuotechar());
66          }
67  
68          builder.append(convertedString);
69  
70          if (surroundWithQuotes) {
71              builder.append(getQuotechar());
72          }
73  
74          return builder.toString();
75      }
76  
77      /**
78       * Parses an incoming String and returns an array of elements.
79       *
80       * @param nextLine The string to parse
81       * @param multi    Does it take multiple lines to form a single record?
82       * @return The list of elements, or null if nextLine is null
83       */
84      protected String[] parseLine(String nextLine, boolean multi) {
85          String[] elements;
86  
87          if (!multi && pending != null) {
88              pending = null;
89          }
90  
91          if (nextLine == null) {
92              if (pending != null) {
93                  String s = pending;
94                  pending = null;
95                  return new String[]{s};
96              }
97              return null;
98          }
99  
100         String lineToProcess = multi && pending != null ? pending + nextLine : nextLine;
101         pending = null;
102 
103         if (!StringUtils.contains(lineToProcess, quotechar)) {
104             elements = handleEmptySeparators(tokenizeStringIntoArray(lineToProcess));
105         } else {
106             elements = handleEmptySeparators(splitWhileNotInQuotes(lineToProcess, multi));
107             for (int i = 0; i < elements.length; i++) {
108                 if (StringUtils.contains(elements[i], quotechar)) {
109                     elements[i] = handleQuotes(elements[i]);
110                 }
111             }
112         }
113         return elements;
114     }
115 
116     private String[] tokenizeStringIntoArray(String nextLine) {
117         String[] tokenizedLine = StringUtils.splitPreserveAllTokens(nextLine, separator);
118         if (tokenizedLine != null && tokenizedLine.length == 0) {
119             tokenizedLine = new String[1];
120             tokenizedLine[0] = "";
121         }
122         return tokenizedLine;
123     }
124 
125     private String[] handleEmptySeparators(String[] strings) {
126         if (nullFieldIndicator == CSVReaderNullFieldIndicator.EMPTY_SEPARATORS || nullFieldIndicator == CSVReaderNullFieldIndicator.BOTH) {
127             for (int i = 0; i < strings.length; i++) {
128                 if (strings[i].isEmpty()) {
129                     strings[i] = null;
130                 }
131             }
132         }
133         return strings;
134     }
135 
136     private String[] splitWhileNotInQuotes(String nextLine, boolean multi) {
137         int currentPosition = 0;
138         List<String> elements = new ArrayList<>();
139         int nextSeparator;
140         int nextQuote;
141 
142 
143         while (currentPosition < nextLine.length()) {
144             nextSeparator = nextLine.indexOf(separator, currentPosition);
145             nextQuote = nextLine.indexOf(quotechar, currentPosition);
146 
147             if (nextSeparator == -1) {
148                 elements.add(nextLine.substring(currentPosition));
149                 currentPosition = nextLine.length();
150             } else if (nextQuote == -1 || nextQuote > nextSeparator || nextQuote != currentPosition) {
151                 elements.add(nextLine.substring(currentPosition, nextSeparator));
152                 currentPosition = nextSeparator + 1;
153             } else {
154                 int fieldEnd = findEndOfFieldFromPosition(nextLine, currentPosition);
155 
156                 elements.add(fieldEnd >= nextLine.length() ? nextLine.substring(currentPosition) : nextLine.substring(currentPosition, fieldEnd));
157 
158                 currentPosition = fieldEnd + 1;
159             }
160 
161         }
162 
163         if (multi && lastElementStartedWithQuoteButDidNotEndInOne(elements)) {
164             pending = elements.get(elements.size() - 1) + NEWLINE;
165             elements.remove(elements.size() - 1);
166         } else if (nextLine.lastIndexOf(separator) == nextLine.length() - 1) {
167             elements.add("");
168         }
169         return elements.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
170     }
171 
172     private boolean lastElementStartedWithQuoteButDidNotEndInOne(List<String> elements) {
173         String lastElement = elements.get(elements.size() - 1);
174         return startsButDoesNotEndWithQuote(lastElement) || hasOnlyOneQuote(lastElement) || hasOddQuotes(lastElement);
175     }
176 
177     private boolean hasOddQuotes(String lastElement) {
178         return StringUtils.countMatches(lastElement, quotechar) % 2 != 0;
179     }
180 
181     private boolean hasOnlyOneQuote(String lastElement) {
182         return StringUtils.countMatches(lastElement, quotechar) == 1;
183     }
184 
185     private boolean startsButDoesNotEndWithQuote(String lastElement) {
186         return lastElement.startsWith(getQuotecharAsString()) && !lastElement.endsWith(getQuotecharAsString());
187     }
188 
189     private int findEndOfFieldFromPosition(String nextLine, int currentPosition) {
190         int nextQuote = nextLine.indexOf(quotechar, currentPosition + 1);
191 
192         boolean inQuote = false;
193         while (haveNotFoundLastQuote(nextLine, nextQuote)) {
194             if (!inQuote && nextLine.charAt(nextQuote + 1) == separator) {
195                 return nextQuote + 1;
196             }
197 
198             do {
199                 nextQuote = nextLine.indexOf(quotechar, nextQuote + 1);
200                 inQuote = !inQuote;
201             } while (haveNotFoundLastQuote(nextLine, nextQuote) && nextLine.charAt(nextQuote + 1) == quotechar);
202         }
203 
204         return nextLine.length();
205     }
206 
207     private boolean haveNotFoundLastQuote(String nextLine, int nextQuote) {
208         return nextQuote != -1 && nextQuote < nextLine.length() - 1;
209     }
210 
211     private String handleQuotes(String element) {
212         String ret = element;
213 
214         if (!hasOnlyOneQuote(ret) && ret.startsWith(getQuotecharAsString())) {
215             ret = StringUtils.removeStart(ret, getQuotecharAsString());
216             ret = StringUtils.removeEnd(ret, getQuotecharAsString());
217         }
218         ret = StringUtils.replace(ret, getQuoteDoubledAsString(), getQuotecharAsString());
219         if (ret.isEmpty() && (nullFieldIndicator == CSVReaderNullFieldIndicator.BOTH || nullFieldIndicator == CSVReaderNullFieldIndicator.EMPTY_QUOTES)) {
220             ret = null;
221         }
222         return ret;
223     }
224     
225     @Override
226     public void setErrorLocale(Locale errorLocale) {
227         // Curiously enough, this implementation never throws exceptions and so
228         // has no need of translations.
229     }
230 }