1 /**
2 * =========================================================
3 * Pentaho-Reporting-Classic : a free Java reporting library
4 * =========================================================
5 *
6 * Project Info: http://reporting.pentaho.org/
7 *
8 * (C) Copyright 2001-2007, by Object Refinery Ltd, Pentaho Corporation and Contributors.
9 *
10 * This library is free software; you can redistribute it and/or modify it under the terms
11 * of the GNU Lesser General Public License as published by the Free Software Foundation;
12 * either version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
15 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 * See the GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License along with this
19 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
20 * Boston, MA 02111-1307, USA.
21 *
22 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
23 * in the United States and other countries.]
24 *
25 * ------------
26 * CSVTokenizer.java
27 * ------------
28 * (C) Copyright 2001-2007, by Object Refinery Ltd, Pentaho Corporation and Contributors.
29 */
30 package org.jfree.report.util;
31
32 import java.util.Enumeration;
33 import java.util.NoSuchElementException;
34
/**
 * The csv tokenizer class allows an application to break a single record in Comma Separated Value format into
 * tokens. The <code>CSVTokenizer</code> methods do not distinguish among identifiers and numbers, nor do they
 * recognize and skip comments.
 * <p/>
 * The separator (the string that separates tokens) may be specified either at creation time or on a per-token basis.
 * Separators are never returned as tokens. Two adjacent separators, or a separator at the very end of the record,
 * produce an empty token.
 * <p/>
 * A token that starts with the quote string is read up to the matching closing quote, so it may contain the
 * separator. Inside a quoted token, two consecutive quote strings stand for one literal quote string. The enclosing
 * quotes are not part of the returned token. A <code>null</code> or empty quote string switches quote handling off.
 * <p/>
 * A <tt>CSVTokenizer</tt> object internally maintains a current position within the record to be tokenized. Some
 * operations advance this current position past the characters processed.
 * <p/>
 * The following is one example of the use of the tokenizer. The code:
 * <blockquote><pre>
 *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
 *     while (csvt.hasMoreTokens()) {
 *         println(csvt.nextToken());
 *     }
 * </pre></blockquote>
 * <p/>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 *
 * @author abupon
 * @deprecated This class moved into LibBase. This class here will be removed in 0.8.11.
 */
public class CSVTokenizer implements Enumeration
{
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_COMMA = ",";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_TAB = "\t";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_SPACE = " ";

  /**
   * A possible quote character constant.
   */
  public static final String DOUBLE_QUATE = "\"";
  /**
   * A possible quote character constant.
   */
  public static final String SINGLE_QUATE = "'";

  /**
   * The complete (trimmed) record that should be separated into elements.
   */
  private final String record;
  /**
   * The separator.
   */
  private String separator;
  /**
   * The quoting string; <code>null</code> or empty means that quoting is switched off.
   */
  private String quate;

  /**
   * The current parsing position. After a token has been read, this is the index of the separator that follows the
   * token, or the length of the record if the token was the last one.
   */
  private int currentIndex;

  /**
   * A flag indicating that no token has been read yet, so there is no leading separator to skip.
   */
  private boolean beforeStart;

  /**
   * Constructs a csv tokenizer for the specified string. Leading and trailing whitespace of the string is removed
   * before it is tokenized. The separator string itself is never returned as a token.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.SEPARATOR_TAB,
   *                     CSVTokenizer.SEPARATOR_SPACE, etc.).
   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
   * @throws NullPointerException if any of the arguments is <code>null</code>.
   */
  public CSVTokenizer(final String aString, final String theSeparator,
                      final String theQuate)
  {
    if (aString == null)
    {
      throw new NullPointerException("The given string is null");
    }
    if (theSeparator == null)
    {
      throw new NullPointerException("The given separator is null");
    }
    if (theQuate == null)
    {
      throw new NullPointerException("The given quate is null");
    }
    this.record = aString.trim();
    this.separator = theSeparator;
    this.quate = theQuate;
    this.currentIndex = 0;
    this.beforeStart = true;
  }

  /**
   * Constructs a csv tokenizer for the specified string, using <code>CSVTokenizer.DOUBLE_QUATE</code> as quote
   * string. The separator string itself is never returned as a token.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.SEPARATOR_TAB,
   *                     CSVTokenizer.SEPARATOR_SPACE, etc.).
   */
  public CSVTokenizer(final String aString, final String theSeparator)
  {
    this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
  }

  /**
   * Constructs a csv tokenizer for the specified string. The tokenizer uses the default separator, which is
   * <code>CSVTokenizer.SEPARATOR_COMMA</code>, and the default quote string, which is
   * <code>CSVTokenizer.DOUBLE_QUATE</code>.
   *
   * @param aString a string to be parsed.
   */
  public CSVTokenizer(final String aString)
  {
    this(aString, CSVTokenizer.SEPARATOR_COMMA);
  }

  /**
   * Tests if there are more tokens available from this tokenizer's string. If this method returns <tt>true</tt>, then
   * a subsequent call to <tt>nextToken</tt> with no argument will try to read a token.
   *
   * @return <code>true</code> if the current position lies before the end of the record; <code>false</code>
   *         otherwise.
   */
  public boolean hasMoreTokens()
  {
    return (this.currentIndex < this.record.length());
  }

  /**
   * Returns the next token from this tokenizer. Unless this is the first token, the separator that precedes the
   * token is skipped first. A token that starts with the quote string is read as quoted token, everything else is
   * read up to the next separator or the end of the record.
   *
   * @return the next token from this string tokenizer.
   * @throws NoSuchElementException   if there are no more tokens in this tokenizer's string.
   * @throws IllegalArgumentException if a quoted token has no closing quote.
   */
  public String nextToken()
      throws NoSuchElementException, IllegalArgumentException
  {
    if (!this.hasMoreTokens())
    {
      throw new NoSuchElementException();
    }

    if (this.beforeStart)
    {
      this.beforeStart = false;
    }
    else
    {
      // currentIndex points at the separator that terminated the previous token.
      this.currentIndex += this.separator.length();
    }

    if (isQuotingEnabled() && this.record.startsWith(this.quate, this.currentIndex))
    {
      return readQuotedToken();
    }
    return readPlainToken();
  }

  /**
   * Checks whether a usable quote string is set. An empty quote string would match at every position without ever
   * consuming a character, so it is treated like a missing quote string.
   *
   * @return <code>true</code> if the quote string is neither <code>null</code> nor empty.
   */
  private boolean isQuotingEnabled()
  {
    return this.quate != null && this.quate.length() > 0;
  }

  /**
   * Reads the quoted token that starts at the current position and moves the current position behind its closing
   * quote. Doubled quote strings inside the token are reduced to a single quote string.
   *
   * @return the token without the enclosing quotes.
   * @throws IllegalArgumentException if the closing quote is missing.
   */
  private String readQuotedToken()
  {
    final int quoteLength = this.quate.length();
    final StringBuffer token = new StringBuffer(100);
    int position = this.currentIndex + quoteLength;

    while (true)
    {
      final int end = this.record.indexOf(this.quate, position);
      if (end < 0)
      {
        throw new IllegalArgumentException("Illegal format");
      }
      token.append(this.record.substring(position, end));

      final int afterQuote = end + quoteLength;
      if (!this.record.startsWith(this.quate, afterQuote))
      {
        // A single quote string closes the token.
        this.currentIndex = afterQuote;
        return token.toString();
      }

      // Two quote strings in a row are an escaped quote; keep one and continue behind the pair.
      token.append(this.quate);
      position = afterQuote + quoteLength;
    }
  }

  /**
   * Reads the unquoted token that starts at the current position. The current position is moved to the separator that
   * ends the token, or to the end of the record if no further separator exists.
   *
   * @return the token, possibly an empty string.
   */
  private String readPlainToken()
  {
    final int start = this.currentIndex;
    // An empty separator is found at every index and would never advance the position; treat it as "no separator".
    final int end;
    if (this.separator.length() == 0)
    {
      end = -1;
    }
    else
    {
      end = this.record.indexOf(this.separator, start);
    }

    if (end < 0)
    {
      this.currentIndex = this.record.length();
      return this.record.substring(start);
    }
    this.currentIndex = end;
    return this.record.substring(start, end);
  }

  /**
   * Returns the next token in this string tokenizer's string. First, the separator used by this
   * <tt>CSVTokenizer</tt> object is changed to the given string. Then the next token in the string after the current
   * position is returned. The current position is advanced beyond the recognized token. The new separator remains
   * the default after this call.
   *
   * @param theSeparator the new separator.
   * @return the next token, after switching to the new separator.
   * @throws java.util.NoSuchElementException
   *          if there are no more tokens in this tokenizer's string.
   */
  public String nextToken(final String theSeparator)
  {
    separator = theSeparator;
    return nextToken();
  }

  /**
   * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
   * @see java.util.Enumeration
   * @see CSVTokenizer#hasMoreTokens()
   */
  public boolean hasMoreElements()
  {
    return hasMoreTokens();
  }

  /**
   * Returns the same value as the <code>nextToken</code> method, except that its declared return value is
   * <code>Object</code> rather than <code>String</code>. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return the next token in the string.
   * @throws java.util.NoSuchElementException
   *          if there are no more tokens in this tokenizer's string.
   * @see java.util.Enumeration
   * @see CSVTokenizer#nextToken()
   */
  public Object nextElement()
  {
    return nextToken();
  }

  /**
   * Calculates the number of times that this tokenizer's <code>nextToken</code> method can be called before it
   * generates an exception. The remaining tokens are read to count them, but the current position is put back
   * afterwards, even if reading fails.
   *
   * @return the number of tokens remaining in the string using the current separator.
   * @throws IllegalArgumentException if one of the remaining tokens is a quoted token without closing quote.
   * @see CSVTokenizer#nextToken()
   */
  public int countTokens()
  {
    final int preserve = this.currentIndex;
    final boolean preserveStart = this.beforeStart;
    try
    {
      int count = 0;
      while (this.hasMoreTokens())
      {
        this.nextToken();
        count++;
      }
      return count;
    }
    finally
    {
      // Restore the parse state on every exit path, including a malformed record.
      this.currentIndex = preserve;
      this.beforeStart = preserveStart;
    }
  }

  /**
   * Returns the quate.
   *
   * @return the quote string that is currently in use.
   */
  public String getQuate()
  {
    return this.quate;
  }

  /**
   * Sets the quate. The new quote string is used for all tokens read after this call.
   *
   * @param quate The quate to set; <code>null</code> or an empty string switches quote handling off.
   */
  public void setQuate(final String quate)
  {
    this.quate = quate;
  }
}