View Javadoc
1   /**
2    * Copyright 2014 Internet2
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  /*
17   * Licensed to the Apache Software Foundation (ASF) under one or more
18   * contributor license agreements.  See the NOTICE file distributed with
19   * this work for additional information regarding copyright ownership.
20   * The ASF licenses this file to You under the Apache License, Version 2.0
21   * (the "License"); you may not use this file except in compliance with
22   * the License.  You may obtain a copy of the License at
23   *
24   *      http://www.apache.org/licenses/LICENSE-2.0
25   *
26   * Unless required by applicable law or agreed to in writing, software
27   * distributed under the License is distributed on an "AS IS" BASIS,
28   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
29   * See the License for the specific language governing permissions and
30   * limitations under the License.
31   */
32  package edu.internet2.middleware.grouperClientExt.org.apache.commons.lang3.text;
33  
34  import java.util.ArrayList;
35  import java.util.Collections;
36  import java.util.List;
37  import java.util.ListIterator;
38  import java.util.NoSuchElementException;
39  
40  import edu.internet2.middleware.grouperClientExt.org.apache.commons.lang3.ArrayUtils;
41  
42  /**
43   * Tokenizes a string based on delimiters (separators)
44   * and supporting quoting and ignored character concepts.
45   * <p>
46   * This class can split a String into many smaller strings. It aims
47   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
48   * however it offers much more control and flexibility including implementing
49   * the <code>ListIterator</code> interface. By default, it is set up
50   * like <code>StringTokenizer</code>.
51   * <p>
52   * The input String is split into a number of <i>tokens</i>.
53   * Each token is separated from the next String by a <i>delimiter</i>.
54   * One or more delimiter characters must be specified.
55   * <p>
56   * Each token may be surrounded by quotes.
57   * The <i>quote</i> matcher specifies the quote character(s).
58   * A quote may be escaped within a quoted section by duplicating itself.
59   * <p>
60   * Between each token and the delimiter are potentially characters that need trimming.
61   * The <i>trimmer</i> matcher specifies these characters.
62   * One usage might be to trim whitespace characters.
63   * <p>
64   * At any point outside the quotes there might potentially be invalid characters.
65   * The <i>ignored</i> matcher specifies these characters to be removed.
66   * One usage might be to remove new line characters.
67   * <p>
68   * Empty tokens may be removed or returned as null.
69   * <pre>
70   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
71   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
72   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
73   * </pre>
74   * <p>
75   *
76   * This tokenizer has the following properties and options:
77   *
78   * <table>
79   *  <tr>
80   *   <th>Property</th><th>Type</th><th>Default</th>
81   *  </tr>
82   *  <tr>
83   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
84   *  </tr>
85   *  <tr>
86   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
87   *  </tr>
88   *  <tr>
89   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
90   *  </tr>
91   *  <tr>
92   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
93   *  </tr>
94   *  <tr>
95   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
96   *  </tr>
97   * </table>
98   *
99   * @since 2.2
100  * @version $Id: StrTokenizer.java 1199894 2011-11-09 17:53:59Z ggregory $
101  */
102 public class StrTokenizer implements ListIterator<String>, Cloneable {
103 
104     private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
105     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
106     static {
107         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
108         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
109         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
110         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
111         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
112         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
113         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
114 
115         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
116         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
117         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
118         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
119         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
120         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
121         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
122     }
123 
124     /** The text to work on. */
125     private char chars[];
126     /** The parsed tokens */
127     private String tokens[];
128     /** The current iteration position */
129     private int tokenPos;
130 
131     /** The delimiter matcher */
132     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
133     /** The quote matcher */
134     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
135     /** The ignored matcher */
136     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
137     /** The trimmer matcher */
138     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
139 
140     /** Whether to return empty tokens as null */
141     private boolean emptyAsNull = false;
142     /** Whether to ignore empty tokens */
143     private boolean ignoreEmptyTokens = true;
144 
145     //-----------------------------------------------------------------------
146 
147     /**
148      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
149      * 
150      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
151      */
152     private static StrTokenizer getCSVClone() {
153         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
154     }
155 
156     /**
157      * Gets a new tokenizer instance which parses Comma Separated Value strings
158      * initializing it with the given input.  The default for CSV processing
159      * will be trim whitespace from both ends (which can be overridden with
160      * the setTrimmer method).
161      * <p>
162      * You must call a "reset" method to set the string which you want to parse.
163      * @return a new tokenizer instance which parses Comma Separated Value strings
164      */
165     public static StrTokenizer getCSVInstance() {
166         return getCSVClone();
167     }
168 
169     /**
170      * Gets a new tokenizer instance which parses Comma Separated Value strings
171      * initializing it with the given input.  The default for CSV processing
172      * will be trim whitespace from both ends (which can be overridden with
173      * the setTrimmer method).
174      *
175      * @param input  the text to parse
176      * @return a new tokenizer instance which parses Comma Separated Value strings
177      */
178     public static StrTokenizer getCSVInstance(String input) {
179         StrTokenizer tok = getCSVClone();
180         tok.reset(input);
181         return tok;
182     }
183 
184     /**
185      * Gets a new tokenizer instance which parses Comma Separated Value strings
186      * initializing it with the given input.  The default for CSV processing
187      * will be trim whitespace from both ends (which can be overridden with
188      * the setTrimmer method).
189      *
190      * @param input  the text to parse
191      * @return a new tokenizer instance which parses Comma Separated Value strings
192      */
193     public static StrTokenizer getCSVInstance(char[] input) {
194         StrTokenizer tok = getCSVClone();
195         tok.reset(input);
196         return tok;
197     }
198 
199     /**
200      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
201      * 
202      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
203      */
204     private static StrTokenizer getTSVClone() {
205         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
206     }
207 
208 
209     /**
210      * Gets a new tokenizer instance which parses Tab Separated Value strings.
211      * The default for CSV processing will be trim whitespace from both ends
212      * (which can be overridden with the setTrimmer method).
213      * <p>
214      * You must call a "reset" method to set the string which you want to parse.
215      * @return a new tokenizer instance which parses Tab Separated Value strings.
216      */
217     public static StrTokenizer getTSVInstance() {
218         return getTSVClone();
219     }
220 
221     /**
222      * Gets a new tokenizer instance which parses Tab Separated Value strings.
223      * The default for CSV processing will be trim whitespace from both ends
224      * (which can be overridden with the setTrimmer method).
225      * @param input  the string to parse
226      * @return a new tokenizer instance which parses Tab Separated Value strings.
227      */
228     public static StrTokenizer getTSVInstance(String input) {
229         StrTokenizer tok = getTSVClone();
230         tok.reset(input);
231         return tok;
232     }
233 
234     /**
235      * Gets a new tokenizer instance which parses Tab Separated Value strings.
236      * The default for CSV processing will be trim whitespace from both ends
237      * (which can be overridden with the setTrimmer method).
238      * @param input  the string to parse
239      * @return a new tokenizer instance which parses Tab Separated Value strings.
240      */
241     public static StrTokenizer getTSVInstance(char[] input) {
242         StrTokenizer tok = getTSVClone();
243         tok.reset(input);
244         return tok;
245     }
246 
247     //-----------------------------------------------------------------------
248     /**
249      * Constructs a tokenizer splitting on space, tab, newline and formfeed
250      * as per StringTokenizer, but with no text to tokenize.
251      * <p>
252      * This constructor is normally used with {@link #reset(String)}.
253      */
254     public StrTokenizer() {
255         super();
256         this.chars = null;
257     }
258 
259     /**
260      * Constructs a tokenizer splitting on space, tab, newline and formfeed
261      * as per StringTokenizer.
262      *
263      * @param input  the string which is to be parsed
264      */
265     public StrTokenizer(String input) {
266         super();
267         if (input != null) {
268             chars = input.toCharArray();
269         } else {
270             chars = null;
271         }
272     }
273 
274     /**
275      * Constructs a tokenizer splitting on the specified delimiter character.
276      *
277      * @param input  the string which is to be parsed
278      * @param delim  the field delimiter character
279      */
280     public StrTokenizer(String input, char delim) {
281         this(input);
282         setDelimiterChar(delim);
283     }
284 
285     /**
286      * Constructs a tokenizer splitting on the specified delimiter string.
287      *
288      * @param input  the string which is to be parsed
289      * @param delim  the field delimiter string
290      */
291     public StrTokenizer(String input, String delim) {
292         this(input);
293         setDelimiterString(delim);
294     }
295 
296     /**
297      * Constructs a tokenizer splitting using the specified delimiter matcher.
298      *
299      * @param input  the string which is to be parsed
300      * @param delim  the field delimiter matcher
301      */
302     public StrTokenizer(String input, StrMatcher delim) {
303         this(input);
304         setDelimiterMatcher(delim);
305     }
306 
307     /**
308      * Constructs a tokenizer splitting on the specified delimiter character
309      * and handling quotes using the specified quote character.
310      *
311      * @param input  the string which is to be parsed
312      * @param delim  the field delimiter character
313      * @param quote  the field quoted string character
314      */
315     public StrTokenizer(String input, char delim, char quote) {
316         this(input, delim);
317         setQuoteChar(quote);
318     }
319 
320     /**
321      * Constructs a tokenizer splitting using the specified delimiter matcher
322      * and handling quotes using the specified quote matcher.
323      *
324      * @param input  the string which is to be parsed
325      * @param delim  the field delimiter matcher
326      * @param quote  the field quoted string matcher
327      */
328     public StrTokenizer(String input, StrMatcher./../../../../../../../../edu/internet2/middleware/grouperClientExt/org/apache/commons/lang3/text/StrMatcher.html#StrMatcher">StrMatcher delim, StrMatcher quote) {
329         this(input, delim);
330         setQuoteMatcher(quote);
331     }
332 
333     /**
334      * Constructs a tokenizer splitting on space, tab, newline and formfeed
335      * as per StringTokenizer.
336      *
337      * @param input  the string which is to be parsed, not cloned
338      */
339     public StrTokenizer(char[] input) {
340         super();
341         this.chars = ArrayUtils.clone(input);
342     }
343 
344     /**
345      * Constructs a tokenizer splitting on the specified character.
346      *
347      * @param input  the string which is to be parsed, not cloned
348      * @param delim the field delimiter character
349      */
350     public StrTokenizer(char[] input, char delim) {
351         this(input);
352         setDelimiterChar(delim);
353     }
354 
355     /**
356      * Constructs a tokenizer splitting on the specified string.
357      *
358      * @param input  the string which is to be parsed, not cloned
359      * @param delim the field delimiter string
360      */
361     public StrTokenizer(char[] input, String delim) {
362         this(input);
363         setDelimiterString(delim);
364     }
365 
366     /**
367      * Constructs a tokenizer splitting using the specified delimiter matcher.
368      *
369      * @param input  the string which is to be parsed, not cloned
370      * @param delim  the field delimiter matcher
371      */
372     public StrTokenizer(char[] input, StrMatcher delim) {
373         this(input);
374         setDelimiterMatcher(delim);
375     }
376 
377     /**
378      * Constructs a tokenizer splitting on the specified delimiter character
379      * and handling quotes using the specified quote character.
380      *
381      * @param input  the string which is to be parsed, not cloned
382      * @param delim  the field delimiter character
383      * @param quote  the field quoted string character
384      */
385     public StrTokenizer(char[] input, char delim, char quote) {
386         this(input, delim);
387         setQuoteChar(quote);
388     }
389 
390     /**
391      * Constructs a tokenizer splitting using the specified delimiter matcher
392      * and handling quotes using the specified quote matcher.
393      *
394      * @param input  the string which is to be parsed, not cloned
395      * @param delim  the field delimiter character
396      * @param quote  the field quoted string character
397      */
398     public StrTokenizer(char[] input, StrMatcher./../../../../../../../../edu/internet2/middleware/grouperClientExt/org/apache/commons/lang3/text/StrMatcher.html#StrMatcher">StrMatcher delim, StrMatcher quote) {
399         this(input, delim);
400         setQuoteMatcher(quote);
401     }
402 
403     // API
404     //-----------------------------------------------------------------------
405     /**
406      * Gets the number of tokens found in the String.
407      *
408      * @return the number of matched tokens
409      */
410     public int size() {
411         checkTokenized();
412         return tokens.length;
413     }
414 
415     /**
416      * Gets the next token from the String.
417      * Equivalent to {@link #next()} except it returns null rather than
418      * throwing {@link NoSuchElementException} when no tokens remain.
419      *
420      * @return the next sequential token, or null when no more tokens are found
421      */
422     public String nextToken() {
423         if (hasNext()) {
424             return tokens[tokenPos++];
425         }
426         return null;
427     }
428 
429     /**
430      * Gets the previous token from the String.
431      *
432      * @return the previous sequential token, or null when no more tokens are found
433      */
434     public String previousToken() {
435         if (hasPrevious()) {
436             return tokens[--tokenPos];
437         }
438         return null;
439     }
440 
441     /**
442      * Gets a copy of the full token list as an independent modifiable array.
443      *
444      * @return the tokens as a String array
445      */
446     public String[] getTokenArray() {
447         checkTokenized();
448         return tokens.clone();
449     }
450 
451     /**
452      * Gets a copy of the full token list as an independent modifiable list.
453      *
454      * @return the tokens as a String array
455      */
456     public List<String> getTokenList() {
457         checkTokenized();
458         List<String> list = new ArrayList<String>(tokens.length);
459         for (String element : tokens) {
460             list.add(element);
461         }
462         return list;
463     }
464 
465     /**
466      * Resets this tokenizer, forgetting all parsing and iteration already completed.
467      * <p>
468      * This method allows the same tokenizer to be reused for the same String.
469      *
470      * @return this, to enable chaining
471      */
472     public StrTokenizer reset() {
473         tokenPos = 0;
474         tokens = null;
475         return this;
476     }
477 
478     /**
479      * Reset this tokenizer, giving it a new input string to parse.
480      * In this manner you can re-use a tokenizer with the same settings
481      * on multiple input lines.
482      *
483      * @param input  the new string to tokenize, null sets no text to parse
484      * @return this, to enable chaining
485      */
486     public StrTokenizer reset(String input) {
487         reset();
488         if (input != null) {
489             this.chars = input.toCharArray();
490         } else {
491             this.chars = null;
492         }
493         return this;
494     }
495 
496     /**
497      * Reset this tokenizer, giving it a new input string to parse.
498      * In this manner you can re-use a tokenizer with the same settings
499      * on multiple input lines.
500      *
501      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
502      * @return this, to enable chaining
503      */
504     public StrTokenizer reset(char[] input) {
505         reset();
506         this.chars = ArrayUtils.clone(input);
507         return this;
508     }
509 
510     // ListIterator
511     //-----------------------------------------------------------------------
512     /**
513      * Checks whether there are any more tokens.
514      *
515      * @return true if there are more tokens
516      */
517     public boolean hasNext() {
518         checkTokenized();
519         return tokenPos < tokens.length;
520     }
521 
522     /**
523      * Gets the next token.
524      *
525      * @return the next String token
526      * @throws NoSuchElementException if there are no more elements
527      */
528     public String next() {
529         if (hasNext()) {
530             return tokens[tokenPos++];
531         }
532         throw new NoSuchElementException();
533     }
534 
535     /**
536      * Gets the index of the next token to return.
537      *
538      * @return the next token index
539      */
540     public int nextIndex() {
541         return tokenPos;
542     }
543 
544     /**
545      * Checks whether there are any previous tokens that can be iterated to.
546      *
547      * @return true if there are previous tokens
548      */
549     public boolean hasPrevious() {
550         checkTokenized();
551         return tokenPos > 0;
552     }
553 
554     /**
555      * Gets the token previous to the last returned token.
556      *
557      * @return the previous token
558      */
559     public String previous() {
560         if (hasPrevious()) {
561             return tokens[--tokenPos];
562         }
563         throw new NoSuchElementException();
564     }
565 
566     /**
567      * Gets the index of the previous token.
568      *
569      * @return the previous token index
570      */
571     public int previousIndex() {
572         return tokenPos - 1;
573     }
574 
575     /**
576      * Unsupported ListIterator operation.
577      *
578      * @throws UnsupportedOperationException always
579      */
580     public void remove() {
581         throw new UnsupportedOperationException("remove() is unsupported");
582     }
583 
584     /**
585      * Unsupported ListIterator operation.
586      * @param obj this parameter ignored.
587      * @throws UnsupportedOperationException always
588      */
589     public void set(String obj) {
590         throw new UnsupportedOperationException("set() is unsupported");
591     }
592 
593     /**
594      * Unsupported ListIterator operation.
595      * @param obj this parameter ignored.
596      * @throws UnsupportedOperationException always
597      */
598     public void add(String obj) {
599         throw new UnsupportedOperationException("add() is unsupported");
600     }
601 
602     // Implementation
603     //-----------------------------------------------------------------------
604     /**
605      * Checks if tokenization has been done, and if not then do it.
606      */
607     private void checkTokenized() {
608         if (tokens == null) {
609             if (chars == null) {
610                 // still call tokenize as subclass may do some work
611                 List<String> split = tokenize(null, 0, 0);
612                 tokens = split.toArray(new String[split.size()]);
613             } else {
614                 List<String> split = tokenize(chars, 0, chars.length);
615                 tokens = split.toArray(new String[split.size()]);
616             }
617         }
618     }
619 
620     /**
621      * Internal method to performs the tokenization.
622      * <p>
623      * Most users of this class do not need to call this method. This method
624      * will be called automatically by other (public) methods when required.
625      * <p>
626      * This method exists to allow subclasses to add code before or after the
627      * tokenization. For example, a subclass could alter the character array,
628      * offset or count to be parsed, or call the tokenizer multiple times on
629      * multiple strings. It is also be possible to filter the results.
630      * <p>
631      * <code>StrTokenizer</code> will always pass a zero offset and a count
632      * equal to the length of the array to this method, however a subclass
633      * may pass other values, or even an entirely different array.
634      * 
635      * @param chars  the character array being tokenized, may be null
636      * @param offset  the start position within the character array, must be valid
637      * @param count  the number of characters to tokenize, must be valid
638      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
639      */
640     protected List<String> tokenize(char[] chars, int offset, int count) {
641         if (chars == null || count == 0) {
642             return Collections.emptyList();
643         }
644         StrBuildermiddleware/grouperClientExt/org/apache/commons/lang3/text/StrBuilder.html#StrBuilder">StrBuilder buf = new StrBuilder();
645         List<String> tokens = new ArrayList<String>();
646         int pos = offset;
647         
648         // loop around the entire buffer
649         while (pos >= 0 && pos < count) {
650             // find next token
651             pos = readNextToken(chars, pos, count, buf, tokens);
652             
653             // handle case where end of string is a delimiter
654             if (pos >= count) {
655                 addToken(tokens, "");
656             }
657         }
658         return tokens;
659     }
660 
661     /**
662      * Adds a token to a list, paying attention to the parameters we've set.
663      *
664      * @param list  the list to add to
665      * @param tok  the token to add
666      */
667     private void addToken(List<String> list, String tok) {
668         if (tok == null || tok.length() == 0) {
669             if (isIgnoreEmptyTokens()) {
670                 return;
671             }
672             if (isEmptyTokenAsNull()) {
673                 tok = null;
674             }
675         }
676         list.add(tok);
677     }
678 
679     /**
680      * Reads character by character through the String to get the next token.
681      *
682      * @param chars  the character array being tokenized
683      * @param start  the first character of field
684      * @param len  the length of the character array being tokenized
685      * @param workArea  a temporary work area
686      * @param tokens  the list of parsed tokens
687      * @return the starting position of the next field (the character
688      *  immediately after the delimiter), or -1 if end of string found
689      */
690     private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
691         // skip all leading whitespace, unless it is the
692         // field delimiter or the quote character
693         while (start < len) {
694             int removeLen = Math.max(
695                     getIgnoredMatcher().isMatch(chars, start, start, len),
696                     getTrimmerMatcher().isMatch(chars, start, start, len));
697             if (removeLen == 0 ||
698                 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
699                 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
700                 break;
701             }
702             start += removeLen;
703         }
704         
705         // handle reaching end
706         if (start >= len) {
707             addToken(tokens, "");
708             return -1;
709         }
710         
711         // handle empty token
712         int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
713         if (delimLen > 0) {
714             addToken(tokens, "");
715             return start + delimLen;
716         }
717         
718         // handle found token
719         int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
720         if (quoteLen > 0) {
721             return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
722         }
723         return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
724     }
725 
726     /**
727      * Reads a possibly quoted string token.
728      *
729      * @param chars  the character array being tokenized
730      * @param start  the first character of field
731      * @param len  the length of the character array being tokenized
732      * @param workArea  a temporary work area
733      * @param tokens  the list of parsed tokens
734      * @param quoteStart  the start position of the matched quote, 0 if no quoting
735      * @param quoteLen  the length of the matched quote, 0 if no quoting
736      * @return the starting position of the next field (the character
737      *  immediately after the delimiter, or if end of string found,
738      *  then the length of string
739      */
740     private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea, 
741                                List<String> tokens, int quoteStart, int quoteLen) {
742         // Loop until we've found the end of the quoted
743         // string or the end of the input
744         workArea.clear();
745         int pos = start;
746         boolean quoting = quoteLen > 0;
747         int trimStart = 0;
748         
749         while (pos < len) {
750             // quoting mode can occur several times throughout a string
751             // we must switch between quoting and non-quoting until we
752             // encounter a non-quoted delimiter, or end of string
753             if (quoting) {
754                 // In quoting mode
755                 
756                 // If we've found a quote character, see if it's
757                 // followed by a second quote.  If so, then we need
758                 // to actually put the quote character into the token
759                 // rather than end the token.
760                 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
761                     if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
762                         // matched pair of quotes, thus an escaped quote
763                         workArea.append(chars, pos, quoteLen);
764                         pos += quoteLen * 2;
765                         trimStart = workArea.size();
766                         continue;
767                     }
768                     
769                     // end of quoting
770                     quoting = false;
771                     pos += quoteLen;
772                     continue;
773                 }
774                 
775                 // copy regular character from inside quotes
776                 workArea.append(chars[pos++]);
777                 trimStart = workArea.size();
778                 
779             } else {
780                 // Not in quoting mode
781                 
782                 // check for delimiter, and thus end of token
783                 int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
784                 if (delimLen > 0) {
785                     // return condition when end of token found
786                     addToken(tokens, workArea.substring(0, trimStart));
787                     return pos + delimLen;
788                 }
789                 
790                 // check for quote, and thus back into quoting mode
791                 if (quoteLen > 0 && isQuote(chars, pos, len, quoteStart, quoteLen)) {
792                     quoting = true;
793                     pos += quoteLen;
794                     continue;
795                 }
796                 
797                 // check for ignored (outside quotes), and ignore
798                 int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
799                 if (ignoredLen > 0) {
800                     pos += ignoredLen;
801                     continue;
802                 }
803                 
804                 // check for trimmed character
805                 // don't yet know if its at the end, so copy to workArea
806                 // use trimStart to keep track of trim at the end
807                 int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
808                 if (trimmedLen > 0) {
809                     workArea.append(chars, pos, trimmedLen);
810                     pos += trimmedLen;
811                     continue;
812                 }
813                 
814                 // copy regular character from outside quotes
815                 workArea.append(chars[pos++]);
816                 trimStart = workArea.size();
817             }
818         }
819         
820         // return condition when end of string found
821         addToken(tokens, workArea.substring(0, trimStart));
822         return -1;
823     }
824 
825     /**
826      * Checks if the characters at the index specified match the quote
827      * already matched in readNextToken().
828      *
829      * @param chars  the character array being tokenized
830      * @param pos  the position to check for a quote
831      * @param len  the length of the character array being tokenized
832      * @param quoteStart  the start position of the matched quote, 0 if no quoting
833      * @param quoteLen  the length of the matched quote, 0 if no quoting
834      * @return true if a quote is matched
835      */
836     private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
837         for (int i = 0; i < quoteLen; i++) {
838             if (pos + i >= len || chars[pos + i] != chars[quoteStart + i]) {
839                 return false;
840             }
841         }
842         return true;
843     }
844 
845     // Delimiter
846     //-----------------------------------------------------------------------
847     /**
848      * Gets the field delimiter matcher.
849      *
850      * @return the delimiter matcher in use
851      */
852     public StrMatcher getDelimiterMatcher() {
853         return this.delimMatcher;
854     }
855 
856     /**
857      * Sets the field delimiter matcher.
858      * <p>
859      * The delimitier is used to separate one token from another.
860      *
861      * @param delim  the delimiter matcher to use
862      * @return this, to enable chaining
863      */
864     public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
865         if (delim == null) {
866             this.delimMatcher = StrMatcher.noneMatcher();
867         } else {
868             this.delimMatcher = delim;
869         }
870         return this;
871     }
872 
873     /**
874      * Sets the field delimiter character.
875      *
876      * @param delim  the delimiter character to use
877      * @return this, to enable chaining
878      */
879     public StrTokenizer setDelimiterChar(char delim) {
880         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
881     }
882 
883     /**
884      * Sets the field delimiter string.
885      *
886      * @param delim  the delimiter string to use
887      * @return this, to enable chaining
888      */
889     public StrTokenizer setDelimiterString(String delim) {
890         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
891     }
892 
893     // Quote
894     //-----------------------------------------------------------------------
895     /**
896      * Gets the quote matcher currently in use.
897      * <p>
898      * The quote character is used to wrap data between the tokens.
899      * This enables delimiters to be entered as data.
900      * The default value is '"' (double quote).
901      *
902      * @return the quote matcher in use
903      */
904     public StrMatcher getQuoteMatcher() {
905         return quoteMatcher;
906     }
907 
908     /**
909      * Set the quote matcher to use.
910      * <p>
911      * The quote character is used to wrap data between the tokens.
912      * This enables delimiters to be entered as data.
913      *
914      * @param quote  the quote matcher to use, null ignored
915      * @return this, to enable chaining
916      */
917     public StrTokenizer setQuoteMatcher(StrMatcher quote) {
918         if (quote != null) {
919             this.quoteMatcher = quote;
920         }
921         return this;
922     }
923 
924     /**
925      * Sets the quote character to use.
926      * <p>
927      * The quote character is used to wrap data between the tokens.
928      * This enables delimiters to be entered as data.
929      *
930      * @param quote  the quote character to use
931      * @return this, to enable chaining
932      */
933     public StrTokenizer setQuoteChar(char quote) {
934         return setQuoteMatcher(StrMatcher.charMatcher(quote));
935     }
936 
937     // Ignored
938     //-----------------------------------------------------------------------
939     /**
940      * Gets the ignored character matcher.
941      * <p>
942      * These characters are ignored when parsing the String, unless they are
943      * within a quoted region.
944      * The default value is not to ignore anything.
945      *
946      * @return the ignored matcher in use
947      */
948     public StrMatcher getIgnoredMatcher() {
949         return ignoredMatcher;
950     }
951 
952     /**
953      * Set the matcher for characters to ignore.
954      * <p>
955      * These characters are ignored when parsing the String, unless they are
956      * within a quoted region.
957      *
958      * @param ignored  the ignored matcher to use, null ignored
959      * @return this, to enable chaining
960      */
961     public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
962         if (ignored != null) {
963             this.ignoredMatcher = ignored;
964         }
965         return this;
966     }
967 
968     /**
969      * Set the character to ignore.
970      * <p>
971      * This character is ignored when parsing the String, unless it is
972      * within a quoted region.
973      *
974      * @param ignored  the ignored character to use
975      * @return this, to enable chaining
976      */
977     public StrTokenizer setIgnoredChar(char ignored) {
978         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
979     }
980 
981     // Trimmer
982     //-----------------------------------------------------------------------
983     /**
984      * Gets the trimmer character matcher.
985      * <p>
986      * These characters are trimmed off on each side of the delimiter
987      * until the token or quote is found.
988      * The default value is not to trim anything.
989      *
990      * @return the trimmer matcher in use
991      */
992     public StrMatcher getTrimmerMatcher() {
993         return trimmerMatcher;
994     }
995 
996     /**
997      * Sets the matcher for characters to trim.
998      * <p>
999      * These characters are trimmed off on each side of the delimiter
1000      * until the token or quote is found.
1001      *
1002      * @param trimmer  the trimmer matcher to use, null ignored
1003      * @return this, to enable chaining
1004      */
1005     public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
1006         if (trimmer != null) {
1007             this.trimmerMatcher = trimmer;
1008         }
1009         return this;
1010     }
1011 
1012     //-----------------------------------------------------------------------
1013     /**
1014      * Gets whether the tokenizer currently returns empty tokens as null.
1015      * The default for this property is false.
1016      *
1017      * @return true if empty tokens are returned as null
1018      */
1019     public boolean isEmptyTokenAsNull() {
1020         return this.emptyAsNull;
1021     }
1022 
1023     /**
1024      * Sets whether the tokenizer should return empty tokens as null.
1025      * The default for this property is false.
1026      *
1027      * @param emptyAsNull  whether empty tokens are returned as null
1028      * @return this, to enable chaining
1029      */
1030     public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1031         this.emptyAsNull = emptyAsNull;
1032         return this;
1033     }
1034 
1035     //-----------------------------------------------------------------------
1036     /**
1037      * Gets whether the tokenizer currently ignores empty tokens.
1038      * The default for this property is true.
1039      *
1040      * @return true if empty tokens are not returned
1041      */
1042     public boolean isIgnoreEmptyTokens() {
1043         return ignoreEmptyTokens;
1044     }
1045 
1046     /**
1047      * Sets whether the tokenizer should ignore and not return empty tokens.
1048      * The default for this property is true.
1049      *
1050      * @param ignoreEmptyTokens  whether empty tokens are not returned
1051      * @return this, to enable chaining
1052      */
1053     public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1054         this.ignoreEmptyTokens = ignoreEmptyTokens;
1055         return this;
1056     }
1057 
1058     //-----------------------------------------------------------------------
1059     /**
1060      * Gets the String content that the tokenizer is parsing.
1061      *
1062      * @return the string content being parsed
1063      */
1064     public String getContent() {
1065         if (chars == null) {
1066             return null;
1067         }
1068         return new String(chars);
1069     }
1070 
1071     //-----------------------------------------------------------------------
1072     /**
1073      * Creates a new instance of this Tokenizer. The new instance is reset so
1074      * that it will be at the start of the token list.
1075      * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1076      * 
1077      * @return a new instance of this Tokenizer which has been reset.
1078      */
1079     @Override
1080     public Object clone() {
1081         try {
1082             return cloneReset();
1083         } catch (CloneNotSupportedException ex) {
1084             return null;
1085         }
1086     }
1087 
1088     /**
1089      * Creates a new instance of this Tokenizer. The new instance is reset so that
1090      * it will be at the start of the token list.
1091      * 
1092      * @return a new instance of this Tokenizer which has been reset.
1093      * @throws CloneNotSupportedException if there is a problem cloning
1094      */
1095     Object cloneReset() throws CloneNotSupportedException {
1096         // this method exists to enable 100% test coverage
1097         StrTokenizer../../../../../../../edu/internet2/middleware/grouperClientExt/org/apache/commons/lang3/text/StrTokenizer.html#StrTokenizer">StrTokenizer cloned = (StrTokenizer) super.clone();
1098         if (cloned.chars != null) {
1099             cloned.chars = cloned.chars.clone();
1100         }
1101         cloned.reset();
1102         return cloned;
1103     }
1104 
1105     //-----------------------------------------------------------------------
1106     /**
1107      * Gets the String content that the tokenizer is parsing.
1108      *
1109      * @return the string content being parsed
1110      */
1111     @Override
1112     public String toString() {
1113         if (tokens == null) {
1114             return "StrTokenizer[not tokenized yet]";
1115         }
1116         return "StrTokenizer" + getTokenList();
1117     }
1118 
1119 }