/**
 * Copyright 2014 Internet2
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.internet2.middleware.grouperClientExt.org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import edu.internet2.middleware.grouperClientExt.org.apache.commons.lang3.ArrayUtils;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @version $Id: StrTokenizer.java 1199894 2011-11-09 17:53:59Z ggregory $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens; null until tokenization has been performed. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input.  The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input.  The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the character array to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed, null means no text to parse
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the character array which is to be parsed; the tokenizer
     *  keeps the result of {@code ArrayUtils.clone(input)}, not the array itself
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed; the tokenizer
     *  keeps the result of {@code ArrayUtils.clone(input)}, not the array itself
     * @param delim  the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed; the tokenizer
     *  keeps the result of {@code ArrayUtils.clone(input)}, not the array itself
     * @param delim  the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the character array which is to be parsed; the tokenizer
     *  keeps the result of {@code ArrayUtils.clone(input)}, not the array itself
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed; the tokenizer
     *  keeps the result of {@code ArrayUtils.clone(input)}, not the array itself
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed; the tokenizer
     *  keeps the result of {@code ArrayUtils.clone(input)}, not the array itself
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     * Triggers tokenization if it has not been done yet.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     * Equivalent to {@link #previous()} except it returns null rather than
     * throwing {@link NoSuchElementException} when at the start.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        List<String> list = new ArrayList<String>(tokens.length);
        Collections.addAll(list, tokens);
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize; the tokenizer keeps
     *  the result of {@code ArrayUtils.clone(input)}, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then do it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            // tokenize is still called when there is no text, as a subclass may do some work
            List<String> split = (chars == null)
                    ? tokenize(null, 0, 0)
                    : tokenize(chars, 0, chars.length);
            tokens = split.toArray(new String[split.size()]);
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array. The region
     * tokenized is <code>[offset, offset + count)</code>.
     *
     * @param chars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.emptyList();
        }
        StrBuilder buf = new StrBuilder();
        List<String> tokenList = new ArrayList<String>();
        // Exclusive end index of the region. The helpers below treat their "len"
        // argument as an end index, so a non-zero offset must be added to count;
        // for the internal call (offset 0) this equals count.
        int end = offset + count;
        int pos = offset;

        // loop around the entire region
        while (pos >= 0 && pos < end) {
            // find next token
            pos = readNextToken(chars, pos, end, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= end) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     * A null or empty token is dropped when empty tokens are ignored, and is
     * otherwise stored as null when empty-as-null is enabled.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the exclusive end index of the region being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the exclusive end index of the region being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        // length of workArea content to keep; anything beyond it is trailing
        // trimmable text that is cut off when the token is emitted
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the exclusive end index of the region being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use, null means use
     *  {@code StrMatcher.noneMatcher()}
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is {@code StrMatcher.noneMatcher()}; the CSV and TSV
     * instances use {@code StrMatcher.doubleQuoteMatcher()}.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed, null if there is no text
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            // documented contract of this method: report the failure as null
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list. The character buffer is copied
     * so the clone does not share it with this instance.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a debugging description of this tokenizer: a fixed marker if the
     * input has not been tokenized yet, otherwise the list of tokens.
     * Calling this method never triggers tokenization.
     *
     * @return the description of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}