libjava/ChangeLog:
[official-gcc.git] / libjava / classpath / gnu / java / util / regex / RE.java
blobd064f7a3579e006cdaa745e96fbdda82df4daa3e
1 /* gnu/regexp/RE.java
2 Copyright (C) 2006 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
38 package gnu.java.util.regex;
40 import gnu.java.lang.CPStringBuilder;
42 import java.io.InputStream;
43 import java.io.Serializable;
45 import java.util.ArrayList;
46 import java.util.List;
47 import java.util.Locale;
48 import java.util.PropertyResourceBundle;
49 import java.util.ResourceBundle;
51 /**
52 * RE provides the user interface for compiling and matching regular
53 * expressions.
54 * <P>
55 * A regular expression object (class RE) is compiled by constructing it
56 * from a String, StringBuffer or character array, with optional
57 * compilation flags (below)
58 * and an optional syntax specification (see RESyntax; if not specified,
59 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
60 * <P>
61 * Once compiled, a regular expression object is reusable as well as
62 * threadsafe: multiple threads can use the RE instance simultaneously
63 * to match against different input text.
64 * <P>
65 * Various methods attempt to match input text against a compiled
66 * regular expression. These methods are:
67 * <LI><code>isMatch</code>: returns true if the input text in its
68 * entirety matches the regular expression pattern.
69 * <LI><code>getMatch</code>: returns the first match found in the
70 * input text, or null if no match is found.
71 * <LI><code>getAllMatches</code>: returns an array of all
72 * non-overlapping matches found in the input text. If no matches are
73 * found, the array is zero-length.
74 * <LI><code>substitute</code>: substitute the first occurence of the
75 * pattern in the input text with a replacement string (which may
76 * include metacharacters $0-$9, see REMatch.substituteInto).
77 * <LI><code>substituteAll</code>: same as above, but repeat for each
78 * match before returning.
79 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
80 * object that allows iteration over the matches (see
81 * REMatchEnumeration for some reasons why you may want to do this
82 * instead of using <code>getAllMatches</code>.
83 * <P>
85 * These methods all have similar argument lists. The input can be a
86 * CharIndexed, String, a character array, a StringBuffer, or an
87 * InputStream of some sort. Note that when using an
88 * InputStream, the stream read position cannot be guaranteed after
89 * attempting a match (this is not a bug, but a consequence of the way
90 * regular expressions work). Using an REMatchEnumeration can
91 * eliminate most positioning problems.
93 * Although the input object can be of various types, it is recommended
94 * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
95 * can show the last match found on this input, which helps the expression
96 * \G work as the end of the previous match.
98 * <P>
100 * The optional index argument specifies the offset from the beginning
101 * of the text at which the search should start (see the descriptions
102 * of some of the execution flags for how this can affect positional
103 * pattern operators). For an InputStream, this means an
104 * offset from the current read position, so subsequent calls with the
105 * same index argument on an InputStream will not
106 * necessarily access the same position on the stream, whereas
107 * repeated searches at a given index in a fixed string will return
108 * consistent results.
110 * <P>
111 * You can optionally affect the execution environment by using a
112 * combination of execution flags (constants listed below).
114 * <P>
115 * All operations on a regular expression are performed in a
116 * thread-safe manner.
118 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
119 * @version 1.1.5-dev, to be released
122 public class RE extends REToken
125 private static final class IntPair implements Serializable
127 public int first, second;
130 private static final class CharUnit implements Serializable
132 public char ch;
133 public boolean bk;
136 // This String will be returned by getVersion()
137 private static final String VERSION = "1.1.5-dev";
139 // The localized strings are kept in a separate file
140 // Used by getLocalizedMessage().
141 private static ResourceBundle messages;
143 // Name of the bundle that contains the localized messages.
144 private static final String bundle = "gnu/java/util/regex/MessagesBundle";
146 // These are, respectively, the first and last tokens in our linked list
147 // If there is only one token, firstToken == lastToken
148 private REToken firstToken, lastToken;
150 // This is the number of subexpressions in this regular expression,
151 // with a minimum value of zero. Returned by getNumSubs()
152 private int numSubs;
154 /** Minimum length, in characters, of any possible match. */
155 private int minimumLength;
156 private int maximumLength;
159 * Compilation flag. Do not differentiate case. Subsequent
160 * searches using this RE will be case insensitive.
162 public static final int REG_ICASE = 0x02;
165 * Compilation flag. The match-any-character operator (dot)
166 * will match a newline character. When set this overrides the syntax
167 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
168 * the "/s" operator in Perl.
170 public static final int REG_DOT_NEWLINE = 0x04;
173 * Compilation flag. Use multiline mode. In this mode, the ^ and $
174 * anchors will match based on newlines within the input. This is
175 * equivalent to the "/m" operator in Perl.
177 public static final int REG_MULTILINE = 0x08;
180 * Execution flag.
181 * The match-beginning operator (^) will not match at the beginning
182 * of the input string. Useful for matching on a substring when you
183 * know the context of the input is such that position zero of the
184 * input to the match test is not actually position zero of the text.
185 * <P>
186 * This example demonstrates the results of various ways of matching on
187 * a substring.
188 * <P>
189 * <CODE>
190 * String s = "food bar fool";<BR>
191 * RE exp = new RE("^foo.");<BR>
192 * REMatch m0 = exp.getMatch(s);<BR>
193 * REMatch m1 = exp.getMatch(s.substring(8));<BR>
194 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
195 * REMatch m3 = exp.getMatch(s,8); <BR>
196 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
197 * <P>
198 * // Results:<BR>
199 * // m0.toString(): "food"<BR>
200 * // m1.toString(): "fool"<BR>
201 * // m2.toString(): null<BR>
202 * // m3.toString(): null<BR>
203 * // m4.toString(): "fool"<BR>
204 * </CODE>
206 public static final int REG_NOTBOL = 0x10;
209 * Execution flag.
210 * The match-end operator ($) does not match at the end
211 * of the input string. Useful for matching on substrings.
213 public static final int REG_NOTEOL = 0x20;
216 * Execution flag.
217 * When a match method is invoked that starts matching at a non-zero
218 * index into the input, treat the input as if it begins at the index
219 * given. The effect of this flag is that the engine does not "see"
220 * any text in the input before the given index. This is useful so
221 * that the match-beginning operator (^) matches not at position 0
222 * in the input string, but at the position the search started at
223 * (based on the index input given to the getMatch function). See
224 * the example under REG_NOTBOL. It also affects the use of the \&lt;
225 * and \b operators.
227 public static final int REG_ANCHORINDEX = 0x40;
230 * Execution flag.
231 * The substitute and substituteAll methods will not attempt to
232 * interpolate occurrences of $1-$9 in the replacement text with
233 * the corresponding subexpressions. For example, you may want to
234 * replace all matches of "one dollar" with "$1".
236 public static final int REG_NO_INTERPOLATE = 0x80;
239 * Execution flag.
240 * Try to match the whole input string. An implicit match-end operator
241 * is added to this regexp.
243 public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
246 * Execution flag.
247 * The substitute and substituteAll methods will treat the
248 * character '\' in the replacement as an escape to a literal
249 * character. In this case "\n", "\$", "\\", "\x40" and "\012"
250 * will become "n", "$", "\", "x40" and "012" respectively.
251 * This flag has no effect if REG_NO_INTERPOLATE is set on.
253 public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
256 * Compilation flag. Allow whitespace and comments in pattern.
257 * This is equivalent to the "/x" operator in Perl.
259 public static final int REG_X_COMMENTS = 0x0400;
262 * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
264 public static final int REG_ICASE_USASCII = 0x0800;
267 * Execution flag.
268 * Do not move the position at which the search begins. If not set,
269 * the starting position will be moved until a match is found.
271 public static final int REG_FIX_STARTING_POSITION = 0x1000;
273 /** Returns a string representing the version of the gnu.regexp package. */
274 public static final String version ()
276 return VERSION;
279 // Retrieves a message from the ResourceBundle
280 static final String getLocalizedMessage (String key)
282 if (messages == null)
283 messages =
284 PropertyResourceBundle.getBundle (bundle, Locale.getDefault ());
285 return messages.getString (key);
289 * Constructs a regular expression pattern buffer without any compilation
290 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
292 * @param pattern A regular expression pattern, in the form of a String,
293 * StringBuffer or char[]. Other input types will be converted to
294 * strings using the toString() method.
295 * @exception REException The input pattern could not be parsed.
296 * @exception NullPointerException The pattern was null.
298 public RE (Object pattern) throws REException
300 this (pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
304 * Constructs a regular expression pattern buffer using the specified
305 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
307 * @param pattern A regular expression pattern, in the form of a String,
308 * StringBuffer, or char[]. Other input types will be converted to
309 * strings using the toString() method.
310 * @param cflags The logical OR of any combination of the compilation flags listed above.
311 * @exception REException The input pattern could not be parsed.
312 * @exception NullPointerException The pattern was null.
314 public RE (Object pattern, int cflags) throws REException
316 this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
320 * Constructs a regular expression pattern buffer using the specified
321 * compilation flags and regular expression syntax.
323 * @param pattern A regular expression pattern, in the form of a String,
324 * StringBuffer, or char[]. Other input types will be converted to
325 * strings using the toString() method.
326 * @param cflags The logical OR of any combination of the compilation flags listed above.
327 * @param syntax The type of regular expression syntax to use.
328 * @exception REException The input pattern could not be parsed.
329 * @exception NullPointerException The pattern was null.
331 public RE (Object pattern, int cflags, RESyntax syntax) throws REException
333 this (pattern, cflags, syntax, 0, 0);
336 // internal constructor used for alternation
337 private RE (REToken first, REToken last, int subs, int subIndex,
338 int minLength, int maxLength)
340 super (subIndex);
341 firstToken = first;
342 lastToken = last;
343 numSubs = subs;
344 minimumLength = minLength;
345 maximumLength = maxLength;
346 addToken (new RETokenEndSub (subIndex));
349 private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex,
350 int nextSub) throws REException
352 super (myIndex); // Subexpression index of this token.
353 initialize (patternObj, cflags, syntax, myIndex, nextSub);
356 // For use by subclasses
357 protected RE ()
359 super (0);
362 // The meat of construction
363 protected void initialize (Object patternObj, int cflags, RESyntax syntax,
364 int myIndex, int nextSub) throws REException
366 char[] pattern;
367 if (patternObj instanceof String)
369 pattern = ((String) patternObj).toCharArray ();
371 else if (patternObj instanceof char[])
373 pattern = (char[]) patternObj;
375 else if (patternObj instanceof StringBuffer)
377 pattern = new char[((StringBuffer) patternObj).length ()];
378 ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0);
380 else if (patternObj instanceof StringBuilder)
382 pattern = new char[((StringBuilder) patternObj).length ()];
383 ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0);
385 else if (patternObj instanceof CPStringBuilder)
387 pattern = new char[((CPStringBuilder) patternObj).length ()];
388 ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern,
391 else
393 pattern = patternObj.toString ().toCharArray ();
396 int pLength = pattern.length;
398 numSubs = 0; // Number of subexpressions in this token.
399 ArrayList < REToken > branches = null;
401 // linked list of tokens (sort of -- some closed loops can exist)
402 firstToken = lastToken = null;
404 // Precalculate these so we don't pay for the math every time we
405 // need to access them.
406 boolean insens = ((cflags & REG_ICASE) > 0);
407 boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
409 // Parse pattern into tokens. Does anyone know if it's more efficient
410 // to use char[] than a String.charAt()? I'm assuming so.
412 // index tracks the position in the char array
413 int index = 0;
415 // this will be the current parse character (pattern[index])
416 CharUnit unit = new CharUnit ();
418 // This is used for {x,y} calculations
419 IntPair minMax = new IntPair ();
421 // Buffer a token so we can create a TokenRepeated, etc.
422 REToken currentToken = null;
423 boolean quot = false;
425 // Saved syntax and flags.
426 RESyntax savedSyntax = null;
427 int savedCflags = 0;
428 boolean flagsSaved = false;
430 while (index < pLength)
432 // read the next character unit (including backslash escapes)
433 index = getCharUnit (pattern, index, unit, quot);
435 if (unit.bk)
436 if (unit.ch == 'Q')
438 quot = true;
439 continue;
441 else if (unit.ch == 'E')
443 quot = false;
444 continue;
446 if (quot)
447 unit.bk = false;
449 if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot))
451 if (Character.isWhitespace (unit.ch))
453 continue;
455 if (unit.ch == '#')
457 for (int i = index; i < pLength; i++)
459 if (pattern[i] == '\n')
461 index = i + 1;
462 continue;
464 else if (pattern[i] == '\r')
466 if (i + 1 < pLength && pattern[i + 1] == '\n')
468 index = i + 2;
470 else
472 index = i + 1;
474 continue;
477 index = pLength;
478 continue;
482 // ALTERNATION OPERATOR
483 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
484 // not available if RE_LIMITED_OPS is set
486 // TODO: the '\n' literal here should be a test against REToken.newline,
487 // which unfortunately may be more than a single character.
488 if (((unit.ch == '|'
489 && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
490 || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n')
491 && !(unit.bk || quot)))
492 && !syntax.get (RESyntax.RE_LIMITED_OPS))
494 // make everything up to here be a branch. create vector if nec.
495 addToken (currentToken);
496 RE theBranch =
497 new RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
498 maximumLength);
499 minimumLength = 0;
500 maximumLength = 0;
501 if (branches == null)
503 branches = new ArrayList < REToken > ();
505 branches.add (theBranch);
506 firstToken = lastToken = currentToken = null;
509 // INTERVAL OPERATOR:
510 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
511 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
513 // OPEN QUESTION:
514 // what is proper interpretation of '{' at start of string?
516 // This method used to check "repeat.empty.token" to avoid such regexp
517 // as "(a*){2,}", but now "repeat.empty.token" is allowed.
519 else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS)
520 && (syntax.
521 get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot)))
523 int newIndex = getMinMax (pattern, index, minMax, syntax);
524 if (newIndex > index)
526 if (minMax.first > minMax.second)
527 throw new
528 REException (getLocalizedMessage ("interval.order"),
529 REException.REG_BADRPT, newIndex);
530 if (currentToken == null)
531 throw new
532 REException (getLocalizedMessage ("repeat.no.token"),
533 REException.REG_BADRPT, newIndex);
534 if (currentToken instanceof RETokenRepeated)
535 throw new
536 REException (getLocalizedMessage ("repeat.chained"),
537 REException.REG_BADRPT, newIndex);
538 if (currentToken instanceof RETokenWordBoundary
539 || currentToken instanceof RETokenWordBoundary)
540 throw new
541 REException (getLocalizedMessage ("repeat.assertion"),
542 REException.REG_BADRPT, newIndex);
543 index = newIndex;
544 currentToken =
545 setRepeated (currentToken, minMax.first, minMax.second,
546 index);
548 else
550 addToken (currentToken);
551 currentToken = new RETokenChar (subIndex, unit.ch, insens);
552 if (insensUSASCII)
553 currentToken.unicodeAware = false;
557 // LIST OPERATOR:
558 // [...] | [^...]
560 else if ((unit.ch == '[') && !(unit.bk || quot))
562 // Create a new RETokenOneOf
563 ParseCharClassResult result =
564 parseCharClass (subIndex, pattern, index, pLength, cflags,
565 syntax, 0);
566 addToken (currentToken);
567 currentToken = result.token;
568 index = result.index;
571 // SUBEXPRESSIONS
572 // (...) | \(...\) depending on RE_NO_BK_PARENS
574 else if ((unit.ch == '(')
575 && (syntax.
576 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
578 boolean pure = false;
579 boolean comment = false;
580 boolean lookAhead = false;
581 boolean lookBehind = false;
582 boolean independent = false;
583 boolean negativelh = false;
584 boolean negativelb = false;
585 if ((index + 1 < pLength) && (pattern[index] == '?'))
587 switch (pattern[index + 1])
589 case '!':
590 if (syntax.get (RESyntax.RE_LOOKAHEAD))
592 pure = true;
593 negativelh = true;
594 lookAhead = true;
595 index += 2;
597 break;
598 case '=':
599 if (syntax.get (RESyntax.RE_LOOKAHEAD))
601 pure = true;
602 lookAhead = true;
603 index += 2;
605 break;
606 case '<':
607 // We assume that if the syntax supports look-ahead,
608 // it also supports look-behind.
609 if (syntax.get (RESyntax.RE_LOOKAHEAD))
611 index++;
612 switch (pattern[index + 1])
614 case '!':
615 pure = true;
616 negativelb = true;
617 lookBehind = true;
618 index += 2;
619 break;
620 case '=':
621 pure = true;
622 lookBehind = true;
623 index += 2;
626 break;
627 case '>':
628 // We assume that if the syntax supports look-ahead,
629 // it also supports independent group.
630 if (syntax.get (RESyntax.RE_LOOKAHEAD))
632 pure = true;
633 independent = true;
634 index += 2;
636 break;
637 case 'i':
638 case 'd':
639 case 'm':
640 case 's':
641 case 'u':
642 case 'x':
643 case '-':
644 if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS))
645 break;
646 // Set or reset syntax flags.
647 int flagIndex = index + 1;
648 int endFlag = -1;
649 RESyntax newSyntax = new RESyntax (syntax);
650 int newCflags = cflags;
651 boolean negate = false;
652 while (flagIndex < pLength && endFlag < 0)
654 switch (pattern[flagIndex])
656 case 'i':
657 if (negate)
658 newCflags &= ~REG_ICASE;
659 else
660 newCflags |= REG_ICASE;
661 flagIndex++;
662 break;
663 case 'd':
664 if (negate)
665 newSyntax.setLineSeparator (RESyntax.
666 DEFAULT_LINE_SEPARATOR);
667 else
668 newSyntax.setLineSeparator ("\n");
669 flagIndex++;
670 break;
671 case 'm':
672 if (negate)
673 newCflags &= ~REG_MULTILINE;
674 else
675 newCflags |= REG_MULTILINE;
676 flagIndex++;
677 break;
678 case 's':
679 if (negate)
680 newCflags &= ~REG_DOT_NEWLINE;
681 else
682 newCflags |= REG_DOT_NEWLINE;
683 flagIndex++;
684 break;
685 case 'u':
686 if (negate)
687 newCflags |= REG_ICASE_USASCII;
688 else
689 newCflags &= ~REG_ICASE_USASCII;
690 flagIndex++;
691 break;
692 case 'x':
693 if (negate)
694 newCflags &= ~REG_X_COMMENTS;
695 else
696 newCflags |= REG_X_COMMENTS;
697 flagIndex++;
698 break;
699 case '-':
700 negate = true;
701 flagIndex++;
702 break;
703 case ':':
704 case ')':
705 endFlag = pattern[flagIndex];
706 break;
707 default:
708 throw new
709 REException (getLocalizedMessage
710 ("repeat.no.token"),
711 REException.REG_BADRPT, index);
714 if (endFlag == ')')
716 syntax = newSyntax;
717 cflags = newCflags;
718 insens = ((cflags & REG_ICASE) > 0);
719 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
720 // This can be treated as though it were a comment.
721 comment = true;
722 index = flagIndex - 1;
723 break;
725 if (endFlag == ':')
727 savedSyntax = syntax;
728 savedCflags = cflags;
729 flagsSaved = true;
730 syntax = newSyntax;
731 cflags = newCflags;
732 insens = ((cflags & REG_ICASE) > 0);
733 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
734 index = flagIndex - 1;
735 // Fall through to the next case.
737 else
739 throw new
740 REException (getLocalizedMessage
741 ("unmatched.paren"),
742 REException.REG_ESUBREG, index);
744 case ':':
745 if (syntax.get (RESyntax.RE_PURE_GROUPING))
747 pure = true;
748 index += 2;
750 break;
751 case '#':
752 if (syntax.get (RESyntax.RE_COMMENTS))
754 comment = true;
756 break;
757 default:
758 throw new
759 REException (getLocalizedMessage ("repeat.no.token"),
760 REException.REG_BADRPT, index);
764 if (index >= pLength)
766 throw new
767 REException (getLocalizedMessage ("unmatched.paren"),
768 REException.REG_ESUBREG, index);
771 // find end of subexpression
772 int endIndex = index;
773 int nextIndex = index;
774 int nested = 0;
776 while (((nextIndex =
777 getCharUnit (pattern, endIndex, unit, false)) > 0)
778 && !(nested == 0 && (unit.ch == ')')
779 && (syntax.
780 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
781 || quot))))
783 if ((endIndex = nextIndex) >= pLength)
784 throw new
785 REException (getLocalizedMessage ("subexpr.no.end"),
786 REException.REG_ESUBREG, nextIndex);
787 else
788 if ((unit.ch == '[') && !(unit.bk || quot))
790 // I hate to do something similar to the LIST OPERATOR matters
791 // above, but ...
792 int listIndex = nextIndex;
793 if (listIndex < pLength && pattern[listIndex] == '^')
794 listIndex++;
795 if (listIndex < pLength && pattern[listIndex] == ']')
796 listIndex++;
797 int listEndIndex = -1;
798 int listNest = 0;
799 while (listIndex < pLength && listEndIndex < 0)
801 switch (pattern[listIndex++])
803 case '\\':
804 listIndex++;
805 break;
806 case '[':
807 // Sun's API document says that regexp like "[a-d[m-p]]"
808 // is legal. Even something like "[[[^]]]]" is accepted.
809 listNest++;
810 if (listIndex < pLength
811 && pattern[listIndex] == '^')
812 listIndex++;
813 if (listIndex < pLength
814 && pattern[listIndex] == ']')
815 listIndex++;
816 break;
817 case ']':
818 if (listNest == 0)
819 listEndIndex = listIndex;
820 listNest--;
821 break;
824 if (listEndIndex >= 0)
826 nextIndex = listEndIndex;
827 if ((endIndex = nextIndex) >= pLength)
828 throw new
829 REException (getLocalizedMessage ("subexpr.no.end"),
830 REException.REG_ESUBREG, nextIndex);
831 else
832 continue;
834 throw new
835 REException (getLocalizedMessage ("subexpr.no.end"),
836 REException.REG_ESUBREG, nextIndex);
838 else if (unit.ch == '('
839 && (syntax.
840 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
841 || quot)))
842 nested++;
843 else if (unit.ch == ')'
844 && (syntax.
845 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
846 || quot)))
847 nested--;
850 // endIndex is now position at a ')','\)'
851 // nextIndex is end of string or position after ')' or '\)'
853 if (comment)
854 index = nextIndex;
855 else
856 { // not a comment
857 // create RE subexpression as token.
858 addToken (currentToken);
859 if (!pure)
861 numSubs++;
864 int useIndex = (pure || lookAhead || lookBehind
865 || independent) ? 0 : nextSub + numSubs;
866 currentToken =
867 new RE (String.valueOf (pattern, index, endIndex - index).
868 toCharArray (), cflags, syntax, useIndex,
869 nextSub + numSubs);
870 numSubs += ((RE) currentToken).getNumSubs ();
872 if (lookAhead)
874 currentToken =
875 new RETokenLookAhead (currentToken, negativelh);
877 else if (lookBehind)
879 currentToken =
880 new RETokenLookBehind (currentToken, negativelb);
882 else if (independent)
884 currentToken = new RETokenIndependent (currentToken);
887 index = nextIndex;
888 if (flagsSaved)
890 syntax = savedSyntax;
891 cflags = savedCflags;
892 insens = ((cflags & REG_ICASE) > 0);
893 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
894 flagsSaved = false;
896 } // not a comment
897 } // subexpression
899 // UNMATCHED RIGHT PAREN
900 // ) or \) throw exception if
901 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
902 else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
903 && ((unit.ch == ')')
904 && (syntax.
905 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))))
907 throw new REException (getLocalizedMessage ("unmatched.paren"),
908 REException.REG_EPAREN, index);
911 // START OF LINE OPERATOR
912 // ^
914 else if ((unit.ch == '^') && !(unit.bk || quot))
916 addToken (currentToken);
917 currentToken = null;
918 RETokenStart token = null;
919 if ((cflags & REG_MULTILINE) > 0)
921 String sep = syntax.getLineSeparator ();
922 if (sep == null)
924 token = new RETokenStart (subIndex, null, true);
926 else
928 token = new RETokenStart (subIndex, sep);
931 else
933 token = new RETokenStart (subIndex, null);
935 addToken (token);
938 // END OF LINE OPERATOR
939 // $
941 else if ((unit.ch == '$') && !(unit.bk || quot))
943 addToken (currentToken);
944 currentToken = null;
945 RETokenEnd token = null;
946 if ((cflags & REG_MULTILINE) > 0)
948 String sep = syntax.getLineSeparator ();
949 if (sep == null)
951 token = new RETokenEnd (subIndex, null, true);
953 else
955 token = new RETokenEnd (subIndex, sep);
958 else
960 token = new RETokenEnd (subIndex, null);
962 addToken (token);
965 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
966 // .
968 else if ((unit.ch == '.') && !(unit.bk || quot))
970 addToken (currentToken);
971 currentToken =
972 new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE)
973 || ((cflags & REG_DOT_NEWLINE) > 0),
974 syntax.get (RESyntax.RE_DOT_NOT_NULL));
977 // ZERO-OR-MORE REPEAT OPERATOR
978 // *
980 // This method used to check "repeat.empty.token" to avoid such regexp
981 // as "(a*)*", but now "repeat.empty.token" is allowed.
983 else if ((unit.ch == '*') && !(unit.bk || quot))
985 if (currentToken == null)
986 throw new REException (getLocalizedMessage ("repeat.no.token"),
987 REException.REG_BADRPT, index);
988 if (currentToken instanceof RETokenRepeated)
989 throw new REException (getLocalizedMessage ("repeat.chained"),
990 REException.REG_BADRPT, index);
991 if (currentToken instanceof RETokenWordBoundary
992 || currentToken instanceof RETokenWordBoundary)
993 throw new REException (getLocalizedMessage ("repeat.assertion"),
994 REException.REG_BADRPT, index);
995 currentToken =
996 setRepeated (currentToken, 0, Integer.MAX_VALUE, index);
999 // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
1000 // + | \+ depending on RE_BK_PLUS_QM
1001 // not available if RE_LIMITED_OPS is set
1003 // This method used to check "repeat.empty.token" to avoid such regexp
1004 // as "(a*)+", but now "repeat.empty.token" is allowed.
1006 else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1007 && (!syntax.
1008 get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1010 if (currentToken == null)
1011 throw new REException (getLocalizedMessage ("repeat.no.token"),
1012 REException.REG_BADRPT, index);
1014 // Check for possessive matching on RETokenRepeated
1015 if (currentToken instanceof RETokenRepeated)
1017 RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1018 if (syntax.get (RESyntax.RE_POSSESSIVE_OPS)
1019 && !tokenRep.isPossessive () && !tokenRep.isStingy ())
1020 tokenRep.makePossessive ();
1021 else
1022 throw new
1023 REException (getLocalizedMessage ("repeat.chained"),
1024 REException.REG_BADRPT, index);
1027 else if (currentToken instanceof RETokenWordBoundary
1028 || currentToken instanceof RETokenWordBoundary)
1029 throw new REException (getLocalizedMessage ("repeat.assertion"),
1030 REException.REG_BADRPT, index);
1031 else
1032 currentToken =
1033 setRepeated (currentToken, 1, Integer.MAX_VALUE, index);
1036 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
1037 // ? | \? depending on RE_BK_PLUS_QM
1038 // not available if RE_LIMITED_OPS is set
1039 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
1041 else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1042 && (!syntax.
1043 get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1045 if (currentToken == null)
1046 throw new REException (getLocalizedMessage ("repeat.no.token"),
1047 REException.REG_BADRPT, index);
1049 // Check for stingy matching on RETokenRepeated
1050 if (currentToken instanceof RETokenRepeated)
1052 RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1053 if (syntax.get (RESyntax.RE_STINGY_OPS)
1054 && !tokenRep.isStingy () && !tokenRep.isPossessive ())
1055 tokenRep.makeStingy ();
1056 else
1057 throw new
1058 REException (getLocalizedMessage ("repeat.chained"),
1059 REException.REG_BADRPT, index);
1061 else if (currentToken instanceof RETokenWordBoundary
1062 || currentToken instanceof RETokenWordBoundary)
1063 throw new REException (getLocalizedMessage ("repeat.assertion"),
1064 REException.REG_BADRPT, index);
1065 else
1066 currentToken = setRepeated (currentToken, 0, 1, index);
1069 // OCTAL CHARACTER
1070 // \0377
1072 else if (unit.bk && (unit.ch == '0')
1073 && syntax.get (RESyntax.RE_OCTAL_CHAR))
1075 CharExpression ce =
1076 getCharExpression (pattern, index - 2, pLength, syntax);
1077 if (ce == null)
1078 throw new REException ("invalid octal character",
1079 REException.REG_ESCAPE, index);
1080 index = index - 2 + ce.len;
1081 addToken (currentToken);
1082 currentToken = new RETokenChar (subIndex, ce.ch, insens);
1083 if (insensUSASCII)
1084 currentToken.unicodeAware = false;
1087 // BACKREFERENCE OPERATOR
1088 // \1 \2 ... \9 and \10 \11 \12 ...
1089 // not available if RE_NO_BK_REFS is set
1090 // Perl recognizes \10, \11, and so on only if enough number of
1091 // parentheses have opened before it, otherwise they are treated
1092 // as aliases of \010, \011, ... (octal characters). In case of
1093 // Sun's JDK, octal character expression must always begin with \0.
1094 // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
1095 // JDK treats \2 as a back reference to the 2nd group because
1096 // there are only two groups. But in our poor implementation,
1097 // we cannot help but treat \29 as a back reference to the 29th group.
1099 else if (unit.bk && Character.isDigit (unit.ch)
1100 && !syntax.get (RESyntax.RE_NO_BK_REFS))
1102 addToken (currentToken);
1103 int numBegin = index - 1;
1104 int numEnd = pLength;
1105 for (int i = index; i < pLength; i++)
1107 if (!Character.isDigit (pattern[i]))
1109 numEnd = i;
1110 break;
1113 int num = parseInt (pattern, numBegin, numEnd - numBegin, 10);
1115 currentToken = new RETokenBackRef (subIndex, num, insens);
1116 if (insensUSASCII)
1117 currentToken.unicodeAware = false;
1118 index = numEnd;
1121 // START OF STRING OPERATOR
1122 // \A if RE_STRING_ANCHORS is set
1124 else if (unit.bk && (unit.ch == 'A')
1125 && syntax.get (RESyntax.RE_STRING_ANCHORS))
1127 addToken (currentToken);
1128 currentToken = new RETokenStart (subIndex, null);
1131 // WORD BREAK OPERATOR
1132 // \b if ????
1134 else if (unit.bk && (unit.ch == 'b')
1135 && syntax.get (RESyntax.RE_STRING_ANCHORS))
1137 addToken (currentToken);
1138 currentToken =
1139 new RETokenWordBoundary (subIndex,
1140 RETokenWordBoundary.
1141 BEGIN | RETokenWordBoundary.END,
1142 false);
1145 // WORD BEGIN OPERATOR
1146 // \< if ????
1147 else if (unit.bk && (unit.ch == '<'))
1149 addToken (currentToken);
1150 currentToken =
1151 new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN,
1152 false);
1155 // WORD END OPERATOR
1156 // \> if ????
1157 else if (unit.bk && (unit.ch == '>'))
1159 addToken (currentToken);
1160 currentToken =
1161 new RETokenWordBoundary (subIndex, RETokenWordBoundary.END,
1162 false);
1165 // NON-WORD BREAK OPERATOR
1166 // \B if ????
1168 else if (unit.bk && (unit.ch == 'B')
1169 && syntax.get (RESyntax.RE_STRING_ANCHORS))
1171 addToken (currentToken);
1172 currentToken =
1173 new RETokenWordBoundary (subIndex,
1174 RETokenWordBoundary.
1175 BEGIN | RETokenWordBoundary.END, true);
1179 // DIGIT OPERATOR
1180 // \d if RE_CHAR_CLASS_ESCAPES is set
1182 else if (unit.bk && (unit.ch == 'd')
1183 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1185 addToken (currentToken);
1186 currentToken =
1187 new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false);
1188 if (insensUSASCII)
1189 currentToken.unicodeAware = false;
1192 // NON-DIGIT OPERATOR
1193 // \D
1195 else if (unit.bk && (unit.ch == 'D')
1196 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1198 addToken (currentToken);
1199 currentToken =
1200 new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true);
1201 if (insensUSASCII)
1202 currentToken.unicodeAware = false;
1205 // NEWLINE ESCAPE
1206 // \n
1208 else if (unit.bk && (unit.ch == 'n'))
1210 addToken (currentToken);
1211 currentToken = new RETokenChar (subIndex, '\n', false);
1214 // RETURN ESCAPE
1215 // \r
1217 else if (unit.bk && (unit.ch == 'r'))
1219 addToken (currentToken);
1220 currentToken = new RETokenChar (subIndex, '\r', false);
1223 // WHITESPACE OPERATOR
1224 // \s if RE_CHAR_CLASS_ESCAPES is set
1226 else if (unit.bk && (unit.ch == 's')
1227 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1229 addToken (currentToken);
1230 currentToken =
1231 new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false);
1232 if (insensUSASCII)
1233 currentToken.unicodeAware = false;
1236 // NON-WHITESPACE OPERATOR
1237 // \S
1239 else if (unit.bk && (unit.ch == 'S')
1240 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1242 addToken (currentToken);
1243 currentToken =
1244 new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true);
1245 if (insensUSASCII)
1246 currentToken.unicodeAware = false;
1249 // TAB ESCAPE
1250 // \t
1252 else if (unit.bk && (unit.ch == 't'))
1254 addToken (currentToken);
1255 currentToken = new RETokenChar (subIndex, '\t', false);
1258 // ALPHANUMERIC OPERATOR
1259 // \w
1261 else if (unit.bk && (unit.ch == 'w')
1262 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1264 addToken (currentToken);
1265 currentToken =
1266 new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false);
1267 if (insensUSASCII)
1268 currentToken.unicodeAware = false;
1271 // NON-ALPHANUMERIC OPERATOR
1272 // \W
1274 else if (unit.bk && (unit.ch == 'W')
1275 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1277 addToken (currentToken);
1278 currentToken =
1279 new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true);
1280 if (insensUSASCII)
1281 currentToken.unicodeAware = false;
1284 // END OF STRING OPERATOR
1285 // \Z, \z
1287 // FIXME: \Z and \z are different in that if the input string
1288 // ends with a line terminator, \Z matches the position before
1289 // the final terminator. This special behavior of \Z is yet
1290 // to be implemented.
1292 else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1293 syntax.get (RESyntax.RE_STRING_ANCHORS))
1295 addToken (currentToken);
1296 currentToken = new RETokenEnd (subIndex, null);
1299 // HEX CHARACTER, UNICODE CHARACTER
1300 // \x1B, \u1234
1302 else
1303 if ((unit.bk && (unit.ch == 'x')
1304 && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk
1305 && (unit.ch == 'u')
1306 && syntax.
1307 get (RESyntax.
1308 RE_UNICODE_CHAR)))
1310 CharExpression ce =
1311 getCharExpression (pattern, index - 2, pLength, syntax);
1312 if (ce == null)
1313 throw new REException ("invalid hex character",
1314 REException.REG_ESCAPE, index);
1315 index = index - 2 + ce.len;
1316 addToken (currentToken);
1317 currentToken = new RETokenChar (subIndex, ce.ch, insens);
1318 if (insensUSASCII)
1319 currentToken.unicodeAware = false;
1322 // NAMED PROPERTY
1323 // \p{prop}, \P{prop}
1325 else
1326 if ((unit.bk && (unit.ch == 'p')
1327 && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk
1328 && (unit.ch ==
1329 'P')
1330 && syntax.
1331 get (RESyntax.
1332 RE_NAMED_PROPERTY)))
1334 NamedProperty np = getNamedProperty (pattern, index - 2, pLength);
1335 if (np == null)
1336 throw new REException ("invalid escape sequence",
1337 REException.REG_ESCAPE, index);
1338 index = index - 2 + np.len;
1339 addToken (currentToken);
1340 currentToken =
1341 getRETokenNamedProperty (subIndex, np, insens, index);
1342 if (insensUSASCII)
1343 currentToken.unicodeAware = false;
1346 // END OF PREVIOUS MATCH
1347 // \G
1349 else if (unit.bk && (unit.ch == 'G') &&
1350 syntax.get (RESyntax.RE_STRING_ANCHORS))
1352 addToken (currentToken);
1353 currentToken = new RETokenEndOfPreviousMatch (subIndex);
1356 // NON-SPECIAL CHARACTER (or escape to make literal)
1357 // c | \* for example
1359 else
1360 { // not a special character
1361 addToken (currentToken);
1362 currentToken = new RETokenChar (subIndex, unit.ch, insens);
1363 if (insensUSASCII)
1364 currentToken.unicodeAware = false;
1366 } // end while
1368 // Add final buffered token and an EndSub marker
1369 addToken (currentToken);
1371 if (branches != null)
1373 branches.
1374 add (new
1375 RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
1376 maximumLength));
1377 branches.trimToSize (); // compact the Vector
1378 minimumLength = 0;
1379 maximumLength = 0;
1380 firstToken = lastToken = null;
1381 addToken (new RETokenOneOf (subIndex, branches, false));
1383 else
1384 addToken (new RETokenEndSub (subIndex));
1388 private static class ParseCharClassResult
1390 RETokenOneOf token;
1391 int index;
1392 boolean returnAtAndOperator = false;
1396 * Parse [...] or [^...] and make an RETokenOneOf instance.
1397 * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1398 * @param pattern Input array of characters to be parsed.
1399 * @param index Index pointing to the character next to the beginning '['.
1400 * @param pLength Limit of the input array.
1401 * @param cflags Compilation flags used to parse the pattern.
1402 * @param pflags Flags that affect the behavior of this method.
1403 * @param syntax Syntax used to parse the pattern.
1405 private static ParseCharClassResult parseCharClass (int subIndex,
1406 char[]pattern,
1407 int index, int pLength,
1408 int cflags,
1409 RESyntax syntax,
1410 int pflags) throws
1411 REException
1414 boolean insens = ((cflags & REG_ICASE) > 0);
1415 boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1416 final ArrayList < REToken > options = new ArrayList < REToken > ();
1417 ArrayList < Object > addition = new ArrayList < Object > ();
1418 boolean additionAndAppeared = false;
1419 final int RETURN_AT_AND = 0x01;
1420 boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1421 boolean negative = false;
1422 char ch;
1424 char lastChar = 0;
1425 boolean lastCharIsSet = false;
1426 if (index == pLength)
1427 throw new REException (getLocalizedMessage ("unmatched.bracket"),
1428 REException.REG_EBRACK, index);
1430 // Check for initial caret, negation
1431 if ((ch = pattern[index]) == '^')
1433 negative = true;
1434 if (++index == pLength)
1435 throw new REException (getLocalizedMessage ("class.no.end"),
1436 REException.REG_EBRACK, index);
1437 ch = pattern[index];
1440 // Check for leading right bracket literal
1441 if (ch == ']')
1443 lastChar = ch;
1444 lastCharIsSet = true;
1445 if (++index == pLength)
1446 throw new REException (getLocalizedMessage ("class.no.end"),
1447 REException.REG_EBRACK, index);
1450 while ((ch = pattern[index++]) != ']')
1452 if ((ch == '-') && (lastCharIsSet))
1454 if (index == pLength)
1455 throw new REException (getLocalizedMessage ("class.no.end"),
1456 REException.REG_EBRACK, index);
1457 if ((ch = pattern[index]) == ']')
1459 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1460 if (insensUSASCII)
1461 t.unicodeAware = false;
1462 options.add (t);
1463 lastChar = '-';
1465 else
1467 if ((ch == '\\')
1468 && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1470 CharExpression ce =
1471 getCharExpression (pattern, index, pLength, syntax);
1472 if (ce == null)
1473 throw new REException ("invalid escape sequence",
1474 REException.REG_ESCAPE, index);
1475 ch = ce.ch;
1476 index = index + ce.len - 1;
1478 RETokenRange t =
1479 new RETokenRange (subIndex, lastChar, ch, insens);
1480 if (insensUSASCII)
1481 t.unicodeAware = false;
1482 options.add (t);
1483 lastChar = 0;
1484 lastCharIsSet = false;
1485 index++;
1488 else if ((ch == '\\')
1489 && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1491 if (index == pLength)
1492 throw new REException (getLocalizedMessage ("class.no.end"),
1493 REException.REG_EBRACK, index);
1494 int posixID = -1;
1495 boolean negate = false;
1496 char asciiEsc = 0;
1497 boolean asciiEscIsSet = false;
1498 NamedProperty np = null;
1499 if (("dswDSW".indexOf (pattern[index]) != -1)
1500 && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS))
1502 switch (pattern[index])
1504 case 'D':
1505 negate = true;
1506 case 'd':
1507 posixID = RETokenPOSIX.DIGIT;
1508 break;
1509 case 'S':
1510 negate = true;
1511 case 's':
1512 posixID = RETokenPOSIX.SPACE;
1513 break;
1514 case 'W':
1515 negate = true;
1516 case 'w':
1517 posixID = RETokenPOSIX.ALNUM;
1518 break;
1521 if (("pP".indexOf (pattern[index]) != -1)
1522 && syntax.get (RESyntax.RE_NAMED_PROPERTY))
1524 np = getNamedProperty (pattern, index - 1, pLength);
1525 if (np == null)
1526 throw new REException ("invalid escape sequence",
1527 REException.REG_ESCAPE, index);
1528 index = index - 1 + np.len - 1;
1530 else
1532 CharExpression ce =
1533 getCharExpression (pattern, index - 1, pLength, syntax);
1534 if (ce == null)
1535 throw new REException ("invalid escape sequence",
1536 REException.REG_ESCAPE, index);
1537 asciiEsc = ce.ch;
1538 asciiEscIsSet = true;
1539 index = index - 1 + ce.len - 1;
1541 if (lastCharIsSet)
1543 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1544 if (insensUSASCII)
1545 t.unicodeAware = false;
1546 options.add (t);
1549 if (posixID != -1)
1551 RETokenPOSIX t =
1552 new RETokenPOSIX (subIndex, posixID, insens, negate);
1553 if (insensUSASCII)
1554 t.unicodeAware = false;
1555 options.add (t);
1557 else if (np != null)
1559 RETokenNamedProperty t =
1560 getRETokenNamedProperty (subIndex, np, insens, index);
1561 if (insensUSASCII)
1562 t.unicodeAware = false;
1563 options.add (t);
1565 else if (asciiEscIsSet)
1567 lastChar = asciiEsc;
1568 lastCharIsSet = true;
1570 else
1572 lastChar = pattern[index];
1573 lastCharIsSet = true;
1575 ++index;
1577 else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES))
1578 && (index < pLength) && (pattern[index] == ':'))
1580 CPStringBuilder posixSet = new CPStringBuilder ();
1581 index = getPosixSet (pattern, index + 1, posixSet);
1582 int posixId = RETokenPOSIX.intValue (posixSet.toString ());
1583 if (posixId != -1)
1585 RETokenPOSIX t =
1586 new RETokenPOSIX (subIndex, posixId, insens, false);
1587 if (insensUSASCII)
1588 t.unicodeAware = false;
1589 options.add (t);
1592 else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS)))
1594 ParseCharClassResult result =
1595 parseCharClass (subIndex, pattern, index, pLength, cflags,
1596 syntax, 0);
1597 addition.add (result.token);
1598 addition.add ("|");
1599 index = result.index;
1601 else if ((ch == '&') &&
1602 (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) &&
1603 (index < pLength) && (pattern[index] == '&'))
1605 if (returnAtAndOperator)
1607 ParseCharClassResult result = new ParseCharClassResult ();
1608 options.trimToSize ();
1609 if (additionAndAppeared)
1610 addition.add ("&");
1611 if (addition.size () == 0)
1612 addition = null;
1613 result.token = new RETokenOneOf (subIndex,
1614 options, addition, negative);
1615 result.index = index - 1;
1616 result.returnAtAndOperator = true;
1617 return result;
1619 // The precedence of the operator "&&" is the lowest.
1620 // So we postpone adding "&" until other elements
1621 // are added. And we insert Boolean.FALSE at the
1622 // beginning of the list of tokens following "&&".
1623 // So, "&&[a-b][k-m]" will be stored in the Vecter
1624 // addition in this order:
1625 // Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1626 if (additionAndAppeared)
1627 addition.add ("&");
1628 addition.add (Boolean.FALSE);
1629 additionAndAppeared = true;
1631 // The part on which "&&" operates may be either
1632 // (1) explicitly enclosed by []
1633 // or
1634 // (2) not enclosed by [] and terminated by the
1635 // next "&&" or the end of the character list.
1636 // Let the preceding else if block do the case (1).
1637 // We must do something in case of (2).
1638 if ((index + 1 < pLength) && (pattern[index + 1] != '['))
1640 ParseCharClassResult result =
1641 parseCharClass (subIndex, pattern, index + 1, pLength,
1642 cflags, syntax,
1643 RETURN_AT_AND);
1644 addition.add (result.token);
1645 addition.add ("|");
1646 // If the method returned at the next "&&", it is OK.
1647 // Otherwise we have eaten the mark of the end of this
1648 // character list "]". In this case we must give back
1649 // the end mark.
1650 index = (result.returnAtAndOperator ?
1651 result.index : result.index - 1);
1654 else
1656 if (lastCharIsSet)
1658 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1659 if (insensUSASCII)
1660 t.unicodeAware = false;
1661 options.add (t);
1663 lastChar = ch;
1664 lastCharIsSet = true;
1666 if (index == pLength)
1667 throw new REException (getLocalizedMessage ("class.no.end"),
1668 REException.REG_EBRACK, index);
1669 } // while in list
1670 // Out of list, index is one past ']'
1672 if (lastCharIsSet)
1674 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1675 if (insensUSASCII)
1676 t.unicodeAware = false;
1677 options.add (t);
1680 ParseCharClassResult result = new ParseCharClassResult ();
1681 // Create a new RETokenOneOf
1682 options.trimToSize ();
1683 if (additionAndAppeared)
1684 addition.add ("&");
1685 if (addition.size () == 0)
1686 addition = null;
1687 result.token = new RETokenOneOf (subIndex, options, addition, negative);
1688 result.index = index;
1689 return result;
1692 private static int getCharUnit (char[]input, int index, CharUnit unit,
1693 boolean quot) throws REException
1695 unit.ch = input[index++];
1696 unit.bk = (unit.ch == '\\'
1697 && (!quot || index >= input.length || input[index] == 'E'));
1698 if (unit.bk)
1699 if (index < input.length)
1700 unit.ch = input[index++];
1701 else
1702 throw new REException (getLocalizedMessage ("ends.with.backslash"),
1703 REException.REG_ESCAPE, index);
1704 return index;
1707 private static int parseInt (char[]input, int pos, int len, int radix)
1709 int ret = 0;
1710 for (int i = pos; i < pos + len; i++)
1712 ret = ret * radix + Character.digit (input[i], radix);
1714 return ret;
1718 * This class represents various expressions for a character.
1719 * "a" : 'a' itself.
1720 * "\0123" : Octal char 0123
1721 * "\x1b" : Hex char 0x1b
1722 * "\u1234" : Unicode char \u1234
1724 private static class CharExpression
1726 /** character represented by this expression */
1727 char ch;
1728 /** String expression */
1729 String expr;
1730 /** length of this expression */
1731 int len;
1732 public String toString ()
1734 return expr;
1738 private static CharExpression getCharExpression (char[]input, int pos,
1739 int lim, RESyntax syntax)
1741 CharExpression ce = new CharExpression ();
1742 char c = input[pos];
1743 if (c == '\\')
1745 if (pos + 1 >= lim)
1746 return null;
1747 c = input[pos + 1];
1748 switch (c)
1750 case 't':
1751 ce.ch = '\t';
1752 ce.len = 2;
1753 break;
1754 case 'n':
1755 ce.ch = '\n';
1756 ce.len = 2;
1757 break;
1758 case 'r':
1759 ce.ch = '\r';
1760 ce.len = 2;
1761 break;
1762 case 'x':
1763 case 'u':
1764 if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) ||
1765 (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR)))
1767 int l = 0;
1768 int expectedLength = (c == 'x' ? 2 : 4);
1769 for (int i = pos + 2; i < pos + 2 + expectedLength; i++)
1771 if (i >= lim)
1772 break;
1773 if (!((input[i] >= '0' && input[i] <= '9') ||
1774 (input[i] >= 'A' && input[i] <= 'F') ||
1775 (input[i] >= 'a' && input[i] <= 'f')))
1776 break;
1777 l++;
1779 if (l != expectedLength)
1780 return null;
1781 ce.ch = (char) (parseInt (input, pos + 2, l, 16));
1782 ce.len = l + 2;
1784 else
1786 ce.ch = c;
1787 ce.len = 2;
1789 break;
1790 case '0':
1791 if (syntax.get (RESyntax.RE_OCTAL_CHAR))
1793 int l = 0;
1794 for (int i = pos + 2; i < pos + 2 + 3; i++)
1796 if (i >= lim)
1797 break;
1798 if (input[i] < '0' || input[i] > '7')
1799 break;
1800 l++;
1802 if (l == 3 && input[pos + 2] > '3')
1803 l--;
1804 if (l <= 0)
1805 return null;
1806 ce.ch = (char) (parseInt (input, pos + 2, l, 8));
1807 ce.len = l + 2;
1809 else
1811 ce.ch = c;
1812 ce.len = 2;
1814 break;
1815 default:
1816 ce.ch = c;
1817 ce.len = 2;
1818 break;
1821 else
1823 ce.ch = input[pos];
1824 ce.len = 1;
1826 ce.expr = new String (input, pos, ce.len);
1827 return ce;
1831 * This class represents a substring in a pattern string expressing
1832 * a named property.
1833 * "\pA" : Property named "A"
1834 * "\p{prop}" : Property named "prop"
1835 * "\PA" : Property named "A" (Negated)
1836 * "\P{prop}" : Property named "prop" (Negated)
1838 private static class NamedProperty
1840 /** Property name */
1841 String name;
1842 /** Negated or not */
1843 boolean negate;
1844 /** length of this expression */
1845 int len;
1848 private static NamedProperty getNamedProperty (char[]input, int pos,
1849 int lim)
1851 NamedProperty np = new NamedProperty ();
1852 char c = input[pos];
1853 if (c == '\\')
1855 if (++pos >= lim)
1856 return null;
1857 c = input[pos++];
1858 switch (c)
1860 case 'p':
1861 np.negate = false;
1862 break;
1863 case 'P':
1864 np.negate = true;
1865 break;
1866 default:
1867 return null;
1869 c = input[pos++];
1870 if (c == '{')
1872 int p = -1;
1873 for (int i = pos; i < lim; i++)
1875 if (input[i] == '}')
1877 p = i;
1878 break;
1881 if (p < 0)
1882 return null;
1883 int len = p - pos;
1884 np.name = new String (input, pos, len);
1885 np.len = len + 4;
1887 else
1889 np.name = new String (input, pos - 1, 1);
1890 np.len = 3;
1892 return np;
1894 else
1895 return null;
1898 private static RETokenNamedProperty getRETokenNamedProperty (int subIndex,
1899 NamedProperty
1901 boolean insens,
1902 int index)
1903 throws REException
1907 return new RETokenNamedProperty (subIndex, np.name, insens, np.negate);
1909 catch (REException e)
1911 REException ree;
1912 ree = new REException (e.getMessage (), REException.REG_ESCAPE, index);
1913 ree.initCause (e);
1914 throw ree;
1919 * Checks if the regular expression matches the input in its entirety.
1921 * @param input The input text.
1923 public boolean isMatch (Object input)
1925 return isMatch (input, 0, 0);
1929 * Checks if the input string, starting from index, is an exact match of
1930 * this regular expression.
1932 * @param input The input text.
1933 * @param index The offset index at which the search should be begin.
1935 public boolean isMatch (Object input, int index)
1937 return isMatch (input, index, 0);
1942 * Checks if the input, starting from index and using the specified
1943 * execution flags, is an exact match of this regular expression.
1945 * @param input The input text.
1946 * @param index The offset index at which the search should be begin.
1947 * @param eflags The logical OR of any execution flags above.
1949 public boolean isMatch (Object input, int index, int eflags)
1951 return isMatchImpl (makeCharIndexed (input, index), index, eflags);
1954 private boolean isMatchImpl (CharIndexed input, int index, int eflags)
1956 if (firstToken == null) // Trivial case
1957 return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS);
1958 REMatch m = new REMatch (numSubs, index, eflags);
1959 if (firstToken.match (input, m))
1961 if (m != null)
1963 if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS)
1965 return true;
1969 return false;
1973 * Returns the maximum number of subexpressions in this regular expression.
1974 * If the expression contains branches, the value returned will be the
1975 * maximum subexpressions in any of the branches.
1977 public int getNumSubs ()
1979 return numSubs;
1982 // Overrides REToken.setUncle
1983 void setUncle (REToken uncle)
1985 if (lastToken != null)
1987 lastToken.setUncle (uncle);
1989 else
1990 super.setUncle (uncle); // to deal with empty subexpressions
1993 // Overrides REToken.chain
1995 boolean chain (REToken next)
1997 super.chain (next);
1998 setUncle (next);
1999 return true;
2003 * Returns the minimum number of characters that could possibly
2004 * constitute a match of this regular expression.
2006 public int getMinimumLength ()
2008 return minimumLength;
2011 public int getMaximumLength ()
2013 return maximumLength;
2017 * Returns an array of all matches found in the input.
2019 * If the regular expression allows the empty string to match, it will
2020 * substitute matches at all positions except the end of the input.
2022 * @param input The input text.
2023 * @return a non-null (but possibly zero-length) array of matches
2025 public REMatch[] getAllMatches (Object input)
2027 return getAllMatches (input, 0, 0);
2031 * Returns an array of all matches found in the input,
2032 * beginning at the specified index position.
2034 * If the regular expression allows the empty string to match, it will
2035 * substitute matches at all positions except the end of the input.
2037 * @param input The input text.
2038 * @param index The offset index at which the search should be begin.
2039 * @return a non-null (but possibly zero-length) array of matches
2041 public REMatch[] getAllMatches (Object input, int index)
2043 return getAllMatches (input, index, 0);
2047 * Returns an array of all matches found in the input string,
2048 * beginning at the specified index position and using the specified
2049 * execution flags.
2051 * If the regular expression allows the empty string to match, it will
2052 * substitute matches at all positions except the end of the input.
2054 * @param input The input text.
2055 * @param index The offset index at which the search should be begin.
2056 * @param eflags The logical OR of any execution flags above.
2057 * @return a non-null (but possibly zero-length) array of matches
2059 public REMatch[] getAllMatches (Object input, int index, int eflags)
2061 return getAllMatchesImpl (makeCharIndexed (input, index), index, eflags);
2064 // this has been changed since 1.03 to be non-overlapping matches
2065 private REMatch[] getAllMatchesImpl (CharIndexed input, int index,
2066 int eflags)
2068 List < REMatch > all = new ArrayList < REMatch > ();
2069 REMatch m = null;
2070 while ((m = getMatchImpl (input, index, eflags, null)) != null)
2072 all.add (m);
2073 index = m.getEndIndex ();
2074 if (m.end[0] == 0)
2075 { // handle pathological case of zero-length match
2076 index++;
2077 input.move (1);
2079 else
2081 input.move (m.end[0]);
2083 if (!input.isValid ())
2084 break;
2086 return all.toArray (new REMatch[all.size ()]);
2089 /* Implements abstract method REToken.match() */
2090 boolean match (CharIndexed input, REMatch mymatch)
2092 input.setHitEnd (mymatch);
2093 if (firstToken == null)
2095 return next (input, mymatch);
2098 // Note the start of this subexpression
2099 mymatch.start1[subIndex] = mymatch.index;
2101 return firstToken.match (input, mymatch);
2104 REMatch findMatch (CharIndexed input, REMatch mymatch)
2106 if (mymatch.backtrackStack == null)
2107 mymatch.backtrackStack = new BacktrackStack ();
2108 boolean b = match (input, mymatch);
2109 if (b)
2111 return mymatch;
2113 return null;
2117 * Returns the first match found in the input. If no match is found,
2118 * null is returned.
2120 * @param input The input text.
2121 * @return An REMatch instance referencing the match, or null if none.
2123 public REMatch getMatch (Object input)
2125 return getMatch (input, 0, 0);
2129 * Returns the first match found in the input, beginning
2130 * the search at the specified index. If no match is found,
2131 * returns null.
2133 * @param input The input text.
2134 * @param index The offset within the text to begin looking for a match.
2135 * @return An REMatch instance referencing the match, or null if none.
2137 public REMatch getMatch (Object input, int index)
2139 return getMatch (input, index, 0);
2143 * Returns the first match found in the input, beginning
2144 * the search at the specified index, and using the specified
2145 * execution flags. If no match is found, returns null.
2147 * @param input The input text.
2148 * @param index The offset index at which the search should be begin.
2149 * @param eflags The logical OR of any execution flags above.
2150 * @return An REMatch instance referencing the match, or null if none.
2152 public REMatch getMatch (Object input, int index, int eflags)
2154 return getMatch (input, index, eflags, null);
2158 * Returns the first match found in the input, beginning the search
2159 * at the specified index, and using the specified execution flags.
2160 * If no match is found, returns null. If a StringBuffer is
2161 * provided and is non-null, the contents of the input text from the
2162 * index to the beginning of the match (or to the end of the input,
2163 * if there is no match) are appended to the StringBuffer.
2165 * @param input The input text.
2166 * @param index The offset index at which the search should be begin.
2167 * @param eflags The logical OR of any execution flags above.
2168 * @param buffer The StringBuffer to save pre-match text in.
2169 * @return An REMatch instance referencing the match, or null if none. */
2170 public REMatch getMatch (Object input, int index, int eflags,
2171 CPStringBuilder buffer)
2173 return getMatchImpl (makeCharIndexed (input, index), index, eflags,
2174 buffer);
2177 REMatch getMatchImpl (CharIndexed input, int anchor, int eflags,
2178 CPStringBuilder buffer)
2180 boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
2181 boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
2182 RE re = (tryEntireMatch ? (RE) this.clone () : this);
2183 if (tryEntireMatch)
2185 RETokenEnd reEnd = new RETokenEnd (0, null);
2186 reEnd.setFake (true);
2187 re.chain (reEnd);
2189 // Create a new REMatch to hold results
2190 REMatch mymatch = new REMatch (numSubs, anchor, eflags);
2193 /* The following potimization is commented out because
2194 the matching should be tried even if the length of
2195 input is obviously too short in order that
2196 java.util.regex.Matcher#hitEnd() may work correctly.
2197 // Optimization: check if anchor + minimumLength > length
2198 if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
2200 if (re.match (input, mymatch))
2202 REMatch best = mymatch;
2203 // We assume that the match that coms first is the best.
2204 // And the following "The longer, the better" rule has
2205 // been commented out. The longest is not neccesarily
2206 // the best. For example, "a" out of "aaa" is the best
2207 // match for /a+?/.
2209 // Find best match of them all to observe leftmost longest
2210 while ((mymatch = mymatch.next) != null) {
2211 if (mymatch.index > best.index) {
2212 best = mymatch;
2216 best.end[0] = best.index;
2217 best.finish (input);
2218 input.setLastMatch (best);
2219 return best;
2221 /* End of the optimization commented out
2224 mymatch.clear (++anchor);
2225 // Append character to buffer if needed
2226 if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS)
2228 buffer.append (input.charAt (0));
2230 // java.util.regex.Matcher#hitEnd() requires that the search should
2231 // be tried at the end of input, so we use move1(1) instead of move(1)
2233 while (doMove && input.move1 (1));
2235 // Special handling at end of input for e.g. "$"
2236 if (minimumLength == 0)
2238 if (match (input, mymatch))
2240 mymatch.finish (input);
2241 return mymatch;
2245 return null;
2249 * Returns an REMatchEnumeration that can be used to iterate over the
2250 * matches found in the input text.
2252 * @param input The input text.
2253 * @return A non-null REMatchEnumeration instance.
2255 public REMatchEnumeration getMatchEnumeration (Object input)
2257 return getMatchEnumeration (input, 0, 0);
2262 * Returns an REMatchEnumeration that can be used to iterate over the
2263 * matches found in the input text.
2265 * @param input The input text.
2266 * @param index The offset index at which the search should be begin.
2267 * @return A non-null REMatchEnumeration instance, with its input cursor
2268 * set to the index position specified.
2270 public REMatchEnumeration getMatchEnumeration (Object input, int index)
2272 return getMatchEnumeration (input, index, 0);
2276 * Returns an REMatchEnumeration that can be used to iterate over the
2277 * matches found in the input text.
2279 * @param input The input text.
2280 * @param index The offset index at which the search should be begin.
2281 * @param eflags The logical OR of any execution flags above.
2282 * @return A non-null REMatchEnumeration instance, with its input cursor
2283 * set to the index position specified.
2285 public REMatchEnumeration getMatchEnumeration (Object input, int index,
2286 int eflags)
2288 return new REMatchEnumeration (this, makeCharIndexed (input, index),
2289 index, eflags);
2294 * Substitutes the replacement text for the first match found in the input.
2296 * @param input The input text.
2297 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2298 * @return A String interpolating the substituted text.
2299 * @see REMatch#substituteInto
2301 public String substitute (Object input, String replace)
2303 return substitute (input, replace, 0, 0);
2307 * Substitutes the replacement text for the first match found in the input
2308 * beginning at the specified index position. Specifying an index
2309 * effectively causes the regular expression engine to throw away the
2310 * specified number of characters.
2312 * @param input The input text.
2313 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2314 * @param index The offset index at which the search should be begin.
2315 * @return A String containing the substring of the input, starting
2316 * at the index position, and interpolating the substituted text.
2317 * @see REMatch#substituteInto
2319 public String substitute (Object input, String replace, int index)
2321 return substitute (input, replace, index, 0);
2325 * Substitutes the replacement text for the first match found in the input
2326 * string, beginning at the specified index position and using the
2327 * specified execution flags.
2329 * @param input The input text.
2330 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2331 * @param index The offset index at which the search should be begin.
2332 * @param eflags The logical OR of any execution flags above.
2333 * @return A String containing the substring of the input, starting
2334 * at the index position, and interpolating the substituted text.
2335 * @see REMatch#substituteInto
2337 public String substitute (Object input, String replace, int index,
2338 int eflags)
2340 return substituteImpl (makeCharIndexed (input, index), replace, index,
2341 eflags);
2344 private String substituteImpl (CharIndexed input, String replace, int index,
2345 int eflags)
2347 CPStringBuilder buffer = new CPStringBuilder ();
2348 REMatch m = getMatchImpl (input, index, eflags, buffer);
2349 if (m == null)
2350 return buffer.toString ();
2351 buffer.append (getReplacement (replace, m, eflags));
2352 if (input.move (m.end[0]))
2356 buffer.append (input.charAt (0));
2358 while (input.move (1));
2360 return buffer.toString ();
2364 * Substitutes the replacement text for each non-overlapping match found
2365 * in the input text.
2367 * @param input The input text.
2368 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2369 * @return A String interpolating the substituted text.
2370 * @see REMatch#substituteInto
2372 public String substituteAll (Object input, String replace)
2374 return substituteAll (input, replace, 0, 0);
2378 * Substitutes the replacement text for each non-overlapping match found
2379 * in the input text, starting at the specified index.
2381 * If the regular expression allows the empty string to match, it will
2382 * substitute matches at all positions except the end of the input.
2384 * @param input The input text.
2385 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2386 * @param index The offset index at which the search should be begin.
2387 * @return A String containing the substring of the input, starting
2388 * at the index position, and interpolating the substituted text.
2389 * @see REMatch#substituteInto
2391 public String substituteAll (Object input, String replace, int index)
2393 return substituteAll (input, replace, index, 0);
2397 * Substitutes the replacement text for each non-overlapping match found
2398 * in the input text, starting at the specified index and using the
2399 * specified execution flags.
2401 * @param input The input text.
2402 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2403 * @param index The offset index at which the search should be begin.
2404 * @param eflags The logical OR of any execution flags above.
2405 * @return A String containing the substring of the input, starting
2406 * at the index position, and interpolating the substituted text.
2407 * @see REMatch#substituteInto
2409 public String substituteAll (Object input, String replace, int index,
2410 int eflags)
2412 return substituteAllImpl (makeCharIndexed (input, index), replace, index,
2413 eflags);
2416 private String substituteAllImpl (CharIndexed input, String replace,
2417 int index, int eflags)
2419 CPStringBuilder buffer = new CPStringBuilder ();
2420 REMatch m;
2421 while ((m = getMatchImpl (input, index, eflags, buffer)) != null)
2423 buffer.append (getReplacement (replace, m, eflags));
2424 index = m.getEndIndex ();
2425 if (m.end[0] == 0)
2427 char ch = input.charAt (0);
2428 if (ch != CharIndexed.OUT_OF_BOUNDS)
2429 buffer.append (ch);
2430 input.move (1);
2432 else
2434 input.move (m.end[0]);
2437 if (!input.isValid ())
2438 break;
2440 return buffer.toString ();
2443 public static String getReplacement (String replace, REMatch m, int eflags)
2445 if ((eflags & REG_NO_INTERPOLATE) > 0)
2446 return replace;
2447 else
2449 if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0)
2451 CPStringBuilder sb = new CPStringBuilder ();
2452 int l = replace.length ();
2453 for (int i = 0; i < l; i++)
2455 char c = replace.charAt (i);
2456 switch (c)
2458 case '\\':
2459 i++;
2460 // Let StringIndexOutOfBoundsException be thrown.
2461 sb.append (replace.charAt (i));
2462 break;
2463 case '$':
2464 int i1 = i + 1;
2465 while (i1 < replace.length () &&
2466 Character.isDigit (replace.charAt (i1)))
2467 i1++;
2468 sb.append (m.substituteInto (replace.substring (i, i1)));
2469 i = i1 - 1;
2470 break;
2471 default:
2472 sb.append (c);
2475 return sb.toString ();
2477 else
2478 return m.substituteInto (replace);
2482 /* Helper function for constructor */
2483 private void addToken (REToken next)
2485 if (next == null)
2486 return;
2487 minimumLength += next.getMinimumLength ();
2488 int nmax = next.getMaximumLength ();
2489 if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2490 maximumLength += nmax;
2491 else
2492 maximumLength = Integer.MAX_VALUE;
2494 if (firstToken == null)
2496 lastToken = firstToken = next;
2498 else
2500 // if chain returns false, it "rejected" the token due to
2501 // an optimization, and next was combined with lastToken
2502 if (lastToken.chain (next))
2504 lastToken = next;
2509 private static REToken setRepeated (REToken current, int min, int max,
2510 int index) throws REException
2512 if (current == null)
2513 throw new REException (getLocalizedMessage ("repeat.no.token"),
2514 REException.REG_BADRPT, index);
2515 return new RETokenRepeated (current.subIndex, current, min, max);
2518 private static int getPosixSet (char[]pattern, int index,
2519 CPStringBuilder buf)
2521 // Precondition: pattern[index-1] == ':'
2522 // we will return pos of closing ']'.
2523 int i;
2524 for (i = index; i < (pattern.length - 1); i++)
2526 if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
2527 return i + 2;
2528 buf.append (pattern[i]);
2530 return index; // didn't match up
2533 private int getMinMax (char[]input, int index, IntPair minMax,
2534 RESyntax syntax) throws REException
2536 // Precondition: input[index-1] == '{', minMax != null
2538 boolean mustMatch = !syntax.get (RESyntax.RE_NO_BK_BRACES);
2539 int startIndex = index;
2540 if (index == input.length)
2542 if (mustMatch)
2543 throw new REException (getLocalizedMessage ("unmatched.brace"),
2544 REException.REG_EBRACE, index);
2545 else
2546 return startIndex;
2549 int min, max = 0;
2550 CharUnit unit = new CharUnit ();
2551 CPStringBuilder buf = new CPStringBuilder ();
2553 // Read string of digits
2556 index = getCharUnit (input, index, unit, false);
2557 if (Character.isDigit (unit.ch))
2558 buf.append (unit.ch);
2560 while ((index != input.length) && Character.isDigit (unit.ch));
2562 // Check for {} tomfoolery
2563 if (buf.length () == 0)
2565 if (mustMatch)
2566 throw new REException (getLocalizedMessage ("interval.error"),
2567 REException.REG_EBRACE, index);
2568 else
2569 return startIndex;
2572 min = Integer.parseInt (buf.toString ());
2574 if ((unit.ch == '}') && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2575 max = min;
2576 else if (index == input.length)
2577 if (mustMatch)
2578 throw new REException (getLocalizedMessage ("interval.no.end"),
2579 REException.REG_EBRACE, index);
2580 else
2581 return startIndex;
2582 else
2583 if ((unit.ch == ',') && !unit.bk)
2585 buf = new CPStringBuilder ();
2586 // Read string of digits
2587 while (((index =
2588 getCharUnit (input, index, unit, false)) != input.length)
2589 && Character.isDigit (unit.ch))
2590 buf.append (unit.ch);
2592 if (!
2593 ((unit.ch == '}')
2594 && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2595 if (mustMatch)
2596 throw new REException (getLocalizedMessage ("interval.error"),
2597 REException.REG_EBRACE, index);
2598 else
2599 return startIndex;
2601 // This is the case of {x,}
2602 if (buf.length () == 0)
2603 max = Integer.MAX_VALUE;
2604 else
2605 max = Integer.parseInt (buf.toString ());
2607 else if (mustMatch)
2608 throw new REException (getLocalizedMessage ("interval.error"),
2609 REException.REG_EBRACE, index);
2610 else
2611 return startIndex;
2613 // We know min and max now, and they are valid.
2615 minMax.first = min;
2616 minMax.second = max;
2618 // return the index following the '}'
2619 return index;
2623 * Return a human readable form of the compiled regular expression,
2624 * useful for debugging.
2626 public String toString ()
2628 CPStringBuilder sb = new CPStringBuilder ();
2629 dump (sb);
2630 return sb.toString ();
2633 void dump (CPStringBuilder os)
2635 os.append ("(?#startRE subIndex=" + subIndex + ")");
2636 if (subIndex == 0)
2637 os.append ("?:");
2638 if (firstToken != null)
2639 firstToken.dumpAll (os);
2640 if (subIndex == 0)
2641 os.append (")");
2642 os.append ("(?#endRE subIndex=" + subIndex + ")");
2645 // Cast input appropriately or throw exception
2646 // This method was originally a private method, but has been made
2647 // public because java.util.regex.Matcher uses this.
2648 public static CharIndexed makeCharIndexed (Object input, int index)
2650 // The case where input is already a CharIndexed is supposed
2651 // be the most likely because this is the case with
2652 // java.util.regex.Matcher.
2653 // We could let a String or a CharSequence fall through
2654 // to final input, but since it'a very likely input type,
2655 // we check it first.
2656 if (input instanceof CharIndexed)
2658 CharIndexed ci = (CharIndexed) input;
2659 ci.setAnchor (index);
2660 return ci;
2662 else if (input instanceof CharSequence)
2663 return new CharIndexedCharSequence ((CharSequence) input, index);
2664 else if (input instanceof String)
2665 return new CharIndexedString ((String) input, index);
2666 else if (input instanceof char[])
2667 return new CharIndexedCharArray ((char[]) input, index);
2668 else if (input instanceof StringBuffer)
2669 return new CharIndexedStringBuffer ((StringBuffer) input, index);
2670 else if (input instanceof InputStream)
2671 return new CharIndexedInputStream ((InputStream) input, index);
2672 else
2673 return new CharIndexedString (input.toString (), index);