2 Copyright (C) 2006 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
38 package gnu
.java
.util
.regex
;
40 import gnu
.java
.lang
.CPStringBuilder
;
42 import java
.io
.InputStream
;
43 import java
.io
.Serializable
;
45 import java
.util
.ArrayList
;
46 import java
.util
.List
;
47 import java
.util
.Locale
;
48 import java
.util
.PropertyResourceBundle
;
49 import java
.util
.ResourceBundle
;
52 * RE provides the user interface for compiling and matching regular
55 * A regular expression object (class RE) is compiled by constructing it
56 * from a String, StringBuffer or character array, with optional
57 * compilation flags (below)
58 * and an optional syntax specification (see RESyntax; if not specified,
59 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
61 * Once compiled, a regular expression object is reusable as well as
62 * threadsafe: multiple threads can use the RE instance simultaneously
63 * to match against different input text.
65 * Various methods attempt to match input text against a compiled
66 * regular expression. These methods are:
67 * <LI><code>isMatch</code>: returns true if the input text in its
68 * entirety matches the regular expression pattern.
69 * <LI><code>getMatch</code>: returns the first match found in the
70 * input text, or null if no match is found.
71 * <LI><code>getAllMatches</code>: returns an array of all
72 * non-overlapping matches found in the input text. If no matches are
73 * found, the array is zero-length.
74 * <LI><code>substitute</code>: substitute the first occurrence of the
75 * pattern in the input text with a replacement string (which may
76 * include metacharacters $0-$9, see REMatch.substituteInto).
77 * <LI><code>substituteAll</code>: same as above, but repeat for each
78 * match before returning.
79 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
80 * object that allows iteration over the matches (see
81 * REMatchEnumeration for some reasons why you may want to do this
82 * instead of using <code>getAllMatches</code>).
85 * These methods all have similar argument lists. The input can be a
86 * CharIndexed, String, a character array, a StringBuffer, or an
87 * InputStream of some sort. Note that when using an
88 * InputStream, the stream read position cannot be guaranteed after
89 * attempting a match (this is not a bug, but a consequence of the way
90 * regular expressions work). Using an REMatchEnumeration can
91 * eliminate most positioning problems.
93 * Although the input object can be of various types, it is recommended
94 * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
95 * can show the last match found on this input, which helps the expression
96 * \G work as the end of the previous match.
100 * The optional index argument specifies the offset from the beginning
101 * of the text at which the search should start (see the descriptions
102 * of some of the execution flags for how this can affect positional
103 * pattern operators). For an InputStream, this means an
104 * offset from the current read position, so subsequent calls with the
105 * same index argument on an InputStream will not
106 * necessarily access the same position on the stream, whereas
107 * repeated searches at a given index in a fixed string will return
108 * consistent results.
111 * You can optionally affect the execution environment by using a
112 * combination of execution flags (constants listed below).
115 * All operations on a regular expression are performed in a
116 * thread-safe manner.
118 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
119 * @version 1.1.5-dev, to be released
122 public class RE
extends REToken
125 private static final class IntPair
implements Serializable
127 public int first
, second
;
// Holder for one unit read from the pattern.  The parser passes an instance
// to getCharUnit() and afterwards reads its ch and bk members
// (e.g. unit.ch == '[' and !unit.bk).
// NOTE(review): the member declarations are not visible in this view --
// confirm their types against the full file.
130 private static final class CharUnit
implements Serializable
136 // This String will be returned by getVersion()
137 private static final String VERSION
= "1.1.5-dev";
139 // The localized strings are kept in a separate file
140 // Used by getLocalizedMessage().
141 private static ResourceBundle messages
;
143 // Name of the bundle that contains the localized messages.
144 private static final String bundle
= "gnu/java/util/regex/MessagesBundle";
146 // These are, respectively, the first and last tokens in our linked list
147 // If there is only one token, firstToken == lastToken
148 private REToken firstToken
, lastToken
;
150 // This is the number of subexpressions in this regular expression,
151 // with a minimum value of zero. Returned by getNumSubs()
154 /** Minimum length, in characters, of any possible match. */
155 private int minimumLength
;
156 private int maximumLength
;
159 * Compilation flag. Do not differentiate case. Subsequent
160 * searches using this RE will be case insensitive.
162 public static final int REG_ICASE
= 0x02;
165 * Compilation flag. The match-any-character operator (dot)
166 * will match a newline character. When set this overrides the syntax
167 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
168 * the "/s" operator in Perl.
170 public static final int REG_DOT_NEWLINE
= 0x04;
173 * Compilation flag. Use multiline mode. In this mode, the ^ and $
174 * anchors will match based on newlines within the input. This is
175 * equivalent to the "/m" operator in Perl.
177 public static final int REG_MULTILINE
= 0x08;
181 * The match-beginning operator (^) will not match at the beginning
182 * of the input string. Useful for matching on a substring when you
183 * know the context of the input is such that position zero of the
184 * input to the match test is not actually position zero of the text.
186 * This example demonstrates the results of various ways of matching on
190 * String s = "food bar fool";<BR>
191 * RE exp = new RE("^foo.");<BR>
192 * REMatch m0 = exp.getMatch(s);<BR>
193 * REMatch m1 = exp.getMatch(s.substring(8));<BR>
194 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
195 * REMatch m3 = exp.getMatch(s,8); <BR>
196 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
199 * // m0.toString(): "food"<BR>
200 * // m1.toString(): "fool"<BR>
201 * // m2.toString(): null<BR>
202 * // m3.toString(): null<BR>
203 * // m4.toString(): "fool"<BR>
206 public static final int REG_NOTBOL
= 0x10;
210 * The match-end operator ($) does not match at the end
211 * of the input string. Useful for matching on substrings.
213 public static final int REG_NOTEOL
= 0x20;
217 * When a match method is invoked that starts matching at a non-zero
218 * index into the input, treat the input as if it begins at the index
219 * given. The effect of this flag is that the engine does not "see"
220 * any text in the input before the given index. This is useful so
221 * that the match-beginning operator (^) matches not at position 0
222 * in the input string, but at the position the search started at
223 * (based on the index input given to the getMatch function). See
224 * the example under REG_NOTBOL. It also affects the use of the \<
227 public static final int REG_ANCHORINDEX
= 0x40;
231 * The substitute and substituteAll methods will not attempt to
232 * interpolate occurrences of $1-$9 in the replacement text with
233 * the corresponding subexpressions. For example, you may want to
234 * replace all matches of "one dollar" with "$1".
236 public static final int REG_NO_INTERPOLATE
= 0x80;
240 * Try to match the whole input string. An implicit match-end operator
241 * is added to this regexp.
243 public static final int REG_TRY_ENTIRE_MATCH
= 0x0100;
247 * The substitute and substituteAll methods will treat the
248 * character '\' in the replacement as an escape to a literal
249 * character. In this case "\n", "\$", "\\", "\x40" and "\012"
250 * will become "n", "$", "\", "x40" and "012" respectively.
251 * This flag has no effect if REG_NO_INTERPOLATE is set on.
253 public static final int REG_REPLACE_USE_BACKSLASHESCAPE
= 0x0200;
256 * Compilation flag. Allow whitespace and comments in pattern.
257 * This is equivalent to the "/x" operator in Perl.
259 public static final int REG_X_COMMENTS
= 0x0400;
262 * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
264 public static final int REG_ICASE_USASCII
= 0x0800;
268 * Do not move the position at which the search begins. If not set,
269 * the starting position will be moved until a match is found.
271 public static final int REG_FIX_STARTING_POSITION
= 0x1000;
// NOTE(review): the method body is not visible in this view; presumably it
// returns the VERSION constant -- confirm against the full file.
273 /** Returns a string representing the version of the gnu.regexp package. */
274 public static final String
version ()
279 // Retrieves a message from the ResourceBundle
280 static final String
getLocalizedMessage (String key
)
282 if (messages
== null)
284 PropertyResourceBundle
.getBundle (bundle
, Locale
.getDefault ());
285 return messages
.getString (key
);
/**
 * Compiles a regular expression with no compilation flags set, using the
 * default syntax (RESyntax.RE_SYNTAX_PERL5).
 *
 * @param pattern A regular expression pattern, in the form of a String,
 *   StringBuffer, StringBuilder, CPStringBuilder or char[].  Other input
 *   types will be converted to strings using the toString() method.
 * @exception REException The input pattern could not be parsed.
 * @exception NullPointerException The pattern was null.
 */
public RE (Object pattern) throws REException
{
  // No flags, Perl 5 syntax; the two trailing zeros are the index
  // arguments of the internal constructor (myIndex, nextSub).
  this (pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
/**
 * Compiles a regular expression with the given compilation flags, using
 * the default syntax (RESyntax.RE_SYNTAX_PERL5).
 *
 * @param pattern A regular expression pattern, in the form of a String,
 *   StringBuffer, StringBuilder, CPStringBuilder or char[].  Other input
 *   types will be converted to strings using the toString() method.
 * @param cflags The logical OR of any combination of the compilation
 *   flags listed above.
 * @exception REException The input pattern could not be parsed.
 * @exception NullPointerException The pattern was null.
 */
public RE (Object pattern, int cflags) throws REException
{
  // Perl 5 syntax; the two trailing zeros are the index arguments of the
  // internal constructor (myIndex, nextSub).
  this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
/**
 * Compiles a regular expression with the given compilation flags and
 * regular expression syntax.
 *
 * @param pattern A regular expression pattern, in the form of a String,
 *   StringBuffer, StringBuilder, CPStringBuilder or char[].  Other input
 *   types will be converted to strings using the toString() method.
 * @param cflags The logical OR of any combination of the compilation
 *   flags listed above.
 * @param syntax The type of regular expression syntax to use.
 * @exception REException The input pattern could not be parsed.
 * @exception NullPointerException The pattern was null.
 */
public RE (Object pattern, int cflags, RESyntax syntax) throws REException
{
  // The two trailing zeros are the index arguments of the internal
  // constructor (myIndex, nextSub).
  this (pattern, cflags, syntax, 0, 0);
}
336 // internal constructor used for alternation
// The parser calls this with the tokens collected so far (firstToken,
// lastToken, numSubs, subIndex, minimumLength, ...) to wrap them as one
// branch of an alternation.
// NOTE(review): first, last and subs are not used in the statements visible
// here; presumably they are stored by lines missing from this view -- confirm.
337 private RE (REToken first
, REToken last
, int subs
, int subIndex
,
338 int minLength
, int maxLength
)
// Record the length bounds supplied by the caller.
344 minimumLength
= minLength
;
345 maximumLength
= maxLength
;
// Append an RETokenEndSub for this subexpression index to the token list.
346 addToken (new RETokenEndSub (subIndex
));
// Internal constructor that every public constructor delegates to; the
// parser also uses a constructor of this shape for nested subexpressions.
//
// patternObj: the pattern to compile (see initialize for accepted types).
// cflags:     compilation flags.
// syntax:     regular expression syntax to use.
// myIndex:    subexpression index of this token.
// nextSub:    forwarded unchanged to initialize.
private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex,
            int nextSub) throws REException
{
  super (myIndex);        // Subexpression index of this token.
  initialize (patternObj, cflags, syntax, myIndex, nextSub);
}
356 // For use by subclasses
362 // The meat of construction
363 protected void initialize (Object patternObj
, int cflags
, RESyntax syntax
,
364 int myIndex
, int nextSub
) throws REException
367 if (patternObj
instanceof String
)
369 pattern
= ((String
) patternObj
).toCharArray ();
371 else if (patternObj
instanceof char[])
373 pattern
= (char[]) patternObj
;
375 else if (patternObj
instanceof StringBuffer
)
377 pattern
= new char[((StringBuffer
) patternObj
).length ()];
378 ((StringBuffer
) patternObj
).getChars (0, pattern
.length
, pattern
, 0);
380 else if (patternObj
instanceof StringBuilder
)
382 pattern
= new char[((StringBuilder
) patternObj
).length ()];
383 ((StringBuilder
) patternObj
).getChars (0, pattern
.length
, pattern
, 0);
385 else if (patternObj
instanceof CPStringBuilder
)
387 pattern
= new char[((CPStringBuilder
) patternObj
).length ()];
388 ((CPStringBuilder
) patternObj
).getChars (0, pattern
.length
, pattern
,
393 pattern
= patternObj
.toString ().toCharArray ();
396 int pLength
= pattern
.length
;
398 numSubs
= 0; // Number of subexpressions in this token.
399 ArrayList
< REToken
> branches
= null;
401 // linked list of tokens (sort of -- some closed loops can exist)
402 firstToken
= lastToken
= null;
404 // Precalculate these so we don't pay for the math every time we
405 // need to access them.
406 boolean insens
= ((cflags
& REG_ICASE
) > 0);
407 boolean insensUSASCII
= ((cflags
& REG_ICASE_USASCII
) > 0);
409 // Parse pattern into tokens. Does anyone know if it's more efficient
410 // to use char[] than a String.charAt()? I'm assuming so.
412 // index tracks the position in the char array
415 // this will be the current parse character (pattern[index])
416 CharUnit unit
= new CharUnit ();
418 // This is used for {x,y} calculations
419 IntPair minMax
= new IntPair ();
421 // Buffer a token so we can create a TokenRepeated, etc.
422 REToken currentToken
= null;
423 boolean quot
= false;
425 // Saved syntax and flags.
426 RESyntax savedSyntax
= null;
428 boolean flagsSaved
= false;
430 while (index
< pLength
)
432 // read the next character unit (including backslash escapes)
433 index
= getCharUnit (pattern
, index
, unit
, quot
);
441 else if (unit
.ch
== 'E')
449 if (((cflags
& REG_X_COMMENTS
) > 0) && (!unit
.bk
) && (!quot
))
451 if (Character
.isWhitespace (unit
.ch
))
457 for (int i
= index
; i
< pLength
; i
++)
459 if (pattern
[i
] == '\n')
464 else if (pattern
[i
] == '\r')
466 if (i
+ 1 < pLength
&& pattern
[i
+ 1] == '\n')
482 // ALTERNATION OPERATOR
483 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
484 // not available if RE_LIMITED_OPS is set
486 // TODO: the '\n' literal here should be a test against REToken.newline,
487 // which unfortunately may be more than a single character.
489 && (syntax
.get (RESyntax
.RE_NO_BK_VBAR
) ^
(unit
.bk
|| quot
)))
490 || (syntax
.get (RESyntax
.RE_NEWLINE_ALT
) && (unit
.ch
== '\n')
491 && !(unit
.bk
|| quot
)))
492 && !syntax
.get (RESyntax
.RE_LIMITED_OPS
))
494 // make everything up to here be a branch. create vector if nec.
495 addToken (currentToken
);
497 new RE (firstToken
, lastToken
, numSubs
, subIndex
, minimumLength
,
501 if (branches
== null)
503 branches
= new ArrayList
< REToken
> ();
505 branches
.add (theBranch
);
506 firstToken
= lastToken
= currentToken
= null;
509 // INTERVAL OPERATOR:
510 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
511 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
514 // what is proper interpretation of '{' at start of string?
516 // This method used to check "repeat.empty.token" to avoid such regexp
517 // as "(a*){2,}", but now "repeat.empty.token" is allowed.
519 else if ((unit
.ch
== '{') && syntax
.get (RESyntax
.RE_INTERVALS
)
521 get (RESyntax
.RE_NO_BK_BRACES
) ^
(unit
.bk
|| quot
)))
523 int newIndex
= getMinMax (pattern
, index
, minMax
, syntax
);
524 if (newIndex
> index
)
526 if (minMax
.first
> minMax
.second
)
528 REException (getLocalizedMessage ("interval.order"),
529 REException
.REG_BADRPT
, newIndex
);
530 if (currentToken
== null)
532 REException (getLocalizedMessage ("repeat.no.token"),
533 REException
.REG_BADRPT
, newIndex
);
534 if (currentToken
instanceof RETokenRepeated
)
536 REException (getLocalizedMessage ("repeat.chained"),
537 REException
.REG_BADRPT
, newIndex
);
538 if (currentToken
instanceof RETokenWordBoundary
539 || currentToken
instanceof RETokenWordBoundary
)
541 REException (getLocalizedMessage ("repeat.assertion"),
542 REException
.REG_BADRPT
, newIndex
);
545 setRepeated (currentToken
, minMax
.first
, minMax
.second
,
550 addToken (currentToken
);
551 currentToken
= new RETokenChar (subIndex
, unit
.ch
, insens
);
553 currentToken
.unicodeAware
= false;
560 else if ((unit
.ch
== '[') && !(unit
.bk
|| quot
))
562 // Create a new RETokenOneOf
563 ParseCharClassResult result
=
564 parseCharClass (subIndex
, pattern
, index
, pLength
, cflags
,
566 addToken (currentToken
);
567 currentToken
= result
.token
;
568 index
= result
.index
;
572 // (...) | \(...\) depending on RE_NO_BK_PARENS
574 else if ((unit
.ch
== '(')
576 get (RESyntax
.RE_NO_BK_PARENS
) ^
(unit
.bk
|| quot
)))
578 boolean pure
= false;
579 boolean comment
= false;
580 boolean lookAhead
= false;
581 boolean lookBehind
= false;
582 boolean independent
= false;
583 boolean negativelh
= false;
584 boolean negativelb
= false;
585 if ((index
+ 1 < pLength
) && (pattern
[index
] == '?'))
587 switch (pattern
[index
+ 1])
590 if (syntax
.get (RESyntax
.RE_LOOKAHEAD
))
599 if (syntax
.get (RESyntax
.RE_LOOKAHEAD
))
607 // We assume that if the syntax supports look-ahead,
608 // it also supports look-behind.
609 if (syntax
.get (RESyntax
.RE_LOOKAHEAD
))
612 switch (pattern
[index
+ 1])
628 // We assume that if the syntax supports look-ahead,
629 // it also supports independent group.
630 if (syntax
.get (RESyntax
.RE_LOOKAHEAD
))
644 if (!syntax
.get (RESyntax
.RE_EMBEDDED_FLAGS
))
646 // Set or reset syntax flags.
647 int flagIndex
= index
+ 1;
649 RESyntax newSyntax
= new RESyntax (syntax
);
650 int newCflags
= cflags
;
651 boolean negate
= false;
652 while (flagIndex
< pLength
&& endFlag
< 0)
654 switch (pattern
[flagIndex
])
658 newCflags
&= ~REG_ICASE
;
660 newCflags
|= REG_ICASE
;
665 newSyntax
.setLineSeparator (RESyntax
.
666 DEFAULT_LINE_SEPARATOR
);
668 newSyntax
.setLineSeparator ("\n");
673 newCflags
&= ~REG_MULTILINE
;
675 newCflags
|= REG_MULTILINE
;
680 newCflags
&= ~REG_DOT_NEWLINE
;
682 newCflags
|= REG_DOT_NEWLINE
;
687 newCflags
|= REG_ICASE_USASCII
;
689 newCflags
&= ~REG_ICASE_USASCII
;
694 newCflags
&= ~REG_X_COMMENTS
;
696 newCflags
|= REG_X_COMMENTS
;
705 endFlag
= pattern
[flagIndex
];
709 REException (getLocalizedMessage
711 REException
.REG_BADRPT
, index
);
718 insens
= ((cflags
& REG_ICASE
) > 0);
719 insensUSASCII
= ((cflags
& REG_ICASE_USASCII
) > 0);
720 // This can be treated as though it were a comment.
722 index
= flagIndex
- 1;
727 savedSyntax
= syntax
;
728 savedCflags
= cflags
;
732 insens
= ((cflags
& REG_ICASE
) > 0);
733 insensUSASCII
= ((cflags
& REG_ICASE_USASCII
) > 0);
734 index
= flagIndex
- 1;
735 // Fall through to the next case.
740 REException (getLocalizedMessage
742 REException
.REG_ESUBREG
, index
);
745 if (syntax
.get (RESyntax
.RE_PURE_GROUPING
))
752 if (syntax
.get (RESyntax
.RE_COMMENTS
))
759 REException (getLocalizedMessage ("repeat.no.token"),
760 REException
.REG_BADRPT
, index
);
764 if (index
>= pLength
)
767 REException (getLocalizedMessage ("unmatched.paren"),
768 REException
.REG_ESUBREG
, index
);
771 // find end of subexpression
772 int endIndex
= index
;
773 int nextIndex
= index
;
777 getCharUnit (pattern
, endIndex
, unit
, false)) > 0)
778 && !(nested
== 0 && (unit
.ch
== ')')
780 get (RESyntax
.RE_NO_BK_PARENS
) ^
(unit
.bk
783 if ((endIndex
= nextIndex
) >= pLength
)
785 REException (getLocalizedMessage ("subexpr.no.end"),
786 REException
.REG_ESUBREG
, nextIndex
);
788 if ((unit
.ch
== '[') && !(unit
.bk
|| quot
))
790 // I hate to do something similar to the LIST OPERATOR matters
792 int listIndex
= nextIndex
;
793 if (listIndex
< pLength
&& pattern
[listIndex
] == '^')
795 if (listIndex
< pLength
&& pattern
[listIndex
] == ']')
797 int listEndIndex
= -1;
799 while (listIndex
< pLength
&& listEndIndex
< 0)
801 switch (pattern
[listIndex
++])
807 // Sun's API document says that regexp like "[a-d[m-p]]"
808 // is legal. Even something like "[[[^]]]]" is accepted.
810 if (listIndex
< pLength
811 && pattern
[listIndex
] == '^')
813 if (listIndex
< pLength
814 && pattern
[listIndex
] == ']')
819 listEndIndex
= listIndex
;
824 if (listEndIndex
>= 0)
826 nextIndex
= listEndIndex
;
827 if ((endIndex
= nextIndex
) >= pLength
)
829 REException (getLocalizedMessage ("subexpr.no.end"),
830 REException
.REG_ESUBREG
, nextIndex
);
835 REException (getLocalizedMessage ("subexpr.no.end"),
836 REException
.REG_ESUBREG
, nextIndex
);
838 else if (unit
.ch
== '('
840 get (RESyntax
.RE_NO_BK_PARENS
) ^
(unit
.bk
843 else if (unit
.ch
== ')'
845 get (RESyntax
.RE_NO_BK_PARENS
) ^
(unit
.bk
850 // endIndex is now position at a ')','\)'
851 // nextIndex is end of string or position after ')' or '\)'
857 // create RE subexpression as token.
858 addToken (currentToken
);
864 int useIndex
= (pure
|| lookAhead
|| lookBehind
865 || independent
) ?
0 : nextSub
+ numSubs
;
867 new RE (String
.valueOf (pattern
, index
, endIndex
- index
).
868 toCharArray (), cflags
, syntax
, useIndex
,
870 numSubs
+= ((RE
) currentToken
).getNumSubs ();
875 new RETokenLookAhead (currentToken
, negativelh
);
880 new RETokenLookBehind (currentToken
, negativelb
);
882 else if (independent
)
884 currentToken
= new RETokenIndependent (currentToken
);
890 syntax
= savedSyntax
;
891 cflags
= savedCflags
;
892 insens
= ((cflags
& REG_ICASE
) > 0);
893 insensUSASCII
= ((cflags
& REG_ICASE_USASCII
) > 0);
899 // UNMATCHED RIGHT PAREN
900 // ) or \) throw exception if
901 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
902 else if (!syntax
.get (RESyntax
.RE_UNMATCHED_RIGHT_PAREN_ORD
)
905 get (RESyntax
.RE_NO_BK_PARENS
) ^
(unit
.bk
|| quot
))))
907 throw new REException (getLocalizedMessage ("unmatched.paren"),
908 REException
.REG_EPAREN
, index
);
911 // START OF LINE OPERATOR
914 else if ((unit
.ch
== '^') && !(unit
.bk
|| quot
))
916 addToken (currentToken
);
918 RETokenStart token
= null;
919 if ((cflags
& REG_MULTILINE
) > 0)
921 String sep
= syntax
.getLineSeparator ();
924 token
= new RETokenStart (subIndex
, null, true);
928 token
= new RETokenStart (subIndex
, sep
);
933 token
= new RETokenStart (subIndex
, null);
938 // END OF LINE OPERATOR
941 else if ((unit
.ch
== '$') && !(unit
.bk
|| quot
))
943 addToken (currentToken
);
945 RETokenEnd token
= null;
946 if ((cflags
& REG_MULTILINE
) > 0)
948 String sep
= syntax
.getLineSeparator ();
951 token
= new RETokenEnd (subIndex
, null, true);
955 token
= new RETokenEnd (subIndex
, sep
);
960 token
= new RETokenEnd (subIndex
, null);
965 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
968 else if ((unit
.ch
== '.') && !(unit
.bk
|| quot
))
970 addToken (currentToken
);
972 new RETokenAny (subIndex
, syntax
.get (RESyntax
.RE_DOT_NEWLINE
)
973 || ((cflags
& REG_DOT_NEWLINE
) > 0),
974 syntax
.get (RESyntax
.RE_DOT_NOT_NULL
));
977 // ZERO-OR-MORE REPEAT OPERATOR
980 // This method used to check "repeat.empty.token" to avoid such regexp
981 // as "(a*)*", but now "repeat.empty.token" is allowed.
983 else if ((unit
.ch
== '*') && !(unit
.bk
|| quot
))
985 if (currentToken
== null)
986 throw new REException (getLocalizedMessage ("repeat.no.token"),
987 REException
.REG_BADRPT
, index
);
988 if (currentToken
instanceof RETokenRepeated
)
989 throw new REException (getLocalizedMessage ("repeat.chained"),
990 REException
.REG_BADRPT
, index
);
991 if (currentToken
instanceof RETokenWordBoundary
992 || currentToken
instanceof RETokenWordBoundary
)
993 throw new REException (getLocalizedMessage ("repeat.assertion"),
994 REException
.REG_BADRPT
, index
);
996 setRepeated (currentToken
, 0, Integer
.MAX_VALUE
, index
);
999 // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
1000 // + | \+ depending on RE_BK_PLUS_QM
1001 // not available if RE_LIMITED_OPS is set
1003 // This method used to check "repeat.empty.token" to avoid such regexp
1004 // as "(a*)+", but now "repeat.empty.token" is allowed.
1006 else if ((unit
.ch
== '+') && !syntax
.get (RESyntax
.RE_LIMITED_OPS
)
1008 get (RESyntax
.RE_BK_PLUS_QM
) ^
(unit
.bk
|| quot
)))
1010 if (currentToken
== null)
1011 throw new REException (getLocalizedMessage ("repeat.no.token"),
1012 REException
.REG_BADRPT
, index
);
1014 // Check for possessive matching on RETokenRepeated
1015 if (currentToken
instanceof RETokenRepeated
)
1017 RETokenRepeated tokenRep
= (RETokenRepeated
) currentToken
;
1018 if (syntax
.get (RESyntax
.RE_POSSESSIVE_OPS
)
1019 && !tokenRep
.isPossessive () && !tokenRep
.isStingy ())
1020 tokenRep
.makePossessive ();
1023 REException (getLocalizedMessage ("repeat.chained"),
1024 REException
.REG_BADRPT
, index
);
1027 else if (currentToken
instanceof RETokenWordBoundary
1028 || currentToken
instanceof RETokenWordBoundary
)
1029 throw new REException (getLocalizedMessage ("repeat.assertion"),
1030 REException
.REG_BADRPT
, index
);
1033 setRepeated (currentToken
, 1, Integer
.MAX_VALUE
, index
);
1036 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
1037 // ? | \? depending on RE_BK_PLUS_QM
1038 // not available if RE_LIMITED_OPS is set
1039 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
1041 else if ((unit
.ch
== '?') && !syntax
.get (RESyntax
.RE_LIMITED_OPS
)
1043 get (RESyntax
.RE_BK_PLUS_QM
) ^
(unit
.bk
|| quot
)))
1045 if (currentToken
== null)
1046 throw new REException (getLocalizedMessage ("repeat.no.token"),
1047 REException
.REG_BADRPT
, index
);
1049 // Check for stingy matching on RETokenRepeated
1050 if (currentToken
instanceof RETokenRepeated
)
1052 RETokenRepeated tokenRep
= (RETokenRepeated
) currentToken
;
1053 if (syntax
.get (RESyntax
.RE_STINGY_OPS
)
1054 && !tokenRep
.isStingy () && !tokenRep
.isPossessive ())
1055 tokenRep
.makeStingy ();
1058 REException (getLocalizedMessage ("repeat.chained"),
1059 REException
.REG_BADRPT
, index
);
1061 else if (currentToken
instanceof RETokenWordBoundary
1062 || currentToken
instanceof RETokenWordBoundary
)
1063 throw new REException (getLocalizedMessage ("repeat.assertion"),
1064 REException
.REG_BADRPT
, index
);
1066 currentToken
= setRepeated (currentToken
, 0, 1, index
);
1072 else if (unit
.bk
&& (unit
.ch
== '0')
1073 && syntax
.get (RESyntax
.RE_OCTAL_CHAR
))
1076 getCharExpression (pattern
, index
- 2, pLength
, syntax
);
1078 throw new REException ("invalid octal character",
1079 REException
.REG_ESCAPE
, index
);
1080 index
= index
- 2 + ce
.len
;
1081 addToken (currentToken
);
1082 currentToken
= new RETokenChar (subIndex
, ce
.ch
, insens
);
1084 currentToken
.unicodeAware
= false;
1087 // BACKREFERENCE OPERATOR
1088 // \1 \2 ... \9 and \10 \11 \12 ...
1089 // not available if RE_NO_BK_REFS is set
1090 // Perl recognizes \10, \11, and so on only if enough number of
1091 // parentheses have opened before it, otherwise they are treated
1092 // as aliases of \010, \011, ... (octal characters). In case of
1093 // Sun's JDK, octal character expression must always begin with \0.
1094 // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
1095 // JDK treats \2 as a back reference to the 2nd group because
1096 // there are only two groups. But in our poor implementation,
1097 // we cannot help but treat \29 as a back reference to the 29th group.
1099 else if (unit
.bk
&& Character
.isDigit (unit
.ch
)
1100 && !syntax
.get (RESyntax
.RE_NO_BK_REFS
))
1102 addToken (currentToken
);
1103 int numBegin
= index
- 1;
1104 int numEnd
= pLength
;
1105 for (int i
= index
; i
< pLength
; i
++)
1107 if (!Character
.isDigit (pattern
[i
]))
1113 int num
= parseInt (pattern
, numBegin
, numEnd
- numBegin
, 10);
1115 currentToken
= new RETokenBackRef (subIndex
, num
, insens
);
1117 currentToken
.unicodeAware
= false;
1121 // START OF STRING OPERATOR
1122 // \A if RE_STRING_ANCHORS is set
1124 else if (unit
.bk
&& (unit
.ch
== 'A')
1125 && syntax
.get (RESyntax
.RE_STRING_ANCHORS
))
1127 addToken (currentToken
);
1128 currentToken
= new RETokenStart (subIndex
, null);
1131 // WORD BREAK OPERATOR
1134 else if (unit
.bk
&& (unit
.ch
== 'b')
1135 && syntax
.get (RESyntax
.RE_STRING_ANCHORS
))
1137 addToken (currentToken
);
1139 new RETokenWordBoundary (subIndex
,
1140 RETokenWordBoundary
.
1141 BEGIN
| RETokenWordBoundary
.END
,
1145 // WORD BEGIN OPERATOR
1147 else if (unit
.bk
&& (unit
.ch
== '<'))
1149 addToken (currentToken
);
1151 new RETokenWordBoundary (subIndex
, RETokenWordBoundary
.BEGIN
,
1155 // WORD END OPERATOR
1157 else if (unit
.bk
&& (unit
.ch
== '>'))
1159 addToken (currentToken
);
1161 new RETokenWordBoundary (subIndex
, RETokenWordBoundary
.END
,
1165 // NON-WORD BREAK OPERATOR
1168 else if (unit
.bk
&& (unit
.ch
== 'B')
1169 && syntax
.get (RESyntax
.RE_STRING_ANCHORS
))
1171 addToken (currentToken
);
1173 new RETokenWordBoundary (subIndex
,
1174 RETokenWordBoundary
.
1175 BEGIN
| RETokenWordBoundary
.END
, true);
1180 // \d if RE_CHAR_CLASS_ESCAPES is set
1182 else if (unit
.bk
&& (unit
.ch
== 'd')
1183 && syntax
.get (RESyntax
.RE_CHAR_CLASS_ESCAPES
))
1185 addToken (currentToken
);
1187 new RETokenPOSIX (subIndex
, RETokenPOSIX
.DIGIT
, insens
, false);
1189 currentToken
.unicodeAware
= false;
1192 // NON-DIGIT OPERATOR
1195 else if (unit
.bk
&& (unit
.ch
== 'D')
1196 && syntax
.get (RESyntax
.RE_CHAR_CLASS_ESCAPES
))
1198 addToken (currentToken
);
1200 new RETokenPOSIX (subIndex
, RETokenPOSIX
.DIGIT
, insens
, true);
1202 currentToken
.unicodeAware
= false;
1208 else if (unit
.bk
&& (unit
.ch
== 'n'))
1210 addToken (currentToken
);
1211 currentToken
= new RETokenChar (subIndex
, '\n', false);
1217 else if (unit
.bk
&& (unit
.ch
== 'r'))
1219 addToken (currentToken
);
1220 currentToken
= new RETokenChar (subIndex
, '\r', false);
1223 // WHITESPACE OPERATOR
1224 // \s if RE_CHAR_CLASS_ESCAPES is set
1226 else if (unit
.bk
&& (unit
.ch
== 's')
1227 && syntax
.get (RESyntax
.RE_CHAR_CLASS_ESCAPES
))
1229 addToken (currentToken
);
1231 new RETokenPOSIX (subIndex
, RETokenPOSIX
.SPACE
, insens
, false);
1233 currentToken
.unicodeAware
= false;
1236 // NON-WHITESPACE OPERATOR
1239 else if (unit
.bk
&& (unit
.ch
== 'S')
1240 && syntax
.get (RESyntax
.RE_CHAR_CLASS_ESCAPES
))
1242 addToken (currentToken
);
1244 new RETokenPOSIX (subIndex
, RETokenPOSIX
.SPACE
, insens
, true);
1246 currentToken
.unicodeAware
= false;
1252 else if (unit
.bk
&& (unit
.ch
== 't'))
1254 addToken (currentToken
);
1255 currentToken
= new RETokenChar (subIndex
, '\t', false);
1258 // ALPHANUMERIC OPERATOR
1261 else if (unit
.bk
&& (unit
.ch
== 'w')
1262 && syntax
.get (RESyntax
.RE_CHAR_CLASS_ESCAPES
))
1264 addToken (currentToken
);
1266 new RETokenPOSIX (subIndex
, RETokenPOSIX
.ALNUM
, insens
, false);
1268 currentToken
.unicodeAware
= false;
1271 // NON-ALPHANUMERIC OPERATOR
1274 else if (unit
.bk
&& (unit
.ch
== 'W')
1275 && syntax
.get (RESyntax
.RE_CHAR_CLASS_ESCAPES
))
1277 addToken (currentToken
);
1279 new RETokenPOSIX (subIndex
, RETokenPOSIX
.ALNUM
, insens
, true);
1281 currentToken
.unicodeAware
= false;
1284 // END OF STRING OPERATOR
1287 // FIXME: \Z and \z are different in that if the input string
1288 // ends with a line terminator, \Z matches the position before
1289 // the final terminator. This special behavior of \Z is yet
1290 // to be implemented.
1292 else if (unit
.bk
&& (unit
.ch
== 'Z' || unit
.ch
== 'z') &&
1293 syntax
.get (RESyntax
.RE_STRING_ANCHORS
))
1295 addToken (currentToken
);
1296 currentToken
= new RETokenEnd (subIndex
, null);
1299 // HEX CHARACTER, UNICODE CHARACTER
1303 if ((unit
.bk
&& (unit
.ch
== 'x')
1304 && syntax
.get (RESyntax
.RE_HEX_CHAR
)) || (unit
.bk
1311 getCharExpression (pattern
, index
- 2, pLength
, syntax
);
1313 throw new REException ("invalid hex character",
1314 REException
.REG_ESCAPE
, index
);
1315 index
= index
- 2 + ce
.len
;
1316 addToken (currentToken
);
1317 currentToken
= new RETokenChar (subIndex
, ce
.ch
, insens
);
1319 currentToken
.unicodeAware
= false;
1323 // \p{prop}, \P{prop}
1326 if ((unit
.bk
&& (unit
.ch
== 'p')
1327 && syntax
.get (RESyntax
.RE_NAMED_PROPERTY
)) || (unit
.bk
1332 RE_NAMED_PROPERTY
)))
1334 NamedProperty np
= getNamedProperty (pattern
, index
- 2, pLength
);
1336 throw new REException ("invalid escape sequence",
1337 REException
.REG_ESCAPE
, index
);
1338 index
= index
- 2 + np
.len
;
1339 addToken (currentToken
);
1341 getRETokenNamedProperty (subIndex
, np
, insens
, index
);
1343 currentToken
.unicodeAware
= false;
1346 // END OF PREVIOUS MATCH
1349 else if (unit
.bk
&& (unit
.ch
== 'G') &&
1350 syntax
.get (RESyntax
.RE_STRING_ANCHORS
))
1352 addToken (currentToken
);
1353 currentToken
= new RETokenEndOfPreviousMatch (subIndex
);
1356 // NON-SPECIAL CHARACTER (or escape to make literal)
1357 // c | \* for example
1360 { // not a special character
1361 addToken (currentToken
);
1362 currentToken
= new RETokenChar (subIndex
, unit
.ch
, insens
);
1364 currentToken
.unicodeAware
= false;
1368 // Add final buffered token and an EndSub marker
1369 addToken (currentToken
);
1371 if (branches
!= null)
1375 RE (firstToken
, lastToken
, numSubs
, subIndex
, minimumLength
,
1377 branches
.trimToSize (); // compact the Vector
1380 firstToken
= lastToken
= null;
1381 addToken (new RETokenOneOf (subIndex
, branches
, false));
1384 addToken (new RETokenEndSub (subIndex
));
1388 private static class ParseCharClassResult
1392 boolean returnAtAndOperator
= false;
1396 * Parse [...] or [^...] and make an RETokenOneOf instance.
1397 * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1398 * @param pattern Input array of characters to be parsed.
1399 * @param index Index pointing to the character next to the beginning '['.
1400 * @param pLength Limit of the input array.
1401 * @param cflags Compilation flags used to parse the pattern.
1402 * @param pflags Flags that affect the behavior of this method.
1403 * @param syntax Syntax used to parse the pattern.
1405 private static ParseCharClassResult
parseCharClass (int subIndex
,
1407 int index
, int pLength
,
1414 boolean insens
= ((cflags
& REG_ICASE
) > 0);
1415 boolean insensUSASCII
= ((cflags
& REG_ICASE_USASCII
) > 0);
1416 final ArrayList
< REToken
> options
= new ArrayList
< REToken
> ();
1417 ArrayList
< Object
> addition
= new ArrayList
< Object
> ();
1418 boolean additionAndAppeared
= false;
1419 final int RETURN_AT_AND
= 0x01;
1420 boolean returnAtAndOperator
= ((pflags
& RETURN_AT_AND
) != 0);
1421 boolean negative
= false;
1425 boolean lastCharIsSet
= false;
1426 if (index
== pLength
)
1427 throw new REException (getLocalizedMessage ("unmatched.bracket"),
1428 REException
.REG_EBRACK
, index
);
1430 // Check for initial caret, negation
1431 if ((ch
= pattern
[index
]) == '^')
1434 if (++index
== pLength
)
1435 throw new REException (getLocalizedMessage ("class.no.end"),
1436 REException
.REG_EBRACK
, index
);
1437 ch
= pattern
[index
];
1440 // Check for leading right bracket literal
1444 lastCharIsSet
= true;
1445 if (++index
== pLength
)
1446 throw new REException (getLocalizedMessage ("class.no.end"),
1447 REException
.REG_EBRACK
, index
);
1450 while ((ch
= pattern
[index
++]) != ']')
1452 if ((ch
== '-') && (lastCharIsSet
))
1454 if (index
== pLength
)
1455 throw new REException (getLocalizedMessage ("class.no.end"),
1456 REException
.REG_EBRACK
, index
);
1457 if ((ch
= pattern
[index
]) == ']')
1459 RETokenChar t
= new RETokenChar (subIndex
, lastChar
, insens
);
1461 t
.unicodeAware
= false;
1468 && syntax
.get (RESyntax
.RE_BACKSLASH_ESCAPE_IN_LISTS
))
1471 getCharExpression (pattern
, index
, pLength
, syntax
);
1473 throw new REException ("invalid escape sequence",
1474 REException
.REG_ESCAPE
, index
);
1476 index
= index
+ ce
.len
- 1;
1479 new RETokenRange (subIndex
, lastChar
, ch
, insens
);
1481 t
.unicodeAware
= false;
1484 lastCharIsSet
= false;
1488 else if ((ch
== '\\')
1489 && syntax
.get (RESyntax
.RE_BACKSLASH_ESCAPE_IN_LISTS
))
1491 if (index
== pLength
)
1492 throw new REException (getLocalizedMessage ("class.no.end"),
1493 REException
.REG_EBRACK
, index
);
1495 boolean negate
= false;
1497 boolean asciiEscIsSet
= false;
1498 NamedProperty np
= null;
1499 if (("dswDSW".indexOf (pattern
[index
]) != -1)
1500 && syntax
.get (RESyntax
.RE_CHAR_CLASS_ESC_IN_LISTS
))
1502 switch (pattern
[index
])
1507 posixID
= RETokenPOSIX
.DIGIT
;
1512 posixID
= RETokenPOSIX
.SPACE
;
1517 posixID
= RETokenPOSIX
.ALNUM
;
1521 if (("pP".indexOf (pattern
[index
]) != -1)
1522 && syntax
.get (RESyntax
.RE_NAMED_PROPERTY
))
1524 np
= getNamedProperty (pattern
, index
- 1, pLength
);
1526 throw new REException ("invalid escape sequence",
1527 REException
.REG_ESCAPE
, index
);
1528 index
= index
- 1 + np
.len
- 1;
1533 getCharExpression (pattern
, index
- 1, pLength
, syntax
);
1535 throw new REException ("invalid escape sequence",
1536 REException
.REG_ESCAPE
, index
);
1538 asciiEscIsSet
= true;
1539 index
= index
- 1 + ce
.len
- 1;
1543 RETokenChar t
= new RETokenChar (subIndex
, lastChar
, insens
);
1545 t
.unicodeAware
= false;
1552 new RETokenPOSIX (subIndex
, posixID
, insens
, negate
);
1554 t
.unicodeAware
= false;
1557 else if (np
!= null)
1559 RETokenNamedProperty t
=
1560 getRETokenNamedProperty (subIndex
, np
, insens
, index
);
1562 t
.unicodeAware
= false;
1565 else if (asciiEscIsSet
)
1567 lastChar
= asciiEsc
;
1568 lastCharIsSet
= true;
1572 lastChar
= pattern
[index
];
1573 lastCharIsSet
= true;
1577 else if ((ch
== '[') && (syntax
.get (RESyntax
.RE_CHAR_CLASSES
))
1578 && (index
< pLength
) && (pattern
[index
] == ':'))
1580 CPStringBuilder posixSet
= new CPStringBuilder ();
1581 index
= getPosixSet (pattern
, index
+ 1, posixSet
);
1582 int posixId
= RETokenPOSIX
.intValue (posixSet
.toString ());
1586 new RETokenPOSIX (subIndex
, posixId
, insens
, false);
1588 t
.unicodeAware
= false;
1592 else if ((ch
== '[') && (syntax
.get (RESyntax
.RE_NESTED_CHARCLASS
)))
1594 ParseCharClassResult result
=
1595 parseCharClass (subIndex
, pattern
, index
, pLength
, cflags
,
1597 addition
.add (result
.token
);
1599 index
= result
.index
;
1601 else if ((ch
== '&') &&
1602 (syntax
.get (RESyntax
.RE_NESTED_CHARCLASS
)) &&
1603 (index
< pLength
) && (pattern
[index
] == '&'))
1605 if (returnAtAndOperator
)
1607 ParseCharClassResult result
= new ParseCharClassResult ();
1608 options
.trimToSize ();
1609 if (additionAndAppeared
)
1611 if (addition
.size () == 0)
1613 result
.token
= new RETokenOneOf (subIndex
,
1614 options
, addition
, negative
);
1615 result
.index
= index
- 1;
1616 result
.returnAtAndOperator
= true;
1619 // The precedence of the operator "&&" is the lowest.
1620 // So we postpone adding "&" until other elements
1621 // are added. And we insert Boolean.FALSE at the
1622 // beginning of the list of tokens following "&&".
1623 // So, "&&[a-b][k-m]" will be stored in the Vecter
1624 // addition in this order:
1625 // Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1626 if (additionAndAppeared
)
1628 addition
.add (Boolean
.FALSE
);
1629 additionAndAppeared
= true;
1631 // The part on which "&&" operates may be either
1632 // (1) explicitly enclosed by []
1634 // (2) not enclosed by [] and terminated by the
1635 // next "&&" or the end of the character list.
1636 // Let the preceding else if block do the case (1).
1637 // We must do something in case of (2).
1638 if ((index
+ 1 < pLength
) && (pattern
[index
+ 1] != '['))
1640 ParseCharClassResult result
=
1641 parseCharClass (subIndex
, pattern
, index
+ 1, pLength
,
1644 addition
.add (result
.token
);
1646 // If the method returned at the next "&&", it is OK.
1647 // Otherwise we have eaten the mark of the end of this
1648 // character list "]". In this case we must give back
1650 index
= (result
.returnAtAndOperator ?
1651 result
.index
: result
.index
- 1);
1658 RETokenChar t
= new RETokenChar (subIndex
, lastChar
, insens
);
1660 t
.unicodeAware
= false;
1664 lastCharIsSet
= true;
1666 if (index
== pLength
)
1667 throw new REException (getLocalizedMessage ("class.no.end"),
1668 REException
.REG_EBRACK
, index
);
1670 // Out of list, index is one past ']'
1674 RETokenChar t
= new RETokenChar (subIndex
, lastChar
, insens
);
1676 t
.unicodeAware
= false;
1680 ParseCharClassResult result
= new ParseCharClassResult ();
1681 // Create a new RETokenOneOf
1682 options
.trimToSize ();
1683 if (additionAndAppeared
)
1685 if (addition
.size () == 0)
1687 result
.token
= new RETokenOneOf (subIndex
, options
, addition
, negative
);
1688 result
.index
= index
;
1692 private static int getCharUnit (char[]input
, int index
, CharUnit unit
,
1693 boolean quot
) throws REException
1695 unit
.ch
= input
[index
++];
1696 unit
.bk
= (unit
.ch
== '\\'
1697 && (!quot
|| index
>= input
.length
|| input
[index
] == 'E'));
1699 if (index
< input
.length
)
1700 unit
.ch
= input
[index
++];
1702 throw new REException (getLocalizedMessage ("ends.with.backslash"),
1703 REException
.REG_ESCAPE
, index
);
1707 private static int parseInt (char[]input
, int pos
, int len
, int radix
)
1710 for (int i
= pos
; i
< pos
+ len
; i
++)
1712 ret
= ret
* radix
+ Character
.digit (input
[i
], radix
);
1718 * This class represents various expressions for a character.
1720 * "\0123" : Octal char 0123
1721 * "\x1b" : Hex char 0x1b
1722 * "\u1234" : Unicode char \u1234
1724 private static class CharExpression
1726 /** character represented by this expression */
1728 /** String expression */
1730 /** length of this expression */
1732 public String
toString ()
1738 private static CharExpression
getCharExpression (char[]input
, int pos
,
1739 int lim
, RESyntax syntax
)
1741 CharExpression ce
= new CharExpression ();
1742 char c
= input
[pos
];
1764 if ((c
== 'x' && syntax
.get (RESyntax
.RE_HEX_CHAR
)) ||
1765 (c
== 'u' && syntax
.get (RESyntax
.RE_UNICODE_CHAR
)))
1768 int expectedLength
= (c
== 'x' ?
2 : 4);
1769 for (int i
= pos
+ 2; i
< pos
+ 2 + expectedLength
; i
++)
1773 if (!((input
[i
] >= '0' && input
[i
] <= '9') ||
1774 (input
[i
] >= 'A' && input
[i
] <= 'F') ||
1775 (input
[i
] >= 'a' && input
[i
] <= 'f')))
1779 if (l
!= expectedLength
)
1781 ce
.ch
= (char) (parseInt (input
, pos
+ 2, l
, 16));
1791 if (syntax
.get (RESyntax
.RE_OCTAL_CHAR
))
1794 for (int i
= pos
+ 2; i
< pos
+ 2 + 3; i
++)
1798 if (input
[i
] < '0' || input
[i
] > '7')
1802 if (l
== 3 && input
[pos
+ 2] > '3')
1806 ce
.ch
= (char) (parseInt (input
, pos
+ 2, l
, 8));
1826 ce
.expr
= new String (input
, pos
, ce
.len
);
1831 * This class represents a substring in a pattern string expressing
1833 * "\pA" : Property named "A"
1834 * "\p{prop}" : Property named "prop"
1835 * "\PA" : Property named "A" (Negated)
1836 * "\P{prop}" : Property named "prop" (Negated)
1838 private static class NamedProperty
1840 /** Property name */
1842 /** Negated or not */
1844 /** length of this expression */
1848 private static NamedProperty
getNamedProperty (char[]input
, int pos
,
1851 NamedProperty np
= new NamedProperty ();
1852 char c
= input
[pos
];
1873 for (int i
= pos
; i
< lim
; i
++)
1875 if (input
[i
] == '}')
1884 np
.name
= new String (input
, pos
, len
);
1889 np
.name
= new String (input
, pos
- 1, 1);
1898 private static RETokenNamedProperty
getRETokenNamedProperty (int subIndex
,
1907 return new RETokenNamedProperty (subIndex
, np
.name
, insens
, np
.negate
);
1909 catch (REException e
)
1912 ree
= new REException (e
.getMessage (), REException
.REG_ESCAPE
, index
);
1919 * Checks if the regular expression matches the input in its entirety.
1921 * @param input The input text.
1923 public boolean isMatch (Object input
)
1925 return isMatch (input
, 0, 0);
1929 * Checks if the input string, starting from index, is an exact match of
1930 * this regular expression.
1932 * @param input The input text.
1933 * @param index The offset index at which the search should be begin.
1935 public boolean isMatch (Object input
, int index
)
1937 return isMatch (input
, index
, 0);
1942 * Checks if the input, starting from index and using the specified
1943 * execution flags, is an exact match of this regular expression.
1945 * @param input The input text.
1946 * @param index The offset index at which the search should be begin.
1947 * @param eflags The logical OR of any execution flags above.
1949 public boolean isMatch (Object input
, int index
, int eflags
)
1951 return isMatchImpl (makeCharIndexed (input
, index
), index
, eflags
);
1954 private boolean isMatchImpl (CharIndexed input
, int index
, int eflags
)
1956 if (firstToken
== null) // Trivial case
1957 return (input
.charAt (0) == CharIndexed
.OUT_OF_BOUNDS
);
1958 REMatch m
= new REMatch (numSubs
, index
, eflags
);
1959 if (firstToken
.match (input
, m
))
1963 if (input
.charAt (m
.index
) == CharIndexed
.OUT_OF_BOUNDS
)
1973 * Returns the maximum number of subexpressions in this regular expression.
1974 * If the expression contains branches, the value returned will be the
1975 * maximum subexpressions in any of the branches.
1977 public int getNumSubs ()
1982 // Overrides REToken.setUncle
1983 void setUncle (REToken uncle
)
1985 if (lastToken
!= null)
1987 lastToken
.setUncle (uncle
);
1990 super.setUncle (uncle
); // to deal with empty subexpressions
1993 // Overrides REToken.chain
1995 boolean chain (REToken next
)
2003 * Returns the minimum number of characters that could possibly
2004 * constitute a match of this regular expression.
2006 public int getMinimumLength ()
2008 return minimumLength
;
2011 public int getMaximumLength ()
2013 return maximumLength
;
2017 * Returns an array of all matches found in the input.
2019 * If the regular expression allows the empty string to match, it will
2020 * substitute matches at all positions except the end of the input.
2022 * @param input The input text.
2023 * @return a non-null (but possibly zero-length) array of matches
2025 public REMatch
[] getAllMatches (Object input
)
2027 return getAllMatches (input
, 0, 0);
2031 * Returns an array of all matches found in the input,
2032 * beginning at the specified index position.
2034 * If the regular expression allows the empty string to match, it will
2035 * substitute matches at all positions except the end of the input.
2037 * @param input The input text.
2038 * @param index The offset index at which the search should be begin.
2039 * @return a non-null (but possibly zero-length) array of matches
2041 public REMatch
[] getAllMatches (Object input
, int index
)
2043 return getAllMatches (input
, index
, 0);
2047 * Returns an array of all matches found in the input string,
2048 * beginning at the specified index position and using the specified
2051 * If the regular expression allows the empty string to match, it will
2052 * substitute matches at all positions except the end of the input.
2054 * @param input The input text.
2055 * @param index The offset index at which the search should be begin.
2056 * @param eflags The logical OR of any execution flags above.
2057 * @return a non-null (but possibly zero-length) array of matches
2059 public REMatch
[] getAllMatches (Object input
, int index
, int eflags
)
2061 return getAllMatchesImpl (makeCharIndexed (input
, index
), index
, eflags
);
2064 // this has been changed since 1.03 to be non-overlapping matches
2065 private REMatch
[] getAllMatchesImpl (CharIndexed input
, int index
,
2068 List
< REMatch
> all
= new ArrayList
< REMatch
> ();
2070 while ((m
= getMatchImpl (input
, index
, eflags
, null)) != null)
2073 index
= m
.getEndIndex ();
2075 { // handle pathological case of zero-length match
2081 input
.move (m
.end
[0]);
2083 if (!input
.isValid ())
2086 return all
.toArray (new REMatch
[all
.size ()]);
2089 /* Implements abstract method REToken.match() */
2090 boolean match (CharIndexed input
, REMatch mymatch
)
2092 input
.setHitEnd (mymatch
);
2093 if (firstToken
== null)
2095 return next (input
, mymatch
);
2098 // Note the start of this subexpression
2099 mymatch
.start1
[subIndex
] = mymatch
.index
;
2101 return firstToken
.match (input
, mymatch
);
2104 REMatch
findMatch (CharIndexed input
, REMatch mymatch
)
2106 if (mymatch
.backtrackStack
== null)
2107 mymatch
.backtrackStack
= new BacktrackStack ();
2108 boolean b
= match (input
, mymatch
);
2117 * Returns the first match found in the input. If no match is found,
2120 * @param input The input text.
2121 * @return An REMatch instance referencing the match, or null if none.
2123 public REMatch
getMatch (Object input
)
2125 return getMatch (input
, 0, 0);
2129 * Returns the first match found in the input, beginning
2130 * the search at the specified index. If no match is found,
2133 * @param input The input text.
2134 * @param index The offset within the text to begin looking for a match.
2135 * @return An REMatch instance referencing the match, or null if none.
2137 public REMatch
getMatch (Object input
, int index
)
2139 return getMatch (input
, index
, 0);
2143 * Returns the first match found in the input, beginning
2144 * the search at the specified index, and using the specified
2145 * execution flags. If no match is found, returns null.
2147 * @param input The input text.
2148 * @param index The offset index at which the search should be begin.
2149 * @param eflags The logical OR of any execution flags above.
2150 * @return An REMatch instance referencing the match, or null if none.
2152 public REMatch
getMatch (Object input
, int index
, int eflags
)
2154 return getMatch (input
, index
, eflags
, null);
2158 * Returns the first match found in the input, beginning the search
2159 * at the specified index, and using the specified execution flags.
2160 * If no match is found, returns null. If a StringBuffer is
2161 * provided and is non-null, the contents of the input text from the
2162 * index to the beginning of the match (or to the end of the input,
2163 * if there is no match) are appended to the StringBuffer.
2165 * @param input The input text.
2166 * @param index The offset index at which the search should be begin.
2167 * @param eflags The logical OR of any execution flags above.
2168 * @param buffer The StringBuffer to save pre-match text in.
2169 * @return An REMatch instance referencing the match, or null if none. */
2170 public REMatch
getMatch (Object input
, int index
, int eflags
,
2171 CPStringBuilder buffer
)
2173 return getMatchImpl (makeCharIndexed (input
, index
), index
, eflags
,
2177 REMatch
getMatchImpl (CharIndexed input
, int anchor
, int eflags
,
2178 CPStringBuilder buffer
)
2180 boolean tryEntireMatch
= ((eflags
& REG_TRY_ENTIRE_MATCH
) != 0);
2181 boolean doMove
= ((eflags
& REG_FIX_STARTING_POSITION
) == 0);
2182 RE re
= (tryEntireMatch ?
(RE
) this.clone () : this);
2185 RETokenEnd reEnd
= new RETokenEnd (0, null);
2186 reEnd
.setFake (true);
2189 // Create a new REMatch to hold results
2190 REMatch mymatch
= new REMatch (numSubs
, anchor
, eflags
);
2193 /* The following potimization is commented out because
2194 the matching should be tried even if the length of
2195 input is obviously too short in order that
2196 java.util.regex.Matcher#hitEnd() may work correctly.
2197 // Optimization: check if anchor + minimumLength > length
2198 if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
2200 if (re
.match (input
, mymatch
))
2202 REMatch best
= mymatch
;
2203 // We assume that the match that coms first is the best.
2204 // And the following "The longer, the better" rule has
2205 // been commented out. The longest is not neccesarily
2206 // the best. For example, "a" out of "aaa" is the best
2209 // Find best match of them all to observe leftmost longest
2210 while ((mymatch = mymatch.next) != null) {
2211 if (mymatch.index > best.index) {
2216 best
.end
[0] = best
.index
;
2217 best
.finish (input
);
2218 input
.setLastMatch (best
);
2221 /* End of the optimization commented out
2224 mymatch
.clear (++anchor
);
2225 // Append character to buffer if needed
2226 if (buffer
!= null && input
.charAt (0) != CharIndexed
.OUT_OF_BOUNDS
)
2228 buffer
.append (input
.charAt (0));
2230 // java.util.regex.Matcher#hitEnd() requires that the search should
2231 // be tried at the end of input, so we use move1(1) instead of move(1)
2233 while (doMove
&& input
.move1 (1));
2235 // Special handling at end of input for e.g. "$"
2236 if (minimumLength
== 0)
2238 if (match (input
, mymatch
))
2240 mymatch
.finish (input
);
2249 * Returns an REMatchEnumeration that can be used to iterate over the
2250 * matches found in the input text.
2252 * @param input The input text.
2253 * @return A non-null REMatchEnumeration instance.
2255 public REMatchEnumeration
getMatchEnumeration (Object input
)
2257 return getMatchEnumeration (input
, 0, 0);
2262 * Returns an REMatchEnumeration that can be used to iterate over the
2263 * matches found in the input text.
2265 * @param input The input text.
2266 * @param index The offset index at which the search should be begin.
2267 * @return A non-null REMatchEnumeration instance, with its input cursor
2268 * set to the index position specified.
2270 public REMatchEnumeration
getMatchEnumeration (Object input
, int index
)
2272 return getMatchEnumeration (input
, index
, 0);
2276 * Returns an REMatchEnumeration that can be used to iterate over the
2277 * matches found in the input text.
2279 * @param input The input text.
2280 * @param index The offset index at which the search should be begin.
2281 * @param eflags The logical OR of any execution flags above.
2282 * @return A non-null REMatchEnumeration instance, with its input cursor
2283 * set to the index position specified.
2285 public REMatchEnumeration
getMatchEnumeration (Object input
, int index
,
2288 return new REMatchEnumeration (this, makeCharIndexed (input
, index
),
2294 * Substitutes the replacement text for the first match found in the input.
2296 * @param input The input text.
2297 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2298 * @return A String interpolating the substituted text.
2299 * @see REMatch#substituteInto
2301 public String
substitute (Object input
, String replace
)
2303 return substitute (input
, replace
, 0, 0);
2307 * Substitutes the replacement text for the first match found in the input
2308 * beginning at the specified index position. Specifying an index
2309 * effectively causes the regular expression engine to throw away the
2310 * specified number of characters.
2312 * @param input The input text.
2313 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2314 * @param index The offset index at which the search should be begin.
2315 * @return A String containing the substring of the input, starting
2316 * at the index position, and interpolating the substituted text.
2317 * @see REMatch#substituteInto
2319 public String
substitute (Object input
, String replace
, int index
)
2321 return substitute (input
, replace
, index
, 0);
2325 * Substitutes the replacement text for the first match found in the input
2326 * string, beginning at the specified index position and using the
2327 * specified execution flags.
2329 * @param input The input text.
2330 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2331 * @param index The offset index at which the search should be begin.
2332 * @param eflags The logical OR of any execution flags above.
2333 * @return A String containing the substring of the input, starting
2334 * at the index position, and interpolating the substituted text.
2335 * @see REMatch#substituteInto
2337 public String
substitute (Object input
, String replace
, int index
,
2340 return substituteImpl (makeCharIndexed (input
, index
), replace
, index
,
2344 private String
substituteImpl (CharIndexed input
, String replace
, int index
,
2347 CPStringBuilder buffer
= new CPStringBuilder ();
2348 REMatch m
= getMatchImpl (input
, index
, eflags
, buffer
);
2350 return buffer
.toString ();
2351 buffer
.append (getReplacement (replace
, m
, eflags
));
2352 if (input
.move (m
.end
[0]))
2356 buffer
.append (input
.charAt (0));
2358 while (input
.move (1));
2360 return buffer
.toString ();
2364 * Substitutes the replacement text for each non-overlapping match found
2365 * in the input text.
2367 * @param input The input text.
2368 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2369 * @return A String interpolating the substituted text.
2370 * @see REMatch#substituteInto
2372 public String
substituteAll (Object input
, String replace
)
2374 return substituteAll (input
, replace
, 0, 0);
2378 * Substitutes the replacement text for each non-overlapping match found
2379 * in the input text, starting at the specified index.
2381 * If the regular expression allows the empty string to match, it will
2382 * substitute matches at all positions except the end of the input.
2384 * @param input The input text.
2385 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2386 * @param index The offset index at which the search should be begin.
2387 * @return A String containing the substring of the input, starting
2388 * at the index position, and interpolating the substituted text.
2389 * @see REMatch#substituteInto
2391 public String
substituteAll (Object input
, String replace
, int index
)
2393 return substituteAll (input
, replace
, index
, 0);
2397 * Substitutes the replacement text for each non-overlapping match found
2398 * in the input text, starting at the specified index and using the
2399 * specified execution flags.
2401 * @param input The input text.
2402 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2403 * @param index The offset index at which the search should be begin.
2404 * @param eflags The logical OR of any execution flags above.
2405 * @return A String containing the substring of the input, starting
2406 * at the index position, and interpolating the substituted text.
2407 * @see REMatch#substituteInto
2409 public String
substituteAll (Object input
, String replace
, int index
,
2412 return substituteAllImpl (makeCharIndexed (input
, index
), replace
, index
,
2416 private String
substituteAllImpl (CharIndexed input
, String replace
,
2417 int index
, int eflags
)
2419 CPStringBuilder buffer
= new CPStringBuilder ();
2421 while ((m
= getMatchImpl (input
, index
, eflags
, buffer
)) != null)
2423 buffer
.append (getReplacement (replace
, m
, eflags
));
2424 index
= m
.getEndIndex ();
2427 char ch
= input
.charAt (0);
2428 if (ch
!= CharIndexed
.OUT_OF_BOUNDS
)
2434 input
.move (m
.end
[0]);
2437 if (!input
.isValid ())
2440 return buffer
.toString ();
2443 public static String
getReplacement (String replace
, REMatch m
, int eflags
)
2445 if ((eflags
& REG_NO_INTERPOLATE
) > 0)
2449 if ((eflags
& REG_REPLACE_USE_BACKSLASHESCAPE
) > 0)
2451 CPStringBuilder sb
= new CPStringBuilder ();
2452 int l
= replace
.length ();
2453 for (int i
= 0; i
< l
; i
++)
2455 char c
= replace
.charAt (i
);
2460 // Let StringIndexOutOfBoundsException be thrown.
2461 sb
.append (replace
.charAt (i
));
2465 while (i1
< replace
.length () &&
2466 Character
.isDigit (replace
.charAt (i1
)))
2468 sb
.append (m
.substituteInto (replace
.substring (i
, i1
)));
2475 return sb
.toString ();
2478 return m
.substituteInto (replace
);
2482 /* Helper function for constructor */
2483 private void addToken (REToken next
)
2487 minimumLength
+= next
.getMinimumLength ();
2488 int nmax
= next
.getMaximumLength ();
2489 if (nmax
< Integer
.MAX_VALUE
&& maximumLength
< Integer
.MAX_VALUE
)
2490 maximumLength
+= nmax
;
2492 maximumLength
= Integer
.MAX_VALUE
;
2494 if (firstToken
== null)
2496 lastToken
= firstToken
= next
;
2500 // if chain returns false, it "rejected" the token due to
2501 // an optimization, and next was combined with lastToken
2502 if (lastToken
.chain (next
))
2509 private static REToken
setRepeated (REToken current
, int min
, int max
,
2510 int index
) throws REException
2512 if (current
== null)
2513 throw new REException (getLocalizedMessage ("repeat.no.token"),
2514 REException
.REG_BADRPT
, index
);
2515 return new RETokenRepeated (current
.subIndex
, current
, min
, max
);
2518 private static int getPosixSet (char[]pattern
, int index
,
2519 CPStringBuilder buf
)
2521 // Precondition: pattern[index-1] == ':'
2522 // we will return pos of closing ']'.
2524 for (i
= index
; i
< (pattern
.length
- 1); i
++)
2526 if ((pattern
[i
] == ':') && (pattern
[i
+ 1] == ']'))
2528 buf
.append (pattern
[i
]);
2530 return index
; // didn't match up
2533 private int getMinMax (char[]input
, int index
, IntPair minMax
,
2534 RESyntax syntax
) throws REException
2536 // Precondition: input[index-1] == '{', minMax != null
2538 boolean mustMatch
= !syntax
.get (RESyntax
.RE_NO_BK_BRACES
);
2539 int startIndex
= index
;
2540 if (index
== input
.length
)
2543 throw new REException (getLocalizedMessage ("unmatched.brace"),
2544 REException
.REG_EBRACE
, index
);
2550 CharUnit unit
= new CharUnit ();
2551 CPStringBuilder buf
= new CPStringBuilder ();
2553 // Read string of digits
2556 index
= getCharUnit (input
, index
, unit
, false);
2557 if (Character
.isDigit (unit
.ch
))
2558 buf
.append (unit
.ch
);
2560 while ((index
!= input
.length
) && Character
.isDigit (unit
.ch
));
2562 // Check for {} tomfoolery
2563 if (buf
.length () == 0)
2566 throw new REException (getLocalizedMessage ("interval.error"),
2567 REException
.REG_EBRACE
, index
);
2572 min
= Integer
.parseInt (buf
.toString ());
2574 if ((unit
.ch
== '}') && (syntax
.get (RESyntax
.RE_NO_BK_BRACES
) ^ unit
.bk
))
2576 else if (index
== input
.length
)
2578 throw new REException (getLocalizedMessage ("interval.no.end"),
2579 REException
.REG_EBRACE
, index
);
2583 if ((unit
.ch
== ',') && !unit
.bk
)
2585 buf
= new CPStringBuilder ();
2586 // Read string of digits
2588 getCharUnit (input
, index
, unit
, false)) != input
.length
)
2589 && Character
.isDigit (unit
.ch
))
2590 buf
.append (unit
.ch
);
2594 && (syntax
.get (RESyntax
.RE_NO_BK_BRACES
) ^ unit
.bk
)))
2596 throw new REException (getLocalizedMessage ("interval.error"),
2597 REException
.REG_EBRACE
, index
);
2601 // This is the case of {x,}
2602 if (buf
.length () == 0)
2603 max
= Integer
.MAX_VALUE
;
2605 max
= Integer
.parseInt (buf
.toString ());
2608 throw new REException (getLocalizedMessage ("interval.error"),
2609 REException
.REG_EBRACE
, index
);
2613 // We know min and max now, and they are valid.
2616 minMax
.second
= max
;
2618 // return the index following the '}'
2623 * Return a human readable form of the compiled regular expression,
2624 * useful for debugging.
2626 public String
toString ()
2628 CPStringBuilder sb
= new CPStringBuilder ();
2630 return sb
.toString ();
2633 void dump (CPStringBuilder os
)
2635 os
.append ("(?#startRE subIndex=" + subIndex
+ ")");
2638 if (firstToken
!= null)
2639 firstToken
.dumpAll (os
);
2642 os
.append ("(?#endRE subIndex=" + subIndex
+ ")");
2645 // Cast input appropriately or throw exception
2646 // This method was originally a private method, but has been made
2647 // public because java.util.regex.Matcher uses this.
2648 public static CharIndexed
makeCharIndexed (Object input
, int index
)
2650 // The case where input is already a CharIndexed is supposed
2651 // be the most likely because this is the case with
2652 // java.util.regex.Matcher.
2653 // We could let a String or a CharSequence fall through
2654 // to final input, but since it'a very likely input type,
2655 // we check it first.
2656 if (input
instanceof CharIndexed
)
2658 CharIndexed ci
= (CharIndexed
) input
;
2659 ci
.setAnchor (index
);
2662 else if (input
instanceof CharSequence
)
2663 return new CharIndexedCharSequence ((CharSequence
) input
, index
);
2664 else if (input
instanceof String
)
2665 return new CharIndexedString ((String
) input
, index
);
2666 else if (input
instanceof char[])
2667 return new CharIndexedCharArray ((char[]) input
, index
);
2668 else if (input
instanceof StringBuffer
)
2669 return new CharIndexedStringBuffer ((StringBuffer
) input
, index
);
2670 else if (input
instanceof InputStream
)
2671 return new CharIndexedInputStream ((InputStream
) input
, index
);
2673 return new CharIndexedString (input
.toString (), index
);