3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System
.Collections
;
32 using System
.Globalization
;
34 namespace System
.Text
.RegularExpressions
.Syntax
{
37 public static int ParseDecimal (string str
, ref int ptr
) {
38 return ParseNumber (str
, ref ptr
, 10, 1, Int32
.MaxValue
);
41 public static int ParseOctal (string str
, ref int ptr
) {
42 return ParseNumber (str
, ref ptr
, 8, 1, 3);
45 public static int ParseHex (string str
, ref int ptr
, int digits
) {
46 return ParseNumber (str
, ref ptr
, 16, digits
, digits
);
49 public static int ParseNumber (string str
, ref int ptr
, int b
, int min
, int max
) {
50 int p
= ptr
, n
= 0, digits
= 0, d
;
54 while (digits
< max
&& p
< str
.Length
) {
55 d
= ParseDigit (str
[p
++], b
, digits
);
72 public static string ParseName (string str
, ref int ptr
) {
73 if (Char
.IsDigit (str
[ptr
])) {
74 int gid
= ParseNumber (str
, ref ptr
, 10, 1, 0);
76 return gid
.ToString ();
83 if (!IsNameChar (str
[ptr
]))
89 return str
.Substring (start
, ptr
- start
);
94 public static string Escape (string str
) {
96 for (int i
= 0; i
< str
.Length
; ++ i
) {
99 case '\\': case '*': case '+': case '?': case '|':
100 case '{': case '[': case '(': case ')': case '^':
101 case '$': case '.': case '#': case ' ':
105 case '\t': result
+= "\\t"; break;
106 case '\n': result
+= "\\n"; break;
107 case '\r': result
+= "\\r"; break;
108 case '\f': result
+= "\\f"; break;
110 default: result
+= c
; break;
117 public static string Unescape (string str
) {
118 return new Parser ().ParseString (str
);
124 this.caps
= new ArrayList ();
125 this.refs
= new Hashtable ();
128 public RegularExpression
ParseRegularExpression (string pattern
, RegexOptions options
) {
129 this.pattern
= pattern
;
137 RegularExpression re
= new RegularExpression ();
138 ParseGroup (re
, options
, null);
139 ResolveReferences ();
141 re
.GroupCount
= num_groups
;
145 catch (IndexOutOfRangeException
) {
146 throw NewParseException ("Unexpected end of pattern.");
150 public IDictionary
GetMapping () {
151 Hashtable mapping
= new Hashtable ();
152 Hashtable numbers
= new Hashtable ();
153 int end
= caps
.Count
;
154 mapping
.Add ("0", 0);
155 for (int i
= 0; i
< end
; i
++) {
156 CapturingGroup
group = (CapturingGroup
) caps
[i
];
157 if (group.Name
!= null && !mapping
.Contains (group.Name
)) {
158 mapping
.Add (group.Name
, group.Number
);
159 numbers
.Add (group.Number
, group.Number
);
163 for (int i
= 1; i
< end
; i
++) {
164 if (numbers
[i
] == null)
165 mapping
.Add (i
.ToString (), i
);
173 private void ParseGroup (Group
group, RegexOptions options
, Assertion assertion
) {
174 bool is_top_level
= group is RegularExpression
;
176 Alternation alternation
= null;
177 string literal
= null;
179 Group current
= new Group ();
180 Expression expr
= null;
184 ConsumeWhitespace (IsIgnorePatternWhitespace (options
));
185 if (ptr
>= pattern
.Length
)
188 // (1) Parse for Expressions
190 char ch
= pattern
[ptr
++];
195 IsMultiline (options
) ? Position
.StartOfLine
: Position
.Start
;
196 expr
= new PositionAssertion (pos
);
202 IsMultiline (options
) ? Position
.EndOfLine
: Position
.End
;
203 expr
= new PositionAssertion (pos
);
209 IsSingleline (options
) ? Category
.AnySingleline
: Category
.Any
;
210 expr
= new CharacterClass (cat
, false);
215 int c
= ParseEscape ();
219 expr
= ParseSpecial (options
);
222 ch
= pattern
[ptr
++]; // default escape
228 expr
= ParseCharacterClass (options
);
233 bool ignore
= IsIgnoreCase (options
);
234 expr
= ParseGroupingConstruct (ref options
);
236 if (literal
!= null && IsIgnoreCase (options
) != ignore
) {
237 current
.AppendExpression (new Literal (literal
, IsIgnoreCase (options
)));
252 if (literal
!= null) {
253 current
.AppendExpression (new Literal (literal
, IsIgnoreCase (options
)));
257 if (assertion
!= null) {
258 if (assertion
.TrueExpression
== null)
259 assertion
.TrueExpression
= current
;
260 else if (assertion
.FalseExpression
== null)
261 assertion
.FalseExpression
= current
;
263 throw NewParseException ("Too many | in (?()|).");
266 if (alternation
== null)
267 alternation
= new Alternation ();
269 alternation
.AddAlternative (current
);
272 current
= new Group ();
276 case '*': case '+': case '?': {
277 throw NewParseException ("Bad quantifier.");
281 break; // literal character
284 ConsumeWhitespace (IsIgnorePatternWhitespace (options
));
286 // (2) Check for Repetitions
288 if (ptr
< pattern
.Length
) {
289 char k
= pattern
[ptr
];
291 if (k
== '?' || k
== '*' || k
== '+' || k
== '{') {
294 int min
= 0, max
= 0;
298 case '?': min
= 0; max
= 1; break;
299 case '*': min
= 0; max
= 0xffff; break;
300 case '+': min
= 1; max
= 0xffff; break;
301 case '{': ParseRepetitionBounds (out min
, out max
, options
); break;
304 ConsumeWhitespace (IsIgnorePatternWhitespace (options
));
305 if (ptr
< pattern
.Length
&& pattern
[ptr
] == '?') {
310 Repetition repetition
= new Repetition (min
, max
, lazy
);
313 repetition
.Expression
= new Literal (ch
.ToString (), IsIgnoreCase (options
));
315 repetition
.Expression
= expr
;
321 // (3) Append Expression and/or Literal
329 if (literal
!= null) {
330 current
.AppendExpression (new Literal (literal
, IsIgnoreCase (options
)));
334 current
.AppendExpression (expr
);
338 if (is_top_level
&& ptr
>= pattern
.Length
)
343 if (is_top_level
&& closed
)
344 throw NewParseException ("Too many )'s.");
345 if (!is_top_level
&& !closed
)
346 throw NewParseException ("Not enough )'s.");
349 // clean up literals and alternations
352 current
.AppendExpression (new Literal (literal
, IsIgnoreCase (options
)));
354 if (assertion
!= null) {
355 if (assertion
.TrueExpression
== null)
356 assertion
.TrueExpression
= current
;
358 assertion
.FalseExpression
= current
;
360 group.AppendExpression (assertion
);
362 else if (alternation
!= null) {
363 alternation
.AddAlternative (current
);
364 group.AppendExpression (alternation
);
367 group.AppendExpression (current
);
370 private Expression
ParseGroupingConstruct (ref RegexOptions options
) {
371 if (pattern
[ptr
] != '?') {
374 if (IsExplicitCapture (options
))
375 group = new Group ();
377 group = new CapturingGroup ();
381 ParseGroup (group, options
, null);
387 switch (pattern
[ptr
]) {
388 case ':': { // non-capturing group
390 Group
group = new Group ();
391 ParseGroup (group, options
, null);
396 case '>': { // non-backtracking group
398 Group
group = new NonBacktrackingGroup ();
399 ParseGroup (group, options
, null);
404 case 'i': case 'm': case 'n':
405 case 's': case 'x': case '-': { // options
406 RegexOptions o
= options
;
407 ParseOptions (ref o
, false);
408 if (pattern
[ptr
] == '-') {
410 ParseOptions (ref o
, true);
413 if (pattern
[ptr
] == ':') { // pass options to child group
415 Group
group = new Group ();
416 ParseGroup (group, o
, null);
419 else if (pattern
[ptr
] == ')') { // change options of enclosing group
425 throw NewParseException ("Bad options");
428 case '<': case '=': case '!': { // lookahead/lookbehind
429 ExpressionAssertion asn
= new ExpressionAssertion ();
430 if (!ParseAssertionType (asn
))
431 goto case '\''; // it's a (?<name> ) construct
433 Group test
= new Group ();
434 ParseGroup (test
, options
, null);
436 asn
.TestExpression
= test
;
440 case '\'': { // named/balancing group
442 if (pattern
[ptr
] == '<')
448 string name
= ParseName ();
450 if (pattern
[ptr
] == delim
) {
454 throw NewParseException ("Bad group name.");
457 CapturingGroup cap
= new CapturingGroup ();
460 ParseGroup (cap
, options
, null);
464 else if (pattern
[ptr
] == '-') {
468 string balance_name
= ParseName ();
469 if (balance_name
== null || pattern
[ptr
] != delim
)
470 throw NewParseException ("Bad balancing group name.");
473 BalancingGroup bal
= new BalancingGroup ();
480 refs
.Add (bal
, balance_name
);
482 ParseGroup (bal
, options
, null);
487 throw NewParseException ("Bad group name.");
490 case '(': { // expression/capture test
495 string name
= ParseName ();
496 if (name
== null || pattern
[ptr
] != ')') { // expression test
497 // FIXME MS implementation doesn't seem to
498 // implement this version of (?(x) ...)
501 ExpressionAssertion expr_asn
= new ExpressionAssertion ();
503 if (pattern
[ptr
] == '?') {
505 if (!ParseAssertionType (expr_asn
))
506 throw NewParseException ("Bad conditional.");
509 expr_asn
.Negate
= false;
510 expr_asn
.Reverse
= false;
513 Group test
= new Group ();
514 ParseGroup (test
, options
, null);
515 expr_asn
.TestExpression
= test
;
518 else { // capture test
520 asn
= new CaptureAssertion ();
521 refs
.Add (asn
, name
);
524 Group
group = new Group ();
525 ParseGroup (group, options
, asn
);
529 case '#': { // comment
531 while (pattern
[ptr
++] != ')') {
532 if (ptr
>= pattern
.Length
)
533 throw NewParseException ("Unterminated (?#...) comment.");
539 throw NewParseException ("Bad grouping construct.");
543 private bool ParseAssertionType (ExpressionAssertion assertion
) {
544 if (pattern
[ptr
] == '<') {
545 switch (pattern
[ptr
+ 1]) {
547 assertion
.Negate
= false;
550 assertion
.Negate
= true;
556 assertion
.Reverse
= true;
560 switch (pattern
[ptr
]) {
562 assertion
.Negate
= false;
565 assertion
.Negate
= true;
571 assertion
.Reverse
= false;
578 private void ParseOptions (ref RegexOptions options
, bool negate
) {
580 switch (pattern
[ptr
]) {
583 options
&= ~RegexOptions
.IgnoreCase
;
585 options
|= RegexOptions
.IgnoreCase
;
590 options
&= ~RegexOptions
.Multiline
;
592 options
|= RegexOptions
.Multiline
;
597 options
&= ~RegexOptions
.ExplicitCapture
;
599 options
|= RegexOptions
.ExplicitCapture
;
604 options
&= ~RegexOptions
.Singleline
;
606 options
|= RegexOptions
.Singleline
;
611 options
&= ~RegexOptions
.IgnorePatternWhitespace
;
613 options
|= RegexOptions
.IgnorePatternWhitespace
;
624 private Expression
ParseCharacterClass (RegexOptions options
) {
626 if (pattern
[ptr
] == '^') {
633 ecma
= IsECMAScript (options
);
634 CharacterClass cls
= new CharacterClass (negate
, IsIgnoreCase (options
));
636 if (pattern
[ptr
] == ']') {
637 cls
.AddCharacter (']');
645 while (ptr
< pattern
.Length
) {
661 // didn't recognize escape
665 case 'b': c
= '\b'; break;
668 cls
.AddCategory (ecma
? Category
.EcmaDigit
: Category
.Digit
, false);
673 cls
.AddCategory (ecma
? Category
.EcmaWord
: Category
.Word
, false);
678 cls
.AddCategory (ecma
? Category
.EcmaWhiteSpace
: Category
.WhiteSpace
, false);
683 cls
.AddCategory (ParseUnicodeCategory (), false); // ignore ecma
688 cls
.AddCategory (ecma
? Category
.EcmaDigit
: Category
.Digit
, true);
693 cls
.AddCategory (ecma
? Category
.EcmaWord
: Category
.Word
, true);
698 cls
.AddCategory (ecma
? Category
.EcmaWhiteSpace
: Category
.WhiteSpace
, true);
703 cls
.AddCategory (ParseUnicodeCategory (), true);
707 default: break; // add escaped character
714 throw NewParseException ("[x-y] range in reverse order.");
717 cls
.AddRange ((char)last
, (char)c
);
719 cls
.AddCharacter ((char)c
);
720 cls
.AddCharacter ('-');
727 cls
.AddCharacter ((char)c
);
733 throw NewParseException ("Unterminated [] set.");
736 cls
.AddCharacter ('-');
741 private void ParseRepetitionBounds (out int min
, out int max
, RegexOptions options
) {
746 ConsumeWhitespace (IsIgnorePatternWhitespace (options
));
748 if (pattern
[ptr
] == ',') {
751 n
= ParseNumber (10, 1, 0);
752 ConsumeWhitespace (IsIgnorePatternWhitespace (options
));
755 switch (pattern
[ptr
++]) {
760 ConsumeWhitespace (IsIgnorePatternWhitespace (options
));
761 m
= ParseNumber (10, 1, 0);
762 ConsumeWhitespace (IsIgnorePatternWhitespace (options
));
763 if (pattern
[ptr
++] != '}')
764 throw NewParseException ("Illegal {x,y} - bad value of y.");
767 throw NewParseException ("Illegal {x,y}");
770 /* check bounds and ordering */
772 if (n
>= 0xffff || m
>= 0xffff)
773 throw NewParseException ("Illegal {x, y} - maximum of 65535.");
775 throw NewParseException ("Illegal {x, y} with x > y.");
777 /* assign min and max */
786 private Category
ParseUnicodeCategory () {
787 if (pattern
[ptr
++] != '{')
788 throw NewParseException ("Incomplete \\p{X} character escape.");
790 string name
= ParseName (pattern
, ref ptr
);
792 throw NewParseException ("Incomplete \\p{X} character escape.");
794 Category cat
= CategoryUtils
.CategoryFromName (name
);
795 if (cat
== Category
.None
)
796 throw NewParseException ("Unknown property '" + name
+ "'.");
798 if (pattern
[ptr
++] != '}')
799 throw NewParseException ("Incomplete \\p{X} character escape.");
804 private Expression
ParseSpecial (RegexOptions options
) {
806 bool ecma
= IsECMAScript (options
);
807 Expression expr
= null;
809 switch (pattern
[ptr
++]) {
814 expr
= new CharacterClass (ecma
? Category
.EcmaDigit
: Category
.Digit
, false);
818 expr
= new CharacterClass (ecma
? Category
.EcmaWord
: Category
.Word
, false);
822 expr
= new CharacterClass (ecma
? Category
.EcmaWhiteSpace
: Category
.WhiteSpace
, false);
826 // this is odd - ECMAScript isn't supposed to support Unicode,
827 // yet \p{..} compiles and runs under the MS implementation
828 // identically to canonical mode. That's why I'm ignoring the
829 // value of ecma here.
831 expr
= new CharacterClass (ParseUnicodeCategory (), false);
835 expr
= new CharacterClass (ecma
? Category
.EcmaDigit
: Category
.Digit
, true);
839 expr
= new CharacterClass (ecma
? Category
.EcmaWord
: Category
.Word
, true);
843 expr
= new CharacterClass (ecma
? Category
.EcmaWhiteSpace
: Category
.WhiteSpace
, true);
847 expr
= new CharacterClass (ParseUnicodeCategory (), true);
852 case 'A': expr
= new PositionAssertion (Position
.StartOfString
); break;
853 case 'Z': expr
= new PositionAssertion (Position
.End
); break;
854 case 'z': expr
= new PositionAssertion (Position
.EndOfString
); break;
855 case 'G': expr
= new PositionAssertion (Position
.StartOfScan
); break;
856 case 'b': expr
= new PositionAssertion (Position
.Boundary
); break;
857 case 'B': expr
= new PositionAssertion (Position
.NonBoundary
); break;
861 case '1': case '2': case '3': case '4': case '5':
862 case '6': case '7': case '8': case '9': {
864 int n
= ParseNumber (10, 1, 0);
870 // FIXME test if number is within number of assigned groups
871 // this may present a problem for right-to-left matching
873 Reference reference
= new Reference (IsIgnoreCase (options
));
874 refs
.Add (reference
, n
.ToString ());
880 char delim
= pattern
[ptr
++];
883 else if (delim
!= '\'')
884 throw NewParseException ("Malformed \\k<...> named backreference.");
886 string name
= ParseName ();
887 if (name
== null || pattern
[ptr
] != delim
)
888 throw NewParseException ("Malformed \\k<...> named backreference.");
891 Reference reference
= new Reference (IsIgnoreCase (options
));
892 refs
.Add (reference
, name
);
908 private int ParseEscape () {
912 if (p
>= pattern
.Length
)
913 throw new ArgumentException (
914 String
.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
915 "pattern.", pattern
), pattern
);
917 switch (pattern
[ptr
++]) {
919 // standard escapes (except \b)
921 case 'a': return '\u0007';
922 case 't': return '\u0009';
923 case 'r': return '\u000d';
924 case 'v': return '\u000b';
925 case 'f': return '\u000c';
926 case 'n': return '\u000a';
927 case 'e': return '\u001b';
928 case '\\': return '\\';
934 int result
= ParseOctal (pattern
, ref ptr
);
935 if (result
== -1 && prevptr
== ptr
)
941 c
= ParseHex (pattern
, ref ptr
, 2);
943 throw NewParseException ("Insufficient hex digits");
948 c
= ParseHex (pattern
, ref ptr
, 4);
950 throw NewParseException ("Insufficient hex digits");
954 // control characters
958 if (c
>= '@' && c
<= '_')
961 throw NewParseException ("Unrecognized control character.");
971 private string ParseName () {
972 return Parser
.ParseName (pattern
, ref ptr
);
975 private static bool IsNameChar (char c
) {
976 UnicodeCategory cat
= Char
.GetUnicodeCategory (c
);
977 if (cat
== UnicodeCategory
.ModifierLetter
)
979 if (cat
== UnicodeCategory
.ConnectorPunctuation
)
981 return Char
.IsLetterOrDigit (c
);
984 private int ParseNumber (int b
, int min
, int max
) {
985 return Parser
.ParseNumber (pattern
, ref ptr
, b
, min
, max
);
988 private int ParseDecimal () {
989 return Parser
.ParseDecimal (pattern
, ref ptr
);
992 private static int ParseDigit (char c
, int b
, int n
) {
995 if (c
>= '0' && c
<= '7')
1000 if (c
>= '0' && c
<= '9')
1005 if (c
>= '0' && c
<= '9')
1007 else if (c
>= 'a' && c
<= 'f')
1008 return 10 + c
- 'a';
1009 else if (c
>= 'A' && c
<= 'F')
1010 return 10 + c
- 'A';
1018 private void ConsumeWhitespace (bool ignore
) {
1020 if (ptr
>= pattern
.Length
)
1023 if (pattern
[ptr
] == '(') {
1024 if (ptr
+ 3 >= pattern
.Length
)
1027 if (pattern
[ptr
+ 1] != '?' || pattern
[ptr
+ 2] != '#')
1031 while (pattern
[ptr
++] != ')')
1034 else if (ignore
&& pattern
[ptr
] == '#') {
1035 while (ptr
< pattern
.Length
&& pattern
[ptr
++] != '\n')
1038 else if (ignore
&& Char
.IsWhiteSpace (pattern
[ptr
])) {
1039 while (ptr
< pattern
.Length
&& Char
.IsWhiteSpace (pattern
[ptr
]))
1047 private string ParseString (string pattern
) {
1048 this.pattern
= pattern
;
1051 StringBuilder result
= new StringBuilder (pattern
.Length
);
1052 while (ptr
< pattern
.Length
) {
1053 int c
= pattern
[ptr
++];
1058 c
= pattern
[ptr
++];
1063 result
.Append ((char) c
);
1066 return result
.ToString ();
1069 private void ResolveReferences () {
1071 Hashtable dict
= new Hashtable ();
1073 // number unnamed groups
1075 foreach (CapturingGroup
group in caps
) {
1076 if (group.Name
== null) {
1077 dict
.Add (gid
.ToString (), group);
1078 group.Number
= gid
++;
1084 // number named groups
1086 foreach (CapturingGroup
group in caps
) {
1087 if (group.Name
!= null) {
1088 if (!dict
.Contains (group.Name
)) {
1089 dict
.Add (group.Name
, group);
1090 group.Number
= gid
++;
1095 CapturingGroup prev
= (CapturingGroup
)dict
[group.Name
];
1096 group.Number
= prev
.Number
;
1101 // resolve references
1103 foreach (Expression expr
in refs
.Keys
) {
1104 string name
= (string)refs
[expr
];
1105 if (!dict
.Contains (name
)) {
1106 throw NewParseException ("Reference to undefined group " +
1107 (Char
.IsDigit (name
[0]) ? "number " : "name ") +
1111 CapturingGroup
group = (CapturingGroup
)dict
[name
];
1112 if (expr
is Reference
)
1113 ((Reference
)expr
).CapturingGroup
= group;
1114 else if (expr
is CaptureAssertion
)
1115 ((CaptureAssertion
)expr
).CapturingGroup
= group;
1116 else if (expr
is BalancingGroup
)
1117 ((BalancingGroup
)expr
).Balance
= group;
1121 // flag helper functions
1123 private static bool IsIgnoreCase (RegexOptions options
) {
1124 return (options
& RegexOptions
.IgnoreCase
) != 0;
1127 private static bool IsMultiline (RegexOptions options
) {
1128 return (options
& RegexOptions
.Multiline
) != 0;
1131 private static bool IsExplicitCapture (RegexOptions options
) {
1132 return (options
& RegexOptions
.ExplicitCapture
) != 0;
1135 private static bool IsSingleline (RegexOptions options
) {
1136 return (options
& RegexOptions
.Singleline
) != 0;
1139 private static bool IsIgnorePatternWhitespace (RegexOptions options
) {
1140 return (options
& RegexOptions
.IgnorePatternWhitespace
) != 0;
1143 private static bool IsRightToLeft (RegexOptions options
) {
1144 return (options
& RegexOptions
.RightToLeft
) != 0;
1147 private static bool IsECMAScript (RegexOptions options
) {
1148 return (options
& RegexOptions
.ECMAScript
) != 0;
1151 // exception creation
1153 private ArgumentException
NewParseException (string msg
) {
1154 msg
= "parsing \"" + pattern
+ "\" - " + msg
;
1155 return new ArgumentException (msg
, pattern
);
1158 private string pattern
;
1161 private ArrayList caps
;
1162 private Hashtable refs
;
1163 private int num_groups
;