3 // namespace: System.Text.RegularExpressions
4 // file: interpreter.cs
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System
.Collections
;
32 using System
.Diagnostics
;
33 using System
.Globalization
;
35 namespace System
.Text
.RegularExpressions
{
37 class Interpreter
: IMachine
{
38 public Interpreter (ushort[] program
) {
39 this.program
= program
;
44 Debug
.Assert ((OpCode
)program
[0] == OpCode
.Info
, "Regex", "Cant' find info block");
46 this.group_count
= program
[1] + 1;
47 this.match_min
= program
[2];
48 this.match_max
= program
[3];
52 this.program_start
= 4;
53 this.groups
= new int [group_count
];
56 // IMachine implementation
58 public Match
Scan (Regex regex
, string text
, int start
, int end
) {
61 this.scan_ptr
= start
;
63 if (Eval (Mode
.Match
, ref scan_ptr
, program_start
))
64 return GenerateMatch (regex
);
71 private void Reset () {
76 private bool Eval (Mode mode
, ref int ref_ptr
, int pc
) {
80 ushort word
= program
[pc
];
81 OpCode op
= (OpCode
)(word
& 0x00ff);
82 OpFlags flags
= (OpFlags
)(word
& 0xff00);
86 int skip
= program
[pc
+ 1];
88 int anch_offset
= program
[pc
+ 2];
89 bool anch_reverse
= (flags
& OpFlags
.RightToLeft
) != 0;
90 int anch_ptr
= anch_reverse
? ptr
- anch_offset
: ptr
+ anch_offset
;
91 int anch_end
= text_end
- match_min
+ anch_offset
; // maximum anchor position
97 // the general case for an anchoring expression is at the bottom, however we
98 // do some checks for the common cases before to save processing time. the current
99 // optimizer only outputs three types of anchoring expressions: fixed position,
100 // fixed substring, and no anchor.
102 OpCode anch_op
= (OpCode
)(program
[pc
+ 3] & 0x00ff);
103 if (anch_op
== OpCode
.Position
&& skip
== 6) { // position anchor
108 switch ((Position
)program
[pc
+ 4]) {
109 case Position
.StartOfString
:
110 if (anch_reverse
|| anch_offset
== 0) {
112 if (TryMatch (ref ptr
, pc
+ skip
))
117 case Position
.StartOfLine
:
121 if (TryMatch (ref ptr
, pc
+ skip
))
127 while ((anch_reverse
&& anch_ptr
>= 0) || (!anch_reverse
&& anch_ptr
<= anch_end
)) {
128 if (anch_ptr
== 0 || text
[anch_ptr
- 1] == '\n') {
130 ptr
= anch_ptr
== anch_end
? anch_ptr
: anch_ptr
+ anch_offset
;
132 ptr
= anch_ptr
== 0 ? anch_ptr
: anch_ptr
- anch_offset
;
133 if (TryMatch (ref ptr
, pc
+ skip
))
144 case Position
.StartOfScan
:
145 if (anch_ptr
== scan_ptr
) {
146 ptr
= anch_reverse
? scan_ptr
+ anch_offset
: scan_ptr
- anch_offset
;
147 if (TryMatch (ref ptr
, pc
+ skip
))
157 else if (qs
!= null ||
158 (anch_op
== OpCode
.String
&& skip
== 6 + program
[pc
+ 4])) { // substring anchor
163 bool reverse
= ((OpFlags
)program
[pc
+ 3] & OpFlags
.RightToLeft
) != 0;
164 string substring
= GetString (pc
+ 3);
167 bool ignore
= ((OpFlags
)program
[pc
+ 3] & OpFlags
.IgnoreCase
) != 0;
169 qs
= new QuickSearch (substring
, ignore
, reverse
);
171 while ((anch_reverse
&& anch_ptr
>= anch_begin
)
172 || (!anch_reverse
&& anch_ptr
<= anch_end
)) {
176 anch_ptr
= qs
.Search (text
, anch_ptr
, anch_begin
);
178 anch_ptr
+= substring
.Length
;
182 anch_ptr
= qs
.Search (text
, anch_ptr
, anch_end
);
186 ptr
= reverse
? anch_ptr
+ anch_offset
: anch_ptr
- anch_offset
;
187 if (TryMatch (ref ptr
, pc
+ skip
))
196 else if (anch_op
== OpCode
.True
) { // no anchor
201 while ((anch_reverse
&& anch_ptr
>= anch_begin
)
202 || (!anch_reverse
&& anch_ptr
<= anch_end
)) {
205 if (TryMatch (ref ptr
, pc
+ skip
))
213 else { // general case
218 while ((anch_reverse
&& anch_ptr
>= anch_begin
)
219 || (!anch_reverse
&& anch_ptr
<= anch_end
)) {
222 if (Eval (Mode
.Match
, ref ptr
, pc
+ 3)) {
223 // anchor expression passed: try real expression at the correct offset
225 ptr
= anch_reverse
? anch_ptr
+ anch_offset
: anch_ptr
- anch_offset
;
226 if (TryMatch (ref ptr
, pc
+ skip
))
248 case OpCode
.Position
: {
249 if (!IsPosition ((Position
)program
[pc
+ 1], ptr
))
255 case OpCode
.String
: {
256 bool reverse
= (flags
& OpFlags
.RightToLeft
) != 0;
257 bool ignore
= (flags
& OpFlags
.IgnoreCase
) != 0;
258 int len
= program
[pc
+ 1];
266 if (ptr
+ len
> text_end
)
270 for (int i
= 0; i
< len
; ++ i
) {
271 char c
= text
[ptr
+ i
];
273 c
= Char
.ToLower (c
);
275 if (c
!= (char)program
[pc
++])
284 case OpCode
.Reference
: {
285 bool reverse
= (flags
& OpFlags
.RightToLeft
) != 0;
286 bool ignore
= (flags
& OpFlags
.IgnoreCase
) != 0;
287 int m
= GetLastDefined (program
[pc
+ 1]);
291 int str
= marks
[m
].Index
;
292 int len
= marks
[m
].Length
;
299 else if (ptr
+ len
> text_end
)
303 for (int i
= 0; i
< len
; ++ i
) {
305 if (Char
.ToLower (text
[ptr
+ i
]) != Char
.ToLower (text
[str
+ i
]))
309 if (text
[ptr
+ i
] != text
[str
+ i
])
319 case OpCode
.Character
: case OpCode
.Category
:
320 case OpCode
.Range
: case OpCode
.Set
: {
321 if (!EvalChar (mode
, ref ptr
, ref pc
, false))
327 int target
= pc
+ program
[pc
+ 1];
329 if (!EvalChar (mode
, ref ptr
, ref pc
, true))
337 Open (program
[pc
+ 1], ptr
);
343 Close (program
[pc
+ 1], ptr
);
348 case OpCode
.BalanceStart
: {
350 int start
= ptr
; //point before the balancing group
352 if (!Eval (Mode
.Match
, ref ptr
, pc
+ 5))
357 if(!Balance (program
[pc
+ 1], program
[pc
+ 2], (program
[pc
+ 3] == 1 ? true : false) , start
)) {
362 pc
+= program
[pc
+ 4];
366 case OpCode
.Balance
: {
370 case OpCode
.IfDefined
: {
371 int m
= GetLastDefined (program
[pc
+ 2]);
373 pc
+= program
[pc
+ 1];
380 if (!Eval (Mode
.Match
, ref ptr
, pc
+ 2))
383 pc
+= program
[pc
+ 1];
388 int cp
= Checkpoint ();
390 if (Eval (Mode
.Match
, ref test_ptr
, pc
+ 3))
391 pc
+= program
[pc
+ 1];
394 pc
+= program
[pc
+ 2];
399 case OpCode
.Branch
: {
402 int cp
= Checkpoint ();
403 if (Eval (Mode
.Match
, ref ptr
, pc
+ 2))
408 pc
+= program
[pc
+ 1];
409 branch_op
= (OpCode
)(program
[pc
] & 0xff);
410 } while (branch_op
!= OpCode
.False
);
416 pc
+= program
[pc
+ 1];
420 case OpCode
.Repeat
: {
421 this.repeat
= new RepeatContext (
422 this.repeat
, // previous context
423 program
[pc
+ 2], // minimum
424 program
[pc
+ 3], // maximum
425 (flags
& OpFlags
.Lazy
) != 0, // lazy
426 pc
+ 4 // subexpression
429 if (Eval (Mode
.Match
, ref ptr
, pc
+ program
[pc
+ 1]))
432 this.repeat
= this.repeat
.Previous
;
438 RepeatContext current
= this.repeat
;
439 int start
= current
.Start
;
441 if (!current
.IsMinimum
) {
444 if (Eval (Mode
.Match
, ref ptr
, repeat
.Expression
))
447 current
.Start
= start
;
452 if (ptr
== current
.Start
) {
453 // degenerate match ... match tail or fail
455 this.repeat
= current
.Previous
;
456 if (Eval (Mode
.Match
, ref ptr
, pc
+ 1))
459 this.repeat
= current
;
463 if (current
.IsLazy
) {
464 // match tail first ...
466 this.repeat
= current
.Previous
;
467 int cp
= Checkpoint ();
468 if (Eval (Mode
.Match
, ref ptr
, pc
+ 1))
473 // ... then match more
475 this.repeat
= current
;
476 if (!current
.IsMaximum
) {
479 if (Eval (Mode
.Match
, ref ptr
, current
.Expression
))
482 current
.Start
= start
;
490 // match more first ...
492 if (!current
.IsMaximum
) {
493 int cp
= Checkpoint ();
496 if (Eval (Mode
.Match
, ref ptr
, current
.Expression
))
499 current
.Start
= start
;
504 // ... then match tail
506 this.repeat
= current
.Previous
;
507 if (Eval (Mode
.Match
, ref ptr
, pc
+ 1))
510 this.repeat
= current
;
515 case OpCode
.FastRepeat
: {
516 this.fast
= new RepeatContext (
518 program
[pc
+ 2], // minimum
519 program
[pc
+ 3], // maximum
520 (flags
& OpFlags
.Lazy
) != 0, // lazy
521 pc
+ 4 // subexpression
525 int cp
= Checkpoint ();
527 pc
+= program
[pc
+ 1]; // tail expression
528 ushort tail_word
= program
[pc
];
530 int c1
, c2
; // first character of tail operator
531 int coff
; // 0 or -1 depending on direction
533 OpCode tail_op
= (OpCode
)(tail_word
& 0xff);
534 if (tail_op
== OpCode
.Character
|| tail_op
== OpCode
.String
) {
535 OpFlags tail_flags
= (OpFlags
)(tail_word
& 0xff00);
537 if (tail_op
== OpCode
.String
)
541 if ((tail_flags
& OpFlags
.RightToLeft
) != 0)
543 offset
= program
[pc
+ 1] - 1 ;
546 c1
= program
[pc
+ 2 + offset
]; // first char of string
549 c1
= program
[pc
+ 1]; // character
551 if ((tail_flags
& OpFlags
.IgnoreCase
) != 0)
552 c2
= Char
.ToUpper ((char)c1
); // ignore case
556 if ((tail_flags
& OpFlags
.RightToLeft
) != 0)
557 coff
= -1; // reverse
567 if (!fast
.IsMinimum
&& !Eval (Mode
.Count
, ref ptr
, fast
.Expression
)) {
568 //Console.WriteLine ("lazy fast: failed mininum.");
569 fast
= fast
.Previous
;
575 if ((c1
< 0 || (p
>= 0 && p
< text_end
&& (c1
== text
[p
] || c2
== text
[p
]))) &&
576 Eval (Mode
.Match
, ref ptr
, pc
))
579 if (fast
.IsMaximum
) {
580 //Console.WriteLine ("lazy fast: failed with maximum.");
581 fast
= fast
.Previous
;
586 if (!Eval (Mode
.Count
, ref ptr
, fast
.Expression
)) {
587 //Console.WriteLine ("lazy fast: no more.");
588 fast
= fast
.Previous
;
592 fast
= fast
.Previous
;
596 if (!Eval (Mode
.Count
, ref ptr
, fast
.Expression
)) {
597 fast
= fast
.Previous
;
603 width
= (ptr
- fast
.Start
) / fast
.Count
;
609 if ((c1
< 0 || (p
>= 0 && p
< text_end
&& (c1
== text
[p
] || c2
== text
[p
]))) &&
610 Eval (Mode
.Match
, ref ptr
, pc
))
614 if (!fast
.IsMinimum
) {
615 fast
= fast
.Previous
;
622 fast
= fast
.Previous
;
628 Debug
.Assert (false, "Regex", "Info block found in pattern");
642 if (fast
.IsMaximum
|| (fast
.IsLazy
&& fast
.IsMinimum
))
645 pc
= fast
.Expression
;
656 if (!fast
.IsLazy
&& fast
.IsMinimum
)
659 ref_ptr
= fast
.Start
;
667 private bool EvalChar (Mode mode
, ref int ptr
, ref int pc
, bool multi
) {
668 bool consumed
= false;
674 ushort word
= program
[pc
];
675 OpCode op
= (OpCode
)(word
& 0x00ff);
676 OpFlags flags
= (OpFlags
)(word
& 0xff00);
680 ignore
= (flags
& OpFlags
.IgnoreCase
) != 0;
682 // consume character: the direction of an In construct is
683 // determined by the direction of its first op
686 if ((flags
& OpFlags
.RightToLeft
) != 0) {
700 c
= Char
.ToLower (c
);
707 negate
= (flags
& OpFlags
.Negate
) != 0;
718 case OpCode
.Character
: {
719 if (c
== (char)program
[pc
++])
724 case OpCode
.Category
: {
725 if (CategoryUtils
.IsCategory ((Category
)program
[pc
++], c
))
732 int lo
= (char)program
[pc
++];
733 int hi
= (char)program
[pc
++];
734 if (lo
<= c
&& c
<= hi
)
740 int lo
= (char)program
[pc
++];
741 int len
= (char)program
[pc
++];
746 if (i
< 0 || i
>= len
<< 4)
750 if ((program
[bits
+ (i
>> 4)] & (1 << (i
& 0xf))) != 0)
760 private bool TryMatch (ref int ref_ptr
, int pc
) {
764 marks
[groups
[0]].Start
= ptr
;
765 if (Eval (Mode
.Match
, ref ptr
, pc
)) {
766 marks
[groups
[0]].End
= ptr
;
774 private bool IsPosition (Position pos
, int ptr
) {
776 case Position
.Start
: case Position
.StartOfString
:
779 case Position
.StartOfLine
:
780 return ptr
== 0 || text
[ptr
- 1] == '\n';
782 case Position
.StartOfScan
:
783 return ptr
== scan_ptr
;
786 return ptr
== text_end
||
787 (ptr
== text_end
- 1 && text
[ptr
] == '\n');
789 case Position
.EndOfLine
:
790 return ptr
== text_end
|| text
[ptr
] == '\n';
792 case Position
.EndOfString
:
793 return ptr
== text_end
;
795 case Position
.Boundary
:
800 return IsWordChar (text
[ptr
]);
801 else if (ptr
== text_end
)
802 return IsWordChar (text
[ptr
- 1]);
804 return IsWordChar (text
[ptr
]) != IsWordChar (text
[ptr
- 1]);
806 case Position
.NonBoundary
:
811 return !IsWordChar (text
[ptr
]);
812 else if (ptr
== text_end
)
813 return !IsWordChar (text
[ptr
- 1]);
815 return IsWordChar (text
[ptr
]) == IsWordChar (text
[ptr
- 1]);
822 private bool IsWordChar (char c
) {
823 return CategoryUtils
.IsCategory (Category
.Word
, c
);
826 private string GetString (int pc
) {
827 int len
= program
[pc
+ 1];
830 char[] cs
= new char[len
];
831 for (int i
= 0; i
< len
; ++ i
)
832 cs
[i
] = (char)program
[str
++];
834 return new string (cs
);
837 // capture management
839 private void Open (int gid
, int ptr
) {
840 int m
= groups
[gid
];
841 if (m
< mark_start
|| marks
[m
].IsDefined
) {
846 marks
[m
].Start
= ptr
;
849 private void Close (int gid
, int ptr
) {
850 marks
[groups
[gid
]].End
= ptr
;
853 private bool Balance (int gid
, int balance_gid
, bool capture
, int ptr
) {
854 int b
= groups
[balance_gid
];
856 if(b
== -1 || marks
[b
].Index
< 0) {
857 //Group not previously matched
861 Debug
.Assert (marks
[b
].IsDefined
, "Regex", "Balancng group not closed");
863 if (gid
> 0 && capture
){
864 Open (gid
, marks
[b
].Index
+ marks
[b
].Length
);
868 groups
[balance_gid
] = marks
[b
].Previous
;
873 private int Checkpoint () {
874 mark_start
= mark_end
;
878 private void Backtrack (int cp
) {
879 Debug
.Assert (cp
> mark_start
, "Regex", "Attempt to backtrack forwards");
881 for (int i
= 0; i
< groups
.Length
; ++ i
) {
884 m
= marks
[m
].Previous
;
890 private void ResetGroups () {
891 int n
= groups
.Length
;
893 marks
= new Mark
[n
* 10];
895 for (int i
= 0; i
< n
; ++ i
) {
898 marks
[i
].Start
= -1;
900 marks
[i
].Previous
= -1;
907 private int GetLastDefined (int gid
) {
908 int m
= groups
[gid
];
909 while (m
>= 0 && !marks
[m
].IsDefined
)
910 m
= marks
[m
].Previous
;
915 private int CreateMark (int previous
) {
916 if (mark_end
== marks
.Length
) {
917 Mark
[] dest
= new Mark
[marks
.Length
* 2];
918 marks
.CopyTo (dest
, 0);
923 marks
[m
].Start
= marks
[m
].End
= -1;
924 marks
[m
].Previous
= previous
;
929 private Match
GenerateMatch (Regex regex
) {
930 int[][] grps
= new int[groups
.Length
][];
931 ArrayList caps
= new ArrayList ();
933 for (int gid
= 0; gid
< groups
.Length
; ++ gid
) {
935 for (int m
= groups
[gid
]; m
>= 0; m
= marks
[m
].Previous
) {
936 if (!marks
[m
].IsDefined
)
939 caps
.Add (marks
[m
].Index
);
940 caps
.Add (marks
[m
].Length
);
943 grps
[gid
] = (int[])caps
.ToArray (typeof (int));
946 return new Match (regex
, this, text
, text_end
, grps
);
949 // interpreter attributes
951 private ushort[] program
; // regex program
952 private int program_start
; // first instruction after info block
953 private string text
; // input text
954 private int text_end
; // end of input text (last character + 1)
955 private int group_count
; // number of capturing groups
956 private int match_min
, match_max
; // match width information
957 private QuickSearch qs
; // fast substring matcher
961 private int scan_ptr
; // start of scan
963 private RepeatContext repeat
; // current repeat context
964 private RepeatContext fast
; // fast repeat context
966 private Mark
[] marks
= null; // mark stack
967 private int mark_start
; // start of current checkpoint
968 private int mark_end
; // end of checkpoint/next free mark
970 private int[] groups
; // current group definitions
974 private struct Mark {
975 public int Start, End;
978 public bool IsDefined {
979 get { return Start >= 0 && End >= 0; }
983 get { return Start < End ? Start : End; }
987 get { return Start < End ? End - Start : Start - End; }
991 private class RepeatContext
{
992 public RepeatContext (RepeatContext previous
, int min
, int max
, bool lazy
, int expr_pc
) {
993 this.previous
= previous
;
997 this.expr_pc
= expr_pc
;
1004 get { return count; }
1005 set { count = value; }
1009 get { return start; }
1010 set { start = value; }
1013 public bool IsMinimum
{
1014 get { return min <= count; }
1017 public bool IsMaximum
{
1018 get { return max <= count; }
1021 public bool IsLazy
{
1022 get { return lazy; }
1025 public int Expression
{
1026 get { return expr_pc; }
1029 public RepeatContext Previous
{
1030 get { return previous; }
1034 private int min
, max
;
1036 private int expr_pc
;
1037 private RepeatContext previous
;