3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System
.Collections
;
33 namespace System
.Text
.RegularExpressions
.Syntax
{
36 class ExpressionCollection
: CollectionBase
{
37 public void Add (Expression e
) {
41 public Expression
this[int i
] {
42 get { return (Expression)List[i]; }
43 set { List[i] = value; }
46 protected override void OnValidate (object o
) {
47 // allow null elements
53 abstract class Expression
{
54 public abstract void Compile (ICompiler cmp
, bool reverse
);
55 public abstract void GetWidth (out int min
, out int max
);
57 public int GetFixedWidth () {
59 GetWidth (out min
, out max
);
67 public virtual AnchorInfo
GetAnchorInfo (bool reverse
) {
68 return new AnchorInfo (this, GetFixedWidth ());
71 public virtual bool IsComplex () {
76 // composite expressions
78 abstract class CompositeExpression
: Expression
{
79 public CompositeExpression () {
80 expressions
= new ExpressionCollection ();
83 protected ExpressionCollection Expressions
{
84 get { return expressions; }
87 protected void GetWidth (out int min
, out int max
, int count
) {
92 for (int i
= 0; i
< count
; ++ i
) {
93 Expression e
= Expressions
[i
];
99 e
.GetWidth (out a
, out b
);
100 if (a
< min
) min
= a
;
101 if (b
> max
) max
= b
;
108 private ExpressionCollection expressions
;
113 class Group
: CompositeExpression
{
117 public Expression Expression
{
118 get { return Expressions[0]; }
119 set { Expressions[0] = value; }
122 public void AppendExpression (Expression e
) {
126 public override void Compile (ICompiler cmp
, bool reverse
) {
127 int count
= Expressions
.Count
;
128 for (int i
= 0; i
< count
; ++ i
) {
131 e
= Expressions
[count
- i
- 1];
135 e
.Compile (cmp
, reverse
);
139 public override void GetWidth (out int min
, out int max
) {
143 foreach (Expression e
in Expressions
) {
145 e
.GetWidth (out a
, out b
);
147 if (max
== Int32
.MaxValue
|| b
== Int32
.MaxValue
)
148 max
= Int32
.MaxValue
;
154 public override AnchorInfo
GetAnchorInfo (bool reverse
) {
156 int width
= GetFixedWidth ();
158 ArrayList infos
= new ArrayList ();
159 IntervalCollection segments
= new IntervalCollection ();
161 // accumulate segments
164 //foreach (Expression e in Expressions) {
165 int count
= Expressions
.Count
;
166 for (int i
= 0; i
< count
; ++ i
) {
169 e
= Expressions
[count
- i
- 1];
173 AnchorInfo info
= e
.GetAnchorInfo (reverse
);
177 return new AnchorInfo (this, ptr
+ info
.Offset
, width
, info
.Position
);
179 if (info
.IsSubstring
)
180 segments
.Add (info
.GetInterval (ptr
));
182 if (info
.IsUnknownWidth
)
188 // normalize and find the longest segment
190 segments
.Normalize ();
192 Interval longest
= Interval
.Empty
;
193 foreach (Interval segment
in segments
) {
194 if (segment
.Size
> longest
.Size
)
198 // now chain the substrings that made this segment together
200 if (!longest
.IsEmpty
) {
202 ArrayList strs
= new ArrayList();
207 //foreach (AnchorInfo info in infos) {
208 for (int i
= 0; i
< infos
.Count
; ++ i
) {
211 info
= (AnchorInfo
)infos
[i
];
213 if (info
.IsSubstring
&& longest
.Contains (info
.GetInterval (ptr
))) {
214 //str += info.Substring; // TODO mark subexpressions
215 strs
.Add(info
.Substring
);
216 ignore
|= info
.IgnoreCase
;
220 if (info
.IsUnknownWidth
)
226 for (int i
= 0; i
< strs
.Count
; ++i
)
229 str
+= strs
[strs
.Count
- i
- 1];
237 return new AnchorInfo (this, longest
.low
, width
, str
, ignore
);
240 return new AnchorInfo (this, width
);
243 public override bool IsComplex () {
245 foreach (Expression e
in Expressions
) {
246 comp
|= e
.IsComplex ();
249 return comp
| GetFixedWidth () <= 0;
253 class RegularExpression
: Group
{
254 public RegularExpression () {
258 public int GroupCount
{
259 get { return group_count; }
260 set { group_count = value; }
263 public override void Compile (ICompiler cmp
, bool reverse
) {
267 GetWidth (out min
, out max
);
268 cmp
.EmitInfo (group_count
, min
, max
);
270 // anchoring expression
272 AnchorInfo info
= GetAnchorInfo (reverse
);
274 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
276 LinkRef pattern
= cmp
.NewLink ();
277 cmp
.EmitAnchor (reverse
, info
.Offset
, pattern
);
280 cmp
.EmitPosition (info
.Position
);
281 else if (info
.IsSubstring
)
282 cmp
.EmitString (info
.Substring
, info
.IgnoreCase
, reverse
);
288 cmp
.ResolveLink (pattern
);
289 base.Compile (cmp
, reverse
);
293 private int group_count
;
296 class CapturingGroup
: Group
{
297 public CapturingGroup () {
309 set { name = value; }
312 public bool IsNamed
{
313 get { return name != null; }
316 public override void Compile (ICompiler cmp
, bool reverse
) {
318 base.Compile (cmp
, reverse
);
322 public override bool IsComplex () {
330 class BalancingGroup
: CapturingGroup
{
331 public BalancingGroup () {
335 public CapturingGroup Balance
{
336 get { return balance; }
337 set { balance = value; }
340 public override void Compile (ICompiler cmp
, bool reverse
) {
341 // can't invoke Group.Compile from here :(
342 // so I'll just repeat the code
344 LinkRef tail
= cmp
.NewLink ();
346 cmp
.EmitBalanceStart (this.Number
, balance
.Number
, this.IsNamed
, tail
);
348 int count
= Expressions
.Count
;
349 for (int i
= 0; i
< count
; ++ i
) {
352 e
= Expressions
[count
- i
- 1];
356 e
.Compile (cmp
, reverse
);
360 cmp
.ResolveLink(tail
);
363 private CapturingGroup balance
;
366 class NonBacktrackingGroup
: Group
{
367 public NonBacktrackingGroup () {
370 public override void Compile (ICompiler cmp
, bool reverse
) {
371 LinkRef tail
= cmp
.NewLink ();
374 base.Compile (cmp
, reverse
);
376 cmp
.ResolveLink (tail
);
379 public override bool IsComplex () {
386 class Repetition
: CompositeExpression
{
387 public Repetition (int min
, int max
, bool lazy
) {
388 Expressions
.Add (null);
395 public Expression Expression
{
396 get { return Expressions[0]; }
397 set { Expressions[0] = value; }
412 set { lazy = value; }
415 public override void Compile (ICompiler cmp
, bool reverse
) {
416 if (Expression
.IsComplex ()) {
417 LinkRef until
= cmp
.NewLink ();
419 cmp
.EmitRepeat (min
, max
, lazy
, until
);
420 Expression
.Compile (cmp
, reverse
);
421 cmp
.EmitUntil (until
);
424 LinkRef tail
= cmp
.NewLink ();
426 cmp
.EmitFastRepeat (min
, max
, lazy
, tail
);
427 Expression
.Compile (cmp
, reverse
);
429 cmp
.ResolveLink (tail
);
433 public override void GetWidth (out int min
, out int max
) {
434 Expression
.GetWidth (out min
, out max
);
435 min
= min
* this.min
;
436 if (max
== Int32
.MaxValue
|| this.max
== 0xffff)
437 max
= Int32
.MaxValue
;
439 max
= max
* this.max
;
442 public override AnchorInfo
GetAnchorInfo (bool reverse
) {
443 int width
= GetFixedWidth ();
445 return new AnchorInfo (this, width
);
447 AnchorInfo info
= Expression
.GetAnchorInfo (reverse
);
449 return new AnchorInfo (this, info
.Offset
, width
, info
.Position
);
451 if (info
.IsSubstring
) {
452 if (info
.IsComplete
) {
454 for (int i
= 0; i
< Minimum
; ++ i
)
455 str
+= info
.Substring
;
457 return new AnchorInfo (this, 0, width
, str
, info
.IgnoreCase
);
460 return new AnchorInfo (this, info
.Offset
, width
, info
.Substring
, info
.IgnoreCase
);
463 return new AnchorInfo (this, width
);
466 private int min
, max
;
472 abstract class Assertion
: CompositeExpression
{
473 public Assertion () {
474 Expressions
.Add (null); // true expression
475 Expressions
.Add (null); // false expression
478 public Expression TrueExpression
{
479 get { return Expressions[0]; }
480 set { Expressions[0] = value; }
483 public Expression FalseExpression
{
484 get { return Expressions[1]; }
485 set { Expressions[1] = value; }
488 public override void GetWidth (out int min
, out int max
) {
489 GetWidth (out min
, out max
, 2);
491 if (TrueExpression
== null || FalseExpression
== null)
496 class CaptureAssertion
: Assertion
{
497 public CaptureAssertion () {
500 public CapturingGroup CapturingGroup
{
501 get { return group; }
502 set { group = value; }
505 public override void Compile (ICompiler cmp
, bool reverse
) {
506 int gid
= group.Number
;
507 LinkRef tail
= cmp
.NewLink ();
509 if (FalseExpression
== null) {
514 cmp
.EmitIfDefined (gid
, tail
);
515 TrueExpression
.Compile (cmp
, reverse
);
524 LinkRef false_expr
= cmp
.NewLink ();
525 cmp
.EmitIfDefined (gid
, false_expr
);
526 TrueExpression
.Compile (cmp
, reverse
);
528 cmp
.ResolveLink (false_expr
);
529 FalseExpression
.Compile (cmp
, reverse
);
532 cmp
.ResolveLink (tail
);
535 public override bool IsComplex () {
537 if (TrueExpression
!= null)
538 comp
|= TrueExpression
.IsComplex ();
539 if (FalseExpression
!= null)
540 comp
|= FalseExpression
.IsComplex ();
542 return comp
| GetFixedWidth () <= 0;
545 private CapturingGroup
group;
548 class ExpressionAssertion
: Assertion
{
549 public ExpressionAssertion () {
550 Expressions
.Add (null); // test expression
553 public bool Reverse
{
554 get { return reverse; }
555 set { reverse = value; }
559 get { return negate; }
560 set { negate = value; }
563 public Expression TestExpression
{
564 get { return Expressions[2]; }
565 set { Expressions[2] = value; }
568 public override void Compile (ICompiler cmp
, bool reverse
) {
569 LinkRef true_expr
= cmp
.NewLink ();
570 LinkRef false_expr
= cmp
.NewLink ();
572 // test op: positive / negative
575 cmp
.EmitTest (true_expr
, false_expr
);
577 cmp
.EmitTest (false_expr
, true_expr
);
579 // test expression: lookahead / lookbehind
581 TestExpression
.Compile (cmp
, this.reverse
);
584 // target expressions
586 if (TrueExpression
== null) { // (?= ...)
592 cmp
.ResolveLink (false_expr
);
594 cmp
.ResolveLink (true_expr
);
597 cmp
.ResolveLink (true_expr
);
598 TrueExpression
.Compile (cmp
, reverse
);
600 if (FalseExpression
== null) { // (?(...) ...)
606 cmp
.ResolveLink (false_expr
);
608 else { // (?(...) ... | ...)
616 LinkRef tail
= cmp
.NewLink ();
619 cmp
.ResolveLink (false_expr
);
620 FalseExpression
.Compile (cmp
, reverse
);
621 cmp
.ResolveLink (tail
);
626 private bool reverse
, negate
;
631 class Alternation
: CompositeExpression
{
632 public Alternation () {
635 public ExpressionCollection Alternatives
{
636 get { return Expressions; }
639 public void AddAlternative (Expression e
) {
640 Alternatives
.Add (e
);
643 public override void Compile (ICompiler cmp
, bool reverse
) {
644 // LinkRef next = cmp.NewLink ();
645 LinkRef tail
= cmp
.NewLink ();
647 foreach (Expression e
in Alternatives
) {
648 LinkRef next
= cmp
.NewLink ();
649 cmp
.EmitBranch (next
);
650 e
.Compile (cmp
, reverse
);
652 cmp
.ResolveLink (next
);
657 cmp
.ResolveLink (tail
);
658 cmp
.EmitAlternationEnd();
661 public override void GetWidth (out int min
, out int max
) {
662 GetWidth (out min
, out max
, Alternatives
.Count
);
665 public override bool IsComplex () {
667 foreach (Expression e
in Alternatives
) {
668 comp
|= e
.IsComplex ();
671 return comp
| GetFixedWidth () <= 0;
675 // terminal expressions
677 class Literal
: Expression
{
678 public Literal (string str
, bool ignore
) {
680 this.ignore
= ignore
;
683 public string String
{
688 public bool IgnoreCase
{
689 get { return ignore; }
690 set { ignore = value; }
693 public override void Compile (ICompiler cmp
, bool reverse
) {
698 cmp
.EmitCharacter (str
[0], false, ignore
, reverse
);
700 cmp
.EmitString (str
, ignore
, reverse
);
703 public override void GetWidth (out int min
, out int max
) {
704 min
= max
= str
.Length
;
707 public override AnchorInfo
GetAnchorInfo (bool reverse
) {
708 return new AnchorInfo (this, 0, str
.Length
, str
, ignore
);
711 public override bool IsComplex () {
719 class PositionAssertion
: Expression
{
720 public PositionAssertion (Position pos
) {
724 public Position Position
{
729 public override void Compile (ICompiler cmp
, bool reverse
) {
730 cmp
.EmitPosition (pos
);
733 public override void GetWidth (out int min
, out int max
) {
737 public override bool IsComplex () {
741 public override AnchorInfo
GetAnchorInfo (bool revers
) {
743 case Position
.StartOfString
: case Position
.StartOfLine
: case Position
.StartOfScan
:
744 return new AnchorInfo (this, 0, 0, pos
);
747 return new AnchorInfo (this, 0);
751 private Position pos
;
754 class Reference
: Expression
{
755 public Reference (bool ignore
) {
756 this.ignore
= ignore
;
759 public CapturingGroup CapturingGroup
{
760 get { return group; }
761 set { group = value; }
764 public bool IgnoreCase
{
765 get { return ignore; }
766 set { ignore = value; }
769 public override void Compile (ICompiler cmp
, bool reverse
) {
770 cmp
.EmitReference (group.Number
, ignore
, reverse
);
773 public override void GetWidth (out int min
, out int max
) {
774 //group.GetWidth (out min, out max);
775 // TODO set width to referenced group for non-cyclical references
777 max
= Int32
.MaxValue
;
780 public override bool IsComplex () {
781 return true; // FIXME incorporate cyclic check
784 private CapturingGroup
group;
788 class CharacterClass
: Expression
{
789 public CharacterClass (bool negate
, bool ignore
) {
790 this.negate
= negate
;
791 this.ignore
= ignore
;
793 intervals
= new IntervalCollection ();
795 // initialize pos/neg category arrays
797 int cat_size
= (int) Category
.LastValue
;
798 pos_cats
= new BitArray (cat_size
);
799 neg_cats
= new BitArray (cat_size
);
802 public CharacterClass (Category cat
, bool negate
) : this (false, false) {
803 this.AddCategory (cat
, negate
);
807 get { return negate; }
808 set { negate = value; }
811 public bool IgnoreCase
{
812 get { return ignore; }
813 set { ignore = value; }
816 public void AddCategory (Category cat
, bool negate
) {
826 public void AddCharacter (char c
) {
827 // TODO: this is certainly not the most efficient way of doing things
828 // TODO: but at least it produces correct results.
832 public void AddRange (char lo
, char hi
) {
833 Interval new_interval
= new Interval (lo
, hi
);
835 // ignore case is on. we must make sure our interval does not
836 // use upper case. if it does, we must normalize the upper case
837 // characters into lower case.
839 if (upper_case_characters
.Intersects (new_interval
)) {
840 Interval partial_new_interval
;
842 if (new_interval
.low
< upper_case_characters
.low
) {
843 partial_new_interval
= new Interval (upper_case_characters
.low
+ distance_between_upper_and_lower_case
,
844 new_interval
.high
+ distance_between_upper_and_lower_case
);
845 new_interval
.high
= upper_case_characters
.low
- 1;
848 partial_new_interval
= new Interval (new_interval
.low
+ distance_between_upper_and_lower_case
,
849 upper_case_characters
.high
+ distance_between_upper_and_lower_case
);
850 new_interval
.low
= upper_case_characters
.high
+ 1;
852 intervals
.Add (partial_new_interval
);
854 else if (upper_case_characters
.Contains (new_interval
)) {
855 new_interval
.high
+= distance_between_upper_and_lower_case
;
856 new_interval
.low
+= distance_between_upper_and_lower_case
;
859 intervals
.Add (new_interval
);
862 public override void Compile (ICompiler cmp
, bool reverse
) {
863 // create the meta-collection
864 IntervalCollection meta
=
865 intervals
.GetMetaCollection (new IntervalCollection
.CostDelegate (GetIntervalCost
));
869 int count
= meta
.Count
;
870 for (int i
= 0; i
< pos_cats
.Length
; ++ i
) {
871 if (pos_cats
[i
]) ++ count
;
872 if (neg_cats
[i
]) ++ count
;
878 // emit in op for |meta| > 1
880 LinkRef tail
= cmp
.NewLink ();
887 for (int i
= 0; i
< pos_cats
.Length
; ++ i
) {
890 cmp
.EmitCategory (Category
.Any
, negate
, reverse
);
892 cmp
.EmitCategory ((Category
)i
, negate
, reverse
);
893 } else if (neg_cats
[i
]) {
894 cmp
.EmitCategory ((Category
)i
, !negate
, reverse
);
898 // emit character/range/sets from meta-collection
900 foreach (Interval a
in meta
) {
901 if (a
.IsDiscontiguous
) { // Set
902 BitArray bits
= new BitArray (a
.Size
);
903 foreach (Interval b
in intervals
) {
904 if (a
.Contains (b
)) {
905 for (int i
= b
.low
; i
<= b
.high
; ++ i
)
906 bits
[i
- a
.low
] = true;
910 cmp
.EmitSet ((char)a
.low
, bits
, negate
, ignore
, reverse
);
912 else if (a
.IsSingleton
) // Character
913 cmp
.EmitCharacter ((char)a
.low
, negate
, ignore
, reverse
);
915 cmp
.EmitRange ((char)a
.low
, (char)a
.high
, negate
, ignore
, reverse
);
926 cmp
.ResolveLink (tail
);
930 public override void GetWidth (out int min
, out int max
) {
934 public override bool IsComplex () {
940 private static double GetIntervalCost (Interval i
) {
941 // use op length as cost metric (=> optimize for space)
943 if (i
.IsDiscontiguous
)
944 return 3 + ((i
.Size
+ 0xf) >> 4); // Set
945 else if (i
.IsSingleton
)
946 return 2; // Character
951 private static Interval upper_case_characters
= new Interval ((char)65, (char)90);
952 private const int distance_between_upper_and_lower_case
= 32;
953 private bool negate
, ignore
;
954 private BitArray pos_cats
, neg_cats
;
955 private IntervalCollection intervals
;
959 private Expression expr
;
961 private Position pos
;
968 public AnchorInfo (Expression expr
, int width
) {
975 this.pos
= Position
.Any
;
978 public AnchorInfo (Expression expr
, int offset
, int width
, string str
, bool ignore
) {
980 this.offset
= offset
;
983 this.str
= ignore
? str
.ToLower () : str
;
985 this.ignore
= ignore
;
986 this.pos
= Position
.Any
;
989 public AnchorInfo (Expression expr
, int offset
, int width
, Position pos
) {
991 this.offset
= offset
;
1000 public Expression Expression
{
1001 get { return expr; }
1005 get { return offset; }
1009 get { return width; }
1013 get { return (str != null) ? str.Length : 0; }
1016 public bool IsUnknownWidth
{
1017 get { return width < 0; }
1020 public bool IsComplete
{
1021 get { return Length == Width; }
1024 public string Substring
{
1028 public bool IgnoreCase
{
1029 get { return ignore; }
1032 public Position Position
{
1036 public bool IsSubstring
{
1037 get { return str != null; }
1040 public bool IsPosition
{
1041 get { return pos != Position.Any; }
1044 public Interval
GetInterval () {
1045 return GetInterval (0);
1048 public Interval
GetInterval (int start
) {
1050 return Interval
.Empty
;
1052 return new Interval (start
+ Offset
, start
+ Offset
+ Length
- 1);