1 //////////////////////////////////////////////////////////////////////////////
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
23 //////////////////////////////////////////////////////////////////////////////
25 #ifndef regular_expression_search_h
26 #define regular_expression_search_h
28 /////////////////////////////////////////////////////////////////////////////
29 // Class |RegExp| implements a regular expression searching package.
30 // A regular expression pattern is first ``compiled'' into a
31 // non-deterministic finite automata (NFA) in a compact form.
32 // Searching then involved actually running a text string through
33 // this NFA. Non-determinism is handled by recursion and backtracking.
35 /////////////////////////////////////////////////////////////////////////////
37 #include <AD/generic/generic.h> // Generic definitions
42 META_STAR
, // Meta character for Kleene star `*'
43 META_PLUS
, // Meta character Non-reflexive Kleene star `+'
44 META_OPEN
, // Meta character for '('
45 META_CLOSE
, // Meta character for ')'
46 META_SET_OPEN
, // Meta character '['
47 META_SET_CLOSE
, // Meta character ']'
48 META_OPTIONAL
, // Meta character for '?'
49 META_BEGIN
, // Meta character for '^', the beginning of a line
50 META_END
, // Meta character for '$', the end of a line
51 META_RANGE
, // Meta character for '-', range for characters
52 META_WILD
, // Meta character for '.', the wild card
53 META_OR
, // Meta character for '|', disjunction
54 META_ESCAPE
, // Escape meta character `\'
55 META_EOP
, // End of pattern, normally '\0'
56 META_NONE
// A non-meta character
61 /////////////////////////////////////////////////////////////////////////
62 // If matching is successful, the location of each pair of
63 // matching substrings within the patterns '(' and ')' is
64 // marked and remembered. This make it possible to extract
65 // substring within a pattern easily.
67 // We'll allow a maximum of 10 groups within a pattern and
68 // these are cached within the following fields.
69 /////////////////////////////////////////////////////////////////////////
72 // maximum number of paranthesized groups
73 enum { REGEXP_MAX_GROUPS
= 10 };
74 const char * myText
; // The current text string.
75 int left
[ REGEXP_MAX_GROUPS
]; // starting point of group `n'
76 int right
[ REGEXP_MAX_GROUPS
]; // ending of group `n'
77 const char * matchedText
;
78 const char * matchedTextEnd
;
80 /////////////////////////////////////////////////////////////////////////
81 // We'll represent the NFA in linear form as a set primitive string
82 // matching and branching(backtracking) instructions. These instructions
83 // will appear like ``opcodes'' of some abstract machine. Like this:
84 /////////////////////////////////////////////////////////////////////////
85 Opcode
* nfa
; // The list of instructions representing the nfa
86 int size
; // The size of the nfa.
87 int minMatchLen
; // The minimum length of matching text.
91 /////////////////////////////////////////////////////////////////////////
92 // Constructors and destructors
93 /////////////////////////////////////////////////////////////////////////
95 RegExp(const RegExp
&);
96 RegExp(const char * pattern
, const char * meta
= 0) : nfa(0)
97 { this->compile(pattern
,meta
); }
98 ~RegExp() { delete nfa
; }
100 /////////////////////////////////////////////////////////////////////////
102 /////////////////////////////////////////////////////////////////////////
103 RegExp
& operator = (const RegExp
&);
105 /////////////////////////////////////////////////////////////////////////
106 // Recompile a pattern
107 /////////////////////////////////////////////////////////////////////////
108 RegExp
& compile (const char * pattern
, const char * meta
= 0);
109 inline RegExp
& operator = (const char * pattern
) { return this->compile(pattern
); }
110 inline Bool
ok() { return nfa
!= 0; }
112 /////////////////////////////////////////////////////////////////////////
114 /////////////////////////////////////////////////////////////////////////
115 int Match(const char * text
, int length
= -1);
117 /////////////////////////////////////////////////////////////////////////
118 // Retrieve information about matching substrings
119 /////////////////////////////////////////////////////////////////////////
120 inline const char * operator[] (int i
) const { return myText
+ left
[i
]; }
121 inline const char * operator[] (char i
) const { return myText
+ left
[i
]; }
122 inline int begin() const { return matchedText
- myText
; }
123 inline int end() const { return matchedTextEnd
- myText
; }
124 inline int start(int i
) const { return left
[i
]; }
125 inline int stop(int i
) const { return right
[i
]; }
126 inline int len(int i
) const { return right
[i
] - left
[i
] + 1; }
129 /////////////////////////////////////////////////////////////////////////
130 // Internal matching and compilation function
131 /////////////////////////////////////////////////////////////////////////
132 const char * grep(const Opcode
*, const char *, const char *);
133 const char * times(int m
, int n
, const char *, const char *, const Opcode
*);
134 int comp(const char * pattern
, const MetaChar
[], Bool emit
);
135 friend void shift(Opcode
*,Opcode
*,int);
136 int minMatch(const Opcode
*);