initial
[prop.git] / include / AD / strings / regexp.h
blob794a6923225fb0fe84480fa4395af0129c03bb1e
1 //////////////////////////////////////////////////////////////////////////////
2 // NOTICE:
3 //
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
9 // your programs.
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
16 // code.
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
21 // Allen Leung
22 // 1994
23 //////////////////////////////////////////////////////////////////////////////
25 #ifndef regular_expression_search_h
26 #define regular_expression_search_h
28 /////////////////////////////////////////////////////////////////////////////
29 // Class |RegExp| implements a regular expression searching package.
30 // A regular expression pattern is first ``compiled'' into a
31 // non-deterministic finite automata (NFA) in a compact form.
32 // Searching then involved actually running a text string through
33 // this NFA. Non-determinism is handled by recursion and backtracking.
35 /////////////////////////////////////////////////////////////////////////////
37 #include <AD/generic/generic.h> // Generic definitions
39 class RegExp {
40 public:
41 enum MetaChar {
42 META_STAR, // Meta character for Kleene star `*'
43 META_PLUS, // Meta character Non-reflexive Kleene star `+'
44 META_OPEN, // Meta character for '('
45 META_CLOSE, // Meta character for ')'
46 META_SET_OPEN, // Meta character '['
47 META_SET_CLOSE, // Meta character ']'
48 META_OPTIONAL, // Meta character for '?'
49 META_BEGIN, // Meta character for '^', the beginning of a line
50 META_END, // Meta character for '$', the end of a line
51 META_RANGE, // Meta character for '-', range for characters
52 META_WILD, // Meta character for '.', the wild card
53 META_OR, // Meta character for '|', disjunction
54 META_ESCAPE, // Escape meta character `\'
55 META_EOP, // End of pattern, normally '\0'
56 META_NONE // A non-meta character
59 typedef char Opcode;
61 /////////////////////////////////////////////////////////////////////////
62 // If matching is successful, the location of each pair of
63 // matching substrings within the patterns '(' and ')' is
64 // marked and remembered. This make it possible to extract
65 // substring within a pattern easily.
67 // We'll allow a maximum of 10 groups within a pattern and
68 // these are cached within the following fields.
69 /////////////////////////////////////////////////////////////////////////
71 private:
72 // maximum number of paranthesized groups
73 enum { REGEXP_MAX_GROUPS = 10 };
74 const char * myText; // The current text string.
75 int left [ REGEXP_MAX_GROUPS ]; // starting point of group `n'
76 int right [ REGEXP_MAX_GROUPS ]; // ending of group `n'
77 const char * matchedText;
78 const char * matchedTextEnd;
80 /////////////////////////////////////////////////////////////////////////
81 // We'll represent the NFA in linear form as a set primitive string
82 // matching and branching(backtracking) instructions. These instructions
83 // will appear like ``opcodes'' of some abstract machine. Like this:
84 /////////////////////////////////////////////////////////////////////////
85 Opcode * nfa; // The list of instructions representing the nfa
86 int size; // The size of the nfa.
87 int minMatchLen; // The minimum length of matching text.
89 public:
91 /////////////////////////////////////////////////////////////////////////
92 // Constructors and destructors
93 /////////////////////////////////////////////////////////////////////////
94 RegExp() : nfa(0) {}
95 RegExp(const RegExp&);
96 RegExp(const char * pattern, const char * meta = 0) : nfa(0)
97 { this->compile(pattern,meta); }
98 ~RegExp() { delete nfa; }
100 /////////////////////////////////////////////////////////////////////////
101 // Assignment
102 /////////////////////////////////////////////////////////////////////////
103 RegExp& operator = (const RegExp&);
105 /////////////////////////////////////////////////////////////////////////
106 // Recompile a pattern
107 /////////////////////////////////////////////////////////////////////////
108 RegExp& compile (const char * pattern, const char * meta = 0);
109 inline RegExp& operator = (const char * pattern) { return this->compile(pattern); }
110 inline Bool ok() { return nfa != 0; }
112 /////////////////////////////////////////////////////////////////////////
113 // String matching
114 /////////////////////////////////////////////////////////////////////////
115 int Match(const char * text, int length = -1);
117 /////////////////////////////////////////////////////////////////////////
118 // Retrieve information about matching substrings
119 /////////////////////////////////////////////////////////////////////////
120 inline const char * operator[] (int i) const { return myText + left[i]; }
121 inline const char * operator[] (char i) const { return myText + left[i]; }
122 inline int begin() const { return matchedText - myText; }
123 inline int end() const { return matchedTextEnd - myText; }
124 inline int start(int i) const { return left[i]; }
125 inline int stop(int i) const { return right[i]; }
126 inline int len(int i) const { return right[i] - left[i] + 1; }
128 private:
129 /////////////////////////////////////////////////////////////////////////
130 // Internal matching and compilation function
131 /////////////////////////////////////////////////////////////////////////
132 const char * grep(const Opcode *, const char *, const char *);
133 const char * times(int m, int n, const char *, const char *, const Opcode *);
134 int comp(const char * pattern, const MetaChar [], Bool emit);
135 friend void shift(Opcode *,Opcode *,int);
136 int minMatch(const Opcode *);
139 #endif