1 define: #Regex &parents: {Cloneable} &slots: {}.
2 Regex traits ensureNamespace: #Syntax.
3 Regex traits define: #Match.
5 "counter is incremented and stored inside expressions"
6 Regex define: #Parser &parents: {Regex} &slots: {
12 "regex slot is a regex syntax node"
13 Regex define: #Expression &parents: {Regex} &slots: {
15 #parser -> Regex Parser
18 "regex slot is a regex expression. subexpressions are the () portions that match. see the counter property.
19 the matchee is what we are trying to match"
20 Regex define: #Matcher &parents: {Regex} &slots: {
22 #subexpressions -> Dictionary new.
26 "nodes are output by the parser"
27 Regex Syntax define: #Node &parents: {Regex}.
29 "the top element with expressions surrounded by branches 'exp|exp'"
30 Regex Syntax define: #Branch &parents: {Regex Syntax Node} &slots: {
32 #expressions -> ExtensibleArray new
35 "must match in consecutive order 'ab'"
36 Regex Syntax define: #Expression &parents: {Regex Syntax Node} &slots: {
43 Regex Syntax define: #BeginningMarker &parents: {Regex Syntax Node} &slots: {}.
45 Regex Syntax define: #EndMarker &parents: {Regex Syntax Node} &slots: {}.
48 Regex Syntax define: #CharacterGroup &parents: {Regex Syntax Node} &slots: {
50 #elements -> ExtensibleArray new
53 group@(Regex Syntax CharacterGroup traits) newForAll: elements &negated: negated
55 negated `defaultsTo: group negated.
56 group new `>> [elements := elements as: group elements. negated := negated. ]
60 Regex Syntax define: #Character &parents: {Regex Syntax Node} &slots: {
64 Regex Syntax define: #CharacterRange &parents: {Regex Syntax Node} &slots: {
69 "Regex Match define: #Terminator &parents: {Regex}."
71 r@(Regex Syntax Character traits) newOn: c
73 r new `>> [character := c. ]
76 Regex Syntax define: #DecimalDigits -> (Regex Syntax CharacterRange new `>> [start := $0. end := $9. ]).
77 Regex Syntax define: #UppercaseLetters -> (Regex Syntax CharacterRange new `>> [start := $A. end := $Z. ]).
78 Regex Syntax define: #LowercaseLetters -> (Regex Syntax CharacterRange new `>> [start := $a. end := $z. ]).
79 Regex Syntax define: #WhitespaceElements -> {
80 Regex Syntax Character newOn: $\e.
81 Regex Syntax Character newOn: $\n.
82 Regex Syntax Character newOn: $\s .
83 Regex Syntax Character newOn: $\r.
84 Regex Syntax Character newOn: $\f.
85 Regex Syntax Character newOn: $\t.
88 Regex Parser traits define: #BackslashSpecials -> (Dictionary new*,
89 $e -> (Regex Syntax Character newOn: $\e),
90 $n -> (Regex Syntax Character newOn: $\n),
91 $r -> (Regex Syntax Character newOn: $\r),
92 $f -> (Regex Syntax Character newOn: $\f),
93 $t -> (Regex Syntax Character newOn: $\t),
94 $w -> (Regex Syntax CharacterGroup newForAll:
95 {Regex Syntax UppercaseLetters.
96 Regex Syntax LowercaseLetters.
97 Regex Syntax DecimalDigits.
98 Regex Syntax Character newOn: $_. }),
99 $W -> (Regex Syntax CharacterGroup newForAll:
100 {Regex Syntax UppercaseLetters.
101 Regex Syntax LowercaseLetters.
102 Regex Syntax DecimalDigits.
103 Regex Syntax Character newOn: $_} &negated: True),
104 $s -> (Regex Syntax CharacterGroup newForAll: Regex Syntax WhitespaceElements),
105 $S -> (Regex Syntax CharacterGroup newForAll: Regex Syntax WhitespaceElements &negated: True),
106 $d -> (Regex Syntax CharacterGroup newForAll: {Regex Syntax DecimalDigits}),
107 $D -> (Regex Syntax CharacterGroup newForAll: {Regex Syntax DecimalDigits} &negated: True)).
109 r@(Regex Expression traits) newOn: s@(String traits)
113 parser := r parser newOn: s.
114 regex := result parser parse. ]
117 p@(Regex Parser traits) newOn: s@(String traits)
125 n@(Regex Syntax Node traits) as: e@(Regex Syntax Expression traits)
127 e new `>> [term := n. ]
130 n@(Regex Syntax Expression traits) as: e@(Regex Syntax Expression traits)
135 p@(Regex Parser traits) next
137 p position >= p source size
138 ifTrue: [p position += 1. Nil]
139 ifFalse: [obj ::= p source at: p position.
144 p@(Regex Parser traits) previous
149 p@(Regex Parser traits) expected: x butFound: y
151 error: 'Parser expected ' ; x printString ; ' but found ' ; y printString.
154 p@(Regex Parser traits) parse
159 p@(Regex Parser traits) nextAssertIs: item
161 (next ::= p next) = item ifFalse: [p expected: item butFound: next].
164 p@(Regex Parser traits) backspaceCharacterFor: c
166 p BackslashSpecials at: c ifAbsent: [p Syntax Character newOn: c]
169 "branch ::= expression*
172 p@(Regex Parser traits) parseBranch
173 [| branch oldBranch obj next firstBranch |
174 branch := p Syntax Branch new `>> [| :b | nextBranch := Nil. expressions := b expressions new. ].
175 firstBranch := branch.
177 obj := p parseExpression.
179 branch expressions addLast: obj.
181 Nil -> [^ (branch expressions size <= 1 /\ [oldBranch isNil] ifTrue: [obj] ifFalse: [firstBranch])].
183 ^ (branch expressions size <= 1 /\ [oldBranch isNil] ifTrue: [obj] ifFalse: [firstBranch])].
184 $| -> [oldBranch := branch.
185 branch := p Syntax Branch new `>> [| :b | nextBranch := Nil. expressions := b expressions new. ].
186 oldBranch nextBranch := branch].
187 } otherwise: [p previous]
190 "repeatcount ::= {m,n}"
192 p@(Regex Parser traits) parseRepeatCount
197 "expression ::= term | term '?' | term '+' | term '*' | term numericrange"
198 p@(Regex Parser traits) parseExpression
201 (next := p next) caseOf: {
202 $? -> [obj := obj as: p Syntax Expression. obj repeatCount := 0 to: 1 by: 1].
203 $+ -> [obj := obj as: p Syntax Expression. obj repeatCount := 1 to: PositiveInfinity by: 1].
204 $* -> [obj := obj as: p Syntax Expression. obj repeatCount := 0 to: PositiveInfinity by: 1].
205 $\{ -> [obj := obj as: p Syntax Expression. obj repeatCount := p parseRepeatCount. p nextAssertIs: $\}]
206 } otherwise: [p previous].
210 "term ::= group | '(' branch ')"
211 p@(Regex Parser traits) parseTerm
214 $\( -> [count := p counter.
216 obj ::= p parseBranch as: p Syntax Expression. "need the counter slot"
217 obj counter := count.
220 } otherwise: [p previous. p parseGroup]
223 "group ::= symbol | '[' '^'? range* ']'"
224 p@(Regex Parser traits) parseGroup
226 (next := p next) caseOf: {
227 $\[ -> [obj ::= p Syntax CharacterGroup new `>> [elements := ExtensibleArray new. ].
228 (next := p next) = $^ ifTrue: [obj negated := True] ifFalse: [p previous].
229 [(next := p next) ifNil: [p expected: $\] butFound: Nil].
230 next = $\]] whileFalse:
231 [p previous. p parseRange ifNotNilDo: [|:r| obj elements addLast: r]].
233 } otherwise: [p previous. p parseSymbol]
236 "range ::= symbol | symbol '-' symbol"
237 p@(Regex Parser traits) parseRange
239 start ::= p parseSymbol.
241 $\] -> [p previous. start].
242 $- -> [p Syntax CharacterRange new `>> [start := start. end := p parseSymbol. ]]
243 } otherwise: [p previous. start]
246 "symbol ::= . | char | escape char"
247 p@(Regex Parser traits) parseSymbol
249 (next ::= p next) caseOf: {
250 $^ -> [p Syntax BeginningMarker new].
251 $$ -> [p Syntax EndMarker new].
252 $. -> [p Syntax CharacterRange new `>> [start: $\x00. end: $\xFF. ]]. "fixme ascii"
253 $\\ -> [p backspaceCharacterFor: p next]
254 } otherwise: [p Syntax Character new `>> [character := next. ]]
264 Regex Matcher define: #Fail -> -1.
266 m@(Regex Matcher traits) newOn: s@(String traits)
268 m newOn: (m Expression newOn: s)
271 m@(Regex Matcher traits) newOn: r@(Regex Expression traits)
273 m new `>> [regex := r. ]
276 s@(String traits) regex
278 Regex Expression newOn: s
281 r@(Regex Expression traits) matcher
283 Regex Matcher newOn: r
286 m@(Regex Matcher traits) subexpressionAt: x
288 (se ::= m subexpressions at: x ifAbsent: [^ Nil]) key >= m matchee size
289 ifFalse: [m matchee copyFrom: se key to: se value]
292 m@(Regex Matcher traits) subexpressionMatches
294 matches ::= Dictionary new.
295 m subexpressions keysDo:
296 [| :key | matches at: key put: (m subexpressionAt: key)].
300 m@(Regex Matcher traits) subexpressionMatchesArray
302 capacity ::= m subexpressions keys reduce: #max:`er ifEmpty: [0].
303 matches ::= Array new &capacity: capacity + 1.
304 m subexpressions keysDo:
305 [| :key | matches at: key put: (m subexpressionAt: key)].
309 m@(Regex Matcher traits) matches: node
311 (m match: node) ~= m Fail
314 m@(Regex Matcher traits) match: node@(Regex Syntax Expression traits) from: pos to: endPos
316 node counter ifNotNilDo:
317 [| :counter | m subexpressions at: counter put: {pos. endPos}]
320 m@(Regex Matcher traits) match: s@(String traits)
325 string@(String traits) =~ regex@(Regex Matcher traits)
327 regex `>> [match: string. subexpressionMatchesArray]
330 string@(String traits) =~ regex@(String traits)
332 string =~ (Regex Matcher newOn: regex)
335 string@(String traits) =~ regex@(Regex Expression traits)
337 string =~ (regex Matcher newOn: regex)
340 m@(Regex Matcher traits) match: s@(String traits) from: fromStart
342 m subexpressions := m subexpressions new.
344 ((res ::= m match: m regex regex from: fromStart) is: Sequence)
345 ifTrue: [res isEmpty ifTrue: [-1] ifFalse: [res last]]
349 "these functions return the position they match until"
350 m@(Regex Matcher traits) match: node@(Regex Syntax Node traits) from: pos
355 "match all in sequence(from start to end in sequence) or none"
356 m@(Regex Matcher traits) match: expressions@(Sequence traits) from: pos start: start end: end
357 [| curPos newPos exp futurePos|
359 [start > end ifTrue: [^ curPos].
360 exp := expressions at: start.
361 newPos := m match: exp from: curPos.
362 "wildcard matches produce array results of all the possible matches."
363 "do them in reverse to be greedy"
364 "inform: expressions printString ; 'from pos: ' ; pos printString ; ' start: ' ; start printString ; ' end: ' ; end printString.
365 inform: 'newPos: ' ; newPos printString."
366 (newPos is: Sequence)
367 ifTrue: [newPos reverseDo:
369 futurePos := m match: expressions from: matchPos start: start + 1 end: end.
370 (futurePos is: Sequence) ifTrue: [error: 'crap'].
371 futurePos ~= -1 ifTrue: [m markExpression: exp from: curPos to: matchPos.
374 ifFalse: [newPos = -1 ifTrue:
375 [m markExpression: exp from: curPos to: curPos.
382 m@(Regex Matcher traits) match: node@(Sequence traits) from: pos
383 [m match: node from: pos start: 0 end: node size - 1].
385 "expressions can return a list of matches because of repeating things"
386 m@(Regex Matcher traits) match: node@(Regex Syntax Branch traits) from: pos
388 (newPos ::= m match: node expressions from: pos) = -1
389 /\ [node nextBranch isNotNil]
390 ifTrue: [m match: node nextBranch from: pos]
394 m@(Regex Matcher traits) markExpression: node@(Regex Syntax Expression traits) from: start to: end
396 node counter ifNotNil: [m subexpressions at: node counter put: start -> (end - 1)]
399 m@(Regex Matcher traits) markExpression: node@(Regex Syntax Node traits) from: start to: end
400 [ "cannot mark node" ].
402 m@(Regex Matcher traits) match: node@(Regex Syntax Expression traits) from: pos
403 [| curPos newPos matches numMatches |
404 "no repeatCount means it just has to match once"
405 node repeatCount ifNil:
406 [(curPos := m match: node term from: pos) > pos
407 ifTrue: [m markExpression: node from: pos to: curPos].
410 matches := ExtensibleArray new.
411 node repeatCount start = 0 ifTrue: [matches addLast: pos]. "optional argument may match nil matchee"
413 [numMatches < node repeatCount end] whileTrue:
414 [newPos: (m match: node term from: curPos).
416 "inform: 'match: ' ; newPos printString ; ' from: ' ; node printString."
417 newPos = -1 ifTrue: [numMatches >= node repeatCount start
418 ifTrue: [curPos > pos ifTrue: [m markExpression: node from: pos to: curPos].
420 ifFalse: [^ newPos]].
422 matches addLast: curPos.
424 matches isEmpty ifTrue: [-1] ifFalse: [matches]
427 m@(Regex Matcher traits) match: node@(Regex Syntax Character traits) from: pos
429 pos >= m matchee size ifTrue: [^ -1].
430 (m matchee at: pos) = node character ifTrue: [pos + 1] ifFalse: [-1]
433 m@(Regex Matcher traits) match: node@(Regex Syntax CharacterRange traits) from: pos
435 pos >= m matchee size ifTrue: [^ -1].
436 (charCode := (m matchee at: pos) code) >= node start code
437 /\ [charCode <= node end code]
442 m@(Regex Matcher traits) match: node@(Regex Syntax CharacterGroup traits) from: pos
443 [ "there should be another way than assuming a match is just pos+1"
444 pos >= m matchee size ifTrue: [^ -1].
446 ifTrue: [(node elements noneSatisfy: [| :each | (m match: each from: pos) ~= -1]) ifTrue: [pos + 1] ifFalse: [-1]]
447 ifFalse: [(node elements anySatisfy: [| :each | (m match: each from: pos) ~= -1]) ifTrue: [pos + 1] ifFalse: [-1]]
450 m@(Regex Matcher traits) match: node@(Regex Syntax BeginningMarker traits) from: pos
452 pos = 0 ifTrue: [pos] ifFalse: [-1]
455 m@(Regex Matcher traits) match: node@(Regex Syntax EndMarker traits) from: pos
457 m matchee size <= pos ifTrue: [pos] ifFalse: [-1]
460 s@(PositionableStream traits) upToRegex: pattern@(String traits)
462 s upTo: pattern regex matcher
465 "fixme put this somewhere reasonable"
467 s@(PositionableStream traits) upToRegex: str@(String traits)
469 s upToRegex: str regex matcher
472 s@(PositionableStream traits) throughRegex: str@(String traits)
474 s throughRegex: str regex matcher
477 s@(PositionableStream traits) scanForRegex: matcher@(Regex Matcher traits) &goThrough: goThrough
479 goThrough `defaultsTo: False.
480 data := s collectionType new writer.
481 s restoringPositionDo:
482 [[s isAtEnd not /\ [(line := s nextLine) isNotNil]] whileTrue:
485 (found := matcher match: line from: pos) >= 0
486 ifTrue: [(line first: (goThrough ifTrue: [found] ifFalse: [pos])) >> data.
492 s@(PositionableStream traits) upToRegex: matcher@(Regex Matcher traits)
494 s scanForRegex: matcher &goThrough: False
497 s@(PositionableStream traits) throughRegex: matcher@(Regex Matcher traits)
499 s scanForRegex: matcher &goThrough: True
502 l maskedEntries: mask@(Regex Expression traits) do: block
503 "Works with glob code."
505 l maskedEntries: (mask Matcher newOn: mask) do: block