1 lobby ensureNamespace: #Regex.
2 Regex ensureNamespace: #Syntax.
3 Regex ensureNamespace: #Token.
4 Regex ensureNamespace: #Match.
6 "counter is incremented and stored inside expressions"
7 Regex define: #Parser &parents: {Cloneable} &slots: {
13 "regex is a regex syntax node"
14 Regex define: #Regex &parents: {Cloneable} &slots: {
16 #parser -> Regex Parser
19 "regex is a regex regex. subexpressions are the () portions that match. see the counter property.
20 the matchee is what we are trying to match"
21 Regex define: #Matcher &parents: {Cloneable} &slots: {
23 #subexpressions -> Dictionary new.
27 "nodes are output by the parser"
28 Regex Syntax define: #Node &parents: {Cloneable}.
30 "the top element with expressions surrounded by branches 'exp|exp'"
31 Regex Syntax define: #Branch &parents: {Regex Syntax Node} &slots: {
33 #expressions -> ExtensibleArray new
36 "must match in consecutive order 'ab'"
37 Regex Syntax define: #Expression &parents: {Regex Syntax Node} &slots: {
44 Regex Syntax define: #BeginningMarker &parents: {Regex Syntax Node} &slots: {}.
46 Regex Syntax define: #EndMarker &parents: {Regex Syntax Node} &slots: {}.
49 Regex Syntax define: #CharacterGroup &parents: {Regex Syntax Node} &slots: {
51 #elements -> ExtensibleArray new
54 Regex Syntax define: #Character &parents: {Regex Syntax Node} &slots: {
58 Regex Syntax define: #CharacterRange &parents: {Regex Syntax Node} &slots: {
63 "Regex Match define: #Terminator &parents: {Cloneable}."
65 r@(Regex Syntax Character traits) newOn: c
67 r new `>> [character := c. ]
70 Regex Syntax define: #DecimalDigits -> (Regex Syntax CharacterRange new `>> [start := $0. end := $9. ]).
71 Regex Syntax define: #UppercaseLetters -> (Regex Syntax CharacterRange new `>> [start := $A. end := $Z. ]).
72 Regex Syntax define: #LowercaseLetters -> (Regex Syntax CharacterRange new `>> [start := $a. end := $z. ]).
73 Regex Syntax define: #WhitespaceElements -> {
74 Regex Syntax Character newOn: $\e.
75 Regex Syntax Character newOn: $\n.
76 Regex Syntax Character newOn: $\s .
77 Regex Syntax Character newOn: $\r.
78 Regex Syntax Character newOn: $\f.
79 Regex Syntax Character newOn: $\t.
82 Regex Parser traits define: #BackslashSpecials -> (Dictionary new*,
83 $e -> (Regex Syntax Character newOn: $\e),
84 $n -> (Regex Syntax Character newOn: $\n),
85 $r -> (Regex Syntax Character newOn: $\r),
86 $f -> (Regex Syntax Character newOn: $\f),
87 $t -> (Regex Syntax Character newOn: $\t),
88 $w -> (Regex Syntax CharacterGroup new `>> [elements := {Regex Syntax UppercaseLetters.
89 Regex Syntax LowercaseLetters.
90 Regex Syntax DecimalDigits.
91 Regex Syntax Character newOn: $_. }. ]),
92 $W -> (Regex Syntax CharacterGroup new `>> [elements := {Regex Syntax UppercaseLetters.
93 Regex Syntax LowercaseLetters.
94 Regex Syntax DecimalDigits.
95 Regex Syntax Character newOn: $_. }.
97 $s -> (Regex Syntax CharacterGroup new `>> [elements := Regex Syntax WhitespaceElements. ]),
98 $S -> (Regex Syntax CharacterGroup new `>> [elements := Regex Syntax WhitespaceElements.
100 $d -> (Regex Syntax CharacterGroup new `>> [elements := {Regex Syntax DecimalDigits}. ]),
101 $D -> (Regex Syntax CharacterGroup new `>> [elements := {Regex Syntax DecimalDigits}.
102 negated := True. ])).
104 r@(Regex Regex traits) newOn: s@(String traits)
108 parser := r parser newOn: s.
109 regex := result parser parse. ]
112 rp@(Regex Parser traits) newOn: s@(String traits)
120 n@(Regex Syntax Node traits) as: e@(Regex Syntax Expression traits)
122 e new `>> [term := n. ]
125 n@(Regex Syntax Expression traits) as: e@(Regex Syntax Expression traits)
130 p@(Regex Parser traits) next
132 p position >= p source size
133 ifTrue: [p position += 1. Nil]
134 ifFalse: [obj ::= p source at: p position.
139 p@(Regex Parser traits) previous
144 p@(Regex Parser traits) expected: x butFound: y
146 error: 'Parser expected ' ; x printString ; ' but found ' ; y printString.
149 p@(Regex Parser traits) parse
154 p@(Regex Parser traits) nextAssertIs: item
156 (next ::= p next) = item ifFalse: [p expected: item butFound: next].
159 p@(Regex Parser traits) backspaceCharacterFor: c
161 Regex Parser BackslashSpecials at: c ifAbsent: [Regex Syntax Character newOn: c]
164 "branch ::= expression*
167 p@(Regex Parser traits) parseBranch
168 [| branch oldBranch obj next firstBranch |
169 branch := Regex Syntax Branch new `>> [| :b | nextBranch := Nil. expressions := b expressions new. ].
170 firstBranch := branch.
172 obj := p parseExpression.
174 branch expressions addLast: obj.
176 Nil -> [^ (branch expressions size <= 1 /\ [oldBranch isNil] ifTrue: [obj] ifFalse: [firstBranch])].
178 ^ (branch expressions size <= 1 /\ [oldBranch isNil] ifTrue: [obj] ifFalse: [firstBranch])].
179 $| -> [oldBranch := branch.
180 branch := Regex Syntax Branch new `>> [| :b | nextBranch := Nil. expressions := b expressions new. ].
181 oldBranch nextBranch := branch].
182 } otherwise: [p previous]
185 "repeatcount ::= {m,n}"
187 p@(Regex Parser traits) parseRepeatCount
192 "expression ::= term | term '?' | term '+' | term '*' | term numericrange"
193 p@(Regex Parser traits) parseExpression
198 $? -> [obj := obj as: Regex Syntax Expression. obj repeatCount := 0 to: 1 by: 1].
199 $+ -> [obj := obj as: Regex Syntax Expression. obj repeatCount := 1 to: PositiveInfinity by: 1].
200 $* -> [obj := obj as: Regex Syntax Expression. obj repeatCount := 0 to: PositiveInfinity by: 1].
201 $\{ -> [obj := obj as: Regex Syntax Expression. obj repeatCount := p parseRepeatCount. p nextAssertIs: $\}]
202 } otherwise: [p previous].
206 "term ::= group | '(' branch ')"
207 p@(Regex Parser traits) parseTerm
210 $\( -> [count := p counter.
212 obj ::= p parseBranch as: Regex Syntax Expression. "need the counter slot"
213 obj counter := count.
216 } otherwise: [p previous. p parseGroup]
219 "group ::= symbol | '[' '^'? range* ']'"
220 p@(Regex Parser traits) parseGroup
222 (next := p next) caseOf: {
223 $\[ -> [obj ::= Regex Syntax CharacterGroup new `>> [elements := ExtensibleArray new. ].
224 (next := p next) = $^ ifTrue: [obj negated := True] ifFalse: [p previous].
225 [(next := p next) ifNil: [p expected: $\] butFound: Nil].
226 next = $\]] whileFalse:
227 [p previous. p parseRange ifNotNilDo: [|:r| obj elements addLast: r]].
229 } otherwise: [p previous. p parseSymbol]
232 "range ::= symbol | symbol '-' symbol"
233 p@(Regex Parser traits) parseRange
235 start ::= p parseSymbol.
237 $\] -> [p previous. start].
238 $- -> [Regex Syntax CharacterRange new `>> [start := start. end := p parseSymbol. ]]
239 } otherwise: [p previous. start]
242 "symbol ::= . | char | escape char"
243 p@(Regex Parser traits) parseSymbol
246 $^ -> [Regex Syntax BeginningMarker new].
247 $$ -> [Regex Syntax EndMarker new].
248 $. -> [Regex Syntax CharacterRange new `>> [start: $\x00. end: $\xFF. ]]. "fixme ascii"
249 $\\ -> [p backspaceCharacterFor: p next]
250 } otherwise: [Regex Syntax Character new `>> [character := next. ]]
260 Regex Matcher define: #Fail -> -1.
262 m@(Regex Matcher traits) newOn: s@(String traits)
264 m newOn: (Regex Regex newOn: s)
267 m@(Regex Matcher traits) newOn: r@(Regex Regex traits)
269 m new `>> [regex := r. ]
272 s@(String traits) regex
277 r@(Regex Regex traits) matcher
279 Regex Matcher newOn: r
282 m@(Regex Matcher traits) subexpressionAt: x
284 (se ::= m subexpressions at: x ifAbsent: [^ Nil]) key >= m matchee size
285 ifFalse: [m matchee copyFrom: se key to: se value]
288 m@(Regex Matcher traits) subexpressionMatches
290 matches ::= Dictionary new.
291 m subexpressions keysDo:
292 [| :key | matches at: key put: (m subexpressionAt: key)].
296 m@(Regex Matcher traits) subexpressionMatchesArray
298 matches ::= Array new &capacity: m subexpressions keys max + 1.
299 m subexpressions keysDo:
300 [| :key | matches at: key put: (m subexpressionAt: key)].
304 m@(Regex Matcher traits) matches: node
306 (m match: node) ~= Regex Matcher Fail
309 m@(Regex Matcher traits) match: node@(Regex Syntax Expression traits) from: pos to: endPos
311 node counter ifNotNilDo:
312 [| :counter | m subexpressions at: counter put: {pos. endPos}]
315 m@(Regex Matcher traits) match: s@(String traits)
320 string@(String traits) =~ regex@(Regex Matcher traits)
322 regex `>> [match: string. subexpressionMatchesArray]
325 string@(String traits) =~ regex@(String traits)
327 string =~ (Regex Matcher newOn: regex)
330 string@(String traits) =~ regex@(Regex Regex traits)
332 string =~ (Regex Matcher newOn: regex)
335 m@(Regex Matcher traits) match: s@(String traits) from: fromStart
337 m subexpressions := m subexpressions new.
339 ((res ::= m match: m regex regex from: fromStart) is: Sequence)
340 ifTrue: [res isEmpty ifTrue: [-1] ifFalse: [res last]]
344 "these functions return the position they match until"
345 m@(Regex Matcher traits) match: node@(Regex Syntax Node traits) from: pos
350 "match all in sequence(from start to end in sequence) or none"
351 m@(Regex Matcher traits) match: expressions@(Sequence traits) from: pos start: start end: end
352 [| curPos newPos exp futurePos|
354 [start > end ifTrue: [^ curPos].
355 exp := expressions at: start.
356 newPos := m match: exp from: curPos.
357 "wildcard matches produce array results of all the possible matches."
358 "do them in reverse to be greedy"
359 "inform: expressions printString ; 'from pos: ' ; pos printString ; ' start: ' ; start printString ; ' end: ' ; end printString.
360 inform: 'newPos: ' ; newPos printString."
361 (newPos is: Sequence)
362 ifTrue: [newPos reverseDo:
364 futurePos := m match: expressions from: matchPos start: start + 1 end: end.
365 (futurePos is: Sequence) ifTrue: [error: 'crap'].
366 futurePos ~= -1 ifTrue: [m markExpression: exp from: curPos to: matchPos.
369 ifFalse: [newPos = -1 ifTrue:
370 [m markExpression: exp from: curPos to: curPos.
377 m@(Regex Matcher traits) match: node@(Sequence traits) from: pos
378 [m match: node from: pos start: 0 end: node size - 1].
380 "expressions can return a list of matches because of repeating things"
381 m@(Regex Matcher traits) match: node@(Regex Syntax Branch traits) from: pos
383 (newPos ::= m match: node expressions from: pos) = -1
384 /\ [node nextBranch isNotNil]
385 ifTrue: [m match: node nextBranch from: pos]
389 m@(Regex Matcher traits) markExpression: node@(Regex Syntax Expression traits) from: start to: end
391 node counter ifNotNil: [m subexpressions at: node counter put: start -> (end - 1)]
394 m@(Regex Matcher traits) markExpression: node@(Regex Syntax Node traits) from: start to: end
395 [ "cannot mark node" ].
397 m@(Regex Matcher traits) match: node@(Regex Syntax Expression traits) from: pos
398 [| curPos newPos matches numMatches |
399 "no repeatCount means it just has to match once"
400 node repeatCount ifNil:
401 [(curPos := m match: node term from: pos) > pos
402 ifTrue: [m markExpression: node from: pos to: curPos].
405 matches := ExtensibleArray new.
406 node repeatCount start = 0 ifTrue: [matches addLast: pos]. "optional argument may match nil matchee"
408 [numMatches < node repeatCount end] whileTrue:
409 [newPos: (m match: node term from: curPos).
411 "inform: 'match: ' ; newPos printString ; ' from: ' ; node printString."
412 newPos = -1 ifTrue: [numMatches >= node repeatCount start
413 ifTrue: [curPos > pos ifTrue: [m markExpression: node from: pos to: curPos].
415 ifFalse: [^ newPos]].
417 matches addLast: curPos.
419 matches isEmpty ifTrue: [-1] ifFalse: [matches]
422 m@(Regex Matcher traits) match: node@(Regex Syntax Character traits) from: pos
424 pos >= m matchee size ifTrue: [^ -1].
425 (m matchee at: pos) = node character ifTrue: [pos + 1] ifFalse: [-1]
428 m@(Regex Matcher traits) match: node@(Regex Syntax CharacterRange traits) from: pos
430 pos >= m matchee size ifTrue: [^ -1].
431 (charCode := (m matchee at: pos) code) >= node start code
432 /\ [charCode <= node end code]
437 m@(Regex Matcher traits) match: node@(Regex Syntax CharacterGroup traits) from: pos
438 [ "there should be another way than assuming a match is just pos+1"
439 pos >= m matchee size ifTrue: [^ -1].
441 ifTrue: [(node elements noneSatisfy: [| :each | (m match: each from: pos) ~= -1]) ifTrue: [pos + 1] ifFalse: [-1]]
442 ifFalse: [(node elements anySatisfy: [| :each | (m match: each from: pos) ~= -1]) ifTrue: [pos + 1] ifFalse: [-1]]
445 m@(Regex Matcher traits) match: node@(Regex Syntax BeginningMarker traits) from: pos
447 pos = 0 ifTrue: [pos] ifFalse: [-1]
450 m@(Regex Matcher traits) match: node@(Regex Syntax EndMarker traits) from: pos
452 m matchee size <= pos ifTrue: [pos] ifFalse: [-1]
455 s@(PositionableStream traits) upToRegex: pattern@(String traits)
457 s upTo: pattern regex matcher
460 "fixme put this somewhere reasonable"
462 s@(PositionableStream traits) upToRegex: str@(String traits)
464 s upToRegex: str regex matcher
467 s@(PositionableStream traits) throughRegex: str@(String traits)
469 s throughRegex: str regex matcher
472 s@(PositionableStream traits) scanForRegex: matcher@(Regex Matcher traits) &goThrough: goThrough
474 goThrough `defaultsTo: False.
475 data := s collectionType new writer.
476 s restoringPositionDo:
477 [[s isAtEnd not /\ [(line := s nextLine) isNotNil]] whileTrue:
480 (found := matcher match: line from: pos) >= 0
481 ifTrue: [(line first: (goThrough ifTrue: [found] ifFalse: [pos])) >> data.
487 s@(PositionableStream traits) upToRegex: matcher@(Regex Matcher traits)
489 s scanForRegex: matcher &goThrough: False
492 s@(PositionableStream traits) throughRegex: matcher@(Regex Matcher traits)
494 s scanForRegex: matcher &goThrough: True