1 lobby ensureNamespace: #Regex.
2 Regex ensureNamespace: #Syntax.
3 Regex ensureNamespace: #Token.
4 Regex ensureNamespace: #Match.
6 "counter is incremented and stored inside expressions"
7 Regex define: #Parser &parents: {Cloneable} &slots: {#position. #source. #counter}.
9 "regex is a regex syntax node"
10 Regex define: #Regex &parents: {Cloneable} &slots: {#regex. #parser -> Regex Parser.}.
12 "regex is a regex regex. subexpressions are the () portions that match. see the counter property.
13 the matchee is what we are trying to match"
14 Regex define: #Matcher &parents: {Cloneable} &slots: {#regex. #subexpressions. #matchee}.
16 "nodes are output by the parser"
17 Regex Syntax define: #Node &parents: {Cloneable}.
19 "the top element with expressions surrounded by branches 'exp|exp'"
20 Regex Syntax define: #Branch &parents: {Regex Syntax Node} &slots: {#nextBranch. #expressions}.
22 "must match in consecutive order 'ab'"
23 Regex Syntax define: #Expression &parents: {Regex Syntax Node} &slots: {#term. #counter. #repeatCount}.
26 Regex Syntax define: #BeginningMarker &parents: {Regex Syntax Node} &slots: {}.
28 Regex Syntax define: #EndMarker &parents: {Regex Syntax Node} &slots: {}.
31 Regex Syntax define: #CharacterGroup &parents: {Regex Syntax Node} &slots: {#negated -> False. #elements -> ExtensibleArray new}.
33 Regex Syntax define: #Character &parents: {Regex Syntax Node} &slots: {#character}.
35 Regex Syntax define: #CharacterRange &parents: {Regex Syntax Node} &slots: {#start. #end}.
37 "Regex Match define: #Terminator &parents: {Cloneable}."
39 r@(Regex Syntax Character traits) newOn: c
41 r new `>> [character: c. ]
44 Regex Syntax define: #DecimalDigits -> (Regex Syntax CharacterRange new `>> [start: $0. end: $9. ]).
45 Regex Syntax define: #UppercaseLetters -> (Regex Syntax CharacterRange new `>> [start: $A. end: $Z. ]).
46 Regex Syntax define: #LowercaseLetters -> (Regex Syntax CharacterRange new `>> [start: $a. end: $z. ]).
47 Regex Syntax define: #WhitespaceElements -> {
48 Regex Syntax Character newOn: $\e.
49 Regex Syntax Character newOn: $\n.
50 Regex Syntax Character newOn: $\s .
51 Regex Syntax Character newOn: $\r.
52 Regex Syntax Character newOn: $\f.
53 Regex Syntax Character newOn: $\t.
56 Regex Parser traits define: #BackslashSpecials -> (Dictionary new*,
57 $e -> (Regex Syntax Character newOn: $\e),
58 $n -> (Regex Syntax Character newOn: $\n),
59 $r -> (Regex Syntax Character newOn: $\r),
60 $f -> (Regex Syntax Character newOn: $\f),
61 $t -> (Regex Syntax Character newOn: $\t),
62 $w -> (Regex Syntax CharacterGroup new `>> [elements: {Regex Syntax UppercaseLetters.
63 Regex Syntax LowercaseLetters.
64 Regex Syntax DecimalDigits.
65 Regex Syntax Character newOn: $_. }. ]),
66 $W -> (Regex Syntax CharacterGroup new `>> [elements: {Regex Syntax UppercaseLetters.
67 Regex Syntax LowercaseLetters.
68 Regex Syntax DecimalDigits.
69 Regex Syntax Character newOn: $_. }.
71 $s -> (Regex Syntax CharacterGroup new `>> [elements: Regex Syntax WhitespaceElements. ]),
72 $S -> (Regex Syntax CharacterGroup new `>> [elements: Regex Syntax WhitespaceElements.
74 $d -> (Regex Syntax CharacterGroup new `>> [elements: {Regex Syntax DecimalDigits}. ]),
75 $D -> (Regex Syntax CharacterGroup new `>> [elements: {Regex Syntax DecimalDigits}.
78 r@(Regex Regex traits) newOn: s@(String traits)
81 re parser: (r parser newOn: s).
82 re regex: re parser parse.
86 rp@(Regex Parser traits) newOn: s@(String traits)
95 n@(Regex Syntax Node traits) as: e@(Regex Syntax Expression traits)
100 n@(Regex Syntax Expression traits) as: e@(Regex Syntax Expression traits)
105 p@(Regex Parser traits) next
107 p position >= p source size ifTrue: [p position: p position + 1. ^ Nil].
108 obj: (p source at: p position).
109 p position: p position + 1.
113 p@(Regex Parser traits) previous
115 p position: p position - 1.
118 p@(Regex Parser traits) expected: x butFound: y
120 error: 'Parser expected ' ; x printString ; ' but found ' ; y printString.
123 p@(Regex Parser traits) parse
128 p@(Regex Parser traits) nextAssertIs: item
131 next = item ifFalse: [p expected: item butFound: next].
134 p@(Regex Parser traits) backspaceCharacterFor: c
136 Regex Parser BackslashSpecials at: c ifAbsent: [(Regex Syntax Character newOn: c)]
139 "branch ::= expression*
142 p@(Regex Parser traits) parseBranch
143 [ | branch oldBranch obj next firstBranch |
144 branch: Regex Syntax Branch new `>> [nextBranch: Nil. expressions: ExtensibleArray new. ].
147 obj: p parseExpression.
149 branch expressions addLast: obj.
152 Nil -> [branch expressions size <= 1 /\ [oldBranch isNil] ifTrue: [^ obj] ifFalse: [^ firstBranch]].
154 branch expressions size <= 1 /\ [oldBranch isNil] ifTrue: [^ obj] ifFalse: [^ firstBranch]].
155 $| -> [oldBranch: branch.
156 branch: Regex Syntax Branch new `>> [nextBranch: Nil. expressions: ExtensibleArray new. ].
157 oldBranch nextBranch: branch].
158 } otherwise: [p previous]
163 "repeatcount ::= {m,n}"
165 p@(Regex Parser traits) parseRepeatCount
170 "expression ::= term | term '?' | term '+' | term '*' | term numericrange"
171 p@(Regex Parser traits) parseExpression
176 { $? -> [obj: (obj as: Regex Syntax Expression). obj repeatCount: (0 to: 1 by: 1). ^ obj].
177 $+ -> [obj: (obj as: Regex Syntax Expression). obj repeatCount: (1 to: PositiveInfinity by: 1). ^ obj].
178 $* -> [obj: (obj as: Regex Syntax Expression). obj repeatCount: (0 to: PositiveInfinity by: 1). ^ obj].
179 ${ -> [obj: (obj as: Regex Syntax Expression). obj repeatCount: p parseRepeatCount. p nextAssertIs: $}. ^ obj].
181 otherwise: [p previous. ^ obj]
185 "term ::= group | '(' branch ')"
186 p@(Regex Parser traits) parseTerm
190 { $( -> [count: p counter.
191 p counter: p counter + 1.
192 obj: (p parseBranch as: Regex Syntax Expression). "need the counter slot"
197 otherwise: [p previous. ^ p parseGroup]
201 "group ::= symbol | '[' '^'? range* ']'"
202 p@(Regex Parser traits) parseGroup
206 ifTrue: [obj: Regex Syntax CharacterGroup new `>> [elements: ExtensibleArray new. ].
208 next = $^ ifTrue: [obj negated: True] ifFalse: [p previous].
210 next ifNil: [p expected: $] butFound: Nil].
211 next = $]] whileFalse: [p previous. p parseRange ifNotNilDo: [|:r| obj elements addLast: r]].
213 ifFalse: [p previous. ^ p parseSymbol]
217 "range ::= symbol | symbol '-' symbol"
218 p@(Regex Parser traits) parseRange
219 [ | next start end obj |
220 start: p parseSymbol.
222 next = $] ifTrue: [p previous. ^ start].
224 ifTrue: [end: p parseSymbol.
225 obj: Regex Syntax CharacterRange new `>> [start: start. end: end. ].
227 ifFalse: [p previous. ^ start]
231 "symbol ::= . | char | escape char"
232 p@(Regex Parser traits) parseSymbol
235 next = $^ ifTrue: [^ (Regex Syntax BeginningMarker new)].
236 next = $$ ifTrue: [^ (Regex Syntax EndMarker new)].
237 next = $. ifTrue: [^ (Regex Syntax CharacterRange new `>> [start: $\x00. end: $\xFF. ])]. "fixme ascii"
239 ifTrue: [next: p next.
240 p backspaceCharacterFor: next]
241 ifFalse: [Regex Syntax Character new `>> [character: next. ]]
252 Regex Matcher define: #Fail -> -1.
254 m@(Regex Matcher traits) newOn: s@(String traits)
256 m newOn: (Regex Regex newOn: s)
259 m@(Regex Matcher traits) newOn: r@(Regex Regex traits)
261 m new `>> [regex: r. ]
264 s@(String traits) regex
269 r@(Regex Regex traits) matcher
271 Regex Matcher newOn: r
275 m@(Regex Matcher traits) subexpression: x
277 se: (m subexpressions at: x ifAbsent: [^ Nil]).
278 se key >= m matchee size ifTrue: [^ Nil].
279 m matchee copyFrom: se key to: se value
282 m@(Regex Matcher traits) subexpressionMatches
284 matches: Dictionary new.
285 m subexpressions keysDo: [|:key | matches at: key put: (m subexpression: key)].
289 m@(Regex Matcher traits) subexpressionMatchesArray
291 matches: (Array new &capacity: m subexpressions keys max + 1).
292 m subexpressions keysDo: [|:key | matches at: key put: (m subexpression: key)].
296 m@(Regex Matcher traits) matches: node
298 (m match: node) ~= Regex Matcher Fail
301 m@(Regex Matcher traits) match: node@(Regex Syntax Expression traits) from: pos to: endPos
303 node counter ifNotNil: [m subexpressions at: node counter put: {pos. endPos}]
306 m@(Regex Matcher traits) match: s@(String traits)
311 string@(String traits) =~ regex@(Regex Matcher traits)
313 regex `>> [match: string. subexpressionMatchesArray]
316 string@(String traits) =~ regex@(String traits)
318 string =~ (Regex Matcher newOn: regex)
321 string@(String traits) =~ regex@(Regex Regex traits)
323 string =~ (Regex Matcher newOn: regex)
327 m@(Regex Matcher traits) match: s@(String traits) from: fromStart
329 m subexpressions: Dictionary new.
331 res: (m match: m regex regex from: fromStart).
332 (res is: Sequence) ifTrue: [res isEmpty ifTrue: [-1] ifFalse: [res last]] ifFalse: [res]
335 "these functions return the position they match until"
336 m@(Regex Matcher traits) match: node@(Regex Syntax Node traits) from: pos
341 "match all in sequence(from start to end in sequence) or none"
342 m@(Regex Matcher traits) match: expressions@(Sequence traits) from: pos start: start end: end
343 [ | curPos newPos exp futurePos|
346 start > end ifTrue: [^ curPos].
347 exp: (expressions at: start).
348 newPos: (m match: exp from: curPos).
349 "wildcard matches produce array results of all the possible matches."
350 "do them in reverse to be greedy"
351 "inform: expressions printString ; 'from pos: ' ; pos printString ; ' start: ' ; start printString ; ' end: ' ; end printString.
352 inform: 'newPos: ' ; newPos printString."
353 (newPos is: Sequence)
354 ifTrue: [newPos reverseDo: [|:matchPos|
355 futurePos: (m match: expressions from: matchPos
356 start: start + 1 end: end).
357 (futurePos is: Sequence) ifTrue: [error: 'crap'].
358 futurePos ~= -1 ifTrue: [m markExpression: exp from: curPos to: matchPos.
362 ifFalse: [newPos = -1 ifTrue: [m markExpression: exp from: curPos to: curPos.
369 m@(Regex Matcher traits) match: node@(Sequence traits) from: pos
370 [ m match: node from: pos start: 0 end: node size - 1].
372 "expressions can return a list of matches because of repeating things"
373 m@(Regex Matcher traits) match: node@(Regex Syntax Branch traits) from: pos
375 newPos: (m match: node expressions from: pos).
376 newPos = -1 /\ [node nextBranch isNotNil]
377 ifTrue: [m match: node nextBranch from: pos]
381 m@(Regex Matcher traits) markExpression: node@(Regex Syntax Expression traits) from: start to: end
383 node counter ifNotNil: [m subexpressions at: node counter put: start -> (end - 1)]
386 m@(Regex Matcher traits) markExpression: node@(Regex Syntax Node traits) from: start to: end
387 [ "cannot mark node" ].
389 m@(Regex Matcher traits) match: node@(Regex Syntax Expression traits) from: pos
390 [ | curPos newPos matches numMatches |
391 "no repeatCount means it just has to match once"
392 node repeatCount ifNil: [curPos: (m match: node term from: pos).
393 curPos > pos ifTrue: [m markExpression: node from: pos to: curPos].
397 matches: ExtensibleArray new.
398 node repeatCount start = 0 ifTrue: [matches addLast: pos]. "optional argument may match nil matchee"
400 [numMatches < node repeatCount end]
401 whileTrue: [newPos: (m match: node term from: curPos).
403 "inform: 'match: ' ; newPos printString ; ' from: ' ; node printString."
404 newPos = -1 ifTrue: [numMatches >= node repeatCount start
405 ifTrue: [curPos > pos ifTrue: [m markExpression: node from: pos to: curPos].
409 matches addLast: curPos.
410 numMatches: numMatches + 1].
412 matches isEmpty ifTrue: [-1] ifFalse: [matches]
415 m@(Regex Matcher traits) match: node@(Regex Syntax Character traits) from: pos
417 pos >= m matchee size ifTrue: [^ -1].
418 (m matchee at: pos) = node character ifTrue: [pos + 1] ifFalse: [-1]
421 m@(Regex Matcher traits) match: node@(Regex Syntax CharacterRange traits) from: pos
423 pos >= m matchee size ifTrue: [^ -1].
424 charCode: (m matchee at: pos) code.
425 charCode >= node start code /\ [charCode <= node end code] ifTrue: [pos + 1] ifFalse: [-1]
428 m@(Regex Matcher traits) match: node@(Regex Syntax CharacterGroup traits) from: pos
429 [ "there should be another way than assuming a match is just pos+1"
430 pos >= m matchee size ifTrue: [^ -1].
432 ifTrue: [(node elements noneSatisfy: [|:each| (m match: each from: pos) ~= -1]) ifTrue: [pos + 1] ifFalse: [-1]]
433 ifFalse: [(node elements anySatisfy: [|:each| (m match: each from: pos) ~= -1]) ifTrue: [pos + 1] ifFalse: [-1]]
436 m@(Regex Matcher traits) match: node@(Regex Syntax BeginningMarker traits) from: pos
438 pos = 0 ifTrue: [pos] ifFalse: [-1]
441 m@(Regex Matcher traits) match: node@(Regex Syntax EndMarker traits) from: pos
443 m matchee size <= pos ifTrue: [pos] ifFalse: [-1]
446 s@(PositionableStream traits) upToRegex: pattern@(String traits)
448 s upTo: pattern regex matcher
451 "fixme put this somewhere reasonable"
453 s@(PositionableStream traits) upToRegex: str@(String traits)
455 s upToRegex: str regex matcher
458 s@(PositionableStream traits) throughRegex: str@(String traits)
460 s throughRegex: str regex matcher
463 s@(PositionableStream traits) scanForRegex: matcher@(Regex Matcher traits) &goThrough: goThrough
464 [ | start data line |
465 goThrough `defaultsTo: False.
467 data: s collectionType new writer.
468 s restoringPositionDo:
469 [[s isAtEnd not. line: (s nextLine). line isNotNil] whileTrue:
470 [0 to: line size do: [| :pos found | found: (matcher match: line from: pos).
471 found >= 0 ifTrue: [(line first: (goThrough ifTrue: [found] ifFalse: [pos])) >> data.
480 s@(PositionableStream traits) upToRegex: matcher@(Regex Matcher traits)
482 s scanForRegex: matcher &goThrough: False
485 s@(PositionableStream traits) throughRegex: matcher@(Regex Matcher traits)
487 s scanForRegex: matcher &goThrough: True