1 # Copyright (C) 2007-2008, The Perl Foundation.
6 lib/luaregex.pir - Lua regex compiler
10 See "Lua 5.1 Reference Manual", section 5.4.1 "Patterns",
11 L<http://www.lua.org/manual/5.1/manual.html#5.4.1>.
13 =head2 Character Class:
15 A I<character class> is used to represent a set of characters. The following
16 combinations are allowed in describing a character class:
22 (where I<x> is not one of the I<magic characters> C<^$()%.[]*+-?>) represents
23 the character I<x> itself.
27 (a dot) represents all characters.
31 represents all letters.
35 represents all control characters.
39 represents all digits.
43 represents all lowercase letters.
47 represents all punctuation characters.
51 represents all space characters.
55 represents all uppercase letters.
59 represents all alphanumeric characters.
63 represents all hexadecimal digits.
67 represents the character with representation 0.
71 (where I<x> is any non-alphanumeric character) represents the character I<x>.
72 This is the standard way to escape the magic characters. Any punctuation
73 character (even the non magic) can be preceded by a C<'%'> when used to
74 represent itself in a pattern.
78 represents the class which is the union of all characters in I<set>. A range of
79 characters may be specified by separating the end characters of the range with
80 a C<'-'>. All classes C<%x> described above may also be used as components in
81 I<set>. All other characters in I<set> represent themselves. For example,
82 C<[%w_]> (or C<[_%w]>) represents all alphanumeric characters plus the
83 underscore, C<[0-7]> represents the octal digits, and C<[0-7%l%-]> represents
84 the octal digits plus the lowercase letters plus the C<'-'> character.
86 The interaction between ranges and classes is not defined. Therefore, patterns
87 like C<[%a-z]> or C<[a-%%]> have no meaning.
91 represents the complement of I<set>, where I<set> is interpreted as above.
95 For all classes represented by single letters (C<%a>, C<%c>, etc.), the
96 corresponding uppercase letter represents the complement of the class. For
97 instance, C<%S> represents all non-space characters.
99 The definitions of letter, space, and other character groups depend on the
100 current locale. In particular, the class C<[a-z]> may not be equivalent to
105 A I<pattern item> may be
111 a single character class, which matches any single character in the class;
115 a single character class followed by C<'*'>, which matches 0 or more
116 repetitions of characters in the class. These repetition items will always
117 match the longest possible sequence;
121 a single character class followed by C<'+'>, which matches 1 or more
122 repetitions of characters in the class. These repetition items will always
123 match the longest possible sequence;
127 a single character class followed by C<'-'>, which also matches 0 or more
128 repetitions of characters in the class. Unlike C<'*'>, these repetition items
129 will always match the I<shortest> possible sequence;
133 a single character class followed by C<'?'>, which matches 0 or 1
134 occurrence of a character in the class;
138 C<%n>, for I<n> between 1 and 9; such item matches a substring equal to
139 the I<n>-th captured string (see below);
143 C<%bxy>, where I<x> and I<y> are two distinct characters; such item
144 matches strings that start with I<x>, end with I<y>, and where the I<x> and
145 I<y> are I<balanced>. This means that, if one reads the string from left to
146 right, counting I<+1> for an I<x> and I<-1> for a I<y>, the ending I<y> is the
147 first I<y> where the count reaches 0. For instance, the item C<%b()> matches
148 expressions with balanced parentheses.
154 A I<pattern> is a sequence of pattern items. A C<'^'> at the beginning of a
155 pattern anchors the match at the beginning of the subject string. A C<'$'> at
156 the end of a pattern anchors the match at the end of the subject string. At
157 other positions, C<'^'> and C<'$'> have no special meaning and represent
162 A pattern may contain sub-patterns enclosed in parentheses; they describe
163 I<captures>. When a match succeeds, the substrings of the subject string that
164 match captures are stored (I<captured>) for future use. Captures are numbered
165 according to their left parentheses. For instance, in the pattern
166 C<"(a*(.)%w(%s*))">, the part of the string matching C<"a*(.)%w(%s*)"> is
167 stored as the first capture (and therefore has number 1); the character
168 matching C<"."> is captured with number 2, and the part matching C<"%s*"> has
171 As a special case, the empty capture C<()> captures the current string
172 position (a number). For instance, if we apply the pattern C<"()aa()"> on the
173 string C<"flaaap">, there will be two captures: 3 and 5.
175 A pattern cannot contain embedded zeros. Use C<%z> instead.
179 Mostly taken from F<compilers/pge/PGE/P5Regex.pir>.
187 .sub '__onload' :anon :load :init
188 load_bytecode 'PGE.pbc'
190 $P0 = subclass 'PGE::Exp::CCShortcut', 'PGE::Exp::LuaCCShortcut'
191 $P0 = subclass 'PGE::Exp::CGroup', 'PGE::Exp::LuaCGroup'
192 $P0 = subclass 'PGE::Exp', 'PGE::Exp::LuaBalanced'
193 $P0 = subclass 'Hash', 'PGE::Cache'
196 .namespace [ 'PGE::LuaRegex' ]
198 .sub 'compile_luaregex'
200 .param pmc adverbs :slurpy :named
202 $I0 = exists adverbs['grammar']
203 if $I0 goto have_grammar
204 adverbs['grammar'] = 'PGE::Grammar'
208 target = adverbs['target']
209 target = downcase target
211 unless target == '' goto no_cache
213 cache = get_hll_global ['PGE::LuaRegex'], 'cache_compile'
215 if null $P0 goto no_cache
220 match = luaregex(source)
221 if target != 'parse' goto check
225 unless match goto check_1
228 if $S0 == $S1 goto analyze
234 .local pmc expr, pad, code
238 expr = expr.'luaanalyze'(pad)
239 code = expr.'compile'(adverbs :flat :named)
248 optable = get_hll_global ['PGE::LuaRegex'], '$optable'
249 $P0 = optable.'parse'(mob)
254 .include 'cclass.pasm'
257 .sub '__onload' :anon :load :init
259 new optable, 'PGE::OPTable'
260 set_hll_global ['PGE::LuaRegex'], '$optable', optable
262 $P0 = get_hll_global ['PGE::LuaRegex'], 'parse_literal'
263 optable.newtok('term:', 'precedence'=>'=', 'nows'=>1, 'parsed'=>$P0)
265 $P0 = get_hll_global ['PGE::LuaRegex'], 'parse_anchor'
266 optable.newtok('term:^', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
267 optable.newtok('term:$', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
269 optable.newtok('term:%a', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
270 optable.newtok('term:%A', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
271 optable.newtok('term:%c', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
272 optable.newtok('term:%C', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
273 optable.newtok('term:%d', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
274 optable.newtok('term:%D', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
275 optable.newtok('term:%l', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
276 optable.newtok('term:%L', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
277 optable.newtok('term:%p', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
278 optable.newtok('term:%P', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
279 optable.newtok('term:%s', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
280 optable.newtok('term:%S', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
281 optable.newtok('term:%u', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
282 optable.newtok('term:%U', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
283 optable.newtok('term:%w', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
284 optable.newtok('term:%W', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
285 optable.newtok('term:%x', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
286 optable.newtok('term:%X', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::LuaCCShortcut')
288 $P0 = get_hll_global ['PGE::LuaRegex'], 'parse_backref'
289 optable.newtok('term:%1', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
290 optable.newtok('term:%2', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
291 optable.newtok('term:%3', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
292 optable.newtok('term:%4', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
293 optable.newtok('term:%5', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
294 optable.newtok('term:%6', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
295 optable.newtok('term:%7', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
296 optable.newtok('term:%8', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
297 optable.newtok('term:%9', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
299 optable.newtok('circumfix:( )', 'equiv'=>'term:', 'nows'=>1, 'nullterm'=>1, 'match'=>'PGE::Exp::LuaCGroup')
301 $P0 = get_hll_global ['PGE::LuaRegex'], 'parse_enumclass'
302 optable.newtok('term:[', 'precedence'=>'=', 'nows'=>1, 'parsed'=>$P0)
303 $P0 = get_hll_global ['PGE::LuaRegex'], 'parse_enumclass2'
304 optable.newtok('term:.', 'precedence'=>'=', 'nows'=>1, 'parsed'=>$P0)
305 optable.newtok('term:%z', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
306 optable.newtok('term:%Z', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
308 $P0 = get_hll_global ['PGE::LuaRegex'], 'parse_balanced'
309 optable.newtok('term:%b', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
311 $P0 = get_hll_global ['PGE::LuaRegex'], 'parse_quantifier'
312 optable.newtok('postfix:*', 'looser'=>'term:', 'left'=>1, 'nows'=>1, 'parsed'=>$P0)
313 optable.newtok('postfix:+', 'equiv'=>'postfix:*', 'left'=>1, 'nows'=>1, 'parsed'=>$P0)
314 optable.newtok('postfix:?', 'equiv'=>'postfix:*', 'left'=>1, 'nows'=>1, 'parsed'=>$P0)
315 optable.newtok('postfix:-', 'equiv'=>'postfix:*', 'left'=>1, 'nows'=>1, 'parsed'=>$P0)
317 optable.newtok('infix:', 'looser'=>'postfix:*', 'right'=>1, 'nows'=>1, 'match'=>'PGE::Exp::Concat')
320 new cache, 'PGE::Cache'
321 set_hll_global ['PGE::LuaRegex'], 'cache_compile', cache
323 $P0 = get_hll_global ['PGE::LuaRegex'], 'compile_luaregex'
324 compreg 'PGE::LuaRegex', $P0
331 .param string message
332 $P0 = getattribute mob, '$.pos'
334 $S0 = 'luaregex parse error: '
340 $P1 = getattribute mob, '$.target'
342 $S1 = substr $S1, pos, 1
352 .local int pos, lastpos
353 .local int litstart, litlen
354 .local string initchar
355 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Literal')
356 lastpos = length target
357 initchar = substr target, pos, 1
358 if initchar == ')' goto end
361 if initchar != '%' goto term_literal
362 if pos < lastpos goto term_percent_ok
363 parse_error(mob, pos, "malformed pattern (ends with '%')")
365 initchar = substr target, pos, 1
371 if pos >= lastpos goto term_literal_end
372 $S0 = substr target, pos, 1
373 $I0 = index '()%.[]*+-?', $S0
374 # if not in circumfix:( ) throw error on end paren
375 if $I0 >= 0 goto term_literal_end
378 goto term_literal_loop
380 if litlen < 1 goto term_literal_one
384 $S0 = substr target, litstart, $I0
385 $S0 = concat initchar, $S0
386 mob.'result_object'($S0)
# Constants for quantifier parsing.
# PGE_INF is the largest signed 32-bit integer (2**31 - 1).
# NOTE(review): presumably used as the "unbounded" repetition maximum -- confirm
# against the quantifier code, which is not fully visible here.
# PGE_BACKTRACK_GREEDY is the default backtrack mode set by parse_quantifier;
# PGE_BACKTRACK_EAGER is selected there when the quantifier key is '-'.
393 .const int PGE_INF = 2147483647
394 .const int PGE_BACKTRACK_GREEDY = 1
395 .const int PGE_BACKTRACK_EAGER = 2
397 .sub 'parse_quantifier'
400 .local int min, max, backtrack
401 .local int pos, lastpos
404 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Quant')
405 lastpos = length target
408 backtrack = PGE_BACKTRACK_GREEDY
409 if key != '+' goto quant_max
412 if key != '?' goto quant_eager
415 if key != '-' goto end
416 backtrack = PGE_BACKTRACK_EAGER
420 mob['backtrack'] = backtrack
426 .sub 'parse_enumclass'
429 .local int pos, lastpos
431 .local string charlist
434 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::EnumCharList')
435 lastpos = length target
439 $S0 = substr target, pos, 1
440 if $S0 != '^' goto scan_first
444 if pos >= lastpos goto err_close
445 $S0 = substr target, pos, 1
447 if $S0 == '%' goto percent
450 if pos >= lastpos goto err_close
451 $S0 = substr target, pos, 1
453 if $S0 == ']' goto endclass
454 if $S0 == '-' goto hyphenrange
455 if $S0 != '%' goto addchar
457 $S0 = substr target, pos, 1
460 if isrange goto addrange
465 $I2 = ord charlist, -1
467 if $I0 < $I2 goto err_range
470 if $I2 > $I0 goto scan
475 if isrange goto addrange
479 if isrange == 0 goto end
483 mob.'result_object'(charlist)
487 parse_error(mob, pos, "malformed pattern (missing ']')")
489 $S0 = 'Invalid [] range "'
496 parse_error(mob, pos, $S0)
500 .sub 'parse_enumclass2'
504 .local string charlist
507 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::EnumCharList')
508 unless key == '.' goto zero
515 unless key == '%Z' goto end
519 mob.'result_object'(charlist)
524 .sub 'parse_balanced'
527 .local int pos, lastpos
529 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::LuaBalanced')
530 lastpos = length target
531 if lastpos < 2 goto err
532 xy = substr target, pos, 2
535 mob.'result_object'(xy)
539 parse_error(mob, pos, "unbalanced pattern")
548 $P0 = getattribute mob, '$.target'
550 $P0 = getattribute mob, '$.pos'
553 $S0 = substr target, $I0, 1
554 (mob, $I1, $S1) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Scalar')
566 .local int pos, lastpos
569 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Anchor')
570 lastpos = length target
571 unless key == '$' goto start
572 unless pos == lastpos goto end
575 unless pos == 1 goto end
582 .namespace [ 'PGE::Exp' ]
584 .sub 'luaanalyze' :method
589 $I1 = defined self[$I0]
592 $P0 = $P0.'luaanalyze'(pad)
601 .namespace [ 'PGE::Exp::LuaCGroup' ]
603 .sub 'luaanalyze' :method
607 self['iscapture'] = 0
608 if self != '(' goto end
609 self['iscapture'] = 1
618 expr = expr.'luaanalyze'(pad)
629 $I0 = isa $P0, 'PGE::Exp::Literal'
630 unless $I0 goto super
632 unless $S0 == '' goto super
635 args = self.'getargs'(label, next)
637 .local string captgen, captsave, captback
638 (captgen, captsave, captback) = self.'gencapture'(label)
640 code.'emit'(<<" CODE", captgen, captsave, captback, args :flat :named)
643 push ustack, captscope
644 new captob, 'Integer'
650 captscope = pop ustack
656 $P0 = get_hll_global ['PGE::Exp::CGroup'], 'pir'
657 .return $P0(self, code, label, next)
661 .namespace [ 'PGE::Exp::LuaCCShortcut' ]
663 .sub 'reduce' :method
669 if token == '%A' goto letter
670 if token == '%C' goto ctrl
671 if token == '%D' goto digit
672 if token == '%L' goto lower
673 if token == '%P' goto ponct
674 if token == '%S' goto space
675 if token == '%U' goto upper
676 if token == '%W' goto word
677 if token == '%X' goto hexa
679 if token == '%a' goto letter
680 if token == '%c' goto ctrl
681 if token == '%d' goto digit
682 if token == '%l' goto lower
683 if token == '%p' goto ponct
684 if token == '%s' goto space
685 if token == '%u' goto upper
686 if token == '%w' goto word
687 if token == '%x' goto hexa
688 self['cclass'] = .CCLASS_ANY
691 self['cclass'] = .CCLASS_ALPHABETIC
694 self['cclass'] = .CCLASS_CONTROL
697 self['cclass'] = .CCLASS_NUMERIC
700 self['cclass'] = .CCLASS_LOWERCASE
703 self['cclass'] = .CCLASS_PUNCTUATION
706 self['cclass'] = .CCLASS_WHITESPACE
709 self['cclass'] = .CCLASS_UPPERCASE
712 self['cclass'] = .CCLASS_WORD
715 self['cclass'] = .CCLASS_HEXADECIMAL
721 .namespace [ 'PGE::Exp::LuaBalanced' ]
723 .sub 'reduce' :method
733 .local string begin, end
735 begin = substr $S0, 0, 1
736 begin = code.'escape'(begin)
737 end = substr $S0, 1, 1
738 end = code.'escape'(end)
740 code.'emit'(<<" CODE", label, begin, end, next)
742 if pos >= lastpos goto fail
743 $S0 = substr target, pos, 1
744 if $S0 != %1 goto fail
748 if pos >= lastpos goto fail
749 $S0 = substr target, pos, 1
750 if $S0 != %2 goto %0_2
752 if $I1 != 0 goto %0_1
756 if $S0 != %1 goto %0_1
768 # vim: expandtab shiftwidth=4 ft=pir: