JPC-RR r11.7
[jpcrr.git] / mnj / lua / MatchState.java
blob3b41fea3f51a36c7a5375e74ad0be6829cb01163
1 /* $Header: //info.ravenbrook.com/project/jili/version/1.1/code/mnj/lua/StringLib.java#1 $
2 * Copyright (c) 2006 Nokia Corporation and/or its subsidiary(-ies).
3 * All rights reserved.
5 * Permission is hereby granted, free of charge, to any person obtaining
6 * a copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject
11 * to the following conditions:
13 * The above copyright notice and this permission notice shall be
14 * included in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
20 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
21 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 // Modified 2009-12-26 by Ilari Liusvaara
26 // Split -> StringLib -> StringLib, MatchState and FormatItem as J2SE
27 // doesn't like multiple classes in the same file.
29 package mnj.lua;
31 import java.util.Vector;
33 final class MatchState
35 Lua L;
36 /** The entire string that is the subject of the match. */
37 String src;
38 /** The subject's length. */
39 int end;
40 /** Total number of captures (finished or unfinished). */
41 int level;
42 /** Each capture element is a 2-element array of (index, len). */
43 Vector<Object> capture = new Vector<Object>();
44 // :todo: consider adding the pattern string as a member (and removing
45 // p parameter from methods).
47 // :todo: consider removing end parameter, if end always == // src.length()
48 MatchState(Lua L, String src, int end)
50 this.L = L;
51 this.src = src;
52 this.end = end;
55 /**
56 * Returns the length of capture <var>i</var>.
58 private int captureLen(int i)
60 int[] c = (int[])capture.elementAt(i);
61 return c[1];
64 /**
65 * Returns the init index of capture <var>i</var>.
67 private int captureInit(int i)
69 int[] c = (int[])capture.elementAt(i);
70 return c[0];
73 /**
74 * Returns the 2-element array for the capture <var>i</var>.
76 private int[] capture(int i)
78 return (int[])capture.elementAt(i);
81 int capInvalid()
83 return L.error("invalid capture index");
86 int malBra()
88 return L.error("malformed pattern (missing '[')");
91 int capUnfinished()
93 return L.error("unfinished capture");
96 int malEsc()
98 return L.error("malformed pattern (ends with '%')");
101 char check_capture(char l)
103 l -= '1'; // relies on wraparound.
104 if (l >= level || captureLen(l) == CAP_UNFINISHED)
105 capInvalid();
106 return l;
109 int capture_to_close()
111 int lev = level;
112 for (lev--; lev>=0; lev--)
113 if (captureLen(lev) == CAP_UNFINISHED)
114 return lev;
115 return capInvalid();
118 int classend(String p, int pi)
120 switch (p.charAt(pi++))
122 case L_ESC:
123 // assert pi < p.length() // checked by callers
124 return pi+1;
126 case '[':
127 if (p.length() == pi)
128 return malBra();
129 if (p.charAt(pi) == '^')
130 ++pi;
131 do // look for a ']'
133 if (p.length() == pi)
134 return malBra();
135 if (p.charAt(pi++) == L_ESC)
137 if (p.length() == pi)
138 return malBra();
139 ++pi; // skip escapes (e.g. '%]')
140 if (p.length() == pi)
141 return malBra();
143 } while (p.charAt(pi) != ']');
144 return pi+1;
146 default:
147 return pi;
152 * @param c char match.
153 * @param cl character class.
155 static boolean match_class(char c, char cl)
157 boolean res;
158 switch (Character.toLowerCase(cl))
160 case 'a' : res = Syntax.isalpha(c); break;
161 case 'c' : res = Syntax.iscntrl(c); break;
162 case 'd' : res = Syntax.isdigit(c); break;
163 case 'l' : res = Syntax.islower(c); break;
164 case 'p' : res = Syntax.ispunct(c); break;
165 case 's' : res = Syntax.isspace(c); break;
166 case 'u' : res = Syntax.isupper(c); break;
167 case 'w' : res = Syntax.isalnum(c); break;
168 case 'x' : res = Syntax.isxdigit(c); break;
169 case 'z' : res = (c == 0); break;
170 default: return (cl == c);
172 return Character.isLowerCase(cl) ? res : !res;
176 * @param pi index in p of start of class.
177 * @param ec index in p of end of class.
179 static boolean matchbracketclass(char c, String p, int pi, int ec)
181 // :todo: consider changing char c to int c, then -1 could be used
182 // represent a guard value at the beginning and end of all strings (a
183 // better NUL). -1 of course would match no positive class.
185 // assert p.charAt(pi) == '[';
186 // assert p.charAt(ec) == ']';
187 boolean sig = true;
188 if (p.charAt(pi+1) == '^')
190 sig = false;
191 ++pi; // skip the '6'
193 while (++pi < ec)
195 if (p.charAt(pi) == L_ESC)
197 ++pi;
198 if (match_class(c, p.charAt(pi)))
199 return sig;
201 else if ((p.charAt(pi+1) == '-') && (pi+2 < ec))
203 pi += 2;
204 if (p.charAt(pi-2) <= c && c <= p.charAt(pi))
205 return sig;
207 else if (p.charAt(pi) == c)
209 return sig;
212 return !sig;
215 static boolean singlematch(char c, String p, int pi, int ep)
217 switch (p.charAt(pi))
219 case '.': return true; // matches any char
220 case L_ESC: return match_class(c, p.charAt(pi+1));
221 case '[': return matchbracketclass(c, p, pi, ep-1);
222 default: return p.charAt(pi) == c;
226 // Generally all the various match functions from PUC-Rio which take a
227 // MatchState and return a "const char *" are transformed into
228 // instance methods that take and return string indexes.
230 int matchbalance(int si, String p, int pi)
232 if (pi+1 >= p.length())
233 L.error("unbalanced pattern");
234 if (si >= end || src.charAt(si) != p.charAt(pi))
236 return -1;
238 char b = p.charAt(pi);
239 char e = p.charAt(pi+1);
240 int cont = 1;
241 while (++si < end)
243 if (src.charAt(si) == e)
245 if (--cont == 0)
246 return si+1;
248 else if (src.charAt(si) == b)
250 ++cont;
253 return -1; // string ends out of balance
256 int max_expand(int si, String p, int pi, int ep)
258 int i = 0; // counts maximum expand for item
259 while (si+i < end && singlematch(src.charAt(si+i), p, pi, ep))
261 ++i;
263 // keeps trying to match with the maximum repetitions
264 while (i >= 0)
266 int res = match(si+i, p, ep+1);
267 if (res >= 0)
268 return res;
269 --i; // else didn't match; reduce 1 repetition to try again
271 return -1;
274 int min_expand(int si, String p, int pi, int ep)
276 while (true)
278 int res = match(si, p, ep+1);
279 if (res >= 0)
280 return res;
281 else if (si < end && singlematch(src.charAt(si), p, pi, ep))
282 ++si; // try with one more repetition
283 else
284 return -1;
288 int start_capture(int si, String p, int pi, int what)
290 capture.setSize(level + 1);
291 capture.setElementAt(new int[] { si, what }, level);
292 ++level;
293 int res = match(si, p, pi);
294 if (res < 0) // match failed
296 --level;
298 return res;
301 int end_capture(int si, String p, int pi)
303 int l = capture_to_close();
304 capture(l)[1] = si - captureInit(l); // close it
305 int res = match(si, p, pi);
306 if (res < 0) // match failed?
308 capture(l)[1] = CAP_UNFINISHED; // undo capture
310 return res;
313 int match_capture(int si, char l)
315 l = check_capture(l);
316 int len = captureLen(l);
317 if (end - si >= len &&
318 src.regionMatches(false,
319 captureInit(l),
320 src,
322 len))
324 return si+len;
326 return -1;
329 static final char L_ESC = '%';
330 static final String SPECIALS = "^$*+?.([%-";
331 private static final int CAP_UNFINISHED = -1;
332 private static final int CAP_POSITION = -2;
335 * @param si index of subject at which to attempt match.
336 * @param p pattern string.
337 * @param pi index into pattern (from which to being matching).
338 * @return the index of the end of the match, -1 for no match.
340 int match(int si, String p, int pi)
342 // This code has been considerably changed in the transformation
343 // from C to Java. There are the following non-obvious changes:
344 // - The C code routinely relies on NUL being accessible at the end of
345 // the pattern string. In Java we can't do this, so we use many
346 // more explicit length checks and pull error cases into this
347 // function. :todo: consider appending NUL to the pattern string.
348 // - The C code uses a "goto dflt" which is difficult to transform in
349 // the usual way.
350 init: // labelled while loop emulates "goto init", which we use to
351 // optimize tail recursion.
352 while (true)
354 if (p.length() == pi) // end of pattern
355 return si; // match succeeded
356 switch (p.charAt(pi))
358 case '(':
359 if (p.length() == pi + 1)
361 return capUnfinished();
363 if (p.charAt(pi+1) == ')') // position capture?
364 return start_capture(si, p, pi+2, CAP_POSITION);
365 return start_capture(si, p, pi+1, CAP_UNFINISHED);
367 case ')': // end capture
368 return end_capture(si, p, pi+1);
370 case L_ESC:
371 if (p.length() == pi + 1)
373 return malEsc();
375 switch (p.charAt(pi+1))
377 case 'b': // balanced string?
378 si = matchbalance(si, p, pi+2);
379 if (si < 0)
380 return si;
381 pi += 4;
382 // else return match(ms, s, p+4);
383 continue init; // goto init
385 case 'f': // frontier
387 pi += 2;
388 if (p.length() == pi || p.charAt(pi) != '[')
389 return L.error("missing '[' after '%f' in pattern");
390 int ep = classend(p, pi); // indexes what is next
391 char previous = (si == 0) ? '\0' : src.charAt(si-1);
392 char at = (si == end) ? '\0' : src.charAt(si);
393 if (matchbracketclass(previous, p, pi, ep-1) ||
394 !matchbracketclass(at, p, pi, ep-1))
396 return -1;
398 pi = ep;
399 // else return match(ms, s, ep);
401 continue init; // goto init
403 default:
404 if (Syntax.isdigit(p.charAt(pi+1))) // capture results (%0-%09)?
406 si = match_capture(si, p.charAt(pi+1));
407 if (si < 0)
408 return si;
409 pi += 2;
410 // else return match(ms, s, p+2);
411 continue init; // goto init
413 // We emulate a goto dflt by a fallthrough to the next
414 // case (of the outer switch) and making sure that the
415 // next case has no effect when we fallthrough to it from here.
416 // goto dflt;
418 // FALLTHROUGH
419 case '$':
420 if (p.charAt(pi) == '$')
422 if (p.length() == pi+1) // is the '$' the last char in pattern?
423 return (si == end) ? si : -1; // check end of string
424 // else goto dflt;
426 // FALLTHROUGH
427 default: // it is a pattern item
429 int ep = classend(p, pi); // indexes what is next
430 boolean m = si < end && singlematch(src.charAt(si), p, pi, ep);
431 if (p.length() > ep)
433 switch (p.charAt(ep))
435 case '?': // optional
436 if (m)
438 int res = match(si+1, p, ep+1);
439 if (res >= 0)
440 return res;
442 pi = ep+1;
443 // else return match(s, ep+1);
444 continue init; // goto init
446 case '*': // 0 or more repetitions
447 return max_expand(si, p, pi, ep);
449 case '+': // 1 or more repetitions
450 return m ? max_expand(si+1, p, pi, ep) : -1;
452 case '-': // 0 or more repetitions (minimum)
453 return min_expand(si, p, pi, ep);
456 // else or default:
457 if (!m)
458 return -1;
459 ++si;
460 pi = ep;
461 // return match(ms, s+1, ep);
462 continue init;
469 * @param s index of start of match.
470 * @param e index of end of match.
472 Object onecapture(int i, int s, int e)
474 if (i >= level)
476 if (i == 0) // level == 0, too
477 return src.substring(s, e); // add whole match
478 else
479 capInvalid();
480 // NOTREACHED;
482 int l = captureLen(i);
483 if (l == CAP_UNFINISHED)
484 capUnfinished();
485 if (l == CAP_POSITION)
486 return Lua.valueOfNumber(captureInit(i) +1);
487 return src.substring(captureInit(i), captureInit(i) + l);
490 void push_onecapture(int i, int s, int e)
492 L.push(onecapture(i, s, e));
496 * @param s index of start of match.
497 * @param e index of end of match.
499 int push_captures(int s, int e)
501 int nlevels = (level == 0 && s >= 0) ? 1 : level;
502 for (int i=0; i<nlevels; ++i)
503 push_onecapture(i, s, e);
504 return nlevels; // number of strings pushed
507 /** A helper for gsub. Equivalent to add_s from lstrlib.c. */
508 void adds(StringBuffer b, int si, int ei)
510 String news = L.toString(L.value(3));
511 int l = news.length();
512 for (int i=0; i<l; ++i)
514 if (news.charAt(i) != L_ESC)
516 b.append(news.charAt(i));
518 else
520 ++i; // skip L_ESC
521 if (!Syntax.isdigit(news.charAt(i)))
523 b.append(news.charAt(i));
525 else if (news.charAt(i) == '0')
527 b.append(src.substring(si, ei));
529 else
531 // add capture to accumulated result
532 b.append(L.toString(onecapture(news.charAt(i) - '1', si, ei)));
538 /** A helper for gsub. Equivalent to add_value from lstrlib.c. */
539 void addvalue(StringBuffer b, int si, int ei)
541 switch (L.type(3))
543 case Lua.TNUMBER:
544 case Lua.TSTRING:
545 adds(b, si, ei);
546 return;
548 case Lua.TFUNCTION:
550 L.pushValue(3);
551 int n = push_captures(si, ei);
552 L.call(n, 1);
554 break;
556 case Lua.TTABLE:
557 L.push(L.getTable(L.value(3), onecapture(0, si, ei)));
558 break;
560 default:
562 L.argRaiseError(3, "string/function/table expected");
563 return;
566 if (!L.toBoolean(L.value(-1))) // nil or false
568 L.pop(1);
569 L.pushString(src.substring(si, ei));
571 else if (!Lua.isString(L.value(-1)))
573 L.error("invalid replacement value (a " +
574 Lua.typeName(L.type(-1)) + ")");
576 b.append(L.toString(L.value(-1))); // add result to accumulator
577 L.pop(1);