1 /* $Header: //info.ravenbrook.com/project/jili/version/1.1/code/mnj/lua/StringLib.java#1 $
2 * Copyright (c) 2006 Nokia Corporation and/or its subsidiary(-ies).
5 * Permission is hereby granted, free of charge, to any person obtaining
6 * a copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject
11 * to the following conditions:
13 * The above copyright notice and this permission notice shall be
14 * included in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
20 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
21 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 // Modified 2009-12-26 by Ilari Liusvaara
26 // Split -> StringLib -> StringLib, MatchState and FormatItem as J2SE
27 // doesn't like multiple classes in the same file.
31 import java
.util
.Vector
;
33 final class MatchState
36 /** The entire string that is the subject of the match. */
38 /** The subject's length. */
40 /** Total number of captures (finished or unfinished). */
42 /** Each capture element is a 2-element array of (index, len). */
43 Vector
<Object
> capture
= new Vector
<Object
>();
44 // :todo: consider adding the pattern string as a member (and removing
45 // p parameter from methods).
47 // :todo: consider removing end parameter, if end always == // src.length()
48 MatchState(Lua L
, String src
, int end
)
56 * Returns the length of capture <var>i</var>.
58 private int captureLen(int i
)
60 int[] c
= (int[])capture
.elementAt(i
);
65 * Returns the init index of capture <var>i</var>.
67 private int captureInit(int i
)
69 int[] c
= (int[])capture
.elementAt(i
);
74 * Returns the 2-element array for the capture <var>i</var>.
76 private int[] capture(int i
)
78 return (int[])capture
.elementAt(i
);
83 return L
.error("invalid capture index");
88 return L
.error("malformed pattern (missing '[')");
93 return L
.error("unfinished capture");
98 return L
.error("malformed pattern (ends with '%')");
101 char check_capture(char l
)
103 l
-= '1'; // relies on wraparound.
104 if (l
>= level
|| captureLen(l
) == CAP_UNFINISHED
)
109 int capture_to_close()
112 for (lev
--; lev
>=0; lev
--)
113 if (captureLen(lev
) == CAP_UNFINISHED
)
118 int classend(String p
, int pi
)
120 switch (p
.charAt(pi
++))
123 // assert pi < p.length() // checked by callers
127 if (p
.length() == pi
)
129 if (p
.charAt(pi
) == '^')
133 if (p
.length() == pi
)
135 if (p
.charAt(pi
++) == L_ESC
)
137 if (p
.length() == pi
)
139 ++pi
; // skip escapes (e.g. '%]')
140 if (p
.length() == pi
)
143 } while (p
.charAt(pi
) != ']');
152 * @param c char match.
153 * @param cl character class.
155 static boolean match_class(char c
, char cl
)
158 switch (Character
.toLowerCase(cl
))
160 case 'a' : res
= Syntax
.isalpha(c
); break;
161 case 'c' : res
= Syntax
.iscntrl(c
); break;
162 case 'd' : res
= Syntax
.isdigit(c
); break;
163 case 'l' : res
= Syntax
.islower(c
); break;
164 case 'p' : res
= Syntax
.ispunct(c
); break;
165 case 's' : res
= Syntax
.isspace(c
); break;
166 case 'u' : res
= Syntax
.isupper(c
); break;
167 case 'w' : res
= Syntax
.isalnum(c
); break;
168 case 'x' : res
= Syntax
.isxdigit(c
); break;
169 case 'z' : res
= (c
== 0); break;
170 default: return (cl
== c
);
172 return Character
.isLowerCase(cl
) ? res
: !res
;
176 * @param pi index in p of start of class.
177 * @param ec index in p of end of class.
179 static boolean matchbracketclass(char c
, String p
, int pi
, int ec
)
181 // :todo: consider changing char c to int c, then -1 could be used
182 // represent a guard value at the beginning and end of all strings (a
183 // better NUL). -1 of course would match no positive class.
185 // assert p.charAt(pi) == '[';
186 // assert p.charAt(ec) == ']';
188 if (p
.charAt(pi
+1) == '^')
191 ++pi
; // skip the '6'
195 if (p
.charAt(pi
) == L_ESC
)
198 if (match_class(c
, p
.charAt(pi
)))
201 else if ((p
.charAt(pi
+1) == '-') && (pi
+2 < ec
))
204 if (p
.charAt(pi
-2) <= c
&& c
<= p
.charAt(pi
))
207 else if (p
.charAt(pi
) == c
)
215 static boolean singlematch(char c
, String p
, int pi
, int ep
)
217 switch (p
.charAt(pi
))
219 case '.': return true; // matches any char
220 case L_ESC
: return match_class(c
, p
.charAt(pi
+1));
221 case '[': return matchbracketclass(c
, p
, pi
, ep
-1);
222 default: return p
.charAt(pi
) == c
;
226 // Generally all the various match functions from PUC-Rio which take a
227 // MatchState and return a "const char *" are transformed into
228 // instance methods that take and return string indexes.
230 int matchbalance(int si
, String p
, int pi
)
232 if (pi
+1 >= p
.length())
233 L
.error("unbalanced pattern");
234 if (si
>= end
|| src
.charAt(si
) != p
.charAt(pi
))
238 char b
= p
.charAt(pi
);
239 char e
= p
.charAt(pi
+1);
243 if (src
.charAt(si
) == e
)
248 else if (src
.charAt(si
) == b
)
253 return -1; // string ends out of balance
256 int max_expand(int si
, String p
, int pi
, int ep
)
258 int i
= 0; // counts maximum expand for item
259 while (si
+i
< end
&& singlematch(src
.charAt(si
+i
), p
, pi
, ep
))
263 // keeps trying to match with the maximum repetitions
266 int res
= match(si
+i
, p
, ep
+1);
269 --i
; // else didn't match; reduce 1 repetition to try again
274 int min_expand(int si
, String p
, int pi
, int ep
)
278 int res
= match(si
, p
, ep
+1);
281 else if (si
< end
&& singlematch(src
.charAt(si
), p
, pi
, ep
))
282 ++si
; // try with one more repetition
288 int start_capture(int si
, String p
, int pi
, int what
)
290 capture
.setSize(level
+ 1);
291 capture
.setElementAt(new int[] { si
, what
}, level
);
293 int res
= match(si
, p
, pi
);
294 if (res
< 0) // match failed
301 int end_capture(int si
, String p
, int pi
)
303 int l
= capture_to_close();
304 capture(l
)[1] = si
- captureInit(l
); // close it
305 int res
= match(si
, p
, pi
);
306 if (res
< 0) // match failed?
308 capture(l
)[1] = CAP_UNFINISHED
; // undo capture
313 int match_capture(int si
, char l
)
315 l
= check_capture(l
);
316 int len
= captureLen(l
);
317 if (end
- si
>= len
&&
318 src
.regionMatches(false,
329 static final char L_ESC
= '%';
330 static final String SPECIALS
= "^$*+?.([%-";
331 private static final int CAP_UNFINISHED
= -1;
332 private static final int CAP_POSITION
= -2;
335 * @param si index of subject at which to attempt match.
336 * @param p pattern string.
337 * @param pi index into pattern (from which to being matching).
338 * @return the index of the end of the match, -1 for no match.
340 int match(int si
, String p
, int pi
)
342 // This code has been considerably changed in the transformation
343 // from C to Java. There are the following non-obvious changes:
344 // - The C code routinely relies on NUL being accessible at the end of
345 // the pattern string. In Java we can't do this, so we use many
346 // more explicit length checks and pull error cases into this
347 // function. :todo: consider appending NUL to the pattern string.
348 // - The C code uses a "goto dflt" which is difficult to transform in
350 init
: // labelled while loop emulates "goto init", which we use to
351 // optimize tail recursion.
354 if (p
.length() == pi
) // end of pattern
355 return si
; // match succeeded
356 switch (p
.charAt(pi
))
359 if (p
.length() == pi
+ 1)
361 return capUnfinished();
363 if (p
.charAt(pi
+1) == ')') // position capture?
364 return start_capture(si
, p
, pi
+2, CAP_POSITION
);
365 return start_capture(si
, p
, pi
+1, CAP_UNFINISHED
);
367 case ')': // end capture
368 return end_capture(si
, p
, pi
+1);
371 if (p
.length() == pi
+ 1)
375 switch (p
.charAt(pi
+1))
377 case 'b': // balanced string?
378 si
= matchbalance(si
, p
, pi
+2);
382 // else return match(ms, s, p+4);
383 continue init
; // goto init
385 case 'f': // frontier
388 if (p
.length() == pi
|| p
.charAt(pi
) != '[')
389 return L
.error("missing '[' after '%f' in pattern");
390 int ep
= classend(p
, pi
); // indexes what is next
391 char previous
= (si
== 0) ?
'\0' : src
.charAt(si
-1);
392 char at
= (si
== end
) ?
'\0' : src
.charAt(si
);
393 if (matchbracketclass(previous
, p
, pi
, ep
-1) ||
394 !matchbracketclass(at
, p
, pi
, ep
-1))
399 // else return match(ms, s, ep);
401 continue init
; // goto init
404 if (Syntax
.isdigit(p
.charAt(pi
+1))) // capture results (%0-%09)?
406 si
= match_capture(si
, p
.charAt(pi
+1));
410 // else return match(ms, s, p+2);
411 continue init
; // goto init
413 // We emulate a goto dflt by a fallthrough to the next
414 // case (of the outer switch) and making sure that the
415 // next case has no effect when we fallthrough to it from here.
420 if (p
.charAt(pi
) == '$')
422 if (p
.length() == pi
+1) // is the '$' the last char in pattern?
423 return (si
== end
) ? si
: -1; // check end of string
427 default: // it is a pattern item
429 int ep
= classend(p
, pi
); // indexes what is next
430 boolean m
= si
< end
&& singlematch(src
.charAt(si
), p
, pi
, ep
);
433 switch (p
.charAt(ep
))
435 case '?': // optional
438 int res
= match(si
+1, p
, ep
+1);
443 // else return match(s, ep+1);
444 continue init
; // goto init
446 case '*': // 0 or more repetitions
447 return max_expand(si
, p
, pi
, ep
);
449 case '+': // 1 or more repetitions
450 return m ?
max_expand(si
+1, p
, pi
, ep
) : -1;
452 case '-': // 0 or more repetitions (minimum)
453 return min_expand(si
, p
, pi
, ep
);
461 // return match(ms, s+1, ep);
469 * @param s index of start of match.
470 * @param e index of end of match.
472 Object
onecapture(int i
, int s
, int e
)
476 if (i
== 0) // level == 0, too
477 return src
.substring(s
, e
); // add whole match
482 int l
= captureLen(i
);
483 if (l
== CAP_UNFINISHED
)
485 if (l
== CAP_POSITION
)
486 return Lua
.valueOfNumber(captureInit(i
) +1);
487 return src
.substring(captureInit(i
), captureInit(i
) + l
);
490 void push_onecapture(int i
, int s
, int e
)
492 L
.push(onecapture(i
, s
, e
));
496 * @param s index of start of match.
497 * @param e index of end of match.
499 int push_captures(int s
, int e
)
501 int nlevels
= (level
== 0 && s
>= 0) ?
1 : level
;
502 for (int i
=0; i
<nlevels
; ++i
)
503 push_onecapture(i
, s
, e
);
504 return nlevels
; // number of strings pushed
507 /** A helper for gsub. Equivalent to add_s from lstrlib.c. */
508 void adds(StringBuffer b
, int si
, int ei
)
510 String news
= L
.toString(L
.value(3));
511 int l
= news
.length();
512 for (int i
=0; i
<l
; ++i
)
514 if (news
.charAt(i
) != L_ESC
)
516 b
.append(news
.charAt(i
));
521 if (!Syntax
.isdigit(news
.charAt(i
)))
523 b
.append(news
.charAt(i
));
525 else if (news
.charAt(i
) == '0')
527 b
.append(src
.substring(si
, ei
));
531 // add capture to accumulated result
532 b
.append(L
.toString(onecapture(news
.charAt(i
) - '1', si
, ei
)));
538 /** A helper for gsub. Equivalent to add_value from lstrlib.c. */
539 void addvalue(StringBuffer b
, int si
, int ei
)
551 int n
= push_captures(si
, ei
);
557 L
.push(L
.getTable(L
.value(3), onecapture(0, si
, ei
)));
562 L
.argRaiseError(3, "string/function/table expected");
566 if (!L
.toBoolean(L
.value(-1))) // nil or false
569 L
.pushString(src
.substring(si
, ei
));
571 else if (!Lua
.isString(L
.value(-1)))
573 L
.error("invalid replacement value (a " +
574 Lua
.typeName(L
.type(-1)) + ")");
576 b
.append(L
.toString(L
.value(-1))); // add result to accumulator