4 * Copyright (c) 2005 Malete Partner, Berlin, partner@malete.org
5 * Available under "Lua 5.0 license", see http://www.lua.org/license.html#5
6 * $Id: slnunico.c,v 1.5 2006/07/26 17:20:04 paul Exp $
9 ** lstrlib.c,v 1.109 2004/12/01 15:46:06 roberto Exp
10 ** Standard library for string operations and pattern-matching
11 ** See Copyright Notice in lua.h
13 * uses the udata table and a couple of expressions from Tcl 8.4.x UTF-8
14 * which comes with the following license.terms:
16 This software is copyrighted by the Regents of the University of
17 California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
18 Corporation and other parties. The following terms apply to all files
19 associated with the software unless explicitly disclaimed in
22 The authors hereby grant permission to use, copy, modify, distribute,
23 and license this software and its documentation for any purpose, provided
24 that existing copyright notices are retained in all copies and that this
25 notice is included verbatim in any distributions. No written agreement,
26 license, or royalty fee is required for any of the authorized uses.
27 Modifications to this software may be copyrighted by their authors
28 and need not follow the licensing terms described here, provided that
29 the new terms are clearly indicated on the first page of each file where
32 IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
33 FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
34 ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
35 DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
38 THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
39 INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
41 IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
42 NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
45 GOVERNMENT USE: If you are acquiring this software on behalf of the
46 U.S. government, the Government shall have only "Restricted Rights"
47 in the software and related documentation as defined in the Federal
48 Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
49 are acquiring the software on behalf of the Department of Defense, the
50 software shall be classified as "Commercial Computer Software" and the
51 Government shall have only "Restricted Rights" as defined in Clause
52 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
53 authors grant the U.S. Government and others acting in its behalf
54 permission to use and distribute the software in accordance with the
55 terms specified in this license.
57 (end of Tcl license terms)
61 According to http://ietf.org/rfc/rfc3629.txt we support up to 4-byte
62 (21 bit) sequences encoding the UTF-16 reachable 0-0x10FFFF.
63 Any byte not part of a 2-4 byte sequence in that range decodes to itself.
64 Ill formed (non-shortest) "C0 80" will be decoded as two code points C0 and 80,
65 not code point 0; see security considerations in the RFC.
66 However, UTF-16 surrogates (D800-DFFF) are accepted.
68 See http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
69 for default grapheme clusters.
70 Lazy westerners we are (and lacking the Hangul_Syllable_Type data),
71 we care for base char + Grapheme_Extend, but not for Hangul syllable sequences.
73 For http://unicode.org/Public/UNIDATA/UCD.html#Grapheme_Extend
74 we use Mn (NON_SPACING_MARK) + Me (ENCLOSING_MARK),
75 ignoring the 18 mostly south asian Other_Grapheme_Extend (16 Mc, 2 Cf) from
76 http://www.unicode.org/Public/UNIDATA/PropList.txt
92 #include "lua/lauxlib_bridge.h"
98 #ifndef SLN_UNICODENAME /* default module name, unless already set in luaconf */
99 # define SLN_UNICODENAME "unicode"
102 #define LUA_MAXCAPTURES 32
103 #if defined(LUA_USELONGLONG)
105 #define LUA_INTFRMLEN "ll"
106 #define LUA_INTFRM_T long long
110 #define LUA_INTFRMLEN "l"
111 #define LUA_INTFRM_T long
116 UTF-8 Bit Distribution, page 103 of Unicode 5.0
/*
 * Number of bytes to step over for a character whose first byte is c.
 *
 * Only lead bytes that can start a sequence the decoder accepts report a
 * multi-byte length: 0xC2..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xF4 -> 4.
 * Every other byte (ASCII, stray continuation bytes 0x80..0xBF, the
 * non-shortest starters 0xC0/0xC1 and the out-of-range 0xF5..0xFF) reports 1,
 * because such a byte decodes to itself as a single unit.
 *
 * The result is always positive, so a scan that advances by U8_LENGTH can
 * never stand still or move backwards (the previous definition yielded -1
 * for 0xF8..0xFF).
 *
 * Only the lead byte is inspected: a valid starter followed by a truncated
 * or malformed tail still reports its nominal length.
 * The argument is evaluated more than once; do not pass expressions with
 * side effects.
 */
#define U8_LENGTH(c) \
	((unsigned char)(c) <  0xC2 ? 1 : \
	 (unsigned char)(c) <= 0xDF ? 2 : \
	 (unsigned char)(c) <= 0xEF ? 3 : \
	 (unsigned char)(c) <= 0xF4 ? 4 : 1)
125 #include "slnudata.c"
/* Character-info word for code point c. Only code points with no bit set above
   bit 15 are looked up through GetUniCharInfo; anything larger yields 0. */
126 #define charinfo(c) (~0xFFFF&(c) ? 0 : GetUniCharInfo(c)) /* BMP only */
/* Category field of c: the charinfo word masked with UNICODE_CATEGORY_MASK. */
127 #define charcat(c) (UNICODE_CATEGORY_MASK & charinfo(c))
/* 1 if the category of code is NON_SPACING_MARK or ENCLOSING_MARK, else 0;
   the category value is used as a bit index into the two-bit set. */
128 #define Grapheme_Extend(code) \
129 (1 & (((1<<NON_SPACING_MARK)|(1<<ENCLOSING_MARK)) >> charcat(code)))
131 enum { /* operation modes */
132 MODE_ASCII
, /* single byte 7bit */
133 MODE_LATIN
, /* single byte 8859-1 */
134 MODE_UTF8
, /* UTF-8 by code points */
135 MODE_GRAPH
/* UTF-8 by grapheme clusters */
/* MODE_MBYTE clears bit 0 of the mode: with the enumerators above (0..3) the
   result is zero for MODE_ASCII and MODE_LATIN and nonzero for MODE_UTF8 and
   MODE_GRAPH, i.e. it tests for the multi-byte modes. */
136 #define MODE_MBYTE(mode) (~1&(mode))
140 /* macro to `unsign' a character: converts c to unsigned char */
141 #define uchar(c) ((unsigned char)(c))
/* short alias for a read-only unsigned byte */
143 typedef const unsigned char cuc
; /* it's just toooo long :) */
146 static void utf8_enco (luaL_Buffer
*b
, unsigned c
)
153 luaL_addchar(b
, 0xC0|(c
>>6));
156 luaL_addchar(b
, 0xE0|(c
>>12));
158 luaL_addchar(b
, 0xF0|(c
>>18));
159 luaL_addchar(b
, 0x80|(0x3F&(c
>>12)));
161 luaL_addchar(b
, 0x80|(0x3F&(c
>>6)));
163 luaL_addchar(b
, 0x80|(0x3F&c
));
167 /* end must be > *pp */
168 static unsigned utf8_deco (const char **pp
, const char *end
)
170 register cuc
*p
= (cuc
*)*pp
, * const e
= (cuc
*)end
;
171 unsigned first
= *p
, code
;
173 *pp
= (const char*)++p
; /* eat one */
174 /* check ASCII, dangling cont., non-shortest or not continued */
175 if (0xC2 > first
|| e
== p
|| 0x80 != (0xC0&*p
)) return first
;
176 code
= 0x3F&*p
++; /* save 1st cont. */
177 /* check 2 byte (5+6 = 11 bit) sequence up to 0x7FF */
178 if (0xE0 > first
) { /* any >= C2 is valid */
179 code
|= (0x1F&first
)<<6;
182 if (e
!= p
&& 0x80 == (0xC0&*p
)) { /* is continued */
183 code
= code
<<6 | (0x3F&*p
++); /* save 2nd */
184 if (0xF0 > first
) { /* 3 byte (4+6+6 = 16 bit) seq -- want 2nd cont. */
185 if ( 0xF800&(code
|= (0x0F&first
)<<12) /* >0x7FF: not non-shortest */
186 /* && 0xD800 != (0xD800 & code) -- nah, let surrogates pass */
189 } else if (e
!= p
&& 0x80 == (0xC0&*p
) /* check 3rd */
190 /* catch 0xF4 < first and other out-of-bounds */
191 /* TH: add the 256 out-of-range glyphs in 'plane 18' */
192 && 0x110100 > (code
= (0x0F&first
)<<18 | code
<<6 | (0x3F&*p
++))
193 && 0xFFFF < code
/* not a 16 bitty */
199 *pp
= (const char*)p
;
204 /* reverse decode before pp > start */
205 static unsigned utf8_oced (const char **pp
, const char *start
)
207 register cuc
*p
= (cuc
*)*pp
, * const s
= (cuc
*)start
;
208 unsigned last
= *--p
, code
;
210 *pp
= (const char*)p
; /* eat one */
211 /* check non-continuer or at the edge */
212 if (0x80 != (0xC0&last
) || s
== p
) return last
;
213 code
= 0x3F&last
; /* save last cont. */
214 if (0xC0 == (0xE0&*--p
)) { /* preceeded by 2 byte seq starter */
215 if (0xC2 <= *p
) { code
|= (0x1F&*p
)<<6; goto seq
; }
216 } else if (0x80 == (0xC0&*p
) && s
<p
) {
217 code
|= (0x3F&*p
)<<6;
218 if (0xE0 == (0xF0&*--p
)) { /* 3 byte starter */
219 if (0xF800&(code
|= (0x0F&*p
)<<12)) goto seq
;
220 } else if (0x80 == (0xC0&*p
) && s
<=--p
/* valid 4 byte ? */
221 /* TH: add the 256 out-of-range glyphs in 'plane 18' */
222 && 0x110100 > (code
|= (0x0F&*p
)<<18 | (0x3F&p
[1])<<12)
229 *pp
= (const char*)p
;
234 /* skip over Grapheme_Extend codes */
/*
 * pp  - in/out cursor; updated to point behind the run of extending codes
 * end - end of the buffer, passed on to utf8_deco as decoding limit
 */
235 static void utf8_graphext (const char **pp
, const char *end
)
/* NOTE(review): the declaration of p is not visible in this view;
   presumably it starts out as *pp -- confirm */
/* *pp is only updated in the loop's step expression, i.e. after a code
   point has passed the Grapheme_Extend test */
238 for (; p
< end
; *pp
=p
) {
/* decode one code point; utf8_deco advances p behind it */
239 unsigned code
= utf8_deco(&p
, end
);
/* first non-extending code ends the run; the break skips the step
   expression, so *pp stays in front of that code */
240 if (!Grapheme_Extend(code
)) break;
242 } /* utf8_graphext */
245 static int utf8_count (const char **pp
, int bytes
, int graph
, int max
)
247 const char *const end
= *pp
+bytes
;
249 while (*pp
< end
&& count
!= max
) {
250 unsigned code
= utf8_deco(pp
, end
);
252 if (!graph
) continue;
253 if (Grapheme_Extend(code
) && 1<count
) count
--; /* uncount */
255 if (graph
&& count
== max
) /* gather more extending */
256 utf8_graphext(pp
, end
);
/*
 * Lua function: pushes the length of the string given as argument 1.
 * The operation mode is read from upvalue 1 of the closure. In the
 * single-byte modes the pushed value is the byte length reported by
 * luaL_checklstring; in the multi-byte modes (MODE_MBYTE) it is the value
 * returned by utf8_count over the whole string.
 */
262 static int unic_len (lua_State
*L
) {
264 const char *s
= luaL_checklstring(L
, 1, &l
);
265 int mode
= lua_tointeger(L
, lua_upvalueindex(1));
/* mode-2 is 0 for MODE_UTF8 and 1 for MODE_GRAPH and is passed as
   utf8_count's `graph' argument; the max of -1 presumably means
   "no limit" -- confirm against utf8_count */
266 if (MODE_MBYTE(mode
)) l
= (size_t)utf8_count(&s
, l
, mode
-2, -1);
267 lua_pushinteger(L
, l
);
/*
 * Translate a relative string position into an absolute one.
 *
 * pos - position as given by the caller; negative values count back from
 *       the end of the string (-1 is the last unit)
 * len - length of the string in the units the caller indexes by
 *
 * Returns pos unchanged when it is >= 0, len+pos+1 for a negative pos that
 * falls inside the string, and 0 for a negative pos that reaches back past
 * the start. Clamping to 0 keeps the result non-negative, so callers that
 * cast it to size_t for a range check do not mistake an out-of-range
 * negative index for a huge positive one.
 */
static ptrdiff_t posrelat (ptrdiff_t pos, size_t len) {
	/* relative string position: negative means back from end */
	if (pos < 0)
		pos += (ptrdiff_t)len + 1;
	return (pos >= 0) ? pos : 0;
}
278 static int unic_sub (lua_State
*L
) {
280 const char *s
= luaL_checklstring(L
, 1, &l
), *p
, *e
=s
+l
;
281 ptrdiff_t start
= luaL_checkinteger(L
, 2);
282 ptrdiff_t end
= luaL_optinteger(L
, 3, -1);
283 int mode
= lua_tointeger(L
, lua_upvalueindex(1));
285 if (MODE_MBYTE(mode
)) { p
=s
; l
= (size_t)utf8_count(&p
, l
, mode
-2, -1); }
286 start
= posrelat(start
, l
);
287 end
= posrelat(end
, l
);
288 if (start
< 1) start
= 1;
289 if (end
> (ptrdiff_t)l
) end
= (ptrdiff_t)l
;
291 lua_pushliteral(L
, "");
293 l
= end
- --start
; /* #units */
294 if (!(MODE_MBYTE(mode
))) /* single byte */
297 if (start
) utf8_count(&s
, e
-s
, mode
-2, start
); /* skip */
299 utf8_count(&p
, e
-p
, mode
-2, l
);
302 lua_pushlstring(L
, s
, l
);
308 static int str_reverse (lua_State
*L
) { /* TODO? whatfor? */
311 const char *s
= luaL_checklstring(L
, 1, &l
), *p
= s
+l
, *q
;
312 int mode
= lua_tointeger(L
, lua_upvalueindex(1)), mb
= MODE_MBYTE(mode
);
314 luaL_buffinit(L
, &b
);
316 while (s
< p
--) luaL_addchar(&b
, *p
);
321 code
= utf8_oced(&p
, s
);
322 if (MODE_GRAPH
== mode
)
323 while (Grapheme_Extend(code
) && p
>s
) code
= utf8_oced(&p
, s
);
324 luaL_addlstring(&b
, p
, q
-p
);
333 static int unic_lower (lua_State
*L
) {
336 const char *s
= luaL_checklstring(L
, 1, &l
), * const e
=s
+l
;
337 int mode
= lua_tointeger(L
, lua_upvalueindex(1)), mb
= MODE_MBYTE(mode
);
338 luaL_buffinit(L
, &b
);
340 unsigned c
= mb
? utf8_deco(&s
, e
) : uchar(*s
++);
341 int info
= charinfo(c
);
342 if (GetCaseType(info
)&0x02 && (mode
|| !(0x80&c
))) c
+= GetDelta(info
);
343 if (mb
) utf8_enco(&b
, c
); else luaL_addchar(&b
, c
);
350 static int unic_upper (lua_State
*L
) {
353 const char *s
= luaL_checklstring(L
, 1, &l
), * const e
=s
+l
;
354 int mode
= lua_tointeger(L
, lua_upvalueindex(1)), mb
= MODE_MBYTE(mode
);
355 luaL_buffinit(L
, &b
);
357 unsigned c
= mb
? utf8_deco(&s
, e
) : uchar(*s
++);
358 int info
= charinfo(c
);
359 if (GetCaseType(info
)&0x04 && (mode
|| !(0x80&c
))) c
-= GetDelta(info
);
360 if (mb
) utf8_enco(&b
, c
); else luaL_addchar(&b
, c
);
367 static int str_rep (lua_State
*L
) {
370 const char *s
= luaL_checklstring(L
, 1, &l
);
371 int n
= luaL_checkint(L
, 2);
372 luaL_buffinit(L
, &b
);
374 luaL_addlstring(&b
, s
, l
);
380 static int unic_byte (lua_State
*L
) {
382 ptrdiff_t posi
, pose
;
383 const char *s
= luaL_checklstring(L
, 1, &l
), *p
, *e
=s
+l
;
384 int n
, mode
= lua_tointeger(L
, lua_upvalueindex(1)), mb
= MODE_MBYTE(mode
);
386 if (mb
) { p
=s
; l
= (size_t)utf8_count(&p
, l
, mode
-2, -1); }
387 posi
= posrelat(luaL_optinteger(L
, 2, 1), l
);
388 pose
= posrelat(luaL_optinteger(L
, 3, posi
), l
);
389 if (posi
<= 0) posi
= 1;
390 if ((size_t)pose
> l
) pose
= l
;
391 if (0 >= (n
= pose
- --posi
)) return 0; /* empty interval */
395 if (posi
) utf8_count(&s
, e
-s
, mode
-2, posi
); /* skip */
397 utf8_count(&p
, e
-s
, mode
-2, n
);
400 /* byte count is upper bound on #elements */
401 luaL_checkstack(L
, e
-s
, "string slice too long");
403 lua_pushinteger(L
, mb
? utf8_deco(&s
, e
) : uchar(*s
++));
408 static int unic_char (lua_State
*L
) {
409 int i
, n
= lua_gettop(L
); /* number of arguments */
410 int mode
= lua_tointeger(L
, lua_upvalueindex(1)), mb
= MODE_MBYTE(mode
);
411 /* TH: add the 256 out-of-range glyphs in 'plane 18' */
412 unsigned lim
= mb
? 0x110100 : 0x100;
415 luaL_buffinit(L
, &b
);
416 for (i
=1; i
<=n
; i
++) {
417 unsigned c
= luaL_checkint(L
, i
);
418 luaL_argcheck(L
, lim
> c
, i
, "invalid value");
419 if (mb
) utf8_enco(&b
, c
); else luaL_addchar(&b
, c
);
426 static int writer (lua_State
*L
, const void* b
, size_t size
, void* B
) {
428 luaL_addlstring((luaL_Buffer
*) B
, (const char *)b
, size
);
433 static int str_dump (lua_State
*L
) {
435 luaL_checktype(L
, 1, LUA_TFUNCTION
);
438 if (lua_dump(L
, writer
, &b
) != 0)
439 luaL_error(L
, "unable to dump given function");
447 ** {======================================================
449 ** =======================================================
450 * find/gfind(_aux) -> match, push_captures
451 * gsub -> match, add_s (-> push_captures)
452 * push_captures, add_s -> push_onecapture
454 * start/end_capture -> match,
455 * match_capture, matchbalance, classend -> -,
456 * min/max_expand -> match, singlematch
457 * singlematch -> matchbracketclass, match_class,
458 * matchbracketclass -> match_class -> -,
462 #define CAP_UNFINISHED (-1)
463 #define CAP_POSITION (-2)
465 typedef struct MatchState
{
466 const char *src_init
; /* init of source string */
467 const char *src_end
; /* end (`\0') of source string */
469 int level
; /* total number of captures (finished or unfinished) */
475 } capture
[LUA_MAXCAPTURES
];
480 #define SPECIALS "^$*+?.([%-"
/*
 * Validates capture index l against the match state. Raises the Lua error
 * "invalid capture index" when l is negative, not below ms->level, or names
 * a capture whose length is still CAP_UNFINISHED.
 * NOTE(review): any conversion of l before the check and the value returned
 * on success are not visible in this view -- confirm against the full file.
 */
483 static int check_capture (MatchState
*ms
, int l
) {
485 if (l
< 0 || l
>= ms
->level
|| ms
->capture
[l
].len
== CAP_UNFINISHED
)
486 return luaL_error(ms
->L
, "invalid capture index");
491 static int capture_to_close (MatchState
*ms
)
493 int level
= ms
->level
;
494 for (level
--; level
>=0; level
--)
495 if (ms
->capture
[level
].len
== CAP_UNFINISHED
) return level
;
496 return luaL_error(ms
->L
, "invalid pattern capture");
500 static const char *classend (MatchState
*ms
, const char *p
)
504 if (!*++p
) luaL_error(ms
->L
, "malformed pattern (ends with " LUA_QL("%%") ")");
507 /* if (*p == '^') p++; -- no effect */
508 do { /* look for a `]' */
509 if (!*p
) luaL_error(ms
->L
, "malformed pattern (missing " LUA_QL("]") ")");
510 if (L_ESC
== *(p
++) && *p
) p
++; /* skip escapes (e.g. `%]') */
523 * The following macros are used for fast character category tests. The
524 * x_BITS values are shifted right by the category value to determine whether
525 * the given category is included in the set.
/*
 * Bit sets of character categories: bit N is set when category N belongs to
 * the set, so membership is tested as 1 & (x_BITS >> category).
 * Every set is fully parenthesized so it can be combined with any operator.
 */
#define LETTER_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/* outer parentheses added: without them `NUMBER_BITS >> n' or
   `NUMBER_BITS & m' would bind to the last term only */
#define NUMBER_BITS ((1 << DECIMAL_DIGIT_NUMBER) \
	| (1 << LETTER_NUMBER) | (1 << OTHER_NUMBER))

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
	| (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
547 /* character c matches class cl. undefined for cl not ascii */
548 static int match_class (int c
, int cl
, int mode
)
551 switch (0x20|cl
/*tolower*/) {
552 case 'a' : msk
= LETTER_BITS
; break;
553 case 'c' : msk
= 1<<CONTROL
; break;
554 case 'x' : /* hexdigits */
555 if (0x40==(~0x3f&c
)/*64-127*/ && 1&(0x7e/*a-f*/>>(0x1f&c
))) goto matched
;
556 case 'd' : msk
= 1<<DECIMAL_DIGIT_NUMBER
; mode
=0;/* ASCII only */ break;
557 case 'l' : msk
= 1<<LOWERCASE_LETTER
; break;
558 case 'n' : msk
= NUMBER_BITS
; break; /* new */
559 case 'p' : msk
= PUNCT_BITS
; break;
561 #define STDSPACE /* standard "space" controls 9-13 */ \
562 (1<<9/*TAB*/|1<<10/*LF*/|1<<11/*VT*/|1<<12/*FF*/|1<<13/*CR*/)
563 if (!(~0x1f & c
) && 1&(STDSPACE
>> c
)) goto matched
;
566 case 'u' : msk
= 1<<UPPERCASE_LETTER
; break;
568 this is not compatible to lua 5.1, where %w is just a letter or a digit
569 case 'w' : msk = LETTER_BITS|NUMBER_BITS|CONNECTOR_BITS; break;
571 case 'w' : msk
= LETTER_BITS
|NUMBER_BITS
; break;
572 case 'z' : if (!c
) goto matched
; msk
= 0; break;
573 default: return cl
== c
;
575 res
= 1 & (msk
>> charcat(c
));
576 if (!mode
&& 0x80&c
) res
= 0;
581 return 0x20&cl
/*islower*/ ? res
: !res
;
585 /* decode single byte or UTF-8 seq; advance *s */
586 static unsigned deco (const MatchState
*ms
, const char **s
, const char *e
)
588 return ms
->mb
? utf8_deco(s
, e
) : *(unsigned char*)(*s
)++;
591 /* s must be < ms->src_end, p < ep */
592 static const char *singlematch (const MatchState
*ms
,
593 const char *s
, const char *p
, const char *ep
)
599 c
= deco(ms
, &s
, ms
->src_end
);
601 if (!ms
->mb
|| !(0x80&*s
))
602 c
= *(unsigned char*)s
++;
604 c
= utf8_deco(&s
, ms
->src_end
);
609 if (match_class(c
, uchar(p
[1]), ms
->mode
)) {
610 case '.': /* the all class */
611 #ifndef OPTIMIZE_SIZE
612 if (MODE_GRAPH
!= ms
->mode
) return s
; /* common fast path */
620 c1
= deco(ms
, &p
, ep
);
622 if (!ms
->mb
|| !(0x80&*p
))
623 c1
= *(unsigned char*)p
++;
625 c1
= utf8_deco(&p
, ep
);
629 case '[': /* matchbracketclass */
630 ep
--; /* now on the ']' */
631 if ((neg
= '^' == *++p
)) p
++; /* skip the `^' */
634 if (match_class(c
, uchar(*++p
), ms
->mode
)) goto matched_class_in_brack
;
638 c1
= deco(ms
, &p
, ep
);
639 /* in lua-5.1 and 5.1.1 a trailing '-' is allowed
640 dynasm.lua relies on this
642 if ( ep
<= p
+ 1 || '-' != *p
) {
643 const char *op
= p
, *es
;
644 if (MODE_GRAPH
== ms
->mode
) utf8_graphext(&p
, ep
);
645 if (c
!= c1
) continue;
646 if (MODE_GRAPH
!= ms
->mode
) goto matched
;
647 /* comp grapheme extension */
649 utf8_graphext(&es
, ms
->src_end
);
650 if (es
-s
== p
-op
&& (es
==s
|| !memcmp(s
, op
, es
-s
))) goto matched
;
655 /* range c1-c2 -- no extend support in range bounds... */
656 /* if (ep == ++p) break; see above */ /* bugger - trailing dash */
657 c2
= deco(ms
, &p
, ep
);
658 if (c2
< c1
) { unsigned swap
=c1
; c1
=c2
; c2
=swap
; }
659 if (c1
<= c
&& c
<= c2
) goto matched_class_in_brack
; /* ...but extend match */
665 /* matchbracketclass */
668 matched_class_in_brack
: /* matched %something or range in [] */
672 matched_class
: /* matched %something or . */
673 if (MODE_GRAPH
== ms
->mode
) utf8_graphext(&s
, ms
->src_end
);
679 static const char *match (MatchState
*ms
, const char *s
, const char *p
);
682 static const char *matchbalance (MatchState
*ms
, const char *s
,
684 if (*p
== 0 || *(p
+1) == 0)
685 luaL_error(ms
->L
, "unbalanced pattern");
686 if (*s
!= *p
) return NULL
;
691 while (++s
< ms
->src_end
) {
693 if (--cont
== 0) return s
+1;
695 else if (*s
== b
) cont
++;
698 return NULL
; /* string ends out of balance */
702 static const char *max_expand (MatchState
*ms
,
703 const char *s
, const char *p
, const char *ep
)
705 const char *sp
= s
, *es
;
706 while (sp
<ms
->src_end
&& (es
= singlematch(ms
, sp
, p
, ep
)))
708 /* keeps trying to match with the maximum repetitions */
710 const char *res
= match(ms
, sp
, ep
+1);
711 if (res
|| sp
==s
) return res
;
713 sp
--; /* else didn't match; reduce 1 repetition to try again */
715 unsigned code
= utf8_oced(&sp
, s
);
716 if (MODE_GRAPH
== ms
->mode
)
717 while (Grapheme_Extend(code
) && sp
>s
) code
= utf8_oced(&sp
, s
);
724 static const char *min_expand (MatchState
*ms
,
725 const char *s
, const char *p
, const char *ep
)
728 const char *res
= match(ms
, s
, ep
+1);
730 if (s
>= ms
->src_end
) break;
731 } while ((s
= singlematch(ms
, s
, p
, ep
))); /* try with one more repetition */
736 static const char *start_capture (MatchState
*ms
, const char *s
,
737 const char *p
, int what
) {
739 int level
= ms
->level
;
740 if (level
>= LUA_MAXCAPTURES
) luaL_error(ms
->L
, "too many captures");
741 ms
->capture
[level
].init
= s
;
742 ms
->capture
[level
].len
= what
;
744 if ((res
=match(ms
, s
, p
)) == NULL
) /* match failed? */
745 ms
->level
--; /* undo capture */
750 static const char *end_capture (MatchState
*ms
, const char *s
,
752 int l
= capture_to_close(ms
);
754 ms
->capture
[l
].len
= s
- ms
->capture
[l
].init
; /* close capture */
755 if ((res
= match(ms
, s
, p
)) == NULL
) /* match failed? */
756 ms
->capture
[l
].len
= CAP_UNFINISHED
; /* undo capture */
761 static const char *match_capture (MatchState
*ms
, const char *s
, int l
) {
763 l
= check_capture(ms
, l
);
764 len
= ms
->capture
[l
].len
;
765 if ((size_t)(ms
->src_end
-s
) >= len
&&
766 memcmp(ms
->capture
[l
].init
, s
, len
) == 0)
772 static const char *match (MatchState
*ms
, const char *s
, const char *p
) {
773 init
: /* using goto's to optimize tail recursion */
775 case '(': { /* start capture */
776 if (*(p
+1) == ')') /* position capture? */
777 return start_capture(ms
, s
, p
+2, CAP_POSITION
);
779 return start_capture(ms
, s
, p
+1, CAP_UNFINISHED
);
781 case ')': { /* end capture */
782 return end_capture(ms
, s
, p
+1);
786 case 'b': { /* balanced string? */
787 s
= matchbalance(ms
, s
, p
+2);
788 if (s
== NULL
) return NULL
;
789 p
+=4; goto init
; /* else return match(ms, s, p+4); */
792 case 'f': { /* frontier? */
793 const char *ep
; char previous
;
796 luaL_error(ms
->L
, "missing " LUA_QL("[") " after "
797 LUA_QL("%%f") " in pattern" );
798 luaL_error(ms
->L
, "missing `[' after `%%f' in pattern");
799 ep
= classend(ms
, p
); /* points to what is next */
800 /* with UTF-8, getting the previous is more complicated */
801 previous
= (s
== ms
->src_init
) ? '\0' : *(s
-1);
802 /* use singlematch to apply all necessary magic */
803 if (singlematch(uchar(previous
), p
, ep
-1) ||
804 !singlematch(uchar(*s
), p
, ep
-1)) return NULL
;
805 p
=ep
; goto init
; /* else return match(ms, s, ep); */
809 if (isdigit(uchar(*(p
+1)))) { /* capture results (%0-%9)? */
810 s
= match_capture(ms
, s
, uchar(*(p
+1)));
811 if (s
== NULL
) return NULL
;
812 p
+=2; goto init
; /* else return match(ms, s, p+2) */
814 goto dflt
; /* case default */
818 case '\0': { /* end of pattern */
819 return s
; /* match succeeded */
822 if (*(p
+1) == '\0') /* is the `$' the last char in pattern? */
823 return (s
== ms
->src_end
) ? s
: NULL
; /* check end of string */
824 else goto dflt
; /* ??? */
826 default: dflt
: { /* it is a pattern item */
827 const char *ep
= classend(ms
, p
); /* points to what is next */
829 if (s
< ms
->src_end
) es
= singlematch(ms
, s
, p
, ep
);
831 case '?': { /* optional */
833 if (es
&& (res
=match(ms
, es
, ep
+1))) return res
;
834 p
=ep
+1; goto init
; /* else return match(ms, s, ep+1); */
836 case '*': { /* 0 or more repetitions */
837 return max_expand(ms
, s
, p
, ep
);
839 case '+': { /* 1 or more repetitions */
840 return (es
? max_expand(ms
, es
, p
, ep
) : NULL
);
842 case '-': { /* 0 or more repetitions (minimum) */
843 return min_expand(ms
, s
, p
, ep
);
846 if (!es
) return NULL
;
847 s
=es
; p
=ep
; goto init
; /* else return match(ms, s+1, ep); */
856 static const char *lmemfind (const char *s1
, size_t l1
,
857 const char *s2
, size_t l2
) {
858 if (l2
== 0) return s1
; /* empty strings are everywhere */
859 else if (l2
> l1
) return NULL
; /* avoids a negative `l1' */
861 const char *init
; /* to search for a `*s2' inside `s1' */
862 l2
--; /* 1st char will be checked by `memchr' */
863 l1
= l1
-l2
; /* `s2' cannot be found after that */
864 while (l1
> 0 && (init
= (const char *)memchr(s1
, *s2
, l1
)) != NULL
) {
865 init
++; /* 1st char is already checked */
866 if (memcmp(init
, s2
+1, l2
) == 0)
868 else { /* correct `l1' and `s1' to try again */
873 return NULL
; /* not found */
878 static void push_onecapture (MatchState
*ms
, int i
, const char *s
,
881 if (i
>= ms
->level
) {
882 if (i
== 0) /* ms->level == 0, too */
883 lua_pushlstring(ms
->L
, s
, e
- s
); /* add whole match */
885 luaL_error(ms
->L
, "invalid capture index");
888 ptrdiff_t l
= ms
->capture
[i
].len
;
889 if (l
== CAP_UNFINISHED
) luaL_error(ms
->L
, "unfinished capture");
890 if (l
== CAP_POSITION
)
891 lua_pushinteger(ms
->L
, ms
->capture
[i
].init
- ms
->src_init
+ 1);
893 lua_pushlstring(ms
->L
, ms
->capture
[i
].init
, l
);
898 static int push_captures (MatchState
*ms
, const char *s
, const char *e
) {
900 int nlevels
= (ms
->level
== 0 && s
) ? 1 : ms
->level
;
901 luaL_checkstack(ms
->L
, nlevels
, "too many captures");
902 for ( i
= 0; i
< nlevels
; i
++ )
903 push_onecapture( ms
, i
, s
, e
);
904 return nlevels
; /* number of strings pushed */
908 static int unic_find_aux (lua_State
*L
, int find
) {
910 const char *s
= luaL_checklstring(L
, 1, &l1
);
911 const char *p
= luaL_checklstring(L
, 2, &l2
);
912 ptrdiff_t init
= posrelat(luaL_optinteger(L
, 3, 1), l1
) - 1;
913 if (init
< 0) init
= 0;
914 else if ((size_t)(init
) > l1
) init
= (ptrdiff_t)l1
;
915 if (find
&& (lua_toboolean(L
, 4) || /* explicit request? */
916 strpbrk(p
, SPECIALS
) == NULL
)) { /* or no special characters? */
917 /* do a plain search */
918 const char *s2
= lmemfind(s
+init
, l1
-init
, p
, l2
);
920 lua_pushinteger(L
, s2
-s
+1);
921 lua_pushinteger(L
, s2
-s
+l2
);
927 int anchor
= (*p
== '^') ? (p
++, 1) : 0;
928 const char *s1
=s
+init
;
929 unsigned char u8_lenght
= U8_LENGTH( (unsigned char)s
[0] );
933 ms
.mode
= lua_tointeger(L
, lua_upvalueindex(1));
934 ms
.mb
= MODE_MBYTE(ms
.mode
);
937 /* LS/HH : patch for tracker issue 869, concerning "%s" match of à ; the old code */
938 /* increments by 1 on a failure and can end up in the middle of an utf sequence */
939 /* so this was a major bug. */
944 if ((res
=match(&ms
, s1
, p
)) != NULL
) {
946 lua_pushinteger(L
, s1
-s
+1); /* start */
947 lua_pushinteger(L
, res
-s
); /* end */
948 return push_captures(&ms
, NULL
, 0) + 2;
950 return push_captures(&ms
, s1
, res
);
952 s1
= s1
+ (ms
.mode
> MODE_LATIN
? U8_LENGTH( uchar(s1
[0])) : 1) ;
953 } while (s1
< ms
.src_end
&& !anchor
);
955 lua_pushnil(L
); /* not found */
959 static int unic_find (lua_State
*L
) {
960 return unic_find_aux(L
, 1);
964 static int unic_match (lua_State
*L
) {
965 return unic_find_aux(L
, 0);
970 static int gmatch_aux (lua_State
*L
) {
973 const char *s
= lua_tolstring(L
, lua_upvalueindex(1), &ls
);
974 const char *p
= lua_tostring(L
, lua_upvalueindex(2));
979 ms
.mode
= lua_tointeger(L
, lua_upvalueindex(4));
980 ms
.mb
= MODE_MBYTE(ms
.mode
);
981 for (src
= s
+ (size_t)lua_tointeger(L
, lua_upvalueindex(3));
987 if ((e
= match(&ms
, src
, p
)) != NULL
) {
988 lua_Integer newstart
= e
-s
;
989 if (e
== src
) newstart
++; /* empty match? go at least one position */
990 lua_pushinteger(L
, newstart
);
991 lua_replace(L
, lua_upvalueindex(3));
992 return push_captures(&ms
, src
, e
);
995 return 0; /* not found */
1000 static int gmatch (lua_State
*L
) {
1001 luaL_checkstring(L
, 1);
1002 luaL_checkstring(L
, 2);
1004 lua_pushinteger(L
, 0);
1005 lua_pushinteger(L
, lua_upvalueindex(1));
1006 lua_pushcclosure(L
, gmatch_aux
, 4);
1010 static int gfind_nodef (lua_State
*L
) {
1011 return luaL_error(L
, LUA_QL("string.gfind") " was renamed to "
1012 LUA_QL("string.gmatch"));
1016 static void add_s (MatchState
*ms
, luaL_Buffer
*b
,
1017 const char *s
, const char *e
)
1020 const char *news
= lua_tolstring(ms
->L
, 3, &l
);
1021 for (i
= 0; i
< l
; i
++) {
1022 if (news
[i
] != L_ESC
)
1023 luaL_addchar(b
, news
[i
]);
1026 if (!isdigit(uchar(news
[i
])))
1027 luaL_addchar(b
, news
[i
]);
1028 else if (news
[i
] == '0')
1029 luaL_addlstring(b
, s
, e
- s
);
1031 push_onecapture(ms
, news
[i
] - '1', s
, e
);
1032 luaL_addvalue(b
); /* add capture to accumulated result */
1038 static void add_value (MatchState
*ms
, luaL_Buffer
*b
, const char *s
,
1041 lua_State
*L
= ms
->L
;
1042 switch (lua_type(L
, 3)) {
1048 case LUA_TFUNCTION
: {
1050 lua_pushvalue(L
, 3);
1051 n
= push_captures(ms
, s
, e
);
1056 push_onecapture(ms
, 0, s
, e
);
1061 luaL_argerror(L
, 3, "string/function/table expected");
1065 if (!lua_toboolean(L
, -1)) { /* nil or false? */
1067 lua_pushlstring(L
, s
, e
- s
); /* keep original text */
1069 else if (!lua_isstring(L
, -1))
1070 luaL_error(L
, "invalid replacement value (a %s)", luaL_typename(L
, -1));
1071 luaL_addvalue(b
); /* add result to accumulator */
1074 static int unic_gsub (lua_State
*L
) {
1076 const char *src
= luaL_checklstring(L
, 1, &srcl
);
1077 const char *p
= luaL_checkstring(L
, 2);
1078 int max_s
= luaL_optint(L
, 4, srcl
+1);
1079 int anchor
= (*p
== '^') ? (p
++, 1) : 0;
1083 luaL_buffinit(L
, &b
);
1086 ms
.src_end
= src
+srcl
;
1087 ms
.mode
= lua_tointeger(L
, lua_upvalueindex(1));
1088 ms
.mb
= MODE_MBYTE(ms
.mode
);
1092 e
= match(&ms
, src
, p
);
1095 add_value(&ms
, &b
, src
, e
);
1097 if (e
&& e
>src
) /* non empty match? */
1098 src
= e
; /* skip it */
1099 else if (src
< ms
.src_end
)
1100 luaL_addchar(&b
, *src
++);
1104 luaL_addlstring(&b
, src
, ms
.src_end
-src
);
1105 luaL_pushresult(&b
);
1106 lua_pushinteger(L
, n
); /* number of substitutions */
1110 /* }====================================================== */
1113 /* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */
1114 #define MAX_ITEM 512
1115 /* valid flags in a format specification */
1116 #define FLAGS "-+ #0"
1118 ** maximum size of each format specification (such as '%-099.99d')
1119 ** (+10 accounts for %99.99x plus margin of error)
1121 #define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
1124 static void addquoted (lua_State
*L
, luaL_Buffer
*b
, int arg
) {
1126 const char *s
= luaL_checklstring(L
, arg
, &l
);
1127 luaL_addchar(b
, '"');
1130 case '"': case '\\': case '\n': {
1131 luaL_addchar(b
, '\\');
1132 luaL_addchar(b
, *s
);
1136 luaL_addlstring(b
, "\\r", 2);
1140 luaL_addlstring(b
, "\\000", 4);
1144 luaL_addchar(b
, *s
);
1150 luaL_addchar(b
, '"');
1154 static const char *scanformat (lua_State
*L
, const char *strfrmt
, char *form
,
1157 const char *p
= strfrmt
;
1158 while (strchr(FLAGS
, *p
)) p
++; /* skip flags */
1159 if ((size_t)(p
- strfrmt
) >= sizeof(FLAGS
))
1160 luaL_error(L
, "invalid format (repeated flags)");
1161 if (isdigit(uchar(*p
))) p
++; /* skip width */
1162 if (isdigit(uchar(*p
))) p
++; /* (2 digits at most) */
1166 if (isdigit(uchar(*p
))) p
++; /* skip precision */
1167 if (isdigit(uchar(*p
))) p
++; /* (2 digits at most) */
1169 if (isdigit(uchar(*p
)))
1170 luaL_error(L
, "invalid format (width or precision too long)");
1172 strncpy(form
+1, strfrmt
, p
-strfrmt
+1);
1173 form
[p
-strfrmt
+2] = 0;
1177 static void addintlen (char *form
) {
1178 size_t l
= strlen(form
);
1179 char spec
= form
[l
- 1];
1180 strcpy(form
+ l
- 1, LUA_INTFRMLEN
);
1181 form
[l
+ sizeof(LUA_INTFRMLEN
) - 2] = spec
;
1182 form
[l
+ sizeof(LUA_INTFRMLEN
) - 1] = '\0';
1185 static int str_format (lua_State
*L
) {
1188 const char *strfrmt
= luaL_checklstring(L
, arg
, &sfl
);
1189 const char *strfrmt_end
= strfrmt
+sfl
;
1191 luaL_buffinit(L
, &b
);
1192 while (strfrmt
< strfrmt_end
) {
1193 if (*strfrmt
!= L_ESC
)
1194 luaL_addchar(&b
, *strfrmt
++);
1195 else if (*++strfrmt
== L_ESC
)
1196 luaL_addchar(&b
, *strfrmt
++); /* %% */
1197 else { /* format item */
1198 char form
[MAX_FORMAT
]; /* to store the format (`%...') */
1199 char buff
[MAX_ITEM
]; /* to store the formatted item */
1200 int hasprecision
= 0;
1202 strfrmt
= scanformat(L
, strfrmt
, form
, &hasprecision
);
1203 switch (*strfrmt
++) {
1205 #ifdef LUA_USE_SNPRINTF
1206 snprintf( buff
, MAX_ITEM
, form
,
1207 (int) luaL_checknumber( L
, arg
) );
1209 sprintf(buff
, form
, (int) luaL_checknumber( L
, arg
) );
1213 case 'd': case 'i': {
1215 #ifdef LUA_USE_SNPRINTF
1216 snprintf( buff
, MAX_ITEM
, form
,
1217 (LUA_INTFRM_T
) luaL_checknumber(L
, arg
) );
1220 (LUA_INTFRM_T
) luaL_checknumber(L
, arg
) );
1224 case 'o': case 'u': case 'x': case 'X': {
1226 #ifdef LUA_USE_SNPRINTF
1227 snprintf(buff
, MAX_ITEM
, form
,
1228 (unsigned LUA_INTFRM_T
) luaL_checknumber(L
, arg
) );
1231 (unsigned LUA_INTFRM_T
) luaL_checknumber(L
, arg
) );
1235 case 'e': case 'E': case 'f':
1236 case 'g': case 'G': {
1237 #ifndef LUA_NUMBER_DOUBLE
1238 luaL_argerror( L
, 1, "double formatting not supported" );
1240 # ifdef __dietlibc__
1241 # warning "double formatting is broken in dietlibc"
1243 # ifdef LUA_USE_SNPRINTF
1244 snprintf(buff
, MAX_ITEM
, form
,
1245 (double) luaL_checknumber(L
, arg
) );
1247 sprintf(buff
, form
, (double) luaL_checknumber(L
, arg
) );
1253 addquoted(L
, &b
, arg
);
1254 continue; /* skip the `addsize' at the end */
1258 const char *s
= luaL_checklstring(L
, arg
, &l
);
1259 if (!hasprecision
&& l
>= 100) {
1260 /* no precision and string is too long to be formatted;
1261 keep original string */
1262 lua_pushvalue(L
, arg
);
1264 continue; /* skip the `addsize' at the end */
1267 #ifdef LUA_USE_SNPRINTF
1268 snprintf(buff
, MAX_ITEM
, form
, s
);
1270 sprintf(buff
, form
, s
);
1275 default: { /* also treat cases `pnLlh' */
1276 return luaL_error(L
, "invalid option " LUA_QL("%%%c") " to "
1277 LUA_QL("format"), *(strfrmt
- 1));
1280 luaL_addlstring(&b
, buff
, strlen(buff
));
1283 luaL_pushresult(&b
);
1287 #ifdef WANT_EXT_MATCH
1288 static struct { const char *k
; int v
; } unicflags
[] = {
1289 { "ASCII", MODE_ASCII
}
1290 ,{ "LATIN", MODE_LATIN
}
1291 ,{ "UTF8", MODE_UTF8
}
1292 ,{ "GRAPH", MODE_GRAPH
}
1294 #define unicflags_sz ( sizeof( unicflags ) / sizeof( unicflags[0] ) )
1297 allow direct match calls from C
1299 int ext_uni_match ( void *state
, const char *s
, size_t n
,
1300 const char *p
, int init
, int mode
)
1302 lua_State
*L
= state
;
1304 int anchor
= (*p
== '^') ? (p
++, 1) : 0;
1306 int i
= posrelat( init
, n
) - 1;
1308 else if ((size_t)(i
) > n
) i
= (ptrdiff_t)n
;
1314 ms
.mb
= MODE_MBYTE(mode
);
1318 if ( ( res
=match(&ms
, s1
, p
)) != NULL
)
1320 } while ( s1
++ < ms
.src_end
&& !anchor
);
1325 static const luaL_Reg uniclib
[] = {
1326 {"byte", unic_byte
}, /* no cluster ! */
1327 {"char", unic_char
},
1329 {"find", unic_find
}, /* cluster */
1330 {"format", str_format
},
1331 {"gfind", gfind_nodef
},
1332 {"gmatch", gmatch
}, /* cluster */
1333 {"gsub", unic_gsub
}, /* cluster */
1334 {"len", unic_len
}, /* cluster/byte opt. */
1335 {"lower", unic_lower
},
1336 {"match", unic_match
}, /* cluster */
1338 {"reverse", str_reverse
},
1339 {"sub", unic_sub
}, /* cluster/byte opt. */
1340 {"upper", unic_upper
},
1344 #if defined( SLNUNICODE_AS_STRING ) && defined( STRING_WITH_METAT )
/*
 * Installs a fresh table as the metatable of the string type and sets its
 * __index field to the value that is on top of the stack on entry (the
 * string library, per the comments below). Every push is matched by a pop
 * or a consuming call, so the stack is left as it was found.
 */
1345 static void createmetatable (lua_State
*L
) {
1346 lua_newtable(L
); /* create metatable for strings */
1347 lua_pushliteral(L
, ""); /* dummy string */
/* copy of the new metatable, consumed by lua_setmetatable below */
1348 lua_pushvalue(L
, -2);
1349 lua_setmetatable(L
, -2); /* set string metatable */
1350 lua_pop(L
, 1); /* pop dummy string */
/* -2 is the slot just below the metatable, i.e. the value that was on top on entry */
1351 lua_pushvalue(L
, -2); /* string library... */
1352 lua_setfield(L
, -2, "__index"); /* ...is the __index metamethod */
1353 lua_pop(L
, 1); /* pop metatable */
1358 ** Open string library
1360 LUALIB_API
int luaopen_unicode (lua_State
*L
) {
1361 /* register unicode itself so require("unicode") works */
1362 luaL_register(L
, SLN_UNICODENAME
,
1363 uniclib
+ (sizeof uniclib
/sizeof uniclib
[0] - 1)); /* empty func list */
1365 lua_getglobal(L
,SLN_UNICODENAME
);
1367 lua_pushinteger(L
, MODE_ASCII
);
1368 luaL_setfuncs(L
, uniclib
, 1);
1369 lua_setfield(L
, -2, "ascii");
1372 lua_pushinteger(L
, MODE_LATIN
);
1373 luaL_setfuncs(L
, uniclib
, 1);
1374 lua_setfield(L
, -2, "latin1");
1377 lua_pushinteger(L
, MODE_GRAPH
);
1378 luaL_setfuncs(L
, uniclib
, 1);
1379 lua_setfield(L
, -2, "grapheme");
1382 lua_pushinteger(L
, MODE_UTF8
);
1383 luaL_setfuncs(L
, uniclib
, 1);
1384 lua_setfield(L
, -2, "utf8");
1386 #ifdef WANT_EXT_MATCH
1389 const char *ln
= SLN_UNICODENAME
".mode";
1390 luaL_findtable( L
, LUA_REGISTRYINDEX
, "_LOADED", 1 );
1391 lua_getfield( L
, -1, ln
);
1392 if ( !lua_istable(L
, -1 ) ) {
1394 if ( luaL_findtable( L
, LUA_GLOBALSINDEX
, ln
, unicflags_sz
) )
1395 luaL_error( L
, "name conflict for module " LUA_QS
, ln
);
1396 lua_pushvalue( L
, -1 );
1397 lua_setfield( L
, -3, ln
);
1399 lua_remove( L
, -2 );
1400 for( i
= 0; unicflags_sz
> i
; ++i
) {
1401 lua_pushnumber( L
, unicflags
[i
].v
);
1402 lua_setfield( L
, -2, unicflags
[i
].k
);