Disable loading bytecode with an extra header (BOM or #!).
[luajit-2.0.git] / src / lj_str.c
blob7bf4848afb2a3563e987908203c6d02f831d0731
1 /*
2 ** String handling.
3 ** Copyright (C) 2005-2012 Mike Pall. See Copyright Notice in luajit.h
4 **
5 ** Portions taken verbatim or adapted from the Lua interpreter.
6 ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
7 */
9 #include <stdio.h>
11 #define lj_str_c
12 #define LUA_CORE
14 #include "lj_obj.h"
15 #include "lj_gc.h"
16 #include "lj_err.h"
17 #include "lj_str.h"
18 #include "lj_state.h"
19 #include "lj_char.h"
21 /* -- String interning ---------------------------------------------------- */
23 /* Ordered compare of strings. Assumes string data is 4-byte aligned. */
24 int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b)
26 MSize i, n = a->len > b->len ? b->len : a->len;
27 for (i = 0; i < n; i += 4) {
28 /* Note: innocuous access up to end of string + 3. */
29 uint32_t va = *(const uint32_t *)(strdata(a)+i);
30 uint32_t vb = *(const uint32_t *)(strdata(b)+i);
31 if (va != vb) {
32 #if LJ_LE
33 va = lj_bswap(va); vb = lj_bswap(vb);
34 #endif
35 i -= n;
36 if ((int32_t)i >= -3) {
37 va >>= 32+(i<<3); vb >>= 32+(i<<3);
38 if (va == vb) break;
40 return va < vb ? -1 : 1;
43 return (int32_t)(a->len - b->len);
46 /* Fast string data comparison. Caveat: unaligned access to 1st string! */
47 static LJ_AINLINE int str_fastcmp(const char *a, const char *b, MSize len)
49 MSize i = 0;
50 lua_assert(len > 0);
51 lua_assert((((uintptr_t)a + len) & (LJ_PAGESIZE-1)) <= LJ_PAGESIZE-4);
52 do { /* Note: innocuous access up to end of string + 3. */
53 uint32_t v = lj_getu32(a+i) ^ *(const uint32_t *)(b+i);
54 if (v) {
55 i -= len;
56 #if LJ_LE
57 return (int32_t)i >= -3 ? (v << (32+(i<<3))) : 1;
58 #else
59 return (int32_t)i >= -3 ? (v >> (32+(i<<3))) : 1;
60 #endif
62 i += 4;
63 } while (i < len);
64 return 0;
67 /* Resize the string hash table (grow and shrink). */
68 void lj_str_resize(lua_State *L, MSize newmask)
70 global_State *g = G(L);
71 GCRef *newhash;
72 MSize i;
73 if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1)
74 return; /* No resizing during GC traversal or if already too big. */
75 newhash = lj_mem_newvec(L, newmask+1, GCRef);
76 memset(newhash, 0, (newmask+1)*sizeof(GCRef));
77 for (i = g->strmask; i != ~(MSize)0; i--) { /* Rehash old table. */
78 GCobj *p = gcref(g->strhash[i]);
79 while (p) { /* Follow each hash chain and reinsert all strings. */
80 MSize h = gco2str(p)->hash & newmask;
81 GCobj *next = gcnext(p);
82 /* NOBARRIER: The string table is a GC root. */
83 setgcrefr(p->gch.nextgc, newhash[h]);
84 setgcref(newhash[h], p);
85 p = next;
88 lj_mem_freevec(g, g->strhash, g->strmask+1, GCRef);
89 g->strmask = newmask;
90 g->strhash = newhash;
93 /* Intern a string and return string object. */
94 GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
96 global_State *g;
97 GCstr *s;
98 GCobj *o;
99 MSize len = (MSize)lenx;
100 MSize a, b, h = len;
101 if (lenx >= LJ_MAX_STR)
102 lj_err_msg(L, LJ_ERR_STROV);
103 g = G(L);
104 /* Compute string hash. Constants taken from lookup3 hash by Bob Jenkins. */
105 if (len >= 4) { /* Caveat: unaligned access! */
106 a = lj_getu32(str);
107 h ^= lj_getu32(str+len-4);
108 b = lj_getu32(str+(len>>1)-2);
109 h ^= b; h -= lj_rol(b, 14);
110 b += lj_getu32(str+(len>>2)-1);
111 } else if (len > 0) {
112 a = *(const uint8_t *)str;
113 h ^= *(const uint8_t *)(str+len-1);
114 b = *(const uint8_t *)(str+(len>>1));
115 h ^= b; h -= lj_rol(b, 14);
116 } else {
117 return &g->strempty;
119 a ^= h; a -= lj_rol(h, 11);
120 b ^= a; b -= lj_rol(a, 25);
121 h ^= b; h -= lj_rol(b, 16);
122 /* Check if the string has already been interned. */
123 o = gcref(g->strhash[h & g->strmask]);
124 if (LJ_LIKELY((((uintptr_t)str + len) & (LJ_PAGESIZE-1)) <= LJ_PAGESIZE-4)) {
125 while (o != NULL) {
126 GCstr *sx = gco2str(o);
127 if (sx->len == len && str_fastcmp(str, strdata(sx), len) == 0) {
128 /* Resurrect if dead. Can only happen with fixstring() (keywords). */
129 if (isdead(g, o)) flipwhite(o);
130 return sx; /* Return existing string. */
132 o = gcnext(o);
134 } else { /* Slow path: end of string is too close to a page boundary. */
135 while (o != NULL) {
136 GCstr *sx = gco2str(o);
137 if (sx->len == len && memcmp(str, strdata(sx), len) == 0) {
138 /* Resurrect if dead. Can only happen with fixstring() (keywords). */
139 if (isdead(g, o)) flipwhite(o);
140 return sx; /* Return existing string. */
142 o = gcnext(o);
145 /* Nope, create a new string. */
146 s = lj_mem_newt(L, sizeof(GCstr)+len+1, GCstr);
147 newwhite(g, s);
148 s->gct = ~LJ_TSTR;
149 s->len = len;
150 s->hash = h;
151 s->reserved = 0;
152 memcpy(strdatawr(s), str, len);
153 strdatawr(s)[len] = '\0'; /* Zero-terminate string. */
154 /* Add it to string hash table. */
155 h &= g->strmask;
156 s->nextgc = g->strhash[h];
157 /* NOBARRIER: The string table is a GC root. */
158 setgcref(g->strhash[h], obj2gco(s));
159 if (g->strnum++ > g->strmask) /* Allow a 100% load factor. */
160 lj_str_resize(L, (g->strmask<<1)+1); /* Grow string table. */
161 return s; /* Return newly interned string. */
164 void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s)
166 g->strnum--;
167 lj_mem_free(g, s, sizestring(s));
170 /* -- Type conversions ---------------------------------------------------- */
172 /* Convert string object to number. */
173 int LJ_FASTCALL lj_str_tonum(GCstr *str, TValue *n)
175 int ok = lj_str_numconv(strdata(str), n);
176 if (ok && tvisint(n))
177 setnumV(n, (lua_Number)intV(n));
178 return ok;
181 int LJ_FASTCALL lj_str_tonumber(GCstr *str, TValue *n)
183 return lj_str_numconv(strdata(str), n);
186 /* Convert string to number. */
187 int LJ_FASTCALL lj_str_numconv(const char *s, TValue *n)
189 #if LJ_DUALNUM
190 int sign = 1;
191 #else
192 lua_Number sign = 1;
193 #endif
194 const uint8_t *p = (const uint8_t *)s;
195 while (lj_char_isspace(*p)) p++;
196 if (*p == '-') { p++; sign = -1; } else if (*p == '+') { p++; }
197 if ((uint32_t)(*p - '0') < 10) {
198 uint32_t k = (uint32_t)(*p++ - '0');
199 if (k == 0 && ((*p & ~0x20) == 'X')) {
200 p++;
201 if (!lj_char_isxdigit(*p))
202 return 0; /* Don't accept '0x' without hex digits. */
203 do {
204 if (k >= 0x10000000u) goto parsedbl;
205 k = (k << 4) + (*p & 15u);
206 if (!lj_char_isdigit(*p)) k += 9;
207 p++;
208 } while (lj_char_isxdigit(*p));
209 } else {
210 while ((uint32_t)(*p - '0') < 10) {
211 if (LJ_UNLIKELY(k >= 429496729) && (k != 429496729 || *p > '5'))
212 goto parsedbl;
213 k = k * 10u + (uint32_t)(*p++ - '0');
216 while (LJ_UNLIKELY(lj_char_isspace(*p))) p++;
217 if (LJ_LIKELY(*p == '\0')) {
218 #if LJ_DUALNUM
219 if (sign == 1) {
220 if (k < 0x80000000u) {
221 setintV(n, (int32_t)k);
222 return 1;
224 } else if (k <= 0x80000000u) {
225 setintV(n, -(int32_t)k);
226 return 1;
228 #endif
229 setnumV(n, sign * (lua_Number)k);
230 return 1;
233 parsedbl:
235 TValue tv;
236 char *endptr;
237 setnumV(&tv, lua_str2number(s, &endptr));
238 if (endptr == s) return 0; /* Conversion failed. */
239 if (LJ_UNLIKELY(*endptr != '\0')) {
240 while (lj_char_isspace((uint8_t)*endptr)) endptr++;
241 if (*endptr != '\0') return 0; /* Invalid trailing characters? */
243 if (LJ_LIKELY(!tvisnan(&tv)))
244 setnumV(n, numV(&tv));
245 else
246 setnanV(n); /* Canonicalize injected NaNs. */
247 return 1;
251 /* Print number to buffer. Canonicalizes non-finite values. */
252 size_t LJ_FASTCALL lj_str_bufnum(char *s, cTValue *o)
254 if (LJ_LIKELY((o->u32.hi << 1) < 0xffe00000)) { /* Finite? */
255 lua_Number n = o->n;
256 return (size_t)lua_number2str(s, n);
257 } else if (((o->u32.hi & 0x000fffff) | o->u32.lo) != 0) {
258 s[0] = 'n'; s[1] = 'a'; s[2] = 'n'; return 3;
259 } else if ((o->u32.hi & 0x80000000) == 0) {
260 s[0] = 'i'; s[1] = 'n'; s[2] = 'f'; return 3;
261 } else {
262 s[0] = '-'; s[1] = 'i'; s[2] = 'n'; s[3] = 'f'; return 4;
266 /* Print integer to buffer. Returns pointer to start. */
267 char * LJ_FASTCALL lj_str_bufint(char *p, int32_t k)
269 uint32_t u = (uint32_t)(k < 0 ? -k : k);
270 p += 1+10;
271 do { *--p = (char)('0' + u % 10); } while (u /= 10);
272 if (k < 0) *--p = '-';
273 return p;
276 /* Convert number to string. */
277 GCstr * LJ_FASTCALL lj_str_fromnum(lua_State *L, const lua_Number *np)
279 char buf[LJ_STR_NUMBUF];
280 size_t len = lj_str_bufnum(buf, (TValue *)np);
281 return lj_str_new(L, buf, len);
284 /* Convert integer to string. */
285 GCstr * LJ_FASTCALL lj_str_fromint(lua_State *L, int32_t k)
287 char s[1+10];
288 char *p = lj_str_bufint(s, k);
289 return lj_str_new(L, p, (size_t)(s+sizeof(s)-p));
292 GCstr * LJ_FASTCALL lj_str_fromnumber(lua_State *L, cTValue *o)
294 return tvisint(o) ? lj_str_fromint(L, intV(o)) : lj_str_fromnum(L, &o->n);
297 /* -- String formatting --------------------------------------------------- */
299 static void addstr(lua_State *L, SBuf *sb, const char *str, MSize len)
301 char *p;
302 MSize i;
303 if (sb->n + len > sb->sz) {
304 MSize sz = sb->sz * 2;
305 while (sb->n + len > sz) sz = sz * 2;
306 lj_str_resizebuf(L, sb, sz);
308 p = sb->buf + sb->n;
309 sb->n += len;
310 for (i = 0; i < len; i++) p[i] = str[i];
313 static void addchar(lua_State *L, SBuf *sb, int c)
315 if (sb->n + 1 > sb->sz) {
316 MSize sz = sb->sz * 2;
317 lj_str_resizebuf(L, sb, sz);
319 sb->buf[sb->n++] = (char)c;
322 /* Push formatted message as a string object to Lua stack. va_list variant. */
323 const char *lj_str_pushvf(lua_State *L, const char *fmt, va_list argp)
325 SBuf *sb = &G(L)->tmpbuf;
326 lj_str_needbuf(L, sb, (MSize)strlen(fmt));
327 lj_str_resetbuf(sb);
328 for (;;) {
329 const char *e = strchr(fmt, '%');
330 if (e == NULL) break;
331 addstr(L, sb, fmt, (MSize)(e-fmt));
332 /* This function only handles %s, %c, %d, %f and %p formats. */
333 switch (e[1]) {
334 case 's': {
335 const char *s = va_arg(argp, char *);
336 if (s == NULL) s = "(null)";
337 addstr(L, sb, s, (MSize)strlen(s));
338 break;
340 case 'c':
341 addchar(L, sb, va_arg(argp, int));
342 break;
343 case 'd': {
344 char buf[LJ_STR_INTBUF];
345 char *p = lj_str_bufint(buf, va_arg(argp, int32_t));
346 addstr(L, sb, p, (MSize)(buf+LJ_STR_INTBUF-p));
347 break;
349 case 'f': {
350 char buf[LJ_STR_NUMBUF];
351 TValue tv;
352 MSize len;
353 tv.n = (lua_Number)(va_arg(argp, LUAI_UACNUMBER));
354 len = (MSize)lj_str_bufnum(buf, &tv);
355 addstr(L, sb, buf, len);
356 break;
358 case 'p': {
359 #define FMTP_CHARS (2*sizeof(ptrdiff_t))
360 char buf[2+FMTP_CHARS];
361 ptrdiff_t p = (ptrdiff_t)(va_arg(argp, void *));
362 ptrdiff_t i, lasti = 2+FMTP_CHARS;
363 if (p == 0) {
364 addstr(L, sb, "NULL", 4);
365 break;
367 #if LJ_64
368 /* Shorten output for 64 bit pointers. */
369 lasti = 2+2*4+((p >> 32) ? 2+2*(lj_fls((uint32_t)(p >> 32))>>3) : 0);
370 #endif
371 buf[0] = '0';
372 buf[1] = 'x';
373 for (i = lasti-1; i >= 2; i--, p >>= 4)
374 buf[i] = "0123456789abcdef"[(p & 15)];
375 addstr(L, sb, buf, (MSize)lasti);
376 break;
378 case '%':
379 addchar(L, sb, '%');
380 break;
381 default:
382 addchar(L, sb, '%');
383 addchar(L, sb, e[1]);
384 break;
386 fmt = e+2;
388 addstr(L, sb, fmt, (MSize)strlen(fmt));
389 setstrV(L, L->top, lj_str_new(L, sb->buf, sb->n));
390 incr_top(L);
391 return strVdata(L->top - 1);
394 /* Push formatted message as a string object to Lua stack. Vararg variant. */
395 const char *lj_str_pushf(lua_State *L, const char *fmt, ...)
397 const char *msg;
398 va_list argp;
399 va_start(argp, fmt);
400 msg = lj_str_pushvf(L, fmt, argp);
401 va_end(argp);
402 return msg;
405 /* -- Buffer handling ----------------------------------------------------- */
407 char *lj_str_needbuf(lua_State *L, SBuf *sb, MSize sz)
409 if (sz > sb->sz) {
410 if (sz < LJ_MIN_SBUF) sz = LJ_MIN_SBUF;
411 lj_str_resizebuf(L, sb, sz);
413 return sb->buf;