OSX/iOS: Fix SDK incompatibility.
[luajit-2.0.git] / src / lj_str.c
blobcfdaec6f1a81cbd2238ef1dda4057eb02b31ea34
1 /*
2 ** String handling.
3 ** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
4 */
6 #define lj_str_c
7 #define LUA_CORE
9 #include "lj_obj.h"
10 #include "lj_gc.h"
11 #include "lj_err.h"
12 #include "lj_str.h"
13 #include "lj_char.h"
14 #include "lj_prng.h"
16 /* -- String helpers ------------------------------------------------------ */
18 /* Ordered compare of strings. Assumes string data is 4-byte aligned. */
19 int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b)
21 MSize i, n = a->len > b->len ? b->len : a->len;
22 for (i = 0; i < n; i += 4) {
23 /* Note: innocuous access up to end of string + 3. */
24 uint32_t va = *(const uint32_t *)(strdata(a)+i);
25 uint32_t vb = *(const uint32_t *)(strdata(b)+i);
26 if (va != vb) {
27 #if LJ_LE
28 va = lj_bswap(va); vb = lj_bswap(vb);
29 #endif
30 i -= n;
31 if ((int32_t)i >= -3) {
32 va >>= 32+(i<<3); vb >>= 32+(i<<3);
33 if (va == vb) break;
35 return va < vb ? -1 : 1;
38 return (int32_t)(a->len - b->len);
41 /* Find fixed string p inside string s. */
42 const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen)
44 if (plen <= slen) {
45 if (plen == 0) {
46 return s;
47 } else {
48 int c = *(const uint8_t *)p++;
49 plen--; slen -= plen;
50 while (slen) {
51 const char *q = (const char *)memchr(s, c, slen);
52 if (!q) break;
53 if (memcmp(q+1, p, plen) == 0) return q;
54 q++; slen -= (MSize)(q-s); s = q;
58 return NULL;
61 /* Check whether a string has a pattern matching character. */
62 int lj_str_haspattern(GCstr *s)
64 const char *p = strdata(s), *q = p + s->len;
65 while (p < q) {
66 int c = *(const uint8_t *)p++;
67 if (lj_char_ispunct(c) && strchr("^$*+?.([%-", c))
68 return 1; /* Found a pattern matching char. */
70 return 0; /* No pattern matching chars found. */
73 /* -- String hashing ------------------------------------------------------ */
75 /* Keyed sparse ARX string hash. Constant time. */
76 static StrHash hash_sparse(uint64_t seed, const char *str, MSize len)
78 /* Constants taken from lookup3 hash by Bob Jenkins. */
79 StrHash a, b, h = len ^ (StrHash)seed;
80 if (len >= 4) { /* Caveat: unaligned access! */
81 a = lj_getu32(str);
82 h ^= lj_getu32(str+len-4);
83 b = lj_getu32(str+(len>>1)-2);
84 h ^= b; h -= lj_rol(b, 14);
85 b += lj_getu32(str+(len>>2)-1);
86 } else {
87 a = *(const uint8_t *)str;
88 h ^= *(const uint8_t *)(str+len-1);
89 b = *(const uint8_t *)(str+(len>>1));
90 h ^= b; h -= lj_rol(b, 14);
92 a ^= h; a -= lj_rol(h, 11);
93 b ^= a; b -= lj_rol(a, 25);
94 h ^= b; h -= lj_rol(b, 16);
95 return h;
98 #if LUAJIT_SECURITY_STRHASH
99 /* Keyed dense ARX string hash. Linear time. */
100 static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h,
101 const char *str, MSize len)
103 StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4));
104 if (len > 12) {
105 StrHash a = (StrHash)seed;
106 const char *pe = str+len-12, *p = pe, *q = str;
107 do {
108 a += lj_getu32(p);
109 b += lj_getu32(p+4);
110 h += lj_getu32(p+8);
111 p = q; q += 12;
112 h ^= b; h -= lj_rol(b, 14);
113 a ^= h; a -= lj_rol(h, 11);
114 b ^= a; b -= lj_rol(a, 25);
115 } while (p < pe);
116 h ^= b; h -= lj_rol(b, 16);
117 a ^= h; a -= lj_rol(h, 4);
118 b ^= a; b -= lj_rol(a, 14);
120 return b;
122 #endif
124 /* -- String interning ---------------------------------------------------- */
126 #define LJ_STR_MAXCOLL 32
128 /* Resize the string interning hash table (grow and shrink). */
129 void lj_str_resize(lua_State *L, MSize newmask)
131 global_State *g = G(L);
132 GCRef *newtab, *oldtab = g->str.tab;
133 MSize i;
135 /* No resizing during GC traversal or if already too big. */
136 if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1)
137 return;
139 newtab = lj_mem_newvec(L, newmask+1, GCRef);
140 memset(newtab, 0, (newmask+1)*sizeof(GCRef));
142 #if LUAJIT_SECURITY_STRHASH
143 /* Check which chains need secondary hashes. */
144 if (g->str.second) {
145 int newsecond = 0;
146 /* Compute primary chain lengths. */
147 for (i = g->str.mask; i != ~(MSize)0; i--) {
148 GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
149 while (o) {
150 GCstr *s = gco2str(o);
151 MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) :
152 s->hash;
153 hash &= newmask;
154 setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1);
155 o = gcnext(o);
158 /* Mark secondary chains. */
159 for (i = newmask; i != ~(MSize)0; i--) {
160 int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL;
161 newsecond |= secondary;
162 setgcrefp(newtab[i], secondary);
164 g->str.second = newsecond;
166 #endif
168 /* Reinsert all strings from the old table into the new table. */
169 for (i = g->str.mask; i != ~(MSize)0; i--) {
170 GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
171 while (o) {
172 GCobj *next = gcnext(o);
173 GCstr *s = gco2str(o);
174 MSize hash = s->hash;
175 #if LUAJIT_SECURITY_STRHASH
176 uintptr_t u;
177 if (LJ_LIKELY(!s->hashalg)) { /* String hashed with primary hash. */
178 hash &= newmask;
179 u = gcrefu(newtab[hash]);
180 if (LJ_UNLIKELY(u & 1)) { /* Switch string to secondary hash. */
181 s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len);
182 s->hashalg = 1;
183 hash &= newmask;
184 u = gcrefu(newtab[hash]);
186 } else { /* String hashed with secondary hash. */
187 MSize shash = hash_sparse(g->str.seed, strdata(s), s->len);
188 u = gcrefu(newtab[shash & newmask]);
189 if (u & 1) {
190 hash &= newmask;
191 u = gcrefu(newtab[hash]);
192 } else { /* Revert string back to primary hash. */
193 s->hash = shash;
194 s->hashalg = 0;
195 hash = (shash & newmask);
198 /* NOBARRIER: The string table is a GC root. */
199 setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
200 setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1)));
201 #else
202 hash &= newmask;
203 /* NOBARRIER: The string table is a GC root. */
204 setgcrefr(o->gch.nextgc, newtab[hash]);
205 setgcref(newtab[hash], o);
206 #endif
207 o = next;
211 /* Free old table and replace with new table. */
212 lj_str_freetab(g);
213 g->str.tab = newtab;
214 g->str.mask = newmask;
217 #if LUAJIT_SECURITY_STRHASH
218 /* Rehash and rechain all strings in a chain. */
219 static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc,
220 const char *str, MSize len)
222 global_State *g = G(L);
223 int ow = g->gc.state == GCSsweepstring ? otherwhite(g) : 0; /* Sweeping? */
224 GCRef *strtab = g->str.tab;
225 MSize strmask = g->str.mask;
226 GCobj *o = gcref(strtab[hashc & strmask]);
227 setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1));
228 g->str.second = 1;
229 while (o) {
230 uintptr_t u;
231 GCobj *next = gcnext(o);
232 GCstr *s = gco2str(o);
233 StrHash hash;
234 if (ow) { /* Must sweep while rechaining. */
235 if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) { /* String alive? */
236 lj_assertG(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED),
237 "sweep of undead string");
238 makewhite(g, o);
239 } else { /* Free dead string. */
240 lj_assertG(isdead(g, o) || ow == LJ_GC_SFIXED,
241 "sweep of unlive string");
242 lj_str_free(g, s);
243 o = next;
244 continue;
247 hash = s->hash;
248 if (!s->hashalg) { /* Rehash with secondary hash. */
249 hash = hash_dense(g->str.seed, hash, strdata(s), s->len);
250 s->hash = hash;
251 s->hashalg = 1;
253 /* Rechain. */
254 hash &= strmask;
255 u = gcrefu(strtab[hash]);
256 setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
257 setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1)));
258 o = next;
260 /* Try to insert the pending string again. */
261 return lj_str_new(L, str, len);
263 #endif
265 /* Reseed String ID from PRNG after random interval < 2^bits. */
266 #if LUAJIT_SECURITY_STRID == 1
267 #define STRID_RESEED_INTERVAL 8
268 #elif LUAJIT_SECURITY_STRID == 2
269 #define STRID_RESEED_INTERVAL 4
270 #elif LUAJIT_SECURITY_STRID >= 3
271 #define STRID_RESEED_INTERVAL 0
272 #endif
274 /* Allocate a new string and add to string interning table. */
275 static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len,
276 StrHash hash, int hashalg)
278 GCstr *s = lj_mem_newt(L, lj_str_size(len), GCstr);
279 global_State *g = G(L);
280 uintptr_t u;
281 newwhite(g, s);
282 s->gct = ~LJ_TSTR;
283 s->len = len;
284 s->hash = hash;
285 #ifndef STRID_RESEED_INTERVAL
286 s->sid = g->str.id++;
287 #elif STRID_RESEED_INTERVAL
288 if (!g->str.idreseed--) {
289 uint64_t r = lj_prng_u64(&g->prng);
290 g->str.id = (StrID)r;
291 g->str.idreseed = (uint8_t)(r >> (64 - STRID_RESEED_INTERVAL));
293 s->sid = g->str.id++;
294 #else
295 s->sid = (StrID)lj_prng_u64(&g->prng);
296 #endif
297 s->reserved = 0;
298 s->hashalg = (uint8_t)hashalg;
299 /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */
300 *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0;
301 memcpy(strdatawr(s), str, len);
302 /* Add to string hash table. */
303 hash &= g->str.mask;
304 u = gcrefu(g->str.tab[hash]);
305 setgcrefp(s->nextgc, (u & ~(uintptr_t)1));
306 /* NOBARRIER: The string table is a GC root. */
307 setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1)));
308 if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. */
309 lj_str_resize(L, (g->str.mask<<1)+1); /* Grow string table. */
310 return s; /* Return newly interned string. */
313 /* Intern a string and return string object. */
314 GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
316 global_State *g = G(L);
317 if (lenx-1 < LJ_MAX_STR-1) {
318 MSize len = (MSize)lenx;
319 StrHash hash = hash_sparse(g->str.seed, str, len);
320 MSize coll = 0;
321 int hashalg = 0;
322 /* Check if the string has already been interned. */
323 GCobj *o = gcref(g->str.tab[hash & g->str.mask]);
324 #if LUAJIT_SECURITY_STRHASH
325 if (LJ_UNLIKELY((uintptr_t)o & 1)) { /* Secondary hash for this chain? */
326 hashalg = 1;
327 hash = hash_dense(g->str.seed, hash, str, len);
328 o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1);
330 #endif
331 while (o != NULL) {
332 GCstr *sx = gco2str(o);
333 if (sx->hash == hash && sx->len == len) {
334 if (memcmp(str, strdata(sx), len) == 0) {
335 if (isdead(g, o)) flipwhite(o); /* Resurrect if dead. */
336 return sx; /* Return existing string. */
338 coll++;
340 coll++;
341 o = gcnext(o);
343 #if LUAJIT_SECURITY_STRHASH
344 /* Rehash chain if there are too many collisions. */
345 if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) {
346 return lj_str_rehash_chain(L, hash, str, len);
348 #endif
349 /* Otherwise allocate a new string. */
350 return lj_str_alloc(L, str, len, hash, hashalg);
351 } else {
352 if (lenx)
353 lj_err_msg(L, LJ_ERR_STROV);
354 return &g->strempty;
358 void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s)
360 g->str.num--;
361 lj_mem_free(g, s, lj_str_size(s->len));
364 void LJ_FASTCALL lj_str_init(lua_State *L)
366 global_State *g = G(L);
367 g->str.seed = lj_prng_u64(&g->prng);
368 lj_str_resize(L, LJ_MIN_STRTAB-1);