3 ** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
13 #include "lj_strscan.h"
15 /* -- Scanning numbers ---------------------------------------------------- */
18 ** Rationale for the builtin string to number conversion library:
20 ** It removes a dependency on libc's strtod(), which is a true portability
21 ** nightmare. Mainly due to the plethora of supported OS and toolchain
22 ** combinations. Sadly, the various implementations
23 ** a) are often buggy, incomplete (no hex floats) and/or imprecise,
24 ** b) sometimes crash or hang on certain inputs,
25 ** c) return non-standard NaNs that need to be filtered out, and
26 ** d) fail if the locale-specific decimal separator is not a dot,
27 ** which can only be fixed with atrocious workarounds.
29 ** Also, most of the strtod() implementations are hopelessly bloated,
30 ** which is not just an I-cache hog, but a problem for static linkage
31 ** on embedded systems, too.
33 ** OTOH the builtin conversion function is very compact. Even though it
34 ** does a lot more, like parsing long longs, octal or imaginary numbers
35 ** and returning the result in different formats:
36 ** a) It needs less than 3 KB (!) of machine code (on x64 with -Os),
37 ** b) it doesn't perform any dynamic allocation and,
38 ** c) it needs only around 600 bytes of stack space.
40 ** The builtin function is faster than strtod() for typical inputs, e.g.
41 ** "123", "1.5" or "1e6". Arguably, it's slower for very large exponents,
42 ** which are not very common (this could be fixed, if needed).
44 ** And most importantly, the builtin function is equally precise on all
45 ** platforms. It correctly converts and rounds any input to a double.
46 ** If this is not the case, please send a bug report -- but PLEASE verify
47 ** that the implementation you're comparing to is not the culprit!
49 ** The implementation quickly pre-scans the entire string first and
50 ** handles simple integers on-the-fly. Otherwise, it dispatches to the
51 ** base-specific parser. Hex and octal is straightforward.
53 ** Decimal to binary conversion uses a fixed-length circular buffer in
54 ** base 100. Some simple cases are handled directly. For other cases, the
55 ** number in the buffer is up-scaled or down-scaled until the integer part
56 ** is in the proper range. Then the integer part is rounded and converted
57 ** to a double which is finally rescaled to the result. Denormals need
58 ** special treatment to prevent incorrect 'double rounding'.
61 /* Definitions for circular decimal digit buffer (base 100 = 2 digits/byte). */
62 #define STRSCAN_DIG 1024
63 #define STRSCAN_MAXDIG 800 /* 772 + extra are sufficient. */
64 #define STRSCAN_DDIG (STRSCAN_DIG/2)
65 #define STRSCAN_DMASK (STRSCAN_DDIG-1)
66 #define STRSCAN_MAXEXP (1 << 20)
68 /* Helpers for circular buffer. */
69 #define DNEXT(a) (((a)+1) & STRSCAN_DMASK)
70 #define DPREV(a) (((a)-1) & STRSCAN_DMASK)
71 #define DLEN(lo, hi) ((int32_t)(((lo)-(hi)) & STRSCAN_DMASK))
73 #define casecmp(c, k) (((c) | 0x20) == k)
75 /* Final conversion to double. */
76 static void strscan_double(uint64_t x
, TValue
*o
, int32_t ex2
, int32_t neg
)
80 /* Avoid double rounding for denormals. */
81 if (LJ_UNLIKELY(ex2
<= -1075 && x
!= 0)) {
82 /* NYI: all of this generates way too much code on 32 bit CPUs. */
83 #if (defined(__GNUC__) || defined(__clang__)) && LJ_64
84 int32_t b
= (int32_t)(__builtin_clzll(x
)^63);
86 int32_t b
= (x
>>32) ? 32+(int32_t)lj_fls((uint32_t)(x
>>32)) :
87 (int32_t)lj_fls((uint32_t)x
);
89 if ((int32_t)b
+ ex2
<= -1023 && (int32_t)b
+ ex2
>= -1075) {
90 uint64_t rb
= (uint64_t)1 << (-1075-ex2
);
91 if ((x
& rb
) && ((x
& (rb
+rb
+rb
-1)))) x
+= rb
+rb
;
96 /* Convert to double using a signed int64_t conversion, then rescale. */
97 lj_assertX((int64_t)x
>= 0, "bad double conversion");
98 n
= (double)(int64_t)x
;
100 if (ex2
) n
= ldexp(n
, ex2
);
104 /* Parse hexadecimal number. */
105 static StrScanFmt
strscan_hex(const uint8_t *p
, TValue
*o
,
106 StrScanFmt fmt
, uint32_t opt
,
107 int32_t ex2
, int32_t neg
, uint32_t dig
)
112 /* Scan hex digits. */
113 for (i
= dig
> 16 ? 16 : dig
; i
; i
--, p
++) {
114 uint32_t d
= (*p
!= '.' ? *p
: *++p
); if (d
> '9') d
+= 9;
115 x
= (x
<< 4) + (d
& 15);
118 /* Summarize rounding-effect of excess digits. */
119 for (i
= 16; i
< dig
; i
++, p
++)
120 x
|= ((*p
!= '.' ? *p
: *++p
) != '0'), ex2
+= 4;
122 /* Format-specific handling. */
125 if (!(opt
& STRSCAN_OPT_TONUM
) && x
< 0x80000000u
+neg
&&
127 o
->i
= neg
? (int32_t)(~x
+1u) : (int32_t)x
;
128 return STRSCAN_INT
; /* Fast path for 32 bit integers. */
130 if (!(opt
& STRSCAN_OPT_C
)) { fmt
= STRSCAN_NUM
; break; }
133 if (dig
> 8) return STRSCAN_ERROR
;
134 o
->i
= neg
? (int32_t)(~x
+1u) : (int32_t)x
;
138 if (dig
> 16) return STRSCAN_ERROR
;
139 o
->u64
= neg
? ~x
+1u : x
;
145 /* Reduce range, then convert to double. */
146 if ((x
& U64x(c0000000
,0000000))) { x
= (x
>> 2) | (x
& 3); ex2
+= 2; }
147 strscan_double(x
, o
, ex2
, neg
);
151 /* Parse octal number. */
152 static StrScanFmt
strscan_oct(const uint8_t *p
, TValue
*o
,
153 StrScanFmt fmt
, int32_t neg
, uint32_t dig
)
157 /* Scan octal digits. */
158 if (dig
> 22 || (dig
== 22 && *p
> '1')) return STRSCAN_ERROR
;
160 if (!(*p
>= '0' && *p
<= '7')) return STRSCAN_ERROR
;
161 x
= (x
<< 3) + (*p
++ & 7);
164 /* Format-specific handling. */
167 if (x
>= 0x80000000u
+neg
) fmt
= STRSCAN_U32
;
170 if ((x
>> 32)) return STRSCAN_ERROR
;
171 o
->i
= neg
? (int32_t)(~(uint32_t)x
+1u) : (int32_t)x
;
176 o
->u64
= neg
? ~x
+1u : x
;
182 /* Parse decimal number. */
183 static StrScanFmt
strscan_dec(const uint8_t *p
, TValue
*o
,
184 StrScanFmt fmt
, uint32_t opt
,
185 int32_t ex10
, int32_t neg
, uint32_t dig
)
187 uint8_t xi
[STRSCAN_DDIG
], *xip
= xi
;
191 if (i
> STRSCAN_MAXDIG
) {
192 ex10
+= (int32_t)(i
- STRSCAN_MAXDIG
);
195 /* Scan unaligned leading digit. */
197 *xip
++ = ((*p
!= '.' ? *p
: *++p
) & 15), i
--, p
++;
198 /* Scan aligned double-digits. */
199 for ( ; i
> 1; i
-= 2) {
200 uint32_t d
= 10 * ((*p
!= '.' ? *p
: *++p
) & 15); p
++;
201 *xip
++ = d
+ ((*p
!= '.' ? *p
: *++p
) & 15); p
++;
203 /* Scan and realign trailing digit. */
204 if (i
) *xip
++ = 10 * ((*p
!= '.' ? *p
: *++p
) & 15), ex10
--, dig
++, p
++;
206 /* Summarize rounding-effect of excess digits. */
207 if (dig
> STRSCAN_MAXDIG
) {
209 if ((*p
!= '.' ? *p
: *++p
) != '0') { xip
[-1] |= 1; break; }
211 } while (--dig
> STRSCAN_MAXDIG
);
212 dig
= STRSCAN_MAXDIG
;
213 } else { /* Simplify exponent. */
214 while (ex10
> 0 && dig
<= 18) *xip
++ = 0, ex10
-= 2, dig
+= 2;
216 } else { /* Only got zeros. */
221 /* Fast path for numbers in integer format (but handles e.g. 1e6, too). */
222 if (dig
<= 20 && ex10
== 0) {
226 for (xis
= xi
+1; xis
< xip
; xis
++) x
= x
* 100 + *xis
;
227 if (!(dig
== 20 && (xi
[0] > 18 || (int64_t)x
>= 0))) { /* No overflow? */
228 /* Format-specific handling. */
231 if (!(opt
& STRSCAN_OPT_TONUM
) && x
< 0x80000000u
+neg
) {
232 o
->i
= neg
? (int32_t)(~x
+1u) : (int32_t)x
;
233 return STRSCAN_INT
; /* Fast path for 32 bit integers. */
235 if (!(opt
& STRSCAN_OPT_C
)) { fmt
= STRSCAN_NUM
; goto plainnumber
; }
238 if ((x
>> 32) != 0) return STRSCAN_ERROR
;
239 o
->i
= neg
? (int32_t)(~x
+1u) : (int32_t)x
;
243 o
->u64
= neg
? ~x
+1u : x
;
246 plainnumber
: /* Fast path for plain numbers < 2^63. */
247 if ((int64_t)x
< 0) break;
248 n
= (double)(int64_t)x
;
256 /* Slow non-integer path. */
257 if (fmt
== STRSCAN_INT
) {
258 if ((opt
& STRSCAN_OPT_C
)) return STRSCAN_ERROR
;
260 } else if (fmt
> STRSCAN_INT
) {
261 return STRSCAN_ERROR
;
264 uint32_t hi
= 0, lo
= (uint32_t)(xip
-xi
);
265 int32_t ex2
= 0, idig
= (int32_t)lo
+ (ex10
>> 1);
267 lj_assertX(lo
> 0 && (ex10
& 1) == 0, "bad lo %d ex10 %d", lo
, ex10
);
269 /* Handle simple overflow/underflow. */
270 if (idig
> 310/2) { if (neg
) setminfV(o
); else setpinfV(o
); return fmt
; }
271 else if (idig
< -326/2) { o
->n
= neg
? -0.0 : 0.0; return fmt
; }
273 /* Scale up until we have at least 17 or 18 integer part digits. */
274 while (idig
< 9 && idig
< DLEN(lo
, hi
)) {
277 for (i
= DPREV(lo
); ; i
= DPREV(i
)) {
278 uint32_t d
= (xi
[i
] << 6) + cy
;
279 cy
= (((d
>> 2) * 5243) >> 17); d
= d
- cy
* 100; /* Div/mod 100. */
282 if (d
== 0 && i
== DPREV(lo
)) lo
= i
;
286 if (xi
[DPREV(lo
)] == 0) lo
= DPREV(lo
);
287 else if (hi
== lo
) { lo
= DPREV(lo
); xi
[DPREV(lo
)] |= xi
[lo
]; }
288 xi
[hi
] = (uint8_t)cy
; idig
++;
292 /* Scale down until no more than 17 or 18 integer part digits remain. */
294 uint32_t i
= hi
, cy
= 0;
299 cy
= 100 * (cy
& 0x3f);
300 if (xi
[i
] == 0 && i
== hi
) hi
= DNEXT(hi
), idig
--;
304 if (hi
== lo
) { xi
[DPREV(lo
)] |= 1; break; }
305 xi
[lo
] = (cy
>> 6); lo
= DNEXT(lo
);
306 cy
= 100 * (cy
& 0x3f);
310 /* Collect integer part digits and convert to rescaled double. */
314 for (i
= DNEXT(hi
); --idig
> 0 && i
!= lo
; i
= DNEXT(i
))
317 while (--idig
>= 0) x
= x
* 100;
318 } else { /* Gather round bit from remaining digits. */
321 if (xi
[i
]) { x
|= 1; break; }
325 strscan_double(x
, o
, ex2
, neg
);
331 /* Parse binary number. */
332 static StrScanFmt
strscan_bin(const uint8_t *p
, TValue
*o
,
333 StrScanFmt fmt
, uint32_t opt
,
334 int32_t ex2
, int32_t neg
, uint32_t dig
)
339 if (ex2
|| dig
> 64) return STRSCAN_ERROR
;
341 /* Scan binary digits. */
342 for (i
= dig
; i
; i
--, p
++) {
343 if ((*p
& ~1) != '0') return STRSCAN_ERROR
;
344 x
= (x
<< 1) | (*p
& 1);
347 /* Format-specific handling. */
350 if (!(opt
& STRSCAN_OPT_TONUM
) && x
< 0x80000000u
+neg
) {
351 o
->i
= neg
? (int32_t)(~x
+1u) : (int32_t)x
;
352 return STRSCAN_INT
; /* Fast path for 32 bit integers. */
354 if (!(opt
& STRSCAN_OPT_C
)) { fmt
= STRSCAN_NUM
; break; }
357 if (dig
> 32) return STRSCAN_ERROR
;
358 o
->i
= neg
? (int32_t)(~x
+1u) : (int32_t)x
;
362 o
->u64
= neg
? ~x
+1u : x
;
368 /* Reduce range, then convert to double. */
369 if ((x
& U64x(c0000000
,0000000))) { x
= (x
>> 2) | (x
& 3); ex2
+= 2; }
370 strscan_double(x
, o
, ex2
, neg
);
374 /* Scan string containing a number. Returns format. Returns value in o. */
375 StrScanFmt
lj_strscan_scan(const uint8_t *p
, MSize len
, TValue
*o
,
379 const uint8_t *pe
= p
+ len
;
381 /* Remove leading space, parse sign and non-numbers. */
382 if (LJ_UNLIKELY(!lj_char_isdigit(*p
))) {
383 while (lj_char_isspace(*p
)) p
++;
384 if (*p
== '+' || *p
== '-') neg
= (*p
++ == '-');
385 if (LJ_UNLIKELY(*p
>= 'A')) { /* Parse "inf", "infinity" or "nan". */
388 if (casecmp(p
[0],'i') && casecmp(p
[1],'n') && casecmp(p
[2],'f')) {
389 if (neg
) setminfV(&tmp
); else setpinfV(&tmp
);
391 if (casecmp(p
[0],'i') && casecmp(p
[1],'n') && casecmp(p
[2],'i') &&
392 casecmp(p
[3],'t') && casecmp(p
[4],'y')) p
+= 5;
393 } else if (casecmp(p
[0],'n') && casecmp(p
[1],'a') && casecmp(p
[2],'n')) {
396 while (lj_char_isspace(*p
)) p
++;
397 if (*p
|| p
< pe
) return STRSCAN_ERROR
;
403 /* Parse regular number. */
405 StrScanFmt fmt
= STRSCAN_INT
;
406 int cmask
= LJ_CHAR_DIGIT
;
407 int base
= (opt
& STRSCAN_OPT_C
) && *p
== '0' ? 0 : 10;
408 const uint8_t *sp
, *dp
= NULL
;
409 uint32_t dig
= 0, hasdig
= 0, x
= 0;
412 /* Determine base and skip leading zeros. */
413 if (LJ_UNLIKELY(*p
<= '0')) {
415 if (casecmp(p
[1], 'x'))
416 base
= 16, cmask
= LJ_CHAR_XDIGIT
, p
+= 2;
417 else if (casecmp(p
[1], 'b'))
418 base
= 2, cmask
= LJ_CHAR_DIGIT
, p
+= 2;
423 } else if (*p
== '.') {
424 if (dp
) return STRSCAN_ERROR
;
432 /* Preliminary digit and decimal point scan. */
433 for (sp
= p
; ; p
++) {
434 if (LJ_LIKELY(lj_char_isa(*p
, cmask
))) {
435 x
= x
* 10 + (*p
& 15); /* For fast path below. */
437 } else if (*p
== '.') {
438 if (dp
) return STRSCAN_ERROR
;
444 if (!(hasdig
| dig
)) return STRSCAN_ERROR
;
446 /* Handle decimal point. */
448 if (base
== 2) return STRSCAN_ERROR
;
451 ex
= (int32_t)(dp
-(p
-1)); dp
= p
-1;
452 while (ex
< 0 && *dp
-- == '0') ex
++, dig
--; /* Skip trailing zeros. */
453 if (ex
<= -STRSCAN_MAXEXP
) return STRSCAN_ERROR
;
454 if (base
== 16) ex
*= 4;
458 /* Parse exponent. */
459 if (base
>= 10 && casecmp(*p
, (uint32_t)(base
== 16 ? 'p' : 'e'))) {
462 fmt
= STRSCAN_NUM
; p
++;
463 if (*p
== '+' || *p
== '-') negx
= (*p
++ == '-');
464 if (!lj_char_isdigit(*p
)) return STRSCAN_ERROR
;
466 while (lj_char_isdigit(*p
)) {
467 xx
= xx
* 10 + (*p
& 15);
468 if (xx
>= STRSCAN_MAXEXP
) return STRSCAN_ERROR
;
471 ex
+= negx
? (int32_t)(~xx
+1u) : (int32_t)xx
;
476 /* I (IMAG), U (U32), LL (I64), ULL/LLU (U64), L (long), UL/LU (ulong). */
477 /* NYI: f (float). Not needed until cp_number() handles non-integers. */
478 if (casecmp(*p
, 'i')) {
479 if (!(opt
& STRSCAN_OPT_IMAG
)) return STRSCAN_ERROR
;
480 p
++; fmt
= STRSCAN_IMAG
;
481 } else if (fmt
== STRSCAN_INT
) {
482 if (casecmp(*p
, 'u')) p
++, fmt
= STRSCAN_U32
;
483 if (casecmp(*p
, 'l')) {
485 if (casecmp(*p
, 'l')) p
++, fmt
+= STRSCAN_I64
- STRSCAN_INT
;
486 else if (!(opt
& STRSCAN_OPT_C
)) return STRSCAN_ERROR
;
487 else if (sizeof(long) == 8) fmt
+= STRSCAN_I64
- STRSCAN_INT
;
489 if (casecmp(*p
, 'u') && (fmt
== STRSCAN_INT
|| fmt
== STRSCAN_I64
))
490 p
++, fmt
+= STRSCAN_U32
- STRSCAN_INT
;
491 if ((fmt
== STRSCAN_U32
&& !(opt
& STRSCAN_OPT_C
)) ||
492 (fmt
>= STRSCAN_I64
&& !(opt
& STRSCAN_OPT_LL
)))
493 return STRSCAN_ERROR
;
495 while (lj_char_isspace(*p
)) p
++;
496 if (*p
) return STRSCAN_ERROR
;
498 if (p
< pe
) return STRSCAN_ERROR
;
500 /* Fast path for decimal 32 bit integers. */
501 if (fmt
== STRSCAN_INT
&& base
== 10 &&
502 (dig
< 10 || (dig
== 10 && *sp
<= '2' && x
< 0x80000000u
+neg
))) {
503 if ((opt
& STRSCAN_OPT_TONUM
)) {
504 o
->n
= neg
? -(double)x
: (double)x
;
506 } else if (x
== 0 && neg
) {
510 o
->i
= neg
? (int32_t)(~x
+1u) : (int32_t)x
;
515 /* Dispatch to base-specific parser. */
516 if (base
== 0 && !(fmt
== STRSCAN_NUM
|| fmt
== STRSCAN_IMAG
))
517 return strscan_oct(sp
, o
, fmt
, neg
, dig
);
519 fmt
= strscan_hex(sp
, o
, fmt
, opt
, ex
, neg
, dig
);
521 fmt
= strscan_bin(sp
, o
, fmt
, opt
, ex
, neg
, dig
);
523 fmt
= strscan_dec(sp
, o
, fmt
, opt
, ex
, neg
, dig
);
525 /* Try to convert number to integer, if requested. */
526 if (fmt
== STRSCAN_NUM
&& (opt
& STRSCAN_OPT_TOINT
) && !tvismzero(o
)) {
528 int32_t i
= lj_num2int(n
);
529 if (n
== (lua_Number
)i
) { o
->i
= i
; return STRSCAN_INT
; }
535 int LJ_FASTCALL
lj_strscan_num(GCstr
*str
, TValue
*o
)
537 StrScanFmt fmt
= lj_strscan_scan((const uint8_t *)strdata(str
), str
->len
, o
,
539 lj_assertX(fmt
== STRSCAN_ERROR
|| fmt
== STRSCAN_NUM
, "bad scan format");
540 return (fmt
!= STRSCAN_ERROR
);
544 int LJ_FASTCALL
lj_strscan_number(GCstr
*str
, TValue
*o
)
546 StrScanFmt fmt
= lj_strscan_scan((const uint8_t *)strdata(str
), str
->len
, o
,
548 lj_assertX(fmt
== STRSCAN_ERROR
|| fmt
== STRSCAN_NUM
|| fmt
== STRSCAN_INT
,
550 if (fmt
== STRSCAN_INT
) setitype(o
, LJ_TISNUM
);
551 return (fmt
!= STRSCAN_ERROR
);