beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / scanf / doscan.c
blob47b22706cede1f10a273531b753e05f4d165dff1
1 /* __gmp_doscan -- formatted input internals.
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
7 Copyright 2001-2003 Free Software Foundation, Inc.
9 This file is part of the GNU MP Library.
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of either:
14 * the GNU Lesser General Public License as published by the Free
15 Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
20 * the GNU General Public License as published by the Free Software
21 Foundation; either version 2 of the License, or (at your option) any
22 later version.
24 or both in parallel, as here.
26 The GNU MP Library is distributed in the hope that it will be useful, but
27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 for more details.
31 You should have received copies of the GNU General Public License and the
32 GNU Lesser General Public License along with the GNU MP Library. If not,
33 see https://www.gnu.org/licenses/. */
35 #define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */
37 #include "config.h" /* needed for the HAVE_, could also move gmp incls */
39 #include <stdarg.h>
40 #include <ctype.h>
41 #include <stddef.h> /* for ptrdiff_t */
42 #include <stdio.h>
43 #include <stdlib.h> /* for strtol */
44 #include <string.h>
46 #if HAVE_LANGINFO_H
47 #include <langinfo.h> /* for nl_langinfo */
48 #endif
50 #if HAVE_LOCALE_H
51 #include <locale.h> /* for localeconv */
52 #endif
54 #if HAVE_INTTYPES_H
55 # include <inttypes.h> /* for intmax_t */
56 #else
57 # if HAVE_STDINT_H
58 # include <stdint.h>
59 # endif
60 #endif
62 #if HAVE_SYS_TYPES_H
63 #include <sys/types.h> /* for quad_t */
64 #endif
66 #include "gmp.h"
67 #include "gmp-impl.h"
70 /* Debug tracing hook.  As defined here TRACE(x) expands to nothing, so the
   statements wrapped in TRACE (...) throughout this file are compiled out.
   Change this to "#define TRACE(x) x" for some traces. */
71 #define TRACE(x)
74 /* General:
76 It's necessary to parse up the format string to recognise the GMP
77 extra types F, Q and Z. Other types and conversions are passed
78 across to the standard sscanf or fscanf via funs->scan, for ease of
79 implementation. This is essential in the case of something like glibc
80 %p where the pointer format isn't actually documented.
82 Because funs->scan doesn't get the whole input it can't put the right
83 values in for %n, so that's handled in __gmp_doscan. Neither sscanf
84 nor fscanf directly indicate how many characters were read, so an
85 extra %n is appended to each run for that. For fscanf this merely
86 supports our %n output, but for sscanf it lets funs->step move us
87 along the input string.
89 Whitespace and literal matches in the format string, including %%,
90 are handled directly within __gmp_doscan. This is reasonably
91 efficient, and avoids some suspicious behaviour observed in various
92 system libc's. GLIBC 2.2.4 for instance returns 0 on
94 sscanf(" ", " x")
96 sscanf(" ", " x%d",&n)
98 whereas we think they should return EOF, since end-of-string is
99 reached when a match of "x" is required.
101 For standard % conversions, funs->scan is called once for each
102 conversion. If we had vfscanf and vsscanf and could rely on their
103 fixed text matching behaviour then we could call them with multiple
104 consecutive standard conversions. But plain fscanf and sscanf work
105 fine, and parsing one field at a time shouldn't be too much of a
106 slowdown.
108 gmpscan:
110 gmpscan reads a gmp type. It's only used from one place, but is a
111 separate subroutine to avoid a big chunk of complicated code in the
112 middle of __gmp_doscan. Within gmpscan a couple of loopbacks make it
113 possible to share code for parsing integers, rationals and floats.
115 In gmpscan normally one char of lookahead is maintained, but when width
116 is reached that stops, on the principle that an fgetc/ungetc of a char
117 past where we're told to stop would be undesirable. "chars" is how many
118 characters have been read so far, including the current c. When
119 chars==width and another character is desired then a jump is done to the
120 "convert" stage. c is invalid and mustn't be unget'ed in this case;
121 chars is set to width+1 to indicate that.
123 gmpscan normally returns the number of characters read. -1 means an
124 invalid field, -2 means EOF reached before any matching characters
125 were read.
127 For hex floats, the mantissa part is passed to mpf_set_str, then the
128 exponent is applied with mpf_mul_2exp or mpf_div_2exp. This is easier
129 than teaching mpf_set_str about an exponent factor (ie. 2) differing
130 from the mantissa radix point factor (ie. 16). mpf_mul_2exp and
131 mpf_div_2exp will preserve the application requested precision, so
132 nothing in that respect is lost by making this a two-step process.
134 Matching and errors:
136 C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
137 string which is a match for the appropriate type, or a prefix of a
138 match. With that done, if it's only a prefix then the result is a
139 matching failure, ie. invalid input.
141 This rule seems fairly clear, but doesn't seem to be universally
142 applied in system C libraries. Even GLIBC doesn't seem to get it
143 right, insofar as it seems to accept some apparently invalid forms.
144 Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
145 standard would suggest a non-empty sequence of digits should be
146 required after an "0x".
148 A footnote to 7.19.6.2 para 17 notes how this input item reading can
149 mean inputs acceptable to strtol are not acceptable to fscanf. We
150 think this confirms our reading of "0x" as invalid.
152 Clearly gmp_sscanf could backtrack to a longest input which was a
153 valid match for a given item, but this is not done, since C99 says
154 sscanf is identical to fscanf, so we make gmp_sscanf identical to
155 gmp_fscanf.
157 Types:
159 C99 says "ll" is for long long, and "L" is for long double floats.
160 Unfortunately in GMP 4.1.1 we documented the two as equivalent. This
161 doesn't affect us directly, since both are passed through to plain
162 scanf. It seems wisest not to try to enforce the C99 rule. This is
163 consistent with what we said before, though whether it actually
164 worked was always up to the C library.
166 Alternatives:
168 Consideration was given to using separate code for gmp_fscanf and
169 gmp_sscanf. The sscanf case could zip across a string doing literal
170 matches or recognising digits in gmpscan, rather than making a
171 function call funs->get per character. The fscanf could use getc
172 rather than fgetc too, which might help those systems where getc is a
173 macro or otherwise inlined. But none of this scanning and converting
174 will be particularly fast, so the two are done together to keep it a
175 little simpler for now.
177 Various multibyte string issues are not addressed, for a start C99
178 scanf says the format string is multibyte. Since we pass %c, %s and
179 %[ to the system scanf, they might do multibyte reads already, but
180 it's another matter whether or not that can be used, since our digit
181 and whitespace parsing is only unibyte. The plan is to quietly
182 ignore multibyte locales for now. This is not as bad as it sounds,
183 since GMP is presumably used mostly on numbers, which can be
184 perfectly adequately treated in plain ASCII. */
/* Parsed state of one "%" conversion in the format string, filled in by
   __gmp_doscan and handed to gmpscan.  */
struct gmp_doscan_params_t
{
  /* Radix requested by the conversion letter: 8, 10 or 16, or 0 when the
     base is to be determined from the input (e,f,g,i).  */
  int   base;

  /* Non-zero when "*" was given, ie. parse the field but store nothing.  */
  int   ignore;

  /* Type modifier seen so far, '\0' for none.  'F', 'Q' and 'Z' select the
     GMP types; 'H' is the internal code for "hh".  */
  char  type;

  /* Maximum field width, 0 when none was given.  */
  int   width;
};
/* Fetch the next input character into "c" for gmpscan, honouring the field
   width.  Uses the locals chars, width, funs and data of the expanding
   function.  "chars" is bumped first; if that takes it past "width" then
   nothing is read and control jumps to the "convert" label, leaving
   chars == width+1 to show that "c" holds no lookahead.  */
#define GET(c)                                  \
  do {                                          \
    ASSERT (chars <= width);                    \
    if (++chars > width)                        \
      goto convert;                             \
    (c) = (*funs->get) (data);                  \
  } while (0)
/* Number of bytes by which the "s" buffer is grown each time STORE finds
   it full; also used as its initial size.  */
#define S_ALLOC_STEP  512

/* Store "c" into "s", extending if necessary.  Uses the locals s, s_upto
   and s_alloc of the expanding function: when s_upto has reached s_alloc
   the buffer is reallocated S_ALLOC_STEP bytes larger before the store.
   The argument is parenthesized so any expression can be passed safely.  */
#define STORE(c)                                                        \
  do {                                                                  \
    ASSERT (s_upto <= s_alloc);                                         \
    if (s_upto >= s_alloc)                                              \
      {                                                                 \
        size_t  s_alloc_new = s_alloc + S_ALLOC_STEP;                   \
        s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, s_alloc_new, char); \
        s_alloc = s_alloc_new;                                          \
      }                                                                 \
    s[s_upto++] = (c);                                                  \
  } while (0)
/* gmpscan -- read one GMP-typed field (p->type is 'F', 'Q' or 'Z') from the
   input, one character at a time through funs->get, and convert it.

   The accepted characters are accumulated in a heap buffer "s" and, unless
   p->ignore is set, handed to mpf_set_str, mpq_set_str or mpz_set_str to
   fill in "dst" (treated as mpf_ptr, mpq_ptr or mpz_ptr respectively).

   Returns the number of input characters consumed, or -1 for an invalid
   field, or -2 if EOF was hit on the very first character.  */
221 static int
222 gmpscan (const struct gmp_doscan_funs_t *funs, void *data,
223 const struct gmp_doscan_params_t *p, void *dst)
225 int chars, c, base, first, width, seen_point, seen_digit, hexfloat;
226 size_t s_upto, s_alloc, hexexp;
227 char *s;
228 int invalid = 0;
230 TRACE (printf ("gmpscan\n"));
232 ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z');
/* EOF before anything at all was read is reported separately from an
   invalid field. */
234 c = (*funs->get) (data);
235 if (c == EOF)
236 return -2;
/* "chars" counts characters read so far including the lookahead in c.
   "first" is set while parsing the first numeric component (mantissa or
   numerator) and cleared for the second (exponent or denominator). */
238 chars = 1;
239 first = 1;
240 seen_point = 0;
/* A width of 0 in the params means no limit was requested. */
241 width = (p->width == 0 ? INT_MAX-1 : p->width);
242 base = p->base;
243 s_alloc = S_ALLOC_STEP;
244 s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char);
245 s_upto = 0;
246 hexfloat = 0;
/* hexexp is the index in s of a hex float exponent marker; 0 means there is
   none. */
247 hexexp = 0;
/* Start of a numeric component: optional sign, optional base prefix, then
   digits.  Re-entered from do_second for an exponent or a denominator. */
249 another:
250 seen_digit = 0;
251 if (c == '-')
253 STORE (c);
254 goto get_for_sign;
256 else if (c == '+')
258 /* don't store '+', it's not accepted by mpz_set_str etc */
259 get_for_sign:
260 GET (c);
/* Base 0 means determine the base from a leading "0" or "0x" in the input. */
263 if (base == 0)
265 base = 10; /* decimal if no base indicator */
266 if (c == '0')
268 seen_digit = 1; /* 0 alone is a valid number */
269 if (p->type != 'F')
270 base = 8; /* leading 0 is octal, for non-floats */
271 STORE (c);
272 GET (c);
273 if (c == 'x' || c == 'X')
275 base = 16;
276 seen_digit = 0; /* must have digits after an 0x */
277 if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */
278 hexfloat = 1;
279 else
280 STORE (c);
281 GET (c);
/* Accumulate the run of digits valid in the current base.  Also re-entered
   after a decimal point has been stored. */
286 digits:
287 for (;;)
289 if (base == 16)
291 if (! isxdigit (c))
292 break;
294 else
296 if (! isdigit (c))
297 break;
298 if (base == 8 && (c == '8' || c == '9'))
299 break;
302 seen_digit = 1;
303 STORE (c);
304 GET (c);
/* Decimal point, exponent and denominator are only looked for after the
   first component. */
307 if (first)
309 /* decimal point */
310 if (p->type == 'F' && ! seen_point)
312 /* For a multi-character decimal point, if the first character is
313 present then all of it must be, otherwise the input is
314 considered invalid. */
315 const char *point = GMP_DECIMAL_POINT;
316 int pc = (unsigned char) *point++;
317 if (c == pc)
319 for (;;)
321 STORE (c);
322 GET (c);
323 pc = (unsigned char) *point++;
324 if (pc == '\0')
325 break;
326 if (c != pc)
327 goto set_invalid;
329 seen_point = 1;
330 goto digits;
334 /* exponent */
335 if (p->type == 'F')
337 if (hexfloat && (c == 'p' || c == 'P'))
/* The marker itself is stored at this index by the STORE at do_second, and
   is later overwritten with '\0' to split mantissa from exponent. */
339 hexexp = s_upto; /* exponent location */
340 base = 10; /* exponent in decimal */
341 goto exponent;
343 else if (! hexfloat && (c == 'e' || c == 'E'))
345 exponent:
346 /* must have at least one digit in the mantissa, just an exponent
347 is not good enough */
348 if (! seen_digit)
349 goto set_invalid;
/* Store the separator (exponent marker or '/'), fetch the next character
   and parse the second component. */
351 do_second:
352 first = 0;
353 STORE (c);
354 GET (c);
355 goto another;
359 /* denominator */
360 if (p->type == 'Q' && c == '/')
362 /* must have at least one digit in the numerator */
363 if (! seen_digit)
364 goto set_invalid;
366 /* now look for at least one digit in the denominator */
367 seen_digit = 0;
369 /* allow the base to be redetermined for "%i" */
370 base = p->base;
371 goto do_second;
/* Reached when a non-matching character ends the field, or from GET when
   the width limit is hit.  The most recent component must have had at least
   one digit. */
375 convert:
376 if (! seen_digit)
378 set_invalid:
379 invalid = 1;
380 goto done;
383 if (! p->ignore)
385 STORE ('\0');
386 TRACE (printf (" convert \"%s\"\n", s));
388 /* We ought to have parsed out a valid string above, so just test
389 mpz_set_str etc with an ASSERT. */
390 switch (p->type) {
391 case 'F':
393 mpf_ptr f = (mpf_ptr) dst;
/* For a hex float, terminate the mantissa at the exponent marker so that
   mpf_set_str sees only the mantissa. */
394 if (hexexp != 0)
395 s[hexexp] = '\0';
396 ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 16 : 10));
397 if (hexexp != 0)
/* The decimal exponent text follows the marker; apply it as a power of 2. */
399 char *dummy;
400 long exp;
401 exp = strtol (s + hexexp + 1, &dummy, 10);
402 if (exp >= 0)
403 mpf_mul_2exp (f, f, (unsigned long) exp);
404 else
405 mpf_div_2exp (f, f, - (unsigned long) exp);
408 break;
409 case 'Q':
410 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base));
411 break;
412 case 'Z':
413 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base));
414 break;
415 default:
416 ASSERT (0);
417 /*FALLTHRU*/
418 break;
/* Common exit.  chars == width+1 means GET stopped at the width limit
   without reading, so c holds no lookahead and must not be pushed back.
   Otherwise c is one character of lookahead and is returned to the input.
   Either way chars is one more than the number of characters consumed. */
422 done:
423 ASSERT (chars <= width+1);
424 if (chars != width+1)
426 (*funs->unget) (c, data);
427 TRACE (printf (" ungetc %d, to give %d chars\n", c, chars-1));
429 chars--;
431 (*__gmp_free_func) (s, s_alloc);
433 if (invalid)
435 TRACE (printf (" invalid\n"));
436 return -1;
439 TRACE (printf (" return %d chars (cf width %d)\n", chars, width));
440 return chars;
444 /* Read and discard whitespace, if any. Return number of chars skipped.
445 Whitespace skipping never provokes the EOF return from __gmp_doscan, so
446 it's not necessary to watch for EOF from funs->get, */
447 static int
448 skip_white (const struct gmp_doscan_funs_t *funs, void *data)
450 int c;
451 int ret = 0;
455 c = (funs->get) (data);
456 ret++;
458 while (isspace (c));
460 (funs->unget) (c, data);
461 ret--;
463 TRACE (printf (" skip white %d\n", ret));
464 return ret;
469 __gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data,
470 const char *orig_fmt, va_list orig_ap)
472 struct gmp_doscan_params_t param;
473 va_list ap;
474 char *alloc_fmt;
475 const char *fmt, *this_fmt, *end_fmt;
476 size_t orig_fmt_len, alloc_fmt_size, len;
477 int new_fields, new_chars;
478 char fchar;
479 int fields = 0;
480 int chars = 0;
482 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt);
483 if (funs->scan == (gmp_doscan_scan_t) sscanf)
484 printf (" s=\"%s\"\n", * (const char **) data));
486 /* Don't modify orig_ap, if va_list is actually an array and hence call by
487 reference. It could be argued that it'd be more efficient to leave
488 callers to make a copy if they care, but doing so here is going to be a
489 very small part of the total work, and we may as well keep applications
490 out of trouble. */
491 va_copy (ap, orig_ap);
493 /* Parts of the format string are going to be copied so that a " %n" can
494 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be
495 needed if fmt consists of a single "%" specifier, but otherwise is an
496 overestimate. We're not going to be very fast here, so use
497 __gmp_allocate_func rather than TMP_ALLOC. */
498 orig_fmt_len = strlen (orig_fmt);
499 alloc_fmt_size = orig_fmt_len + 4;
500 alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char);
502 fmt = orig_fmt;
503 end_fmt = orig_fmt + orig_fmt_len;
505 for (;;)
507 next:
508 fchar = *fmt++;
510 if (fchar == '\0')
511 break;
513 if (isspace (fchar))
515 chars += skip_white (funs, data);
516 continue;
519 if (fchar != '%')
521 int c;
522 literal:
523 c = (funs->get) (data);
524 if (c != fchar)
526 (funs->unget) (c, data);
527 if (c == EOF)
529 eof_no_match:
530 if (fields == 0)
531 fields = EOF;
533 goto done;
535 chars++;
536 continue;
539 param.type = '\0';
540 param.base = 0; /* for e,f,g,i */
541 param.ignore = 0;
542 param.width = 0;
544 this_fmt = fmt-1;
545 TRACE (printf (" this_fmt \"%s\"\n", this_fmt));
547 for (;;)
549 ASSERT (fmt <= end_fmt);
551 fchar = *fmt++;
552 switch (fchar) {
554 case '\0': /* unterminated % sequence */
555 ASSERT (0);
556 goto done;
558 case '%': /* literal % */
559 goto literal;
561 case '[': /* character range */
562 fchar = *fmt++;
563 if (fchar == '^')
564 fchar = *fmt++;
565 /* ']' allowed as the first char (possibly after '^') */
566 if (fchar == ']')
567 fchar = *fmt++;
568 for (;;)
570 ASSERT (fmt <= end_fmt);
571 if (fchar == '\0')
573 /* unterminated % sequence */
574 ASSERT (0);
575 goto done;
577 if (fchar == ']')
578 break;
579 fchar = *fmt++;
581 /*FALLTHRU*/
582 case 'c': /* characters */
583 case 's': /* string of non-whitespace */
584 case 'p': /* pointer */
585 libc_type:
586 len = fmt - this_fmt;
587 memcpy (alloc_fmt, this_fmt, len);
588 alloc_fmt[len++] = '%';
589 alloc_fmt[len++] = 'n';
590 alloc_fmt[len] = '\0';
592 TRACE (printf (" scan \"%s\"\n", alloc_fmt);
593 if (funs->scan == (gmp_doscan_scan_t) sscanf)
594 printf (" s=\"%s\"\n", * (const char **) data));
596 new_chars = -1;
597 if (param.ignore)
599 new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL);
600 ASSERT (new_fields == 0 || new_fields == EOF);
602 else
604 void *arg = va_arg (ap, void *);
605 new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars);
606 ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF);
608 if (new_fields == 0)
609 goto done; /* invalid input */
611 if (new_fields == 1)
612 ASSERT (new_chars != -1);
614 TRACE (printf (" new_fields %d new_chars %d\n",
615 new_fields, new_chars));
617 if (new_fields == -1)
618 goto eof_no_match; /* EOF before anything matched */
620 /* Under param.ignore, when new_fields==0 we don't know if
621 it's a successful match or an invalid field. new_chars
622 won't have been assigned if it was an invalid field. */
623 if (new_chars == -1)
624 goto done; /* invalid input */
626 chars += new_chars;
627 (*funs->step) (data, new_chars);
629 increment_fields:
630 if (! param.ignore)
631 fields++;
632 goto next;
634 case 'd': /* decimal */
635 case 'u': /* decimal */
636 param.base = 10;
637 goto numeric;
639 case 'e': /* float */
640 case 'E': /* float */
641 case 'f': /* float */
642 case 'g': /* float */
643 case 'G': /* float */
644 case 'i': /* integer with base marker */
645 numeric:
646 if (param.type != 'F' && param.type != 'Q' && param.type != 'Z')
647 goto libc_type;
649 chars += skip_white (funs, data);
651 new_chars = gmpscan (funs, data, &param,
652 param.ignore ? NULL : va_arg (ap, void*));
653 if (new_chars == -2)
654 goto eof_no_match;
655 if (new_chars == -1)
656 goto done;
658 ASSERT (new_chars >= 0);
659 chars += new_chars;
660 goto increment_fields;
662 case 'a': /* glibc allocate string */
663 case '\'': /* glibc digit groupings */
664 break;
666 case 'F': /* mpf_t */
667 case 'j': /* intmax_t */
668 case 'L': /* long long */
669 case 'q': /* quad_t */
670 case 'Q': /* mpq_t */
671 case 't': /* ptrdiff_t */
672 case 'z': /* size_t */
673 case 'Z': /* mpz_t */
674 set_type:
675 param.type = fchar;
676 break;
678 case 'h': /* short or char */
679 if (param.type != 'h')
680 goto set_type;
681 param.type = 'H'; /* internal code for "hh" */
682 break;
684 goto numeric;
686 case 'l': /* long, long long, double or long double */
687 if (param.type != 'l')
688 goto set_type;
689 param.type = 'L'; /* "ll" means "L" */
690 break;
692 case 'n':
693 if (! param.ignore)
695 void *p;
696 p = va_arg (ap, void *);
697 TRACE (printf (" store %%n to %p\n", p));
698 switch (param.type) {
699 case '\0': * (int *) p = chars; break;
700 case 'F': mpf_set_si ((mpf_ptr) p, (long) chars); break;
701 case 'H': * (char *) p = chars; break;
702 case 'h': * (short *) p = chars; break;
703 #if HAVE_INTMAX_T
704 case 'j': * (intmax_t *) p = chars; break;
705 #else
706 case 'j': ASSERT_FAIL (intmax_t not available); break;
707 #endif
708 case 'l': * (long *) p = chars; break;
709 #if HAVE_QUAD_T && HAVE_LONG_LONG
710 case 'q':
711 ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long));
712 /*FALLTHRU*/
713 #else
714 case 'q': ASSERT_FAIL (quad_t not available); break;
715 #endif
716 #if HAVE_LONG_LONG
717 case 'L': * (long long *) p = chars; break;
718 #else
719 case 'L': ASSERT_FAIL (long long not available); break;
720 #endif
721 case 'Q': mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break;
722 #if HAVE_PTRDIFF_T
723 case 't': * (ptrdiff_t *) p = chars; break;
724 #else
725 case 't': ASSERT_FAIL (ptrdiff_t not available); break;
726 #endif
727 case 'z': * (size_t *) p = chars; break;
728 case 'Z': mpz_set_si ((mpz_ptr) p, (long) chars); break;
729 default: ASSERT (0); break;
732 goto next;
734 case 'o':
735 param.base = 8;
736 goto numeric;
738 case 'x':
739 case 'X':
740 param.base = 16;
741 goto numeric;
743 case '0': case '1': case '2': case '3': case '4':
744 case '5': case '6': case '7': case '8': case '9':
745 param.width = 0;
746 do {
747 param.width = param.width * 10 + (fchar-'0');
748 fchar = *fmt++;
749 } while (isdigit (fchar));
750 fmt--; /* unget the non-digit */
751 break;
753 case '*':
754 param.ignore = 1;
755 break;
757 default:
758 /* something invalid in a % sequence */
759 ASSERT (0);
760 goto next;
765 done:
766 (*__gmp_free_func) (alloc_fmt, alloc_fmt_size);
767 return fields;