1 /* __gmp_doscan -- formatted input internals.
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
7 Copyright 2001-2003 Free Software Foundation, Inc.
9 This file is part of the GNU MP Library.
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.

or

* the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any
later version.

or both in parallel, as here.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
31 You should have received copies of the GNU General Public License and the
32 GNU Lesser General Public License along with the GNU MP Library. If not,
33 see https://www.gnu.org/licenses/. */
35 #define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */
37 #include "config.h" /* needed for the HAVE_, could also move gmp incls */
41 #include <stddef.h> /* for ptrdiff_t */
43 #include <stdlib.h> /* for strtol */
47 #include <langinfo.h> /* for nl_langinfo */
51 #include <locale.h> /* for localeconv */
55 # include <inttypes.h> /* for intmax_t */
63 #include <sys/types.h> /* for quad_t */
70 /* Change this to "#define TRACE(x) x" for some traces. */
76 It's necessary to parse up the format string to recognise the GMP
77 extra types F, Q and Z. Other types and conversions are passed
78 across to the standard sscanf or fscanf via funs->scan, for ease of
79 implementation. This is essential in the case of something like glibc
80 %p where the pointer format isn't actually documented.
82 Because funs->scan doesn't get the whole input it can't put the right
83 values in for %n, so that's handled in __gmp_doscan. Neither sscanf
84 nor fscanf directly indicate how many characters were read, so an
85 extra %n is appended to each run for that. For fscanf this merely
86 supports our %n output, but for sscanf it lets funs->step move us
87 along the input string.
89 Whitespace and literal matches in the format string, including %%,
90 are handled directly within __gmp_doscan. This is reasonably
91 efficient, and avoids some suspicious behaviour observed in various
92 system libc's. GLIBC 2.2.4 for instance returns 0 on
96 sscanf(" ", " x%d",&n)
98 whereas we think they should return EOF, since end-of-string is
99 reached when a match of "x" is required.
For standard % conversions, funs->scan is called once for each
conversion. If we had vfscanf and vsscanf and could rely on their
fixed text matching behaviour then we could call them with multiple
consecutive standard conversions. But plain fscanf and sscanf work
fine, and parsing one field at a time shouldn't be too much of a
slowdown.
110 gmpscan reads a gmp type. It's only used from one place, but is a
111 separate subroutine to avoid a big chunk of complicated code in the
112 middle of __gmp_doscan. Within gmpscan a couple of loopbacks make it
113 possible to share code for parsing integers, rationals and floats.
115 In gmpscan normally one char of lookahead is maintained, but when width
116 is reached that stops, on the principle that an fgetc/ungetc of a char
117 past where we're told to stop would be undesirable. "chars" is how many
118 characters have been read so far, including the current c. When
119 chars==width and another character is desired then a jump is done to the
120 "convert" stage. c is invalid and mustn't be unget'ed in this case;
121 chars is set to width+1 to indicate that.
gmpscan normally returns the number of characters read. -1 means an
invalid field, -2 means EOF reached before any matching characters
were read.
For hex floats, the mantissa part is passed to mpf_set_str, then the
exponent is applied with mpf_mul_2exp or mpf_div_2exp. This is easier
than teaching mpf_set_str about an exponent factor (ie. 2) differing
from the mantissa radix point factor (ie. 16). mpf_mul_2exp and
mpf_div_2exp will preserve the application requested precision, so
nothing in that respect is lost by making this a two-step process.
136 C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
137 string which is a match for the appropriate type, or a prefix of a
138 match. With that done, if it's only a prefix then the result is a
139 matching failure, ie. invalid input.
141 This rule seems fairly clear, but doesn't seem to be universally
142 applied in system C libraries. Even GLIBC doesn't seem to get it
143 right, insofar as it seems to accept some apparently invalid forms.
144 Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
145 standard would suggest a non-empty sequence of digits should be
146 required after an "0x".
148 A footnote to 7.19.6.2 para 17 notes how this input item reading can
149 mean inputs acceptable to strtol are not acceptable to fscanf. We
150 think this confirms our reading of "0x" as invalid.
Clearly gmp_sscanf could backtrack to a longest input which was a
valid match for a given item, but this is not done, since C99 says
sscanf is identical to fscanf, so we make gmp_sscanf identical to
gmp_fscanf.
159 C99 says "ll" is for long long, and "L" is for long double floats.
160 Unfortunately in GMP 4.1.1 we documented the two as equivalent. This
161 doesn't affect us directly, since both are passed through to plain
162 scanf. It seems wisest not to try to enforce the C99 rule. This is
163 consistent with what we said before, though whether it actually
164 worked was always up to the C library.
168 Consideration was given to using separate code for gmp_fscanf and
169 gmp_sscanf. The sscanf case could zip across a string doing literal
170 matches or recognising digits in gmpscan, rather than making a
171 function call fun->get per character. The fscanf could use getc
172 rather than fgetc too, which might help those systems where getc is a
173 macro or otherwise inlined. But none of this scanning and converting
174 will be particularly fast, so the two are done together to keep it a
175 little simpler for now.
Various multibyte string issues are not addressed; for a start, C99
scanf says the format string is multibyte. Since we pass %c, %s and
%[ to the system scanf, they might do multibyte reads already, but
it's another matter whether or not that can be used, since our digit
and whitespace parsing is only unibyte. The plan is to quietly
ignore multibyte locales for now. This is not as bad as it sounds,
since GMP is presumably used mostly on numbers, which can be
perfectly adequately treated in plain ASCII.
189 struct gmp_doscan_params_t
{
199 ASSERT (chars <= width); \
203 (c) = (*funs->get) (data); \
206 /* store into "s", extending if necessary */
209 ASSERT (s_upto <= s_alloc); \
210 if (s_upto >= s_alloc) \
212 size_t s_alloc_new = s_alloc + S_ALLOC_STEP; \
213 s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, s_alloc_new, char); \
214 s_alloc = s_alloc_new; \
219 #define S_ALLOC_STEP 512
222 gmpscan (const struct gmp_doscan_funs_t
*funs
, void *data
,
223 const struct gmp_doscan_params_t
*p
, void *dst
)
225 int chars
, c
, base
, first
, width
, seen_point
, seen_digit
, hexfloat
;
226 size_t s_upto
, s_alloc
, hexexp
;
230 TRACE (printf ("gmpscan\n"));
232 ASSERT (p
->type
== 'F' || p
->type
== 'Q' || p
->type
== 'Z');
234 c
= (*funs
->get
) (data
);
241 width
= (p
->width
== 0 ? INT_MAX
-1 : p
->width
);
243 s_alloc
= S_ALLOC_STEP
;
244 s
= __GMP_ALLOCATE_FUNC_TYPE (s_alloc
, char);
258 /* don't store '+', it's not accepted by mpz_set_str etc */
265 base
= 10; /* decimal if no base indicator */
268 seen_digit
= 1; /* 0 alone is a valid number */
270 base
= 8; /* leading 0 is octal, for non-floats */
273 if (c
== 'x' || c
== 'X')
276 seen_digit
= 0; /* must have digits after an 0x */
277 if (p
->type
== 'F') /* don't pass 'x' to mpf_set_str_point */
298 if (base
== 8 && (c
== '8' || c
== '9'))
310 if (p
->type
== 'F' && ! seen_point
)
312 /* For a multi-character decimal point, if the first character is
313 present then all of it must be, otherwise the input is
314 considered invalid. */
315 const char *point
= GMP_DECIMAL_POINT
;
316 int pc
= (unsigned char) *point
++;
323 pc
= (unsigned char) *point
++;
337 if (hexfloat
&& (c
== 'p' || c
== 'P'))
339 hexexp
= s_upto
; /* exponent location */
340 base
= 10; /* exponent in decimal */
343 else if (! hexfloat
&& (c
== 'e' || c
== 'E'))
346 /* must have at least one digit in the mantissa, just an exponent
347 is not good enough */
360 if (p
->type
== 'Q' && c
== '/')
362 /* must have at least one digit in the numerator */
366 /* now look for at least one digit in the denominator */
369 /* allow the base to be redetermined for "%i" */
386 TRACE (printf (" convert \"%s\"\n", s
));
388 /* We ought to have parsed out a valid string above, so just test
389 mpz_set_str etc with an ASSERT. */
393 mpf_ptr f
= (mpf_ptr
) dst
;
396 ASSERT_NOCARRY (mpf_set_str (f
, s
, hexfloat
? 16 : 10));
401 exp
= strtol (s
+ hexexp
+ 1, &dummy
, 10);
403 mpf_mul_2exp (f
, f
, (unsigned long) exp
);
405 mpf_div_2exp (f
, f
, - (unsigned long) exp
);
410 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr
) dst
, s
, p
->base
));
413 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr
) dst
, s
, p
->base
));
423 ASSERT (chars
<= width
+1);
424 if (chars
!= width
+1)
426 (*funs
->unget
) (c
, data
);
427 TRACE (printf (" ungetc %d, to give %d chars\n", c
, chars
-1));
431 (*__gmp_free_func
) (s
, s_alloc
);
435 TRACE (printf (" invalid\n"));
439 TRACE (printf (" return %d chars (cf width %d)\n", chars
, width
));
/* Read and discard whitespace, if any. Return number of chars skipped.
   Whitespace skipping never provokes the EOF return from __gmp_doscan, so
   it's not necessary to watch for EOF from funs->get. */
448 skip_white (const struct gmp_doscan_funs_t
*funs
, void *data
)
455 c
= (funs
->get
) (data
);
460 (funs
->unget
) (c
, data
);
463 TRACE (printf (" skip white %d\n", ret
));
469 __gmp_doscan (const struct gmp_doscan_funs_t
*funs
, void *data
,
470 const char *orig_fmt
, va_list orig_ap
)
472 struct gmp_doscan_params_t param
;
475 const char *fmt
, *this_fmt
, *end_fmt
;
476 size_t orig_fmt_len
, alloc_fmt_size
, len
;
477 int new_fields
, new_chars
;
482 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt
);
483 if (funs
->scan
== (gmp_doscan_scan_t
) sscanf
)
484 printf (" s=\"%s\"\n", * (const char **) data
));
/* Don't modify orig_ap, in case va_list is actually an array and hence
   passed by reference. It could be argued that it'd be more efficient to
   leave callers to make a copy if they care, but doing so here is going to
   be a very small part of the total work, and we may as well keep
   applications happy. */
491 va_copy (ap
, orig_ap
);
493 /* Parts of the format string are going to be copied so that a " %n" can
494 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be
495 needed if fmt consists of a single "%" specifier, but otherwise is an
496 overestimate. We're not going to be very fast here, so use
497 __gmp_allocate_func rather than TMP_ALLOC. */
498 orig_fmt_len
= strlen (orig_fmt
);
499 alloc_fmt_size
= orig_fmt_len
+ 4;
500 alloc_fmt
= __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size
, char);
503 end_fmt
= orig_fmt
+ orig_fmt_len
;
515 chars
+= skip_white (funs
, data
);
523 c
= (funs
->get
) (data
);
526 (funs
->unget
) (c
, data
);
540 param
.base
= 0; /* for e,f,g,i */
545 TRACE (printf (" this_fmt \"%s\"\n", this_fmt
));
549 ASSERT (fmt
<= end_fmt
);
554 case '\0': /* unterminated % sequence */
558 case '%': /* literal % */
561 case '[': /* character range */
565 /* ']' allowed as the first char (possibly after '^') */
570 ASSERT (fmt
<= end_fmt
);
573 /* unterminated % sequence */
582 case 'c': /* characters */
583 case 's': /* string of non-whitespace */
584 case 'p': /* pointer */
586 len
= fmt
- this_fmt
;
587 memcpy (alloc_fmt
, this_fmt
, len
);
588 alloc_fmt
[len
++] = '%';
589 alloc_fmt
[len
++] = 'n';
590 alloc_fmt
[len
] = '\0';
592 TRACE (printf (" scan \"%s\"\n", alloc_fmt
);
593 if (funs
->scan
== (gmp_doscan_scan_t
) sscanf
)
594 printf (" s=\"%s\"\n", * (const char **) data
));
599 new_fields
= (*funs
->scan
) (data
, alloc_fmt
, &new_chars
, NULL
);
600 ASSERT (new_fields
== 0 || new_fields
== EOF
);
604 void *arg
= va_arg (ap
, void *);
605 new_fields
= (*funs
->scan
) (data
, alloc_fmt
, arg
, &new_chars
);
606 ASSERT (new_fields
==0 || new_fields
==1 || new_fields
==EOF
);
609 goto done
; /* invalid input */
612 ASSERT (new_chars
!= -1);
614 TRACE (printf (" new_fields %d new_chars %d\n",
615 new_fields
, new_chars
));
617 if (new_fields
== -1)
618 goto eof_no_match
; /* EOF before anything matched */
620 /* Under param.ignore, when new_fields==0 we don't know if
621 it's a successful match or an invalid field. new_chars
622 won't have been assigned if it was an invalid field. */
624 goto done
; /* invalid input */
627 (*funs
->step
) (data
, new_chars
);
634 case 'd': /* decimal */
635 case 'u': /* decimal */
639 case 'e': /* float */
640 case 'E': /* float */
641 case 'f': /* float */
642 case 'g': /* float */
643 case 'G': /* float */
644 case 'i': /* integer with base marker */
646 if (param
.type
!= 'F' && param
.type
!= 'Q' && param
.type
!= 'Z')
649 chars
+= skip_white (funs
, data
);
651 new_chars
= gmpscan (funs
, data
, ¶m
,
652 param
.ignore
? NULL
: va_arg (ap
, void*));
658 ASSERT (new_chars
>= 0);
660 goto increment_fields
;
662 case 'a': /* glibc allocate string */
663 case '\'': /* glibc digit groupings */
666 case 'F': /* mpf_t */
667 case 'j': /* intmax_t */
668 case 'L': /* long long */
669 case 'q': /* quad_t */
670 case 'Q': /* mpq_t */
671 case 't': /* ptrdiff_t */
672 case 'z': /* size_t */
673 case 'Z': /* mpz_t */
678 case 'h': /* short or char */
679 if (param
.type
!= 'h')
681 param
.type
= 'H'; /* internal code for "hh" */
686 case 'l': /* long, long long, double or long double */
687 if (param
.type
!= 'l')
689 param
.type
= 'L'; /* "ll" means "L" */
696 p
= va_arg (ap
, void *);
697 TRACE (printf (" store %%n to %p\n", p
));
698 switch (param
.type
) {
699 case '\0': * (int *) p
= chars
; break;
700 case 'F': mpf_set_si ((mpf_ptr
) p
, (long) chars
); break;
701 case 'H': * (char *) p
= chars
; break;
702 case 'h': * (short *) p
= chars
; break;
704 case 'j': * (intmax_t *) p
= chars
; break;
706 case 'j': ASSERT_FAIL (intmax_t not available
); break;
708 case 'l': * (long *) p
= chars
; break;
709 #if HAVE_QUAD_T && HAVE_LONG_LONG
711 ASSERT_ALWAYS (sizeof (quad_t
) == sizeof (long long));
714 case 'q': ASSERT_FAIL (quad_t
not available
); break;
717 case 'L': * (long long *) p
= chars
; break;
719 case 'L': ASSERT_FAIL (long long not available
); break;
721 case 'Q': mpq_set_si ((mpq_ptr
) p
, (long) chars
, 1L); break;
723 case 't': * (ptrdiff_t *) p
= chars
; break;
725 case 't': ASSERT_FAIL (ptrdiff_t not available
); break;
727 case 'z': * (size_t *) p
= chars
; break;
728 case 'Z': mpz_set_si ((mpz_ptr
) p
, (long) chars
); break;
729 default: ASSERT (0); break;
743 case '0': case '1': case '2': case '3': case '4':
744 case '5': case '6': case '7': case '8': case '9':
747 param
.width
= param
.width
* 10 + (fchar
-'0');
749 } while (isdigit (fchar
));
750 fmt
--; /* unget the non-digit */
758 /* something invalid in a % sequence */
766 (*__gmp_free_func
) (alloc_fmt
, alloc_fmt_size
);