2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include "hphp/runtime/base/zend-scanf.h"
20 #include "hphp/runtime/base/builtin-functions.h"
22 ///////////////////////////////////////////////////////////////////////////////
26 This file contains the base code which implements sscanf and by extension
27 fscanf. Original code is from TCL8.3.0 and bears the following copyright:
29 This software is copyrighted by the Regents of the University of
30 California, Sun Microsystems, Inc., Scriptics Corporation,
31 and other parties. The following terms apply to all files associated
32 with the software unless explicitly disclaimed in individual files.
34 The authors hereby grant permission to use, copy, modify, distribute,
35 and license this software and its documentation for any purpose, provided
36 that existing copyright notices are retained in all copies and that this
37 notice is included verbatim in any distributions. No written agreement,
38 license, or royalty fee is required for any of the authorized uses.
39 Modifications to this software may be copyrighted by their authors
40 and need not follow the licensing terms described here, provided that
41 the new terms are clearly indicated on the first page of each file where
44 IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
45 FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
46 ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
47 DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
48 POSSIBILITY OF SUCH DAMAGE.
50 THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
51 INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
52 FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
53 IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
54 NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
57 GOVERNMENT USE: If you are acquiring this software on behalf of the
58 U.S. government, the Government shall have only "Restricted Rights"
59 in the software and related documentation as defined in the Federal
60 Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
61 are acquiring the software on behalf of the Department of Defense, the
62 software shall be classified as "Commercial Computer Software" and the
63 Government shall have only "Restricted Rights" as defined in Clause
64 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
65 authors grant the U.S. Government and others acting in its behalf
66 permission to use and distribute the software in accordance with the
67 terms specified in this license.
70 #define SCAN_MAX_ARGS 0xFF // Maximum number of variables which can be
71 // passed to (f|s)scanf. This is an artificial
72 // upper limit to keep resources in check and
73 // minimize the possibility of exploits
// Error code defined as one less than SCAN_ERROR_WRONG_PARAM_COUNT.
75 #define SCAN_ERROR_INTERNAL (SCAN_ERROR_WRONG_PARAM_COUNT - 1)
78 * Flag values used internally by [f|s]scanf.
/* Each flag occupies its own bit, so flags are combined with |= and cleared with &= ~(...). */
80 #define SCAN_NOSKIP 0x1 /* Don't skip blanks. */
81 #define SCAN_SUPPRESS 0x2 /* Suppress assignment. */
82 #define SCAN_UNSIGNED 0x4 /* Read an unsigned value. */
83 #define SCAN_WIDTH 0x8 /* A width value was supplied. */
/* The flags below are set and cleared while a numeric field is being accumulated. */
85 #define SCAN_SIGNOK 0x10 /* A +/- character is allowed. */
86 #define SCAN_NODIGITS 0x20 /* No digits have been scanned. */
87 #define SCAN_NOZERO 0x40 /* No zero digits have been scanned. */
88 #define SCAN_XOK 0x80 /* An 'x' is allowed. */
89 #define SCAN_PTOK 0x100 /* Decimal point is allowed. */
90 #define SCAN_EXPOK 0x200 /* An exponent is allowed. */
/* Casts x to unsigned char; applied to format characters before they are passed to isdigit(). */
92 #define UCHAR(x) (unsigned char)(x)
94 ///////////////////////////////////////////////////////////////////////////////
96 * The following structure contains the information associated with a character set.
/*
 * Character set parsed from a scan format specification. It is filled in by
 * BuildCharSet, queried by CharInSet and released by ReleaseCharSet. Besides
 * `exclude`, those functions use the members chars/nchars (individual
 * characters) and ranges/nranges (start..end pairs).
 */
104 typedef struct CharSet
{
105 int exclude
; /* 1 if this is an exclusion set. */
114 *----------------------------------------------------------------------
118 * This function examines a character set format specification
119 * and builds a CharSet containing the individual characters and
120 * character ranges specified.
123 * Returns the next format position.
126 * Initializes the charset.
128 *----------------------------------------------------------------------
/*
 * BuildCharSet --
 *
 *   Parses a character set specification from `format` into `cset`.
 *
 *   cset    Set to initialize. It is zeroed first, then chars/nchars and
 *           ranges/nranges are filled in. chars is allocated with
 *           req::malloc_noptrs; ranges is either a raw array of `nranges`
 *           ::Range entries from req::make_raw_array or nullptr.
 *   format  Position inside the format string where the set starts.
 *
 *   Returns the next format position.
 *
 *   Rules visible below:
 *     - a ']' or '-' met at the first checked position is stored as a
 *       literal character;
 *     - a '-' directly followed by the closing ']' is not a range: the
 *       previous character and the dash are both stored as literals;
 *     - otherwise a range is recorded, with start and end swapped when the
 *       range was written in reverse order;
 *     - any other character is stored as a literal.
 *
 *   The buffers allocated here are freed by ReleaseCharSet.
 */
130 static const char *BuildCharSet(CharSet
*cset
, const char *format
) {
/* Start from an all-zero CharSet. */
136 memset(cset
, 0, sizeof(CharSet
));
143 end
= format
+ 1; /* verify this - cc */
146 * Find the close bracket so we can overallocate the set.
/* chars is sized from the distance between format and end. */
159 cset
->chars
= (char *)req::malloc_noptrs(end
- format
- 1);
161 cset
->ranges
= req::make_raw_array
<::Range
>(nranges
);
163 cset
->ranges
= nullptr;
167 * Now build the character set.
169 cset
->nchars
= cset
->nranges
= 0;
172 if (*ch
== ']' || *ch
== '-') {
173 cset
->chars
[cset
->nchars
++] = *ch
;
177 if (*format
== '-') {
179 * This may be the first character of a range, so don't add
183 } else if (*ch
== '-') {
185 * Check to see if this is the last character in the set, in which
186 * case it is not a range and we should add the previous character
187 * as well as the dash.
189 if (*format
== ']') {
190 cset
->chars
[cset
->nchars
++] = start
;
191 cset
->chars
[cset
->nchars
++] = *ch
;
196 * Check to see if the range is in reverse order.
199 cset
->ranges
[cset
->nranges
].start
= start
;
200 cset
->ranges
[cset
->nranges
].end
= *ch
;
202 cset
->ranges
[cset
->nranges
].start
= *ch
;
203 cset
->ranges
[cset
->nranges
].end
= start
;
208 cset
->chars
[cset
->nchars
++] = *ch
;
216 *----------------------------------------------------------------------
220 * Check to see if a character matches the given set.
223 * Returns non-zero if the character matches the given set.
228 *----------------------------------------------------------------------
/*
 * CharInSet --
 *
 *   Tests a character against a CharSet.
 *
 *   cset  Character set to test against.
 *   c     Character to test.
 *
 *   Returns non-zero if the character matches the given set. The character
 *   is compared with each entry of cset->chars, then with each inclusive
 *   start..end pair in cset->ranges. When cset->exclude is set, the result
 *   is inverted.
 */
230 static int CharInSet(CharSet
*cset
, int c
) {
/* First pass: individually listed characters. */
234 for (i
= 0; i
< cset
->nchars
; i
++) {
235 if (cset
->chars
[i
] == ch
) {
/* Second pass: ranges, with both bounds inclusive. */
241 for (i
= 0; i
< cset
->nranges
; i
++) {
242 if ((cset
->ranges
[i
].start
<= ch
)
243 && (ch
<= cset
->ranges
[i
].end
)) {
/* An exclusion set reports the opposite of what was found. */
249 return (cset
->exclude
? !match
: match
);
253 *----------------------------------------------------------------------
257 * Free the storage associated with a character set.
265 *----------------------------------------------------------------------
/*
 * ReleaseCharSet --
 *
 *   Frees the storage associated with a character set: the chars and
 *   ranges buffers that BuildCharSet allocated are passed to req::free.
 *
 *   cset  Character set whose buffers are released.
 */
267 static void ReleaseCharSet(CharSet
*cset
) {
268 req::free((char *)cset
->chars
);
270 req::free((char *)cset
->ranges
);
/*
 * scan_set_error_return --
 *
 *   Stores the failure result of a scan in return_value: either
 *   SCAN_ERROR_EOF (the EOF marker) or uninit_null().
 *
 *   numVars       Number of variables passed to the scan call.
 *   return_value  Receives the failure result.
 *
 *   NOTE(review): the choice between the two values presumably depends on
 *   numVars -- confirm against the branch condition.
 */
274 static inline void scan_set_error_return(int numVars
, Variant
&return_value
) {
276 return_value
= SCAN_ERROR_EOF
; /* EOF marker */
278 return_value
= uninit_null();
283 *----------------------------------------------------------------------
287 * Parse the format string and verify that it is properly formed
288 * and that there are exactly enough variables on the command line.
291 * format The format string.
292 * numVars The number of variables passed to the scan command.
293 * totalSubs The number of variables that will be required.
295 *----------------------------------------------------------------------
/*
 * ValidateFormat --
 *
 *   Checks a scan format string against the number of variables supplied
 *   by the caller. Only the format is inspected; no input string is passed
 *   in.
 *
 *   format     The format string.
 *   numVars    The number of variables passed to the scan command.
 *   totalSubs  Out-parameter; assigned numVars in the final verification
 *              step. NOTE(review): confirm which value it receives when
 *              numVars is 0 and "%n$" specifiers are in use.
 *
 *   Bookkeeping:
 *     - nassign[] counts how many times each variable is assigned. It
 *       starts as the local staticAssign array (STATIC_LIST_SIZE entries)
 *       and is replaced by a req::malloc_noptrs buffer when numVars is
 *       larger, or grown in STATIC_LIST_SIZE steps when objIndex reaches
 *       nspace.
 *     - For "%n$" specifiers objIndex is value - 1 and must lie inside
 *       [0, numVars) when numVars is non-zero. When numVars is 0 the value
 *       is checked against SCAN_MAX_ARGS and xpgSize keeps the largest
 *       value seen.
 *     - Specifiers flagged SCAN_SUPPRESS are not counted in nassign.
 *
 *   Every rejection visible below frees a heap-allocated nassign, reports
 *   through throw_invalid_argument and returns SCAN_ERROR_INVALID_FORMAT.
 *   The reported problems are: mixing "%" and "%n$" specifiers, an
 *   unmatched '[', a bad conversion character, a variable assigned by
 *   multiple "%n$" specifiers, a variable not assigned by any specifier, a
 *   "%n$" index out of range, and differing numbers of variable names and
 *   field specifiers. Callers compare the result with SCAN_SUCCESS.
 */
297 static int ValidateFormat(const char *format
, int numVars
, int *totalSubs
) {
298 #define STATIC_LIST_SIZE 16
299 int gotXpg
, gotSequential
, value
, i
, flags
;
300 const char *end
, *ch
= nullptr;
301 int staticAssign
[STATIC_LIST_SIZE
];
302 int *nassign
= staticAssign
;
303 int objIndex
, xpgSize
, nspace
= STATIC_LIST_SIZE
;
306 * Initialize an array that records the number of times a variable
307 * is assigned to by the format string. We use this to detect if
308 * a variable is multiply assigned or left unassigned.
/* More variables than staticAssign holds: allocate the counters with req::malloc_noptrs. */
310 if (numVars
> nspace
) {
311 nassign
= (int*)req::malloc_noptrs(sizeof(int) * numVars
);
314 for (i
= 0; i
< nspace
; i
++) {
318 xpgSize
= objIndex
= gotXpg
= gotSequential
= 0;
320 while (*format
!= '\0') {
332 flags
|= SCAN_SUPPRESS
;
337 if ( isdigit( (int)*ch
) ) {
339 * Check for an XPG3-style %n$ specification. Note: there
340 * must not be a mixture of XPG3 specs and non-XPG3 specs
341 * in the same format string.
344 value
= strtoul(format
-1, &endptr
, 10);
355 objIndex
= value
- 1;
356 if ((objIndex
< 0) || (numVars
&& (objIndex
>= numVars
))) {
358 } else if (numVars
== 0) {
360 * In the case where no vars are specified, the user can
361 * specify %9999$ legally, so we have to consider special
362 * rules for growing the assign array. 'value' is
363 * guaranteed to be > 0.
366 /* set a lower artificial limit on this
367 * in the interest of security and resource friendliness
368 * 255 arguments should be more than enough. - cc
370 if (value
> SCAN_MAX_ARGS
) {
374 xpgSize
= (xpgSize
> value
) ? xpgSize
: value
;
383 if (nassign
!= staticAssign
) req::free((char *)nassign
);
384 throw_invalid_argument
385 ("format: cannot mix \"%%\" and \"%%n$\" conversion specifiers");
386 return SCAN_ERROR_INVALID_FORMAT
;
391 * Parse any width specifier.
393 if (isdigit(UCHAR(*ch
))) {
395 value
= strtoul(format
-1, &endptr
, 10);
402 * Ignore size specifier.
404 if ((*ch
== 'l') || (*ch
== 'L') || (*ch
== 'h')) {
408 if (!(flags
& SCAN_SUPPRESS
) && numVars
&& (objIndex
>= numVars
)) {
413 * Handle the various field types.
432 /* we differ here with the TCL implementation in allowing for */
433 /* a character width specification, to be more consistent with */
434 /* ANSI. since Zend auto allocates space for vars, this is no */
437 if (flags & SCAN_WIDTH) {
438 throw_invalid_argument
439 ("format: Field width may not be specified in %c conversion");
441 return SCAN_ERROR_INVALID_FORMAT;
446 if (*format
== '\0') {
451 if (*format
== '\0') {
457 if (*format
== '\0') {
463 if (*format
== '\0') {
470 if (nassign
!= staticAssign
) req::free((char *)nassign
);
471 throw_invalid_argument("format: Unmatched [ in format string");
472 return SCAN_ERROR_INVALID_FORMAT
;
475 if (nassign
!= staticAssign
) req::free((char *)nassign
);
476 throw_invalid_argument("Bad scan conversion character \"%c\"", *ch
);
477 return SCAN_ERROR_INVALID_FORMAT
;
480 if (!(flags
& SCAN_SUPPRESS
)) {
481 if (objIndex
>= nspace
) {
483 * Expand the nassign buffer. If we are using XPG specifiers,
484 * make sure that we grow to a large enough size. xpgSize is
485 * guaranteed to be at least one larger than objIndex.
491 nspace
+= STATIC_LIST_SIZE
;
493 if (nassign
== staticAssign
) {
494 nassign
= (int*)req::malloc_noptrs(nspace
* sizeof(int));
495 for (i
= 0; i
< STATIC_LIST_SIZE
; ++i
) {
496 nassign
[i
] = staticAssign
[i
];
500 (int*)req::realloc_noptrs((void *)nassign
, nspace
* sizeof(int));
502 for (i
= value
; i
< nspace
; i
++) {
509 } /* while (*format != '\0') */
512 * Verify that all of the variable were assigned exactly once.
522 *totalSubs
= numVars
;
524 for (i
= 0; i
< numVars
; i
++) {
525 if (nassign
[i
] > 1) {
526 if (nassign
!= staticAssign
) req::free((char *)nassign
);
527 throw_invalid_argument
528 ("format: Variable is assigned by multiple \"%%n$\" specifiers");
529 return SCAN_ERROR_INVALID_FORMAT
;
530 } else if (!xpgSize
&& (nassign
[i
] == 0)) {
532 * If the space is empty, and xpgSize is 0 (means XPG wasn't
533 * used, and/or numVars != 0), then too many vars were given
535 if (nassign
!= staticAssign
) req::free((char *)nassign
);
536 throw_invalid_argument
537 ("format: Variable is not assigned by any conversion specifiers");
538 return SCAN_ERROR_INVALID_FORMAT
;
542 if (nassign
!= staticAssign
) req::free((char *)nassign
);
546 if (nassign
!= staticAssign
) req::free((char *)nassign
);
548 throw_invalid_argument
549 ("format: \"%%n$\" argument index out of range");
551 throw_invalid_argument
552 ("format: Different numbers of variable names and field specifiers");
554 return SCAN_ERROR_INVALID_FORMAT
;
555 #undef STATIC_LIST_SIZE
559 * This is the internal function which does processing on behalf of
560 * both sscanf() and fscanf()
563 * string literal string to be processed
564 * format format string
565 * return_value set with the results of the scan
/*
 * string_sscanf --
 *
 *   Internal function that does the processing on behalf of both sscanf()
 *   and fscanf(): scans `string` according to `format` and stores the
 *   outcome in return_value.
 *
 *   string        Literal string to be processed.
 *   format        Format string; checked with ValidateFormat before any
 *                 scanning starts.
 *   numVars       Number of variables supplied by the caller; forwarded to
 *                 ValidateFormat and scan_set_error_return.
 *   return_value  Receives returnArray on success, or the value stored by
 *                 scan_set_error_return on failure.
 *
 *   Returns SCAN_ERROR_INVALID_FORMAT when ValidateFormat rejects the
 *   format, SCAN_ERROR_EOF when underflow is set and no conversion was
 *   made, and SCAN_SUCCESS at the end of the function otherwise. When fewer
 *   conversions than totalVars were made, the result is not pruned (see the
 *   TODO near the end).
 *
 *   Values appended to returnArray, as visible below:
 *     - an int offset (string - baseString), presumably for the n
 *       conversion -- confirm against the switch labels;
 *     - String copies of [string, end) for whitespace-delimited fields and
 *       for character-set fields (BuildCharSet / CharInSet /
 *       ReleaseCharSet);
 *     - integers parsed from buf through fn (strtol or strtoul); a negative
 *       result under SCAN_UNSIGNED is rendered back to a decimal String
 *       with snprintf;
 *     - doubles parsed from buf with strtod.
 *   Nothing is appended for conversions flagged SCAN_SUPPRESS.
 *
 *   Numeric fields are accumulated in the 64-byte buf, so a width of 0 or a
 *   width above sizeof(buf) - 1 is replaced by sizeof(buf) - 1.
 *
 *   NOTE(review): snprintf is given "%lu" with a (long) argument; an
 *   (unsigned long) cast would match the conversion.
 *   NOTE(review): strtoul is stored in fn after a cast to a pointer type
 *   returning long and is then called through that type -- confirm this is
 *   acceptable on the supported toolchains.
 */
567 int string_sscanf(const char *string
, const char *format
, int numVars
,
568 Variant
&return_value
) {
573 const char *baseString
;
578 long (*fn
)(const char *, char **, int) = nullptr;
582 char buf
[64]; /* Temporary buffer to hold scanned number
583 * strings before they are passed to strtoul() */
588 * Check for errors in the format string.
590 if (ValidateFormat(format
, numVars
, &totalVars
) != SCAN_SUCCESS
) {
591 scan_set_error_return(numVars
, return_value
);
592 return SCAN_ERROR_INVALID_FORMAT
;
598 * Iterate over the format string filling in the result objects until
599 * we reach the end of input, the end of the format string, or there
604 while (*format
!= '\0') {
609 * If we see whitespace in the format, skip whitespace in the string.
611 if ( isspace( (int)*ch
) ) {
613 while ( isspace( (int)sch
) ) {
614 if (*string
== '\0') {
625 if (*string
== '\0') {
643 * Check for assignment suppression ('*') or an XPG3-style
644 * assignment ('%n$').
647 flags
|= SCAN_SUPPRESS
;
649 } else if ( isdigit(UCHAR(*ch
))) {
650 value
= strtoul(format
-1, &end
, 10);
658 * Parse any width specifier.
660 if ( isdigit(UCHAR(*ch
))) {
662 width
= strtoul(format
-1, &endptr
, 10);
670 * Ignore size specifier.
672 if ((*ch
== 'l') || (*ch
== 'L') || (*ch
== 'h')) {
677 * Handle the various field types.
681 if (!(flags
& SCAN_SUPPRESS
)) {
682 returnArray
.append((int)(string
- baseString
));
691 fn
= (long (*)(const char *, char **, int))strtol
;
696 fn
= (long (*)(const char *, char **, int))strtol
;
701 fn
= (long (*)(const char *, char **, int))strtol
;
707 fn
= (long (*)(const char *, char **, int))strtol
;
712 flags
|= SCAN_UNSIGNED
;
713 fn
= (long (*)(const char *, char **, int))strtoul
;
729 flags
|= SCAN_NOSKIP
;
738 flags
|= SCAN_NOSKIP
;
743 * At this point, we will need additional characters from the
746 if (*string
== '\0') {
752 * Skip any leading whitespace at the beginning of a field unless
753 * the format suppresses this behavior.
755 if (!(flags
& SCAN_NOSKIP
)) {
756 while (*string
!= '\0') {
758 if (! isspace((int)sch
) ) {
763 if (*string
== '\0') {
770 * Perform the requested scanning operation.
776 * Scan a string up to width characters or whitespace.
782 while (*end
!= '\0') {
784 if ( isspace( (int)sch
) ) {
792 if (!(flags
& SCAN_SUPPRESS
)) {
793 returnArray
.append(String(string
, end
-string
, CopyString
));
806 format
= BuildCharSet(&cset
, format
);
807 while (*end
!= '\0') {
809 if (!CharInSet(&cset
, (int)sch
)) {
817 ReleaseCharSet(&cset
);
821 * Nothing matched the range, stop processing
825 if (!(flags
& SCAN_SUPPRESS
)) {
826 returnArray
.append(String(string
, end
-string
, CopyString
));
833 * Scan an unsigned or signed integer.
838 if ((width
== 0) || (width
> sizeof(buf
) - 1)) {
839 width
= sizeof(buf
) - 1;
842 flags
|= SCAN_SIGNOK
| SCAN_NODIGITS
| SCAN_NOZERO
;
843 for (end
= buf
; width
> 0; width
--) {
846 * The 0 digit has special meaning at the beginning of
847 * a number. If we are unsure of the base, it
848 * indicates that we are in base 8 or base 16 (if it is
849 * followed by an 'x').
861 if (flags
& SCAN_NOZERO
) {
862 flags
&= ~(SCAN_SIGNOK
| SCAN_NODIGITS
| SCAN_NOZERO
);
864 flags
&= ~(SCAN_SIGNOK
| SCAN_XOK
| SCAN_NODIGITS
);
868 case '1': case '2': case '3': case '4':
869 case '5': case '6': case '7':
873 flags
&= ~(SCAN_SIGNOK
| SCAN_XOK
| SCAN_NODIGITS
);
883 flags
&= ~(SCAN_SIGNOK
| SCAN_XOK
| SCAN_NODIGITS
);
886 case 'A': case 'B': case 'C':
887 case 'D': case 'E': case 'F':
888 case 'a': case 'b': case 'c':
889 case 'd': case 'e': case 'f':
893 flags
&= ~(SCAN_SIGNOK
| SCAN_XOK
| SCAN_NODIGITS
);
897 if (flags
& SCAN_SIGNOK
) {
898 flags
&= ~SCAN_SIGNOK
;
904 if ((flags
& SCAN_XOK
) && (end
== buf
+1)) {
913 * We got an illegal character so we are done accumulating.
919 * Add the character to the temporary buffer.
922 if (*string
== '\0') {
928 * Check to see if we need to back up because we only got a
929 * sign or a trailing x after a 0.
931 if (flags
& SCAN_NODIGITS
) {
932 if (*string
== '\0') {
936 } else if (end
[-1] == 'x' || end
[-1] == 'X') {
942 * Scan the value from the temporary buffer. If we are
943 * returning a large unsigned value, we have to convert it back
944 * to a string since PHP only supports signed values.
946 if (!(flags
& SCAN_SUPPRESS
)) {
948 value
= (int64_t) (*fn
)(buf
, nullptr, base
);
949 if ((flags
& SCAN_UNSIGNED
) && (value
< 0)) {
950 snprintf(buf
, sizeof(buf
), "%lu", (long)value
); /* INTL: ISO digit */
951 returnArray
.append(String(buf
, CopyString
));
953 returnArray
.append(value
);
960 * Scan a floating point number
962 buf
[0] = '\0'; /* call me pedantic */
963 if ((width
== 0) || (width
> sizeof(buf
) - 1)) {
964 width
= sizeof(buf
) - 1;
966 flags
|= SCAN_SIGNOK
| SCAN_NODIGITS
| SCAN_PTOK
| SCAN_EXPOK
;
967 for (end
= buf
; width
> 0; width
--) {
969 case '0': case '1': case '2': case '3':
970 case '4': case '5': case '6': case '7':
972 flags
&= ~(SCAN_SIGNOK
| SCAN_NODIGITS
);
976 if (flags
& SCAN_SIGNOK
) {
977 flags
&= ~SCAN_SIGNOK
;
982 if (flags
& SCAN_PTOK
) {
983 flags
&= ~(SCAN_SIGNOK
| SCAN_PTOK
);
990 * An exponent is not allowed until there has
991 * been at least one digit.
993 if ((flags
& (SCAN_NODIGITS
| SCAN_EXPOK
)) == SCAN_EXPOK
) {
994 flags
= (flags
& ~(SCAN_EXPOK
|SCAN_PTOK
))
995 | SCAN_SIGNOK
| SCAN_NODIGITS
;
1002 * We got an illegal character so we are done accumulating.
1008 * Add the character to the temporary buffer.
1011 if (*string
== '\0') {
1017 * Check to see if we need to back up because we saw a
1018 * trailing 'e' or sign.
1020 if (flags
& SCAN_NODIGITS
) {
1021 if (flags
& SCAN_EXPOK
) {
1023 * There were no digits at all so scanning has
1024 * failed and we are done.
1026 if (*string
== '\0') {
1033 * We got a bad exponent ('e' and maybe a sign).
1037 if (*end
!= 'e' && *end
!= 'E') {
1044 * Scan the value from the temporary buffer.
1046 if (!(flags
& SCAN_SUPPRESS
)) {
1049 dvalue
= strtod(buf
, nullptr);
1050 returnArray
.append(dvalue
);
1055 } /* while (*format != '\0') */
1058 if (underflow
&& (0==nconversions
)) {
1059 scan_set_error_return(numVars
, return_value
);
1060 return SCAN_ERROR_EOF
;
1061 } else if (nconversions
< totalVars
) {
1062 /* TODO: not all elements converted. we need to prune the list - cc */
1064 return_value
= returnArray
;
1065 return SCAN_SUCCESS
;
1068 ///////////////////////////////////////////////////////////////////////////////