Python/pystrtod.c

   1 /* -*- Mode: C; c-file-style: "python" -*- */
   2
   3 #include <Python.h>
   4 #include <locale.h>
   5
   6 /* ascii character tests (as opposed to locale tests) */
   7 #define ISSPACE(c)  ((c) == ' ' || (c) == '\f' || (c) == '\n' || \
   8                      (c) == '\r' || (c) == '\t' || (c) == '\v')
   9 #define ISDIGIT(c)  ((c) >= '0' && (c) <= '9')
  10
  11
  12 /**
  13  * PyOS_ascii_strtod:
  14  * @nptr:    the string to convert to a numeric value.
  15  * @endptr:  if non-%NULL, it returns the character after
  16  *           the last character used in the conversion.
  17  *
  18  * Converts a string to a #gdouble value.
  19  * This function behaves like the standard strtod() function
  20  * does in the C locale. It does this without actually
  21  * changing the current locale, since that would not be
  22  * thread-safe.
  23  *
  24  * This function is typically used when reading configuration
  25  * files or other non-user input that should be locale independent.
  26  * To handle input from the user you should normally use the
  27  * locale-sensitive system strtod() function.
  28  *
  29  * If the correct value would cause overflow, plus or minus %HUGE_VAL
  30  * is returned (according to the sign of the value), and %ERANGE is
  31  * stored in %errno. If the correct value would cause underflow,
  32  * zero is returned and %ERANGE is stored in %errno.
  33  * If memory allocation fails, %ENOMEM is stored in %errno.
  34  *
  35  * This function resets %errno before calling strtod() so that
  36  * you can reliably detect overflow and underflow.
  37  *
  38  * Return value: the #gdouble value.
  39  **/
  40
  41 /*
  42    Use system strtod;  since strtod is locale aware, we may
  43    have to first fix the decimal separator.
  44
  45    Note that unlike _Py_dg_strtod, the system strtod may not always give
  46    correctly rounded results.
  47 */
  48
  49 double
  50 PyOS_ascii_strtod(const char *nptr, char **endptr)
  51 {
  52         char *fail_pos;
  53         double val = -1.0;
  54         struct lconv *locale_data;
  55         const char *decimal_point;
  56         size_t decimal_point_len;
  57         const char *p, *decimal_point_pos;
  58         const char *end = NULL; /* Silence gcc */
  59         const char *digits_pos = NULL;
  60         int negate = 0;
  61
  62         assert(nptr != NULL);
  63
  64         fail_pos = NULL;
  65
  66         locale_data = localeconv();
  67         decimal_point = locale_data->decimal_point;
  68         decimal_point_len = strlen(decimal_point);
  69
  70         assert(decimal_point_len != 0);
  71
  72         decimal_point_pos = NULL;
  73
  74         /* Set errno to zero, so that we can distinguish zero results
  75            and underflows */
  76         errno = 0;
  77
  78         /* We process any leading whitespace and the optional sign manually,
  79            then pass the remainder to the system strtod.  This ensures that
  80            the result of an underflow has the correct sign. (bug #1725)  */
  81
  82         p = nptr;
  83         /* Skip leading space */
  84         while (ISSPACE(*p))
  85                 p++;
  86
  87         /* Process leading sign, if present */
  88         if (*p == '-') {
  89                 negate = 1;
  90                 p++;
  91         }
  92         else if (*p == '+') {
  93                 p++;
  94         }
  95
  96         /* Parse infinities and nans */
  97         if (*p == 'i' || *p == 'I') {
  98                 if (PyOS_strnicmp(p, "inf", 3) == 0) {
  99                         val = Py_HUGE_VAL;
 100                         if (PyOS_strnicmp(p+3, "inity", 5) == 0)
 101                                 fail_pos = (char *)p+8;
 102                         else
 103                                 fail_pos = (char *)p+3;
 104                         goto got_val;
 105                 }
 106                 else
 107                         goto invalid_string;
 108         }
 109 #ifdef Py_NAN
 110         if (*p == 'n' || *p == 'N') {
 111                 if (PyOS_strnicmp(p, "nan", 3) == 0) {
 112                         val = Py_NAN;
 113                         fail_pos = (char *)p+3;
 114                         goto got_val;
 115                 }
 116                 else
 117                         goto invalid_string;
 118         }
 119 #endif
 120
 121         /* Some platform strtods accept hex floats; Python shouldn't (at the
 122            moment), so we check explicitly for strings starting with '0x'. */
 123         if (*p == '0' && (*(p+1) == 'x' || *(p+1) == 'X'))
 124                 goto invalid_string;
 125
 126         /* Check that what's left begins with a digit or decimal point */
 127         if (!ISDIGIT(*p) && *p != '.')
 128                 goto invalid_string;
 129
 130         digits_pos = p;
 131         if (decimal_point[0] != '.' ||
 132             decimal_point[1] != 0)
 133         {
 134                 /* Look for a '.' in the input; if present, it'll need to be
 135                    swapped for the current locale's decimal point before we
 136                    call strtod.  On the other hand, if we find the current
 137                    locale's decimal point then the input is invalid. */
 138                 while (ISDIGIT(*p))
 139                         p++;
 140
 141                 if (*p == '.')
 142                 {
 143                         decimal_point_pos = p++;
 144
 145                         /* locate end of number */
 146                         while (ISDIGIT(*p))
 147                                 p++;
 148
 149                         if (*p == 'e' || *p == 'E')
 150                                 p++;
 151                         if (*p == '+' || *p == '-')
 152                                 p++;
 153                         while (ISDIGIT(*p))
 154                                 p++;
 155                         end = p;
 156                 }
 157                 else if (strncmp(p, decimal_point, decimal_point_len) == 0)
 158                         /* Python bug #1417699 */
 159                         goto invalid_string;
 160                 /* For the other cases, we need not convert the decimal
 161                    point */
 162         }
 163
 164         if (decimal_point_pos) {
 165                 char *copy, *c;
 166                 /* Create a copy of the input, with the '.' converted to the
 167                    locale-specific decimal point */
 168                 copy = (char *)PyMem_MALLOC(end - digits_pos +
 169                                             1 + decimal_point_len);
 170                 if (copy == NULL) {
 171                         if (endptr)
 172                                 *endptr = (char *)nptr;
 173                         errno = ENOMEM;
 174                         return val;
 175                 }
 176
 177                 c = copy;
 178                 memcpy(c, digits_pos, decimal_point_pos - digits_pos);
 179                 c += decimal_point_pos - digits_pos;
 180                 memcpy(c, decimal_point, decimal_point_len);
 181                 c += decimal_point_len;
 182                 memcpy(c, decimal_point_pos + 1,
 183                        end - (decimal_point_pos + 1));
 184                 c += end - (decimal_point_pos + 1);
 185                 *c = 0;
 186
 187                 val = strtod(copy, &fail_pos);
 188
 189                 if (fail_pos)
 190                 {
 191                         if (fail_pos > decimal_point_pos)
 192                                 fail_pos = (char *)digits_pos +
 193                                         (fail_pos - copy) -
 194                                         (decimal_point_len - 1);
 195                         else
 196                                 fail_pos = (char *)digits_pos +
 197                                         (fail_pos - copy);
 198                 }
 199
 200                 PyMem_FREE(copy);
 201
 202         }
 203         else {
 204                 val = strtod(digits_pos, &fail_pos);
 205         }
 206
 207         if (fail_pos == digits_pos)
 208                 goto invalid_string;
 209
 210   got_val:
 211         if (negate && fail_pos != nptr)
 212                 val = -val;
 213
 214         if (endptr)
 215                 *endptr = fail_pos;
 216
 217         return val;
 218
 219   invalid_string:
 220         if (endptr)
 221                 *endptr = (char*)nptr;
 222         errno = EINVAL;
 223         return -1.0;
 224 }
 225
 226 double
 227 PyOS_ascii_atof(const char *nptr)
 228 {
 229         return PyOS_ascii_strtod(nptr, NULL);
 230 }
 231
 232
 233 /* Given a string that may have a decimal point in the current
 234    locale, change it back to a dot.  Since the string cannot get
 235    longer, no need for a maximum buffer size parameter. */
 236 Py_LOCAL_INLINE(void)
 237 change_decimal_from_locale_to_dot(char* buffer)
 238 {
 239         struct lconv *locale_data = localeconv();
 240         const char *decimal_point = locale_data->decimal_point;
 241
 242         if (decimal_point[0] != '.' || decimal_point[1] != 0) {
 243                 size_t decimal_point_len = strlen(decimal_point);
 244
 245                 if (*buffer == '+' || *buffer == '-')
 246                         buffer++;
 247                 while (isdigit(Py_CHARMASK(*buffer)))
 248                         buffer++;
 249                 if (strncmp(buffer, decimal_point, decimal_point_len) == 0) {
 250                         *buffer = '.';
 251                         buffer++;
 252                         if (decimal_point_len > 1) {
 253                                 /* buffer needs to get smaller */
 254                                 size_t rest_len = strlen(buffer +
 255                                                      (decimal_point_len - 1));
 256                                 memmove(buffer,
 257                                         buffer + (decimal_point_len - 1),
 258                                         rest_len);
 259                                 buffer[rest_len] = 0;
 260                         }
 261                 }
 262         }
 263 }
 264
 265
 266 Py_LOCAL_INLINE(void)
 267 ensure_sign(char* buffer, size_t buf_size)
 268 {
 269         Py_ssize_t len;
 270
 271         if (buffer[0] == '-')
 272                 /* Already have a sign. */
 273                 return;
 274
 275         /* Include the trailing 0 byte. */
 276         len = strlen(buffer)+1;
 277         if (len >= buf_size+1)
 278                 /* No room for the sign, don't do anything. */
 279                 return;
 280
 281         memmove(buffer+1, buffer, len);
 282         buffer[0] = '+';
 283 }
 284
 285 /* From the C99 standard, section 7.19.6:
 286 The exponent always contains at least two digits, and only as many more digits
 287 as necessary to represent the exponent.
 288 */
 289 #define MIN_EXPONENT_DIGITS 2
 290
 291 /* Ensure that any exponent, if present, is at least MIN_EXPONENT_DIGITS
 292    in length. */
 293 Py_LOCAL_INLINE(void)
 294 ensure_minumim_exponent_length(char* buffer, size_t buf_size)
 295 {
 296         char *p = strpbrk(buffer, "eE");
 297         if (p && (*(p + 1) == '-' || *(p + 1) == '+')) {
 298                 char *start = p + 2;
 299                 int exponent_digit_cnt = 0;
 300                 int leading_zero_cnt = 0;
 301                 int in_leading_zeros = 1;
 302                 int significant_digit_cnt;
 303
 304                 /* Skip over the exponent and the sign. */
 305                 p += 2;
 306
 307                 /* Find the end of the exponent, keeping track of leading
 308                    zeros. */
 309                 while (*p && isdigit(Py_CHARMASK(*p))) {
 310                         if (in_leading_zeros && *p == '0')
 311                                 ++leading_zero_cnt;
 312                         if (*p != '0')
 313                                 in_leading_zeros = 0;
 314                         ++p;
 315                         ++exponent_digit_cnt;
 316                 }
 317
 318                 significant_digit_cnt = exponent_digit_cnt - leading_zero_cnt;
 319                 if (exponent_digit_cnt == MIN_EXPONENT_DIGITS) {
 320                         /* If there are 2 exactly digits, we're done,
 321                            regardless of what they contain */
 322                 }
 323                 else if (exponent_digit_cnt > MIN_EXPONENT_DIGITS) {
 324                         int extra_zeros_cnt;
 325
 326                         /* There are more than 2 digits in the exponent.  See
 327                            if we can delete some of the leading zeros */
 328                         if (significant_digit_cnt < MIN_EXPONENT_DIGITS)
 329                                 significant_digit_cnt = MIN_EXPONENT_DIGITS;
 330                         extra_zeros_cnt = exponent_digit_cnt -
 331                                 significant_digit_cnt;
 332
 333                         /* Delete extra_zeros_cnt worth of characters from the
 334                            front of the exponent */
 335                         assert(extra_zeros_cnt >= 0);
 336
 337                         /* Add one to significant_digit_cnt to copy the
 338                            trailing 0 byte, thus setting the length */
 339                         memmove(start,
 340                                 start + extra_zeros_cnt,
 341                                 significant_digit_cnt + 1);
 342                 }
 343                 else {
 344                         /* If there are fewer than 2 digits, add zeros
 345                            until there are 2, if there's enough room */
 346                         int zeros = MIN_EXPONENT_DIGITS - exponent_digit_cnt;
 347                         if (start + zeros + exponent_digit_cnt + 1
 348                               < buffer + buf_size) {
 349                                 memmove(start + zeros, start,
 350                                         exponent_digit_cnt + 1);
 351                                 memset(start, '0', zeros);
 352                         }
 353                 }
 354         }
 355 }
 356
 357 /* Ensure that buffer has a decimal point in it.  The decimal point will not
 358    be in the current locale, it will always be '.'. Don't add a decimal if an
 359    exponent is present. */
 360 Py_LOCAL_INLINE(void)
 361 ensure_decimal_point(char* buffer, size_t buf_size)
 362 {
 363         int insert_count = 0;
 364         char* chars_to_insert;
 365
 366         /* search for the first non-digit character */
 367         char *p = buffer;
 368         if (*p == '-' || *p == '+')
 369                 /* Skip leading sign, if present.  I think this could only
 370                    ever be '-', but it can't hurt to check for both. */
 371                 ++p;
 372         while (*p && isdigit(Py_CHARMASK(*p)))
 373                 ++p;
 374
 375         if (*p == '.') {
 376                 if (isdigit(Py_CHARMASK(*(p+1)))) {
 377                         /* Nothing to do, we already have a decimal
 378                            point and a digit after it */
 379                 }
 380                 else {
 381                         /* We have a decimal point, but no following
 382                            digit.  Insert a zero after the decimal. */
 383                         ++p;
 384                         chars_to_insert = "0";
 385                         insert_count = 1;
 386                 }
 387         }
 388         else if (!(*p == 'e' || *p == 'E')) {
 389                 /* Don't add ".0" if we have an exponent. */
 390                 chars_to_insert = ".0";
 391                 insert_count = 2;
 392         }
 393         if (insert_count) {
 394                 size_t buf_len = strlen(buffer);
 395                 if (buf_len + insert_count + 1 >= buf_size) {
 396                         /* If there is not enough room in the buffer
 397                            for the additional text, just skip it.  It's
 398                            not worth generating an error over. */
 399                 }
 400                 else {
 401                         memmove(p + insert_count, p,
 402                                 buffer + strlen(buffer) - p + 1);
 403                         memcpy(p, chars_to_insert, insert_count);
 404                 }
 405         }
 406 }
 407
 408 /* see FORMATBUFLEN in unicodeobject.c */
 409 #define FLOAT_FORMATBUFLEN 120
 410
 411 /**
 412  * _PyOS_ascii_formatd:
 413  * @buffer: A buffer to place the resulting string in
 414  * @buf_size: The length of the buffer.
 415  * @format: The printf()-style format to use for the
 416  *          code to use for converting.
 417  * @d: The #gdouble to convert
 418  *
 419  * Converts a #gdouble to a string, using the '.' as
 420  * decimal point. To format the number you pass in
 421  * a printf()-style format string. Allowed conversion
 422  * specifiers are 'e', 'E', 'f', 'F', 'g', 'G', and 'Z'.
 423  *
 424  * 'Z' is the same as 'g', except it always has a decimal and
 425  *     at least one digit after the decimal.
 426  *
 427  * Return value: The pointer to the buffer with the converted string.
 428  **/
 429 /* DEPRECATED, will be deleted in 2.8 and 3.2 */
 430 PyAPI_FUNC(char *)
 431 PyOS_ascii_formatd(char       *buffer,
 432                    size_t      buf_size,
 433                    const char *format,
 434                    double      d)
 435 {
 436         char format_char;
 437         size_t format_len = strlen(format);
 438
 439         /* Issue 2264: code 'Z' requires copying the format.  'Z' is 'g', but
 440            also with at least one character past the decimal. */
 441         char tmp_format[FLOAT_FORMATBUFLEN];
 442
 443         if (PyErr_WarnEx(PyExc_DeprecationWarning,
 444                          "PyOS_ascii_formatd is deprecated, "
 445                          "use PyOS_double_to_string instead", 1) < 0)
 446                 return NULL;
 447
 448         /* The last character in the format string must be the format char */
 449         format_char = format[format_len - 1];
 450
 451         if (format[0] != '%')
 452                 return NULL;
 453
 454         /* I'm not sure why this test is here.  It's ensuring that the format
 455            string after the first character doesn't have a single quote, a
 456            lowercase l, or a percent. This is the reverse of the commented-out
 457            test about 10 lines ago. */
 458         if (strpbrk(format + 1, "'l%"))
 459                 return NULL;
 460
 461         /* Also curious about this function is that it accepts format strings
 462            like "%xg", which are invalid for floats.  In general, the
 463            interface to this function is not very good, but changing it is
 464            difficult because it's a public API. */
 465
 466         if (!(format_char == 'e' || format_char == 'E' ||
 467               format_char == 'f' || format_char == 'F' ||
 468               format_char == 'g' || format_char == 'G' ||
 469               format_char == 'Z'))
 470                 return NULL;
 471
 472         /* Map 'Z' format_char to 'g', by copying the format string and
 473            replacing the final char with a 'g' */
 474         if (format_char == 'Z') {
 475                 if (format_len + 1 >= sizeof(tmp_format)) {
 476                         /* The format won't fit in our copy.  Error out.  In
 477                            practice, this will never happen and will be
 478                            detected by returning NULL */
 479                         return NULL;
 480                 }
 481                 strcpy(tmp_format, format);
 482                 tmp_format[format_len - 1] = 'g';
 483                 format = tmp_format;
 484         }
 485
 486
 487         /* Have PyOS_snprintf do the hard work */
 488         PyOS_snprintf(buffer, buf_size, format, d);
 489
 490         /* Do various fixups on the return string */
 491
 492         /* Get the current locale, and find the decimal point string.
 493            Convert that string back to a dot. */
 494         change_decimal_from_locale_to_dot(buffer);
 495
 496         /* If an exponent exists, ensure that the exponent is at least
 497            MIN_EXPONENT_DIGITS digits, providing the buffer is large enough
 498            for the extra zeros.  Also, if there are more than
 499            MIN_EXPONENT_DIGITS, remove as many zeros as possible until we get
 500            back to MIN_EXPONENT_DIGITS */
 501         ensure_minumim_exponent_length(buffer, buf_size);
 502
 503         /* If format_char is 'Z', make sure we have at least one character
 504            after the decimal point (and make sure we have a decimal point). */
 505         if (format_char == 'Z')
 506                 ensure_decimal_point(buffer, buf_size);
 507
 508         return buffer;
 509 }
 510
 511 PyAPI_FUNC(void)
 512 _PyOS_double_to_string(char *buf, size_t buf_len, double val,
 513                     char format_code, int precision,
 514                     int flags, int *ptype)
 515 {
 516         char format[32];
 517         int t;
 518         int upper = 0;
 519
 520         if (buf_len < 1) {
 521                 assert(0);
 522                 /* There's no way to signal this error. Just return. */
 523                 return;
 524         }
 525         buf[0] = 0;
 526
 527         /* Validate format_code, and map upper and lower case */
 528         switch (format_code) {
 529         case 'e':          /* exponent */
 530         case 'f':          /* fixed */
 531         case 'g':          /* general */
 532                 break;
 533         case 'E':
 534                 upper = 1;
 535                 format_code = 'e';
 536                 break;
 537         case 'F':
 538                 upper = 1;
 539                 format_code = 'f';
 540                 break;
 541         case 'G':
 542                 upper = 1;
 543                 format_code = 'g';
 544                 break;
 545         case 'r':          /* repr format */
 546                 /* Supplied precision is unused, must be 0. */
 547                 if (precision != 0)
 548                         return;
 549                 precision = 17;
 550                 format_code = 'g';
 551                 break;
 552         case 's':          /* str format */
 553                 /* Supplied precision is unused, must be 0. */
 554                 if (precision != 0)
 555                         return;
 556                 precision = 12;
 557                 format_code = 'g';
 558                 break;
 559         default:
 560                 assert(0);
 561                 return;
 562         }
 563
 564         /* Check for buf too small to fit "-inf". Other buffer too small
 565            conditions are dealt with when converting or formatting finite
 566            numbers. */
 567         if (buf_len < 5) {
 568                 assert(0);
 569                 return;
 570         }
 571
 572         /* Handle nan and inf. */
 573         if (Py_IS_NAN(val)) {
 574                 strcpy(buf, "nan");
 575                 t = Py_DTST_NAN;
 576         } else if (Py_IS_INFINITY(val)) {
 577                 if (copysign(1., val) == 1.)
 578                         strcpy(buf, "inf");
 579                 else
 580                         strcpy(buf, "-inf");
 581                 t = Py_DTST_INFINITE;
 582         } else {
 583                 t = Py_DTST_FINITE;
 584
 585                 /* Build the format string. */
 586                 PyOS_snprintf(format, sizeof(format), "%%%s.%i%c",
 587                               (flags & Py_DTSF_ALT ? "#" : ""), precision,
 588                               format_code);
 589
 590                 /* Have PyOS_snprintf do the hard work. */
 591                 PyOS_snprintf(buf, buf_len, format, val);
 592
 593                 /* Do various fixups on the return string */
 594
 595                 /* Get the current locale, and find the decimal point string.
 596                    Convert that string back to a dot. */
 597                 change_decimal_from_locale_to_dot(buf);
 598
 599                 /* If an exponent exists, ensure that the exponent is at least
 600                    MIN_EXPONENT_DIGITS digits, providing the buffer is large
 601                    enough for the extra zeros.  Also, if there are more than
 602                    MIN_EXPONENT_DIGITS, remove as many zeros as possible until
 603                    we get back to MIN_EXPONENT_DIGITS */
 604                 ensure_minumim_exponent_length(buf, buf_len);
 605
 606                 /* Possibly make sure we have at least one character after the
 607                    decimal point (and make sure we have a decimal point). */
 608                 if (flags & Py_DTSF_ADD_DOT_0)
 609                         ensure_decimal_point(buf, buf_len);
 610         }
 611
 612         /* Add the sign if asked and the result isn't negative. */
 613         if (flags & Py_DTSF_SIGN && buf[0] != '-')
 614                 ensure_sign(buf, buf_len);
 615
 616         if (upper) {
 617                 /* Convert to upper case. */
 618                 char *p;
 619                 for (p = buf; *p; p++)
 620                         *p = toupper(*p);
 621         }
 622
 623         if (ptype)
 624                 *ptype = t;
 625 }
 626
 627
 628 PyAPI_FUNC(char *) PyOS_double_to_string(double val,
 629                                          char format_code,
 630                                          int precision,
 631                                          int flags,
 632                                          int *ptype)
 633 {
 634         char buf[128];
 635         Py_ssize_t len;
 636         char *result;
 637
 638         _PyOS_double_to_string(buf, sizeof(buf), val, format_code, precision,
 639                                flags, ptype);
 640         len = strlen(buf);
 641         if (len == 0) {
 642                 PyErr_BadInternalCall();
 643                 return NULL;
 644         }
 645
 646         /* Add 1 for the trailing 0 byte. */
 647         result = PyMem_Malloc(len + 1);
 648         if (result == NULL) {
 649                 PyErr_NoMemory();
 650                 return NULL;
 651         }
 652         strcpy(result, buf);
 653
 654         return result;
 655 }