Python/pystrtod.c

   1 /* -*- Mode: C; c-file-style: "python" -*- */
   2
   3 #include <Python.h>
   4 #include <locale.h>
   5
   6 /* ascii character tests (as opposed to locale tests) */
   7 #define ISSPACE(c)  ((c) == ' ' || (c) == '\f' || (c) == '\n' || \
   8                      (c) == '\r' || (c) == '\t' || (c) == '\v')
   9 #define ISDIGIT(c)  ((c) >= '0' && (c) <= '9')
  10
  11
  12 /**
  13  * PyOS_ascii_strtod:
  14  * @nptr:    the string to convert to a numeric value.
  15  * @endptr:  if non-%NULL, it returns the character after
  16  *           the last character used in the conversion.
  17  *
  18  * Converts a string to a #gdouble value.
  19  * This function behaves like the standard strtod() function
  20  * does in the C locale. It does this without actually
  21  * changing the current locale, since that would not be
  22  * thread-safe.
  23  *
  24  * This function is typically used when reading configuration
  25  * files or other non-user input that should be locale independent.
  26  * To handle input from the user you should normally use the
  27  * locale-sensitive system strtod() function.
  28  *
  29  * If the correct value would cause overflow, plus or minus %HUGE_VAL
  30  * is returned (according to the sign of the value), and %ERANGE is
  31  * stored in %errno. If the correct value would cause underflow,
  32  * zero is returned and %ERANGE is stored in %errno.
  33  * If memory allocation fails, %ENOMEM is stored in %errno.
  34  *
  35  * This function resets %errno before calling strtod() so that
  36  * you can reliably detect overflow and underflow.
  37  *
  38  * Return value: the #gdouble value.
  39  **/
  40
  41 /*
  42    Use system strtod;  since strtod is locale aware, we may
  43    have to first fix the decimal separator.
  44
  45    Note that unlike _Py_dg_strtod, the system strtod may not always give
  46    correctly rounded results.
  47 */
  48
  49 double
  50 PyOS_ascii_strtod(const char *nptr, char **endptr)
  51 {
  52         char *fail_pos;
  53         double val = -1.0;
  54         struct lconv *locale_data;
  55         const char *decimal_point;
  56         size_t decimal_point_len;
  57         const char *p, *decimal_point_pos;
  58         const char *end = NULL; /* Silence gcc */
  59         const char *digits_pos = NULL;
  60         int negate = 0;
  61
  62         assert(nptr != NULL);
  63
  64         fail_pos = NULL;
  65
  66         locale_data = localeconv();
  67         decimal_point = locale_data->decimal_point;
  68         decimal_point_len = strlen(decimal_point);
  69
  70         assert(decimal_point_len != 0);
  71
  72         decimal_point_pos = NULL;
  73
  74         /* We process any leading whitespace and the optional sign manually,
  75            then pass the remainder to the system strtod.  This ensures that
  76            the result of an underflow has the correct sign. (bug #1725)  */
  77
  78         p = nptr;
  79         /* Skip leading space */
  80         while (ISSPACE(*p))
  81                 p++;
  82
  83         /* Process leading sign, if present */
  84         if (*p == '-') {
  85                 negate = 1;
  86                 p++;
  87         } else if (*p == '+') {
  88                 p++;
  89         }
  90
  91         /* What's left should begin with a digit, a decimal point, or one of
  92            the letters i, I, n, N. It should not begin with 0x or 0X */
  93         if ((!ISDIGIT(*p) &&
  94              *p != '.' && *p != 'i' && *p != 'I' && *p != 'n' && *p != 'N')
  95             ||
  96             (*p == '0' && (p[1] == 'x' || p[1] == 'X')))
  97         {
  98                 if (endptr)
  99                         *endptr = (char*)nptr;
 100                 errno = EINVAL;
 101                 return val;
 102         }
 103         digits_pos = p;
 104
 105         if (decimal_point[0] != '.' ||
 106             decimal_point[1] != 0)
 107         {
 108                 while (ISDIGIT(*p))
 109                         p++;
 110
 111                 if (*p == '.')
 112                 {
 113                         decimal_point_pos = p++;
 114
 115                         while (ISDIGIT(*p))
 116                                 p++;
 117
 118                         if (*p == 'e' || *p == 'E')
 119                                 p++;
 120                         if (*p == '+' || *p == '-')
 121                                 p++;
 122                         while (ISDIGIT(*p))
 123                                 p++;
 124                         end = p;
 125                 }
 126                 else if (strncmp(p, decimal_point, decimal_point_len) == 0)
 127                 {
 128                         /* Python bug #1417699 */
 129                         if (endptr)
 130                                 *endptr = (char*)nptr;
 131                         errno = EINVAL;
 132                         return val;
 133                 }
 134                 /* For the other cases, we need not convert the decimal
 135                    point */
 136         }
 137
 138         /* Set errno to zero, so that we can distinguish zero results
 139            and underflows */
 140         errno = 0;
 141
 142         if (decimal_point_pos)
 143         {
 144                 char *copy, *c;
 145
 146                 /* We need to convert the '.' to the locale specific decimal
 147                    point */
 148                 copy = (char *)PyMem_MALLOC(end - digits_pos +
 149                                             1 + decimal_point_len);
 150                 if (copy == NULL) {
 151                         if (endptr)
 152                                 *endptr = (char *)nptr;
 153                         errno = ENOMEM;
 154                         return val;
 155                 }
 156
 157                 c = copy;
 158                 memcpy(c, digits_pos, decimal_point_pos - digits_pos);
 159                 c += decimal_point_pos - digits_pos;
 160                 memcpy(c, decimal_point, decimal_point_len);
 161                 c += decimal_point_len;
 162                 memcpy(c, decimal_point_pos + 1,
 163                        end - (decimal_point_pos + 1));
 164                 c += end - (decimal_point_pos + 1);
 165                 *c = 0;
 166
 167                 val = strtod(copy, &fail_pos);
 168
 169                 if (fail_pos)
 170                 {
 171                         if (fail_pos > decimal_point_pos)
 172                                 fail_pos = (char *)digits_pos +
 173                                         (fail_pos - copy) -
 174                                         (decimal_point_len - 1);
 175                         else
 176                                 fail_pos = (char *)digits_pos +
 177                                         (fail_pos - copy);
 178                 }
 179
 180                 PyMem_FREE(copy);
 181
 182         }
 183         else {
 184                 val = strtod(digits_pos, &fail_pos);
 185         }
 186
 187         if (fail_pos == digits_pos)
 188                 fail_pos = (char *)nptr;
 189
 190         if (negate && fail_pos != nptr)
 191                 val = -val;
 192
 193         if (endptr)
 194                 *endptr = fail_pos;
 195
 196         return val;
 197 }
 198
 199 double
 200 PyOS_ascii_atof(const char *nptr)
 201 {
 202         return PyOS_ascii_strtod(nptr, NULL);
 203 }
 204
 205
 206 /* Given a string that may have a decimal point in the current
 207    locale, change it back to a dot.  Since the string cannot get
 208    longer, no need for a maximum buffer size parameter. */
 209 Py_LOCAL_INLINE(void)
 210 change_decimal_from_locale_to_dot(char* buffer)
 211 {
 212         struct lconv *locale_data = localeconv();
 213         const char *decimal_point = locale_data->decimal_point;
 214
 215         if (decimal_point[0] != '.' || decimal_point[1] != 0) {
 216                 size_t decimal_point_len = strlen(decimal_point);
 217
 218                 if (*buffer == '+' || *buffer == '-')
 219                         buffer++;
 220                 while (isdigit(Py_CHARMASK(*buffer)))
 221                         buffer++;
 222                 if (strncmp(buffer, decimal_point, decimal_point_len) == 0) {
 223                         *buffer = '.';
 224                         buffer++;
 225                         if (decimal_point_len > 1) {
 226                                 /* buffer needs to get smaller */
 227                                 size_t rest_len = strlen(buffer +
 228                                                      (decimal_point_len - 1));
 229                                 memmove(buffer,
 230                                         buffer + (decimal_point_len - 1),
 231                                         rest_len);
 232                                 buffer[rest_len] = 0;
 233                         }
 234                 }
 235         }
 236 }
 237
 238
 239 /* From the C99 standard, section 7.19.6:
 240 The exponent always contains at least two digits, and only as many more digits
 241 as necessary to represent the exponent.
 242 */
 243 #define MIN_EXPONENT_DIGITS 2
 244
 245 /* Ensure that any exponent, if present, is at least MIN_EXPONENT_DIGITS
 246    in length. */
 247 Py_LOCAL_INLINE(void)
 248 ensure_minumim_exponent_length(char* buffer, size_t buf_size)
 249 {
 250         char *p = strpbrk(buffer, "eE");
 251         if (p && (*(p + 1) == '-' || *(p + 1) == '+')) {
 252                 char *start = p + 2;
 253                 int exponent_digit_cnt = 0;
 254                 int leading_zero_cnt = 0;
 255                 int in_leading_zeros = 1;
 256                 int significant_digit_cnt;
 257
 258                 /* Skip over the exponent and the sign. */
 259                 p += 2;
 260
 261                 /* Find the end of the exponent, keeping track of leading
 262                    zeros. */
 263                 while (*p && isdigit(Py_CHARMASK(*p))) {
 264                         if (in_leading_zeros && *p == '0')
 265                                 ++leading_zero_cnt;
 266                         if (*p != '0')
 267                                 in_leading_zeros = 0;
 268                         ++p;
 269                         ++exponent_digit_cnt;
 270                 }
 271
 272                 significant_digit_cnt = exponent_digit_cnt - leading_zero_cnt;
 273                 if (exponent_digit_cnt == MIN_EXPONENT_DIGITS) {
 274                         /* If there are 2 exactly digits, we're done,
 275                            regardless of what they contain */
 276                 }
 277                 else if (exponent_digit_cnt > MIN_EXPONENT_DIGITS) {
 278                         int extra_zeros_cnt;
 279
 280                         /* There are more than 2 digits in the exponent.  See
 281                            if we can delete some of the leading zeros */
 282                         if (significant_digit_cnt < MIN_EXPONENT_DIGITS)
 283                                 significant_digit_cnt = MIN_EXPONENT_DIGITS;
 284                         extra_zeros_cnt = exponent_digit_cnt -
 285                                 significant_digit_cnt;
 286
 287                         /* Delete extra_zeros_cnt worth of characters from the
 288                            front of the exponent */
 289                         assert(extra_zeros_cnt >= 0);
 290
 291                         /* Add one to significant_digit_cnt to copy the
 292                            trailing 0 byte, thus setting the length */
 293                         memmove(start,
 294                                 start + extra_zeros_cnt,
 295                                 significant_digit_cnt + 1);
 296                 }
 297                 else {
 298                         /* If there are fewer than 2 digits, add zeros
 299                            until there are 2, if there's enough room */
 300                         int zeros = MIN_EXPONENT_DIGITS - exponent_digit_cnt;
 301                         if (start + zeros + exponent_digit_cnt + 1
 302                               < buffer + buf_size) {
 303                                 memmove(start + zeros, start,
 304                                         exponent_digit_cnt + 1);
 305                                 memset(start, '0', zeros);
 306                         }
 307                 }
 308         }
 309 }
 310
 311 /* Ensure that buffer has a decimal point in it.  The decimal point will not
 312    be in the current locale, it will always be '.'. Don't add a decimal if an
 313    exponent is present. */
 314 Py_LOCAL_INLINE(void)
 315 ensure_decimal_point(char* buffer, size_t buf_size)
 316 {
 317         int insert_count = 0;
 318         char* chars_to_insert;
 319
 320         /* search for the first non-digit character */
 321         char *p = buffer;
 322         if (*p == '-' || *p == '+')
 323                 /* Skip leading sign, if present.  I think this could only
 324                    ever be '-', but it can't hurt to check for both. */
 325                 ++p;
 326         while (*p && isdigit(Py_CHARMASK(*p)))
 327                 ++p;
 328
 329         if (*p == '.') {
 330                 if (isdigit(Py_CHARMASK(*(p+1)))) {
 331                         /* Nothing to do, we already have a decimal
 332                            point and a digit after it */
 333                 }
 334                 else {
 335                         /* We have a decimal point, but no following
 336                            digit.  Insert a zero after the decimal. */
 337                         ++p;
 338                         chars_to_insert = "0";
 339                         insert_count = 1;
 340                 }
 341         }
 342         else if (!(*p == 'e' || *p == 'E')) {
 343                 /* Don't add ".0" if we have an exponent. */
 344                 chars_to_insert = ".0";
 345                 insert_count = 2;
 346         }
 347         if (insert_count) {
 348                 size_t buf_len = strlen(buffer);
 349                 if (buf_len + insert_count + 1 >= buf_size) {
 350                         /* If there is not enough room in the buffer
 351                            for the additional text, just skip it.  It's
 352                            not worth generating an error over. */
 353                 }
 354                 else {
 355                         memmove(p + insert_count, p,
 356                                 buffer + strlen(buffer) - p + 1);
 357                         memcpy(p, chars_to_insert, insert_count);
 358                 }
 359         }
 360 }
 361
 362 /* see FORMATBUFLEN in unicodeobject.c */
 363 #define FLOAT_FORMATBUFLEN 120
 364
 365 /**
 366  * PyOS_ascii_formatd:
 367  * @buffer: A buffer to place the resulting string in
 368  * @buf_size: The length of the buffer.
 369  * @format: The printf()-style format to use for the
 370  *          code to use for converting.
 371  * @d: The #gdouble to convert
 372  *
 373  * Converts a #gdouble to a string, using the '.' as
 374  * decimal point. To format the number you pass in
 375  * a printf()-style format string. Allowed conversion
 376  * specifiers are 'e', 'E', 'f', 'F', 'g', 'G', and 'Z'.
 377  *
 378  * 'Z' is the same as 'g', except it always has a decimal and
 379  *     at least one digit after the decimal.
 380  *
 381  * Return value: The pointer to the buffer with the converted string.
 382  **/
 383 char *
 384 PyOS_ascii_formatd(char       *buffer,
 385                    size_t      buf_size,
 386                    const char *format,
 387                    double      d)
 388 {
 389         char format_char;
 390         size_t format_len = strlen(format);
 391
 392         /* Issue 2264: code 'Z' requires copying the format.  'Z' is 'g', but
 393            also with at least one character past the decimal. */
 394         char tmp_format[FLOAT_FORMATBUFLEN];
 395
 396         /* The last character in the format string must be the format char */
 397         format_char = format[format_len - 1];
 398
 399         if (format[0] != '%')
 400                 return NULL;
 401
 402         /* I'm not sure why this test is here.  It's ensuring that the format
 403            string after the first character doesn't have a single quote, a
 404            lowercase l, or a percent. This is the reverse of the commented-out
 405            test about 10 lines ago. */
 406         if (strpbrk(format + 1, "'l%"))
 407                 return NULL;
 408
 409         /* Also curious about this function is that it accepts format strings
 410            like "%xg", which are invalid for floats.  In general, the
 411            interface to this function is not very good, but changing it is
 412            difficult because it's a public API. */
 413
 414         if (!(format_char == 'e' || format_char == 'E' ||
 415               format_char == 'f' || format_char == 'F' ||
 416               format_char == 'g' || format_char == 'G' ||
 417               format_char == 'Z'))
 418                 return NULL;
 419
 420         /* Map 'Z' format_char to 'g', by copying the format string and
 421            replacing the final char with a 'g' */
 422         if (format_char == 'Z') {
 423                 if (format_len + 1 >= sizeof(tmp_format)) {
 424                         /* The format won't fit in our copy.  Error out.  In
 425                            practice, this will never happen and will be
 426                            detected by returning NULL */
 427                         return NULL;
 428                 }
 429                 strcpy(tmp_format, format);
 430                 tmp_format[format_len - 1] = 'g';
 431                 format = tmp_format;
 432         }
 433
 434
 435         /* Have PyOS_snprintf do the hard work */
 436         PyOS_snprintf(buffer, buf_size, format, d);
 437
 438         /* Do various fixups on the return string */
 439
 440         /* Get the current locale, and find the decimal point string.
 441            Convert that string back to a dot. */
 442         change_decimal_from_locale_to_dot(buffer);
 443
 444         /* If an exponent exists, ensure that the exponent is at least
 445            MIN_EXPONENT_DIGITS digits, providing the buffer is large enough
 446            for the extra zeros.  Also, if there are more than
 447            MIN_EXPONENT_DIGITS, remove as many zeros as possible until we get
 448            back to MIN_EXPONENT_DIGITS */
 449         ensure_minumim_exponent_length(buffer, buf_size);
 450
 451         /* If format_char is 'Z', make sure we have at least one character
 452            after the decimal point (and make sure we have a decimal point). */
 453         if (format_char == 'Z')
 454                 ensure_decimal_point(buffer, buf_size);
 455
 456         return buffer;
 457 }
 458
 459 PyAPI_FUNC(char *) PyOS_double_to_string(double val,
 460                                          char format_code,
 461                                          int precision,
 462                                          int flags,
 463                                          int *type)
 464 {
 465         char buf[128];
 466         char format[32];
 467         Py_ssize_t len;
 468         char *result;
 469         char *p;
 470         int t;
 471         int upper = 0;
 472
 473         /* Validate format_code, and map upper and lower case */
 474         switch (format_code) {
 475         case 'e':          /* exponent */
 476         case 'f':          /* fixed */
 477         case 'g':          /* general */
 478                 break;
 479         case 'E':
 480                 upper = 1;
 481                 format_code = 'e';
 482                 break;
 483         case 'F':
 484                 upper = 1;
 485                 format_code = 'f';
 486                 break;
 487         case 'G':
 488                 upper = 1;
 489                 format_code = 'g';
 490                 break;
 491         case 'r':          /* repr format */
 492                 /* Supplied precision is unused, must be 0. */
 493                 if (precision != 0) {
 494                         PyErr_BadInternalCall();
 495                         return NULL;
 496                 }
 497                 precision = 17;
 498                 format_code = 'g';
 499                 break;
 500         case 's':          /* str format */
 501                 /* Supplied precision is unused, must be 0. */
 502                 if (precision != 0) {
 503                         PyErr_BadInternalCall();
 504                         return NULL;
 505                 }
 506                 precision = 12;
 507                 format_code = 'g';
 508                 break;
 509         default:
 510                 PyErr_BadInternalCall();
 511                 return NULL;
 512         }
 513
 514         /* Handle nan and inf. */
 515         if (Py_IS_NAN(val)) {
 516                 strcpy(buf, "nan");
 517                 t = Py_DTST_NAN;
 518         } else if (Py_IS_INFINITY(val)) {
 519                 if (copysign(1., val) == 1.)
 520                         strcpy(buf, "inf");
 521                 else
 522                         strcpy(buf, "-inf");
 523                 t = Py_DTST_INFINITE;
 524         } else {
 525                 t = Py_DTST_FINITE;
 526
 527
 528                 if (flags & Py_DTSF_ADD_DOT_0)
 529                         format_code = 'Z';
 530
 531                 PyOS_snprintf(format, 32, "%%%s.%i%c", (flags & Py_DTSF_ALT ? "#" : ""), precision, format_code);
 532                 PyOS_ascii_formatd(buf, sizeof(buf), format, val);
 533         }
 534
 535         len = strlen(buf);
 536
 537         /* Add 1 for the trailing 0 byte.
 538            Add 1 because we might need to make room for the sign.
 539            */
 540         result = PyMem_Malloc(len + 2);
 541         if (result == NULL) {
 542                 PyErr_NoMemory();
 543                 return NULL;
 544         }
 545         p = result;
 546
 547         /* Add sign when requested.  It's convenient (esp. when formatting
 548          complex numbers) to include a sign even for inf and nan. */
 549         if (flags & Py_DTSF_SIGN && buf[0] != '-')
 550                 *p++ = '+';
 551
 552         strcpy(p, buf);
 553
 554         if (upper) {
 555                 /* Convert to upper case. */
 556                 char *p1;
 557                 for (p1 = p; *p1; p1++)
 558                         *p1 = toupper(*p1);
 559         }
 560
 561         if (type)
 562                 *type = t;
 563         return result;
 564 }