Python/pystrtod.c

   1 /* -*- Mode: C; c-file-style: "python" -*- */
   2
   3 #include <Python.h>
   4 #include <locale.h>
   5
/* ascii character tests (as opposed to locale tests) */
/* Both macros expand their argument more than once, so the argument
   must be free of side effects (no `*p++` and the like). */
#define ISSPACE(c)  ((c) == ' ' || (c) == '\f' || (c) == '\n' || \
                     (c) == '\r' || (c) == '\t' || (c) == '\v')
#define ISDIGIT(c)  ((c) >= '0' && (c) <= '9')
  10
  11
  12 /**
  13  * PyOS_ascii_strtod:
  14  * @nptr:    the string to convert to a numeric value.
  15  * @endptr:  if non-%NULL, it returns the character after
  16  *           the last character used in the conversion.
  17  *
  18  * Converts a string to a #gdouble value.
  19  * This function behaves like the standard strtod() function
  20  * does in the C locale. It does this without actually
  21  * changing the current locale, since that would not be
  22  * thread-safe.
  23  *
  24  * This function is typically used when reading configuration
  25  * files or other non-user input that should be locale independent.
  26  * To handle input from the user you should normally use the
  27  * locale-sensitive system strtod() function.
  28  *
  29  * If the correct value would cause overflow, plus or minus %HUGE_VAL
  30  * is returned (according to the sign of the value), and %ERANGE is
  31  * stored in %errno. If the correct value would cause underflow,
  32  * zero is returned and %ERANGE is stored in %errno.
  33  * If memory allocation fails, %ENOMEM is stored in %errno.
  34  *
  35  * This function resets %errno before calling strtod() so that
  36  * you can reliably detect overflow and underflow.
  37  *
  38  * Return value: the #gdouble value.
  39  **/
  40 double
  41 PyOS_ascii_strtod(const char *nptr, char **endptr)
  42 {
  43         char *fail_pos;
  44         double val = -1.0;
  45         struct lconv *locale_data;
  46         const char *decimal_point;
  47         size_t decimal_point_len;
  48         const char *p, *decimal_point_pos;
  49         const char *end = NULL; /* Silence gcc */
  50         const char *digits_pos = NULL;
  51         int negate = 0;
  52
  53         assert(nptr != NULL);
  54
  55         fail_pos = NULL;
  56
  57         locale_data = localeconv();
  58         decimal_point = locale_data->decimal_point;
  59         decimal_point_len = strlen(decimal_point);
  60
  61         assert(decimal_point_len != 0);
  62
  63         decimal_point_pos = NULL;
  64
  65         /* We process any leading whitespace and the optional sign manually,
  66            then pass the remainder to the system strtod.  This ensures that
  67            the result of an underflow has the correct sign. (bug #1725)  */
  68
  69         p = nptr;
  70         /* Skip leading space */
  71         while (ISSPACE(*p))
  72                 p++;
  73
  74         /* Process leading sign, if present */
  75         if (*p == '-') {
  76                 negate = 1;
  77                 p++;
  78         } else if (*p == '+') {
  79                 p++;
  80         }
  81
  82         /* What's left should begin with a digit, a decimal point, or one of
  83            the letters i, I, n, N. It should not begin with 0x or 0X */
  84         if ((!ISDIGIT(*p) &&
  85              *p != '.' && *p != 'i' && *p != 'I' && *p != 'n' && *p != 'N')
  86             ||
  87             (*p == '0' && (p[1] == 'x' || p[1] == 'X')))
  88         {
  89                 if (endptr)
  90                         *endptr = (char*)nptr;
  91                 errno = EINVAL;
  92                 return val;
  93         }
  94         digits_pos = p;
  95
  96         if (decimal_point[0] != '.' ||
  97             decimal_point[1] != 0)
  98         {
  99                 while (ISDIGIT(*p))
 100                         p++;
 101
 102                 if (*p == '.')
 103                 {
 104                         decimal_point_pos = p++;
 105
 106                         while (ISDIGIT(*p))
 107                                 p++;
 108
 109                         if (*p == 'e' || *p == 'E')
 110                                 p++;
 111                         if (*p == '+' || *p == '-')
 112                                 p++;
 113                         while (ISDIGIT(*p))
 114                                 p++;
 115                         end = p;
 116                 }
 117                 else if (strncmp(p, decimal_point, decimal_point_len) == 0)
 118                 {
 119                         /* Python bug #1417699 */
 120                         if (endptr)
 121                                 *endptr = (char*)nptr;
 122                         errno = EINVAL;
 123                         return val;
 124                 }
 125                 /* For the other cases, we need not convert the decimal
 126                    point */
 127         }
 128
 129         /* Set errno to zero, so that we can distinguish zero results
 130            and underflows */
 131         errno = 0;
 132
 133         if (decimal_point_pos)
 134         {
 135                 char *copy, *c;
 136
 137                 /* We need to convert the '.' to the locale specific decimal
 138                    point */
 139                 copy = (char *)PyMem_MALLOC(end - digits_pos +
 140                                             1 + decimal_point_len);
 141                 if (copy == NULL) {
 142                         if (endptr)
 143                                 *endptr = (char *)nptr;
 144                         errno = ENOMEM;
 145                         return val;
 146                 }
 147
 148                 c = copy;
 149                 memcpy(c, digits_pos, decimal_point_pos - digits_pos);
 150                 c += decimal_point_pos - digits_pos;
 151                 memcpy(c, decimal_point, decimal_point_len);
 152                 c += decimal_point_len;
 153                 memcpy(c, decimal_point_pos + 1,
 154                        end - (decimal_point_pos + 1));
 155                 c += end - (decimal_point_pos + 1);
 156                 *c = 0;
 157
 158                 val = strtod(copy, &fail_pos);
 159
 160                 if (fail_pos)
 161                 {
 162                         if (fail_pos > decimal_point_pos)
 163                                 fail_pos = (char *)digits_pos +
 164                                         (fail_pos - copy) -
 165                                         (decimal_point_len - 1);
 166                         else
 167                                 fail_pos = (char *)digits_pos +
 168                                         (fail_pos - copy);
 169                 }
 170
 171                 PyMem_FREE(copy);
 172
 173         }
 174         else {
 175                 val = strtod(digits_pos, &fail_pos);
 176         }
 177
 178         if (fail_pos == digits_pos)
 179                 fail_pos = (char *)nptr;
 180
 181         if (negate && fail_pos != nptr)
 182                 val = -val;
 183
 184         if (endptr)
 185                 *endptr = fail_pos;
 186
 187         return val;
 188 }
 189
 190 /* Given a string that may have a decimal point in the current
 191    locale, change it back to a dot.  Since the string cannot get
 192    longer, no need for a maximum buffer size parameter. */
 193 Py_LOCAL_INLINE(void)
 194 change_decimal_from_locale_to_dot(char* buffer)
 195 {
 196         struct lconv *locale_data = localeconv();
 197         const char *decimal_point = locale_data->decimal_point;
 198
 199         if (decimal_point[0] != '.' || decimal_point[1] != 0) {
 200                 size_t decimal_point_len = strlen(decimal_point);
 201
 202                 if (*buffer == '+' || *buffer == '-')
 203                         buffer++;
 204                 while (isdigit(Py_CHARMASK(*buffer)))
 205                         buffer++;
 206                 if (strncmp(buffer, decimal_point, decimal_point_len) == 0) {
 207                         *buffer = '.';
 208                         buffer++;
 209                         if (decimal_point_len > 1) {
 210                                 /* buffer needs to get smaller */
 211                                 size_t rest_len = strlen(buffer +
 212                                                      (decimal_point_len - 1));
 213                                 memmove(buffer,
 214                                         buffer + (decimal_point_len - 1),
 215                                         rest_len);
 216                                 buffer[rest_len] = 0;
 217                         }
 218                 }
 219         }
 220 }
 221
 222
/* From the C99 standard, section 7.19.6:
The exponent always contains at least two digits, and only as many more digits
as necessary to represent the exponent.

MIN_EXPONENT_DIGITS is the digit count that
ensure_minumim_exponent_length() pads or trims exponents towards.
*/
#define MIN_EXPONENT_DIGITS 2
 228
 229 /* Ensure that any exponent, if present, is at least MIN_EXPONENT_DIGITS
 230    in length. */
 231 Py_LOCAL_INLINE(void)
 232 ensure_minumim_exponent_length(char* buffer, size_t buf_size)
 233 {
 234         char *p = strpbrk(buffer, "eE");
 235         if (p && (*(p + 1) == '-' || *(p + 1) == '+')) {
 236                 char *start = p + 2;
 237                 int exponent_digit_cnt = 0;
 238                 int leading_zero_cnt = 0;
 239                 int in_leading_zeros = 1;
 240                 int significant_digit_cnt;
 241
 242                 /* Skip over the exponent and the sign. */
 243                 p += 2;
 244
 245                 /* Find the end of the exponent, keeping track of leading
 246                    zeros. */
 247                 while (*p && isdigit(Py_CHARMASK(*p))) {
 248                         if (in_leading_zeros && *p == '0')
 249                                 ++leading_zero_cnt;
 250                         if (*p != '0')
 251                                 in_leading_zeros = 0;
 252                         ++p;
 253                         ++exponent_digit_cnt;
 254                 }
 255
 256                 significant_digit_cnt = exponent_digit_cnt - leading_zero_cnt;
 257                 if (exponent_digit_cnt == MIN_EXPONENT_DIGITS) {
 258                         /* If there are 2 exactly digits, we're done,
 259                            regardless of what they contain */
 260                 }
 261                 else if (exponent_digit_cnt > MIN_EXPONENT_DIGITS) {
 262                         int extra_zeros_cnt;
 263
 264                         /* There are more than 2 digits in the exponent.  See
 265                            if we can delete some of the leading zeros */
 266                         if (significant_digit_cnt < MIN_EXPONENT_DIGITS)
 267                                 significant_digit_cnt = MIN_EXPONENT_DIGITS;
 268                         extra_zeros_cnt = exponent_digit_cnt -
 269                                 significant_digit_cnt;
 270
 271                         /* Delete extra_zeros_cnt worth of characters from the
 272                            front of the exponent */
 273                         assert(extra_zeros_cnt >= 0);
 274
 275                         /* Add one to significant_digit_cnt to copy the
 276                            trailing 0 byte, thus setting the length */
 277                         memmove(start,
 278                                 start + extra_zeros_cnt,
 279                                 significant_digit_cnt + 1);
 280                 }
 281                 else {
 282                         /* If there are fewer than 2 digits, add zeros
 283                            until there are 2, if there's enough room */
 284                         int zeros = MIN_EXPONENT_DIGITS - exponent_digit_cnt;
 285                         if (start + zeros + exponent_digit_cnt + 1
 286                               < buffer + buf_size) {
 287                                 memmove(start + zeros, start,
 288                                         exponent_digit_cnt + 1);
 289                                 memset(start, '0', zeros);
 290                         }
 291                 }
 292         }
 293 }
 294
 295 /* Ensure that buffer has a decimal point in it.  The decimal point
 296    will not be in the current locale, it will always be '.' */
 297 Py_LOCAL_INLINE(void)
 298 ensure_decimal_point(char* buffer, size_t buf_size)
 299 {
 300         int insert_count = 0;
 301         char* chars_to_insert;
 302
 303         /* search for the first non-digit character */
 304         char *p = buffer;
 305         if (*p == '-' || *p == '+')
 306                 /* Skip leading sign, if present.  I think this could only
 307                    ever be '-', but it can't hurt to check for both. */
 308                 ++p;
 309         while (*p && isdigit(Py_CHARMASK(*p)))
 310                 ++p;
 311
 312         if (*p == '.') {
 313                 if (isdigit(Py_CHARMASK(*(p+1)))) {
 314                         /* Nothing to do, we already have a decimal
 315                            point and a digit after it */
 316                 }
 317                 else {
 318                         /* We have a decimal point, but no following
 319                            digit.  Insert a zero after the decimal. */
 320                         ++p;
 321                         chars_to_insert = "0";
 322                         insert_count = 1;
 323                 }
 324         }
 325         else {
 326                 chars_to_insert = ".0";
 327                 insert_count = 2;
 328         }
 329         if (insert_count) {
 330                 size_t buf_len = strlen(buffer);
 331                 if (buf_len + insert_count + 1 >= buf_size) {
 332                         /* If there is not enough room in the buffer
 333                            for the additional text, just skip it.  It's
 334                            not worth generating an error over. */
 335                 }
 336                 else {
 337                         memmove(p + insert_count, p,
 338                                 buffer + strlen(buffer) - p + 1);
 339                         memcpy(p, chars_to_insert, insert_count);
 340                 }
 341         }
 342 }
 343
 344 /* Add the locale specific grouping characters to buffer.  Note
 345    that any decimal point (if it's present) in buffer is already
 346    locale-specific.  Return 0 on error, else 1. */
 347 Py_LOCAL_INLINE(int)
 348 add_thousands_grouping(char* buffer, size_t buf_size)
 349 {
 350         Py_ssize_t len = strlen(buffer);
 351         struct lconv *locale_data = localeconv();
 352         const char *decimal_point = locale_data->decimal_point;
 353
 354         /* Find the decimal point, if any.  We're only concerned
 355            about the characters to the left of the decimal when
 356            adding grouping. */
 357         char *p = strstr(buffer, decimal_point);
 358         if (!p) {
 359                 /* No decimal, use the entire string. */
 360
 361                 /* If any exponent, adjust p. */
 362                 p = strpbrk(buffer, "eE");
 363                 if (!p)
 364                         /* No exponent and no decimal.  Use the entire
 365                            string. */
 366                         p = buffer + len;
 367         }
 368         /* At this point, p points just past the right-most character we
 369            want to format.  We need to add the grouping string for the
 370            characters between buffer and p. */
 371         return _PyString_InsertThousandsGrouping(buffer, len, p-buffer,
 372                                                  buf_size, NULL, 1);
 373 }
 374
/* see FORMATBUFLEN in unicodeobject.c */
/* Size of the local array in which PyOS_ascii_formatd() keeps its
   modified copy of the format string. */
#define FLOAT_FORMATBUFLEN 120
 377
 378 /**
 379  * PyOS_ascii_formatd:
 380  * @buffer: A buffer to place the resulting string in
 381  * @buf_size: The length of the buffer.
 382  * @format: The printf()-style format to use for the
 383  *          code to use for converting.
 384  * @d: The #gdouble to convert
 385  *
 386  * Converts a #gdouble to a string, using the '.' as
 387  * decimal point. To format the number you pass in
 388  * a printf()-style format string. Allowed conversion
 389  * specifiers are 'e', 'E', 'f', 'F', 'g', 'G', and 'n'.
 390  *
 391  * 'n' is the same as 'g', except it uses the current locale.
 392  * 'Z' is the same as 'g', except it always has a decimal and
 393  *     at least one digit after the decimal.
 394  *
 395  * Return value: The pointer to the buffer with the converted string.
 396  **/
 397 char *
 398 PyOS_ascii_formatd(char       *buffer,
 399                    size_t      buf_size,
 400                    const char *format,
 401                    double      d)
 402 {
 403         char format_char;
 404         size_t format_len = strlen(format);
 405
 406         /* For type 'n', we need to make a copy of the format string, because
 407            we're going to modify 'n' -> 'g', and format is const char*, so we
 408            can't modify it directly.  FLOAT_FORMATBUFLEN should be longer than
 409            we ever need this to be.  There's an upcoming check to ensure it's
 410            big enough. */
 411         /* Issue 2264: code 'Z' requires copying the format.  'Z' is 'g', but
 412            also with at least one character past the decimal. */
 413         char tmp_format[FLOAT_FORMATBUFLEN];
 414
 415         /* The last character in the format string must be the format char */
 416         format_char = format[format_len - 1];
 417
 418         if (format[0] != '%')
 419                 return NULL;
 420
 421         /* I'm not sure why this test is here.  It's ensuring that the format
 422            string after the first character doesn't have a single quote, a
 423            lowercase l, or a percent. This is the reverse of the commented-out
 424            test about 10 lines ago. */
 425         if (strpbrk(format + 1, "'l%"))
 426                 return NULL;
 427
 428         /* Also curious about this function is that it accepts format strings
 429            like "%xg", which are invalid for floats.  In general, the
 430            interface to this function is not very good, but changing it is
 431            difficult because it's a public API. */
 432
 433         if (!(format_char == 'e' || format_char == 'E' ||
 434               format_char == 'f' || format_char == 'F' ||
 435               format_char == 'g' || format_char == 'G' ||
 436               format_char == 'n' || format_char == 'Z'))
 437                 return NULL;
 438
 439         /* Map 'n' or 'Z' format_char to 'g', by copying the format string and
 440            replacing the final char with a 'g' */
 441         if (format_char == 'n' || format_char == 'Z') {
 442                 if (format_len + 1 >= sizeof(tmp_format)) {
 443                         /* The format won't fit in our copy.  Error out.  In
 444                            practice, this will never happen and will be
 445                            detected by returning NULL */
 446                         return NULL;
 447                 }
 448                 strcpy(tmp_format, format);
 449                 tmp_format[format_len - 1] = 'g';
 450                 format = tmp_format;
 451         }
 452
 453
 454         /* Have PyOS_snprintf do the hard work */
 455         PyOS_snprintf(buffer, buf_size, format, d);
 456
 457         /* Do various fixups on the return string */
 458
 459         /* Get the current locale, and find the decimal point string.
 460            Convert that string back to a dot.  Do not do this if using the
 461            'n' (number) format code, since we want to keep the localized
 462            decimal point in that case. */
 463         if (format_char != 'n')
 464                 change_decimal_from_locale_to_dot(buffer);
 465
 466         /* If an exponent exists, ensure that the exponent is at least
 467            MIN_EXPONENT_DIGITS digits, providing the buffer is large enough
 468            for the extra zeros.  Also, if there are more than
 469            MIN_EXPONENT_DIGITS, remove as many zeros as possible until we get
 470            back to MIN_EXPONENT_DIGITS */
 471         ensure_minumim_exponent_length(buffer, buf_size);
 472
 473         /* If format_char is 'Z', make sure we have at least one character
 474            after the decimal point (and make sure we have a decimal point). */
 475         if (format_char == 'Z')
 476                 ensure_decimal_point(buffer, buf_size);
 477
 478         /* If format_char is 'n', add the thousands grouping. */
 479         if (format_char == 'n')
 480                 if (!add_thousands_grouping(buffer, buf_size))
 481                         return NULL;
 482
 483         return buffer;
 484 }
 485
 486 double
 487 PyOS_ascii_atof(const char *nptr)
 488 {
 489         return PyOS_ascii_strtod(nptr, NULL);
 490 }