Python/pystrtod.c

   1 /* -*- Mode: C; c-file-style: "python" -*- */
   2
   3 #include <Python.h>
   4 #include <locale.h>
   5
   6 /* ascii character tests (as opposed to locale tests) */
   7 #define ISSPACE(c)  ((c) == ' ' || (c) == '\f' || (c) == '\n' || \
   8                      (c) == '\r' || (c) == '\t' || (c) == '\v')
   9 #define ISDIGIT(c)  ((c) >= '0' && (c) <= '9')
  10
  11
  12 /**
  13  * PyOS_ascii_strtod:
  14  * @nptr:    the string to convert to a numeric value.
  15  * @endptr:  if non-%NULL, it returns the character after
  16  *           the last character used in the conversion.
  17  *
  18  * Converts a string to a #gdouble value.
  19  * This function behaves like the standard strtod() function
  20  * does in the C locale. It does this without actually
  21  * changing the current locale, since that would not be
  22  * thread-safe.
  23  *
  24  * This function is typically used when reading configuration
  25  * files or other non-user input that should be locale independent.
  26  * To handle input from the user you should normally use the
  27  * locale-sensitive system strtod() function.
  28  *
  29  * If the correct value would cause overflow, plus or minus %HUGE_VAL
  30  * is returned (according to the sign of the value), and %ERANGE is
  31  * stored in %errno. If the correct value would cause underflow,
  32  * zero is returned and %ERANGE is stored in %errno.
  33  * If memory allocation fails, %ENOMEM is stored in %errno.
  34  *
  35  * This function resets %errno before calling strtod() so that
  36  * you can reliably detect overflow and underflow.
  37  *
  38  * Return value: the #gdouble value.
  39  **/
  40 double
  41 PyOS_ascii_strtod(const char *nptr, char **endptr)
  42 {
  43         char *fail_pos;
  44         double val = -1.0;
  45         struct lconv *locale_data;
  46         const char *decimal_point;
  47         size_t decimal_point_len;
  48         const char *p, *decimal_point_pos;
  49         const char *end = NULL; /* Silence gcc */
  50         const char *digits_pos = NULL;
  51         int negate = 0;
  52
  53         assert(nptr != NULL);
  54
  55         fail_pos = NULL;
  56
  57         locale_data = localeconv();
  58         decimal_point = locale_data->decimal_point;
  59         decimal_point_len = strlen(decimal_point);
  60
  61         assert(decimal_point_len != 0);
  62
  63         decimal_point_pos = NULL;
  64
  65         /* We process any leading whitespace and the optional sign manually,
  66            then pass the remainder to the system strtod.  This ensures that
  67            the result of an underflow has the correct sign. (bug #1725)  */
  68
  69         p = nptr;
  70         /* Skip leading space */
  71         while (ISSPACE(*p))
  72                 p++;
  73
  74         /* Process leading sign, if present */
  75         if (*p == '-') {
  76                 negate = 1;
  77                 p++;
  78         } else if (*p == '+') {
  79                 p++;
  80         }
  81
  82         /* What's left should begin with a digit, a decimal point, or one of
  83            the letters i, I, n, N. It should not begin with 0x or 0X */
  84         if ((!ISDIGIT(*p) &&
  85              *p != '.' && *p != 'i' && *p != 'I' && *p != 'n' && *p != 'N')
  86             ||
  87             (*p == '0' && (p[1] == 'x' || p[1] == 'X')))
  88         {
  89                 if (endptr)
  90                         *endptr = (char*)nptr;
  91                 errno = EINVAL;
  92                 return val;
  93         }
  94         digits_pos = p;
  95
  96         if (decimal_point[0] != '.' ||
  97             decimal_point[1] != 0)
  98         {
  99                 while (ISDIGIT(*p))
 100                         p++;
 101
 102                 if (*p == '.')
 103                 {
 104                         decimal_point_pos = p++;
 105
 106                         while (ISDIGIT(*p))
 107                                 p++;
 108
 109                         if (*p == 'e' || *p == 'E')
 110                                 p++;
 111                         if (*p == '+' || *p == '-')
 112                                 p++;
 113                         while (ISDIGIT(*p))
 114                                 p++;
 115                         end = p;
 116                 }
 117                 else if (strncmp(p, decimal_point, decimal_point_len) == 0)
 118                 {
 119                         /* Python bug #1417699 */
 120                         if (endptr)
 121                                 *endptr = (char*)nptr;
 122                         errno = EINVAL;
 123                         return val;
 124                 }
 125                 /* For the other cases, we need not convert the decimal
 126                    point */
 127         }
 128
 129         /* Set errno to zero, so that we can distinguish zero results
 130            and underflows */
 131         errno = 0;
 132
 133         if (decimal_point_pos)
 134         {
 135                 char *copy, *c;
 136
 137                 /* We need to convert the '.' to the locale specific decimal
 138                    point */
 139                 copy = (char *)PyMem_MALLOC(end - digits_pos +
 140                                             1 + decimal_point_len);
 141                 if (copy == NULL) {
 142                         if (endptr)
 143                                 *endptr = (char *)nptr;
 144                         errno = ENOMEM;
 145                         return val;
 146                 }
 147
 148                 c = copy;
 149                 memcpy(c, digits_pos, decimal_point_pos - digits_pos);
 150                 c += decimal_point_pos - digits_pos;
 151                 memcpy(c, decimal_point, decimal_point_len);
 152                 c += decimal_point_len;
 153                 memcpy(c, decimal_point_pos + 1,
 154                        end - (decimal_point_pos + 1));
 155                 c += end - (decimal_point_pos + 1);
 156                 *c = 0;
 157
 158                 val = strtod(copy, &fail_pos);
 159
 160                 if (fail_pos)
 161                 {
 162                         if (fail_pos > decimal_point_pos)
 163                                 fail_pos = (char *)digits_pos +
 164                                         (fail_pos - copy) -
 165                                         (decimal_point_len - 1);
 166                         else
 167                                 fail_pos = (char *)digits_pos +
 168                                         (fail_pos - copy);
 169                 }
 170
 171                 PyMem_FREE(copy);
 172
 173         }
 174         else {
 175                 val = strtod(digits_pos, &fail_pos);
 176         }
 177
 178         if (fail_pos == digits_pos)
 179                 fail_pos = (char *)nptr;
 180
 181         if (negate && fail_pos != nptr)
 182                 val = -val;
 183
 184         if (endptr)
 185                 *endptr = fail_pos;
 186
 187         return val;
 188 }
 189
 190 /* Given a string that may have a decimal point in the current
 191    locale, change it back to a dot.  Since the string cannot get
 192    longer, no need for a maximum buffer size parameter. */
 193 Py_LOCAL_INLINE(void)
 194 change_decimal_from_locale_to_dot(char* buffer)
 195 {
 196         struct lconv *locale_data = localeconv();
 197         const char *decimal_point = locale_data->decimal_point;
 198
 199         if (decimal_point[0] != '.' || decimal_point[1] != 0) {
 200                 size_t decimal_point_len = strlen(decimal_point);
 201
 202                 if (*buffer == '+' || *buffer == '-')
 203                         buffer++;
 204                 while (isdigit(Py_CHARMASK(*buffer)))
 205                         buffer++;
 206                 if (strncmp(buffer, decimal_point, decimal_point_len) == 0) {
 207                         *buffer = '.';
 208                         buffer++;
 209                         if (decimal_point_len > 1) {
 210                                 /* buffer needs to get smaller */
 211                                 size_t rest_len = strlen(buffer +
 212                                                      (decimal_point_len - 1));
 213                                 memmove(buffer,
 214                                         buffer + (decimal_point_len - 1),
 215                                         rest_len);
 216                                 buffer[rest_len] = 0;
 217                         }
 218                 }
 219         }
 220 }
 221
 222
 223 /* From the C99 standard, section 7.19.6:
 224 The exponent always contains at least two digits, and only as many more digits
 225 as necessary to represent the exponent.
 226 */
 227 #define MIN_EXPONENT_DIGITS 2
 228
 229 /* Ensure that any exponent, if present, is at least MIN_EXPONENT_DIGITS
 230    in length. */
 231 Py_LOCAL_INLINE(void)
 232 ensure_minumim_exponent_length(char* buffer, size_t buf_size)
 233 {
 234         char *p = strpbrk(buffer, "eE");
 235         if (p && (*(p + 1) == '-' || *(p + 1) == '+')) {
 236                 char *start = p + 2;
 237                 int exponent_digit_cnt = 0;
 238                 int leading_zero_cnt = 0;
 239                 int in_leading_zeros = 1;
 240                 int significant_digit_cnt;
 241
 242                 /* Skip over the exponent and the sign. */
 243                 p += 2;
 244
 245                 /* Find the end of the exponent, keeping track of leading
 246                    zeros. */
 247                 while (*p && isdigit(Py_CHARMASK(*p))) {
 248                         if (in_leading_zeros && *p == '0')
 249                                 ++leading_zero_cnt;
 250                         if (*p != '0')
 251                                 in_leading_zeros = 0;
 252                         ++p;
 253                         ++exponent_digit_cnt;
 254                 }
 255
 256                 significant_digit_cnt = exponent_digit_cnt - leading_zero_cnt;
 257                 if (exponent_digit_cnt == MIN_EXPONENT_DIGITS) {
 258                         /* If there are 2 exactly digits, we're done,
 259                            regardless of what they contain */
 260                 }
 261                 else if (exponent_digit_cnt > MIN_EXPONENT_DIGITS) {
 262                         int extra_zeros_cnt;
 263
 264                         /* There are more than 2 digits in the exponent.  See
 265                            if we can delete some of the leading zeros */
 266                         if (significant_digit_cnt < MIN_EXPONENT_DIGITS)
 267                                 significant_digit_cnt = MIN_EXPONENT_DIGITS;
 268                         extra_zeros_cnt = exponent_digit_cnt -
 269                                 significant_digit_cnt;
 270
 271                         /* Delete extra_zeros_cnt worth of characters from the
 272                            front of the exponent */
 273                         assert(extra_zeros_cnt >= 0);
 274
 275                         /* Add one to significant_digit_cnt to copy the
 276                            trailing 0 byte, thus setting the length */
 277                         memmove(start,
 278                                 start + extra_zeros_cnt,
 279                                 significant_digit_cnt + 1);
 280                 }
 281                 else {
 282                         /* If there are fewer than 2 digits, add zeros
 283                            until there are 2, if there's enough room */
 284                         int zeros = MIN_EXPONENT_DIGITS - exponent_digit_cnt;
 285                         if (start + zeros + exponent_digit_cnt + 1
 286                               < buffer + buf_size) {
 287                                 memmove(start + zeros, start,
 288                                         exponent_digit_cnt + 1);
 289                                 memset(start, '0', zeros);
 290                         }
 291                 }
 292         }
 293 }
 294
 295 /* Ensure that buffer has a decimal point in it.  The decimal point
 296    will not be in the current locale, it will always be '.' */
 297 Py_LOCAL_INLINE(void)
 298 ensure_decimal_point(char* buffer, size_t buf_size)
 299 {
 300         int insert_count = 0;
 301         char* chars_to_insert;
 302
 303         /* search for the first non-digit character */
 304         char *p = buffer;
 305         while (*p && isdigit(Py_CHARMASK(*p)))
 306                 ++p;
 307
 308         if (*p == '.') {
 309                 if (isdigit(Py_CHARMASK(*(p+1)))) {
 310                         /* Nothing to do, we already have a decimal
 311                            point and a digit after it */
 312                 }
 313                 else {
 314                         /* We have a decimal point, but no following
 315                            digit.  Insert a zero after the decimal. */
 316                         ++p;
 317                         chars_to_insert = "0";
 318                         insert_count = 1;
 319                 }
 320         }
 321         else {
 322                 chars_to_insert = ".0";
 323                 insert_count = 2;
 324         }
 325         if (insert_count) {
 326                 size_t buf_len = strlen(buffer);
 327                 if (buf_len + insert_count + 1 >= buf_size) {
 328                         /* If there is not enough room in the buffer
 329                            for the additional text, just skip it.  It's
 330                            not worth generating an error over. */
 331                 }
 332                 else {
 333                         memmove(p + insert_count, p,
 334                                 buffer + strlen(buffer) - p + 1);
 335                         memcpy(p, chars_to_insert, insert_count);
 336                 }
 337         }
 338 }
 339
 340 /* Add the locale specific grouping characters to buffer.  Note
 341    that any decimal point (if it's present) in buffer is already
 342    locale-specific.  Return 0 on error, else 1. */
 343 Py_LOCAL_INLINE(int)
 344 add_thousands_grouping(char* buffer, size_t buf_size)
 345 {
 346         Py_ssize_t len = strlen(buffer);
 347         struct lconv *locale_data = localeconv();
 348         const char *decimal_point = locale_data->decimal_point;
 349
 350         /* Find the decimal point, if any.  We're only concerned
 351            about the characters to the left of the decimal when
 352            adding grouping. */
 353         char *p = strstr(buffer, decimal_point);
 354         if (!p) {
 355                 /* No decimal, use the entire string. */
 356
 357                 /* If any exponent, adjust p. */
 358                 p = strpbrk(buffer, "eE");
 359                 if (!p)
 360                         /* No exponent and no decimal.  Use the entire
 361                            string. */
 362                         p = buffer + len;
 363         }
 364         /* At this point, p points just past the right-most character we
 365            want to format.  We need to add the grouping string for the
 366            characters between buffer and p. */
 367         return _PyString_InsertThousandsGrouping(buffer, len, p-buffer,
 368                                                  buf_size, NULL, 1);
 369 }
 370
 371 /* see FORMATBUFLEN in unicodeobject.c */
 372 #define FLOAT_FORMATBUFLEN 120
 373
 374 /**
 375  * PyOS_ascii_formatd:
 376  * @buffer: A buffer to place the resulting string in
 377  * @buf_size: The length of the buffer.
 378  * @format: The printf()-style format to use for the
 379  *          code to use for converting.
 380  * @d: The #gdouble to convert
 381  *
 382  * Converts a #gdouble to a string, using the '.' as
 383  * decimal point. To format the number you pass in
 384  * a printf()-style format string. Allowed conversion
 385  * specifiers are 'e', 'E', 'f', 'F', 'g', 'G', and 'n'.
 386  *
 387  * 'n' is the same as 'g', except it uses the current locale.
 388  * 'Z' is the same as 'g', except it always has a decimal and
 389  *     at least one digit after the decimal.
 390  *
 391  * Return value: The pointer to the buffer with the converted string.
 392  **/
 393 char *
 394 PyOS_ascii_formatd(char       *buffer,
 395                    size_t      buf_size,
 396                    const char *format,
 397                    double      d)
 398 {
 399         char format_char;
 400         size_t format_len = strlen(format);
 401
 402         /* For type 'n', we need to make a copy of the format string, because
 403            we're going to modify 'n' -> 'g', and format is const char*, so we
 404            can't modify it directly.  FLOAT_FORMATBUFLEN should be longer than
 405            we ever need this to be.  There's an upcoming check to ensure it's
 406            big enough. */
 407         /* Issue 2264: code 'Z' requires copying the format.  'Z' is 'g', but
 408            also with at least one character past the decimal. */
 409         char tmp_format[FLOAT_FORMATBUFLEN];
 410
 411         /* The last character in the format string must be the format char */
 412         format_char = format[format_len - 1];
 413
 414         if (format[0] != '%')
 415                 return NULL;
 416
 417         /* I'm not sure why this test is here.  It's ensuring that the format
 418            string after the first character doesn't have a single quote, a
 419            lowercase l, or a percent. This is the reverse of the commented-out
 420            test about 10 lines ago. */
 421         if (strpbrk(format + 1, "'l%"))
 422                 return NULL;
 423
 424         /* Also curious about this function is that it accepts format strings
 425            like "%xg", which are invalid for floats.  In general, the
 426            interface to this function is not very good, but changing it is
 427            difficult because it's a public API. */
 428
 429         if (!(format_char == 'e' || format_char == 'E' ||
 430               format_char == 'f' || format_char == 'F' ||
 431               format_char == 'g' || format_char == 'G' ||
 432               format_char == 'n' || format_char == 'Z'))
 433                 return NULL;
 434
 435         /* Map 'n' or 'Z' format_char to 'g', by copying the format string and
 436            replacing the final char with a 'g' */
 437         if (format_char == 'n' || format_char == 'Z') {
 438                 if (format_len + 1 >= sizeof(tmp_format)) {
 439                         /* The format won't fit in our copy.  Error out.  In
 440                            practice, this will never happen and will be
 441                            detected by returning NULL */
 442                         return NULL;
 443                 }
 444                 strcpy(tmp_format, format);
 445                 tmp_format[format_len - 1] = 'g';
 446                 format = tmp_format;
 447         }
 448
 449
 450         /* Have PyOS_snprintf do the hard work */
 451         PyOS_snprintf(buffer, buf_size, format, d);
 452
 453         /* Do various fixups on the return string */
 454
 455         /* Get the current locale, and find the decimal point string.
 456            Convert that string back to a dot.  Do not do this if using the
 457            'n' (number) format code, since we want to keep the localized
 458            decimal point in that case. */
 459         if (format_char != 'n')
 460                 change_decimal_from_locale_to_dot(buffer);
 461
 462         /* If an exponent exists, ensure that the exponent is at least
 463            MIN_EXPONENT_DIGITS digits, providing the buffer is large enough
 464            for the extra zeros.  Also, if there are more than
 465            MIN_EXPONENT_DIGITS, remove as many zeros as possible until we get
 466            back to MIN_EXPONENT_DIGITS */
 467         ensure_minumim_exponent_length(buffer, buf_size);
 468
 469         /* If format_char is 'Z', make sure we have at least one character
 470            after the decimal point (and make sure we have a decimal point). */
 471         if (format_char == 'Z')
 472                 ensure_decimal_point(buffer, buf_size);
 473
 474         /* If format_char is 'n', add the thousands grouping. */
 475         if (format_char == 'n')
 476                 if (!add_thousands_grouping(buffer, buf_size))
 477                         return NULL;
 478
 479         return buffer;
 480 }
 481
 482 double
 483 PyOS_ascii_atof(const char *nptr)
 484 {
 485         return PyOS_ascii_strtod(nptr, NULL);
 486 }