Objects/stringlib/string_format.h

   1 /*
   2     string_format.h -- implementation of string.format().
   3
   4     It uses the Objects/stringlib conventions, so that it can be
   5     compiled for both unicode and string objects.
   6 */
   7
   8
/* Defines for Python 2.6 compatibility: on pre-3.0 builds the name
   PyLong_FromSsize_t (used by getitem_idx in this file) is mapped onto
   _PyLong_FromSsize_t. */
#if PY_VERSION_HEX < 0x03000000
#define PyLong_FromSsize_t _PyLong_FromSsize_t
#endif
  13
/* Defines for more efficiently reallocating the string buffer.
   Every reallocation reserves a "size increment" of slack on top of the
   characters actually requested.  The increment starts out at
   INITIAL_SIZE_INCREMENT (see output_initialize) and is multiplied by
   SIZE_MULTIPLIER after each reallocation for as long as it is still
   below MAX_SIZE_INCREMENT (see output_extend). */
#define INITIAL_SIZE_INCREMENT 100
#define SIZE_MULTIPLIER 2
#define MAX_SIZE_INCREMENT  3200
  18
  19
  20 /************************************************************************/
  21 /***********   Global data structures and forward declarations  *********/
  22 /************************************************************************/
  23
/*
   A SubString consists of the characters between two string or
   unicode pointers.  It is a view only: it holds no reference to the
   object that owns the characters.
*/
typedef struct {
    /* first character of the span; NULL when SubString_init was handed
       a NULL pointer */
    STRINGLIB_CHAR *ptr;
    /* one past the last character, so the length is end - ptr */
    STRINGLIB_CHAR *end;
} SubString;
  32
  33
typedef enum {
    /* state set by AutoNumber_Init: no numerically indexed field has
       been seen yet */
    ANS_INIT,
    /* the first numerically indexed field had an empty name ("{}") */
    ANS_AUTO,
    /* the first numerically indexed field had an explicit number ("{0}") */
    ANS_MANUAL
} AutoNumberState;   /* Keep track if we're auto-numbering fields */

/* Keeps track of our auto-numbering state, and which number field we're on */
typedef struct {
    AutoNumberState an_state;
    /* index handed to the next empty field name; field_name_split
       post-increments it each time it is used */
    int an_field_number;
} AutoNumber;
  45
  46
/* forward declaration for recursion: output_markup calls build_string
   to expand a format_spec that itself contains replacement fields, and
   treats a NULL result as failure. */
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int recursion_depth, AutoNumber *auto_number);
  51
  52
  53
  54 /************************************************************************/
  55 /**************************  Utility  functions  ************************/
  56 /************************************************************************/
  57
  58 static void
  59 AutoNumber_Init(AutoNumber *auto_number)
  60 {
  61     auto_number->an_state = ANS_INIT;
  62     auto_number->an_field_number = 0;
  63 }
  64
  65 /* fill in a SubString from a pointer and length */
  66 Py_LOCAL_INLINE(void)
  67 SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len)
  68 {
  69     str->ptr = p;
  70     if (p == NULL)
  71         str->end = NULL;
  72     else
  73         str->end = str->ptr + len;
  74 }
  75
  76 /* return a new string.  if str->ptr is NULL, return None */
  77 Py_LOCAL_INLINE(PyObject *)
  78 SubString_new_object(SubString *str)
  79 {
  80     if (str->ptr == NULL) {
  81         Py_INCREF(Py_None);
  82         return Py_None;
  83     }
  84     return STRINGLIB_NEW(str->ptr, str->end - str->ptr);
  85 }
  86
  87 /* return a new string.  if str->ptr is NULL, return None */
  88 Py_LOCAL_INLINE(PyObject *)
  89 SubString_new_object_or_empty(SubString *str)
  90 {
  91     if (str->ptr == NULL) {
  92         return STRINGLIB_NEW(NULL, 0);
  93     }
  94     return STRINGLIB_NEW(str->ptr, str->end - str->ptr);
  95 }
  96
  97 /* Return 1 if an error has been detected switching between automatic
  98    field numbering and manual field specification, else return 0. Set
  99    ValueError on error. */
 100 static int
 101 autonumber_state_error(AutoNumberState state, int field_name_is_empty)
 102 {
 103     if (state == ANS_MANUAL) {
 104         if (field_name_is_empty) {
 105             PyErr_SetString(PyExc_ValueError, "cannot switch from "
 106                             "manual field specification to "
 107                             "automatic field numbering");
 108             return 1;
 109         }
 110     }
 111     else {
 112         if (!field_name_is_empty) {
 113             PyErr_SetString(PyExc_ValueError, "cannot switch from "
 114                             "automatic field numbering to "
 115                             "manual field specification");
 116             return 1;
 117         }
 118     }
 119     return 0;
 120 }
 121
 122
 123 /************************************************************************/
 124 /***********    Output string management functions       ****************/
 125 /************************************************************************/
 126
/* Growable output buffer backed by a string object. */
typedef struct {
    /* next position to write to; output_data copies here and advances it */
    STRINGLIB_CHAR *ptr;
    /* one past the last character currently allocated in obj */
    STRINGLIB_CHAR *end;
    /* the string object whose buffer ptr and end point into */
    PyObject *obj;
    /* slack that output_extend adds on top of the requested count the
       next time the buffer has to grow */
    Py_ssize_t size_increment;
} OutputString;
 133
 134 /* initialize an OutputString object, reserving size characters */
 135 static int
 136 output_initialize(OutputString *output, Py_ssize_t size)
 137 {
 138     output->obj = STRINGLIB_NEW(NULL, size);
 139     if (output->obj == NULL)
 140         return 0;
 141
 142     output->ptr = STRINGLIB_STR(output->obj);
 143     output->end = STRINGLIB_LEN(output->obj) + output->ptr;
 144     output->size_increment = INITIAL_SIZE_INCREMENT;
 145
 146     return 1;
 147 }
 148
 149 /*
 150     output_extend reallocates the output string buffer.
 151     It returns a status:  0 for a failed reallocation,
 152     1 for success.
 153 */
 154
 155 static int
 156 output_extend(OutputString *output, Py_ssize_t count)
 157 {
 158     STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj);
 159     Py_ssize_t curlen = output->ptr - startptr;
 160     Py_ssize_t maxlen = curlen + count + output->size_increment;
 161
 162     if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0)
 163         return 0;
 164     startptr = STRINGLIB_STR(output->obj);
 165     output->ptr = startptr + curlen;
 166     output->end = startptr + maxlen;
 167     if (output->size_increment < MAX_SIZE_INCREMENT)
 168         output->size_increment *= SIZE_MULTIPLIER;
 169     return 1;
 170 }
 171
 172 /*
 173     output_data dumps characters into our output string
 174     buffer.
 175
 176     In some cases, it has to reallocate the string.
 177
 178     It returns a status:  0 for a failed reallocation,
 179     1 for success.
 180 */
 181 static int
 182 output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count)
 183 {
 184     if ((count > output->end - output->ptr) && !output_extend(output, count))
 185         return 0;
 186     memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR));
 187     output->ptr += count;
 188     return 1;
 189 }
 190
 191 /************************************************************************/
 192 /***********  Format string parsing -- integers and identifiers *********/
 193 /************************************************************************/
 194
 195 static Py_ssize_t
 196 get_integer(const SubString *str)
 197 {
 198     Py_ssize_t accumulator = 0;
 199     Py_ssize_t digitval;
 200     Py_ssize_t oldaccumulator;
 201     STRINGLIB_CHAR *p;
 202
 203     /* empty string is an error */
 204     if (str->ptr >= str->end)
 205         return -1;
 206
 207     for (p = str->ptr; p < str->end; p++) {
 208         digitval = STRINGLIB_TODECIMAL(*p);
 209         if (digitval < 0)
 210             return -1;
 211         /*
 212            This trick was copied from old Unicode format code.  It's cute,
 213            but would really suck on an old machine with a slow divide
 214            implementation.  Fortunately, in the normal case we do not
 215            expect too many digits.
 216         */
 217         oldaccumulator = accumulator;
 218         accumulator *= 10;
 219         if ((accumulator+10)/10 != oldaccumulator+1) {
 220             PyErr_Format(PyExc_ValueError,
 221                          "Too many decimal digits in format string");
 222             return -1;
 223         }
 224         accumulator += digitval;
 225     }
 226     return accumulator;
 227 }
 228
 229 /************************************************************************/
 230 /******** Functions to get field objects and specification strings ******/
 231 /************************************************************************/
 232
 233 /* do the equivalent of obj.name */
 234 static PyObject *
 235 getattr(PyObject *obj, SubString *name)
 236 {
 237     PyObject *newobj;
 238     PyObject *str = SubString_new_object(name);
 239     if (str == NULL)
 240         return NULL;
 241     newobj = PyObject_GetAttr(obj, str);
 242     Py_DECREF(str);
 243     return newobj;
 244 }
 245
 246 /* do the equivalent of obj[idx], where obj is a sequence */
 247 static PyObject *
 248 getitem_sequence(PyObject *obj, Py_ssize_t idx)
 249 {
 250     return PySequence_GetItem(obj, idx);
 251 }
 252
 253 /* do the equivalent of obj[idx], where obj is not a sequence */
 254 static PyObject *
 255 getitem_idx(PyObject *obj, Py_ssize_t idx)
 256 {
 257     PyObject *newobj;
 258     PyObject *idx_obj = PyLong_FromSsize_t(idx);
 259     if (idx_obj == NULL)
 260         return NULL;
 261     newobj = PyObject_GetItem(obj, idx_obj);
 262     Py_DECREF(idx_obj);
 263     return newobj;
 264 }
 265
 266 /* do the equivalent of obj[name] */
 267 static PyObject *
 268 getitem_str(PyObject *obj, SubString *name)
 269 {
 270     PyObject *newobj;
 271     PyObject *str = SubString_new_object(name);
 272     if (str == NULL)
 273         return NULL;
 274     newobj = PyObject_GetItem(obj, str);
 275     Py_DECREF(str);
 276     return newobj;
 277 }
 278
/* Iterator over the ".attr" and "[item]" parts of a field name (see
   FieldNameIterator_next). */
typedef struct {
    /* the entire string we're parsing.  we assume that someone else
       is managing its lifetime, and that it will exist for the
       lifetime of the iterator.  can be empty */
    SubString str;

    /* pointer to where we are inside str; everything before it has
       already been consumed */
    STRINGLIB_CHAR *ptr;
} FieldNameIterator;
 288
 289
 290 static int
 291 FieldNameIterator_init(FieldNameIterator *self, STRINGLIB_CHAR *ptr,
 292                        Py_ssize_t len)
 293 {
 294     SubString_init(&self->str, ptr, len);
 295     self->ptr = self->str.ptr;
 296     return 1;
 297 }
 298
 299 static int
 300 _FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
 301 {
 302     STRINGLIB_CHAR c;
 303
 304     name->ptr = self->ptr;
 305
 306     /* return everything until '.' or '[' */
 307     while (self->ptr < self->str.end) {
 308         switch (c = *self->ptr++) {
 309         case '[':
 310         case '.':
 311             /* backup so that we this character will be seen next time */
 312             self->ptr--;
 313             break;
 314         default:
 315             continue;
 316         }
 317         break;
 318     }
 319     /* end of string is okay */
 320     name->end = self->ptr;
 321     return 1;
 322 }
 323
 324 static int
 325 _FieldNameIterator_item(FieldNameIterator *self, SubString *name)
 326 {
 327     int bracket_seen = 0;
 328     STRINGLIB_CHAR c;
 329
 330     name->ptr = self->ptr;
 331
 332     /* return everything until ']' */
 333     while (self->ptr < self->str.end) {
 334         switch (c = *self->ptr++) {
 335         case ']':
 336             bracket_seen = 1;
 337             break;
 338         default:
 339             continue;
 340         }
 341         break;
 342     }
 343     /* make sure we ended with a ']' */
 344     if (!bracket_seen) {
 345         PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
 346         return 0;
 347     }
 348
 349     /* end of string is okay */
 350     /* don't include the ']' */
 351     name->end = self->ptr-1;
 352     return 1;
 353 }
 354
 355 /* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
 356 static int
 357 FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
 358                        Py_ssize_t *name_idx, SubString *name)
 359 {
 360     /* check at end of input */
 361     if (self->ptr >= self->str.end)
 362         return 1;
 363
 364     switch (*self->ptr++) {
 365     case '.':
 366         *is_attribute = 1;
 367         if (_FieldNameIterator_attr(self, name) == 0)
 368             return 0;
 369         *name_idx = -1;
 370         break;
 371     case '[':
 372         *is_attribute = 0;
 373         if (_FieldNameIterator_item(self, name) == 0)
 374             return 0;
 375         *name_idx = get_integer(name);
 376         break;
 377     default:
 378         /* Invalid character follows ']' */
 379         PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
 380                         "follow ']' in format field specifier");
 381         return 0;
 382     }
 383
 384     /* empty string is an error */
 385     if (name->ptr == name->end) {
 386         PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
 387         return 0;
 388     }
 389
 390     return 2;
 391 }
 392
 393
 394 /* input: field_name
 395    output: 'first' points to the part before the first '[' or '.'
 396            'first_idx' is -1 if 'first' is not an integer, otherwise
 397                        it's the value of first converted to an integer
 398            'rest' is an iterator to return the rest
 399 */
 400 static int
 401 field_name_split(STRINGLIB_CHAR *ptr, Py_ssize_t len, SubString *first,
 402                  Py_ssize_t *first_idx, FieldNameIterator *rest,
 403                  AutoNumber *auto_number)
 404 {
 405     STRINGLIB_CHAR c;
 406     STRINGLIB_CHAR *p = ptr;
 407     STRINGLIB_CHAR *end = ptr + len;
 408     int field_name_is_empty;
 409     int using_numeric_index;
 410
 411     /* find the part up until the first '.' or '[' */
 412     while (p < end) {
 413         switch (c = *p++) {
 414         case '[':
 415         case '.':
 416             /* backup so that we this character is available to the
 417                "rest" iterator */
 418             p--;
 419             break;
 420         default:
 421             continue;
 422         }
 423         break;
 424     }
 425
 426     /* set up the return values */
 427     SubString_init(first, ptr, p - ptr);
 428     FieldNameIterator_init(rest, p, end - p);
 429
 430     /* see if "first" is an integer, in which case it's used as an index */
 431     *first_idx = get_integer(first);
 432
 433     field_name_is_empty = first->ptr >= first->end;
 434
 435     /* If the field name is omitted or if we have a numeric index
 436        specified, then we're doing numeric indexing into args. */
 437     using_numeric_index = field_name_is_empty || *first_idx != -1;
 438
 439     /* We always get here exactly one time for each field we're
 440        processing. And we get here in field order (counting by left
 441        braces). So this is the perfect place to handle automatic field
 442        numbering if the field name is omitted. */
 443
 444     /* Check if we need to do the auto-numbering. It's not needed if
 445        we're called from string.Format routines, because it's handled
 446        in that class by itself. */
 447     if (auto_number) {
 448         /* Initialize our auto numbering state if this is the first
 449            time we're either auto-numbering or manually numbering. */
 450         if (auto_number->an_state == ANS_INIT && using_numeric_index)
 451             auto_number->an_state = field_name_is_empty ?
 452                 ANS_AUTO : ANS_MANUAL;
 453
 454         /* Make sure our state is consistent with what we're doing
 455            this time through. Only check if we're using a numeric
 456            index. */
 457         if (using_numeric_index)
 458             if (autonumber_state_error(auto_number->an_state,
 459                                        field_name_is_empty))
 460                 return 0;
 461         /* Zero length field means we want to do auto-numbering of the
 462            fields. */
 463         if (field_name_is_empty)
 464             *first_idx = (auto_number->an_field_number)++;
 465     }
 466
 467     return 1;
 468 }
 469
 470
 471 /*
 472     get_field_object returns the object inside {}, before the
 473     format_spec.  It handles getindex and getattr lookups and consumes
 474     the entire input string.
 475 */
 476 static PyObject *
 477 get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
 478                  AutoNumber *auto_number)
 479 {
 480     PyObject *obj = NULL;
 481     int ok;
 482     int is_attribute;
 483     SubString name;
 484     SubString first;
 485     Py_ssize_t index;
 486     FieldNameIterator rest;
 487
 488     if (!field_name_split(input->ptr, input->end - input->ptr, &first,
 489                           &index, &rest, auto_number)) {
 490         goto error;
 491     }
 492
 493     if (index == -1) {
 494         /* look up in kwargs */
 495         PyObject *key = SubString_new_object(&first);
 496         if (key == NULL)
 497             goto error;
 498         if ((kwargs == NULL) || (obj = PyDict_GetItem(kwargs, key)) == NULL) {
 499             PyErr_SetObject(PyExc_KeyError, key);
 500             Py_DECREF(key);
 501             goto error;
 502         }
 503         Py_DECREF(key);
 504         Py_INCREF(obj);
 505     }
 506     else {
 507         /* look up in args */
 508         obj = PySequence_GetItem(args, index);
 509         if (obj == NULL)
 510             goto error;
 511     }
 512
 513     /* iterate over the rest of the field_name */
 514     while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
 515                                         &name)) == 2) {
 516         PyObject *tmp;
 517
 518         if (is_attribute)
 519             /* getattr lookup "." */
 520             tmp = getattr(obj, &name);
 521         else
 522             /* getitem lookup "[]" */
 523             if (index == -1)
 524                 tmp = getitem_str(obj, &name);
 525             else
 526                 if (PySequence_Check(obj))
 527                     tmp = getitem_sequence(obj, index);
 528                 else
 529                     /* not a sequence */
 530                     tmp = getitem_idx(obj, index);
 531         if (tmp == NULL)
 532             goto error;
 533
 534         /* assign to obj */
 535         Py_DECREF(obj);
 536         obj = tmp;
 537     }
 538     /* end of iterator, this is the non-error case */
 539     if (ok == 1)
 540         return obj;
 541 error:
 542     Py_XDECREF(obj);
 543     return NULL;
 544 }
 545
 546 /************************************************************************/
 547 /*****************  Field rendering functions  **************************/
 548 /************************************************************************/
 549
 550 /*
 551     render_field() is the main function in this section.  It takes the
 552     field object and field specification string generated by
 553     get_field_and_spec, and renders the field into the output string.
 554
 555     render_field calls fieldobj.__format__(format_spec) method, and
 556     appends to the output.
 557 */
 558 static int
 559 render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output)
 560 {
 561     int ok = 0;
 562     PyObject *result = NULL;
 563     PyObject *format_spec_object = NULL;
 564     PyObject *(*formatter)(PyObject *, STRINGLIB_CHAR *, Py_ssize_t) = NULL;
 565     STRINGLIB_CHAR* format_spec_start = format_spec->ptr ?
 566             format_spec->ptr : NULL;
 567     Py_ssize_t format_spec_len = format_spec->ptr ?
 568             format_spec->end - format_spec->ptr : 0;
 569
 570     /* If we know the type exactly, skip the lookup of __format__ and just
 571        call the formatter directly. */
 572 #if STRINGLIB_IS_UNICODE
 573     if (PyUnicode_CheckExact(fieldobj))
 574         formatter = _PyUnicode_FormatAdvanced;
 575     /* Unfortunately, there's a problem with checking for int, long,
 576        and float here.  If we're being included as unicode, their
 577        formatters expect string format_spec args.  For now, just skip
 578        this optimization for unicode.  This could be fixed, but it's a
 579        hassle. */
 580 #else
 581     if (PyString_CheckExact(fieldobj))
 582         formatter = _PyBytes_FormatAdvanced;
 583     else if (PyInt_CheckExact(fieldobj))
 584         formatter =_PyInt_FormatAdvanced;
 585     else if (PyLong_CheckExact(fieldobj))
 586         formatter =_PyLong_FormatAdvanced;
 587     else if (PyFloat_CheckExact(fieldobj))
 588         formatter = _PyFloat_FormatAdvanced;
 589 #endif
 590
 591     if (formatter) {
 592         /* we know exactly which formatter will be called when __format__ is
 593            looked up, so call it directly, instead. */
 594         result = formatter(fieldobj, format_spec_start, format_spec_len);
 595     }
 596     else {
 597         /* We need to create an object out of the pointers we have, because
 598            __format__ takes a string/unicode object for format_spec. */
 599         format_spec_object = STRINGLIB_NEW(format_spec_start,
 600                                            format_spec_len);
 601         if (format_spec_object == NULL)
 602             goto done;
 603
 604         result = PyObject_Format(fieldobj, format_spec_object);
 605     }
 606     if (result == NULL)
 607         goto done;
 608
 609 #if PY_VERSION_HEX >= 0x03000000
 610     assert(PyUnicode_Check(result));
 611 #else
 612     assert(PyString_Check(result) || PyUnicode_Check(result));
 613
 614     /* Convert result to our type.  We could be str, and result could
 615        be unicode */
 616     {
 617         PyObject *tmp = STRINGLIB_TOSTR(result);
 618         if (tmp == NULL)
 619             goto done;
 620         Py_DECREF(result);
 621         result = tmp;
 622     }
 623 #endif
 624
 625     ok = output_data(output,
 626                      STRINGLIB_STR(result), STRINGLIB_LEN(result));
 627 done:
 628     Py_XDECREF(format_spec_object);
 629     Py_XDECREF(result);
 630     return ok;
 631 }
 632
 633 static int
 634 parse_field(SubString *str, SubString *field_name, SubString *format_spec,
 635             STRINGLIB_CHAR *conversion)
 636 {
 637     /* Note this function works if the field name is zero length,
 638        which is good.  Zero length field names are handled later, in
 639        field_name_split. */
 640
 641     STRINGLIB_CHAR c = 0;
 642
 643     /* initialize these, as they may be empty */
 644     *conversion = '\0';
 645     SubString_init(format_spec, NULL, 0);
 646
 647     /* Search for the field name.  it's terminated by the end of
 648        the string, or a ':' or '!' */
 649     field_name->ptr = str->ptr;
 650     while (str->ptr < str->end) {
 651         switch (c = *(str->ptr++)) {
 652         case ':':
 653         case '!':
 654             break;
 655         default:
 656             continue;
 657         }
 658         break;
 659     }
 660
 661     if (c == '!' || c == ':') {
 662         /* we have a format specifier and/or a conversion */
 663         /* don't include the last character */
 664         field_name->end = str->ptr-1;
 665
 666         /* the format specifier is the rest of the string */
 667         format_spec->ptr = str->ptr;
 668         format_spec->end = str->end;
 669
 670         /* see if there's a conversion specifier */
 671         if (c == '!') {
 672             /* there must be another character present */
 673             if (format_spec->ptr >= format_spec->end) {
 674                 PyErr_SetString(PyExc_ValueError,
 675                                 "end of format while looking for conversion "
 676                                 "specifier");
 677                 return 0;
 678             }
 679             *conversion = *(format_spec->ptr++);
 680
 681             /* if there is another character, it must be a colon */
 682             if (format_spec->ptr < format_spec->end) {
 683                 c = *(format_spec->ptr++);
 684                 if (c != ':') {
 685                     PyErr_SetString(PyExc_ValueError,
 686                                     "expected ':' after format specifier");
 687                     return 0;
 688                 }
 689             }
 690         }
 691     }
 692     else
 693         /* end of string, there's no format_spec or conversion */
 694         field_name->end = str->ptr;
 695
 696     return 1;
 697 }
 698
 699 /************************************************************************/
 700 /******* Output string allocation and escape-to-markup processing  ******/
 701 /************************************************************************/
 702
/* MarkupIterator breaks the string into pieces of either literal
   text, or things inside {} that need to be marked up.  It is
   designed to make it easy to wrap a Python iterator around it, for
   use with the Formatter class */

typedef struct {
    /* the input still to be parsed; MarkupIterator_next advances
       str.ptr as it consumes text */
    SubString str;
} MarkupIterator;
 711
 712 static int
 713 MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len)
 714 {
 715     SubString_init(&self->str, ptr, len);
 716     return 1;
 717 }
 718
 719 /* returns 0 on error, 1 on non-error termination, and 2 if it got a
 720    string (or something to be expanded) */
 721 static int
 722 MarkupIterator_next(MarkupIterator *self, SubString *literal,
 723                     int *field_present, SubString *field_name,
 724                     SubString *format_spec, STRINGLIB_CHAR *conversion,
 725                     int *format_spec_needs_expanding)
 726 {
 727     int at_end;
 728     STRINGLIB_CHAR c = 0;
 729     STRINGLIB_CHAR *start;
 730     int count;
 731     Py_ssize_t len;
 732     int markup_follows = 0;
 733
 734     /* initialize all of the output variables */
 735     SubString_init(literal, NULL, 0);
 736     SubString_init(field_name, NULL, 0);
 737     SubString_init(format_spec, NULL, 0);
 738     *conversion = '\0';
 739     *format_spec_needs_expanding = 0;
 740     *field_present = 0;
 741
 742     /* No more input, end of iterator.  This is the normal exit
 743        path. */
 744     if (self->str.ptr >= self->str.end)
 745         return 1;
 746
 747     start = self->str.ptr;
 748
 749     /* First read any literal text. Read until the end of string, an
 750        escaped '{' or '}', or an unescaped '{'.  In order to never
 751        allocate memory and so I can just pass pointers around, if
 752        there's an escaped '{' or '}' then we'll return the literal
 753        including the brace, but no format object.  The next time
 754        through, we'll return the rest of the literal, skipping past
 755        the second consecutive brace. */
 756     while (self->str.ptr < self->str.end) {
 757         switch (c = *(self->str.ptr++)) {
 758         case '{':
 759         case '}':
 760             markup_follows = 1;
 761             break;
 762         default:
 763             continue;
 764         }
 765         break;
 766     }
 767
 768     at_end = self->str.ptr >= self->str.end;
 769     len = self->str.ptr - start;
 770
 771     if ((c == '}') && (at_end || (c != *self->str.ptr))) {
 772         PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
 773                         "in format string");
 774         return 0;
 775     }
 776     if (at_end && c == '{') {
 777         PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
 778                         "in format string");
 779         return 0;
 780     }
 781     if (!at_end) {
 782         if (c == *self->str.ptr) {
 783             /* escaped } or {, skip it in the input.  there is no
 784                markup object following us, just this literal text */
 785             self->str.ptr++;
 786             markup_follows = 0;
 787         }
 788         else
 789             len--;
 790     }
 791
 792     /* record the literal text */
 793     literal->ptr = start;
 794     literal->end = start + len;
 795
 796     if (!markup_follows)
 797         return 2;
 798
 799     /* this is markup, find the end of the string by counting nested
 800        braces.  note that this prohibits escaped braces, so that
 801        format_specs cannot have braces in them. */
 802     *field_present = 1;
 803     count = 1;
 804
 805     start = self->str.ptr;
 806
 807     /* we know we can't have a zero length string, so don't worry
 808        about that case */
 809     while (self->str.ptr < self->str.end) {
 810         switch (c = *(self->str.ptr++)) {
 811         case '{':
 812             /* the format spec needs to be recursively expanded.
 813                this is an optimization, and not strictly needed */
 814             *format_spec_needs_expanding = 1;
 815             count++;
 816             break;
 817         case '}':
 818             count--;
 819             if (count <= 0) {
 820                 /* we're done.  parse and get out */
 821                 SubString s;
 822
 823                 SubString_init(&s, start, self->str.ptr - 1 - start);
 824                 if (parse_field(&s, field_name, format_spec, conversion) == 0)
 825                     return 0;
 826
 827                 /* success */
 828                 return 2;
 829             }
 830             break;
 831         }
 832     }
 833
 834     /* end of string while searching for matching '}' */
 835     PyErr_SetString(PyExc_ValueError, "unmatched '{' in format");
 836     return 0;
 837 }
 838
 839
 840 /* do the !r or !s conversion on obj */
 841 static PyObject *
 842 do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
 843 {
 844     /* XXX in pre-3.0, do we need to convert this to unicode, since it
 845        might have returned a string? */
 846     switch (conversion) {
 847     case 'r':
 848         return PyObject_Repr(obj);
 849     case 's':
 850         return STRINGLIB_TOSTR(obj);
 851     default:
 852         if (conversion > 32 && conversion < 127) {
 853                 /* It's the ASCII subrange; casting to char is safe
 854                    (assuming the execution character set is an ASCII
 855                    superset). */
 856                 PyErr_Format(PyExc_ValueError,
 857                      "Unknown conversion specifier %c",
 858                      (char)conversion);
 859         } else
 860                 PyErr_Format(PyExc_ValueError,
 861                      "Unknown conversion specifier \\x%x",
 862                      (unsigned int)conversion);
 863         return NULL;
 864     }
 865 }
 866
 867 /* given:
 868
 869    {field_name!conversion:format_spec}
 870
 871    compute the result and write it to output.
 872    format_spec_needs_expanding is an optimization.  if it's false,
 873    just output the string directly, otherwise recursively expand the
 874    format_spec string.
 875
 876    field_name is allowed to be zero length, in which case we
 877    are doing auto field numbering.
 878 */
 879
 880 static int
 881 output_markup(SubString *field_name, SubString *format_spec,
 882               int format_spec_needs_expanding, STRINGLIB_CHAR conversion,
 883               OutputString *output, PyObject *args, PyObject *kwargs,
 884               int recursion_depth, AutoNumber *auto_number)
 885 {
 886     PyObject *tmp = NULL;
 887     PyObject *fieldobj = NULL;
 888     SubString expanded_format_spec;
 889     SubString *actual_format_spec;
 890     int result = 0;
 891
 892     /* convert field_name to an object */
 893     fieldobj = get_field_object(field_name, args, kwargs, auto_number);
 894     if (fieldobj == NULL)
 895         goto done;
 896
 897     if (conversion != '\0') {
 898         tmp = do_conversion(fieldobj, conversion);
 899         if (tmp == NULL)
 900             goto done;
 901
 902         /* do the assignment, transferring ownership: fieldobj = tmp */
 903         Py_DECREF(fieldobj);
 904         fieldobj = tmp;
 905         tmp = NULL;
 906     }
 907
 908     /* if needed, recurively compute the format_spec */
 909     if (format_spec_needs_expanding) {
 910         tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
 911                            auto_number);
 912         if (tmp == NULL)
 913             goto done;
 914
 915         /* note that in the case we're expanding the format string,
 916            tmp must be kept around until after the call to
 917            render_field. */
 918         SubString_init(&expanded_format_spec,
 919                        STRINGLIB_STR(tmp), STRINGLIB_LEN(tmp));
 920         actual_format_spec = &expanded_format_spec;
 921     }
 922     else
 923         actual_format_spec = format_spec;
 924
 925     if (render_field(fieldobj, actual_format_spec, output) == 0)
 926         goto done;
 927
 928     result = 1;
 929
 930 done:
 931     Py_XDECREF(fieldobj);
 932     Py_XDECREF(tmp);
 933
 934     return result;
 935 }
 936
 937 /*
 938     do_markup is the top-level loop for the format() method.  It
 939     searches through the format string for escapes to markup codes, and
 940     calls other functions to move non-markup text to the output,
 941     and to perform the markup to the output.
 942 */
 943 static int
 944 do_markup(SubString *input, PyObject *args, PyObject *kwargs,
 945           OutputString *output, int recursion_depth, AutoNumber *auto_number)
 946 {
 947     MarkupIterator iter;
 948     int format_spec_needs_expanding;
 949     int result;
 950     int field_present;
 951     SubString literal;
 952     SubString field_name;
 953     SubString format_spec;
 954     STRINGLIB_CHAR conversion;
 955
 956     MarkupIterator_init(&iter, input->ptr, input->end - input->ptr);
 957     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
 958                                          &field_name, &format_spec,
 959                                          &conversion,
 960                                          &format_spec_needs_expanding)) == 2) {
 961         if (!output_data(output, literal.ptr, literal.end - literal.ptr))
 962             return 0;
 963         if (field_present)
 964             if (!output_markup(&field_name, &format_spec,
 965                                format_spec_needs_expanding, conversion, output,
 966                                args, kwargs, recursion_depth, auto_number))
 967                 return 0;
 968     }
 969     return result;
 970 }
 971
 972
 973 /*
 974     build_string allocates the output string and then
 975     calls do_markup to do the heavy lifting.
 976 */
 977 static PyObject *
 978 build_string(SubString *input, PyObject *args, PyObject *kwargs,
 979              int recursion_depth, AutoNumber *auto_number)
 980 {
 981     OutputString output;
 982     PyObject *result = NULL;
 983     Py_ssize_t count;
 984
 985     output.obj = NULL; /* needed so cleanup code always works */
 986
 987     /* check the recursion level */
 988     if (recursion_depth <= 0) {
 989         PyErr_SetString(PyExc_ValueError,
 990                         "Max string recursion exceeded");
 991         goto done;
 992     }
 993
 994     /* initial size is the length of the format string, plus the size
 995        increment.  seems like a reasonable default */
 996     if (!output_initialize(&output,
 997                            input->end - input->ptr +
 998                            INITIAL_SIZE_INCREMENT))
 999         goto done;
1000
1001     if (!do_markup(input, args, kwargs, &output, recursion_depth,
1002                    auto_number)) {
1003         goto done;
1004     }
1005
1006     count = output.ptr - STRINGLIB_STR(output.obj);
1007     if (STRINGLIB_RESIZE(&output.obj, count) < 0) {
1008         goto done;
1009     }
1010
1011     /* transfer ownership to result */
1012     result = output.obj;
1013     output.obj = NULL;
1014
1015 done:
1016     Py_XDECREF(output.obj);
1017     return result;
1018 }
1019
1020 /************************************************************************/
1021 /*********** main routine ***********************************************/
1022 /************************************************************************/
1023
1024 /* this is the main entry point */
1025 static PyObject *
1026 do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
1027 {
1028     SubString input;
1029
1030     /* PEP 3101 says only 2 levels, so that
1031        "{0:{1}}".format('abc', 's')            # works
1032        "{0:{1:{2}}}".format('abc', 's', '')    # fails
1033     */
1034     int recursion_depth = 2;
1035
1036     AutoNumber auto_number;
1037
1038     AutoNumber_Init(&auto_number);
1039     SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self));
1040     return build_string(&input, args, kwargs, recursion_depth, &auto_number);
1041 }
1042
1043
1044
1045 /************************************************************************/
1046 /*********** formatteriterator ******************************************/
1047 /************************************************************************/
1048
/* This is used to implement string.Formatter.vparse().  It exists so
   Formatter can share code with the built in unicode.format() method.
   It's really just a wrapper around MarkupIterator that is callable
   from Python. */

typedef struct {
    PyObject_HEAD

    /* the string being parsed; formatter_parser stores a new reference
       here and formatteriter_dealloc releases it */
    STRINGLIB_OBJECT *str;

    /* parsing state, initialized from str's buffer and length by
       formatter_parser; the SubStrings it yields point into str */
    MarkupIterator it_markup;
} formatteriterobject;
1061
1062 static void
1063 formatteriter_dealloc(formatteriterobject *it)
1064 {
1065     Py_XDECREF(it->str);
1066     PyObject_FREE(it);
1067 }
1068
1069 /* returns a tuple:
1070    (literal, field_name, format_spec, conversion)
1071
1072    literal is any literal text to output.  might be zero length
1073    field_name is the string before the ':'.  might be None
1074    format_spec is the string after the ':'.  mibht be None
1075    conversion is either None, or the string after the '!'
1076 */
1077 static PyObject *
1078 formatteriter_next(formatteriterobject *it)
1079 {
1080     SubString literal;
1081     SubString field_name;
1082     SubString format_spec;
1083     STRINGLIB_CHAR conversion;
1084     int format_spec_needs_expanding;
1085     int field_present;
1086     int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1087                                      &field_name, &format_spec, &conversion,
1088                                      &format_spec_needs_expanding);
1089
1090     /* all of the SubString objects point into it->str, so no
1091        memory management needs to be done on them */
1092     assert(0 <= result && result <= 2);
1093     if (result == 0 || result == 1)
1094         /* if 0, error has already been set, if 1, iterator is empty */
1095         return NULL;
1096     else {
1097         PyObject *literal_str = NULL;
1098         PyObject *field_name_str = NULL;
1099         PyObject *format_spec_str = NULL;
1100         PyObject *conversion_str = NULL;
1101         PyObject *tuple = NULL;
1102
1103         literal_str = SubString_new_object(&literal);
1104         if (literal_str == NULL)
1105             goto done;
1106
1107         field_name_str = SubString_new_object(&field_name);
1108         if (field_name_str == NULL)
1109             goto done;
1110
1111         /* if field_name is non-zero length, return a string for
1112            format_spec (even if zero length), else return None */
1113         format_spec_str = (field_present ?
1114                            SubString_new_object_or_empty :
1115                            SubString_new_object)(&format_spec);
1116         if (format_spec_str == NULL)
1117             goto done;
1118
1119         /* if the conversion is not specified, return a None,
1120            otherwise create a one length string with the conversion
1121            character */
1122         if (conversion == '\0') {
1123             conversion_str = Py_None;
1124             Py_INCREF(conversion_str);
1125         }
1126         else
1127             conversion_str = STRINGLIB_NEW(&conversion, 1);
1128         if (conversion_str == NULL)
1129             goto done;
1130
1131         tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1132                              conversion_str);
1133     done:
1134         Py_XDECREF(literal_str);
1135         Py_XDECREF(field_name_str);
1136         Py_XDECREF(format_spec_str);
1137         Py_XDECREF(conversion_str);
1138         return tuple;
1139     }
1140 }
1141
/* Method table referenced by PyFormatterIter_Type's tp_methods slot.
   It contains nothing but the terminating sentinel entry. */
static PyMethodDef formatteriter_methods[] = {
    {NULL,              NULL}           /* sentinel */
};
1145
/* Type object for formatteriterobject.  Besides name and size, only
   tp_dealloc, tp_getattro, tp_flags, tp_iter, tp_iternext and
   tp_methods are filled in: the object is its own iterator and
   formatteriter_next produces the items. */
static PyTypeObject PyFormatterIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "formatteriterator",                /* tp_name */
    sizeof(formatteriterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)formatteriter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)formatteriter_next,   /* tp_iternext */
    formatteriter_methods,              /* tp_methods */
    0,                                  /* tp_members */
};
1178
1179 /* unicode_formatter_parser is used to implement
1180    string.Formatter.vformat.  it parses a string and returns tuples
1181    describing the parsed elements.  It's a wrapper around
1182    stringlib/string_format.h's MarkupIterator */
1183 static PyObject *
1184 formatter_parser(STRINGLIB_OBJECT *self)
1185 {
1186     formatteriterobject *it;
1187
1188     it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1189     if (it == NULL)
1190         return NULL;
1191
1192     /* take ownership, give the object to the iterator */
1193     Py_INCREF(self);
1194     it->str = self;
1195
1196     /* initialize the contained MarkupIterator */
1197     MarkupIterator_init(&it->it_markup,
1198                         STRINGLIB_STR(self),
1199                         STRINGLIB_LEN(self));
1200
1201     return (PyObject *)it;
1202 }
1203
1204
1205 /************************************************************************/
1206 /*********** fieldnameiterator ******************************************/
1207 /************************************************************************/
1208
1209
/* This is used to implement string.Formatter.vparse().  It parses the
   field name into attribute and item values.  It's a Python-callable
   wrapper around FieldNameIterator */

typedef struct {
    PyObject_HEAD

    /* the field name string; formatter_field_name_split stores a new
       reference here just to keep it alive, and fieldnameiter_dealloc
       releases it */
    STRINGLIB_OBJECT *str;

    /* iteration state, filled in by field_name_split */
    FieldNameIterator it_field;
} fieldnameiterobject;
1221
1222 static void
1223 fieldnameiter_dealloc(fieldnameiterobject *it)
1224 {
1225     Py_XDECREF(it->str);
1226     PyObject_FREE(it);
1227 }
1228
1229 /* returns a tuple:
1230    (is_attr, value)
1231    is_attr is true if we used attribute syntax (e.g., '.foo')
1232               false if we used index syntax (e.g., '[foo]')
1233    value is an integer or string
1234 */
1235 static PyObject *
1236 fieldnameiter_next(fieldnameiterobject *it)
1237 {
1238     int result;
1239     int is_attr;
1240     Py_ssize_t idx;
1241     SubString name;
1242
1243     result = FieldNameIterator_next(&it->it_field, &is_attr,
1244                                     &idx, &name);
1245     if (result == 0 || result == 1)
1246         /* if 0, error has already been set, if 1, iterator is empty */
1247         return NULL;
1248     else {
1249         PyObject* result = NULL;
1250         PyObject* is_attr_obj = NULL;
1251         PyObject* obj = NULL;
1252
1253         is_attr_obj = PyBool_FromLong(is_attr);
1254         if (is_attr_obj == NULL)
1255             goto done;
1256
1257         /* either an integer or a string */
1258         if (idx != -1)
1259             obj = PyLong_FromSsize_t(idx);
1260         else
1261             obj = SubString_new_object(&name);
1262         if (obj == NULL)
1263             goto done;
1264
1265         /* return a tuple of values */
1266         result = PyTuple_Pack(2, is_attr_obj, obj);
1267
1268     done:
1269         Py_XDECREF(is_attr_obj);
1270         Py_XDECREF(obj);
1271         return result;
1272     }
1273 }
1274
/* Method table referenced by PyFieldNameIter_Type's tp_methods slot.
   It contains nothing but the terminating sentinel entry. */
static PyMethodDef fieldnameiter_methods[] = {
    {NULL,              NULL}           /* sentinel */
};
1278
/* Type object for fieldnameiterobject.  Besides name and size, only
   tp_dealloc, tp_getattro, tp_flags, tp_iter, tp_iternext and
   tp_methods are filled in: the object is its own iterator and
   fieldnameiter_next produces the items. */
static PyTypeObject PyFieldNameIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "fieldnameiterator",                /* tp_name */
    sizeof(fieldnameiterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)fieldnameiter_next,   /* tp_iternext */
    fieldnameiter_methods,              /* tp_methods */
    0};                                 /* tp_members */
1310
1311 /* unicode_formatter_field_name_split is used to implement
1312    string.Formatter.vformat.  it takes an PEP 3101 "field name", and
1313    returns a tuple of (first, rest): "first", the part before the
1314    first '.' or '['; and "rest", an iterator for the rest of the field
1315    name.  it's a wrapper around stringlib/string_format.h's
1316    field_name_split.  The iterator it returns is a
1317    FieldNameIterator */
1318 static PyObject *
1319 formatter_field_name_split(STRINGLIB_OBJECT *self)
1320 {
1321     SubString first;
1322     Py_ssize_t first_idx;
1323     fieldnameiterobject *it;
1324
1325     PyObject *first_obj = NULL;
1326     PyObject *result = NULL;
1327
1328     it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1329     if (it == NULL)
1330         return NULL;
1331
1332     /* take ownership, give the object to the iterator.  this is
1333        just to keep the field_name alive */
1334     Py_INCREF(self);
1335     it->str = self;
1336
1337     /* Pass in auto_number = NULL. We'll return an empty string for
1338        first_obj in that case. */
1339     if (!field_name_split(STRINGLIB_STR(self),
1340                           STRINGLIB_LEN(self),
1341                           &first, &first_idx, &it->it_field, NULL))
1342         goto done;
1343
1344     /* first becomes an integer, if possible; else a string */
1345     if (first_idx != -1)
1346         first_obj = PyLong_FromSsize_t(first_idx);
1347     else
1348         /* convert "first" into a string object */
1349         first_obj = SubString_new_object(&first);
1350     if (first_obj == NULL)
1351         goto done;
1352
1353     /* return a tuple of values */
1354     result = PyTuple_Pack(2, first_obj, it);
1355
1356 done:
1357     Py_XDECREF(it);
1358     Py_XDECREF(first_obj);
1359     return result;
1360 }