Modules/_json.c

   1 #include "Python.h"
   2
   3 #define DEFAULT_ENCODING "utf-8"
   4 #define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"')
   5 #define MIN_EXPANSION 6
   6
   7 #ifdef Py_UNICODE_WIDE
   8 #define MAX_EXPANSION (2 * MIN_EXPANSION)
   9 #else
  10 #define MAX_EXPANSION MIN_EXPANSION
  11 #endif
  12
  13 static Py_ssize_t
  14 ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
  15 {
  16     Py_UNICODE x;
  17     output[chars++] = '\\';
  18     switch (c) {
  19         case '\\': output[chars++] = (char)c; break;
  20         case '"': output[chars++] = (char)c; break;
  21         case '\b': output[chars++] = 'b'; break;
  22         case '\f': output[chars++] = 'f'; break;
  23         case '\n': output[chars++] = 'n'; break;
  24         case '\r': output[chars++] = 'r'; break;
  25         case '\t': output[chars++] = 't'; break;
  26         default:
  27 #ifdef Py_UNICODE_WIDE
  28             if (c >= 0x10000) {
  29                 /* UTF-16 surrogate pair */
  30                 Py_UNICODE v = c - 0x10000;
  31                 c = 0xd800 | ((v >> 10) & 0x3ff);
  32                 output[chars++] = 'u';
  33                 x = (c & 0xf000) >> 12;
  34                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  35                 x = (c & 0x0f00) >> 8;
  36                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  37                 x = (c & 0x00f0) >> 4;
  38                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  39                 x = (c & 0x000f);
  40                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  41                 c = 0xdc00 | (v & 0x3ff);
  42                 output[chars++] = '\\';
  43             }
  44 #endif
  45             output[chars++] = 'u';
  46             x = (c & 0xf000) >> 12;
  47             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  48             x = (c & 0x0f00) >> 8;
  49             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  50             x = (c & 0x00f0) >> 4;
  51             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  52             x = (c & 0x000f);
  53             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  54     }
  55     return chars;
  56 }
  57
  58 static PyObject *
  59 ascii_escape_unicode(PyObject *pystr)
  60 {
  61     Py_ssize_t i;
  62     Py_ssize_t input_chars;
  63     Py_ssize_t output_size;
  64     Py_ssize_t chars;
  65     PyObject *rval;
  66     char *output;
  67     Py_UNICODE *input_unicode;
  68
  69     input_chars = PyUnicode_GET_SIZE(pystr);
  70     input_unicode = PyUnicode_AS_UNICODE(pystr);
  71     /* One char input can be up to 6 chars output, estimate 4 of these */
  72     output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
  73     rval = PyString_FromStringAndSize(NULL, output_size);
  74     if (rval == NULL) {
  75         return NULL;
  76     }
  77     output = PyString_AS_STRING(rval);
  78     chars = 0;
  79     output[chars++] = '"';
  80     for (i = 0; i < input_chars; i++) {
  81         Py_UNICODE c = input_unicode[i];
  82         if (S_CHAR(c)) {
  83             output[chars++] = (char)c;
  84         }
  85         else {
  86             chars = ascii_escape_char(c, output, chars);
  87         }
  88         if (output_size - chars < (1 + MAX_EXPANSION)) {
  89             /* There's more than four, so let's resize by a lot */
  90             output_size *= 2;
  91             /* This is an upper bound */
  92             if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
  93                 output_size = 2 + (input_chars * MAX_EXPANSION);
  94             }
  95             if (_PyString_Resize(&rval, output_size) == -1) {
  96                 return NULL;
  97             }
  98             output = PyString_AS_STRING(rval);
  99         }
 100     }
 101     output[chars++] = '"';
 102     if (_PyString_Resize(&rval, chars) == -1) {
 103         return NULL;
 104     }
 105     return rval;
 106 }
 107
 108 static PyObject *
 109 ascii_escape_str(PyObject *pystr)
 110 {
 111     Py_ssize_t i;
 112     Py_ssize_t input_chars;
 113     Py_ssize_t output_size;
 114     Py_ssize_t chars;
 115     PyObject *rval;
 116     char *output;
 117     char *input_str;
 118
 119     input_chars = PyString_GET_SIZE(pystr);
 120     input_str = PyString_AS_STRING(pystr);
 121     /* One char input can be up to 6 chars output, estimate 4 of these */
 122     output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
 123     rval = PyString_FromStringAndSize(NULL, output_size);
 124     if (rval == NULL) {
 125         return NULL;
 126     }
 127     output = PyString_AS_STRING(rval);
 128     chars = 0;
 129     output[chars++] = '"';
 130     for (i = 0; i < input_chars; i++) {
 131         Py_UNICODE c = (Py_UNICODE)input_str[i];
 132         if (S_CHAR(c)) {
 133             output[chars++] = (char)c;
 134         }
 135         else if (c > 0x7F) {
 136             /* We hit a non-ASCII character, bail to unicode mode */
 137             PyObject *uni;
 138             Py_DECREF(rval);
 139             uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
 140             if (uni == NULL) {
 141                 return NULL;
 142             }
 143             rval = ascii_escape_unicode(uni);
 144             Py_DECREF(uni);
 145             return rval;
 146         }
 147         else {
 148             chars = ascii_escape_char(c, output, chars);
 149         }
 150         /* An ASCII char can't possibly expand to a surrogate! */
 151         if (output_size - chars < (1 + MIN_EXPANSION)) {
 152             /* There's more than four, so let's resize by a lot */
 153             output_size *= 2;
 154             if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
 155                 output_size = 2 + (input_chars * MIN_EXPANSION);
 156             }
 157             if (_PyString_Resize(&rval, output_size) == -1) {
 158                 return NULL;
 159             }
 160             output = PyString_AS_STRING(rval);
 161         }
 162     }
 163     output[chars++] = '"';
 164     if (_PyString_Resize(&rval, chars) == -1) {
 165         return NULL;
 166     }
 167     return rval;
 168 }
 169
 170 void
 171 raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
 172 {
 173     static PyObject *errmsg_fn = NULL;
 174     PyObject *pymsg;
 175     if (errmsg_fn == NULL) {
 176         PyObject *decoder = PyImport_ImportModule("json.decoder");
 177         if (decoder == NULL)
 178             return;
 179         errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
 180         if (errmsg_fn == NULL)
 181             return;
 182         Py_DECREF(decoder);
 183     }
 184     pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
 185     if (pymsg) {
 186         PyErr_SetObject(PyExc_ValueError, pymsg);
 187         Py_DECREF(pymsg);
 188     }
 189 /*
 190
 191 def linecol(doc, pos):
 192     lineno = doc.count('\n', 0, pos) + 1
 193     if lineno == 1:
 194         colno = pos
 195     else:
 196         colno = pos - doc.rindex('\n', 0, pos)
 197     return lineno, colno
 198
 199 def errmsg(msg, doc, pos, end=None):
 200     lineno, colno = linecol(doc, pos)
 201     if end is None:
 202         return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
 203     endlineno, endcolno = linecol(doc, end)
 204     return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
 205         msg, lineno, colno, endlineno, endcolno, pos, end)
 206
 207 */
 208 }
 209
 210 static PyObject *
 211 join_list_unicode(PyObject *lst)
 212 {
 213     static PyObject *ustr = NULL;
 214     static PyObject *joinstr = NULL;
 215     if (ustr == NULL) {
 216         Py_UNICODE c = 0;
 217         ustr = PyUnicode_FromUnicode(&c, 0);
 218     }
 219     if (joinstr == NULL) {
 220         joinstr = PyString_InternFromString("join");
 221     }
 222     if (joinstr == NULL || ustr == NULL) {
 223         return NULL;
 224     }
 225     return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
 226 }
 227
 228 static PyObject *
 229 scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
 230 {
 231     PyObject *rval;
 232     Py_ssize_t len = PyString_GET_SIZE(pystr);
 233     Py_ssize_t begin = end - 1;
 234     Py_ssize_t next = begin;
 235     char *buf = PyString_AS_STRING(pystr);
 236     PyObject *chunks = PyList_New(0);
 237     if (chunks == NULL) {
 238         goto bail;
 239     }
 240     if (end < 0 || len <= end) {
 241         PyErr_SetString(PyExc_ValueError, "end is out of bounds");
 242         goto bail;
 243     }
 244     while (1) {
 245         /* Find the end of the string or the next escape */
 246         Py_UNICODE c = 0;
 247         PyObject *chunk = NULL;
 248         for (next = end; next < len; next++) {
 249             c = buf[next];
 250             if (c == '"' || c == '\\') {
 251                 break;
 252             }
 253             else if (strict && c <= 0x1f) {
 254                 raise_errmsg("Invalid control character at", pystr, next);
 255                 goto bail;
 256             }
 257         }
 258         if (!(c == '"' || c == '\\')) {
 259             raise_errmsg("Unterminated string starting at", pystr, begin);
 260             goto bail;
 261         }
 262         /* Pick up this chunk if it's not zero length */
 263         if (next != end) {
 264             PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
 265             if (strchunk == NULL) {
 266                 goto bail;
 267             }
 268             chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
 269             Py_DECREF(strchunk);
 270             if (chunk == NULL) {
 271                 goto bail;
 272             }
 273             if (PyList_Append(chunks, chunk)) {
 274                 Py_DECREF(chunk);
 275                 goto bail;
 276             }
 277             Py_DECREF(chunk);
 278         }
 279         next++;
 280         if (c == '"') {
 281             end = next;
 282             break;
 283         }
 284         if (next == len) {
 285             raise_errmsg("Unterminated string starting at", pystr, begin);
 286             goto bail;
 287         }
 288         c = buf[next];
 289         if (c != 'u') {
 290             /* Non-unicode backslash escapes */
 291             end = next + 1;
 292             switch (c) {
 293                 case '"': break;
 294                 case '\\': break;
 295                 case '/': break;
 296                 case 'b': c = '\b'; break;
 297                 case 'f': c = '\f'; break;
 298                 case 'n': c = '\n'; break;
 299                 case 'r': c = '\r'; break;
 300                 case 't': c = '\t'; break;
 301                 default: c = 0;
 302             }
 303             if (c == 0) {
 304                 raise_errmsg("Invalid \\escape", pystr, end - 2);
 305                 goto bail;
 306             }
 307         }
 308         else {
 309             c = 0;
 310             next++;
 311             end = next + 4;
 312             if (end >= len) {
 313                 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
 314                 goto bail;
 315             }
 316             /* Decode 4 hex digits */
 317             for (; next < end; next++) {
 318                 Py_ssize_t shl = (end - next - 1) << 2;
 319                 Py_UNICODE digit = buf[next];
 320                 switch (digit) {
 321                     case '0': case '1': case '2': case '3': case '4':
 322                     case '5': case '6': case '7': case '8': case '9':
 323                         c |= (digit - '0') << shl; break;
 324                     case 'a': case 'b': case 'c': case 'd': case 'e':
 325                     case 'f':
 326                         c |= (digit - 'a' + 10) << shl; break;
 327                     case 'A': case 'B': case 'C': case 'D': case 'E':
 328                     case 'F':
 329                         c |= (digit - 'A' + 10) << shl; break;
 330                     default:
 331                         raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 332                         goto bail;
 333                 }
 334             }
 335 #ifdef Py_UNICODE_WIDE
 336             /* Surrogate pair */
 337             if (c >= 0xd800 && c <= 0xdbff) {
 338                 Py_UNICODE c2 = 0;
 339                 if (end + 6 >= len) {
 340                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 341                         end - 5);
 342                 }
 343                 if (buf[next++] != '\\' || buf[next++] != 'u') {
 344                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 345                         end - 5);
 346                 }
 347                 end += 6;
 348                 /* Decode 4 hex digits */
 349                 for (; next < end; next++) {
 350                     Py_ssize_t shl = (end - next - 1) << 2;
 351                     Py_UNICODE digit = buf[next];
 352                     switch (digit) {
 353                         case '0': case '1': case '2': case '3': case '4':
 354                         case '5': case '6': case '7': case '8': case '9':
 355                             c2 |= (digit - '0') << shl; break;
 356                         case 'a': case 'b': case 'c': case 'd': case 'e':
 357                         case 'f':
 358                             c2 |= (digit - 'a' + 10) << shl; break;
 359                         case 'A': case 'B': case 'C': case 'D': case 'E':
 360                         case 'F':
 361                             c2 |= (digit - 'A' + 10) << shl; break;
 362                         default:
 363                             raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 364                             goto bail;
 365                     }
 366                 }
 367                 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
 368             }
 369 #endif
 370         }
 371         chunk = PyUnicode_FromUnicode(&c, 1);
 372         if (chunk == NULL) {
 373             goto bail;
 374         }
 375         if (PyList_Append(chunks, chunk)) {
 376             Py_DECREF(chunk);
 377             goto bail;
 378         }
 379         Py_DECREF(chunk);
 380     }
 381
 382     rval = join_list_unicode(chunks);
 383     if (rval == NULL) {
 384         goto bail;
 385     }
 386     Py_CLEAR(chunks);
 387     return Py_BuildValue("(Nn)", rval, end);
 388 bail:
 389     Py_XDECREF(chunks);
 390     return NULL;
 391 }
 392
 393
 394 static PyObject *
 395 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
 396 {
 397     PyObject *rval;
 398     Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
 399     Py_ssize_t begin = end - 1;
 400     Py_ssize_t next = begin;
 401     const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
 402     PyObject *chunks = PyList_New(0);
 403     if (chunks == NULL) {
 404         goto bail;
 405     }
 406     if (end < 0 || len <= end) {
 407         PyErr_SetString(PyExc_ValueError, "end is out of bounds");
 408         goto bail;
 409     }
 410     while (1) {
 411         /* Find the end of the string or the next escape */
 412         Py_UNICODE c = 0;
 413         PyObject *chunk = NULL;
 414         for (next = end; next < len; next++) {
 415             c = buf[next];
 416             if (c == '"' || c == '\\') {
 417                 break;
 418             }
 419             else if (strict && c <= 0x1f) {
 420                 raise_errmsg("Invalid control character at", pystr, next);
 421                 goto bail;
 422             }
 423         }
 424         if (!(c == '"' || c == '\\')) {
 425             raise_errmsg("Unterminated string starting at", pystr, begin);
 426             goto bail;
 427         }
 428         /* Pick up this chunk if it's not zero length */
 429         if (next != end) {
 430             chunk = PyUnicode_FromUnicode(&buf[end], next - end);
 431             if (chunk == NULL) {
 432                 goto bail;
 433             }
 434             if (PyList_Append(chunks, chunk)) {
 435                 Py_DECREF(chunk);
 436                 goto bail;
 437             }
 438             Py_DECREF(chunk);
 439         }
 440         next++;
 441         if (c == '"') {
 442             end = next;
 443             break;
 444         }
 445         if (next == len) {
 446             raise_errmsg("Unterminated string starting at", pystr, begin);
 447             goto bail;
 448         }
 449         c = buf[next];
 450         if (c != 'u') {
 451             /* Non-unicode backslash escapes */
 452             end = next + 1;
 453             switch (c) {
 454                 case '"': break;
 455                 case '\\': break;
 456                 case '/': break;
 457                 case 'b': c = '\b'; break;
 458                 case 'f': c = '\f'; break;
 459                 case 'n': c = '\n'; break;
 460                 case 'r': c = '\r'; break;
 461                 case 't': c = '\t'; break;
 462                 default: c = 0;
 463             }
 464             if (c == 0) {
 465                 raise_errmsg("Invalid \\escape", pystr, end - 2);
 466                 goto bail;
 467             }
 468         }
 469         else {
 470             c = 0;
 471             next++;
 472             end = next + 4;
 473             if (end >= len) {
 474                 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
 475                 goto bail;
 476             }
 477             /* Decode 4 hex digits */
 478             for (; next < end; next++) {
 479                 Py_ssize_t shl = (end - next - 1) << 2;
 480                 Py_UNICODE digit = buf[next];
 481                 switch (digit) {
 482                     case '0': case '1': case '2': case '3': case '4':
 483                     case '5': case '6': case '7': case '8': case '9':
 484                         c |= (digit - '0') << shl; break;
 485                     case 'a': case 'b': case 'c': case 'd': case 'e':
 486                     case 'f':
 487                         c |= (digit - 'a' + 10) << shl; break;
 488                     case 'A': case 'B': case 'C': case 'D': case 'E':
 489                     case 'F':
 490                         c |= (digit - 'A' + 10) << shl; break;
 491                     default:
 492                         raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 493                         goto bail;
 494                 }
 495             }
 496 #ifdef Py_UNICODE_WIDE
 497             /* Surrogate pair */
 498             if (c >= 0xd800 && c <= 0xdbff) {
 499                 Py_UNICODE c2 = 0;
 500                 if (end + 6 >= len) {
 501                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 502                         end - 5);
 503                 }
 504                 if (buf[next++] != '\\' || buf[next++] != 'u') {
 505                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 506                         end - 5);
 507                 }
 508                 end += 6;
 509                 /* Decode 4 hex digits */
 510                 for (; next < end; next++) {
 511                     Py_ssize_t shl = (end - next - 1) << 2;
 512                     Py_UNICODE digit = buf[next];
 513                     switch (digit) {
 514                         case '0': case '1': case '2': case '3': case '4':
 515                         case '5': case '6': case '7': case '8': case '9':
 516                             c2 |= (digit - '0') << shl; break;
 517                         case 'a': case 'b': case 'c': case 'd': case 'e':
 518                         case 'f':
 519                             c2 |= (digit - 'a' + 10) << shl; break;
 520                         case 'A': case 'B': case 'C': case 'D': case 'E':
 521                         case 'F':
 522                             c2 |= (digit - 'A' + 10) << shl; break;
 523                         default:
 524                             raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 525                             goto bail;
 526                     }
 527                 }
 528                 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
 529             }
 530 #endif
 531         }
 532         chunk = PyUnicode_FromUnicode(&c, 1);
 533         if (chunk == NULL) {
 534             goto bail;
 535         }
 536         if (PyList_Append(chunks, chunk)) {
 537             Py_DECREF(chunk);
 538             goto bail;
 539         }
 540         Py_DECREF(chunk);
 541     }
 542
 543     rval = join_list_unicode(chunks);
 544     if (rval == NULL) {
 545         goto bail;
 546     }
 547     Py_CLEAR(chunks);
 548     return Py_BuildValue("(Nn)", rval, end);
 549 bail:
 550     Py_XDECREF(chunks);
 551     return NULL;
 552 }
 553
 554 PyDoc_STRVAR(pydoc_scanstring,
 555 "scanstring(basestring, end, encoding) -> (str, end)\n");
 556
 557 static PyObject *
 558 py_scanstring(PyObject* self, PyObject *args)
 559 {
 560     PyObject *pystr;
 561     Py_ssize_t end;
 562     char *encoding = NULL;
 563     int strict = 0;
 564     if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
 565         return NULL;
 566     }
 567     if (encoding == NULL) {
 568         encoding = DEFAULT_ENCODING;
 569     }
 570     if (PyString_Check(pystr)) {
 571         return scanstring_str(pystr, end, encoding, strict);
 572     }
 573     else if (PyUnicode_Check(pystr)) {
 574         return scanstring_unicode(pystr, end, strict);
 575     }
 576     else {
 577         PyErr_Format(PyExc_TypeError,
 578                      "first argument must be a string or unicode, not %.80s",
 579                      Py_TYPE(pystr)->tp_name);
 580         return NULL;
 581     }
 582 }
 583
 584 PyDoc_STRVAR(pydoc_encode_basestring_ascii,
 585 "encode_basestring_ascii(basestring) -> str\n");
 586
 587 static PyObject *
 588 py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
 589 {
 590     /* METH_O */
 591     if (PyString_Check(pystr)) {
 592         return ascii_escape_str(pystr);
 593     }
 594     else if (PyUnicode_Check(pystr)) {
 595         return ascii_escape_unicode(pystr);
 596     }
 597     else {
 598         PyErr_Format(PyExc_TypeError,
 599                      "first argument must be a string or unicode, not %.80s",
 600                      Py_TYPE(pystr)->tp_name);
 601         return NULL;
 602     }
 603 }
 604
 605 static PyMethodDef json_methods[] = {
 606     {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
 607      METH_O, pydoc_encode_basestring_ascii},
 608     {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
 609      pydoc_scanstring},
 610     {NULL, NULL, 0, NULL}
 611 };
 612
 613 PyDoc_STRVAR(module_doc,
 614 "json speedups\n");
 615
 616 void
 617 init_json(void)
 618 {
 619     PyObject *m;
 620     m = Py_InitModule3("_json", json_methods, module_doc);
 621 }