Modules/_json.c

   1 #include "Python.h"
   2
   3 #define DEFAULT_ENCODING "utf-8"
   4 #define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"')
   5 #define MIN_EXPANSION 6
   6
   7 #ifdef Py_UNICODE_WIDE
   8 #define MAX_EXPANSION (2 * MIN_EXPANSION)
   9 #else
  10 #define MAX_EXPANSION MIN_EXPANSION
  11 #endif
  12
  13 static Py_ssize_t
  14 ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
  15 {
  16     Py_UNICODE x;
  17     output[chars++] = '\\';
  18     switch (c) {
  19         case '\\': output[chars++] = (char)c; break;
  20         case '"': output[chars++] = (char)c; break;
  21         case '\b': output[chars++] = 'b'; break;
  22         case '\f': output[chars++] = 'f'; break;
  23         case '\n': output[chars++] = 'n'; break;
  24         case '\r': output[chars++] = 'r'; break;
  25         case '\t': output[chars++] = 't'; break;
  26         default:
  27 #ifdef Py_UNICODE_WIDE
  28             if (c >= 0x10000) {
  29                 /* UTF-16 surrogate pair */
  30                 Py_UNICODE v = c - 0x10000;
  31                 c = 0xd800 | ((v >> 10) & 0x3ff);
  32                 output[chars++] = 'u';
  33                 x = (c & 0xf000) >> 12;
  34                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  35                 x = (c & 0x0f00) >> 8;
  36                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  37                 x = (c & 0x00f0) >> 4;
  38                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  39                 x = (c & 0x000f);
  40                 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  41                 c = 0xdc00 | (v & 0x3ff);
  42                 output[chars++] = '\\';
  43             }
  44 #endif
  45             output[chars++] = 'u';
  46             x = (c & 0xf000) >> 12;
  47             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  48             x = (c & 0x0f00) >> 8;
  49             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  50             x = (c & 0x00f0) >> 4;
  51             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  52             x = (c & 0x000f);
  53             output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  54     }
  55     return chars;
  56 }
  57
  58 static PyObject *
  59 ascii_escape_unicode(PyObject *pystr)
  60 {
  61     Py_ssize_t i;
  62     Py_ssize_t input_chars;
  63     Py_ssize_t output_size;
  64     Py_ssize_t chars;
  65     PyObject *rval;
  66     char *output;
  67     Py_UNICODE *input_unicode;
  68
  69     input_chars = PyUnicode_GET_SIZE(pystr);
  70     input_unicode = PyUnicode_AS_UNICODE(pystr);
  71     /* One char input can be up to 6 chars output, estimate 4 of these */
  72     output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
  73     rval = PyString_FromStringAndSize(NULL, output_size);
  74     if (rval == NULL) {
  75         return NULL;
  76     }
  77     output = PyString_AS_STRING(rval);
  78     chars = 0;
  79     output[chars++] = '"';
  80     for (i = 0; i < input_chars; i++) {
  81         Py_UNICODE c = input_unicode[i];
  82         if (S_CHAR(c)) {
  83             output[chars++] = (char)c;
  84         }
  85         else {
  86             chars = ascii_escape_char(c, output, chars);
  87         }
  88         if (output_size - chars < (1 + MAX_EXPANSION)) {
  89             /* There's more than four, so let's resize by a lot */
  90             output_size *= 2;
  91             /* This is an upper bound */
  92             if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
  93                 output_size = 2 + (input_chars * MAX_EXPANSION);
  94             }
  95             if (_PyString_Resize(&rval, output_size) == -1) {
  96                 return NULL;
  97             }
  98             output = PyString_AS_STRING(rval);
  99         }
 100     }
 101     output[chars++] = '"';
 102     if (_PyString_Resize(&rval, chars) == -1) {
 103         return NULL;
 104     }
 105     return rval;
 106 }
 107
 108 static PyObject *
 109 ascii_escape_str(PyObject *pystr)
 110 {
 111     Py_ssize_t i;
 112     Py_ssize_t input_chars;
 113     Py_ssize_t output_size;
 114     Py_ssize_t chars;
 115     PyObject *rval;
 116     char *output;
 117     char *input_str;
 118
 119     input_chars = PyString_GET_SIZE(pystr);
 120     input_str = PyString_AS_STRING(pystr);
 121     /* One char input can be up to 6 chars output, estimate 4 of these */
 122     output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
 123     rval = PyString_FromStringAndSize(NULL, output_size);
 124     if (rval == NULL) {
 125         return NULL;
 126     }
 127     output = PyString_AS_STRING(rval);
 128     chars = 0;
 129     output[chars++] = '"';
 130     for (i = 0; i < input_chars; i++) {
 131         Py_UNICODE c = (Py_UNICODE)input_str[i];
 132         if (S_CHAR(c)) {
 133             output[chars++] = (char)c;
 134         }
 135         else if (c > 0x7F) {
 136             /* We hit a non-ASCII character, bail to unicode mode */
 137             PyObject *uni;
 138             Py_DECREF(rval);
 139             uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
 140             if (uni == NULL) {
 141                 return NULL;
 142             }
 143             rval = ascii_escape_unicode(uni);
 144             Py_DECREF(uni);
 145             return rval;
 146         }
 147         else {
 148             chars = ascii_escape_char(c, output, chars);
 149         }
 150         /* An ASCII char can't possibly expand to a surrogate! */
 151         if (output_size - chars < (1 + MIN_EXPANSION)) {
 152             /* There's more than four, so let's resize by a lot */
 153             output_size *= 2;
 154             if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
 155                 output_size = 2 + (input_chars * MIN_EXPANSION);
 156             }
 157             if (_PyString_Resize(&rval, output_size) == -1) {
 158                 return NULL;
 159             }
 160             output = PyString_AS_STRING(rval);
 161         }
 162     }
 163     output[chars++] = '"';
 164     if (_PyString_Resize(&rval, chars) == -1) {
 165         return NULL;
 166     }
 167     return rval;
 168 }
 169
 170 void
 171 raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
 172 {
 173     static PyObject *errmsg_fn = NULL;
 174     PyObject *pymsg;
 175     if (errmsg_fn == NULL) {
 176         PyObject *decoder = PyImport_ImportModule("json.decoder");
 177         if (decoder == NULL)
 178             return;
 179         errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
 180         if (errmsg_fn == NULL)
 181             return;
 182         Py_XDECREF(decoder);
 183     }
 184     pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
 185     PyErr_SetObject(PyExc_ValueError, pymsg);
 186     Py_DECREF(pymsg);
 187 /*
 188
 189 def linecol(doc, pos):
 190     lineno = doc.count('\n', 0, pos) + 1
 191     if lineno == 1:
 192         colno = pos
 193     else:
 194         colno = pos - doc.rindex('\n', 0, pos)
 195     return lineno, colno
 196
 197 def errmsg(msg, doc, pos, end=None):
 198     lineno, colno = linecol(doc, pos)
 199     if end is None:
 200         return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
 201     endlineno, endcolno = linecol(doc, end)
 202     return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
 203         msg, lineno, colno, endlineno, endcolno, pos, end)
 204
 205 */
 206 }
 207
 208 static PyObject *
 209 join_list_unicode(PyObject *lst)
 210 {
 211     static PyObject *ustr = NULL;
 212     static PyObject *joinstr = NULL;
 213     if (ustr == NULL) {
 214         Py_UNICODE c = 0;
 215         ustr = PyUnicode_FromUnicode(&c, 0);
 216     }
 217     if (joinstr == NULL) {
 218         joinstr = PyString_InternFromString("join");
 219     }
 220     if (joinstr == NULL || ustr == NULL) {
 221         return NULL;
 222     }
 223     return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
 224 }
 225
 226 static PyObject *
 227 scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
 228 {
 229     PyObject *rval;
 230     Py_ssize_t len = PyString_GET_SIZE(pystr);
 231     Py_ssize_t begin = end - 1;
 232     Py_ssize_t next = begin;
 233     char *buf = PyString_AS_STRING(pystr);
 234     PyObject *chunks = PyList_New(0);
 235     if (chunks == NULL) {
 236         goto bail;
 237     }
 238     while (1) {
 239         /* Find the end of the string or the next escape */
 240         Py_UNICODE c = 0;
 241         PyObject *chunk = NULL;
 242         for (next = end; next < len; next++) {
 243             c = buf[next];
 244             if (c == '"' || c == '\\') {
 245                 break;
 246             }
 247             else if (strict && c <= 0x1f) {
 248                 raise_errmsg("Invalid control character at", pystr, begin);
 249                 goto bail;
 250             }
 251         }
 252         if (!(c == '"' || c == '\\')) {
 253             raise_errmsg("Unterminated string starting at", pystr, begin);
 254             goto bail;
 255         }
 256         /* Pick up this chunk if it's not zero length */
 257         if (next != end) {
 258             PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
 259             if (strchunk == NULL) {
 260                 goto bail;
 261             }
 262             chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
 263             Py_DECREF(strchunk);
 264             if (chunk == NULL) {
 265                 goto bail;
 266             }
 267             if (PyList_Append(chunks, chunk)) {
 268                 goto bail;
 269             }
 270             Py_DECREF(chunk);
 271         }
 272         next++;
 273         if (c == '"') {
 274             end = next;
 275             break;
 276         }
 277         if (next == len) {
 278             raise_errmsg("Unterminated string starting at", pystr, begin);
 279             goto bail;
 280         }
 281         c = buf[next];
 282         if (c != 'u') {
 283             /* Non-unicode backslash escapes */
 284             end = next + 1;
 285             switch (c) {
 286                 case '"': break;
 287                 case '\\': break;
 288                 case '/': break;
 289                 case 'b': c = '\b'; break;
 290                 case 'f': c = '\f'; break;
 291                 case 'n': c = '\n'; break;
 292                 case 'r': c = '\r'; break;
 293                 case 't': c = '\t'; break;
 294                 default: c = 0;
 295             }
 296             if (c == 0) {
 297                 raise_errmsg("Invalid \\escape", pystr, end - 2);
 298                 goto bail;
 299             }
 300         }
 301         else {
 302             c = 0;
 303             next++;
 304             end = next + 4;
 305             if (end >= len) {
 306                 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
 307                 goto bail;
 308             }
 309             /* Decode 4 hex digits */
 310             for (; next < end; next++) {
 311                 Py_ssize_t shl = (end - next - 1) << 2;
 312                 Py_UNICODE digit = buf[next];
 313                 switch (digit) {
 314                     case '0': case '1': case '2': case '3': case '4':
 315                     case '5': case '6': case '7': case '8': case '9':
 316                         c |= (digit - '0') << shl; break;
 317                     case 'a': case 'b': case 'c': case 'd': case 'e':
 318                     case 'f':
 319                         c |= (digit - 'a' + 10) << shl; break;
 320                     case 'A': case 'B': case 'C': case 'D': case 'E':
 321                     case 'F':
 322                         c |= (digit - 'A' + 10) << shl; break;
 323                     default:
 324                         raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 325                         goto bail;
 326                 }
 327             }
 328 #ifdef Py_UNICODE_WIDE
 329             /* Surrogate pair */
 330             if (c >= 0xd800 && c <= 0xdbff) {
 331                 Py_UNICODE c2 = 0;
 332                 if (end + 6 >= len) {
 333                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 334                         end - 5);
 335                 }
 336                 if (buf[next++] != '\\' || buf[next++] != 'u') {
 337                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 338                         end - 5);
 339                 }
 340                 end += 6;
 341                 /* Decode 4 hex digits */
 342                 for (; next < end; next++) {
 343                     Py_ssize_t shl = (end - next - 1) << 2;
 344                     Py_UNICODE digit = buf[next];
 345                     switch (digit) {
 346                         case '0': case '1': case '2': case '3': case '4':
 347                         case '5': case '6': case '7': case '8': case '9':
 348                             c2 |= (digit - '0') << shl; break;
 349                         case 'a': case 'b': case 'c': case 'd': case 'e':
 350                         case 'f':
 351                             c2 |= (digit - 'a' + 10) << shl; break;
 352                         case 'A': case 'B': case 'C': case 'D': case 'E':
 353                         case 'F':
 354                             c2 |= (digit - 'A' + 10) << shl; break;
 355                         default:
 356                             raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 357                             goto bail;
 358                     }
 359                 }
 360                 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
 361             }
 362 #endif
 363         }
 364         chunk = PyUnicode_FromUnicode(&c, 1);
 365         if (chunk == NULL) {
 366             goto bail;
 367         }
 368         if (PyList_Append(chunks, chunk)) {
 369             goto bail;
 370         }
 371         Py_DECREF(chunk);
 372     }
 373
 374     rval = join_list_unicode(chunks);
 375     if (rval == NULL) {
 376         goto bail;
 377     }
 378     Py_DECREF(chunks);
 379     chunks = NULL;
 380     return Py_BuildValue("(Nn)", rval, end);
 381 bail:
 382     Py_XDECREF(chunks);
 383     return NULL;
 384 }
 385
 386
 387 static PyObject *
 388 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
 389 {
 390     PyObject *rval;
 391     Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
 392     Py_ssize_t begin = end - 1;
 393     Py_ssize_t next = begin;
 394     const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
 395     PyObject *chunks = PyList_New(0);
 396     if (chunks == NULL) {
 397         goto bail;
 398     }
 399     while (1) {
 400         /* Find the end of the string or the next escape */
 401         Py_UNICODE c = 0;
 402         PyObject *chunk = NULL;
 403         for (next = end; next < len; next++) {
 404             c = buf[next];
 405             if (c == '"' || c == '\\') {
 406                 break;
 407             }
 408             else if (strict && c <= 0x1f) {
 409                 raise_errmsg("Invalid control character at", pystr, begin);
 410                 goto bail;
 411             }
 412         }
 413         if (!(c == '"' || c == '\\')) {
 414             raise_errmsg("Unterminated string starting at", pystr, begin);
 415             goto bail;
 416         }
 417         /* Pick up this chunk if it's not zero length */
 418         if (next != end) {
 419             chunk = PyUnicode_FromUnicode(&buf[end], next - end);
 420             if (chunk == NULL) {
 421                 goto bail;
 422             }
 423             if (PyList_Append(chunks, chunk)) {
 424                 goto bail;
 425             }
 426             Py_DECREF(chunk);
 427         }
 428         next++;
 429         if (c == '"') {
 430             end = next;
 431             break;
 432         }
 433         if (next == len) {
 434             raise_errmsg("Unterminated string starting at", pystr, begin);
 435             goto bail;
 436         }
 437         c = buf[next];
 438         if (c != 'u') {
 439             /* Non-unicode backslash escapes */
 440             end = next + 1;
 441             switch (c) {
 442                 case '"': break;
 443                 case '\\': break;
 444                 case '/': break;
 445                 case 'b': c = '\b'; break;
 446                 case 'f': c = '\f'; break;
 447                 case 'n': c = '\n'; break;
 448                 case 'r': c = '\r'; break;
 449                 case 't': c = '\t'; break;
 450                 default: c = 0;
 451             }
 452             if (c == 0) {
 453                 raise_errmsg("Invalid \\escape", pystr, end - 2);
 454                 goto bail;
 455             }
 456         }
 457         else {
 458             c = 0;
 459             next++;
 460             end = next + 4;
 461             if (end >= len) {
 462                 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
 463                 goto bail;
 464             }
 465             /* Decode 4 hex digits */
 466             for (; next < end; next++) {
 467                 Py_ssize_t shl = (end - next - 1) << 2;
 468                 Py_UNICODE digit = buf[next];
 469                 switch (digit) {
 470                     case '0': case '1': case '2': case '3': case '4':
 471                     case '5': case '6': case '7': case '8': case '9':
 472                         c |= (digit - '0') << shl; break;
 473                     case 'a': case 'b': case 'c': case 'd': case 'e':
 474                     case 'f':
 475                         c |= (digit - 'a' + 10) << shl; break;
 476                     case 'A': case 'B': case 'C': case 'D': case 'E':
 477                     case 'F':
 478                         c |= (digit - 'A' + 10) << shl; break;
 479                     default:
 480                         raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 481                         goto bail;
 482                 }
 483             }
 484 #ifdef Py_UNICODE_WIDE
 485             /* Surrogate pair */
 486             if (c >= 0xd800 && c <= 0xdbff) {
 487                 Py_UNICODE c2 = 0;
 488                 if (end + 6 >= len) {
 489                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 490                         end - 5);
 491                 }
 492                 if (buf[next++] != '\\' || buf[next++] != 'u') {
 493                     raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
 494                         end - 5);
 495                 }
 496                 end += 6;
 497                 /* Decode 4 hex digits */
 498                 for (; next < end; next++) {
 499                     Py_ssize_t shl = (end - next - 1) << 2;
 500                     Py_UNICODE digit = buf[next];
 501                     switch (digit) {
 502                         case '0': case '1': case '2': case '3': case '4':
 503                         case '5': case '6': case '7': case '8': case '9':
 504                             c2 |= (digit - '0') << shl; break;
 505                         case 'a': case 'b': case 'c': case 'd': case 'e':
 506                         case 'f':
 507                             c2 |= (digit - 'a' + 10) << shl; break;
 508                         case 'A': case 'B': case 'C': case 'D': case 'E':
 509                         case 'F':
 510                             c2 |= (digit - 'A' + 10) << shl; break;
 511                         default:
 512                             raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
 513                             goto bail;
 514                     }
 515                 }
 516                 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
 517             }
 518 #endif
 519         }
 520         chunk = PyUnicode_FromUnicode(&c, 1);
 521         if (chunk == NULL) {
 522             goto bail;
 523         }
 524         if (PyList_Append(chunks, chunk)) {
 525             goto bail;
 526         }
 527         Py_DECREF(chunk);
 528     }
 529
 530     rval = join_list_unicode(chunks);
 531     if (rval == NULL) {
 532         goto bail;
 533     }
 534     Py_DECREF(chunks);
 535     chunks = NULL;
 536     return Py_BuildValue("(Nn)", rval, end);
 537 bail:
 538     Py_XDECREF(chunks);
 539     return NULL;
 540 }
 541
 542 PyDoc_STRVAR(pydoc_scanstring,
 543 "scanstring(basestring, end, encoding) -> (str, end)\n");
 544
 545 static PyObject *
 546 py_scanstring(PyObject* self, PyObject *args)
 547 {
 548     PyObject *pystr;
 549     Py_ssize_t end;
 550     char *encoding = NULL;
 551     int strict = 0;
 552     if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
 553         return NULL;
 554     }
 555     if (encoding == NULL) {
 556         encoding = DEFAULT_ENCODING;
 557     }
 558     if (PyString_Check(pystr)) {
 559         return scanstring_str(pystr, end, encoding, strict);
 560     }
 561     else if (PyUnicode_Check(pystr)) {
 562         return scanstring_unicode(pystr, end, strict);
 563     }
 564     else {
 565         PyErr_Format(PyExc_TypeError,
 566                      "first argument must be a string or unicode, not %.80s",
 567                      Py_TYPE(pystr)->tp_name);
 568         return NULL;
 569     }
 570 }
 571
 572 PyDoc_STRVAR(pydoc_encode_basestring_ascii,
 573 "encode_basestring_ascii(basestring) -> str\n");
 574
 575 static PyObject *
 576 py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
 577 {
 578     /* METH_O */
 579     if (PyString_Check(pystr)) {
 580         return ascii_escape_str(pystr);
 581     }
 582     else if (PyUnicode_Check(pystr)) {
 583         return ascii_escape_unicode(pystr);
 584     }
 585     else {
 586         PyErr_Format(PyExc_TypeError,
 587                      "first argument must be a string or unicode, not %.80s",
 588                      Py_TYPE(pystr)->tp_name);
 589         return NULL;
 590     }
 591 }
 592
 593 static PyMethodDef json_methods[] = {
 594     {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
 595      METH_O, pydoc_encode_basestring_ascii},
 596     {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
 597      pydoc_scanstring},
 598     {NULL, NULL, 0, NULL}
 599 };
 600
 601 PyDoc_STRVAR(module_doc,
 602 "json speedups\n");
 603
 604 void
 605 init_json(void)
 606 {
 607     PyObject *m;
 608     m = Py_InitModule3("_json", json_methods, module_doc);
 609 }