This reverts r63675 based on the discussion in this thread:
[python.git] / Modules / _json.c
blob ea6d66f60cac9e6a58c65228272a72ff69de2ff5
1 #include "Python.h"
/* Encoding used by py_scanstring when the caller passes no encoding. */
#define DEFAULT_ENCODING "utf-8"

/*
 * True when c can be copied into the output unescaped: the ASCII range
 * ' ' .. '~' with backslash and double quote excluded.
 * The argument is parenthesized so that any expression may be passed, but it
 * is still evaluated more than once: do not pass expressions with side
 * effects.
 */
#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')

/* Length of one "\uXXXX" escape sequence. */
#define MIN_EXPANSION 6

/*
 * Largest number of output characters a single input character can produce.
 * Wide unicode builds may emit a surrogate pair, i.e. two "\uXXXX" sequences.
 */
#ifdef Py_UNICODE_WIDE
#define MAX_EXPANSION (2 * MIN_EXPANSION)
#else
#define MAX_EXPANSION MIN_EXPANSION
#endif
13 static Py_ssize_t
14 ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
16 Py_UNICODE x;
17 output[chars++] = '\\';
18 switch (c) {
19 case '\\': output[chars++] = (char)c; break;
20 case '"': output[chars++] = (char)c; break;
21 case '\b': output[chars++] = 'b'; break;
22 case '\f': output[chars++] = 'f'; break;
23 case '\n': output[chars++] = 'n'; break;
24 case '\r': output[chars++] = 'r'; break;
25 case '\t': output[chars++] = 't'; break;
26 default:
27 #ifdef Py_UNICODE_WIDE
28 if (c >= 0x10000) {
29 /* UTF-16 surrogate pair */
30 Py_UNICODE v = c - 0x10000;
31 c = 0xd800 | ((v >> 10) & 0x3ff);
32 output[chars++] = 'u';
33 x = (c & 0xf000) >> 12;
34 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
35 x = (c & 0x0f00) >> 8;
36 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
37 x = (c & 0x00f0) >> 4;
38 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
39 x = (c & 0x000f);
40 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
41 c = 0xdc00 | (v & 0x3ff);
42 output[chars++] = '\\';
44 #endif
45 output[chars++] = 'u';
46 x = (c & 0xf000) >> 12;
47 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
48 x = (c & 0x0f00) >> 8;
49 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
50 x = (c & 0x00f0) >> 4;
51 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
52 x = (c & 0x000f);
53 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
55 return chars;
58 static PyObject *
59 ascii_escape_unicode(PyObject *pystr)
61 Py_ssize_t i;
62 Py_ssize_t input_chars;
63 Py_ssize_t output_size;
64 Py_ssize_t chars;
65 PyObject *rval;
66 char *output;
67 Py_UNICODE *input_unicode;
69 input_chars = PyUnicode_GET_SIZE(pystr);
70 input_unicode = PyUnicode_AS_UNICODE(pystr);
71 /* One char input can be up to 6 chars output, estimate 4 of these */
72 output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
73 rval = PyString_FromStringAndSize(NULL, output_size);
74 if (rval == NULL) {
75 return NULL;
77 output = PyString_AS_STRING(rval);
78 chars = 0;
79 output[chars++] = '"';
80 for (i = 0; i < input_chars; i++) {
81 Py_UNICODE c = input_unicode[i];
82 if (S_CHAR(c)) {
83 output[chars++] = (char)c;
85 else {
86 chars = ascii_escape_char(c, output, chars);
88 if (output_size - chars < (1 + MAX_EXPANSION)) {
89 /* There's more than four, so let's resize by a lot */
90 output_size *= 2;
91 /* This is an upper bound */
92 if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
93 output_size = 2 + (input_chars * MAX_EXPANSION);
95 if (_PyString_Resize(&rval, output_size) == -1) {
96 return NULL;
98 output = PyString_AS_STRING(rval);
101 output[chars++] = '"';
102 if (_PyString_Resize(&rval, chars) == -1) {
103 return NULL;
105 return rval;
108 static PyObject *
109 ascii_escape_str(PyObject *pystr)
111 Py_ssize_t i;
112 Py_ssize_t input_chars;
113 Py_ssize_t output_size;
114 Py_ssize_t chars;
115 PyObject *rval;
116 char *output;
117 char *input_str;
119 input_chars = PyString_GET_SIZE(pystr);
120 input_str = PyString_AS_STRING(pystr);
121 /* One char input can be up to 6 chars output, estimate 4 of these */
122 output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
123 rval = PyString_FromStringAndSize(NULL, output_size);
124 if (rval == NULL) {
125 return NULL;
127 output = PyString_AS_STRING(rval);
128 chars = 0;
129 output[chars++] = '"';
130 for (i = 0; i < input_chars; i++) {
131 Py_UNICODE c = (Py_UNICODE)input_str[i];
132 if (S_CHAR(c)) {
133 output[chars++] = (char)c;
135 else if (c > 0x7F) {
136 /* We hit a non-ASCII character, bail to unicode mode */
137 PyObject *uni;
138 Py_DECREF(rval);
139 uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
140 if (uni == NULL) {
141 return NULL;
143 rval = ascii_escape_unicode(uni);
144 Py_DECREF(uni);
145 return rval;
147 else {
148 chars = ascii_escape_char(c, output, chars);
150 /* An ASCII char can't possibly expand to a surrogate! */
151 if (output_size - chars < (1 + MIN_EXPANSION)) {
152 /* There's more than four, so let's resize by a lot */
153 output_size *= 2;
154 if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
155 output_size = 2 + (input_chars * MIN_EXPANSION);
157 if (_PyString_Resize(&rval, output_size) == -1) {
158 return NULL;
160 output = PyString_AS_STRING(rval);
163 output[chars++] = '"';
164 if (_PyString_Resize(&rval, chars) == -1) {
165 return NULL;
167 return rval;
170 void
171 raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
173 static PyObject *errmsg_fn = NULL;
174 PyObject *pymsg;
175 if (errmsg_fn == NULL) {
176 PyObject *decoder = PyImport_ImportModule("json.decoder");
177 if (decoder == NULL)
178 return;
179 errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
180 if (errmsg_fn == NULL)
181 return;
182 Py_XDECREF(decoder);
184 pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
185 PyErr_SetObject(PyExc_ValueError, pymsg);
186 Py_DECREF(pymsg);
189 def linecol(doc, pos):
190 lineno = doc.count('\n', 0, pos) + 1
191 if lineno == 1:
192 colno = pos
193 else:
194 colno = pos - doc.rindex('\n', 0, pos)
195 return lineno, colno
197 def errmsg(msg, doc, pos, end=None):
198 lineno, colno = linecol(doc, pos)
199 if end is None:
200 return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
201 endlineno, endcolno = linecol(doc, end)
202 return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
203 msg, lineno, colno, endlineno, endcolno, pos, end)
208 static PyObject *
209 join_list_unicode(PyObject *lst)
211 static PyObject *ustr = NULL;
212 static PyObject *joinstr = NULL;
213 if (ustr == NULL) {
214 Py_UNICODE c = 0;
215 ustr = PyUnicode_FromUnicode(&c, 0);
217 if (joinstr == NULL) {
218 joinstr = PyString_InternFromString("join");
220 if (joinstr == NULL || ustr == NULL) {
221 return NULL;
223 return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
226 static PyObject *
227 scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
229 PyObject *rval;
230 Py_ssize_t len = PyString_GET_SIZE(pystr);
231 Py_ssize_t begin = end - 1;
232 Py_ssize_t next = begin;
233 char *buf = PyString_AS_STRING(pystr);
234 PyObject *chunks = PyList_New(0);
235 if (chunks == NULL) {
236 goto bail;
238 while (1) {
239 /* Find the end of the string or the next escape */
240 Py_UNICODE c = 0;
241 PyObject *chunk = NULL;
242 for (next = end; next < len; next++) {
243 c = buf[next];
244 if (c == '"' || c == '\\') {
245 break;
247 else if (strict && c <= 0x1f) {
248 raise_errmsg("Invalid control character at", pystr, begin);
249 goto bail;
252 if (!(c == '"' || c == '\\')) {
253 raise_errmsg("Unterminated string starting at", pystr, begin);
254 goto bail;
256 /* Pick up this chunk if it's not zero length */
257 if (next != end) {
258 PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
259 if (strchunk == NULL) {
260 goto bail;
262 chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
263 Py_DECREF(strchunk);
264 if (chunk == NULL) {
265 goto bail;
267 if (PyList_Append(chunks, chunk)) {
268 goto bail;
270 Py_DECREF(chunk);
272 next++;
273 if (c == '"') {
274 end = next;
275 break;
277 if (next == len) {
278 raise_errmsg("Unterminated string starting at", pystr, begin);
279 goto bail;
281 c = buf[next];
282 if (c != 'u') {
283 /* Non-unicode backslash escapes */
284 end = next + 1;
285 switch (c) {
286 case '"': break;
287 case '\\': break;
288 case '/': break;
289 case 'b': c = '\b'; break;
290 case 'f': c = '\f'; break;
291 case 'n': c = '\n'; break;
292 case 'r': c = '\r'; break;
293 case 't': c = '\t'; break;
294 default: c = 0;
296 if (c == 0) {
297 raise_errmsg("Invalid \\escape", pystr, end - 2);
298 goto bail;
301 else {
302 c = 0;
303 next++;
304 end = next + 4;
305 if (end >= len) {
306 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
307 goto bail;
309 /* Decode 4 hex digits */
310 for (; next < end; next++) {
311 Py_ssize_t shl = (end - next - 1) << 2;
312 Py_UNICODE digit = buf[next];
313 switch (digit) {
314 case '0': case '1': case '2': case '3': case '4':
315 case '5': case '6': case '7': case '8': case '9':
316 c |= (digit - '0') << shl; break;
317 case 'a': case 'b': case 'c': case 'd': case 'e':
318 case 'f':
319 c |= (digit - 'a' + 10) << shl; break;
320 case 'A': case 'B': case 'C': case 'D': case 'E':
321 case 'F':
322 c |= (digit - 'A' + 10) << shl; break;
323 default:
324 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
325 goto bail;
328 #ifdef Py_UNICODE_WIDE
329 /* Surrogate pair */
330 if (c >= 0xd800 && c <= 0xdbff) {
331 Py_UNICODE c2 = 0;
332 if (end + 6 >= len) {
333 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
334 end - 5);
336 if (buf[next++] != '\\' || buf[next++] != 'u') {
337 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
338 end - 5);
340 end += 6;
341 /* Decode 4 hex digits */
342 for (; next < end; next++) {
343 Py_ssize_t shl = (end - next - 1) << 2;
344 Py_UNICODE digit = buf[next];
345 switch (digit) {
346 case '0': case '1': case '2': case '3': case '4':
347 case '5': case '6': case '7': case '8': case '9':
348 c2 |= (digit - '0') << shl; break;
349 case 'a': case 'b': case 'c': case 'd': case 'e':
350 case 'f':
351 c2 |= (digit - 'a' + 10) << shl; break;
352 case 'A': case 'B': case 'C': case 'D': case 'E':
353 case 'F':
354 c2 |= (digit - 'A' + 10) << shl; break;
355 default:
356 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
357 goto bail;
360 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
362 #endif
364 chunk = PyUnicode_FromUnicode(&c, 1);
365 if (chunk == NULL) {
366 goto bail;
368 if (PyList_Append(chunks, chunk)) {
369 goto bail;
371 Py_DECREF(chunk);
374 rval = join_list_unicode(chunks);
375 if (rval == NULL) {
376 goto bail;
378 Py_DECREF(chunks);
379 chunks = NULL;
380 return Py_BuildValue("(Nn)", rval, end);
381 bail:
382 Py_XDECREF(chunks);
383 return NULL;
387 static PyObject *
388 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
390 PyObject *rval;
391 Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
392 Py_ssize_t begin = end - 1;
393 Py_ssize_t next = begin;
394 const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
395 PyObject *chunks = PyList_New(0);
396 if (chunks == NULL) {
397 goto bail;
399 while (1) {
400 /* Find the end of the string or the next escape */
401 Py_UNICODE c = 0;
402 PyObject *chunk = NULL;
403 for (next = end; next < len; next++) {
404 c = buf[next];
405 if (c == '"' || c == '\\') {
406 break;
408 else if (strict && c <= 0x1f) {
409 raise_errmsg("Invalid control character at", pystr, begin);
410 goto bail;
413 if (!(c == '"' || c == '\\')) {
414 raise_errmsg("Unterminated string starting at", pystr, begin);
415 goto bail;
417 /* Pick up this chunk if it's not zero length */
418 if (next != end) {
419 chunk = PyUnicode_FromUnicode(&buf[end], next - end);
420 if (chunk == NULL) {
421 goto bail;
423 if (PyList_Append(chunks, chunk)) {
424 goto bail;
426 Py_DECREF(chunk);
428 next++;
429 if (c == '"') {
430 end = next;
431 break;
433 if (next == len) {
434 raise_errmsg("Unterminated string starting at", pystr, begin);
435 goto bail;
437 c = buf[next];
438 if (c != 'u') {
439 /* Non-unicode backslash escapes */
440 end = next + 1;
441 switch (c) {
442 case '"': break;
443 case '\\': break;
444 case '/': break;
445 case 'b': c = '\b'; break;
446 case 'f': c = '\f'; break;
447 case 'n': c = '\n'; break;
448 case 'r': c = '\r'; break;
449 case 't': c = '\t'; break;
450 default: c = 0;
452 if (c == 0) {
453 raise_errmsg("Invalid \\escape", pystr, end - 2);
454 goto bail;
457 else {
458 c = 0;
459 next++;
460 end = next + 4;
461 if (end >= len) {
462 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
463 goto bail;
465 /* Decode 4 hex digits */
466 for (; next < end; next++) {
467 Py_ssize_t shl = (end - next - 1) << 2;
468 Py_UNICODE digit = buf[next];
469 switch (digit) {
470 case '0': case '1': case '2': case '3': case '4':
471 case '5': case '6': case '7': case '8': case '9':
472 c |= (digit - '0') << shl; break;
473 case 'a': case 'b': case 'c': case 'd': case 'e':
474 case 'f':
475 c |= (digit - 'a' + 10) << shl; break;
476 case 'A': case 'B': case 'C': case 'D': case 'E':
477 case 'F':
478 c |= (digit - 'A' + 10) << shl; break;
479 default:
480 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
481 goto bail;
484 #ifdef Py_UNICODE_WIDE
485 /* Surrogate pair */
486 if (c >= 0xd800 && c <= 0xdbff) {
487 Py_UNICODE c2 = 0;
488 if (end + 6 >= len) {
489 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
490 end - 5);
492 if (buf[next++] != '\\' || buf[next++] != 'u') {
493 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
494 end - 5);
496 end += 6;
497 /* Decode 4 hex digits */
498 for (; next < end; next++) {
499 Py_ssize_t shl = (end - next - 1) << 2;
500 Py_UNICODE digit = buf[next];
501 switch (digit) {
502 case '0': case '1': case '2': case '3': case '4':
503 case '5': case '6': case '7': case '8': case '9':
504 c2 |= (digit - '0') << shl; break;
505 case 'a': case 'b': case 'c': case 'd': case 'e':
506 case 'f':
507 c2 |= (digit - 'a' + 10) << shl; break;
508 case 'A': case 'B': case 'C': case 'D': case 'E':
509 case 'F':
510 c2 |= (digit - 'A' + 10) << shl; break;
511 default:
512 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
513 goto bail;
516 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
518 #endif
520 chunk = PyUnicode_FromUnicode(&c, 1);
521 if (chunk == NULL) {
522 goto bail;
524 if (PyList_Append(chunks, chunk)) {
525 goto bail;
527 Py_DECREF(chunk);
530 rval = join_list_unicode(chunks);
531 if (rval == NULL) {
532 goto bail;
534 Py_DECREF(chunks);
535 chunks = NULL;
536 return Py_BuildValue("(Nn)", rval, end);
537 bail:
538 Py_XDECREF(chunks);
539 return NULL;
542 PyDoc_STRVAR(pydoc_scanstring,
543 "scanstring(basestring, end, encoding) -> (str, end)\n");
545 static PyObject *
546 py_scanstring(PyObject* self, PyObject *args)
548 PyObject *pystr;
549 Py_ssize_t end;
550 char *encoding = NULL;
551 int strict = 0;
552 if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
553 return NULL;
555 if (encoding == NULL) {
556 encoding = DEFAULT_ENCODING;
558 if (PyString_Check(pystr)) {
559 return scanstring_str(pystr, end, encoding, strict);
561 else if (PyUnicode_Check(pystr)) {
562 return scanstring_unicode(pystr, end, strict);
564 else {
565 PyErr_Format(PyExc_TypeError,
566 "first argument must be a string or unicode, not %.80s",
567 Py_TYPE(pystr)->tp_name);
568 return NULL;
572 PyDoc_STRVAR(pydoc_encode_basestring_ascii,
573 "encode_basestring_ascii(basestring) -> str\n");
575 static PyObject *
576 py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
578 /* METH_O */
579 if (PyString_Check(pystr)) {
580 return ascii_escape_str(pystr);
582 else if (PyUnicode_Check(pystr)) {
583 return ascii_escape_unicode(pystr);
585 else {
586 PyErr_Format(PyExc_TypeError,
587 "first argument must be a string or unicode, not %.80s",
588 Py_TYPE(pystr)->tp_name);
589 return NULL;
593 static PyMethodDef json_methods[] = {
594 {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
595 METH_O, pydoc_encode_basestring_ascii},
596 {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
597 pydoc_scanstring},
598 {NULL, NULL, 0, NULL}
601 PyDoc_STRVAR(module_doc,
602 "json speedups\n");
604 void
605 init_json(void)
607 PyObject *m;
608 m = Py_InitModule3("_json", json_methods, module_doc);