3 #define DEFAULT_ENCODING "utf-8"
4 #define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"')
5 #define MIN_EXPANSION 6
8 #define MAX_EXPANSION (2 * MIN_EXPANSION)
10 #define MAX_EXPANSION MIN_EXPANSION
14 ascii_escape_char(Py_UNICODE c
, char *output
, Py_ssize_t chars
)
17 output
[chars
++] = '\\';
19 case '\\': output
[chars
++] = (char)c
; break;
20 case '"': output
[chars
++] = (char)c
; break;
21 case '\b': output
[chars
++] = 'b'; break;
22 case '\f': output
[chars
++] = 'f'; break;
23 case '\n': output
[chars
++] = 'n'; break;
24 case '\r': output
[chars
++] = 'r'; break;
25 case '\t': output
[chars
++] = 't'; break;
27 #ifdef Py_UNICODE_WIDE
29 /* UTF-16 surrogate pair */
30 Py_UNICODE v
= c
- 0x10000;
31 c
= 0xd800 | ((v
>> 10) & 0x3ff);
32 output
[chars
++] = 'u';
33 x
= (c
& 0xf000) >> 12;
34 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
35 x
= (c
& 0x0f00) >> 8;
36 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
37 x
= (c
& 0x00f0) >> 4;
38 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
40 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
41 c
= 0xdc00 | (v
& 0x3ff);
42 output
[chars
++] = '\\';
45 output
[chars
++] = 'u';
46 x
= (c
& 0xf000) >> 12;
47 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
48 x
= (c
& 0x0f00) >> 8;
49 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
50 x
= (c
& 0x00f0) >> 4;
51 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
53 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
59 ascii_escape_unicode(PyObject
*pystr
)
62 Py_ssize_t input_chars
;
63 Py_ssize_t output_size
;
67 Py_UNICODE
*input_unicode
;
69 input_chars
= PyUnicode_GET_SIZE(pystr
);
70 input_unicode
= PyUnicode_AS_UNICODE(pystr
);
71 /* One char input can be up to 6 chars output, estimate 4 of these */
72 output_size
= 2 + (MIN_EXPANSION
* 4) + input_chars
;
73 rval
= PyString_FromStringAndSize(NULL
, output_size
);
77 output
= PyString_AS_STRING(rval
);
79 output
[chars
++] = '"';
80 for (i
= 0; i
< input_chars
; i
++) {
81 Py_UNICODE c
= input_unicode
[i
];
83 output
[chars
++] = (char)c
;
86 chars
= ascii_escape_char(c
, output
, chars
);
88 if (output_size
- chars
< (1 + MAX_EXPANSION
)) {
89 /* There's more than four, so let's resize by a lot */
91 /* This is an upper bound */
92 if (output_size
> 2 + (input_chars
* MAX_EXPANSION
)) {
93 output_size
= 2 + (input_chars
* MAX_EXPANSION
);
95 if (_PyString_Resize(&rval
, output_size
) == -1) {
98 output
= PyString_AS_STRING(rval
);
101 output
[chars
++] = '"';
102 if (_PyString_Resize(&rval
, chars
) == -1) {
109 ascii_escape_str(PyObject
*pystr
)
112 Py_ssize_t input_chars
;
113 Py_ssize_t output_size
;
119 input_chars
= PyString_GET_SIZE(pystr
);
120 input_str
= PyString_AS_STRING(pystr
);
121 /* One char input can be up to 6 chars output, estimate 4 of these */
122 output_size
= 2 + (MIN_EXPANSION
* 4) + input_chars
;
123 rval
= PyString_FromStringAndSize(NULL
, output_size
);
127 output
= PyString_AS_STRING(rval
);
129 output
[chars
++] = '"';
130 for (i
= 0; i
< input_chars
; i
++) {
131 Py_UNICODE c
= (Py_UNICODE
)input_str
[i
];
133 output
[chars
++] = (char)c
;
136 /* We hit a non-ASCII character, bail to unicode mode */
139 uni
= PyUnicode_DecodeUTF8(input_str
, input_chars
, "strict");
143 rval
= ascii_escape_unicode(uni
);
148 chars
= ascii_escape_char(c
, output
, chars
);
150 /* An ASCII char can't possibly expand to a surrogate! */
151 if (output_size
- chars
< (1 + MIN_EXPANSION
)) {
152 /* There's more than four, so let's resize by a lot */
154 if (output_size
> 2 + (input_chars
* MIN_EXPANSION
)) {
155 output_size
= 2 + (input_chars
* MIN_EXPANSION
);
157 if (_PyString_Resize(&rval
, output_size
) == -1) {
160 output
= PyString_AS_STRING(rval
);
163 output
[chars
++] = '"';
164 if (_PyString_Resize(&rval
, chars
) == -1) {
171 raise_errmsg(char *msg
, PyObject
*s
, Py_ssize_t end
)
173 static PyObject
*errmsg_fn
= NULL
;
175 if (errmsg_fn
== NULL
) {
176 PyObject
*decoder
= PyImport_ImportModule("json.decoder");
179 errmsg_fn
= PyObject_GetAttrString(decoder
, "errmsg");
180 if (errmsg_fn
== NULL
)
184 pymsg
= PyObject_CallFunction(errmsg_fn
, "(zOn)", msg
, s
, end
);
186 PyErr_SetObject(PyExc_ValueError
, pymsg
);
191 def linecol(doc, pos):
192 lineno = doc.count('\n', 0, pos) + 1
196 colno = pos - doc.rindex('\n', 0, pos)
199 def errmsg(msg, doc, pos, end=None):
200 lineno, colno = linecol(doc, pos)
202 return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
203 endlineno, endcolno = linecol(doc, end)
204 return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
205 msg, lineno, colno, endlineno, endcolno, pos, end)
211 join_list_unicode(PyObject
*lst
)
213 static PyObject
*ustr
= NULL
;
214 static PyObject
*joinstr
= NULL
;
217 ustr
= PyUnicode_FromUnicode(&c
, 0);
219 if (joinstr
== NULL
) {
220 joinstr
= PyString_InternFromString("join");
222 if (joinstr
== NULL
|| ustr
== NULL
) {
225 return PyObject_CallMethodObjArgs(ustr
, joinstr
, lst
, NULL
);
229 scanstring_str(PyObject
*pystr
, Py_ssize_t end
, char *encoding
, int strict
)
232 Py_ssize_t len
= PyString_GET_SIZE(pystr
);
233 Py_ssize_t begin
= end
- 1;
234 Py_ssize_t next
= begin
;
235 char *buf
= PyString_AS_STRING(pystr
);
236 PyObject
*chunks
= PyList_New(0);
237 if (chunks
== NULL
) {
240 if (end
< 0 || len
<= end
) {
241 PyErr_SetString(PyExc_ValueError
, "end is out of bounds");
245 /* Find the end of the string or the next escape */
247 PyObject
*chunk
= NULL
;
248 for (next
= end
; next
< len
; next
++) {
250 if (c
== '"' || c
== '\\') {
253 else if (strict
&& c
<= 0x1f) {
254 raise_errmsg("Invalid control character at", pystr
, next
);
258 if (!(c
== '"' || c
== '\\')) {
259 raise_errmsg("Unterminated string starting at", pystr
, begin
);
262 /* Pick up this chunk if it's not zero length */
264 PyObject
*strchunk
= PyBuffer_FromMemory(&buf
[end
], next
- end
);
265 if (strchunk
== NULL
) {
268 chunk
= PyUnicode_FromEncodedObject(strchunk
, encoding
, NULL
);
273 if (PyList_Append(chunks
, chunk
)) {
285 raise_errmsg("Unterminated string starting at", pystr
, begin
);
290 /* Non-unicode backslash escapes */
296 case 'b': c
= '\b'; break;
297 case 'f': c
= '\f'; break;
298 case 'n': c
= '\n'; break;
299 case 'r': c
= '\r'; break;
300 case 't': c
= '\t'; break;
304 raise_errmsg("Invalid \\escape", pystr
, end
- 2);
313 raise_errmsg("Invalid \\uXXXX escape", pystr
, next
- 1);
316 /* Decode 4 hex digits */
317 for (; next
< end
; next
++) {
318 Py_ssize_t shl
= (end
- next
- 1) << 2;
319 Py_UNICODE digit
= buf
[next
];
321 case '0': case '1': case '2': case '3': case '4':
322 case '5': case '6': case '7': case '8': case '9':
323 c
|= (digit
- '0') << shl
; break;
324 case 'a': case 'b': case 'c': case 'd': case 'e':
326 c
|= (digit
- 'a' + 10) << shl
; break;
327 case 'A': case 'B': case 'C': case 'D': case 'E':
329 c
|= (digit
- 'A' + 10) << shl
; break;
331 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
335 #ifdef Py_UNICODE_WIDE
337 if (c
>= 0xd800 && c
<= 0xdbff) {
339 if (end
+ 6 >= len
) {
340 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
343 if (buf
[next
++] != '\\' || buf
[next
++] != 'u') {
344 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
348 /* Decode 4 hex digits */
349 for (; next
< end
; next
++) {
350 Py_ssize_t shl
= (end
- next
- 1) << 2;
351 Py_UNICODE digit
= buf
[next
];
353 case '0': case '1': case '2': case '3': case '4':
354 case '5': case '6': case '7': case '8': case '9':
355 c2
|= (digit
- '0') << shl
; break;
356 case 'a': case 'b': case 'c': case 'd': case 'e':
358 c2
|= (digit
- 'a' + 10) << shl
; break;
359 case 'A': case 'B': case 'C': case 'D': case 'E':
361 c2
|= (digit
- 'A' + 10) << shl
; break;
363 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
367 c
= 0x10000 + (((c
- 0xd800) << 10) | (c2
- 0xdc00));
371 chunk
= PyUnicode_FromUnicode(&c
, 1);
375 if (PyList_Append(chunks
, chunk
)) {
382 rval
= join_list_unicode(chunks
);
387 return Py_BuildValue("(Nn)", rval
, end
);
395 scanstring_unicode(PyObject
*pystr
, Py_ssize_t end
, int strict
)
398 Py_ssize_t len
= PyUnicode_GET_SIZE(pystr
);
399 Py_ssize_t begin
= end
- 1;
400 Py_ssize_t next
= begin
;
401 const Py_UNICODE
*buf
= PyUnicode_AS_UNICODE(pystr
);
402 PyObject
*chunks
= PyList_New(0);
403 if (chunks
== NULL
) {
406 if (end
< 0 || len
<= end
) {
407 PyErr_SetString(PyExc_ValueError
, "end is out of bounds");
411 /* Find the end of the string or the next escape */
413 PyObject
*chunk
= NULL
;
414 for (next
= end
; next
< len
; next
++) {
416 if (c
== '"' || c
== '\\') {
419 else if (strict
&& c
<= 0x1f) {
420 raise_errmsg("Invalid control character at", pystr
, next
);
424 if (!(c
== '"' || c
== '\\')) {
425 raise_errmsg("Unterminated string starting at", pystr
, begin
);
428 /* Pick up this chunk if it's not zero length */
430 chunk
= PyUnicode_FromUnicode(&buf
[end
], next
- end
);
434 if (PyList_Append(chunks
, chunk
)) {
446 raise_errmsg("Unterminated string starting at", pystr
, begin
);
451 /* Non-unicode backslash escapes */
457 case 'b': c
= '\b'; break;
458 case 'f': c
= '\f'; break;
459 case 'n': c
= '\n'; break;
460 case 'r': c
= '\r'; break;
461 case 't': c
= '\t'; break;
465 raise_errmsg("Invalid \\escape", pystr
, end
- 2);
474 raise_errmsg("Invalid \\uXXXX escape", pystr
, next
- 1);
477 /* Decode 4 hex digits */
478 for (; next
< end
; next
++) {
479 Py_ssize_t shl
= (end
- next
- 1) << 2;
480 Py_UNICODE digit
= buf
[next
];
482 case '0': case '1': case '2': case '3': case '4':
483 case '5': case '6': case '7': case '8': case '9':
484 c
|= (digit
- '0') << shl
; break;
485 case 'a': case 'b': case 'c': case 'd': case 'e':
487 c
|= (digit
- 'a' + 10) << shl
; break;
488 case 'A': case 'B': case 'C': case 'D': case 'E':
490 c
|= (digit
- 'A' + 10) << shl
; break;
492 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
496 #ifdef Py_UNICODE_WIDE
498 if (c
>= 0xd800 && c
<= 0xdbff) {
500 if (end
+ 6 >= len
) {
501 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
504 if (buf
[next
++] != '\\' || buf
[next
++] != 'u') {
505 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
509 /* Decode 4 hex digits */
510 for (; next
< end
; next
++) {
511 Py_ssize_t shl
= (end
- next
- 1) << 2;
512 Py_UNICODE digit
= buf
[next
];
514 case '0': case '1': case '2': case '3': case '4':
515 case '5': case '6': case '7': case '8': case '9':
516 c2
|= (digit
- '0') << shl
; break;
517 case 'a': case 'b': case 'c': case 'd': case 'e':
519 c2
|= (digit
- 'a' + 10) << shl
; break;
520 case 'A': case 'B': case 'C': case 'D': case 'E':
522 c2
|= (digit
- 'A' + 10) << shl
; break;
524 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
528 c
= 0x10000 + (((c
- 0xd800) << 10) | (c2
- 0xdc00));
532 chunk
= PyUnicode_FromUnicode(&c
, 1);
536 if (PyList_Append(chunks
, chunk
)) {
543 rval
= join_list_unicode(chunks
);
548 return Py_BuildValue("(Nn)", rval
, end
);
554 PyDoc_STRVAR(pydoc_scanstring
,
555 "scanstring(basestring, end, encoding) -> (str, end)\n");
558 py_scanstring(PyObject
* self
, PyObject
*args
)
562 char *encoding
= NULL
;
564 if (!PyArg_ParseTuple(args
, "On|zi:scanstring", &pystr
, &end
, &encoding
, &strict
)) {
567 if (encoding
== NULL
) {
568 encoding
= DEFAULT_ENCODING
;
570 if (PyString_Check(pystr
)) {
571 return scanstring_str(pystr
, end
, encoding
, strict
);
573 else if (PyUnicode_Check(pystr
)) {
574 return scanstring_unicode(pystr
, end
, strict
);
577 PyErr_Format(PyExc_TypeError
,
578 "first argument must be a string or unicode, not %.80s",
579 Py_TYPE(pystr
)->tp_name
);
584 PyDoc_STRVAR(pydoc_encode_basestring_ascii
,
585 "encode_basestring_ascii(basestring) -> str\n");
588 py_encode_basestring_ascii(PyObject
* self
, PyObject
*pystr
)
591 if (PyString_Check(pystr
)) {
592 return ascii_escape_str(pystr
);
594 else if (PyUnicode_Check(pystr
)) {
595 return ascii_escape_unicode(pystr
);
598 PyErr_Format(PyExc_TypeError
,
599 "first argument must be a string or unicode, not %.80s",
600 Py_TYPE(pystr
)->tp_name
);
605 static PyMethodDef json_methods
[] = {
606 {"encode_basestring_ascii", (PyCFunction
)py_encode_basestring_ascii
,
607 METH_O
, pydoc_encode_basestring_ascii
},
608 {"scanstring", (PyCFunction
)py_scanstring
, METH_VARARGS
,
610 {NULL
, NULL
, 0, NULL
}
613 PyDoc_STRVAR(module_doc
,
620 m
= Py_InitModule3("_json", json_methods
, module_doc
);