/* Encoding assumed for byte strings when the caller does not name one. */
#define DEFAULT_ENCODING "utf-8"

/* True when c is printable ASCII that may be copied into a JSON string
   literal unchanged: anything from ' ' to '~' except backslash and double
   quote.  The argument is parenthesised so that expression arguments expand
   correctly; it is still evaluated more than once, so it must not have side
   effects. */
#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')

/* Output bytes taken by one \uXXXX escape. */
#define MIN_EXPANSION 6

/* Worst-case output bytes for one input character.  On wide builds a
   character above the BMP is written as a surrogate pair, i.e. two
   \uXXXX escapes. */
#ifdef Py_UNICODE_WIDE
#define MAX_EXPANSION (2 * MIN_EXPANSION)
#else
#define MAX_EXPANSION MIN_EXPANSION
#endif
14 ascii_escape_char(Py_UNICODE c
, char *output
, Py_ssize_t chars
)
17 output
[chars
++] = '\\';
19 case '\\': output
[chars
++] = (char)c
; break;
20 case '"': output
[chars
++] = (char)c
; break;
21 case '\b': output
[chars
++] = 'b'; break;
22 case '\f': output
[chars
++] = 'f'; break;
23 case '\n': output
[chars
++] = 'n'; break;
24 case '\r': output
[chars
++] = 'r'; break;
25 case '\t': output
[chars
++] = 't'; break;
27 #ifdef Py_UNICODE_WIDE
29 /* UTF-16 surrogate pair */
30 Py_UNICODE v
= c
- 0x10000;
31 c
= 0xd800 | ((v
>> 10) & 0x3ff);
32 output
[chars
++] = 'u';
33 x
= (c
& 0xf000) >> 12;
34 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
35 x
= (c
& 0x0f00) >> 8;
36 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
37 x
= (c
& 0x00f0) >> 4;
38 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
40 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
41 c
= 0xdc00 | (v
& 0x3ff);
42 output
[chars
++] = '\\';
45 output
[chars
++] = 'u';
46 x
= (c
& 0xf000) >> 12;
47 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
48 x
= (c
& 0x0f00) >> 8;
49 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
50 x
= (c
& 0x00f0) >> 4;
51 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
53 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
59 ascii_escape_unicode(PyObject
*pystr
)
62 Py_ssize_t input_chars
;
63 Py_ssize_t output_size
;
67 Py_UNICODE
*input_unicode
;
69 input_chars
= PyUnicode_GET_SIZE(pystr
);
70 input_unicode
= PyUnicode_AS_UNICODE(pystr
);
71 /* One char input can be up to 6 chars output, estimate 4 of these */
72 output_size
= 2 + (MIN_EXPANSION
* 4) + input_chars
;
73 rval
= PyString_FromStringAndSize(NULL
, output_size
);
77 output
= PyString_AS_STRING(rval
);
79 output
[chars
++] = '"';
80 for (i
= 0; i
< input_chars
; i
++) {
81 Py_UNICODE c
= input_unicode
[i
];
83 output
[chars
++] = (char)c
;
86 chars
= ascii_escape_char(c
, output
, chars
);
88 if (output_size
- chars
< (1 + MAX_EXPANSION
)) {
89 /* There's more than four, so let's resize by a lot */
91 /* This is an upper bound */
92 if (output_size
> 2 + (input_chars
* MAX_EXPANSION
)) {
93 output_size
= 2 + (input_chars
* MAX_EXPANSION
);
95 if (_PyString_Resize(&rval
, output_size
) == -1) {
98 output
= PyString_AS_STRING(rval
);
101 output
[chars
++] = '"';
102 if (_PyString_Resize(&rval
, chars
) == -1) {
109 ascii_escape_str(PyObject
*pystr
)
112 Py_ssize_t input_chars
;
113 Py_ssize_t output_size
;
119 input_chars
= PyString_GET_SIZE(pystr
);
120 input_str
= PyString_AS_STRING(pystr
);
121 /* One char input can be up to 6 chars output, estimate 4 of these */
122 output_size
= 2 + (MIN_EXPANSION
* 4) + input_chars
;
123 rval
= PyString_FromStringAndSize(NULL
, output_size
);
127 output
= PyString_AS_STRING(rval
);
129 output
[chars
++] = '"';
130 for (i
= 0; i
< input_chars
; i
++) {
131 Py_UNICODE c
= (Py_UNICODE
)input_str
[i
];
133 output
[chars
++] = (char)c
;
136 /* We hit a non-ASCII character, bail to unicode mode */
139 uni
= PyUnicode_DecodeUTF8(input_str
, input_chars
, "strict");
143 rval
= ascii_escape_unicode(uni
);
148 chars
= ascii_escape_char(c
, output
, chars
);
150 /* An ASCII char can't possibly expand to a surrogate! */
151 if (output_size
- chars
< (1 + MIN_EXPANSION
)) {
152 /* There's more than four, so let's resize by a lot */
154 if (output_size
> 2 + (input_chars
* MIN_EXPANSION
)) {
155 output_size
= 2 + (input_chars
* MIN_EXPANSION
);
157 if (_PyString_Resize(&rval
, output_size
) == -1) {
160 output
= PyString_AS_STRING(rval
);
163 output
[chars
++] = '"';
164 if (_PyString_Resize(&rval
, chars
) == -1) {
171 raise_errmsg(char *msg
, PyObject
*s
, Py_ssize_t end
)
173 static PyObject
*errmsg_fn
= NULL
;
175 if (errmsg_fn
== NULL
) {
176 PyObject
*decoder
= PyImport_ImportModule("json.decoder");
179 errmsg_fn
= PyObject_GetAttrString(decoder
, "errmsg");
180 if (errmsg_fn
== NULL
)
184 pymsg
= PyObject_CallFunction(errmsg_fn
, "(zOn)", msg
, s
, end
);
185 PyErr_SetObject(PyExc_ValueError
, pymsg
);
189 def linecol(doc, pos):
190 lineno = doc.count('\n', 0, pos) + 1
194 colno = pos - doc.rindex('\n', 0, pos)
197 def errmsg(msg, doc, pos, end=None):
198 lineno, colno = linecol(doc, pos)
200 return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
201 endlineno, endcolno = linecol(doc, end)
202 return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
203 msg, lineno, colno, endlineno, endcolno, pos, end)
209 join_list_unicode(PyObject
*lst
)
211 static PyObject
*ustr
= NULL
;
212 static PyObject
*joinstr
= NULL
;
215 ustr
= PyUnicode_FromUnicode(&c
, 0);
217 if (joinstr
== NULL
) {
218 joinstr
= PyString_InternFromString("join");
220 if (joinstr
== NULL
|| ustr
== NULL
) {
223 return PyObject_CallMethodObjArgs(ustr
, joinstr
, lst
, NULL
);
227 scanstring_str(PyObject
*pystr
, Py_ssize_t end
, char *encoding
, int strict
)
230 Py_ssize_t len
= PyString_GET_SIZE(pystr
);
231 Py_ssize_t begin
= end
- 1;
232 Py_ssize_t next
= begin
;
233 char *buf
= PyString_AS_STRING(pystr
);
234 PyObject
*chunks
= PyList_New(0);
235 if (chunks
== NULL
) {
239 /* Find the end of the string or the next escape */
241 PyObject
*chunk
= NULL
;
242 for (next
= end
; next
< len
; next
++) {
244 if (c
== '"' || c
== '\\') {
247 else if (strict
&& c
<= 0x1f) {
248 raise_errmsg("Invalid control character at", pystr
, begin
);
252 if (!(c
== '"' || c
== '\\')) {
253 raise_errmsg("Unterminated string starting at", pystr
, begin
);
256 /* Pick up this chunk if it's not zero length */
258 PyObject
*strchunk
= PyBuffer_FromMemory(&buf
[end
], next
- end
);
259 if (strchunk
== NULL
) {
262 chunk
= PyUnicode_FromEncodedObject(strchunk
, encoding
, NULL
);
267 if (PyList_Append(chunks
, chunk
)) {
278 raise_errmsg("Unterminated string starting at", pystr
, begin
);
283 /* Non-unicode backslash escapes */
289 case 'b': c
= '\b'; break;
290 case 'f': c
= '\f'; break;
291 case 'n': c
= '\n'; break;
292 case 'r': c
= '\r'; break;
293 case 't': c
= '\t'; break;
297 raise_errmsg("Invalid \\escape", pystr
, end
- 2);
306 raise_errmsg("Invalid \\uXXXX escape", pystr
, next
- 1);
309 /* Decode 4 hex digits */
310 for (; next
< end
; next
++) {
311 Py_ssize_t shl
= (end
- next
- 1) << 2;
312 Py_UNICODE digit
= buf
[next
];
314 case '0': case '1': case '2': case '3': case '4':
315 case '5': case '6': case '7': case '8': case '9':
316 c
|= (digit
- '0') << shl
; break;
317 case 'a': case 'b': case 'c': case 'd': case 'e':
319 c
|= (digit
- 'a' + 10) << shl
; break;
320 case 'A': case 'B': case 'C': case 'D': case 'E':
322 c
|= (digit
- 'A' + 10) << shl
; break;
324 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
328 #ifdef Py_UNICODE_WIDE
330 if (c
>= 0xd800 && c
<= 0xdbff) {
332 if (end
+ 6 >= len
) {
333 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
336 if (buf
[next
++] != '\\' || buf
[next
++] != 'u') {
337 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
341 /* Decode 4 hex digits */
342 for (; next
< end
; next
++) {
343 Py_ssize_t shl
= (end
- next
- 1) << 2;
344 Py_UNICODE digit
= buf
[next
];
346 case '0': case '1': case '2': case '3': case '4':
347 case '5': case '6': case '7': case '8': case '9':
348 c2
|= (digit
- '0') << shl
; break;
349 case 'a': case 'b': case 'c': case 'd': case 'e':
351 c2
|= (digit
- 'a' + 10) << shl
; break;
352 case 'A': case 'B': case 'C': case 'D': case 'E':
354 c2
|= (digit
- 'A' + 10) << shl
; break;
356 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
360 c
= 0x10000 + (((c
- 0xd800) << 10) | (c2
- 0xdc00));
364 chunk
= PyUnicode_FromUnicode(&c
, 1);
368 if (PyList_Append(chunks
, chunk
)) {
374 rval
= join_list_unicode(chunks
);
380 return Py_BuildValue("(Nn)", rval
, end
);
388 scanstring_unicode(PyObject
*pystr
, Py_ssize_t end
, int strict
)
391 Py_ssize_t len
= PyUnicode_GET_SIZE(pystr
);
392 Py_ssize_t begin
= end
- 1;
393 Py_ssize_t next
= begin
;
394 const Py_UNICODE
*buf
= PyUnicode_AS_UNICODE(pystr
);
395 PyObject
*chunks
= PyList_New(0);
396 if (chunks
== NULL
) {
400 /* Find the end of the string or the next escape */
402 PyObject
*chunk
= NULL
;
403 for (next
= end
; next
< len
; next
++) {
405 if (c
== '"' || c
== '\\') {
408 else if (strict
&& c
<= 0x1f) {
409 raise_errmsg("Invalid control character at", pystr
, begin
);
413 if (!(c
== '"' || c
== '\\')) {
414 raise_errmsg("Unterminated string starting at", pystr
, begin
);
417 /* Pick up this chunk if it's not zero length */
419 chunk
= PyUnicode_FromUnicode(&buf
[end
], next
- end
);
423 if (PyList_Append(chunks
, chunk
)) {
434 raise_errmsg("Unterminated string starting at", pystr
, begin
);
439 /* Non-unicode backslash escapes */
445 case 'b': c
= '\b'; break;
446 case 'f': c
= '\f'; break;
447 case 'n': c
= '\n'; break;
448 case 'r': c
= '\r'; break;
449 case 't': c
= '\t'; break;
453 raise_errmsg("Invalid \\escape", pystr
, end
- 2);
462 raise_errmsg("Invalid \\uXXXX escape", pystr
, next
- 1);
465 /* Decode 4 hex digits */
466 for (; next
< end
; next
++) {
467 Py_ssize_t shl
= (end
- next
- 1) << 2;
468 Py_UNICODE digit
= buf
[next
];
470 case '0': case '1': case '2': case '3': case '4':
471 case '5': case '6': case '7': case '8': case '9':
472 c
|= (digit
- '0') << shl
; break;
473 case 'a': case 'b': case 'c': case 'd': case 'e':
475 c
|= (digit
- 'a' + 10) << shl
; break;
476 case 'A': case 'B': case 'C': case 'D': case 'E':
478 c
|= (digit
- 'A' + 10) << shl
; break;
480 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
484 #ifdef Py_UNICODE_WIDE
486 if (c
>= 0xd800 && c
<= 0xdbff) {
488 if (end
+ 6 >= len
) {
489 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
492 if (buf
[next
++] != '\\' || buf
[next
++] != 'u') {
493 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
497 /* Decode 4 hex digits */
498 for (; next
< end
; next
++) {
499 Py_ssize_t shl
= (end
- next
- 1) << 2;
500 Py_UNICODE digit
= buf
[next
];
502 case '0': case '1': case '2': case '3': case '4':
503 case '5': case '6': case '7': case '8': case '9':
504 c2
|= (digit
- '0') << shl
; break;
505 case 'a': case 'b': case 'c': case 'd': case 'e':
507 c2
|= (digit
- 'a' + 10) << shl
; break;
508 case 'A': case 'B': case 'C': case 'D': case 'E':
510 c2
|= (digit
- 'A' + 10) << shl
; break;
512 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
516 c
= 0x10000 + (((c
- 0xd800) << 10) | (c2
- 0xdc00));
520 chunk
= PyUnicode_FromUnicode(&c
, 1);
524 if (PyList_Append(chunks
, chunk
)) {
530 rval
= join_list_unicode(chunks
);
536 return Py_BuildValue("(Nn)", rval
, end
);
542 PyDoc_STRVAR(pydoc_scanstring
,
543 "scanstring(basestring, end, encoding) -> (str, end)\n");
546 py_scanstring(PyObject
* self
, PyObject
*args
)
550 char *encoding
= NULL
;
552 if (!PyArg_ParseTuple(args
, "On|zi:scanstring", &pystr
, &end
, &encoding
, &strict
)) {
555 if (encoding
== NULL
) {
556 encoding
= DEFAULT_ENCODING
;
558 if (PyString_Check(pystr
)) {
559 return scanstring_str(pystr
, end
, encoding
, strict
);
561 else if (PyUnicode_Check(pystr
)) {
562 return scanstring_unicode(pystr
, end
, strict
);
565 PyErr_Format(PyExc_TypeError
,
566 "first argument must be a string or unicode, not %.80s",
567 Py_TYPE(pystr
)->tp_name
);
572 PyDoc_STRVAR(pydoc_encode_basestring_ascii
,
573 "encode_basestring_ascii(basestring) -> str\n");
576 py_encode_basestring_ascii(PyObject
* self
, PyObject
*pystr
)
579 if (PyString_Check(pystr
)) {
580 return ascii_escape_str(pystr
);
582 else if (PyUnicode_Check(pystr
)) {
583 return ascii_escape_unicode(pystr
);
586 PyErr_Format(PyExc_TypeError
,
587 "first argument must be a string or unicode, not %.80s",
588 Py_TYPE(pystr
)->tp_name
);
593 static PyMethodDef json_methods
[] = {
594 {"encode_basestring_ascii", (PyCFunction
)py_encode_basestring_ascii
,
595 METH_O
, pydoc_encode_basestring_ascii
},
596 {"scanstring", (PyCFunction
)py_scanstring
, METH_VARARGS
,
598 {NULL
, NULL
, 0, NULL
}
601 PyDoc_STRVAR(module_doc
,
608 m
= Py_InitModule3("_json", json_methods
, module_doc
);