2 #if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN)
3 typedef int Py_ssize_t
;
4 #define PY_SSIZE_T_MAX INT_MAX
5 #define PY_SSIZE_T_MIN INT_MIN
9 #define UNUSED __attribute__((__unused__))
14 #define DEFAULT_ENCODING "utf-8"
17 ascii_escape_char(Py_UNICODE c
, char *output
, Py_ssize_t chars
);
19 ascii_escape_unicode(PyObject
*pystr
);
21 ascii_escape_str(PyObject
*pystr
);
23 py_encode_basestring_ascii(PyObject
* self UNUSED
, PyObject
*pystr
);
24 void init_speedups(void);
/*
 * S_CHAR(c) is true when c can be copied into the output verbatim:
 * printable ASCII (' ' through '~') other than backslash and double quote.
 *
 * Every use of the parameter is parenthesized so that an expression
 * argument (for example a conditional expression) cannot regroup under
 * operator precedence.  The argument is still evaluated up to four times,
 * so it must not have side effects.
 */
#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')
28 #define MIN_EXPANSION 6
29 #ifdef Py_UNICODE_WIDE
30 #define MAX_EXPANSION (2 * MIN_EXPANSION)
32 #define MAX_EXPANSION MIN_EXPANSION
36 ascii_escape_char(Py_UNICODE c
, char *output
, Py_ssize_t chars
)
39 output
[chars
++] = '\\';
41 case '\\': output
[chars
++] = (char)c
; break;
42 case '"': output
[chars
++] = (char)c
; break;
43 case '\b': output
[chars
++] = 'b'; break;
44 case '\f': output
[chars
++] = 'f'; break;
45 case '\n': output
[chars
++] = 'n'; break;
46 case '\r': output
[chars
++] = 'r'; break;
47 case '\t': output
[chars
++] = 't'; break;
49 #ifdef Py_UNICODE_WIDE
51 /* UTF-16 surrogate pair */
52 Py_UNICODE v
= c
- 0x10000;
53 c
= 0xd800 | ((v
>> 10) & 0x3ff);
54 output
[chars
++] = 'u';
55 x
= (c
& 0xf000) >> 12;
56 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
57 x
= (c
& 0x0f00) >> 8;
58 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
59 x
= (c
& 0x00f0) >> 4;
60 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
62 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
63 c
= 0xdc00 | (v
& 0x3ff);
64 output
[chars
++] = '\\';
67 output
[chars
++] = 'u';
68 x
= (c
& 0xf000) >> 12;
69 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
70 x
= (c
& 0x0f00) >> 8;
71 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
72 x
= (c
& 0x00f0) >> 4;
73 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
75 output
[chars
++] = (x
< 10) ? '0' + x
: 'a' + (x
- 10);
81 ascii_escape_unicode(PyObject
*pystr
)
84 Py_ssize_t input_chars
;
85 Py_ssize_t output_size
;
89 Py_UNICODE
*input_unicode
;
91 input_chars
= PyUnicode_GET_SIZE(pystr
);
92 input_unicode
= PyUnicode_AS_UNICODE(pystr
);
93 /* One char input can be up to 6 chars output, estimate 4 of these */
94 output_size
= 2 + (MIN_EXPANSION
* 4) + input_chars
;
95 rval
= PyString_FromStringAndSize(NULL
, output_size
);
99 output
= PyString_AS_STRING(rval
);
101 output
[chars
++] = '"';
102 for (i
= 0; i
< input_chars
; i
++) {
103 Py_UNICODE c
= input_unicode
[i
];
105 output
[chars
++] = (char)c
;
108 chars
= ascii_escape_char(c
, output
, chars
);
110 if (output_size
- chars
< (1 + MAX_EXPANSION
)) {
111 /* There's more than four, so let's resize by a lot */
113 /* This is an upper bound */
114 if (output_size
> 2 + (input_chars
* MAX_EXPANSION
)) {
115 output_size
= 2 + (input_chars
* MAX_EXPANSION
);
117 if (_PyString_Resize(&rval
, output_size
) == -1) {
120 output
= PyString_AS_STRING(rval
);
123 output
[chars
++] = '"';
124 if (_PyString_Resize(&rval
, chars
) == -1) {
131 ascii_escape_str(PyObject
*pystr
)
134 Py_ssize_t input_chars
;
135 Py_ssize_t output_size
;
141 input_chars
= PyString_GET_SIZE(pystr
);
142 input_str
= PyString_AS_STRING(pystr
);
143 /* One char input can be up to 6 chars output, estimate 4 of these */
144 output_size
= 2 + (MIN_EXPANSION
* 4) + input_chars
;
145 rval
= PyString_FromStringAndSize(NULL
, output_size
);
149 output
= PyString_AS_STRING(rval
);
151 output
[chars
++] = '"';
152 for (i
= 0; i
< input_chars
; i
++) {
153 Py_UNICODE c
= (Py_UNICODE
)input_str
[i
];
155 output
[chars
++] = (char)c
;
158 /* We hit a non-ASCII character, bail to unicode mode */
161 uni
= PyUnicode_DecodeUTF8(input_str
, input_chars
, "strict");
165 rval
= ascii_escape_unicode(uni
);
170 chars
= ascii_escape_char(c
, output
, chars
);
172 /* An ASCII char can't possibly expand to a surrogate! */
173 if (output_size
- chars
< (1 + MIN_EXPANSION
)) {
174 /* There's more than four, so let's resize by a lot */
176 if (output_size
> 2 + (input_chars
* MIN_EXPANSION
)) {
177 output_size
= 2 + (input_chars
* MIN_EXPANSION
);
179 if (_PyString_Resize(&rval
, output_size
) == -1) {
182 output
= PyString_AS_STRING(rval
);
185 output
[chars
++] = '"';
186 if (_PyString_Resize(&rval
, chars
) == -1) {
193 raise_errmsg(char *msg
, PyObject
*s
, Py_ssize_t end
)
195 static PyObject
*errmsg_fn
= NULL
;
197 if (errmsg_fn
== NULL
) {
198 PyObject
*decoder
= PyImport_ImportModule("simplejson.decoder");
199 if (decoder
== NULL
) return;
200 errmsg_fn
= PyObject_GetAttrString(decoder
, "errmsg");
201 if (errmsg_fn
== NULL
) return;
204 #if PY_VERSION_HEX < 0x02050000
205 pymsg
= PyObject_CallFunction(errmsg_fn
, "(zOi)", msg
, s
, end
);
207 pymsg
= PyObject_CallFunction(errmsg_fn
, "(zOn)", msg
, s
, end
);
209 PyErr_SetObject(PyExc_ValueError
, pymsg
);
213 def linecol(doc, pos):
214 lineno = doc.count('\n', 0, pos) + 1
218 colno = pos - doc.rindex('\n', 0, pos)
221 def errmsg(msg, doc, pos, end=None):
222 lineno, colno = linecol(doc, pos)
224 return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
225 endlineno, endcolno = linecol(doc, end)
226 return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
227 msg, lineno, colno, endlineno, endcolno, pos, end)
233 join_list_unicode(PyObject
*lst
)
235 static PyObject
*ustr
= NULL
;
236 static PyObject
*joinstr
= NULL
;
239 ustr
= PyUnicode_FromUnicode(&c
, 0);
241 if (joinstr
== NULL
) {
242 joinstr
= PyString_FromString("join");
244 if (joinstr
== NULL
|| ustr
== NULL
) {
247 return PyObject_CallMethodObjArgs(ustr
, joinstr
, lst
, NULL
);
251 scanstring_str(PyObject
*pystr
, Py_ssize_t end
, char *encoding
, int strict
)
254 Py_ssize_t len
= PyString_GET_SIZE(pystr
);
255 Py_ssize_t begin
= end
- 1;
256 Py_ssize_t next
= begin
;
257 char *buf
= PyString_AS_STRING(pystr
);
258 PyObject
*chunks
= PyList_New(0);
259 if (chunks
== NULL
) {
263 /* Find the end of the string or the next escape */
265 PyObject
*chunk
= NULL
;
266 for (next
= end
; next
< len
; next
++) {
268 if (c
== '"' || c
== '\\') {
271 else if (strict
&& c
<= 0x1f) {
272 raise_errmsg("Invalid control character at", pystr
, begin
);
276 if (!(c
== '"' || c
== '\\')) {
277 raise_errmsg("Unterminated string starting at", pystr
, begin
);
280 /* Pick up this chunk if it's not zero length */
282 PyObject
*strchunk
= PyBuffer_FromMemory(&buf
[end
], next
- end
);
283 if (strchunk
== NULL
) {
286 chunk
= PyUnicode_FromEncodedObject(strchunk
, encoding
, NULL
);
287 Py_XDECREF(strchunk
);
291 if (PyList_Append(chunks
, chunk
)) {
302 raise_errmsg("Unterminated string starting at", pystr
, begin
);
307 /* Non-unicode backslash escapes */
313 case 'b': c
= '\b'; break;
314 case 'f': c
= '\f'; break;
315 case 'n': c
= '\n'; break;
316 case 'r': c
= '\r'; break;
317 case 't': c
= '\t'; break;
321 raise_errmsg("Invalid \\escape", pystr
, end
- 2);
330 raise_errmsg("Invalid \\uXXXX escape", pystr
, next
- 1);
333 /* Decode 4 hex digits */
334 for (; next
< end
; next
++) {
335 Py_ssize_t shl
= (end
- next
- 1) << 2;
336 Py_UNICODE digit
= buf
[next
];
338 case '0': case '1': case '2': case '3': case '4':
339 case '5': case '6': case '7': case '8': case '9':
340 c
|= (digit
- '0') << shl
; break;
341 case 'a': case 'b': case 'c': case 'd': case 'e':
343 c
|= (digit
- 'a' + 10) << shl
; break;
344 case 'A': case 'B': case 'C': case 'D': case 'E':
346 c
|= (digit
- 'A' + 10) << shl
; break;
348 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
352 #ifdef Py_UNICODE_WIDE
354 if (c
>= 0xd800 && c
<= 0xdbff) {
356 if (end
+ 6 >= len
) {
357 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
360 if (buf
[next
++] != '\\' || buf
[next
++] != 'u') {
361 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
365 /* Decode 4 hex digits */
366 for (; next
< end
; next
++) {
367 Py_ssize_t shl
= (end
- next
- 1) << 2;
368 Py_UNICODE digit
= buf
[next
];
370 case '0': case '1': case '2': case '3': case '4':
371 case '5': case '6': case '7': case '8': case '9':
372 c2
|= (digit
- '0') << shl
; break;
373 case 'a': case 'b': case 'c': case 'd': case 'e':
375 c2
|= (digit
- 'a' + 10) << shl
; break;
376 case 'A': case 'B': case 'C': case 'D': case 'E':
378 c2
|= (digit
- 'A' + 10) << shl
; break;
380 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
384 c
= 0x10000 + (((c
- 0xd800) << 10) | (c2
- 0xdc00));
388 chunk
= PyUnicode_FromUnicode(&c
, 1);
392 if (PyList_Append(chunks
, chunk
)) {
398 rval
= join_list_unicode(chunks
);
404 #if PY_VERSION_HEX < 0x02050000
405 return Py_BuildValue("(Ni)", rval
, end
);
407 return Py_BuildValue("(Nn)", rval
, end
);
416 scanstring_unicode(PyObject
*pystr
, Py_ssize_t end
, int strict
)
419 Py_ssize_t len
= PyUnicode_GET_SIZE(pystr
);
420 Py_ssize_t begin
= end
- 1;
421 Py_ssize_t next
= begin
;
422 const Py_UNICODE
*buf
= PyUnicode_AS_UNICODE(pystr
);
423 PyObject
*chunks
= PyList_New(0);
424 if (chunks
== NULL
) {
428 /* Find the end of the string or the next escape */
430 PyObject
*chunk
= NULL
;
431 for (next
= end
; next
< len
; next
++) {
433 if (c
== '"' || c
== '\\') {
436 else if (strict
&& c
<= 0x1f) {
437 raise_errmsg("Invalid control character at", pystr
, begin
);
441 if (!(c
== '"' || c
== '\\')) {
442 raise_errmsg("Unterminated string starting at", pystr
, begin
);
445 /* Pick up this chunk if it's not zero length */
447 chunk
= PyUnicode_FromUnicode(&buf
[end
], next
- end
);
451 if (PyList_Append(chunks
, chunk
)) {
462 raise_errmsg("Unterminated string starting at", pystr
, begin
);
467 /* Non-unicode backslash escapes */
473 case 'b': c
= '\b'; break;
474 case 'f': c
= '\f'; break;
475 case 'n': c
= '\n'; break;
476 case 'r': c
= '\r'; break;
477 case 't': c
= '\t'; break;
481 raise_errmsg("Invalid \\escape", pystr
, end
- 2);
490 raise_errmsg("Invalid \\uXXXX escape", pystr
, next
- 1);
493 /* Decode 4 hex digits */
494 for (; next
< end
; next
++) {
495 Py_ssize_t shl
= (end
- next
- 1) << 2;
496 Py_UNICODE digit
= buf
[next
];
498 case '0': case '1': case '2': case '3': case '4':
499 case '5': case '6': case '7': case '8': case '9':
500 c
|= (digit
- '0') << shl
; break;
501 case 'a': case 'b': case 'c': case 'd': case 'e':
503 c
|= (digit
- 'a' + 10) << shl
; break;
504 case 'A': case 'B': case 'C': case 'D': case 'E':
506 c
|= (digit
- 'A' + 10) << shl
; break;
508 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
512 #ifdef Py_UNICODE_WIDE
514 if (c
>= 0xd800 && c
<= 0xdbff) {
516 if (end
+ 6 >= len
) {
517 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
520 if (buf
[next
++] != '\\' || buf
[next
++] != 'u') {
521 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr
,
525 /* Decode 4 hex digits */
526 for (; next
< end
; next
++) {
527 Py_ssize_t shl
= (end
- next
- 1) << 2;
528 Py_UNICODE digit
= buf
[next
];
530 case '0': case '1': case '2': case '3': case '4':
531 case '5': case '6': case '7': case '8': case '9':
532 c2
|= (digit
- '0') << shl
; break;
533 case 'a': case 'b': case 'c': case 'd': case 'e':
535 c2
|= (digit
- 'a' + 10) << shl
; break;
536 case 'A': case 'B': case 'C': case 'D': case 'E':
538 c2
|= (digit
- 'A' + 10) << shl
; break;
540 raise_errmsg("Invalid \\uXXXX escape", pystr
, end
- 5);
544 c
= 0x10000 + (((c
- 0xd800) << 10) | (c2
- 0xdc00));
548 chunk
= PyUnicode_FromUnicode(&c
, 1);
552 if (PyList_Append(chunks
, chunk
)) {
558 rval
= join_list_unicode(chunks
);
564 #if PY_VERSION_HEX < 0x02050000
565 return Py_BuildValue("(Ni)", rval
, end
);
567 return Py_BuildValue("(Nn)", rval
, end
);
574 PyDoc_STRVAR(pydoc_scanstring
,
575 "scanstring(basestring, end, encoding) -> (str, end)\n"
581 py_scanstring(PyObject
* self UNUSED
, PyObject
*args
)
585 char *encoding
= NULL
;
587 #if PY_VERSION_HEX < 0x02050000
588 if (!PyArg_ParseTuple(args
, "Oi|zi:scanstring", &pystr
, &end
, &encoding
, &strict
)) {
590 if (!PyArg_ParseTuple(args
, "On|zi:scanstring", &pystr
, &end
, &encoding
, &strict
)) {
594 if (encoding
== NULL
) {
595 encoding
= DEFAULT_ENCODING
;
597 if (PyString_Check(pystr
)) {
598 return scanstring_str(pystr
, end
, encoding
, strict
);
600 else if (PyUnicode_Check(pystr
)) {
601 return scanstring_unicode(pystr
, end
, strict
);
603 PyErr_SetString(PyExc_TypeError
, "first argument must be a string");
607 PyDoc_STRVAR(pydoc_encode_basestring_ascii
,
608 "encode_basestring_ascii(basestring) -> str\n"
614 py_encode_basestring_ascii(PyObject
* self UNUSED
, PyObject
*pystr
)
617 if (PyString_Check(pystr
)) {
618 return ascii_escape_str(pystr
);
620 else if (PyUnicode_Check(pystr
)) {
621 return ascii_escape_unicode(pystr
);
623 PyErr_SetString(PyExc_TypeError
, "first argument must be a string");
627 static PyMethodDef speedups_methods
[] = {
628 {"encode_basestring_ascii",
629 (PyCFunction
)py_encode_basestring_ascii
,
631 pydoc_encode_basestring_ascii
},
633 (PyCFunction
)py_scanstring
,
636 {NULL
, NULL
, 0, NULL
}
643 m
= Py_InitModule4("_speedups", speedups_methods
, NULL
, NULL
, PYTHON_API_VERSION
);