move sections
[python/dscho.git] / Lib / json / decoder.py
blob73236ed4de97ac562c09e85bc29f089af3c94ca6
1 """Implementation of JSONDecoder
2 """
3 import re
4 import sys
5 import struct
7 from json.scanner import make_scanner
8 try:
9 from _json import scanstring as c_scanstring
10 except ImportError:
11 c_scanstring = None
# Names exported by ``from <this module> import *``.
__all__ = ['JSONDecoder']

# Regular-expression flags shared by the patterns compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
17 def _floatconstants():
18 _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
19 if sys.byteorder != 'big':
20 _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
21 nan, inf = struct.unpack('dd', _BYTES)
22 return nan, inf, -inf
24 NaN, PosInf, NegInf = _floatconstants()
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of offset ``pos`` in ``doc``.

    Lines are delimited by ``'\\n'``.  The column is counted from the
    character after the closest preceding newline, so the first character
    of every line (including the first line) is column 1.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind() returns -1 when no newline precedes ``pos``; subtracting it
    # gives ``pos + 1`` on the first line, i.e. the same 1-based convention
    # that every later line already uses.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` with the location of an error inside ``doc``.

    ``pos`` is the character offset of the error.  When ``end`` is given
    the message describes the span from ``pos`` to ``end`` instead of a
    single position.  Line and column numbers come from ``linecol``.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
# Literal names outside the JSON spec mapped to the float each decodes to.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

# Group 1: a (possibly empty) run of ordinary characters.
# Group 2: the character that ended the run -- a double quote, a backslash
# or a control character in the range \x00-\x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Character following a backslash -> the character it stands for.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding assumed for ``str`` input when the caller does not supply one.
DEFAULT_ENCODING = "utf-8"
_HEXDIGITS = '0123456789abcdefABCDEF'


def _parse_hex4(s, start):
    """Return the value of the four hex digits at ``s[start:start + 4]``.

    Returns None when fewer than four characters are available or when any
    of them is not a hexadecimal digit.  Checking the characters explicitly
    (instead of relying on ``int(..., 16)``) rejects forms such as ``0x1f``,
    ``+1f `` or `` 1f `` that ``int`` would otherwise accept.
    """
    digits = s[start:start + 4]
    if len(digits) != 4:
        return None
    for digit in digits:
        if digit not in _HEXDIGITS:
            return None
    return int(digits, 16)


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to decode non-unicode chunks of ``s`` and defaults
    to DEFAULT_ENCODING.  ``_b`` and ``_m`` are bound at definition time to
    the escape table and the chunk matcher; callers are not expected to
    pass them.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence: s[end] is the 'u', digits follow it
            uni = _parse_hex4(s, end + 1)
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _parse_hex4(s, end + 7)
                # The second escape must be a low surrogate; combining the
                # high surrogate with anything else would silently produce
                # an unrelated code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
# Use speedup if available: c_scanstring is None when the _json import
# failed, in which case the pure-Python implementation is used.
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of space, tab, newline and carriage return.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same four characters as a plain string, used for ``in`` membership
# tests on a single character.
WHITESPACE_STR = ' \t\n\r'
def _finish_object(pairs, object_hook, object_pairs_hook):
    """Turn the collected ``(key, value)`` pairs into the decoded object.

    ``object_pairs_hook`` takes priority and receives the list of pairs;
    otherwise a dict is built and passed through ``object_hook`` if set.
    """
    if object_pairs_hook is not None:
        return object_pairs_hook(pairs)
    result = dict(pairs)
    if object_hook is not None:
        result = object_hook(result)
    return result


def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Decode a JSON object from ``s`` starting at index ``end``.

    ``s_and_end`` is a ``(s, end)`` tuple; ``end`` is where scanning of the
    object body begins (presumably just past the opening ``{`` -- the
    caller is not visible here).  ``encoding`` and ``strict`` are forwarded
    to ``scanstring`` for the keys, ``scan_once`` decodes each value, and
    the two hooks post-process the result (see ``_finish_object``).

    Returns ``(obj, end)`` where ``end`` is the index after the closing
    ``}``.  Raises ValueError on malformed input.  An empty object goes
    through the same hook handling as a non-empty one, so ``{}`` decodes to
    an empty dict (or to whatever the hooks return), never to a bare list.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            result = _finish_object(pairs, object_hook, object_pairs_hook)
            return result, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        end += 1

        # Skip whitespace before the value; running off the end of the
        # input is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # end was already advanced past the offending character
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    return _finish_object(pairs, object_hook, object_pairs_hook), end
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Decode a JSON array from ``s`` starting at index ``end``.

    ``s_and_end`` is a ``(s, end)`` tuple; ``end`` is where scanning of the
    array body begins (presumably just past the opening ``[`` -- the caller
    is not visible here).  ``scan_once`` decodes each element.

    Returns ``(values, end)`` where ``values`` is a list and ``end`` is the
    index after the closing ``]``.  Raises ValueError on malformed input;
    the reported position of a missing ``,`` is the offending character
    itself, matching the convention used by JSONObject.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past the offending character
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        # Skip whitespace before the next element; running off the end of
        # the input is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default the decoder maps JSON values to Python values as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are also accepted and decoded
    to the corresponding ``float`` values, although they are not part of
    the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect on
        ``unicode`` input.  Only encodings that are a superset of ASCII
        currently work; text in other encodings should be passed in as
        ``unicode``.

        ``object_hook``, if given, is called with the result of every
        decoded JSON object and its return value replaces the ``dict``.
        This allows custom deserialization (e.g. JSON-RPC class hinting).

        ``parse_float``, if given, is called with the string of every JSON
        float; the default is equivalent to ``float(num_str)``.  It can be
        used to plug in another type or parser (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int; the default is equivalent to ``int(num_str)``.  It can be used
        to plug in another type or parser (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        ``strict`` and ``object_pairs_hook`` are stored on the instance
        unchanged for the parsing helpers to use.

        """
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook

        # A missing (or otherwise false) parser falls back to the default.
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else int
        if parse_constant:
            self.parse_constant = parse_constant
        else:
            self.parse_constant = _CONSTANTS.__getitem__

        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring

        # Built last: make_scanner receives the fully configured instance.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Leading and trailing whitespace is skipped; anything else after the
        document raises ValueError.

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        stop = _w(s, stop).end()
        total = len(s)
        if stop == total:
            return obj
        raise ValueError(errmsg("Extra data", s, stop, total))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document starting at ``s[idx]`` (``s`` is a ``str``
        or ``unicode``) and return a 2-tuple of the Python representation
        and the index in ``s`` where the document ended.

        Unlike ``decode``, data following the document is not an error, so
        this can be used on a string with extraneous data at the end.
        Raises ValueError when no document can be decoded.

        """
        try:
            scanned = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        obj, end = scanned
        return obj, end
365 return obj, end