Lib/json/encoder.py

   1 """Implementation of JSONEncoder
   2 """
   3 import re
   4
   5 try:
   6     from _json import encode_basestring_ascii as c_encode_basestring_ascii
   7 except ImportError:
   8     c_encode_basestring_ascii = None
   9 try:
  10     from _json import make_encoder as c_make_encoder
  11 except ImportError:
  12     c_make_encoder = None
  13
  14 ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
  15 ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
  16 HAS_UTF8 = re.compile(r'[\x80-\xff]')
  17 ESCAPE_DCT = {
  18     '\\': '\\\\',
  19     '"': '\\"',
  20     '\b': '\\b',
  21     '\f': '\\f',
  22     '\n': '\\n',
  23     '\r': '\\r',
  24     '\t': '\\t',
  25 }
  26 for i in range(0x20):
  27     ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
  28     #ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
  29
  30 # Assume this produces an infinity on all machines (probably not guaranteed)
  31 INFINITY = float('1e66666')
  32 FLOAT_REPR = repr
  33
  34 def encode_basestring(s):
  35     """Return a JSON representation of a Python string
  36
  37     """
  38     def replace(match):
  39         return ESCAPE_DCT[match.group(0)]
  40     return '"' + ESCAPE.sub(replace, s) + '"'
  41
  42
  43 def py_encode_basestring_ascii(s):
  44     """Return an ASCII-only JSON representation of a Python string
  45
  46     """
  47     if isinstance(s, str) and HAS_UTF8.search(s) is not None:
  48         s = s.decode('utf-8')
  49     def replace(match):
  50         s = match.group(0)
  51         try:
  52             return ESCAPE_DCT[s]
  53         except KeyError:
  54             n = ord(s)
  55             if n < 0x10000:
  56                 return '\\u{0:04x}'.format(n)
  57                 #return '\\u%04x' % (n,)
  58             else:
  59                 # surrogate pair
  60                 n -= 0x10000
  61                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
  62                 s2 = 0xdc00 | (n & 0x3ff)
  63                 return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
  64                 #return '\\u%04x\\u%04x' % (s1, s2)
  65     return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
  66
  67
  68 encode_basestring_ascii = (
  69     c_encode_basestring_ascii or py_encode_basestring_ascii)
  70
  71 class JSONEncoder(object):
  72     """Extensible JSON <http://json.org> encoder for Python data structures.
  73
  74     Supports the following objects and types by default:
  75
  76     +-------------------+---------------+
  77     | Python            | JSON          |
  78     +===================+===============+
  79     | dict              | object        |
  80     +-------------------+---------------+
  81     | list, tuple       | array         |
  82     +-------------------+---------------+
  83     | str, unicode      | string        |
  84     +-------------------+---------------+
  85     | int, long, float  | number        |
  86     +-------------------+---------------+
  87     | True              | true          |
  88     +-------------------+---------------+
  89     | False             | false         |
  90     +-------------------+---------------+
  91     | None              | null          |
  92     +-------------------+---------------+
  93
  94     To extend this to recognize other objects, subclass and implement a
  95     ``.default()`` method with another method that returns a serializable
  96     object for ``o`` if possible, otherwise it should call the superclass
  97     implementation (to raise ``TypeError``).
  98
  99     """
 100     item_separator = ', '
 101     key_separator = ': '
 102     def __init__(self, skipkeys=False, ensure_ascii=True,
 103             check_circular=True, allow_nan=True, sort_keys=False,
 104             indent=None, separators=None, encoding='utf-8', default=None):
 105         """Constructor for JSONEncoder, with sensible defaults.
 106
 107         If skipkeys is false, then it is a TypeError to attempt
 108         encoding of keys that are not str, int, long, float or None.  If
 109         skipkeys is True, such items are simply skipped.
 110
 111         If ensure_ascii is true, the output is guaranteed to be str
 112         objects with all incoming unicode characters escaped.  If
 113         ensure_ascii is false, the output will be unicode object.
 114
 115         If check_circular is true, then lists, dicts, and custom encoded
 116         objects will be checked for circular references during encoding to
 117         prevent an infinite recursion (which would cause an OverflowError).
 118         Otherwise, no such check takes place.
 119
 120         If allow_nan is true, then NaN, Infinity, and -Infinity will be
 121         encoded as such.  This behavior is not JSON specification compliant,
 122         but is consistent with most JavaScript based encoders and decoders.
 123         Otherwise, it will be a ValueError to encode such floats.
 124
 125         If sort_keys is true, then the output of dictionaries will be
 126         sorted by key; this is useful for regression tests to ensure
 127         that JSON serializations can be compared on a day-to-day basis.
 128
 129         If indent is a non-negative integer, then JSON array
 130         elements and object members will be pretty-printed with that
 131         indent level.  An indent level of 0 will only insert newlines.
 132         None is the most compact representation.
 133
 134         If specified, separators should be a (item_separator, key_separator)
 135         tuple.  The default is (', ', ': ').  To get the most compact JSON
 136         representation you should specify (',', ':') to eliminate whitespace.
 137
 138         If specified, default is a function that gets called for objects
 139         that can't otherwise be serialized.  It should return a JSON encodable
 140         version of the object or raise a ``TypeError``.
 141
 142         If encoding is not None, then all input strings will be
 143         transformed into unicode using that encoding prior to JSON-encoding.
 144         The default is UTF-8.
 145
 146         """
 147
 148         self.skipkeys = skipkeys
 149         self.ensure_ascii = ensure_ascii
 150         self.check_circular = check_circular
 151         self.allow_nan = allow_nan
 152         self.sort_keys = sort_keys
 153         self.indent = indent
 154         if separators is not None:
 155             self.item_separator, self.key_separator = separators
 156         if default is not None:
 157             self.default = default
 158         self.encoding = encoding
 159
 160     def default(self, o):
 161         """Implement this method in a subclass such that it returns
 162         a serializable object for ``o``, or calls the base implementation
 163         (to raise a ``TypeError``).
 164
 165         For example, to support arbitrary iterators, you could
 166         implement default like this::
 167
 168             def default(self, o):
 169                 try:
 170                     iterable = iter(o)
 171                 except TypeError:
 172                     pass
 173                 else:
 174                     return list(iterable)
 175                 return JSONEncoder.default(self, o)
 176
 177         """
 178         raise TypeError(repr(o) + " is not JSON serializable")
 179
 180     def encode(self, o):
 181         """Return a JSON string representation of a Python data structure.
 182
 183         >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
 184         '{"foo": ["bar", "baz"]}'
 185
 186         """
 187         # This is for extremely simple cases and benchmarks.
 188         if isinstance(o, basestring):
 189             if isinstance(o, str):
 190                 _encoding = self.encoding
 191                 if (_encoding is not None
 192                         and not (_encoding == 'utf-8')):
 193                     o = o.decode(_encoding)
 194             if self.ensure_ascii:
 195                 return encode_basestring_ascii(o)
 196             else:
 197                 return encode_basestring(o)
 198         # This doesn't pass the iterator directly to ''.join() because the
 199         # exceptions aren't as detailed.  The list call should be roughly
 200         # equivalent to the PySequence_Fast that ''.join() would do.
 201         chunks = self.iterencode(o, _one_shot=True)
 202         if not isinstance(chunks, (list, tuple)):
 203             chunks = list(chunks)
 204         return ''.join(chunks)
 205
 206     def iterencode(self, o, _one_shot=False):
 207         """Encode the given object and yield each string
 208         representation as available.
 209
 210         For example::
 211
 212             for chunk in JSONEncoder().iterencode(bigobject):
 213                 mysocket.write(chunk)
 214
 215         """
 216         if self.check_circular:
 217             markers = {}
 218         else:
 219             markers = None
 220         if self.ensure_ascii:
 221             _encoder = encode_basestring_ascii
 222         else:
 223             _encoder = encode_basestring
 224         if self.encoding != 'utf-8':
 225             def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
 226                 if isinstance(o, str):
 227                     o = o.decode(_encoding)
 228                 return _orig_encoder(o)
 229
 230         def floatstr(o, allow_nan=self.allow_nan,
 231                 _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
 232             # Check for specials.  Note that this type of test is processor
 233             # and/or platform-specific, so do tests which don't depend on the
 234             # internals.
 235
 236             if o != o:
 237                 text = 'NaN'
 238             elif o == _inf:
 239                 text = 'Infinity'
 240             elif o == _neginf:
 241                 text = '-Infinity'
 242             else:
 243                 return _repr(o)
 244
 245             if not allow_nan:
 246                 raise ValueError(
 247                     "Out of range float values are not JSON compliant: " +
 248                     repr(o))
 249
 250             return text
 251
 252
 253         if (_one_shot and c_make_encoder is not None
 254                 and not self.indent and not self.sort_keys):
 255             _iterencode = c_make_encoder(
 256                 markers, self.default, _encoder, self.indent,
 257                 self.key_separator, self.item_separator, self.sort_keys,
 258                 self.skipkeys, self.allow_nan)
 259         else:
 260             _iterencode = _make_iterencode(
 261                 markers, self.default, _encoder, self.indent, floatstr,
 262                 self.key_separator, self.item_separator, self.sort_keys,
 263                 self.skipkeys, _one_shot)
 264         return _iterencode(o, 0)
 265
def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
        _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
        ## HACK: hand-optimized bytecode; turn globals into locals
        False=False,
        True=True,
        ValueError=ValueError,
        basestring=basestring,
        dict=dict,
        float=float,
        id=id,
        int=int,
        isinstance=isinstance,
        list=list,
        long=long,
        str=str,
        tuple=tuple,
    ):
    """Build and return the pure-Python ``_iterencode`` generator function.

    The returned function takes ``(o, _current_indent_level)`` and yields
    the JSON text for ``o`` as a sequence of string chunks.

    markers is a dict used to detect circular references (it maps the id()
    of each container or custom object currently being encoded to that
    object), or None to disable the check.
    _default is called for objects of types not handled here; its return
    value is encoded in place of the original object.
    _encoder turns a string into its JSON string literal.
    _indent is None for compact output, otherwise the number of spaces per
    nesting level.
    _floatstr turns a float into its JSON text.
    _key_separator and _item_separator are the strings emitted between a
    key and its value and between consecutive items.
    _sort_keys, when true, emits dict items sorted by key.
    _skipkeys, when true, silently drops items whose key type is not
    supported instead of raising TypeError.
    _one_shot is accepted but not read anywhere in this function.

    The remaining keyword parameters only bind builtins as locals (see the
    HACK note in the signature) and are not meant to be passed by callers.
    """

    def _iterencode_list(lst, _current_indent_level):
        """Yield the JSON array text for the list or tuple ``lst``."""
        if not lst:
            yield '[]'
            return
        if markers is not None:
            markerid = id(lst)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = lst
        # buf is the text that must precede the next item: the opening
        # bracket (plus indentation) before the first item, the separator
        # before every later one.
        buf = '['
        if _indent is not None:
            _current_indent_level += 1
            newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
            separator = _item_separator + newline_indent
            buf += newline_indent
        else:
            newline_indent = None
            separator = _item_separator
        first = True
        for value in lst:
            if first:
                first = False
            else:
                buf = separator
            # Scalars are yielded together with their prefix as one chunk.
            # True/False are tested by identity before the int check because
            # bool instances would also satisfy isinstance(value, (int, long)).
            if isinstance(value, basestring):
                yield buf + _encoder(value)
            elif value is None:
                yield buf + 'null'
            elif value is True:
                yield buf + 'true'
            elif value is False:
                yield buf + 'false'
            elif isinstance(value, (int, long)):
                yield buf + str(value)
            elif isinstance(value, float):
                yield buf + _floatstr(value)
            else:
                # Containers and other objects: emit the prefix on its own,
                # then delegate to the matching generator.
                yield buf
                if isinstance(value, (list, tuple)):
                    chunks = _iterencode_list(value, _current_indent_level)
                elif isinstance(value, dict):
                    chunks = _iterencode_dict(value, _current_indent_level)
                else:
                    chunks = _iterencode(value, _current_indent_level)
                for chunk in chunks:
                    yield chunk
        if newline_indent is not None:
            # Put the closing bracket on its own line at the outer level.
            _current_indent_level -= 1
            yield '\n' + (' ' * (_indent * _current_indent_level))
        yield ']'
        if markers is not None:
            # Reached only when the list was encoded without an exception.
            del markers[markerid]

    def _iterencode_dict(dct, _current_indent_level):
        """Yield the JSON object text for the dict ``dct``."""
        if not dct:
            yield '{}'
            return
        if markers is not None:
            markerid = id(dct)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = dct
        yield '{'
        if _indent is not None:
            _current_indent_level += 1
            newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
            item_separator = _item_separator + newline_indent
            yield newline_indent
        else:
            newline_indent = None
            item_separator = _item_separator
        first = True
        if _sort_keys:
            items = dct.items()
            items.sort(key=lambda kv: kv[0])
        else:
            items = dct.iteritems()
        for key, value in items:
            # Convert supported non-string keys to their string form; the
            # result is passed through _encoder below like any string key.
            if isinstance(key, basestring):
                pass
            # JavaScript is weakly typed for these, so it makes sense to
            # also allow them.  Many encoders seem to do something like this.
            elif isinstance(key, float):
                key = _floatstr(key)
            elif key is True:
                key = 'true'
            elif key is False:
                key = 'false'
            elif key is None:
                key = 'null'
            elif isinstance(key, (int, long)):
                key = str(key)
            elif _skipkeys:
                # Unsupported key type: drop the whole item.
                continue
            else:
                raise TypeError("key " + repr(key) + " is not a string")
            # The separator is only emitted for items that are actually
            # written, so skipped keys leave no stray separators behind.
            if first:
                first = False
            else:
                yield item_separator
            yield _encoder(key)
            yield _key_separator
            if isinstance(value, basestring):
                yield _encoder(value)
            elif value is None:
                yield 'null'
            elif value is True:
                yield 'true'
            elif value is False:
                yield 'false'
            elif isinstance(value, (int, long)):
                yield str(value)
            elif isinstance(value, float):
                yield _floatstr(value)
            else:
                if isinstance(value, (list, tuple)):
                    chunks = _iterencode_list(value, _current_indent_level)
                elif isinstance(value, dict):
                    chunks = _iterencode_dict(value, _current_indent_level)
                else:
                    chunks = _iterencode(value, _current_indent_level)
                for chunk in chunks:
                    yield chunk
        if newline_indent is not None:
            # Put the closing brace on its own line at the outer level.
            _current_indent_level -= 1
            yield '\n' + (' ' * (_indent * _current_indent_level))
        yield '}'
        if markers is not None:
            # Reached only when the dict was encoded without an exception.
            del markers[markerid]

    def _iterencode(o, _current_indent_level):
        """Yield the JSON text for an arbitrary object ``o``.

        Objects of unsupported types are converted with ``_default`` and the
        result is encoded recursively.
        """
        if isinstance(o, basestring):
            yield _encoder(o)
        elif o is None:
            yield 'null'
        elif o is True:
            yield 'true'
        elif o is False:
            yield 'false'
        elif isinstance(o, (int, long)):
            yield str(o)
        elif isinstance(o, float):
            yield _floatstr(o)
        elif isinstance(o, (list, tuple)):
            for chunk in _iterencode_list(o, _current_indent_level):
                yield chunk
        elif isinstance(o, dict):
            for chunk in _iterencode_dict(o, _current_indent_level):
                yield chunk
        else:
            # Track the original object so that a _default() which keeps
            # leading back to it is reported as a circular reference.
            if markers is not None:
                markerid = id(o)
                if markerid in markers:
                    raise ValueError("Circular reference detected")
                markers[markerid] = o
            o = _default(o)
            for chunk in _iterencode(o, _current_indent_level):
                yield chunk
            if markers is not None:
                del markers[markerid]

    return _iterencode