simplejson/encoder.py

   1 """Implementation of JSONEncoder
   2 """
   3 import re
   4
   5 try:
   6     from simplejson._speedups import encode_basestring_ascii as \
   7         c_encode_basestring_ascii
   8 except ImportError:
   9     c_encode_basestring_ascii = None
  10 try:
  11     from simplejson._speedups import make_encoder as c_make_encoder
  12 except ImportError:
  13     c_make_encoder = None
  14
  15 from simplejson.decoder import PosInf
  16
  17 ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
  18 ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
  19 HAS_UTF8 = re.compile(r'[\x80-\xff]')
  20 ESCAPE_DCT = {
  21     '\\': '\\\\',
  22     '"': '\\"',
  23     '\b': '\\b',
  24     '\f': '\\f',
  25     '\n': '\\n',
  26     '\r': '\\r',
  27     '\t': '\\t',
  28 }
  29 for i in range(0x20):
  30     #ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
  31     ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
  32
  33 FLOAT_REPR = repr
  34
  35 def encode_basestring(s):
  36     """Return a JSON representation of a Python string
  37
  38     """
  39     if isinstance(s, str) and HAS_UTF8.search(s) is not None:
  40         s = s.decode('utf-8')
  41     def replace(match):
  42         return ESCAPE_DCT[match.group(0)]
  43     return u'"' + ESCAPE.sub(replace, s) + u'"'
  44
  45
  46 def py_encode_basestring_ascii(s):
  47     """Return an ASCII-only JSON representation of a Python string
  48
  49     """
  50     if isinstance(s, str) and HAS_UTF8.search(s) is not None:
  51         s = s.decode('utf-8')
  52     def replace(match):
  53         s = match.group(0)
  54         try:
  55             return ESCAPE_DCT[s]
  56         except KeyError:
  57             n = ord(s)
  58             if n < 0x10000:
  59                 #return '\\u{0:04x}'.format(n)
  60                 return '\\u%04x' % (n,)
  61             else:
  62                 # surrogate pair
  63                 n -= 0x10000
  64                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
  65                 s2 = 0xdc00 | (n & 0x3ff)
  66                 #return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
  67                 return '\\u%04x\\u%04x' % (s1, s2)
  68     return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
  69
  70
  71 encode_basestring_ascii = (
  72     c_encode_basestring_ascii or py_encode_basestring_ascii)
  73
  74 class JSONEncoder(object):
  75     """Extensible JSON <http://json.org> encoder for Python data structures.
  76
  77     Supports the following objects and types by default:
  78
  79     +-------------------+---------------+
  80     | Python            | JSON          |
  81     +===================+===============+
  82     | dict              | object        |
  83     +-------------------+---------------+
  84     | list, tuple       | array         |
  85     +-------------------+---------------+
  86     | str, unicode      | string        |
  87     +-------------------+---------------+
  88     | int, long, float  | number        |
  89     +-------------------+---------------+
  90     | True              | true          |
  91     +-------------------+---------------+
  92     | False             | false         |
  93     +-------------------+---------------+
  94     | None              | null          |
  95     +-------------------+---------------+
  96
  97     To extend this to recognize other objects, subclass and implement a
  98     ``.default()`` method with another method that returns a serializable
  99     object for ``o`` if possible, otherwise it should call the superclass
 100     implementation (to raise ``TypeError``).
 101
 102     """
 103     item_separator = ', '
 104     key_separator = ': '
 105     def __init__(self, skipkeys=False, ensure_ascii=True,
 106             check_circular=True, allow_nan=True, sort_keys=False,
 107             indent=None, separators=None, encoding='utf-8', default=None):
 108         """Constructor for JSONEncoder, with sensible defaults.
 109
 110         If skipkeys is false, then it is a TypeError to attempt
 111         encoding of keys that are not str, int, long, float or None.  If
 112         skipkeys is True, such items are simply skipped.
 113
 114         If ensure_ascii is true, the output is guaranteed to be str
 115         objects with all incoming unicode characters escaped.  If
 116         ensure_ascii is false, the output will be unicode object.
 117
 118         If check_circular is true, then lists, dicts, and custom encoded
 119         objects will be checked for circular references during encoding to
 120         prevent an infinite recursion (which would cause an OverflowError).
 121         Otherwise, no such check takes place.
 122
 123         If allow_nan is true, then NaN, Infinity, and -Infinity will be
 124         encoded as such.  This behavior is not JSON specification compliant,
 125         but is consistent with most JavaScript based encoders and decoders.
 126         Otherwise, it will be a ValueError to encode such floats.
 127
 128         If sort_keys is true, then the output of dictionaries will be
 129         sorted by key; this is useful for regression tests to ensure
 130         that JSON serializations can be compared on a day-to-day basis.
 131
 132         If indent is a non-negative integer, then JSON array
 133         elements and object members will be pretty-printed with that
 134         indent level.  An indent level of 0 will only insert newlines.
 135         None is the most compact representation.
 136
 137         If specified, separators should be a (item_separator, key_separator)
 138         tuple.  The default is (', ', ': ').  To get the most compact JSON
 139         representation you should specify (',', ':') to eliminate whitespace.
 140
 141         If specified, default is a function that gets called for objects
 142         that can't otherwise be serialized.  It should return a JSON encodable
 143         version of the object or raise a ``TypeError``.
 144
 145         If encoding is not None, then all input strings will be
 146         transformed into unicode using that encoding prior to JSON-encoding.
 147         The default is UTF-8.
 148
 149         """
 150
 151         self.skipkeys = skipkeys
 152         self.ensure_ascii = ensure_ascii
 153         self.check_circular = check_circular
 154         self.allow_nan = allow_nan
 155         self.sort_keys = sort_keys
 156         self.indent = indent
 157         if separators is not None:
 158             self.item_separator, self.key_separator = separators
 159         if default is not None:
 160             self.default = default
 161         self.encoding = encoding
 162
 163     def default(self, o):
 164         """Implement this method in a subclass such that it returns
 165         a serializable object for ``o``, or calls the base implementation
 166         (to raise a ``TypeError``).
 167
 168         For example, to support arbitrary iterators, you could
 169         implement default like this::
 170
 171             def default(self, o):
 172                 try:
 173                     iterable = iter(o)
 174                 except TypeError:
 175                     pass
 176                 else:
 177                     return list(iterable)
 178                 return JSONEncoder.default(self, o)
 179
 180         """
 181         raise TypeError(repr(o) + " is not JSON serializable")
 182
 183     def encode(self, o):
 184         """Return a JSON string representation of a Python data structure.
 185
 186         >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
 187         '{"foo": ["bar", "baz"]}'
 188
 189         """
 190         # This is for extremely simple cases and benchmarks.
 191         if isinstance(o, basestring):
 192             if isinstance(o, str):
 193                 _encoding = self.encoding
 194                 if (_encoding is not None
 195                         and not (_encoding == 'utf-8')):
 196                     o = o.decode(_encoding)
 197             if self.ensure_ascii:
 198                 return encode_basestring_ascii(o)
 199             else:
 200                 return encode_basestring(o)
 201         # This doesn't pass the iterator directly to ''.join() because the
 202         # exceptions aren't as detailed.  The list call should be roughly
 203         # equivalent to the PySequence_Fast that ''.join() would do.
 204         chunks = self.iterencode(o, _one_shot=True)
 205         if not isinstance(chunks, (list, tuple)):
 206             chunks = list(chunks)
 207         if self.ensure_ascii:
 208             return ''.join(chunks)
 209         else:
 210             return u''.join(chunks)
 211
 212     def iterencode(self, o, _one_shot=False):
 213         """Encode the given object and yield each string
 214         representation as available.
 215
 216         For example::
 217
 218             for chunk in JSONEncoder().iterencode(bigobject):
 219                 mysocket.write(chunk)
 220
 221         """
 222         if self.check_circular:
 223             markers = {}
 224         else:
 225             markers = None
 226         if self.ensure_ascii:
 227             _encoder = encode_basestring_ascii
 228         else:
 229             _encoder = encode_basestring
 230         if self.encoding != 'utf-8':
 231             def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
 232                 if isinstance(o, str):
 233                     o = o.decode(_encoding)
 234                 return _orig_encoder(o)
 235
 236         def floatstr(o, allow_nan=self.allow_nan,
 237                 _repr=FLOAT_REPR, _inf=PosInf, _neginf=-PosInf):
 238             # Check for specials. Note that this type of test is processor
 239             # and/or platform-specific, so do tests which don't depend on
 240             # the internals.
 241
 242             if o != o:
 243                 text = 'NaN'
 244             elif o == _inf:
 245                 text = 'Infinity'
 246             elif o == _neginf:
 247                 text = '-Infinity'
 248             else:
 249                 return _repr(o)
 250
 251             if not allow_nan:
 252                 raise ValueError(
 253                     "Out of range float values are not JSON compliant: " +
 254                     repr(o))
 255
 256             return text
 257
 258
 259         if (_one_shot and c_make_encoder is not None
 260                 and not self.indent and not self.sort_keys):
 261             _iterencode = c_make_encoder(
 262                 markers, self.default, _encoder, self.indent,
 263                 self.key_separator, self.item_separator, self.sort_keys,
 264                 self.skipkeys, self.allow_nan)
 265         else:
 266             _iterencode = _make_iterencode(
 267                 markers, self.default, _encoder, self.indent, floatstr,
 268                 self.key_separator, self.item_separator, self.sort_keys,
 269                 self.skipkeys, _one_shot)
 270         return _iterencode(o, 0)
 271
 272 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
 273         _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
 274         ## HACK: hand-optimized bytecode; turn globals into locals
 275         False=False,
 276         True=True,
 277         ValueError=ValueError,
 278         basestring=basestring,
 279         dict=dict,
 280         float=float,
 281         id=id,
 282         int=int,
 283         isinstance=isinstance,
 284         list=list,
 285         long=long,
 286         str=str,
 287         tuple=tuple,
 288     ):
 289
 290     def _iterencode_list(lst, _current_indent_level):
 291         if not lst:
 292             yield '[]'
 293             return
 294         if markers is not None:
 295             markerid = id(lst)
 296             if markerid in markers:
 297                 raise ValueError("Circular reference detected")
 298             markers[markerid] = lst
 299         buf = '['
 300         if _indent is not None:
 301             _current_indent_level += 1
 302             newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
 303             separator = _item_separator + newline_indent
 304             buf += newline_indent
 305         else:
 306             newline_indent = None
 307             separator = _item_separator
 308         first = True
 309         for value in lst:
 310             if first:
 311                 first = False
 312             else:
 313                 buf = separator
 314             if isinstance(value, basestring):
 315                 yield buf + _encoder(value)
 316             elif value is None:
 317                 yield buf + 'null'
 318             elif value is True:
 319                 yield buf + 'true'
 320             elif value is False:
 321                 yield buf + 'false'
 322             elif isinstance(value, (int, long)):
 323                 yield buf + str(value)
 324             elif isinstance(value, float):
 325                 yield buf + _floatstr(value)
 326             else:
 327                 yield buf
 328                 if isinstance(value, (list, tuple)):
 329                     chunks = _iterencode_list(value, _current_indent_level)
 330                 elif isinstance(value, dict):
 331                     chunks = _iterencode_dict(value, _current_indent_level)
 332                 else:
 333                     chunks = _iterencode(value, _current_indent_level)
 334                 for chunk in chunks:
 335                     yield chunk
 336         if newline_indent is not None:
 337             _current_indent_level -= 1
 338             yield '\n' + (' ' * (_indent * _current_indent_level))
 339         yield ']'
 340         if markers is not None:
 341             del markers[markerid]
 342
 343     def _iterencode_dict(dct, _current_indent_level):
 344         if not dct:
 345             yield '{}'
 346             return
 347         if markers is not None:
 348             markerid = id(dct)
 349             if markerid in markers:
 350                 raise ValueError("Circular reference detected")
 351             markers[markerid] = dct
 352         yield '{'
 353         if _indent is not None:
 354             _current_indent_level += 1
 355             newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
 356             item_separator = _item_separator + newline_indent
 357             yield newline_indent
 358         else:
 359             newline_indent = None
 360             item_separator = _item_separator
 361         first = True
 362         if _sort_keys:
 363             items = dct.items()
 364             items.sort(key=lambda kv: kv[0])
 365         else:
 366             items = dct.iteritems()
 367         for key, value in items:
 368             if isinstance(key, basestring):
 369                 pass
 370             # JavaScript is weakly typed for these, so it makes sense to
 371             # also allow them.  Many encoders seem to do something like this.
 372             elif isinstance(key, float):
 373                 key = _floatstr(key)
 374             elif key is True:
 375                 key = 'true'
 376             elif key is False:
 377                 key = 'false'
 378             elif key is None:
 379                 key = 'null'
 380             elif isinstance(key, (int, long)):
 381                 key = str(key)
 382             elif _skipkeys:
 383                 continue
 384             else:
 385                 raise TypeError("key " + repr(key) + " is not a string")
 386             if first:
 387                 first = False
 388             else:
 389                 yield item_separator
 390             yield _encoder(key)
 391             yield _key_separator
 392             if isinstance(value, basestring):
 393                 yield _encoder(value)
 394             elif value is None:
 395                 yield 'null'
 396             elif value is True:
 397                 yield 'true'
 398             elif value is False:
 399                 yield 'false'
 400             elif isinstance(value, (int, long)):
 401                 yield str(value)
 402             elif isinstance(value, float):
 403                 yield _floatstr(value)
 404             else:
 405                 if isinstance(value, (list, tuple)):
 406                     chunks = _iterencode_list(value, _current_indent_level)
 407                 elif isinstance(value, dict):
 408                     chunks = _iterencode_dict(value, _current_indent_level)
 409                 else:
 410                     chunks = _iterencode(value, _current_indent_level)
 411                 for chunk in chunks:
 412                     yield chunk
 413         if newline_indent is not None:
 414             _current_indent_level -= 1
 415             yield '\n' + (' ' * (_indent * _current_indent_level))
 416         yield '}'
 417         if markers is not None:
 418             del markers[markerid]
 419
 420     def _iterencode(o, _current_indent_level):
 421         if isinstance(o, basestring):
 422             yield _encoder(o)
 423         elif o is None:
 424             yield 'null'
 425         elif o is True:
 426             yield 'true'
 427         elif o is False:
 428             yield 'false'
 429         elif isinstance(o, (int, long)):
 430             yield str(o)
 431         elif isinstance(o, float):
 432             yield _floatstr(o)
 433         elif isinstance(o, (list, tuple)):
 434             for chunk in _iterencode_list(o, _current_indent_level):
 435                 yield chunk
 436         elif isinstance(o, dict):
 437             for chunk in _iterencode_dict(o, _current_indent_level):
 438                 yield chunk
 439         else:
 440             if markers is not None:
 441                 markerid = id(o)
 442                 if markerid in markers:
 443                     raise ValueError("Circular reference detected")
 444                 markers[markerid] = o
 445             o = _default(o)
 446             for chunk in _iterencode(o, _current_indent_level):
 447                 yield chunk
 448             if markers is not None:
 449                 del markers[markerid]
 450
 451     return _iterencode