Issue #7632: Fix a serious wrong output bug for string -> float conversion.
[python.git] / Lib / encodings / utf_8_sig.py
blob697ba95b4d3864d993b565968efad546b3bd1bf3
1 """ Python 'utf-8-sig' Codec
2 This works similarly to UTF-8, with the following changes:
4 * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
5 first three bytes.
7 * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
8 bytes will be skipped.
9 """
10 import codecs
12 ### Codec APIs
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 and prepend the UTF-8 encoded BOM.

    Returns a ``(data, length)`` tuple where *data* is ``codecs.BOM_UTF8``
    followed by the UTF-8 encoding of *input*, and *length* is
    ``len(input)``.  *errors* is passed through to
    ``codecs.utf_8_encode``.
    """
    signature = codecs.BOM_UTF8
    payload = codecs.utf_8_encode(input, errors)[0]
    return (signature + payload, len(input))
def decode(input, errors='strict'):
    """Decode UTF-8 *input*, skipping a leading UTF-8 encoded BOM if present.

    Returns a ``(text, consumed)`` tuple.  When the first three bytes equal
    ``codecs.BOM_UTF8`` they are dropped from the decoded text but still
    counted in *consumed*.  The input is decoded as complete data (the
    ``final`` argument of ``codecs.utf_8_decode`` is ``True``).
    """
    # Number of leading bytes to skip: the 3-byte BOM, or nothing.
    skipped = 3 if input[:3] == codecs.BOM_UTF8 else 0
    text, used = codecs.utf_8_decode(input[skipped:], errors, True)
    return (text, used + skipped)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental 'utf-8-sig' encoder.

    The first chunk produced after construction or ``reset()`` is prefixed
    with ``codecs.BOM_UTF8``; later chunks are plain UTF-8.  ``self.first``
    records whether the BOM is still pending.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = True

    def encode(self, input, final=False):
        """Return the UTF-8 encoding of *input*, BOM-prefixed on first use.

        *final* is accepted for interface compatibility and is not used.
        """
        if self.first:
            # The flag is cleared before encoding, so the BOM is only ever
            # attempted once per reset.
            self.first = False
            return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
        return codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = True

    def getstate(self):
        """Return the encoder state as an integer.

        1 means the BOM has not been written yet, 0 means it has.  Without
        this override the pending-BOM flag would not be part of the state.
        """
        return 1 if self.first else 0

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        self.first = bool(state)
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental 'utf-8-sig' decoder.

    Skips a UTF-8 encoded BOM at the very start of the data and decodes
    everything else as UTF-8.  ``self.first`` is true while the start of
    the data has not been examined yet and ``None`` afterwards.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input* and return ``(text, consumed)``.

        While fewer than three bytes are available and they could still be
        the beginning of a BOM, nothing is consumed so the decision is
        postponed to the next call.
        """
        if self.first:
            if len(input) < 3:
                # Postpone only if more data can still arrive.  When this is
                # the final call, a non-empty partial BOM can never be
                # completed, so it is handed to the UTF-8 decoder, which
                # applies the normal error handling for truncated input
                # instead of silently dropping the bytes.
                if codecs.BOM_UTF8.startswith(input) and not (final and input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                self.first = None
            else:
                self.first = None
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    # The skipped BOM bytes count as consumed.
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_input, flag)``.

        *flag* is 1 while the start of the data has not been examined yet
        and 0 afterwards, so the BOM-pending status is part of the state.
        """
        buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
        return (buffered, 1 if self.first else 0)

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = True if state[1] else None
class StreamWriter(codecs.StreamWriter):
    """Stream writer for 'utf-8-sig'.

    The class-level ``encode`` emits the BOM and then shadows itself on the
    instance with ``codecs.utf_8_encode``, so only the first call after
    construction or ``reset()`` writes a BOM.
    """

    def reset(self):
        """Reset the writer and drop the instance-level ``encode`` override."""
        codecs.StreamWriter.reset(self)
        # Only an instance attribute can be removed; if encode() has not been
        # called since the last reset there is nothing to delete.
        if "encode" in vars(self):
            del self.encode

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM (first call only)."""
        # Later calls on this instance go straight to plain UTF-8.
        self.encode = codecs.utf_8_encode
        result = encode(input, errors)
        return result
class StreamReader(codecs.StreamReader):
    """Stream reader for 'utf-8-sig'.

    The class-level ``decode`` decides whether the data starts with a BOM.
    Once decided, it shadows itself on the instance with
    ``codecs.utf_8_decode`` so later calls decode plain UTF-8.
    """

    def reset(self):
        """Reset the reader and drop the instance-level ``decode`` override."""
        codecs.StreamReader.reset(self)
        # Only an instance attribute can be removed; if no decision was made
        # since the last reset there is nothing to delete.
        if "decode" in vars(self):
            del self.decode

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM; return ``(text, consumed)``."""
        bom = codecs.BOM_UTF8
        if len(input) >= 3:
            if input[:3] == bom:
                self.decode = codecs.utf_8_decode
                text, used = codecs.utf_8_decode(input[3:], errors)
                # The skipped BOM bytes count as consumed.
                return (text, used + 3)
        elif bom.startswith(input):
            # Too short to tell whether this is a BOM: consume nothing and
            # try again on the next call.
            return (u"", 0)
        # No BOM present.
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)
100 ### encodings module API
102 def getregentry():
103 return codecs.CodecInfo(
104 name='utf-8-sig',
105 encode=encode,
106 decode=decode,
107 incrementalencoder=IncrementalEncoder,
108 incrementaldecoder=IncrementalDecoder,
109 streamreader=StreamReader,
110 streamwriter=StreamWriter,