Scanner is complete.
[pyyaml/python3.git] / lib / yaml / reader.py
blob73838ee8afedc5aa288283bf0ae83dcc7d51fa93
# This module contains abstractions for the input stream. You don't have to
# look further; there is no pretty code here.
#
# We define two classes here.
#
#   Marker(name, line, column, buffer, pointer)
# It's just a record and its only use is producing nice error messages.
# Parser does not use it for any other purposes.
#
#   Reader(data)
# Reader determines the encoding of `data` and converts it to unicode.
# Reader provides the following methods and attributes:
#   reader.peek(index=0) - return the character `index` positions ahead.
#   reader.prefix(length=1) - return the next `length` characters.
#   reader.forward(length=1) - move the current position forward by `length` characters.
#   reader.index - the number of the current character.
#   reader.line, reader.column - the line and the column of the current character.
18 __all__ = ['Marker', 'Reader', 'ReaderError']
20 from error import YAMLError
22 import codecs, re
# Unfortunately, the codec functions in Python 2.3 do not support the `finish`
# argument, so we have to write our own wrappers.

try:
    # Probe for the three-argument form; when it is accepted the codec
    # functions can be used directly.
    codecs.utf_8_decode('', 'strict', False)
    from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode

except TypeError:

    def utf_16_le_decode(data, errors, finish=False):
        """Decode UTF-16-LE `data`, returning `(text, bytes_consumed)`.

        Unless `finish` is true, a dangling odd byte at the end is left
        undecoded so that it can be completed by a later chunk.
        """
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_le_decode(data, errors)

    def utf_16_be_decode(data, errors, finish=False):
        """Decode UTF-16-BE `data`, returning `(text, bytes_consumed)`.

        Unless `finish` is true, a dangling odd byte at the end is left
        undecoded so that it can be completed by a later chunk.
        """
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_be_decode(data, errors)

    def utf_8_decode(data, errors, finish=False):
        """Decode UTF-8 `data`, returning `(text, bytes_consumed)`.

        Unless `finish` is true, the last multi-byte sequence in `data`
        (a lead byte plus the continuation bytes that follow it up to the
        end) is left undecoded, because it may be incomplete.
        """
        if not finish:
            # We are trying to remove a possible incomplete multibyte character
            # from the suffix of the data.
            # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
            # All further bytes are in the range 0x80 to 0xbf.
            # UTF-8 encoded UCS characters may be up to six bytes long.
            size = len(data)
            count = 0
            # Count the continuation bytes at the very end of the data.
            # One-byte slices are compared by numeric value so the test does
            # not depend on what indexing a byte string returns.
            while count < 5 and count < size \
                    and 0x80 <= ord(data[size-count-1:size-count]) <= 0xBF:
                # Step towards the front of the data (this used to step the
                # wrong way and never reached the lead byte).
                count += 1
            # If a lead byte precedes them, cut the sequence off.
            if count < 5 and count < size \
                    and 0xC0 <= ord(data[size-count-1:size-count]) <= 0xFD:
                data = data[:size-count-1]
        return codecs.utf_8_decode(data, errors)
class Marker:
    # A Marker remembers where in the input something happened: the source
    # name, the line and column, and optionally the decoded buffer together
    # with an offset into it, from which an excerpt can be rendered.

    def __init__(self, name, line, column, buffer, pointer):
        """Store the position details; `buffer` and `pointer` may be None."""
        self.name, self.line, self.column = name, line, column
        self.buffer, self.pointer = buffer, pointer

    def get_snippet(self, indent=4, max_length=75):
        """Return the marked line with a caret under the marked character.

        The excerpt is limited to the line containing `self.pointer` and is
        shortened with ' ... ' on either side when it extends too far from
        the pointer. The result is two lines, both shifted right by `indent`
        spaces. Returns None when no buffer was recorded.
        """
        text = self.buffer
        if text is None:
            return None
        breaks = u'\0\r\n\x85\u2028\u2029'
        anchor = self.pointer
        half = max_length/2-1

        # Walk left to the start of the line, giving up once too far away.
        lead = ''
        first = anchor
        while first > 0:
            if text[first-1] in breaks:
                break
            first -= 1
            if anchor-first > half:
                lead = ' ... '
                first += 5
                break

        # Walk right to the end of the line under the same limit.
        trail = ''
        last = anchor
        size = len(text)
        while last < size:
            if text[last] in breaks:
                break
            last += 1
            if last-anchor > half:
                trail = ' ... '
                last -= 5
                break

        excerpt = text[first:last].encode('utf-8')
        top = ' '*indent + lead + excerpt + trail
        caret = ' '*(indent+anchor-first+len(lead)) + '^'
        return top + '\n' + caret

    def __str__(self):
        """Describe the position (one-based line and column) plus the excerpt."""
        excerpt = self.get_snippet()
        text = " in \"%s\", line %d, column %d" \
                % (self.name, self.line+1, self.column+1)
        if excerpt is None:
            return text
        return text + ":\n" + excerpt
class ReaderError(YAMLError):
    # Describes input that could not be decoded, or a decoded character that
    # is not acceptable, together with where in the input it was found.

    def __init__(self, name, position, character, encoding, reason):
        """Record the source name, offset, offending character and cause."""
        self.name = name
        self.position = position
        self.character = character
        self.encoding = encoding
        self.reason = reason

    def __str__(self):
        """Format the message; the wording depends on the character's type."""
        where = (self.name, self.position)
        if not isinstance(self.character, str):
            # Not a `str`: report the character by its four-digit code.
            template = "unacceptable character #x%04x: %s\n" \
                    " in \"%s\", position %d"
            return template % ((ord(self.character), self.reason) + where)
        # A `str` character: report the byte the codec could not decode.
        template = "'%s' codec can't decode byte #x%02x: %s\n" \
                " in \"%s\", position %d"
        return template % ((self.encoding, ord(self.character), self.reason)
                + where)
class Reader:
    # Reader:
    # - determines the data encoding and converts it to unicode,
    # - checks if characters are in allowed range,
    # - adds '\0' to the end.

    # Reader accepts
    #  - a `str` object,
    #  - a `unicode` object,
    #  - a file-like object with its `read` method returning `str`,
    #  - a file-like object with its `read` method returning `unicode`.

    # Yeah, it's ugly and slow.

    def __init__(self, data):
        """Set up the reader for `data` (a string or a file-like object)."""
        self.name = None
        self.stream = None          # file-like source, if any
        self.stream_pointer = 0     # amount of data read from the stream
        self.eof = True             # no more raw data can be fetched
        self.buffer = u''           # decoded characters
        self.pointer = 0            # offset of the current character in buffer
        self.raw_buffer = None      # data not yet decoded; None when done
        self.raw_decode = None      # decoder for raw_buffer, if one is needed
        self.index = 0              # number of the current character
        self.line = 0
        self.column = 0
        if isinstance(data, unicode):
            # Already decoded: validate it and terminate it with '\0'.
            self.name = "<unicode string>"
            self.check_printable(data)
            self.buffer = data+u'\0'
        elif isinstance(data, str):
            self.name = "<string>"
            self.raw_buffer = data
            self.determine_encoding()
        else:
            self.stream = data
            self.name = getattr(data, 'name', "<file>")
            self.eof = False
            self.raw_buffer = ''
            self.determine_encoding()

    def peek(self, index=0):
        """Return the character `index` positions after the current one."""
        if self.pointer+index+1 >= len(self.buffer):
            self.update(index+1)
        return self.buffer[self.pointer+index]

    def prefix(self, length=1):
        """Return the next `length` characters without consuming them."""
        if self.pointer+length >= len(self.buffer):
            self.update(length)
        return self.buffer[self.pointer:self.pointer+length]

    def forward(self, length=1):
        """Consume `length` characters, keeping index, line and column current."""
        if self.pointer+length+1 >= len(self.buffer):
            self.update(length+1)
        for _ in range(length):
            ch = self.buffer[self.pointer]
            self.pointer += 1
            self.index += 1
            # `self.pointer` already addresses the character after `ch`, so
            # that is where the '\n' of a '\r\n' pair has to be looked for.
            # (Looking one position further counted '\r\n' as two breaks.)
            if ch in u'\n\x85\u2028\u2029' \
                    or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
                self.line += 1
                self.column = 0
            elif ch != u'\uFEFF':
                # A byte order mark does not take up a column.
                self.column += 1

    def get_marker(self):
        """Return a Marker for the current position.

        The buffer is attached only when the input is not a stream.
        """
        if self.stream is None:
            return Marker(self.name, self.line, self.column,
                    self.buffer, self.pointer)
        else:
            return Marker(self.name, self.line, self.column, None, None)

    def determine_encoding(self):
        """Choose a decoder from the byte order mark and decode the first data."""
        # Two bytes are enough to recognise a UTF-16 byte order mark.
        while not self.eof and len(self.raw_buffer) < 2:
            self.update_raw()
        if not isinstance(self.raw_buffer, unicode):
            if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
                self.raw_decode = utf_16_le_decode
            elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
                self.raw_decode = utf_16_be_decode
            else:
                self.raw_decode = utf_8_decode
        self.update(1)

    NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')

    def check_printable(self, data):
        """Raise ReaderError if `data` has a character outside the allowed set."""
        match = self.NON_PRINTABLE.search(data)
        if match:
            character = match.group()
            # `data` follows whatever is still unread in the buffer.
            position = self.index+(len(self.buffer)-self.pointer)+match.start()
            raise ReaderError(self.name, position, character,
                    'unicode', "special characters are not allowed")

    def update(self, length):
        """Try to make at least `length` characters available in the buffer.

        Consumed characters are dropped from the buffer. Once the input is
        exhausted a '\\0' is appended and no further data is fetched.
        Raises ReaderError when the raw data cannot be decoded or holds a
        character that is not allowed.
        """
        if self.raw_buffer is None:
            return
        self.buffer = self.buffer[self.pointer:]
        self.pointer = 0
        while len(self.buffer) < length:
            if not self.eof:
                self.update_raw()
            if self.raw_decode is not None:
                try:
                    data, converted = self.raw_decode(self.raw_buffer,
                            'strict', self.eof)
                except UnicodeDecodeError:
                    # Fetch the exception through sys.exc_info() so the
                    # handler does not rely on version-specific
                    # `except X, e` / `except X as e` syntax.
                    import sys
                    exc = sys.exc_info()[1]
                    character = exc.object[exc.start]
                    if self.stream is not None:
                        position = self.stream_pointer-len(self.raw_buffer)+exc.start
                    else:
                        position = exc.start
                    raise ReaderError(self.name, position, character,
                            exc.encoding, exc.reason)
            else:
                # No decoder was selected: the raw data is used as it is.
                data = self.raw_buffer
                converted = len(data)
            self.check_printable(data)
            self.buffer += data
            self.raw_buffer = self.raw_buffer[converted:]
            if self.eof:
                self.buffer += u'\0'
                self.raw_buffer = None
                break

    def update_raw(self, size=1024):
        """Read up to `size` more units from the stream; flag EOF on empty read."""
        data = self.stream.read(size)
        if data:
            self.raw_buffer += data
            self.stream_pointer += len(data)
        else:
            self.eof = True
252 #try:
253 # import psyco
254 # psyco.bind(Reader)
255 #except ImportError:
256 # pass