# Parser is done. Add iterator interfaces for Scanner and Parser.
# lib/yaml/parser.py
2 # YAML can be parsed by an LL(1) parser!
4 # We use the following production rules:
5 # stream ::= implicit_document? explicit_document* STREAM-END
6 # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
7 # implicit_document ::= block_node DOCUMENT-END?
8 # block_node ::= ALIAS | properties? block_content
9 # flow_node ::= ALIAS | properties? flow_content
10 # properties ::= TAG ANCHOR? | ANCHOR TAG?
11 # block_content ::= block_collection | flow_collection | SCALAR
12 # flow_content ::= flow_collection | SCALAR
13 # block_collection ::= block_sequence | block_mapping
14 # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
15 # block_mapping ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
16 # block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
17 # indentless_block_sequence ::= (BLOCK-ENTRY block_node?)+
18 # flow_collection ::= flow_sequence | flow_mapping
19 # flow_sequence ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
20 # flow_mapping ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
21 # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
22 # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
24 # Note that there is a slight deviation from the specification. We require a
# non-empty node content if ANCHOR or TAG is specified. This disallows such
26 # documents as
28 # key: !!str # empty value
30 # This is done to prevent ambiguity in parsing tags and aliases:
32 # { !!perl/YAML::Parser: value }
34 # What is it? Should it be interpreted as
35 # { ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
36 # or
37 # { ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
# Since we disallow empty node content when ANCHOR or TAG is given, tags are
# always followed by spaces or line breaks.
41 # FIRST sets:
42 # stream: FIRST(block_node) + { DIRECTIVE DOCUMENT-START }
43 # explicit_document: { DIRECTIVE DOCUMENT-START }
44 # implicit_document: FIRST(block_node)
45 # block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
46 # flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
47 # block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
48 # flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
49 # block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
50 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
51 # block_sequence: { BLOCK-SEQUENCE-START }
52 # block_mapping: { BLOCK-MAPPING-START }
53 # block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
54 # indentless_sequence: { ENTRY }
55 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
56 # flow_sequence: { FLOW-SEQUENCE-START }
57 # flow_mapping: { FLOW-MAPPING-START }
58 # flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
59 # flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
61 from error import YAMLError
62 from tokens import *
63 from events import *
65 class ParserError(YAMLError):
67 def __init__(self, context=None, context_marker=None,
68 problem=None, problem_marker=None):
69 self.context = context
70 self.context_marker = context_marker
71 self.problem = problem
72 self.problem_marker = problem_marker
74 def __str__(self):
75 lines = []
76 for (place, marker) in [(self.context, self.context_marker),
77 (self.problem, self.problem_marker)]:
78 if place is not None:
79 lines.append(place)
80 if marker is not None:
81 lines.append(str(marker))
82 return '\n'.join(lines)
84 class Parser:
85 # Since writing an LL(1) parser is a straightforward task, we do not give
86 # many comments here.
87 # Note that we use Python generators. If you rewrite the parser to another
88 # language, you may replace all 'yield'-s with event handler calls.
90 DEFAULT_TAGS = {
91 u'!': u'!',
92 u'!!': u'tag:yaml.org,2002:',
95 def __init__(self, scanner):
96 self.scanner = scanner
97 self.current_event = None
98 self.yaml_version = None
99 self.tag_handles = {}
100 self.event_generator = self.parse_stream()
102 def check(self, *choices):
103 # Check the type of the next event.
104 if self.current_event is None:
105 try:
106 self.current_event = self.event_generator.next()
107 except StopIteration:
108 pass
109 if self.current_event is not None:
110 for choice in choices:
111 if isinstance(self.current_event, choice):
112 return True
113 return False
115 def get(self):
116 # Get the next event.
117 if self.current_event is None:
118 try:
119 self.current_event = self.event_generator.next()
120 except StopIteration:
121 pass
122 value = self.current_event
123 self.current_event = None
124 return value
126 def __iter__(self):
127 # Iterator protocol.
128 return self.event_generator
130 def parse_stream(self):
131 # implicit_document? explicit_document* STREAM-END
133 # Parse implicit document.
134 if not self.scanner.check(DirectiveToken, DocumentStartToken,
135 StreamEndToken):
136 self.tag_handles = self.DEFAULT_TAGS
137 for event in self.parse_block_node():
138 yield event
140 # Parse explicit documents.
141 while not self.scanner.check(StreamEndToken):
142 self.process_directives()
143 if not self.scanner.check(DocumentStartToken):
144 raise ParserError(None, None,
145 "expected '<document start>', but found %r"
146 % self.scanner.peek().id,
147 self.scanner.peek().start_marker)
148 token = self.scanner.get()
149 if self.scanner.check(DirectiveToken,
150 DocumentStartToken, DocumentEndToken, StreamEndToken):
151 yield self.process_empty_scalar(token.end_marker)
152 else:
153 for event in self.parse_block_node():
154 yield event
155 while self.scanner.check(DocumentEndToken):
156 self.scanner.get()
158 # Parse end of stream.
159 token = self.scanner.get()
160 yield StreamEndEvent(token.start_marker, token.end_marker)
162 def process_directives(self):
163 # DIRECTIVE*
164 self.yaml_version = None
165 self.tag_handles = {}
166 while self.scanner.check(DirectiveToken):
167 token = self.scanner.get()
168 if token.name == u'YAML':
169 if self.yaml_version is not None:
170 raise ParserError(None, None,
171 "found duplicate YAML directive", token.start_marker())
172 major, minor = token.value
173 if major != 1:
174 raise ParserError(None, None,
175 "found incompatible YAML document (version 1.* is required)",
176 token.start_marker())
177 self.yaml_version = token.value
178 elif token.name == u'TAG':
179 handle, prefix = token.value
180 if handle in self.tag_handles:
181 raise ParserError(None, None,
182 "duplicate tag handle %r" % handle.encode('utf-8'),
183 token.start_marker())
184 self.tag_handles[handle] = prefix
185 for key in self.DEFAULT_TAGS:
186 if key not in self.tag_handles:
187 self.tag_handles[key] = self.DEFAULT_TAGS[key]
189 def parse_block_node(self):
190 return self.parse_node(block=True)
192 def parse_flow_node(self):
193 return self.parse_node()
195 def parse_block_node_or_indentless_sequence(self):
196 return self.parse_node(block=True, indentless_sequence=True)
198 def parse_node(self, block=False, indentless_sequence=False):
199 # block_node ::= ALIAS | properties? block_content
200 # flow_node ::= ALIAS | properties? flow_content
201 # properties ::= TAG ANCHOR? | ANCHOR TAG?
202 # block_content ::= block_collection | flow_collection | SCALAR
203 # flow_content ::= flow_collection | SCALAR
204 # block_collection ::= block_sequence | block_mapping
205 # block_node_or_indentless_sequence ::= ALIAS | properties?
206 # (block_content | indentless_block_sequence)
207 if self.scanner.check(AliasToken):
208 token = self.scanner.get()
209 yield AliasEvent(token.value, token.start_marker, token.end_marker)
210 else:
211 anchor = None
212 tag = None
213 start_marker = end_marker = tag_marker = None
214 if self.scanner.check(AnchorToken):
215 token = self.scanner.get()
216 start_marker = end_marker = token.start_marker
217 anchor = token.value
218 if self.scanner.check(TagToken):
219 token = self.scanner.get()
220 end_marker = tag_marker = token.start_marker
221 tag = token.value
222 elif self.scanner.check(TagToken):
223 token = self.scanner.get()
224 start_marker = end_marker = tag_marker = token.start_marker
225 tag = token.value
226 if self.scanner.check(AnchorToken):
227 token = self.scanner.get()
228 end_marker = token.start_marker
229 anchor = token.value
230 if tag is not None:
231 handle, suffix = tag
232 if handle is not None:
233 if handle not in self.tag_handles:
234 raise ParserError("while parsing a node", start_marker,
235 "found undefined tag handle %r" % handle.encode('utf-8'),
236 tag_marker)
237 tag = self.tag_handles[handle]+suffix
238 else:
239 tag = suffix
240 if tag is None:
241 if not (self.scanner.check(ScalarToken) and
242 self.scanner.peek().plain):
243 tag = u'!'
244 if start_marker is None:
245 start_marker = self.scanner.peek().start_marker
246 event = None
247 collection_events = None
248 if indentless_sequence and self.scanner.check(BlockEntryToken):
249 end_marker = self.scanner.peek().end_marker
250 event = SequenceEvent(anchor, tag, start_marker, end_marker)
251 collection_events = self.parse_indentless_sequence()
252 else:
253 if self.scanner.check(ScalarToken):
254 token = self.scanner.get()
255 end_marker = token.end_marker
256 event = ScalarEvent(anchor, tag, token.value,
257 start_marker, end_marker)
258 elif self.scanner.check(FlowSequenceStartToken):
259 end_marker = self.scanner.peek().end_marker
260 event = SequenceEvent(anchor, tag, start_marker, end_marker)
261 collection_events = self.parse_flow_sequence()
262 elif self.scanner.check(FlowMappingStartToken):
263 end_marker = self.scanner.peek().end_marker
264 event = MappingEvent(anchor, tag, start_marker, end_marker)
265 collection_events = self.parse_flow_mapping()
266 elif block and self.scanner.check(BlockSequenceStartToken):
267 end_marker = self.scanner.peek().start_marker
268 event = SequenceEvent(anchor, tag, start_marker, end_marker)
269 collection_events = self.parse_block_sequence()
270 elif block and self.scanner.check(BlockMappingStartToken):
271 end_marker = self.scanner.peek().start_marker
272 event = MappingEvent(anchor, tag, start_marker, end_marker)
273 collection_events = self.parse_block_mapping()
274 else:
275 if block:
276 node = 'block'
277 else:
278 node = 'flow'
279 token = self.scanner.peek()
280 raise ParserError("while scanning a %s node" % node, start_marker,
281 "expected the node content, but found %r" % token.id,
282 token.start_marker)
283 yield event
284 if collection_events is not None:
285 for event in collection_events:
286 yield event
288 def parse_block_sequence(self):
289 # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
290 token = self.scanner.get()
291 start_marker = token.start_marker
292 while self.scanner.check(BlockEntryToken):
293 token = self.scanner.get()
294 if not self.scanner.check(BlockEntryToken, BlockEndToken):
295 for event in self.parse_block_node():
296 yield event
297 else:
298 yield self.process_empty_scalar(token.end_marker)
299 if not self.scanner.check(BlockEndToken):
300 token = self.scanner.peek()
301 raise ParserError("while scanning a block collection", start_marker,
302 "expected <block end>, but found %r" % token.id, token.start_marker)
303 token = self.scanner.get()
304 yield CollectionEndEvent(token.start_marker, token.end_marker)
306 def parse_indentless_sequence(self):
307 # (BLOCK-ENTRY block_node?)+
308 while self.scanner.check(BlockEntryToken):
309 token = self.scanner.get()
310 if not self.scanner.check(BlockEntryToken,
311 KeyToken, ValueToken, BlockEndToken):
312 for event in self.parse_block_node():
313 yield event
314 else:
315 yield self.process_empty_scalar(token.end_marker)
316 token = self.scanner.peek()
317 yield CollectionEndEvent(token.start_marker, token.start_marker)
319 def parse_block_mapping(self):
320 # BLOCK-MAPPING_START
321 # ((KEY block_node_or_indentless_sequence?)?
322 # (VALUE block_node_or_indentless_sequence?)?)*
323 # BLOCK-END
324 token = self.scanner.get()
325 start_marker = token.start_marker
326 while self.scanner.check(KeyToken, ValueToken):
327 if self.scanner.check(KeyToken):
328 token = self.scanner.get()
329 if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
330 for event in self.parse_block_node_or_indentless_sequence():
331 yield event
332 else:
333 yield self.process_empty_scalar(token.end_marker)
334 if self.scanner.check(ValueToken):
335 token = self.scanner.get()
336 if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
337 for event in self.parse_block_node_or_indentless_sequence():
338 yield event
339 else:
340 yield self.process_empty_scalar(token.end_marker)
341 else:
342 token = self.scanner.peek()
343 yield self.process_empty_scalar(token.start_marker)
344 if not self.scanner.check(BlockEndToken):
345 token = self.scanner.peek()
346 raise ParserError("while scanning a block mapping", start_marker,
347 "expected <block end>, but found %r" % token.id, token.start_marker)
348 token = self.scanner.get()
349 yield CollectionEndEvent(token.start_marker, token.end_marker)
351 def parse_flow_sequence(self):
352 # flow_sequence ::= FLOW-SEQUENCE-START
353 # (flow_sequence_entry FLOW-ENTRY)*
354 # flow_sequence_entry?
355 # FLOW-SEQUENCE-END
356 # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
358 # Note that while production rules for both flow_sequence_entry and
359 # flow_mapping_entry are equal, their interpretations are different.
360 # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
361 # generate an inline mapping (set syntax).
362 token = self.scanner.get()
363 start_marker = token.start_marker
364 while not self.scanner.check(FlowSequenceEndToken):
365 if self.scanner.check(KeyToken):
366 token = self.scanner.get()
367 yield MappingEvent(None, u'!',
368 token.start_marker, token.end_marker)
369 if not self.scanner.check(ValueToken,
370 FlowEntryToken, FlowSequenceEndToken):
371 for event in self.parse_flow_node():
372 yield event
373 else:
374 yield self.process_empty_scalar(token.end_marker)
375 if self.scanner.check(ValueToken):
376 token = self.scanner.get()
377 if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
378 for event in self.parse_flow_node():
379 yield event
380 else:
381 yield self.process_empty_scalar(token.end_marker)
382 else:
383 token = self.scanner.peek()
384 yield self.process_empty_scalar(token.start_marker)
385 token = self.scanner.peek()
386 yield CollectionEndEvent(token.start_marker, token.start_marker)
387 else:
388 for event in self.parse_flow_node():
389 yield event
390 if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
391 token = self.scanner.peek()
392 raise ParserError("while scanning a flow sequence", start_marker,
393 "expected ',' or ']', but got %r" % token.id, token.start_marker)
394 if self.scanner.check(FlowEntryToken):
395 self.scanner.get()
396 if not self.scanner.check(FlowSequenceEndToken):
397 token = self.scanner.peek()
398 raise ParserError("while scanning a flow sequence", start_marker,
399 "expected ']', but found %r" % token.id, token.start_marker)
400 token = self.scanner.get()
401 yield CollectionEndEvent(token.start_marker, token.end_marker)
403 def parse_flow_mapping(self):
404 # flow_mapping ::= FLOW-MAPPING-START
405 # (flow_mapping_entry FLOW-ENTRY)*
406 # flow_mapping_entry?
407 # FLOW-MAPPING-END
408 # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
409 token = self.scanner.get()
410 start_marker = token.start_marker
411 while not self.scanner.check(FlowMappingEndToken):
412 if self.scanner.check(KeyToken):
413 token = self.scanner.get()
414 if not self.scanner.check(ValueToken,
415 FlowEntryToken, FlowMappingEndToken):
416 for event in self.parse_flow_node():
417 yield event
418 else:
419 yield self.process_empty_scalar(token.end_marker)
420 if self.scanner.check(ValueToken):
421 token = self.scanner.get()
422 if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
423 for event in self.parse_flow_node():
424 yield event
425 else:
426 yield self.process_empty_scalar(token.end_marker)
427 else:
428 token = self.scanner.peek()
429 yield self.process_empty_scalar(token.start_marker)
430 else:
431 for event in self.parse_flow_node():
432 yield event
433 yield self.process_empty_scalar(self.scanner.peek().start_marker)
434 if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
435 token = self.scanner.peek()
436 raise ParserError("while scanning a flow mapping", start_marker,
437 "expected ',' or '}', but got %r" % token.id, token.start_marker)
438 if self.scanner.check(FlowEntryToken):
439 self.scanner.get()
440 if not self.scanner.check(FlowMappingEndToken):
441 token = self.scanner.peek()
442 raise ParserError("while scanning a flow mapping", start_marker,
443 "expected '}', but found %r" % token.id, token.start_marker)
444 token = self.scanner.get()
445 yield CollectionEndEvent(token.start_marker, token.end_marker)
447 def process_empty_scalar(self, marker):
448 return ScalarEvent(None, None, u'', marker, marker)