# Fixed a problem when the DOCUMENT-END event is not emitted until the beginning of...
# [pyyaml.git] / lib / yaml / parser.py
# blob a46bb9e9b2a711716d214edd65480f878308409f
2 # The following YAML grammar is LL(1) and is parsed by a recursive descent
3 # parser.
5 # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
6 # implicit_document ::= block_node DOCUMENT-END*
7 # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
8 # block_node_or_indentless_sequence ::=
9 # ALIAS
10 # | properties (block_content | indentless_block_sequence)?
11 # | block_content
12 # | indentless_block_sequence
13 # block_node ::= ALIAS
14 # | properties block_content?
15 # | block_content
16 # flow_node ::= ALIAS
17 # | properties flow_content?
18 # | flow_content
19 # properties ::= TAG ANCHOR? | ANCHOR TAG?
20 # block_content ::= block_collection | flow_collection | SCALAR
21 # flow_content ::= flow_collection | SCALAR
22 # block_collection ::= block_sequence | block_mapping
23 # flow_collection ::= flow_sequence | flow_mapping
24 # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
25 # indentless_sequence ::= (BLOCK-ENTRY block_node?)+
26 # block_mapping ::= BLOCK-MAPPING-START
27 # ((KEY block_node_or_indentless_sequence?)?
28 # (VALUE block_node_or_indentless_sequence?)?)*
29 # BLOCK-END
30 # flow_sequence ::= FLOW-SEQUENCE-START
31 # (flow_sequence_entry FLOW-ENTRY)*
32 # flow_sequence_entry?
33 # FLOW-SEQUENCE-END
34 # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
35 # flow_mapping ::= FLOW-MAPPING-START
36 # (flow_mapping_entry FLOW-ENTRY)*
37 # flow_mapping_entry?
38 # FLOW-MAPPING-END
39 # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
41 # FIRST sets:
43 # stream: { STREAM-START }
44 # explicit_document: { DIRECTIVE DOCUMENT-START }
45 # implicit_document: FIRST(block_node)
46 # block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
47 # flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
48 # block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
49 # flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
50 # block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
51 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
52 # block_sequence: { BLOCK-SEQUENCE-START }
53 # block_mapping: { BLOCK-MAPPING-START }
54 # block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
55 # indentless_sequence: { BLOCK-ENTRY }
56 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
57 # flow_sequence: { FLOW-SEQUENCE-START }
58 # flow_mapping: { FLOW-MAPPING-START }
59 # flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
60 # flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
# Names exported by ``from parser import *``.
__all__ = ['Parser', 'ParserError']
64 from error import MarkedYAMLError
65 from tokens import *
66 from events import *
67 from scanner import *
class ParserError(MarkedYAMLError):
    """Raised when the token stream violates the YAML grammar.

    Inherits the context/problem mark reporting from MarkedYAMLError.
    """
class Parser(object):
    """LL(1) recursive-descent parser: turns the scanner's token stream
    into a stream of parsing events (see the grammar in the header of
    this file).

    The parser is an explicit state machine: ``self.state`` holds the
    bound method that will produce the next event, and ``self.states``
    is a stack of states to resume once a nested construct finishes.

    NOTE(review): ``check_token``/``peek_token``/``get_token`` are not
    defined here — they are expected to come from the scanner this class
    is combined with (``from scanner import *`` above); confirm against
    the loader that composes Parser with Scanner.
    """
    # Since writing a recursive-descendant parser is a straightforward task, we
    # do not give many comments here.
    # Note that we use Python generators. If you rewrite the parser in another
    # language, you may replace all 'yield'-s with event handler calls.

    # Tag handles that are always available, even without %TAG directives.
    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
    }

    def __init__(self):
        self.current_event = None               # one-event lookahead buffer
        self.yaml_version = None                # (major, minor) from %YAML
        self.tag_handles = {}                   # handle -> prefix from %TAG
        self.states = []                        # stack of states to resume
        self.marks = []                         # start marks of open collections
        self.state = self.parse_stream_start    # next state to run

    def check_event(self, *choices):
        # Check the type of the next event.
        # With no arguments, only checks that some event is available.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        if self.current_event is not None:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek_event(self):
        # Get the next event without consuming it.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        return self.current_event

    def get_event(self):
        # Get the next event and proceed further.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        value = self.current_event
        self.current_event = None
        return value

    # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
    # implicit_document ::= block_node DOCUMENT-END*
    # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*

    def parse_stream_start(self):

        # Parse the stream start.
        token = self.get_token()
        event = StreamStartEvent(token.start_mark, token.end_mark,
                encoding=token.encoding)

        # Prepare the next state.
        self.state = self.parse_implicit_document_start

        return event

    def parse_implicit_document_start(self):

        # Parse an implicit document: a bare node with no '---' marker.
        if not self.check_token(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            # No directives are possible, so the default handles apply as-is.
            self.tag_handles = self.DEFAULT_TAGS
            token = self.peek_token()
            start_mark = end_mark = token.start_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=False)

            # Prepare the next state.
            self.states.append(self.parse_document_end)
            self.state = self.parse_block_node

            return event

        else:
            # Directives, '---', or end of stream: defer to the explicit form.
            return self.parse_document_start()

    def parse_document_start(self):

        # Parse any extra document end indicators ('...').
        while self.check_token(DocumentEndToken):
            self.get_token()

        # Parse an explicit document.
        if not self.check_token(StreamEndToken):
            token = self.peek_token()
            start_mark = token.start_mark
            version, tags = self.process_directives()
            # An explicit document must begin with '---'.
            if not self.check_token(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.peek_token().id,
                        self.peek_token().start_mark)
            token = self.get_token()
            end_mark = token.end_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=True, version=version, tags=tags)
            self.states.append(self.parse_document_end)
            self.state = self.parse_document_content
        else:
            # Parse the end of the stream.
            token = self.get_token()
            event = StreamEndEvent(token.start_mark, token.end_mark)
            # All nested constructs must have been closed by now.
            assert not self.states
            assert not self.marks
            self.state = None
        return event

    def parse_document_end(self):

        # Parse the document end; the '...' marker is optional.
        token = self.peek_token()
        start_mark = end_mark = token.start_mark
        explicit = False
        if self.check_token(DocumentEndToken):
            token = self.get_token()
            end_mark = token.end_mark
            explicit = True
        event = DocumentEndEvent(start_mark, end_mark,
                explicit=explicit)

        # Prepare the next state.
        self.state = self.parse_document_start

        return event

    def parse_document_content(self):
        # An empty explicit document produces an empty scalar node.
        if self.check_token(DirectiveToken,
                DocumentStartToken, DocumentEndToken, StreamEndToken):
            event = self.process_empty_scalar(self.peek_token().start_mark)
            self.state = self.states.pop()
            return event
        else:
            return self.parse_block_node()

    def process_directives(self):
        # Consume %YAML and %TAG directives preceding '---'; returns the
        # (version, tags) pair for the DocumentStartEvent, where tags is
        # None when no %TAG directive was given.
        self.yaml_version = None
        self.tag_handles = {}
        while self.check_token(DirectiveToken):
            token = self.get_token()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        # Snapshot the explicit handles before merging in the defaults,
        # so the event reports only what the document declared.
        if self.tag_handles:
            value = self.yaml_version, self.tag_handles.copy()
        else:
            value = self.yaml_version, None
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]
        return value

    # block_node_or_indentless_sequence ::= ALIAS
    #               | properties (block_content | indentless_block_sequence)?
    #               | block_content
    #               | indentless_block_sequence
    # block_node    ::= ALIAS
    #                   | properties block_content?
    #                   | block_content
    # flow_node     ::= ALIAS
    #                   | properties flow_content?
    #                   | flow_content
    # properties ::= TAG ANCHOR? | ANCHOR TAG?
    # block_content     ::= block_collection | flow_collection | SCALAR
    # flow_content      ::= flow_collection | SCALAR
    # block_collection  ::= block_sequence | block_mapping
    # flow_collection   ::= flow_sequence | flow_mapping

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # Parse a single node: an alias, or optional anchor/tag properties
        # followed by scalar or collection content.
        if self.check_token(AliasToken):
            token = self.get_token()
            event = AliasEvent(token.value, token.start_mark, token.end_mark)
            self.state = self.states.pop()
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            # Properties may appear in either order: ANCHOR TAG? | TAG ANCHOR?
            if self.check_token(AnchorToken):
                token = self.get_token()
                start_mark = token.start_mark
                end_mark = token.end_mark
                anchor = token.value
                if self.check_token(TagToken):
                    token = self.get_token()
                    tag_mark = token.start_mark
                    end_mark = token.end_mark
                    tag = token.value
            elif self.check_token(TagToken):
                token = self.get_token()
                start_mark = tag_mark = token.start_mark
                end_mark = token.end_mark
                tag = token.value
                if self.check_token(AnchorToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    anchor = token.value
            # Resolve a (handle, suffix) tag token into a full tag string.
            if tag is not None:
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            #if tag == u'!':
            #    raise ParserError("while parsing a node", start_mark,
            #            "found non-specific tag '!'", tag_mark,
            #            "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.")
            if start_mark is None:
                start_mark = end_mark = self.peek_token().start_mark
            event = None
            # 'implicit' here: the tag must be inferred by the resolver.
            implicit = (tag is None or tag == u'!')
            if indentless_sequence and self.check_token(BlockEntryToken):
                end_mark = self.peek_token().end_mark
                event = SequenceStartEvent(anchor, tag, implicit,
                        start_mark, end_mark)
                self.state = self.parse_indentless_sequence_entry
            else:
                if self.check_token(ScalarToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    # Scalar implicitness is a pair: (resolvable as plain,
                    # resolvable as non-plain).
                    if (token.plain and tag is None) or tag == u'!':
                        implicit = (True, False)
                    elif tag is None:
                        implicit = (False, True)
                    else:
                        implicit = (False, False)
                    event = ScalarEvent(anchor, tag, implicit, token.value,
                            start_mark, end_mark, style=token.style)
                    self.state = self.states.pop()
                elif self.check_token(FlowSequenceStartToken):
                    end_mark = self.peek_token().end_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_sequence_first_entry
                elif self.check_token(FlowMappingStartToken):
                    end_mark = self.peek_token().end_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_mapping_first_key
                elif block and self.check_token(BlockSequenceStartToken):
                    end_mark = self.peek_token().start_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_sequence_first_entry
                elif block and self.check_token(BlockMappingStartToken):
                    end_mark = self.peek_token().start_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_mapping_first_key
                elif anchor is not None or tag is not None:
                    # Empty scalars are allowed even if a tag or an anchor is
                    # specified.
                    event = ScalarEvent(anchor, tag, (implicit, False), u'',
                            start_mark, end_mark)
                    self.state = self.states.pop()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.peek_token()
                    raise ParserError("while parsing a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
        return event

    # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END

    def parse_block_sequence_first_entry(self):
        # Consume BLOCK-SEQUENCE-START and remember where it began for
        # error reporting.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_sequence_entry()

    def parse_block_sequence_entry(self):
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken, BlockEndToken):
                self.states.append(self.parse_block_sequence_entry)
                return self.parse_block_node()
            else:
                # '- ' followed by another entry or the end: empty item.
                self.state = self.parse_block_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block collection", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    # indentless_sequence ::= (BLOCK-ENTRY block_node?)+

    def parse_indentless_sequence_entry(self):
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_indentless_sequence_entry)
                return self.parse_block_node()
            else:
                self.state = self.parse_indentless_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        # No BLOCK-END token for indentless sequences; the sequence ends
        # at the next non-entry token, which is left unconsumed.
        token = self.peek_token()
        event = SequenceEndEvent(token.start_mark, token.start_mark)
        self.state = self.states.pop()
        return event

    # block_mapping ::= BLOCK-MAPPING-START
    #                   ((KEY block_node_or_indentless_sequence?)?
    #                   (VALUE block_node_or_indentless_sequence?)?)*
    #                   BLOCK-END

    def parse_block_mapping_first_key(self):
        # Consume BLOCK-MAPPING-START and remember its position.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_mapping_key()

    def parse_block_mapping_key(self):
        if self.check_token(KeyToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_value)
                return self.parse_block_node_or_indentless_sequence()
            else:
                # '?' with no key node: empty scalar key.
                self.state = self.parse_block_mapping_value
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block mapping", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_block_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_key)
                return self.parse_block_node_or_indentless_sequence()
            else:
                # ':' with no value node: empty scalar value.
                self.state = self.parse_block_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            # Missing ':' entirely: the key has an empty value.
            self.state = self.parse_block_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    # flow_sequence ::= FLOW-SEQUENCE-START
    #                   (flow_sequence_entry FLOW-ENTRY)*
    #                   flow_sequence_entry?
    #                   FLOW-SEQUENCE-END
    # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?

    # Note that while production rules for both flow_sequence_entry and
    # flow_mapping_entry are equal, their interpretations are different.
    # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
    # generate an inline mapping (set syntax).

    def parse_flow_sequence_first_entry(self):
        # Consume FLOW-SEQUENCE-START ('[') and remember its position.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_sequence_entry(first=True)

    def parse_flow_sequence_entry(self, first=False):
        if not self.check_token(FlowSequenceEndToken):
            if not first:
                # Entries after the first must be separated by ','.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow sequence", self.marks[-1],
                            "expected ',' or ']', but got %r" % token.id, token.start_mark)

            if self.check_token(KeyToken):
                # '?' inside a flow sequence starts a single-pair mapping;
                # the KEY token is only peeked here and consumed in
                # parse_flow_sequence_entry_mapping_key.
                token = self.peek_token()
                event = MappingStartEvent(None, None, True,
                        token.start_mark, token.end_mark,
                        flow_style=True)
                self.state = self.parse_flow_sequence_entry_mapping_key
                return event
            elif not self.check_token(FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry)
                return self.parse_flow_node()
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_sequence_entry_mapping_key(self):
        # Consume the KEY token peeked at in parse_flow_sequence_entry.
        token = self.get_token()
        if not self.check_token(ValueToken,
                FlowEntryToken, FlowSequenceEndToken):
            self.states.append(self.parse_flow_sequence_entry_mapping_value)
            return self.parse_flow_node()
        else:
            self.state = self.parse_flow_sequence_entry_mapping_value
            return self.process_empty_scalar(token.end_mark)

    def parse_flow_sequence_entry_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry_mapping_end)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_sequence_entry_mapping_end
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_sequence_entry_mapping_end
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_sequence_entry_mapping_end(self):
        # The inline single-pair mapping has no closing token of its own.
        self.state = self.parse_flow_sequence_entry
        token = self.peek_token()
        return MappingEndEvent(token.start_mark, token.start_mark)

    # flow_mapping ::= FLOW-MAPPING-START
    #                  (flow_mapping_entry FLOW-ENTRY)*
    #                  flow_mapping_entry?
    #                  FLOW-MAPPING-END
    # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?

    def parse_flow_mapping_first_key(self):
        # Consume FLOW-MAPPING-START ('{') and remember its position.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_mapping_key(first=True)

    def parse_flow_mapping_key(self, first=False):
        if not self.check_token(FlowMappingEndToken):
            if not first:
                # Entries after the first must be separated by ','.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow mapping", self.marks[-1],
                            "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.check_token(KeyToken):
                token = self.get_token()
                if not self.check_token(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    self.states.append(self.parse_flow_mapping_value)
                    return self.parse_flow_node()
                else:
                    self.state = self.parse_flow_mapping_value
                    return self.process_empty_scalar(token.end_mark)
            elif not self.check_token(FlowMappingEndToken):
                # A bare node without '?': key with an empty value.
                self.states.append(self.parse_flow_mapping_empty_value)
                return self.parse_flow_node()
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowMappingEndToken):
                self.states.append(self.parse_flow_mapping_key)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_mapping_empty_value(self):
        self.state = self.parse_flow_mapping_key
        return self.process_empty_scalar(self.peek_token().start_mark)

    def process_empty_scalar(self, mark):
        # Produce an empty plain scalar event at the given position.
        return ScalarEvent(None, None, (True, False), u'', mark, mark)