Back to work :). Rename markers to marks.
[pyyaml/python3.git] / lib / yaml / parser.py
blob28a05261075817d66e4c016b9077e83214fb166c
# YAML can be parsed by an LL(1) parser!
#
# We use the following production rules:
# stream            ::= implicit_document? explicit_document* STREAM-END
# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
# implicit_document ::= block_node DOCUMENT-END?
# block_node        ::= ALIAS | properties? block_content
# flow_node         ::= ALIAS | properties? flow_content
# properties        ::= TAG ANCHOR? | ANCHOR TAG?
# block_content     ::= block_collection | flow_collection | SCALAR
# flow_content      ::= flow_collection | SCALAR
# block_collection  ::= block_sequence | block_mapping
# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
# block_mapping     ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
# indentless_block_sequence ::= (BLOCK-ENTRY block_node?)+
# flow_collection   ::= flow_sequence | flow_mapping
# flow_sequence     ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
# flow_mapping      ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
# flow_mapping_entry  ::= flow_node | KEY flow_node? (VALUE flow_node?)?
#
# TODO: support for BOM within a stream.
# stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END
#
# Note that there is a slight deviation from the specification. We require a
# non-empty node content if ANCHOR or TAG is specified. This disallows such
# documents as
#
#   key: !!str    # empty value
#
# This is done to prevent ambiguity in parsing tags and aliases:
#
#   { !!perl/YAML::Parser: value }
#
# What is it? Should it be interpreted as
#   { ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
# or
#   { ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
# Since we disallow empty node content, tags are always followed by spaces
# or line breaks.
#
# FIRST sets:
# stream: FIRST(block_node) + { DIRECTIVE DOCUMENT-START }
# explicit_document: { DIRECTIVE DOCUMENT-START }
# implicit_document: FIRST(block_node)
# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_sequence: { BLOCK-SEQUENCE-START }
# block_mapping: { BLOCK-MAPPING-START }
# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
# indentless_sequence: { ENTRY }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_sequence: { FLOW-SEQUENCE-START }
# flow_mapping: { FLOW-MAPPING-START }
# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
# Public API of this module.
__all__ = ['Parser', 'ParserError']

from error import MarkedYAMLError
from tokens import *
from events import *
class ParserError(MarkedYAMLError):
    """Raised when the token stream cannot be parsed into a valid event
    stream; carries context and problem marks inherited from
    MarkedYAMLError."""
    pass
class Parser:
    """Turn the scanner's token stream into a stream of parsing events.

    This is a hand-written LL(1) recursive-descent parser implemented with
    Python generators: each parse_* method is a generator yielding events,
    and `event_generator` chains them for the whole stream.  Since writing
    an LL(1) parser is a straightforward task, we do not give many comments
    here.  If you rewrite the parser in another language, you may replace
    all 'yield'-s with event handler calls.
    """

    # Tag handles that are always available, even without %TAG directives.
    DEFAULT_TAGS = {
        u'!': u'!',
        u'!!': u'tag:yaml.org,2002:',
    }

    def __init__(self, scanner):
        # `scanner` must provide check()/peek()/get() over tokens
        # (same protocol this class exposes over events).
        self.scanner = scanner
        self.current_event = None   # one-event lookahead buffer
        self.yaml_version = None    # set by a %YAML directive
        self.tag_handles = {}       # handle -> prefix, from %TAG directives
        self.event_generator = self.parse_stream()

    def _advance(self):
        # Fill the lookahead buffer with the next event; a no-op if one is
        # already buffered or the event stream is exhausted.
        if self.current_event is None:
            try:
                self.current_event = next(self.event_generator)
            except StopIteration:
                pass

    def check(self, *choices):
        """Return True if the next event is an instance of one of `choices`."""
        self._advance()
        if self.current_event is not None:
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek(self):
        """Return the next event without consuming it (None at end of stream)."""
        self._advance()
        return self.current_event

    def get(self):
        """Return and consume the next event (None at end of stream)."""
        self._advance()
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol: iterating a Parser yields its events.
        return self.event_generator

    def parse_stream(self):
        # stream ::= implicit_document? explicit_document* STREAM-END

        # Parse an implicit document (content without directives or '---').
        if not self.scanner.check(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            for event in self.parse_block_node():
                yield event

        # Parse explicit documents.
        while not self.scanner.check(StreamEndToken):
            self.process_directives()
            if not self.scanner.check(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.scanner.peek().id,
                        self.scanner.peek().start_mark)
            token = self.scanner.get()
            if self.scanner.check(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                # '---' with no content: the document is an empty scalar.
                yield self.process_empty_scalar(token.end_mark)
            else:
                for event in self.parse_block_node():
                    yield event
            while self.scanner.check(DocumentEndToken):
                self.scanner.get()

        # Parse the end of the stream.
        token = self.scanner.get()
        yield StreamEndEvent(token.start_mark, token.end_mark)

    def process_directives(self):
        """Consume DIRECTIVE* tokens, recording the YAML version and tag handles."""
        self.yaml_version = None
        self.tag_handles = {}
        while self.scanner.check(DirectiveToken):
            token = self.scanner.get()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        # The default handles are always defined, even if overridden handles
        # were declared.
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node ::= ALIAS | properties? block_content
        # flow_node ::= ALIAS | properties? flow_content
        # properties ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content ::= block_collection | flow_collection | SCALAR
        # flow_content ::= flow_collection | SCALAR
        # block_collection ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
        if self.scanner.check(AliasToken):
            token = self.scanner.get()
            yield AliasEvent(token.value, token.start_mark, token.end_mark)
        else:
            # Collect the optional anchor/tag properties, in either order.
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            if self.scanner.check(AnchorToken):
                token = self.scanner.get()
                start_mark = end_mark = token.start_mark
                anchor = token.value
                if self.scanner.check(TagToken):
                    token = self.scanner.get()
                    end_mark = tag_mark = token.start_mark
                    tag = token.value
            elif self.scanner.check(TagToken):
                token = self.scanner.get()
                start_mark = end_mark = tag_mark = token.start_mark
                tag = token.value
                if self.scanner.check(AnchorToken):
                    token = self.scanner.get()
                    end_mark = token.start_mark
                    anchor = token.value
            # Resolve the tag handle against the declared prefixes.
            if tag is not None:
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            if tag is None:
                # Untagged non-plain scalars (and collections) get the
                # non-specific tag u'!'; plain scalars stay None so the
                # resolver may pick a tag later.
                if not (self.scanner.check(ScalarToken) and
                        self.scanner.peek().plain):
                    tag = u'!'
            if start_mark is None:
                start_mark = self.scanner.peek().start_mark
            event = None
            collection_events = None
            if indentless_sequence and self.scanner.check(BlockEntryToken):
                end_mark = self.scanner.peek().end_mark
                event = SequenceEvent(anchor, tag, start_mark, end_mark)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.scanner.check(ScalarToken):
                    token = self.scanner.get()
                    end_mark = token.end_mark
                    event = ScalarEvent(anchor, tag, token.value,
                            start_mark, end_mark)
                elif self.scanner.check(FlowSequenceStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = SequenceEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_flow_sequence()
                elif self.scanner.check(FlowMappingStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = MappingEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_flow_mapping()
                elif block and self.scanner.check(BlockSequenceStartToken):
                    end_mark = self.scanner.peek().start_mark
                    event = SequenceEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_block_sequence()
                elif block and self.scanner.check(BlockMappingStartToken):
                    end_mark = self.scanner.peek().start_mark
                    event = MappingEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_block_mapping()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.scanner.peek()
                    raise ParserError("while scanning a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                # '-' with no content: an empty scalar entry.
                yield self.process_empty_scalar(token.end_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block collection", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        # There is no closing token; the end event is zero-width at the
        # position of the next token.
        token = self.scanner.peek()
        yield CollectionEndEvent(token.start_mark, token.start_mark)

    def parse_block_mapping(self):
        # BLOCK-MAPPING_START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(KeyToken, ValueToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            if self.scanner.check(ValueToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            else:
                # A key without a value: the value is an empty scalar.
                token = self.scanner.peek()
                yield self.process_empty_scalar(token.start_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block mapping", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_sequence(self):
        # flow_sequence ::= FLOW-SEQUENCE-START
        #                   (flow_sequence_entry FLOW-ENTRY)*
        #                   flow_sequence_entry?
        #                   FLOW-SEQUENCE-END
        # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        #
        # Note that while production rules for both flow_sequence_entry and
        # flow_mapping_entry are equal, their interpretations are different.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generate an inline mapping (set syntax).
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowSequenceEndToken):
            if self.scanner.check(KeyToken):
                # Single-pair inline mapping as a sequence entry.
                token = self.scanner.get()
                yield MappingEvent(None, u'!',
                        token.start_mark, token.end_mark)
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
                # Close the zero-width inline mapping.
                token = self.scanner.peek()
                yield CollectionEndEvent(token.start_mark, token.start_mark)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow sequence", start_mark,
                        "expected ',' or ']', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_mapping(self):
        # flow_mapping ::= FLOW-MAPPING-START
        #                  (flow_mapping_entry FLOW-ENTRY)*
        #                  flow_mapping_entry?
        #                  FLOW-MAPPING-END
        # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowMappingEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
            else:
                # A bare node used as a key; its value is an empty scalar.
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.scanner.peek().start_mark)
            if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow mapping", start_mark,
                        "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        if not self.scanner.check(FlowMappingEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a flow mapping", start_mark,
                    "expected '}', but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def process_empty_scalar(self, mark):
        """Build a zero-width empty plain scalar event at `mark`."""
        return ScalarEvent(None, None, u'', mark, mark)