Loose indentation rules for the following cases:
[pyyaml/python3.git] / lib / yaml / scanner.py
blob29de348e4cd99eaffe443275c8c4d82c5ce237df
2 # Scanner produces tokens of the following types:
3 # DIRECTIVE(name, value)
4 # DOCUMENT-START
5 # DOCUMENT-END
6 # STREAM-END
7 # BLOCK-SEQUENCE-START
8 # BLOCK-MAPPING-START
9 # BLOCK-END
10 # FLOW-SEQUENCE-START
11 # FLOW-MAPPING-START
12 # FLOW-SEQUENCE-END
13 # FLOW-MAPPING-END
14 # BLOCK-ENTRY
15 # FLOW-ENTRY
16 # KEY
17 # VALUE
18 # ALIAS(value)
19 # ANCHOR(value)
20 # TAG(value)
21 # SCALAR(value, plain)
23 # Read comments in the Scanner code for more details.
26 __all__ = ['Scanner', 'ScannerError']
28 from error import MarkedYAMLError
29 from tokens import *
class ScannerError(MarkedYAMLError):
    """Error raised by the Scanner for input that cannot be tokenized."""
    pass
class SimpleKey:
    """Position record for a potential simple key.

    See the simple keys treatment in ``Scanner`` below.
    """

    # Field names, in constructor order.
    _FIELDS = ('token_number', 'required', 'index', 'line', 'column', 'mark')

    def __init__(self, token_number, required, index, line, column, mark):
        # Bind every field of the record in one pass.
        for name, value in zip(self._FIELDS,
                (token_number, required, index, line, column, mark)):
            setattr(self, name, value)
45 class Scanner:
48 def __init__(self, reader):
49 """Initialize the scanner."""
50 # The input stream. The Reader class do the dirty work of checking for
51 # BOM and converting the input data to Unicode. It also adds NUL to
52 # the end.
54 # Reader supports the following methods
55 # self.reader.peek(i=0) # peek the next i-th character
56 # self.reader.prefix(l=1) # peek the next l characters
57 # self.reader.forward(l=1) # read the next l characters
58 # and move the pointer
59 self.reader = reader
61 # Had we reached the end of the stream?
62 self.done = False
64 # The number of unclosed '{' and '['. `flow_level == 0` means block
65 # context.
66 self.flow_level = 0
68 # List of processed tokens that are not yet emitted.
69 self.tokens = []
71 # Number of tokens that were emitted through the `get_token` method.
72 self.tokens_taken = 0
74 # The current indentation level.
75 self.indent = -1
77 # Past indentation levels.
78 self.indents = []
80 # Variables related to simple keys treatment.
82 # A simple key is a key that is not denoted by the '?' indicator.
83 # Example of simple keys:
84 # ---
85 # block simple key: value
86 # ? not a simple key:
87 # : { flow simple key: value }
88 # We emit the KEY token before all keys, so when we find a potential
89 # simple key, we try to locate the corresponding ':' indicator.
90 # Simple keys should be limited to a single line and 1024 characters.
92 # Can a simple key start at the current position? A simple key may
93 # start:
94 # - at the beginning of the line, not counting indentation spaces
95 # (in block context),
96 # - after '{', '[', ',' (in the flow context),
97 # - after '?', ':', '-' (in the block context).
98 # In the block context, this flag also signifies if a block collection
99 # may start at the current position.
100 self.allow_simple_key = True
102 # Keep track of possible simple keys. This is a dictionary. The key
103 # is `flow_level`; there can be no more that one possible simple key
104 # for each level. The value is a SimpleKey record:
105 # (token_number, required, index, line, column, mark)
106 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
107 # '[', or '{' tokens.
108 self.possible_simple_keys = {}
110 # Public methods.
112 def check(self, *choices):
113 # Check if the next token is one of the given types.
114 while self.need_more_tokens():
115 self.fetch_more_tokens()
116 if self.tokens:
117 for choice in choices:
118 if isinstance(self.tokens[0], choice):
119 return True
120 return False
122 def peek(self):
123 # Return the next token, but do not delete if from the queue.
124 while self.need_more_tokens():
125 self.fetch_more_tokens()
126 if self.tokens:
127 return self.tokens[0]
129 def get(self):
130 # Return the next token.
131 while self.need_more_tokens():
132 self.fetch_more_tokens()
133 if self.tokens:
134 self.tokens_taken += 1
135 return self.tokens.pop(0)
137 def __iter__(self):
138 # Iterator protocol.
139 while self.need_more_tokens():
140 self.fetch_more_tokens()
141 while self.tokens:
142 self.tokens_taken += 1
143 yield self.tokens.pop(0)
144 while self.need_more_tokens():
145 self.fetch_more_tokens()
147 # Private methods.
149 def need_more_tokens(self):
150 if self.done:
151 return False
152 if not self.tokens:
153 return True
154 # The current token may be a potential simple key, so we
155 # need to look further.
156 self.stale_possible_simple_keys()
157 if self.next_possible_simple_key() == self.tokens_taken:
158 return True
    def fetch_more_tokens(self):
        """Scan the input and append the next token(s) to `self.tokens`.

        Dispatches on the next character of the stream; exactly one
        ``fetch_*`` handler runs per call.  Raises ScannerError when no
        token can start at the current position.
        """

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == u'\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch in u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch in u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?  (Block scalars only occur in block context.)
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.reader.get_mark())
266 # Simple keys treatment.
268 def next_possible_simple_key(self):
269 # Return the number of the nearest possible simple key. Actually we
270 # don't need to loop through the whole dictionary. We may replace it
271 # with the following code:
272 # if not self.possible_simple_keys:
273 # return None
274 # return self.possible_simple_keys[
275 # min(self.possible_simple_keys.keys())].token_number
276 min_token_number = None
277 for level in self.possible_simple_keys:
278 key = self.possible_simple_keys[level]
279 if min_token_number is None or key.token_number < min_token_number:
280 min_token_number = key.token_number
281 return min_token_number
283 def stale_possible_simple_keys(self):
284 # Remove entries that are no longer possible simple keys. According to
285 # the YAML specification, simple keys
286 # - should be limited to a single line,
287 # - should be no longer than 1024 characters.
288 # Disabling this procedure will allow simple keys of any length and
289 # height (may cause problems if indentation is broken though).
290 for level in self.possible_simple_keys.keys():
291 key = self.possible_simple_keys[level]
292 if key.line != self.reader.line \
293 or self.reader.index-key.index > 1024:
294 if key.required:
295 raise ScannerError("while scanning a simple key", key.mark,
296 "could not found expected ':'", self.reader.get_mark())
297 del self.possible_simple_keys[level]
299 def save_possible_simple_key(self):
300 # The next token may start a simple key. We check if it's possible
301 # and save its position. This function is called for
302 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
304 # Check if a simple key is required at the current position.
305 required = not self.flow_level and self.indent == self.reader.column
307 # A simple key is required only if it is the first token in the current
308 # line. Therefore it is always allowed.
309 assert self.allow_simple_key or not required
311 # The next token might be a simple key. Let's save it's number and
312 # position.
313 if self.allow_simple_key:
314 self.remove_possible_simple_key()
315 token_number = self.tokens_taken+len(self.tokens)
316 index = self.reader.index
317 line = self.reader.line
318 column = self.reader.column
319 mark = self.reader.get_mark()
320 key = SimpleKey(token_number, required,
321 index, line, column, mark)
322 self.possible_simple_keys[self.flow_level] = key
    def remove_possible_simple_key(self):
        """Invalidate the saved possible simple key at the current flow level.

        NOTE(review): despite the name, this revision does not delete the
        entry from `possible_simple_keys`; it only asserts that dropping
        it would be legal.  The entry itself is removed by
        `stale_possible_simple_keys` or consumed by `fetch_value` --
        confirm before relying on actual removal here.
        """
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]

            # I don't think it's possible, but I could be wrong.
            assert not key.required
            #if key.required:
            #    raise ScannerError("while scanning a simple key", key.mark,
            #            "could not found expected ':'", self.reader.get_mark())
335 # Indentation functions.
337 def unwind_indent(self, column):
339 ## In flow context, tokens should respect indentation.
340 ## Actually the condition should be `self.indent >= column` according to
341 ## the spec. But this condition will prohibit intuitively correct
342 ## constructions such as
343 ## key : {
344 ## }
345 #if self.flow_level and self.indent > column:
346 # raise ScannerError(None, None,
347 # "invalid intendation or unclosed '[' or '{'",
348 # self.reader.get_mark())
350 # In the flow context, indentation is ignored. We make the scanner less
351 # restrictive then specification requires.
352 if self.flow_level:
353 return
355 # In block context, we may need to issue the BLOCK-END tokens.
356 while self.indent > column:
357 mark = self.reader.get_mark()
358 self.indent = self.indents.pop()
359 self.tokens.append(BlockEndToken(mark, mark))
361 def add_indent(self, column):
362 # Check if we need to increase indentation.
363 if self.indent < column:
364 self.indents.append(self.indent)
365 self.indent = column
366 return True
367 return False
369 # Fetchers.
    def fetch_stream_end(self):
        """Close all open blocks, emit STREAM-END and finish scanning."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset everything (not really needed).
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.reader.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The reader is ended.
        self.done = True
    def fetch_directive(self):
        """Scan a '%' directive line and emit a DIRECTIVE token."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
    def fetch_document_start(self):
        """Emit DOCUMENT-START for a '---' line."""
        self.fetch_document_indicator(DocumentStartToken)
    def fetch_document_end(self):
        """Emit DOCUMENT-END for a '...' line."""
        self.fetch_document_indicator(DocumentEndToken)
    def fetch_document_indicator(self, TokenClass):
        """Emit a document boundary token of type `TokenClass`, consuming
        the three indicator characters ('---' or '...')."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.reader.get_mark()
        self.reader.forward(3)
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_start(self):
        """Emit FLOW-SEQUENCE-START for '['."""
        self.fetch_flow_collection_start(FlowSequenceStartToken)
    def fetch_flow_mapping_start(self):
        """Emit FLOW-MAPPING-START for '{'."""
        self.fetch_flow_collection_start(FlowMappingStartToken)
    def fetch_flow_collection_start(self, TokenClass):
        """Open a flow collection and emit a `TokenClass` token."""

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_end(self):
        """Emit FLOW-SEQUENCE-END for ']'."""
        self.fetch_flow_collection_end(FlowSequenceEndToken)
    def fetch_flow_mapping_end(self):
        """Emit FLOW-MAPPING-END for '}'."""
        self.fetch_flow_collection_end(FlowMappingEndToken)
    def fetch_flow_collection_end(self, TokenClass):
        """Close a flow collection and emit a `TokenClass` token."""

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_entry(self):
        """Emit FLOW-ENTRY for ','."""

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
    def fetch_block_entry(self):
        """Emit BLOCK-ENTRY for '-', opening a block sequence if needed."""

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.reader.get_mark())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
    def fetch_key(self):
        """Emit KEY for '?', opening a block mapping if needed."""

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.reader.get_mark())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
    def fetch_value(self):
        """Handle a ':' indicator and emit a VALUE token.

        If a pending simple key matches, a KEY token (and possibly a
        BLOCK-MAPPING-START token) is inserted retroactively at the key's
        recorded position.
        """

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.reader.get_mark())

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
    def fetch_alias(self):
        """Scan a '*alias' and emit an ALIAS token."""

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))
    def fetch_anchor(self):
        """Scan an '&anchor' and emit an ANCHOR token."""

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
    def fetch_tag(self):
        """Scan a '!tag' and emit a TAG token."""

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
    def fetch_literal(self):
        """Emit a SCALAR token for a '|' literal block scalar."""
        self.fetch_block_scalar(folded=False)
    def fetch_folded(self):
        """Emit a SCALAR token for a '>' folded block scalar."""
        self.fetch_block_scalar(folded=True)
    def fetch_block_scalar(self, folded):
        """Scan a block scalar ('|' or '>') and emit a SCALAR token."""

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(folded))
    def fetch_single(self):
        """Emit a SCALAR token for a single-quoted flow scalar."""
        self.fetch_flow_scalar(double=False)
    def fetch_double(self):
        """Emit a SCALAR token for a double-quoted flow scalar."""
        self.fetch_flow_scalar(double=True)
    def fetch_flow_scalar(self, double):
        """Scan a quoted flow scalar and emit a SCALAR token."""

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(double))
    def fetch_plain(self):
        """Scan a plain (unquoted) scalar and emit a SCALAR token."""

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
672 # Checkers.
674 def check_directive(self):
676 # DIRECTIVE: ^ '%' ...
677 # The '%' indicator is already checked.
678 if self.reader.column == 0:
679 return True
681 def check_document_start(self):
683 # DOCUMENT-START: ^ '---' (' '|'\n')
684 if self.reader.column == 0:
685 if self.reader.prefix(3) == u'---' \
686 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
687 return True
689 def check_document_end(self):
691 # DOCUMENT-END: ^ '...' (' '|'\n')
692 if self.reader.column == 0:
693 prefix = self.reader.peek(4)
694 if self.reader.prefix(3) == u'...' \
695 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
696 return True
698 def check_block_entry(self):
700 # BLOCK-ENTRY: '-' (' '|'\n')
701 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
703 def check_key(self):
705 # KEY(flow context): '?'
706 if self.flow_level:
707 return True
709 # KEY(block context): '?' (' '|'\n')
710 else:
711 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
713 def check_value(self):
715 # VALUE(flow context): ':'
716 if self.flow_level:
717 return True
719 # VALUE(block context): ':' (' '|'\n')
720 else:
721 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
    def check_plain(self):
        """Return True if the current character may begin a plain scalar."""

        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.reader.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == '-' or (not self.flow_level and ch in u'?:')))
742 # Scanners.
    def scan_to_next_token(self):
        """Skip spaces, line breaks and comments up to the next token.

        If a line break is crossed in the block context, the
        `allow_simple_key` flag is switched on.  A byte order mark is
        stripped only when it is the first character of the stream; a BOM
        inside the stream is not yet supported as the specification
        requires and will be treated as document content.
        """
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
            self.reader.forward()
        found = False
        while not found:
            # Skip indentation and separation spaces.
            while self.reader.peek() == u' ':
                self.reader.forward()
            # Skip a comment up to the end of the line.
            if self.reader.peek() == u'#':
                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.reader.forward()
            # A line break in the block context re-enables simple keys.
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
    def scan_directive(self):
        """Scan a directive line and return a DirectiveToken.

        Recognizes %YAML and %TAG; any other directive is skipped with a
        None value.  See the specification for details.
        """
        start_mark = self.reader.get_mark()
        self.reader.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        else:
            # Unknown directive: skip the rest of the line.
            end_mark = self.reader.get_mark()
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
    def scan_directive_name(self, start_mark):
        """Scan the directive name after '%' and return it.

        The name is a non-empty run of ASCII alphanumerics, '-' and '_',
        and must be followed by a space or a line break.  Raises
        ScannerError otherwise.  See the specification for details.
        """
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_mark())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_mark())
        return value
    def scan_yaml_directive_value(self, start_mark):
        """Scan the '<major>.<minor>' value of a %YAML directive and
        return it as a (major, minor) tuple of ints.

        See the specification for details.
        """
        while self.reader.peek() == u' ':
            self.reader.forward()
        major = self.scan_yaml_directive_number(start_mark)
        if self.reader.peek() != '.':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or '.', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_mark())
        self.reader.forward()
        minor = self.scan_yaml_directive_number(start_mark)
        if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or ' ', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_mark())
        return (major, minor)
    def scan_yaml_directive_number(self, start_mark):
        """Scan one integer component of a %YAML version number.

        Raises ScannerError when the next character is not a digit.
        See the specification for details.
        """
        ch = self.reader.peek()
        if not (u'0' <= ch <= '9'):
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit, but found %r" % ch.encode('utf-8'),
                    self.reader.get_mark())
        length = 0
        while u'0' <= self.reader.peek(length) <= u'9':
            length += 1
        value = int(self.reader.prefix(length))
        self.reader.forward(length)
        return value
    def scan_tag_directive_value(self, start_mark):
        """Scan the '<handle> <prefix>' value of a %TAG directive and
        return it as a (handle, prefix) tuple.

        See the specification for details.
        """
        while self.reader.peek() == u' ':
            self.reader.forward()
        handle = self.scan_tag_directive_handle(start_mark)
        while self.reader.peek() == u' ':
            self.reader.forward()
        prefix = self.scan_tag_directive_prefix(start_mark)
        return (handle, prefix)
    def scan_tag_directive_handle(self, start_mark):
        """Scan the handle part of a %TAG directive.

        The handle must be followed by a space.  See the specification
        for details.
        """
        value = self.scan_tag_handle('directive', start_mark)
        ch = self.reader.peek()
        if ch != u' ':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_mark())
        return value
    def scan_tag_directive_prefix(self, start_mark):
        """Scan the prefix part of a %TAG directive.

        The prefix must be followed by a space or a line break.  See the
        specification for details.
        """
        value = self.scan_tag_uri('directive', start_mark)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_mark())
        return value
    def scan_directive_ignored_line(self, start_mark):
        """Consume trailing spaces, an optional comment and the line break
        after a directive; anything else is an error.

        See the specification for details.
        """
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a comment or a line break, but found %r"
                    % ch.encode('utf-8'), self.reader.get_mark())
        self.scan_line_break()
    def scan_anchor(self, TokenClass):
        """Scan an '&anchor' or '*alias' name and return a `TokenClass`
        token (AnchorToken or AliasToken).

        The specification does not restrict characters for anchors and
        aliases. This may lead to problems, for instance, the document:
            [ *alias, value ]
        can be interpreted in two ways, as
            [ "value" ]
        and
            [ *alias , "value" ]
        Therefore we restrict aliases to numbers and ASCII letters.
        """
        start_mark = self.reader.get_mark()
        indicator = self.reader.peek()
        # Pick the right word for error messages.
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_mark())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_mark())
        end_mark = self.reader.get_mark()
        return TokenClass(value, start_mark, end_mark)
    def scan_tag(self):
        """Scan a tag ('!', '!suffix', '!handle!suffix' or '!<uri>') and
        return a TagToken whose value is a (handle, suffix) pair.

        See the specification for details.
        """
        start_mark = self.reader.get_mark()
        ch = self.reader.peek(1)
        if ch == u'<':
            # Verbatim tag: '!<uri>'.
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_mark)
            if self.reader.peek() != u'>':
                raise ScannerError("while parsing a tag", start_mark,
                        "expected '>', but found %r" % self.reader.peek().encode('utf-8'),
                        self.reader.get_mark())
            self.reader.forward()
        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
            # The non-specific tag '!'.
            handle = None
            suffix = u'!'
            self.reader.forward()
        else:
            # Look ahead for a second '!' to decide between '!handle!suffix'
            # and '!suffix'.
            length = 1
            use_handle = False
            while ch not in u'\0 \r\n\x85\u2028\u2029':
                if ch == u'!':
                    use_handle = True
                    break
                length += 1
                ch = self.reader.peek(length)
            # NOTE(review): this assignment is immediately overwritten by
            # both branches below and appears redundant -- confirm before
            # removing.
            handle = u'!'
            if use_handle:
                handle = self.scan_tag_handle('tag', start_mark)
            else:
                handle = u'!'
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_mark)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_mark,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_mark())
        value = (handle, suffix)
        end_mark = self.reader.get_mark()
        return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, folded):
        """Scan a block scalar ('|' literal when `folded` is False, '>'
        folded when True) and return its ScalarToken.

        See the specification for details.
        """
        chunks = []
        start_mark = self.reader.get_mark()

        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: detect it from the content.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and self.reader.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.reader.peek() not in u' \t'
            length = 0
            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.reader.column == indent and self.reader.peek() != u'\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n'   \
                        and leading_non_space and self.reader.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.reader.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark)
1043 def scan_block_scalar_indicators(self, start_mark):
1044 # See the specification for details.
1045 chomping = None
1046 increment = None
1047 ch = self.reader.peek()
1048 if ch in u'+-':
1049 if ch == '+':
1050 chomping = True
1051 else:
1052 chomping = False
1053 self.reader.forward()
1054 ch = self.reader.peek()
1055 if ch in u'0123456789':
1056 increment = int(ch)
1057 if increment == 0:
1058 raise ScannerError("while scanning a block scalar", start_mark,
1059 "expected indentation indicator in the range 1-9, but found 0",
1060 self.reader.get_mark())
1061 self.reader.forward()
1062 elif ch in u'0123456789':
1063 increment = int(ch)
1064 if increment == 0:
1065 raise ScannerError("while scanning a block scalar", start_mark,
1066 "expected indentation indicator in the range 1-9, but found 0",
1067 self.reader.get_mark())
1068 self.reader.forward()
1069 ch = self.reader.peek()
1070 if ch in u'+-':
1071 if ch == '+':
1072 chomping = True
1073 else:
1074 chomping = False
1075 self.reader.forward()
1076 ch = self.reader.peek()
1077 if ch not in u'\0 \r\n\x85\u2028\u2029':
1078 raise ScannerError("while scanning a block scalar", start_mark,
1079 "expected chomping or indentation indicators, but found %r"
1080 % ch.encode('utf-8'), self.reader.get_mark())
1081 return chomping, increment
1083 def scan_block_scalar_ignored_line(self, start_mark):
1084 # See the specification for details.
1085 while self.reader.peek() == u' ':
1086 self.reader.forward()
1087 if self.reader.peek() == u'#':
1088 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
1089 self.reader.forward()
1090 ch = self.reader.peek()
1091 if ch not in u'\0\r\n\x85\u2028\u2029':
1092 raise ScannerError("while scanning a block scalar", start_mark,
1093 "expected a comment or a line break, but found %r"
1094 % ch.encode('utf-8'), self.reader.get_mark())
1095 self.scan_line_break()
1097 def scan_block_scalar_indentation(self):
1098 # See the specification for details.
1099 chunks = []
1100 max_indent = 0
1101 end_mark = self.reader.get_mark()
1102 while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1103 if self.reader.peek() != u' ':
1104 chunks.append(self.scan_line_break())
1105 end_mark = self.reader.get_mark()
1106 else:
1107 self.reader.forward()
1108 if self.reader.column > max_indent:
1109 max_indent = self.reader.column
1110 return chunks, max_indent, end_mark
1112 def scan_block_scalar_breaks(self, indent):
1113 # See the specification for details.
1114 chunks = []
1115 end_mark = self.reader.get_mark()
1116 while self.reader.column < indent and self.reader.peek() == u' ':
1117 self.reader.forward()
1118 while self.reader.peek() in u'\r\n\x85\u2028\u2029':
1119 chunks.append(self.scan_line_break())
1120 end_mark = self.reader.get_mark()
1121 while self.reader.column < indent and self.reader.peek() == u' ':
1122 self.reader.forward()
1123 return chunks, end_mark
1125 def scan_flow_scalar(self, double):
1126 # See the specification for details.
1127 # Note that we loose indentation rules for quoted scalars. Quoted
1128 # scalars don't need to adhere indentation because " and ' clearly
1129 # mark the beginning and the end of them. Therefore we are less
1130 # restrictive then the specification requires. We only need to check
1131 # that document separators are not included in scalars.
1132 chunks = []
1133 start_mark = self.reader.get_mark()
1134 quote = self.reader.peek()
1135 self.reader.forward()
1136 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1137 while self.reader.peek() != quote:
1138 chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
1139 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1140 self.reader.forward()
1141 end_mark = self.reader.get_mark()
1142 return ScalarToken(u''.join(chunks), False, start_mark, end_mark)
    # Map from the character that follows '\\' in a double-quoted scalar to
    # the character it denotes.  Multi-digit numeric escapes ('\x', '\u',
    # '\U') are handled separately via ESCAPE_CODES.
    ESCAPE_REPLACEMENTS = {
        u'0': u'\0',
        u'a': u'\x07',
        u'b': u'\x08',
        u't': u'\x09',
        u'\t': u'\x09',
        u'n': u'\x0A',
        u'v': u'\x0B',
        u'f': u'\x0C',
        u'r': u'\x0D',
        u'e': u'\x1B',
        u' ': u'\x20',
        u'\"': u'\"',
        u'\\': u'\\',
        u'N': u'\x85',      # next line (NEL)
        u'_': u'\xA0',      # non-breaking space
        u'L': u'\u2028',    # line separator
        u'P': u'\u2029',    # paragraph separator
    # Map from the numeric escape letter to the number of hexadecimal
    # digits that must follow it: '\xXX', '\uXXXX', '\UXXXXXXXX'.
    ESCAPE_CODES = {
        u'x': 2,
        u'u': 4,
        u'U': 8,
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # Scan a run of non-blank characters inside a quoted scalar and
        # decode the quote-style-specific escapes.  `double` is true for
        # double-quoted scalars.  Returns the list of decoded chunks;
        # stops (and returns) at a blank, a break, the closing quote or
        # the end of the stream.
        chunks = []
        while True:
            # Copy any stretch of ordinary characters verbatim.
            length = 0
            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
            ch = self.reader.peek()
            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
                # In single-quoted scalars, '' denotes a literal quote.
                chunks.append(u'\'')
                self.reader.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                # A quote or backslash with no special meaning in this
                # quoting style is taken literally.
                chunks.append(ch)
                self.reader.forward()
            elif double and ch == u'\\':
                self.reader.forward()
                ch = self.reader.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Single-character escape, e.g. '\n' or '\t'.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.reader.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                    length = self.ESCAPE_CODES[ch]
                    self.reader.forward()
                    for k in range(length):
                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_mark())
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.reader.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # An escaped line break is removed together with any
                    # following empty lines.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_mark())
            else:
                # A blank, break, closing quote or NUL: the caller decides
                # what to do next.
                return chunks
1213 def scan_flow_scalar_spaces(self, double, start_mark):
1214 # See the specification for details.
1215 chunks = []
1216 length = 0
1217 while self.reader.peek(length) in u' \t':
1218 length += 1
1219 whitespaces = self.reader.prefix(length)
1220 self.reader.forward(length)
1221 ch = self.reader.peek()
1222 if ch == u'\0':
1223 raise ScannerError("while scanning a quoted scalar", start_mark,
1224 "found unexpected end of stream", self.reader.get_mark())
1225 elif ch in u'\r\n\x85\u2028\u2029':
1226 line_break = self.scan_line_break()
1227 breaks = self.scan_flow_scalar_breaks(double, start_mark)
1228 if line_break != u'\n':
1229 chunks.append(line_break)
1230 elif not breaks:
1231 chunks.append(u' ')
1232 chunks.extend(breaks)
1233 else:
1234 chunks.append(whitespaces)
1235 return chunks
1237 def scan_flow_scalar_breaks(self, double, start_mark):
1238 # See the specification for details.
1239 chunks = []
1240 while True:
1241 # Instead of checking indentation, we check for document
1242 # separators.
1243 prefix = self.reader.prefix(3)
1244 if (prefix == u'---' or prefix == u'...') \
1245 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
1246 raise ScannerError("while scanning a quoted scalar", start_mark,
1247 "found unexpected document separator", self.reader.get_mark())
1248 while self.reader.peek() in u' \t':
1249 self.reader.forward()
1250 if self.reader.peek() in u'\r\n\x85\u2028\u2029':
1251 chunks.append(self.scan_line_break())
1252 else:
1253 return chunks
1255 def scan_plain(self):
1256 # See the specification for details.
1257 # We add an additional restriction for the flow context:
1258 # plain scalars in the flow context cannot contain ',', ':' and '?'.
1259 # We also keep track of the `allow_simple_key` flag here.
1260 # Indentation rules are loosed for the flow context.
1261 chunks = []
1262 start_mark = self.reader.get_mark()
1263 end_mark = start_mark
1264 indent = self.indent+1
1265 # We allow zero indentation for scalars, but then we need to check for
1266 # document separators at the beginning of the line.
1267 #if indent == 0:
1268 # indent = 1
1269 spaces = []
1270 while True:
1271 length = 0
1272 if self.reader.peek() == u'#':
1273 break
1274 while True:
1275 ch = self.reader.peek(length)
1276 if ch in u'\0 \t\r\n\x85\u2028\u2029' \
1277 or (not self.flow_level and ch == u':' and
1278 self.reader.peek(length+1) in u'\0 \t\r\n\x28\u2028\u2029') \
1279 or (self.flow_level and ch in u',:?[]{}'):
1280 break
1281 length += 1
1282 if length == 0:
1283 break
1284 self.allow_simple_key = False
1285 chunks.extend(spaces)
1286 chunks.append(self.reader.prefix(length))
1287 self.reader.forward(length)
1288 end_mark = self.reader.get_mark()
1289 spaces = self.scan_plain_spaces(indent, start_mark)
1290 if not spaces or self.reader.peek() == u'#' \
1291 or (not self.flow_level and self.reader.column < indent):
1292 break
1293 return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
1295 def scan_plain_spaces(self, indent, start_mark):
1296 # See the specification for details.
1297 # The specification is really confusing about tabs in plain scalars.
1298 # We just forbid them completely. Do not use tabs in YAML!
1299 chunks = []
1300 length = 0
1301 while self.reader.peek(length) in u' ':
1302 length += 1
1303 whitespaces = self.reader.prefix(length)
1304 self.reader.forward(length)
1305 ch = self.reader.peek()
1306 if ch in u'\r\n\x85\u2028\u2029':
1307 line_break = self.scan_line_break()
1308 self.allow_simple_key = True
1309 prefix = self.reader.prefix(3)
1310 if (prefix == u'---' or prefix == u'...') \
1311 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
1312 return
1313 breaks = []
1314 while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1315 if self.reader.peek() == ' ':
1316 self.reader.forward()
1317 else:
1318 breaks.append(self.scan_line_break())
1319 prefix = self.reader.prefix(3)
1320 if (prefix == u'---' or prefix == u'...') \
1321 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
1322 return
1323 if line_break != u'\n':
1324 chunks.append(line_break)
1325 elif not breaks:
1326 chunks.append(u' ')
1327 chunks.extend(breaks)
1328 elif whitespaces:
1329 chunks.append(whitespaces)
1330 return chunks
1332 def scan_tag_handle(self, name, start_mark):
1333 # See the specification for details.
1334 # For some strange reasons, the specification does not allow '_' in
1335 # tag handles. I have allowed it anyway.
1336 ch = self.reader.peek()
1337 if ch != u'!':
1338 raise ScannerError("while scanning a %s" % name, start_mark,
1339 "expected '!', but found %r" % ch.encode('utf-8'),
1340 self.reader.get_mark())
1341 length = 1
1342 ch = self.reader.peek(length)
1343 if ch != u' ':
1344 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1345 or ch in u'-_':
1346 length += 1
1347 ch = self.reader.peek(length)
1348 if ch != u'!':
1349 self.reader.forward(length)
1350 raise ScannerError("while scanning a %s" % name, start_mark,
1351 "expected '!', but found %r" % ch.encode('utf-8'),
1352 self.reader.get_mark())
1353 length += 1
1354 value = self.reader.prefix(length)
1355 self.reader.forward(length)
1356 return value
1358 def scan_tag_uri(self, name, start_mark):
1359 # See the specification for details.
1360 # Note: we do not check if URI is well-formed.
1361 chunks = []
1362 length = 0
1363 ch = self.reader.peek(length)
1364 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1365 or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
1366 if ch == u'%':
1367 chunks.append(self.reader.prefix(length))
1368 self.reader.forward(length)
1369 length = 0
1370 chunks.append(self.scan_uri_escapes(name, start_mark))
1371 else:
1372 length += 1
1373 ch = self.reader.peek(length)
1374 if length:
1375 chunks.append(self.reader.prefix(length))
1376 self.reader.forward(length)
1377 length = 0
1378 if not chunks:
1379 raise ScannerError("while parsing a %s" % name, start_mark,
1380 "expected URI, but found %r" % ch.encode('utf-8'),
1381 self.reader.get_mark())
1382 return u''.join(chunks)
    def scan_uri_escapes(self, name, start_mark):
        # Decode a run of '%XX' escape sequences in a tag URI.  The
        # percent-encoded bytes are interpreted as UTF-8 and returned as a
        # unicode string.  Raises ScannerError on a malformed escape or on
        # bytes that are not valid UTF-8.
        bytes = []
        mark = self.reader.get_mark()
        while self.reader.peek() == u'%':
            self.reader.forward()
            # Each escape must consist of exactly two hexadecimal digits.
            for k in range(2):
                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_mark,
                            "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                                (self.reader.peek(k).encode('utf-8')), self.reader.get_mark())
            bytes.append(chr(int(self.reader.prefix(2), 16)))
            self.reader.forward(2)
        try:
            # The collected bytes must form valid UTF-8.
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
        return value
1403 def scan_line_break(self):
1404 # Transforms:
1405 # '\r\n' : '\n'
1406 # '\r' : '\n'
1407 # '\n' : '\n'
1408 # '\x85' : '\n'
1409 # '\u2028' : '\u2028'
1410 # '\u2029 : '\u2029'
1411 # default : ''
1412 ch = self.reader.peek()
1413 if ch in u'\r\n\x85':
1414 if self.reader.prefix(2) == u'\r\n':
1415 self.reader.forward(2)
1416 else:
1417 self.reader.forward()
1418 return u'\n'
1419 elif ch in u'\u2028\u2029':
1420 self.reader.forward()
1421 return ch
1422 return u''
1424 #try:
1425 # import psyco
1426 # psyco.bind(Scanner)
1427 #except ImportError:
1428 # pass