2 # Scanner produces tokens of the following types:
3 # DIRECTIVE(name, value)
21 # SCALAR(value, plain)
23 # Read comments in the Scanner code for more details.
# Public API of this module.
__all__ = ['Scanner', 'ScannerError']

from error import MarkedYAMLError
class ScannerError(MarkedYAMLError):
    """Raised when the scanner meets input it cannot tokenize."""
35 # See below simple keys treatment.
37 def __init__(self
, token_number
, required
, index
, line
, column
, mark
):
38 self
.token_number
= token_number
39 self
.required
= required
48 def __init__(self
, reader
):
49 """Initialize the scanner."""
50 # The input stream. The Reader class do the dirty work of checking for
51 # BOM and converting the input data to Unicode. It also adds NUL to
54 # Reader supports the following methods
55 # self.reader.peek(i=0) # peek the next i-th character
56 # self.reader.prefix(l=1) # peek the next l characters
57 # self.reader.forward(l=1) # read the next l characters
58 # and move the pointer
61 # Had we reached the end of the stream?
64 # The number of unclosed '{' and '['. `flow_level == 0` means block
68 # List of processed tokens that are not yet emitted.
71 # Number of tokens that were emitted through the `get_token` method.
74 # The current indentation level.
77 # Past indentation levels.
80 # Variables related to simple keys treatment.
82 # A simple key is a key that is not denoted by the '?' indicator.
83 # Example of simple keys:
85 # block simple key: value
87 # : { flow simple key: value }
88 # We emit the KEY token before all keys, so when we find a potential
89 # simple key, we try to locate the corresponding ':' indicator.
90 # Simple keys should be limited to a single line and 1024 characters.
92 # Can a simple key start at the current position? A simple key may
94 # - at the beginning of the line, not counting indentation spaces
96 # - after '{', '[', ',' (in the flow context),
97 # - after '?', ':', '-' (in the block context).
98 # In the block context, this flag also signifies if a block collection
99 # may start at the current position.
100 self
.allow_simple_key
= True
102 # Keep track of possible simple keys. This is a dictionary. The key
103 # is `flow_level`; there can be no more that one possible simple key
104 # for each level. The value is a SimpleKey record:
105 # (token_number, required, index, line, column, mark)
106 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
107 # '[', or '{' tokens.
108 self
.possible_simple_keys
= {}
def check(self, *choices):
    """Return True if the next token is an instance of one of `choices`."""
    while self.need_more_tokens():
        self.fetch_more_tokens()
    if self.tokens:
        for choice in choices:
            if isinstance(self.tokens[0], choice):
                return True
    return False
123 # Return the next token, but do not delete if from the queue.
124 while self
.need_more_tokens():
125 self
.fetch_more_tokens()
127 return self
.tokens
[0]
130 # Return the next token.
131 while self
.need_more_tokens():
132 self
.fetch_more_tokens()
134 self
.tokens_taken
+= 1
135 return self
.tokens
.pop(0)
139 while self
.need_more_tokens():
140 self
.fetch_more_tokens()
142 self
.tokens_taken
+= 1
143 yield self
.tokens
.pop(0)
144 while self
.need_more_tokens():
145 self
.fetch_more_tokens()
def need_more_tokens(self):
    """Return True when another token must be fetched before peeking."""
    if self.done:
        return False
    if not self.tokens:
        return True
    # The current token may be a potential simple key, so we
    # need to look further.
    self.stale_possible_simple_keys()
    if self.next_possible_simple_key() == self.tokens_taken:
        return True
    return False
def fetch_more_tokens(self):
    """Dispatch on the next character and append the matching token(s).

    Each condition below is paired with the fetch_* handler its surviving
    comment names; the guard characters follow the YAML indicator set.
    """

    # Eat whitespaces and comments until we reach the next token.
    self.scan_to_next_token()

    # Remove obsolete possible simple keys.
    self.stale_possible_simple_keys()

    # Compare the current indentation and column. It may add some tokens
    # and decrease the current indentation level.
    self.unwind_indent(self.reader.column)

    # Peek the next character.
    ch = self.reader.peek()

    # Is it the end of stream?
    if ch == u'\0':
        return self.fetch_stream_end()

    # Is it a directive?
    if ch == u'%' and self.check_directive():
        return self.fetch_directive()

    # Is it the document start?
    if ch == u'-' and self.check_document_start():
        return self.fetch_document_start()

    # Is it the document end?
    if ch == u'.' and self.check_document_end():
        return self.fetch_document_end()

    # TODO: support for BOM within a stream.
    #if ch == u'\uFEFF':
    #    return self.fetch_bom() <-- issue BOMToken

    # Note: the order of the following checks is NOT significant.

    # Is it the flow sequence start indicator?
    if ch == u'[':
        return self.fetch_flow_sequence_start()

    # Is it the flow mapping start indicator?
    if ch == u'{':
        return self.fetch_flow_mapping_start()

    # Is it the flow sequence end indicator?
    if ch == u']':
        return self.fetch_flow_sequence_end()

    # Is it the flow mapping end indicator?
    if ch == u'}':
        return self.fetch_flow_mapping_end()

    # Is it the flow entry indicator?
    if ch == u',':
        return self.fetch_flow_entry()

    # Is it the block entry indicator?
    if ch in u'-' and self.check_block_entry():
        return self.fetch_block_entry()

    # Is it the key indicator?
    if ch == u'?' and self.check_key():
        return self.fetch_key()

    # Is it the value indicator?
    if ch == u':' and self.check_value():
        return self.fetch_value()

    # Is it an alias?
    if ch == u'*':
        return self.fetch_alias()

    # Is it an anchor?
    if ch == u'&':
        return self.fetch_anchor()

    # Is it a tag?
    if ch == u'!':
        return self.fetch_tag()

    # Is it a literal scalar?
    if ch == u'|' and not self.flow_level:
        return self.fetch_literal()

    # Is it a folded scalar?
    if ch == u'>' and not self.flow_level:
        return self.fetch_folded()

    # Is it a single quoted scalar?
    if ch == u'\'':
        return self.fetch_single()

    # Is it a double quoted scalar?
    if ch == u'\"':
        return self.fetch_double()

    # It must be a plain scalar then.
    if self.check_plain():
        return self.fetch_plain()

    # No? It's an error. Let's produce a nice error message.
    raise ScannerError("while scanning for the next token", None,
            "found character %r that cannot start any token"
            % ch.encode('utf-8'), self.reader.get_mark())
266 # Simple keys treatment.
def next_possible_simple_key(self):
    """Return the token number of the nearest possible simple key, or None.

    There is at most one candidate per flow level, so scanning the whole
    dictionary is cheap.
    """
    if not self.possible_simple_keys:
        return None
    return min(candidate.token_number
            for candidate in self.possible_simple_keys.values())
def stale_possible_simple_keys(self):
    """Remove entries that are no longer possible simple keys.

    According to the YAML specification, simple keys
    - should be limited to a single line,
    - should be no longer than 1024 characters.
    A *required* candidate that goes stale is a hard error.
    """
    # Iterate over a snapshot: entries are deleted during the walk
    # (mutating the dict while iterating its live view would raise on
    # Python 3).
    for level in list(self.possible_simple_keys):
        key = self.possible_simple_keys[level]
        if key.line != self.reader.line  \
                or self.reader.index-key.index > 1024:
            if key.required:
                raise ScannerError("while scanning a simple key", key.mark,
                        "could not found expected ':'", self.reader.get_mark())
            del self.possible_simple_keys[level]
def save_possible_simple_key(self):
    """Record the current position as a potential simple-key start.

    Called before tokens that may open a simple key:
    ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
    """
    # A simple key is required at the current position only when it is
    # the first token of a block line (indent == column).
    required = not self.flow_level and self.indent == self.reader.column

    # A simple key is required only if it is the first token in the current
    # line.  Therefore it is always allowed.
    assert self.allow_simple_key or not required

    if self.allow_simple_key:
        self.remove_possible_simple_key()
        reader = self.reader
        self.possible_simple_keys[self.flow_level] = SimpleKey(
                self.tokens_taken+len(self.tokens), required,
                reader.index, reader.line, reader.column, reader.get_mark())
def remove_possible_simple_key(self):
    """Discard the saved simple-key candidate at the current flow level."""
    if self.flow_level in self.possible_simple_keys:
        key = self.possible_simple_keys[self.flow_level]

        # I don't think it's possible, but I could be wrong.
        assert not key.required
        #if key.required:
        #    raise ScannerError("while scanning a simple key", key.mark,
        #            "could not found expected ':'", self.reader.get_mark())

        # Actually drop the entry — this is the whole point of the method.
        del self.possible_simple_keys[self.flow_level]
335 # Indentation functions.
337 def unwind_indent(self
, column
):
339 ## In flow context, tokens should respect indentation.
340 ## Actually the condition should be `self.indent >= column` according to
341 ## the spec. But this condition will prohibit intuitively correct
342 ## constructions such as
345 #if self.flow_level and self.indent > column:
346 # raise ScannerError(None, None,
347 # "invalid intendation or unclosed '[' or '{'",
348 # self.reader.get_mark())
350 # In the flow context, indentation is ignored. We make the scanner less
351 # restrictive then specification requires.
355 # In block context, we may need to issue the BLOCK-END tokens.
356 while self
.indent
> column
:
357 mark
= self
.reader
.get_mark()
358 self
.indent
= self
.indents
.pop()
359 self
.tokens
.append(BlockEndToken(mark
, mark
))
def add_indent(self, column):
    """Increase indentation to `column` if needed; True if it increased."""
    if self.indent < column:
        self.indents.append(self.indent)
        self.indent = column
        return True
    return False
def fetch_stream_end(self):
    """Emit STREAM-END and mark the scanner as done."""

    # Set the current intendation to -1.
    self.unwind_indent(-1)

    # Reset everything (not really needed).
    self.allow_simple_key = False
    self.possible_simple_keys = {}

    # Add STREAM-END.
    mark = self.reader.get_mark()
    self.tokens.append(StreamEndToken(mark, mark))

    # The reader is ended.  need_more_tokens() checks this flag, so it
    # must be set or the scanner would keep trying to fetch tokens.
    self.done = True
def fetch_directive(self):
    """Handle a '%' directive line."""
    # A directive resets indentation and cancels any simple key;
    # nothing simple may follow it.
    self.unwind_indent(-1)
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add DIRECTIVE.
    self.tokens.append(self.scan_directive())
def fetch_document_start(self):
    """Handle the '---' document-start marker."""
    self.fetch_document_indicator(DocumentStartToken)
def fetch_document_end(self):
    """Handle the '...' document-end marker."""
    self.fetch_document_indicator(DocumentEndToken)
def fetch_document_indicator(self, TokenClass):
    """Emit a 3-character document indicator token of type `TokenClass`."""
    # Document indicators reset indentation and cancel any simple key;
    # no block collection may start at this point.
    self.unwind_indent(-1)
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Consume the three marker characters and record marks around them.
    start_mark = self.reader.get_mark()
    self.reader.forward(3)
    self.tokens.append(TokenClass(start_mark, self.reader.get_mark()))
def fetch_flow_sequence_start(self):
    """Handle '[' — flow sequence start."""
    self.fetch_flow_collection_start(FlowSequenceStartToken)
def fetch_flow_mapping_start(self):
    """Handle '{' — flow mapping start."""
    self.fetch_flow_collection_start(FlowMappingStartToken)
def fetch_flow_collection_start(self, TokenClass):
    """Common handler for '[' and '{'."""

    # '[' and '{' may start a simple key.
    self.save_possible_simple_key()

    # Increase the flow level.
    self.flow_level = self.flow_level+1

    # Simple keys are allowed after '[' and '{'.
    self.allow_simple_key = True

    # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(TokenClass(start_mark, end_mark))
def fetch_flow_sequence_end(self):
    """Handle ']' — flow sequence end."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)
def fetch_flow_mapping_end(self):
    """Handle '}' — flow mapping end."""
    self.fetch_flow_collection_end(FlowMappingEndToken)
def fetch_flow_collection_end(self, TokenClass):
    """Common handler for ']' and '}'."""

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Decrease the flow level.
    self.flow_level = self.flow_level-1

    # No simple keys after ']' or '}'.
    self.allow_simple_key = False

    # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(TokenClass(start_mark, end_mark))
def fetch_flow_entry(self):
    """Handle ',' inside a flow collection."""
    # ',' re-enables simple keys and cancels the pending candidate.
    self.allow_simple_key = True
    self.remove_possible_simple_key()

    # Add FLOW-ENTRY.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    self.tokens.append(FlowEntryToken(start_mark, self.reader.get_mark()))
def fetch_block_entry(self):
    """Handle '-' starting a block sequence entry."""

    # Block context needs additional checks.
    if not self.flow_level:

        # Are we allowed to start a new entry?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "sequence entries are not allowed here",
                    self.reader.get_mark())

        # We may need to add BLOCK-SEQUENCE-START.
        if self.add_indent(self.reader.column):
            mark = self.reader.get_mark()
            self.tokens.append(BlockSequenceStartToken(mark, mark))

    # It's an error for the block entry to occur in the flow context,
    # but we let the parser detect this.

    # Simple keys are allowed after '-'; cancel the pending candidate.
    self.allow_simple_key = True
    self.remove_possible_simple_key()

    # Add BLOCK-ENTRY.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    self.tokens.append(BlockEntryToken(start_mark, self.reader.get_mark()))
def fetch_key(self):
    """Handle the '?' key indicator."""

    # Block context needs additional checks.
    if not self.flow_level:

        # Are we allowed to start a key (not nessesary a simple)?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "mapping keys are not allowed here",
                    self.reader.get_mark())

        # We may need to add BLOCK-MAPPING-START.
        if self.add_indent(self.reader.column):
            mark = self.reader.get_mark()
            self.tokens.append(BlockMappingStartToken(mark, mark))

    # Simple keys are allowed after '?' in the block context.
    self.allow_simple_key = not self.flow_level

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add KEY.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(KeyToken(start_mark, end_mark))
def fetch_value(self):
    """Handle the ':' value indicator."""

    # Do we determine a simple key?
    if self.flow_level in self.possible_simple_keys:

        # Add KEY.
        key = self.possible_simple_keys[self.flow_level]
        del self.possible_simple_keys[self.flow_level]
        self.tokens.insert(key.token_number-self.tokens_taken,
                KeyToken(key.mark, key.mark))

        # If this key starts a new block mapping, we need to add
        # BLOCK-MAPPING-START.
        if not self.flow_level:
            if self.add_indent(key.column):
                self.tokens.insert(key.token_number-self.tokens_taken,
                        BlockMappingStartToken(key.mark, key.mark))

        # There cannot be two simple keys one after another.
        self.allow_simple_key = False

    # It must be a part of a complex key.
    else:

        # Block context needs additional checks.
        # (Do we really need them? They will be catched by the parser
        # anyway.)
        if not self.flow_level:

            # We are allowed to start a complex value if and only if
            # we can start a simple key.
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping values are not allowed here",
                        self.reader.get_mark())

        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

    # Add VALUE.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(ValueToken(start_mark, end_mark))
def fetch_alias(self):
    """Handle '*' — an alias node."""
    # ALIAS could be a simple key; nothing simple may follow it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add ALIAS.
    self.tokens.append(self.scan_anchor(AliasToken))
def fetch_anchor(self):
    """Handle '&' — an anchor."""
    # ANCHOR could start a simple key; nothing simple may follow it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add ANCHOR.
    self.tokens.append(self.scan_anchor(AnchorToken))
def fetch_tag(self):
    """Handle '!' — a tag."""
    # TAG could start a simple key; no simple key may follow it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add TAG.
    self.tokens.append(self.scan_tag())
def fetch_literal(self):
    """Handle '|' — a literal block scalar."""
    self.fetch_block_scalar(folded=False)
def fetch_folded(self):
    """Handle '>' — a folded block scalar."""
    self.fetch_block_scalar(folded=True)
def fetch_block_scalar(self, folded):
    """Common handler for '|' and '>' block scalars."""
    # A simple key may follow a block scalar; cancel the pending one.
    self.allow_simple_key = True
    self.remove_possible_simple_key()

    # Scan and add SCALAR.
    self.tokens.append(self.scan_block_scalar(folded))
def fetch_single(self):
    """Handle a single-quoted scalar."""
    self.fetch_flow_scalar(double=False)
def fetch_double(self):
    """Handle a double-quoted scalar."""
    self.fetch_flow_scalar(double=True)
def fetch_flow_scalar(self, double):
    """Common handler for single- and double-quoted scalars."""
    # A flow scalar could be a simple key; none may follow it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add SCALAR.
    self.tokens.append(self.scan_flow_scalar(double))
def fetch_plain(self):
    """Handle a plain (unquoted) scalar."""
    # A plain scalar could be a simple key.
    self.save_possible_simple_key()

    # No simple keys after plain scalars — though `scan_plain` flips the
    # flag back on when the scan ends at the start of a fresh line.
    self.allow_simple_key = False

    # Scan and add SCALAR. May change `allow_simple_key`.
    self.tokens.append(self.scan_plain())
def check_directive(self):
    """Return True if the already-seen '%' starts a directive."""
    # DIRECTIVE: ^ '%' ...
    # The '%' indicator is already checked; a directive must begin in
    # column zero.
    if self.reader.column == 0:
        return True
    return False
def check_document_start(self):
    """Return True if '---' at column 0 (followed by space/EOL) is here."""
    # DOCUMENT-START: ^ '---' (' '|'\n')
    if self.reader.column == 0:
        if self.reader.prefix(3) == u'---'  \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            return True
    return False
def check_document_end(self):
    """Return True if '...' at column 0 (followed by space/EOL) is here."""
    # DOCUMENT-END: ^ '...' (' '|'\n')
    # (The original fragment also bound `prefix = self.reader.peek(4)`;
    # the value was never used, so the dead local is dropped.)
    if self.reader.column == 0:
        if self.reader.prefix(3) == u'...'  \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            return True
    return False
def check_block_entry(self):
    """Return True if the '-' here is a block entry indicator."""
    # BLOCK-ENTRY: '-' (' '|'\n')
    return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_key(self):
    """Return True if the '?' here starts a key."""
    # KEY(flow context):    '?'
    if self.flow_level:
        return True
    # KEY(block context):   '?' (' '|'\n')
    return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_value(self):
    """Return True if the ':' here starts a value."""
    # VALUE(flow context):  ':'
    if self.flow_level:
        return True
    # VALUE(block context): ':' (' '|'\n')
    return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_plain(self):
    """Return True if the current character may start a plain scalar."""
    # A plain scalar may start with any non-space character except the
    # indicators '-?:,[]{}#&*!|>\'\"%@`'.  It may also start with '-',
    # '?' or ':' when followed by a non-space character — '-' in any
    # context, '?' and ':' only in the block context, so the flow context
    # stays space-independent.
    ch = self.reader.peek()
    if ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`':
        return True
    followed_by_non_space =  \
            self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
    return followed_by_non_space  \
            and (ch == '-' or (not self.flow_level and ch in u'?:'))
def scan_to_next_token(self):
    """Skip spaces, comments and line breaks before the next token.

    A line break in block context re-enables simple keys.  A byte order
    mark is stripped only as the very first character of the stream; BOM
    inside the stream is not yet supported and would be treated as part
    of the document (TODO per the spec).
    """
    # TODO: tab handling rules should be made saner: tabs must not
    # precede BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
    # KEY(block), VALUE(block) or BLOCK-ENTRY; this also needs a
    # `allow_simple_keys` check in `unwind_indent` and changes to the
    # block/flow/plain scalar scanners.
    if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
        self.reader.forward()
    found = False
    while not found:
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        if self.scan_line_break():
            if not self.flow_level:
                self.allow_simple_key = True
        else:
            found = True
def scan_directive(self):
    """Scan a DIRECTIVE token: '%YAML ...', '%TAG ...' or unknown."""
    # See the specification for details.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    name = self.scan_directive_name(start_mark)
    value = None
    if name == u'YAML':
        value = self.scan_yaml_directive_value(start_mark)
        end_mark = self.reader.get_mark()
    elif name == u'TAG':
        value = self.scan_tag_directive_value(start_mark)
        end_mark = self.reader.get_mark()
    else:
        # Unknown directive: its parameters are ignored.
        end_mark = self.reader.get_mark()
        while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
            self.reader.forward()
    self.scan_directive_ignored_line(start_mark)
    return DirectiveToken(name, value, start_mark, end_mark)
def scan_directive_name(self, start_mark):
    """Scan a non-empty directive name (alphanumerics plus '-' and '_')."""
    # See the specification for details.
    length = 0
    ch = self.reader.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
            or ch in u'-_':
        length += 1
        ch = self.reader.peek(length)
    if not length:
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    value = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    return value
def scan_yaml_directive_value(self, start_mark):
    """Scan 'major.minor' after '%YAML'; return the pair of ints."""
    # See the specification for details.
    reader = self.reader
    while reader.peek() == u' ':
        reader.forward()
    major = self.scan_yaml_directive_number(start_mark)
    if reader.peek() != '.':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or '.', but found %r"
                % reader.peek().encode('utf-8'),
                reader.get_mark())
    reader.forward()
    minor = self.scan_yaml_directive_number(start_mark)
    if reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or ' ', but found %r"
                % reader.peek().encode('utf-8'),
                reader.get_mark())
    return (major, minor)
def scan_yaml_directive_number(self, start_mark):
    """Scan a non-empty run of digits and return it as an int."""
    # See the specification for details.
    ch = self.reader.peek()
    if not (u'0' <= ch <= '9'):
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit, but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    length = 0
    while u'0' <= self.reader.peek(length) <= u'9':
        length += 1
    value = int(self.reader.prefix(length))
    self.reader.forward(length)
    return value
def scan_tag_directive_value(self, start_mark):
    """Scan '<handle> <prefix>' after '%TAG'; return (handle, prefix)."""
    # See the specification for details.
    reader = self.reader
    while reader.peek() == u' ':
        reader.forward()
    handle = self.scan_tag_directive_handle(start_mark)
    while reader.peek() == u' ':
        reader.forward()
    return (handle, self.scan_tag_directive_prefix(start_mark))
def scan_tag_directive_handle(self, start_mark):
    """Scan the tag handle of a %TAG directive; must be followed by ' '."""
    # See the specification for details.
    value = self.scan_tag_handle('directive', start_mark)
    ch = self.reader.peek()
    if ch != u' ':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    return value
def scan_tag_directive_prefix(self, start_mark):
    """Scan the tag prefix of a %TAG directive; must end at space/EOL."""
    # See the specification for details.
    value = self.scan_tag_uri('directive', start_mark)
    ch = self.reader.peek()
    if ch not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    return value
def scan_directive_ignored_line(self, start_mark):
    """Consume trailing spaces, an optional comment, and the line break."""
    # See the specification for details.
    reader = self.reader
    while reader.peek() == u' ':
        reader.forward()
    if reader.peek() == u'#':
        while reader.peek() not in u'\0\r\n\x85\u2028\u2029':
            reader.forward()
    ch = reader.peek()
    if ch not in u'\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a comment or a line break, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    self.scan_line_break()
def scan_anchor(self, TokenClass):
    """Scan an anchor ('&name') or alias ('*name') token.

    The specification does not restrict characters for anchors and
    aliases, but unrestricted names are ambiguous (e.g. 'data: *alias ,
    value' could be one scalar or '[ *alias , "value" ]'), so names are
    limited to ASCII letters, digits, '-' and '_'.
    """
    start_mark = self.reader.get_mark()
    indicator = self.reader.peek()
    # The indicator tells us which word to use in error messages.
    if indicator == u'*':
        name = 'alias'
    else:
        name = 'anchor'
    self.reader.forward()
    length = 0
    ch = self.reader.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
            or ch in u'-_':
        length += 1
        ch = self.reader.peek(length)
    if not length:
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    value = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    end_mark = self.reader.get_mark()
    return TokenClass(value, start_mark, end_mark)
def scan_tag(self):
    """Scan a TAG token: '!<uri>', bare '!', '!suffix' or '!handle!suffix'."""
    # See the specification for details.
    start_mark = self.reader.get_mark()
    ch = self.reader.peek(1)
    if ch == u'<':
        # Verbatim tag: !<uri>
        handle = None
        self.reader.forward(2)
        suffix = self.scan_tag_uri('tag', start_mark)
        if self.reader.peek() != u'>':
            raise ScannerError("while parsing a tag", start_mark,
                    "expected '>', but found %r" % self.reader.peek().encode('utf-8'),
                    self.reader.get_mark())
        self.reader.forward()
    elif ch in u'\0 \t\r\n\x85\u2028\u2029':
        # The non-specific tag '!'.
        handle = None
        suffix = u'!'
        self.reader.forward()
    else:
        # Either '!suffix' or '!handle!suffix': look ahead for a
        # second '!' before the end of the token.
        length = 1
        use_handle = False
        while ch not in u'\0 \r\n\x85\u2028\u2029':
            if ch == u'!':
                use_handle = True
                break
            length += 1
            ch = self.reader.peek(length)
        if use_handle:
            handle = self.scan_tag_handle('tag', start_mark)
        else:
            handle = u'!'
            self.reader.forward()
        suffix = self.scan_tag_uri('tag', start_mark)
    ch = self.reader.peek()
    if ch not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a tag", start_mark,
                "expected ' ', but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    value = (handle, suffix)
    end_mark = self.reader.get_mark()
    return TagToken(value, start_mark, end_mark)
974 def scan_block_scalar(self
, folded
):
975 # See the specification for details.
978 start_mark
= self
.reader
.get_mark()
981 self
.reader
.forward()
982 chomping
, increment
= self
.scan_block_scalar_indicators(start_mark
)
983 self
.scan_block_scalar_ignored_line(start_mark
)
985 # Determine the indentation level and go to the first non-empty line.
986 min_indent
= self
.indent
+1
989 if increment
is None:
990 breaks
, max_indent
, end_mark
= self
.scan_block_scalar_indentation()
991 indent
= max(min_indent
, max_indent
)
993 indent
= min_indent
+increment
-1
994 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
997 # Scan the inner part of the block scalar.
998 while self
.reader
.column
== indent
and self
.reader
.peek() != u
'\0':
999 chunks
.extend(breaks
)
1000 leading_non_space
= self
.reader
.peek() not in u
' \t'
1002 while self
.reader
.peek(length
) not in u
'\0\r\n\x85\u2028\u2029':
1004 chunks
.append(self
.reader
.prefix(length
))
1005 self
.reader
.forward(length
)
1006 line_break
= self
.scan_line_break()
1007 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
1008 if self
.reader
.column
== indent
and self
.reader
.peek() != u
'\0':
1009 # Unfortunately, folding rules are ambiguous.
1011 # This is the folding according to the specification:
1013 if folded
and line_break
== u
'\n' \
1014 and leading_non_space
and self
.reader
.peek() not in u
' \t':
1018 chunks
.append(line_break
)
1020 # This is Clark Evans's interpretation (also in the spec
1023 #if folded and line_break == u'\n':
1025 # if self.reader.peek() not in ' \t':
1026 # chunks.append(u' ')
1028 # chunks.append(line_break)
1030 # chunks.append(line_break)
1035 if chomping
is not False:
1036 chunks
.append(line_break
)
1037 if chomping
is True:
1038 chunks
.extend(breaks
)
1041 return ScalarToken(u
''.join(chunks
), False, start_mark
, end_mark
)
1043 def scan_block_scalar_indicators(self
, start_mark
):
1044 # See the specification for details.
1047 ch
= self
.reader
.peek()
1053 self
.reader
.forward()
1054 ch
= self
.reader
.peek()
1055 if ch
in u
'0123456789':
1058 raise ScannerError("while scanning a block scalar", start_mark
,
1059 "expected indentation indicator in the range 1-9, but found 0",
1060 self
.reader
.get_mark())
1061 self
.reader
.forward()
1062 elif ch
in u
'0123456789':
1065 raise ScannerError("while scanning a block scalar", start_mark
,
1066 "expected indentation indicator in the range 1-9, but found 0",
1067 self
.reader
.get_mark())
1068 self
.reader
.forward()
1069 ch
= self
.reader
.peek()
1075 self
.reader
.forward()
1076 ch
= self
.reader
.peek()
1077 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
1078 raise ScannerError("while scanning a block scalar", start_mark
,
1079 "expected chomping or indentation indicators, but found %r"
1080 % ch
.encode('utf-8'), self
.reader
.get_mark())
1081 return chomping
, increment
def scan_block_scalar_ignored_line(self, start_mark):
    """Consume trailing spaces, an optional comment, and the line break."""
    # See the specification for details.
    reader = self.reader
    while reader.peek() == u' ':
        reader.forward()
    if reader.peek() == u'#':
        while reader.peek() not in u'\0\r\n\x85\u2028\u2029':
            reader.forward()
    ch = reader.peek()
    if ch not in u'\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected a comment or a line break, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    self.scan_line_break()
def scan_block_scalar_indentation(self):
    """Find the maximal indentation among leading empty lines.

    Returns (breaks, max_indent, end_mark): the collected line breaks,
    the deepest column reached while skipping spaces, and the mark after
    the last break.
    """
    # See the specification for details.
    chunks = []
    max_indent = 0
    end_mark = self.reader.get_mark()
    while self.reader.peek() in u' \r\n\x85\u2028\u2029':
        if self.reader.peek() != u' ':
            chunks.append(self.scan_line_break())
            end_mark = self.reader.get_mark()
        else:
            self.reader.forward()
            if self.reader.column > max_indent:
                max_indent = self.reader.column
    return chunks, max_indent, end_mark
def scan_block_scalar_breaks(self, indent):
    """Skip indentation up to `indent` and collect blank-line breaks.

    Returns (breaks, end_mark).
    """
    # See the specification for details.
    chunks = []
    end_mark = self.reader.get_mark()
    while self.reader.column < indent and self.reader.peek() == u' ':
        self.reader.forward()
    while self.reader.peek() in u'\r\n\x85\u2028\u2029':
        chunks.append(self.scan_line_break())
        end_mark = self.reader.get_mark()
        while self.reader.column < indent and self.reader.peek() == u' ':
            self.reader.forward()
    return chunks, end_mark
def scan_flow_scalar(self, double):
    """Scan a quoted scalar and return a SCALAR token.

    Indentation rules are deliberately looser than the specification:
    the quotes already delimit the scalar, so we only need the helpers to
    guard against document separators inside it.
    """
    chunks = []
    start_mark = self.reader.get_mark()
    quote = self.reader.peek()
    self.reader.forward()
    chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    while self.reader.peek() != quote:
        chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    self.reader.forward()
    end_mark = self.reader.get_mark()
    return ScalarToken(u''.join(chunks), False, start_mark, end_mark)
# Single-character escapes in double-quoted scalars and their
# replacement characters (the dict bodies were truncated in this copy;
# reconstructed per the YAML 1.1 escape table).
ESCAPE_REPLACEMENTS = {
    u'0':   u'\0',
    u'a':   u'\x07',
    u'b':   u'\x08',
    u't':   u'\x09',
    u'\t':  u'\x09',
    u'n':   u'\x0A',
    u'v':   u'\x0B',
    u'f':   u'\x0C',
    u'r':   u'\x0D',
    u'e':   u'\x1B',
    u' ':   u'\x20',
    u'\"':  u'\"',
    u'\\':  u'\\',
    u'N':   u'\x85',
    u'_':   u'\xA0',
    u'L':   u'\u2028',
    u'P':   u'\u2029',
}

# Number of hex digits consumed by numeric escapes: \xXX, \uXXXX, \UXXXXXXXX.
ESCAPE_CODES = {
    u'x':   2,
    u'u':   4,
    u'U':   8,
}
def scan_flow_scalar_non_spaces(self, double, start_mark):
    # See the specification for details.
    # Scan the non-whitespace part of a quoted scalar, handling ''
    # quote-doubling in single-quoted and backslash escapes in
    # double-quoted scalars.  Returns the collected chunks.
    chunks = []
    while True:
        length = 0
        while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
            length += 1
        if length:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
        ch = self.reader.peek()
        if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
            # '' in a single-quoted scalar is an escaped quote.
            chunks.append(u'\'')
            self.reader.forward(2)
        elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
            # The other style's special characters are taken literally.
            chunks.append(ch)
            self.reader.forward()
        elif double and ch == u'\\':
            self.reader.forward()
            ch = self.reader.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.reader.forward()
            elif ch in self.ESCAPE_CODES:
                # \xXX, \uXXXX or \UXXXXXXXX numeric escape.
                length = self.ESCAPE_CODES[ch]
                self.reader.forward()
                for k in range(length):
                    if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                        raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                    (length, self.reader.peek(k).encode('utf-8')), self.reader.get_mark())
                code = int(self.reader.prefix(length), 16)
                chunks.append(unichr(code))
                self.reader.forward(length)
            elif ch in u'\r\n\x85\u2028\u2029':
                # Escaped line break: the break itself is folded away.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError("while scanning a double-quoted scalar", start_mark,
                        "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_mark())
        else:
            # Whitespace, a line break, or the closing quote: done here.
            return chunks
def scan_flow_scalar_spaces(self, double, start_mark):
    # See the specification for details.
    # Scan a run of spaces/tabs inside a quoted scalar and apply the
    # line-folding rules when the run ends in a line break.
    chunks = []
    length = 0
    while self.reader.peek(length) in u' \t':
        length += 1
    whitespaces = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch == u'\0':
        # The stream must not end inside a quoted scalar.
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.reader.get_mark())
    elif ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        breaks = self.scan_flow_scalar_breaks(double, start_mark)
        # Folding: a single '\n' becomes a space; other breaks are kept.
        if line_break != u'\n':
            chunks.append(line_break)
        elif not breaks:
            chunks.append(u' ')
        chunks.extend(breaks)
    else:
        chunks.append(whitespaces)
    return chunks
def scan_flow_scalar_breaks(self, double, start_mark):
    # See the specification for details.
    # Collect consecutive line breaks inside a quoted scalar.
    chunks = []
    while True:
        # Instead of checking indentation, we check for document
        # separators, which must not appear inside a quoted scalar.
        prefix = self.reader.prefix(3)
        if (prefix == u'---' or prefix == u'...') \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.reader.get_mark())
        while self.reader.peek() in u' \t':
            self.reader.forward()
        if self.reader.peek() in u'\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
        else:
            return chunks
def scan_plain(self):
    # See the specification for details.
    # We add an additional restriction for the flow context:
    #   plain scalars in the flow context cannot contain ',', ':' and '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # Indentation rules are loosed for the flow context.
    chunks = []
    start_mark = self.reader.get_mark()
    end_mark = start_mark
    indent = self.indent+1
    # We allow zero indentation for scalars, but then we need to check for
    # document separators at the beginning of the line.
    spaces = []
    while True:
        length = 0
        if self.reader.peek() == u'#':
            break
        while True:
            ch = self.reader.peek(length)
            # BUGFIX: the break-character set used '\x28' ('(') where the
            # NEL character '\x85' was intended; every other break set in
            # this scanner uses '\x85'.
            if ch in u'\0 \t\r\n\x85\u2028\u2029' \
                    or (not self.flow_level and ch == u':' and
                        self.reader.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                    or (self.flow_level and ch in u',:?[]{}'):
                break
            length += 1
        if length == 0:
            break
        self.allow_simple_key = False
        chunks.extend(spaces)
        chunks.append(self.reader.prefix(length))
        self.reader.forward(length)
        end_mark = self.reader.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        # Stop at a comment, end of the folded spaces, or (in block
        # context) when the line is indented less than the scalar.
        if not spaces or self.reader.peek() == u'#' \
                or (not self.flow_level and self.reader.column < indent):
            break
    return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
def scan_plain_spaces(self, indent, start_mark):
    # See the specification for details.
    # The specification is really confusing about tabs in plain scalars.
    # We just forbid them completely. Do not use tabs in YAML!
    # Returns the folded whitespace chunks, or None if a document
    # separator terminates the scalar.
    chunks = []
    length = 0
    while self.reader.peek(length) in u' ':
        length += 1
    whitespaces = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        # A line break makes a following simple key possible again.
        self.allow_simple_key = True
        prefix = self.reader.prefix(3)
        if (prefix == u'---' or prefix == u'...') \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            return
        breaks = []
        while self.reader.peek() in u' \r\n\x85\u2028\u2029':
            if self.reader.peek() == u' ':
                self.reader.forward()
            else:
                breaks.append(self.scan_line_break())
                prefix = self.reader.prefix(3)
                if (prefix == u'---' or prefix == u'...') \
                        and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                    return
        # Folding: a single '\n' becomes a space; other breaks are kept.
        if line_break != u'\n':
            chunks.append(line_break)
        elif not breaks:
            chunks.append(u' ')
        chunks.extend(breaks)
    elif whitespaces:
        chunks.append(whitespaces)
    return chunks
def scan_tag_handle(self, name, start_mark):
    # See the specification for details.
    # For some strange reasons, the specification does not allow '_' in
    # tag handles. I have allowed it anyway.
    # A tag handle is '!', '!!' or '!word!'; returns the handle text.
    ch = self.reader.peek()
    if ch != u'!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    length = 1
    ch = self.reader.peek(length)
    if ch != u' ':
        # Named handle: scan the word and require a closing '!'.
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if ch != u'!':
            self.reader.forward(length)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    self.reader.get_mark())
        length += 1
    value = self.reader.prefix(length)
    self.reader.forward(length)
    return value
def scan_tag_uri(self, name, start_mark):
    # See the specification for details.
    # Note: we do not check if URI is well-formed.
    chunks = []
    length = 0
    ch = self.reader.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
            or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
        if ch == u'%':
            # Flush the plain run, then decode the %XX escape sequence.
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
            chunks.append(self.scan_uri_escapes(name, start_mark))
        else:
            length += 1
        ch = self.reader.peek(length)
    if length:
        chunks.append(self.reader.prefix(length))
        self.reader.forward(length)
        length = 0
    if not chunks:
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    return u''.join(chunks)
1384 def scan_uri_escapes(self
, name
, start_mark
):
1385 # See the specification for details.
1387 mark
= self
.reader
.get_mark()
1388 while self
.reader
.peek() == u
'%':
1389 self
.reader
.forward()
1391 if self
.reader
.peek(k
) not in u
'0123456789ABCDEFabcdef':
1392 raise ScannerError("while scanning a %s" % name
, start_mark
,
1393 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
1394 (self
.reader
.peek(k
).encode('utf-8')), self
.reader
.get_mark())
1395 bytes
.append(chr(int(self
.reader
.prefix(2), 16)))
1396 self
.reader
.forward(2)
1398 value
= unicode(''.join(bytes
), 'utf-8')
1399 except UnicodeDecodeError, exc
:
1400 raise ScannerError("while scanning a %s" % name
, start_mark
, str(exc
), mark
)
def scan_line_break(self):
    # Consume one line break and return its normalized form.
    # Transforms:
    #   '\r\n'      :   '\n'
    #   '\r'        :   '\n'
    #   '\n'        :   '\n'
    #   '\x85'      :   '\n'
    #   '\u2028'    :   '\u2028'
    #   '\u2029     :   '\u2029'
    #   default     :   ''
    ch = self.reader.peek()
    if ch in u'\r\n\x85':
        if self.reader.prefix(2) == u'\r\n':
            self.reader.forward(2)
        else:
            self.reader.forward()
        return u'\n'
    elif ch in u'\u2028\u2029':
        # Unicode line/paragraph separators are preserved as-is.
        self.reader.forward()
        return ch
    return u''
1426 # psyco.bind(Scanner)
1427 #except ImportError: