Fix a typo in a plain scalar scanner.
[pyyaml/python3.git] / lib / yaml / scanner.py
blob25e1afea8976c102a7eab72dae9a11a1b440d367
2 # Scanner produces tokens of the following types:
3 # STREAM-START
4 # STREAM-END
5 # DIRECTIVE(name, value)
6 # DOCUMENT-START
7 # DOCUMENT-END
8 # BLOCK-SEQUENCE-START
9 # BLOCK-MAPPING-START
10 # BLOCK-END
11 # FLOW-SEQUENCE-START
12 # FLOW-MAPPING-START
13 # FLOW-SEQUENCE-END
14 # FLOW-MAPPING-END
15 # BLOCK-ENTRY
16 # FLOW-ENTRY
17 # KEY
18 # VALUE
19 # ALIAS(value)
20 # ANCHOR(value)
21 # TAG(value)
22 # SCALAR(value, plain)
24 # Read comments in the Scanner code for more details.
27 __all__ = ['Scanner', 'ScannerError']
29 from error import MarkedYAMLError
30 from tokens import *
class ScannerError(MarkedYAMLError):
    # Raised when the scanner meets input it cannot tokenize.  Inherits the
    # context/problem/mark formatting behavior from MarkedYAMLError.
    pass
class SimpleKey:
    """Record of a position where a simple key (a key not introduced by the
    '?' indicator) may start.  See the simple keys treatment in Scanner."""

    def __init__(self, token_number, required, index, line, column, mark):
        self.token_number = token_number    # number of the would-be KEY token
        self.required = required            # must a ':' follow on this line?
        self.index = index                  # stream offset where the key starts
        self.line = line                    # line of the key start
        self.column = column                # column of the key start
        self.mark = mark                    # mark used for error reporting

    def __repr__(self):
        # Debug-friendly representation; nothing in the scanner relies on it.
        return '%s(token_number=%r, required=%r, index=%r, line=%r, column=%r)' \
                % (self.__class__.__name__, self.token_number, self.required,
                   self.index, self.line, self.column)
46 class Scanner:
    def __init__(self):
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode.  It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)     # peek the next i-th character
        #   self.prefix(l=1)   # peek the next l characters
        #   self.forward(l=1)  # read the next l characters and move the pointer

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['.  `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token.  (Done before `tokens_taken` is set:
        # the token is queued, not emitted.)
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position?  A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys.  This is a dictionary.  The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level.  The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
111 # Public methods.
113 def check_token(self, *choices):
114 # Check if the next token is one of the given types.
115 while self.need_more_tokens():
116 self.fetch_more_tokens()
117 if self.tokens:
118 if not choices:
119 return True
120 for choice in choices:
121 if isinstance(self.tokens[0], choice):
122 return True
123 return False
125 def peek_token(self):
126 # Return the next token, but do not delete if from the queue.
127 while self.need_more_tokens():
128 self.fetch_more_tokens()
129 if self.tokens:
130 return self.tokens[0]
132 def get_token(self):
133 # Return the next token.
134 while self.need_more_tokens():
135 self.fetch_more_tokens()
136 if self.tokens:
137 self.tokens_taken += 1
138 return self.tokens.pop(0)
    def __iter__(self):
        # Iterator protocol: lazily yield all remaining tokens.  The queue is
        # refilled after every yield (the innermost loop) so the scan keeps
        # pace with consumption and `tokens_taken` stays in sync with the
        # simple-key bookkeeping.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        while self.tokens:
            self.tokens_taken += 1
            yield self.tokens.pop(0)
            # Refill before handing out the next token.
            while self.need_more_tokens():
                self.fetch_more_tokens()
150 # Private methods.
152 def need_more_tokens(self):
153 if self.done:
154 return False
155 if not self.tokens:
156 return True
157 # The current token may be a potential simple key, so we
158 # need to look further.
159 self.stale_possible_simple_keys()
160 if self.next_possible_simple_key() == self.tokens_taken:
161 return True
    def fetch_more_tokens(self):
        # Central dispatcher: scan one more token (plus any implied
        # structural tokens) and append it to `self.tokens`.

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column.  It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == u'\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?  (Block scalars exist in block context only.)
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No?  It's an error.  Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.get_mark())
269 # Simple keys treatment.
271 def next_possible_simple_key(self):
272 # Return the number of the nearest possible simple key. Actually we
273 # don't need to loop through the whole dictionary. We may replace it
274 # with the following code:
275 # if not self.possible_simple_keys:
276 # return None
277 # return self.possible_simple_keys[
278 # min(self.possible_simple_keys.keys())].token_number
279 min_token_number = None
280 for level in self.possible_simple_keys:
281 key = self.possible_simple_keys[level]
282 if min_token_number is None or key.token_number < min_token_number:
283 min_token_number = key.token_number
284 return min_token_number
286 def stale_possible_simple_keys(self):
287 # Remove entries that are no longer possible simple keys. According to
288 # the YAML specification, simple keys
289 # - should be limited to a single line,
290 # - should be no longer than 1024 characters.
291 # Disabling this procedure will allow simple keys of any length and
292 # height (may cause problems if indentation is broken though).
293 for level in self.possible_simple_keys.keys():
294 key = self.possible_simple_keys[level]
295 if key.line != self.line \
296 or self.index-key.index > 1024:
297 if key.required:
298 raise ScannerError("while scanning a simple key", key.mark,
299 "could not found expected ':'", self.get_mark())
300 del self.possible_simple_keys[level]
    def save_possible_simple_key(self):
        # The next token may start a simple key.  We check if it's possible
        # and save its position.  This function is called for
        # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position: only in
        # block context, when the token starts exactly at the indent column.
        required = not self.flow_level and self.indent == self.column

        # A simple key is required only if it is the first token in the
        # current line.  Therefore it is always allowed.
        assert self.allow_simple_key or not required

        # The next token might be a simple key.  Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key
323 def remove_possible_simple_key(self):
324 # Remove the saved possible key position at the current flow level.
325 if self.flow_level in self.possible_simple_keys:
326 key = self.possible_simple_keys[self.flow_level]
328 if key.required:
329 raise ScannerError("while scanning a simple key", key.mark,
330 "could not found expected ':'", self.get_mark())
332 del self.possible_simple_keys[self.flow_level]
334 # Indentation functions.
    def unwind_indent(self, column):
        # Close block collections whose indentation exceeds `column` by
        # issuing BLOCK-END tokens.

        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according
        ## to the spec.  But this condition will prohibit intuitively correct
        ## constructions such as
        ##   key : {
        ##   }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored.  We make the scanner
        # less restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
360 def add_indent(self, column):
361 # Check if we need to increase indentation.
362 if self.indent < column:
363 self.indents.append(self.indent)
364 self.indent = column
365 return True
366 return False
368 # Fetchers.
370 def fetch_stream_start(self):
371 # We always add STREAM-START as the first token and STREAM-END as the
372 # last token.
374 # Read the token.
375 mark = self.get_mark()
377 # Add STREAM-START.
378 self.tokens.append(StreamStartToken(mark, mark,
379 encoding=self.encoding))
    def fetch_stream_end(self):
        # Set the current indentation to -1 (closes all open blocks).
        self.unwind_indent(-1)

        # Reset everything (not really needed).
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The stream is finished.
        self.done = True
400 def fetch_directive(self):
402 # Set the current intendation to -1.
403 self.unwind_indent(-1)
405 # Reset simple keys.
406 self.remove_possible_simple_key()
407 self.allow_simple_key = False
409 # Scan and add DIRECTIVE.
410 self.tokens.append(self.scan_directive())
412 def fetch_document_start(self):
413 self.fetch_document_indicator(DocumentStartToken)
415 def fetch_document_end(self):
416 self.fetch_document_indicator(DocumentEndToken)
418 def fetch_document_indicator(self, TokenClass):
420 # Set the current intendation to -1.
421 self.unwind_indent(-1)
423 # Reset simple keys. Note that there could not be a block collection
424 # after '---'.
425 self.remove_possible_simple_key()
426 self.allow_simple_key = False
428 # Add DOCUMENT-START or DOCUMENT-END.
429 start_mark = self.get_mark()
430 self.forward(3)
431 end_mark = self.get_mark()
432 self.tokens.append(TokenClass(start_mark, end_mark))
434 def fetch_flow_sequence_start(self):
435 self.fetch_flow_collection_start(FlowSequenceStartToken)
437 def fetch_flow_mapping_start(self):
438 self.fetch_flow_collection_start(FlowMappingStartToken)
440 def fetch_flow_collection_start(self, TokenClass):
442 # '[' and '{' may start a simple key.
443 self.save_possible_simple_key()
445 # Increase the flow level.
446 self.flow_level += 1
448 # Simple keys are allowed after '[' and '{'.
449 self.allow_simple_key = True
451 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
452 start_mark = self.get_mark()
453 self.forward()
454 end_mark = self.get_mark()
455 self.tokens.append(TokenClass(start_mark, end_mark))
457 def fetch_flow_sequence_end(self):
458 self.fetch_flow_collection_end(FlowSequenceEndToken)
460 def fetch_flow_mapping_end(self):
461 self.fetch_flow_collection_end(FlowMappingEndToken)
463 def fetch_flow_collection_end(self, TokenClass):
465 # Reset possible simple key on the current level.
466 self.remove_possible_simple_key()
468 # Decrease the flow level.
469 self.flow_level -= 1
471 # No simple keys after ']' or '}'.
472 self.allow_simple_key = False
474 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
475 start_mark = self.get_mark()
476 self.forward()
477 end_mark = self.get_mark()
478 self.tokens.append(TokenClass(start_mark, end_mark))
480 def fetch_flow_entry(self):
482 # Simple keys are allowed after ','.
483 self.allow_simple_key = True
485 # Reset possible simple key on the current level.
486 self.remove_possible_simple_key()
488 # Add FLOW-ENTRY.
489 start_mark = self.get_mark()
490 self.forward()
491 end_mark = self.get_mark()
492 self.tokens.append(FlowEntryToken(start_mark, end_mark))
494 def fetch_block_entry(self):
496 # Block context needs additional checks.
497 if not self.flow_level:
499 # Are we allowed to start a new entry?
500 if not self.allow_simple_key:
501 raise ScannerError(None, None,
502 "sequence entries are not allowed here",
503 self.get_mark())
505 # We may need to add BLOCK-SEQUENCE-START.
506 if self.add_indent(self.column):
507 mark = self.get_mark()
508 self.tokens.append(BlockSequenceStartToken(mark, mark))
510 # It's an error for the block entry to occur in the flow context,
511 # but we let the parser detect this.
512 else:
513 pass
515 # Simple keys are allowed after '-'.
516 self.allow_simple_key = True
518 # Reset possible simple key on the current level.
519 self.remove_possible_simple_key()
521 # Add BLOCK-ENTRY.
522 start_mark = self.get_mark()
523 self.forward()
524 end_mark = self.get_mark()
525 self.tokens.append(BlockEntryToken(start_mark, end_mark))
527 def fetch_key(self):
529 # Block context needs additional checks.
530 if not self.flow_level:
532 # Are we allowed to start a key (not nessesary a simple)?
533 if not self.allow_simple_key:
534 raise ScannerError(None, None,
535 "mapping keys are not allowed here",
536 self.get_mark())
538 # We may need to add BLOCK-MAPPING-START.
539 if self.add_indent(self.column):
540 mark = self.get_mark()
541 self.tokens.append(BlockMappingStartToken(mark, mark))
543 # Simple keys are allowed after '?' in the block context.
544 self.allow_simple_key = not self.flow_level
546 # Reset possible simple key on the current level.
547 self.remove_possible_simple_key()
549 # Add KEY.
550 start_mark = self.get_mark()
551 self.forward()
552 end_mark = self.get_mark()
553 self.tokens.append(KeyToken(start_mark, end_mark))
    def fetch_value(self):
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.  The token is inserted *retroactively* at the position
            # recorded when the key candidate was saved; the insert index is
            # relative to the queue head (`token_number - tokens_taken`).
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START (inserted before the KEY token just added).
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them?  They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.  It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
611 def fetch_alias(self):
613 # ALIAS could be a simple key.
614 self.save_possible_simple_key()
616 # No simple keys after ALIAS.
617 self.allow_simple_key = False
619 # Scan and add ALIAS.
620 self.tokens.append(self.scan_anchor(AliasToken))
622 def fetch_anchor(self):
624 # ANCHOR could start a simple key.
625 self.save_possible_simple_key()
627 # No simple keys after ANCHOR.
628 self.allow_simple_key = False
630 # Scan and add ANCHOR.
631 self.tokens.append(self.scan_anchor(AnchorToken))
633 def fetch_tag(self):
635 # TAG could start a simple key.
636 self.save_possible_simple_key()
638 # No simple keys after TAG.
639 self.allow_simple_key = False
641 # Scan and add TAG.
642 self.tokens.append(self.scan_tag())
644 def fetch_literal(self):
645 self.fetch_block_scalar(style='|')
647 def fetch_folded(self):
648 self.fetch_block_scalar(style='>')
650 def fetch_block_scalar(self, style):
652 # A simple key may follow a block scalar.
653 self.allow_simple_key = True
655 # Reset possible simple key on the current level.
656 self.remove_possible_simple_key()
658 # Scan and add SCALAR.
659 self.tokens.append(self.scan_block_scalar(style))
661 def fetch_single(self):
662 self.fetch_flow_scalar(style='\'')
664 def fetch_double(self):
665 self.fetch_flow_scalar(style='"')
667 def fetch_flow_scalar(self, style):
669 # A flow scalar could be a simple key.
670 self.save_possible_simple_key()
672 # No simple keys after flow scalars.
673 self.allow_simple_key = False
675 # Scan and add SCALAR.
676 self.tokens.append(self.scan_flow_scalar(style))
678 def fetch_plain(self):
680 # A plain scalar could be a simple key.
681 self.save_possible_simple_key()
683 # No simple keys after plain scalars. But note that `scan_plain` will
684 # change this flag if the scan is finished at the beginning of the
685 # line.
686 self.allow_simple_key = False
688 # Scan and add SCALAR. May change `allow_simple_key`.
689 self.tokens.append(self.scan_plain())
691 # Checkers.
693 def check_directive(self):
695 # DIRECTIVE: ^ '%' ...
696 # The '%' indicator is already checked.
697 if self.column == 0:
698 return True
700 def check_document_start(self):
702 # DOCUMENT-START: ^ '---' (' '|'\n')
703 if self.column == 0:
704 if self.prefix(3) == u'---' \
705 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
706 return True
708 def check_document_end(self):
710 # DOCUMENT-END: ^ '...' (' '|'\n')
711 if self.column == 0:
712 if self.prefix(3) == u'...' \
713 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
714 return True
716 def check_block_entry(self):
718 # BLOCK-ENTRY: '-' (' '|'\n')
719 return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
721 def check_key(self):
723 # KEY(flow context): '?'
724 if self.flow_level:
725 return True
727 # KEY(block context): '?' (' '|'\n')
728 else:
729 return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
731 def check_value(self):
733 # VALUE(flow context): ':'
734 if self.flow_level:
735 return True
737 # VALUE(block context): ':' (' '|'\n')
738 else:
739 return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
    def check_plain(self):
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == u'-' or (not self.flow_level and ch in u'?:')))
760 # Scanners.
    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream.  We do not yet support BOM inside the stream as the
        # specification requires.  Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane.  A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        if self.index == 0 and self.peek() == u'\uFEFF':
            self.forward()
        found = False
        while not found:
            # Skip spaces.
            while self.peek() == u' ':
                self.forward()
            # Skip a comment up to the end of the line.
            if self.peek() == u'#':
                while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.forward()
            if self.scan_line_break():
                # A line break in the block context re-enables simple keys.
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                # A non-space, non-break, non-comment character: done.
                found = True
    def scan_directive(self):
        # See the specification for details.
        # Scan '%NAME value...' and return a DirectiveToken.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            # Unknown directive: skip the rest of the line (the parser will
            # warn about it); the token ends right after the name.
            end_mark = self.get_mark()
            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
    def scan_directive_name(self, start_mark):
        # See the specification for details.
        # A directive name is a non-empty run of ASCII alphanumerics, '-'
        # and '_', terminated by NUL, space, or a line break.
        length = 0
        ch = self.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        return value
    def scan_yaml_directive_value(self, start_mark):
        # See the specification for details.
        # The value of a YAML directive is "<major> '.' <minor>".
        while self.peek() == u' ':
            self.forward()
        major = self.scan_yaml_directive_number(start_mark)
        if self.peek() != '.':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or '.', but found %r"
                    % self.peek().encode('utf-8'),
                    self.get_mark())
        self.forward()
        minor = self.scan_yaml_directive_number(start_mark)
        # The version must be terminated by NUL, space, or a line break.
        if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or ' ', but found %r"
                    % self.peek().encode('utf-8'),
                    self.get_mark())
        return (major, minor)
856 def scan_yaml_directive_number(self, start_mark):
857 # See the specification for details.
858 ch = self.peek()
859 if not (u'0' <= ch <= '9'):
860 raise ScannerError("while scanning a directive", start_mark,
861 "expected a digit, but found %r" % ch.encode('utf-8'),
862 self.get_mark())
863 length = 0
864 while u'0' <= self.peek(length) <= u'9':
865 length += 1
866 value = int(self.prefix(length))
867 self.forward(length)
868 return value
870 def scan_tag_directive_value(self, start_mark):
871 # See the specification for details.
872 while self.peek() == u' ':
873 self.forward()
874 handle = self.scan_tag_directive_handle(start_mark)
875 while self.peek() == u' ':
876 self.forward()
877 prefix = self.scan_tag_directive_prefix(start_mark)
878 return (handle, prefix)
880 def scan_tag_directive_handle(self, start_mark):
881 # See the specification for details.
882 value = self.scan_tag_handle('directive', start_mark)
883 ch = self.peek()
884 if ch != u' ':
885 raise ScannerError("while scanning a directive", start_mark,
886 "expected ' ', but found %r" % ch.encode('utf-8'),
887 self.get_mark())
888 return value
890 def scan_tag_directive_prefix(self, start_mark):
891 # See the specification for details.
892 value = self.scan_tag_uri('directive', start_mark)
893 ch = self.peek()
894 if ch not in u'\0 \r\n\x85\u2028\u2029':
895 raise ScannerError("while scanning a directive", start_mark,
896 "expected ' ', but found %r" % ch.encode('utf-8'),
897 self.get_mark())
898 return value
    def scan_directive_ignored_line(self, start_mark):
        # See the specification for details.
        # Skip trailing spaces and an optional comment; then the line must
        # end (NUL or a break), which is consumed.
        while self.peek() == u' ':
            self.forward()
        if self.peek() == u'#':
            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a comment or a line break, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        self.scan_line_break()
    def scan_anchor(self, TokenClass):
        # The specification does not restrict characters for anchors and
        # aliases.  This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_mark = self.get_mark()
        indicator = self.peek()
        # '*' introduces an alias, '&' an anchor; `name` is used in errors.
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.forward()
        length = 0
        ch = self.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        # The name must be followed by whitespace or an indicator that may
        # legally terminate a node.
        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        end_mark = self.get_mark()
        return TokenClass(value, start_mark, end_mark)
950 def scan_tag(self):
951 # See the specification for details.
952 start_mark = self.get_mark()
953 ch = self.peek(1)
954 if ch == u'<':
955 handle = None
956 self.forward(2)
957 suffix = self.scan_tag_uri('tag', start_mark)
958 if self.peek() != u'>':
959 raise ScannerError("while parsing a tag", start_mark,
960 "expected '>', but found %r" % self.peek().encode('utf-8'),
961 self.get_mark())
962 self.forward()
963 elif ch in u'\0 \t\r\n\x85\u2028\u2029':
964 handle = None
965 suffix = u'!'
966 self.forward()
967 else:
968 length = 1
969 use_handle = False
970 while ch not in u'\0 \r\n\x85\u2028\u2029':
971 if ch == u'!':
972 use_handle = True
973 break
974 length += 1
975 ch = self.peek(length)
976 handle = u'!'
977 if use_handle:
978 handle = self.scan_tag_handle('tag', start_mark)
979 else:
980 handle = u'!'
981 self.forward()
982 suffix = self.scan_tag_uri('tag', start_mark)
983 ch = self.peek()
984 if ch not in u'\0 \r\n\x85\u2028\u2029':
985 raise ScannerError("while scanning a tag", start_mark,
986 "expected ' ', but found %r" % ch.encode('utf-8'),
987 self.get_mark())
988 value = (handle, suffix)
989 end_mark = self.get_mark()
990 return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style):
        # See the specification for details.

        # '>' means folded (single line breaks become spaces);
        # '|' means literal (line breaks are preserved).
        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []
        start_mark = self.get_mark()

        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: detect it from the content.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.peek() not in u' \t'
            length = 0
            while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != u'\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n'   \
                        and leading_non_space and self.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail: keep the final break unless stripping ('-'),
        # and keep all trailing breaks only when keeping ('+').
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                style)
    def scan_block_scalar_indicators(self, start_mark):
        # See the specification for details.
        # The header may carry a chomping indicator ('+' keep, '-' strip)
        # and an indentation indicator (1-9), in either order — hence the
        # two symmetric branches below.
        chomping = None
        increment = None
        ch = self.peek()
        if ch in u'+-':
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.forward()
            ch = self.peek()
            if ch in u'0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError("while scanning a block scalar", start_mark,
                            "expected indentation indicator in the range 1-9, but found 0",
                            self.get_mark())
                self.forward()
        elif ch in u'0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
            self.forward()
            ch = self.peek()
            if ch in u'+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.forward()
        ch = self.peek()
        # Nothing else may appear in the header before the comment/break.
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected chomping or indentation indicators, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        return chomping, increment
1108 def scan_block_scalar_ignored_line(self, start_mark):
1109 # See the specification for details.
1110 while self.peek() == u' ':
1111 self.forward()
1112 if self.peek() == u'#':
1113 while self.peek() not in u'\0\r\n\x85\u2028\u2029':
1114 self.forward()
1115 ch = self.peek()
1116 if ch not in u'\0\r\n\x85\u2028\u2029':
1117 raise ScannerError("while scanning a block scalar", start_mark,
1118 "expected a comment or a line break, but found %r"
1119 % ch.encode('utf-8'), self.get_mark())
1120 self.scan_line_break()
1122 def scan_block_scalar_indentation(self):
1123 # See the specification for details.
1124 chunks = []
1125 max_indent = 0
1126 end_mark = self.get_mark()
1127 while self.peek() in u' \r\n\x85\u2028\u2029':
1128 if self.peek() != u' ':
1129 chunks.append(self.scan_line_break())
1130 end_mark = self.get_mark()
1131 else:
1132 self.forward()
1133 if self.column > max_indent:
1134 max_indent = self.column
1135 return chunks, max_indent, end_mark
1137 def scan_block_scalar_breaks(self, indent):
1138 # See the specification for details.
1139 chunks = []
1140 end_mark = self.get_mark()
1141 while self.column < indent and self.peek() == u' ':
1142 self.forward()
1143 while self.peek() in u'\r\n\x85\u2028\u2029':
1144 chunks.append(self.scan_line_break())
1145 end_mark = self.get_mark()
1146 while self.column < indent and self.peek() == u' ':
1147 self.forward()
1148 return chunks, end_mark
1150 def scan_flow_scalar(self, style):
1151 # See the specification for details.
1152 # Note that we loose indentation rules for quoted scalars. Quoted
1153 # scalars don't need to adhere indentation because " and ' clearly
1154 # mark the beginning and the end of them. Therefore we are less
1155 # restrictive then the specification requires. We only need to check
1156 # that document separators are not included in scalars.
1157 if style == '"':
1158 double = True
1159 else:
1160 double = False
1161 chunks = []
1162 start_mark = self.get_mark()
1163 quote = self.peek()
1164 self.forward()
1165 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1166 while self.peek() != quote:
1167 chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
1168 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1169 self.forward()
1170 end_mark = self.get_mark()
1171 return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
1172 style)
    # Translation table for single-character backslash escapes inside
    # double-quoted scalars (used by scan_flow_scalar_non_spaces).
    ESCAPE_REPLACEMENTS = {
        u'0': u'\0',
        u'a': u'\x07',
        u'b': u'\x08',
        u't': u'\x09',
        u'\t': u'\x09',
        u'n': u'\x0A',
        u'v': u'\x0B',
        u'f': u'\x0C',
        u'r': u'\x0D',
        u'e': u'\x1B',
        u' ': u'\x20',
        u'\"': u'\"',
        u'\\': u'\\',
        u'N': u'\x85',      # next line
        u'_': u'\xA0',      # non-breaking space
        u'L': u'\u2028',    # line separator
        u'P': u'\u2029',    # paragraph separator
    # Escape letters introducing fixed-width hexadecimal character codes
    # in double-quoted scalars, mapped to the number of hex digits that
    # must follow (\xXX, \uXXXX, \UXXXXXXXX).
    ESCAPE_CODES = {
        u'x': 2,
        u'u': 4,
        u'U': 8,
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # Scan the non-blank portion of a quoted scalar: runs of ordinary
        # characters, quote escapes ('' in single-quoted scalars) and
        # backslash escapes (double-quoted scalars only).  Returns a list
        # of unicode chunks; stops (without consuming) at a blank, a line
        # break, the closing quote, or the end of the stream.
        chunks = []
        while True:
            # Take the longest run of ordinary characters.
            length = 0
            while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.prefix(length))
                self.forward(length)
            ch = self.peek()
            if not double and ch == u'\'' and self.peek(1) == u'\'':
                # '' inside a single-quoted scalar is an escaped quote.
                chunks.append(u'\'')
                self.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                # A quote/backslash that is literal in this quoting style.
                chunks.append(ch)
                self.forward()
            elif double and ch == u'\\':
                self.forward()
                ch = self.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Single-character escape, e.g. \n or \t.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.forward()
                elif ch in self.ESCAPE_CODES:
                    # \xXX, \uXXXX or \UXXXXXXXX hexadecimal escape.
                    length = self.ESCAPE_CODES[ch]
                    self.forward()
                    for k in range(length):
                        if self.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.peek(k).encode('utf-8')), self.get_mark())
                    code = int(self.prefix(length), 16)
                    chunks.append(unichr(code))  # NOTE: Python 2 builtin
                    self.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # Escaped line break: the break itself is removed,
                    # following blank lines are kept.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
            else:
                # Blank, break, closing quote or '\0': caller handles it.
                return chunks
1243 def scan_flow_scalar_spaces(self, double, start_mark):
1244 # See the specification for details.
1245 chunks = []
1246 length = 0
1247 while self.peek(length) in u' \t':
1248 length += 1
1249 whitespaces = self.prefix(length)
1250 self.forward(length)
1251 ch = self.peek()
1252 if ch == u'\0':
1253 raise ScannerError("while scanning a quoted scalar", start_mark,
1254 "found unexpected end of stream", self.get_mark())
1255 elif ch in u'\r\n\x85\u2028\u2029':
1256 line_break = self.scan_line_break()
1257 breaks = self.scan_flow_scalar_breaks(double, start_mark)
1258 if line_break != u'\n':
1259 chunks.append(line_break)
1260 elif not breaks:
1261 chunks.append(u' ')
1262 chunks.extend(breaks)
1263 else:
1264 chunks.append(whitespaces)
1265 return chunks
1267 def scan_flow_scalar_breaks(self, double, start_mark):
1268 # See the specification for details.
1269 chunks = []
1270 while True:
1271 # Instead of checking indentation, we check for document
1272 # separators.
1273 prefix = self.prefix(3)
1274 if (prefix == u'---' or prefix == u'...') \
1275 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
1276 raise ScannerError("while scanning a quoted scalar", start_mark,
1277 "found unexpected document separator", self.get_mark())
1278 while self.peek() in u' \t':
1279 self.forward()
1280 if self.peek() in u'\r\n\x85\u2028\u2029':
1281 chunks.append(self.scan_line_break())
1282 else:
1283 return chunks
    def scan_plain(self):
        # Scan a plain (unquoted) scalar and return it as a ScalarToken.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',', ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        spaces = []
        while True:
            length = 0
            if self.peek() == u'#':
                # A comment terminates the scalar.
                break
            while True:
                # Advance `length` to the first character that ends a
                # plain word: whitespace/break, ': ' in block context, or
                # a flow indicator in flow context.
                ch = self.peek(length)
                if ch in u'\0 \t\r\n\x85\u2028\u2029' \
                        or (not self.flow_level and ch == u':' and
                            self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in u',:?[]{}'):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (self.flow_level and ch == u':'
                    and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
                self.forward(length)
                raise ScannerError("while scanning a plain scalar", start_mark,
                    "found unexpected ':'", self.get_mark(),
                    "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
            if length == 0:
                break
            # A plain scalar in progress cannot start a simple key.
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            end_mark = self.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # Stop at a document separator (spaces is None), a comment,
            # or (block context only) a drop below the required indent.
            if not spaces or self.peek() == u'#' \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
1332 def scan_plain_spaces(self, indent, start_mark):
1333 # See the specification for details.
1334 # The specification is really confusing about tabs in plain scalars.
1335 # We just forbid them completely. Do not use tabs in YAML!
1336 chunks = []
1337 length = 0
1338 while self.peek(length) in u' ':
1339 length += 1
1340 whitespaces = self.prefix(length)
1341 self.forward(length)
1342 ch = self.peek()
1343 if ch in u'\r\n\x85\u2028\u2029':
1344 line_break = self.scan_line_break()
1345 self.allow_simple_key = True
1346 prefix = self.prefix(3)
1347 if (prefix == u'---' or prefix == u'...') \
1348 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
1349 return
1350 breaks = []
1351 while self.peek() in u' \r\n\x85\u2028\u2029':
1352 if self.peek() == ' ':
1353 self.forward()
1354 else:
1355 breaks.append(self.scan_line_break())
1356 prefix = self.prefix(3)
1357 if (prefix == u'---' or prefix == u'...') \
1358 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
1359 return
1360 if line_break != u'\n':
1361 chunks.append(line_break)
1362 elif not breaks:
1363 chunks.append(u' ')
1364 chunks.extend(breaks)
1365 elif whitespaces:
1366 chunks.append(whitespaces)
1367 return chunks
1369 def scan_tag_handle(self, name, start_mark):
1370 # See the specification for details.
1371 # For some strange reasons, the specification does not allow '_' in
1372 # tag handles. I have allowed it anyway.
1373 ch = self.peek()
1374 if ch != u'!':
1375 raise ScannerError("while scanning a %s" % name, start_mark,
1376 "expected '!', but found %r" % ch.encode('utf-8'),
1377 self.get_mark())
1378 length = 1
1379 ch = self.peek(length)
1380 if ch != u' ':
1381 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1382 or ch in u'-_':
1383 length += 1
1384 ch = self.peek(length)
1385 if ch != u'!':
1386 self.forward(length)
1387 raise ScannerError("while scanning a %s" % name, start_mark,
1388 "expected '!', but found %r" % ch.encode('utf-8'),
1389 self.get_mark())
1390 length += 1
1391 value = self.prefix(length)
1392 self.forward(length)
1393 return value
1395 def scan_tag_uri(self, name, start_mark):
1396 # See the specification for details.
1397 # Note: we do not check if URI is well-formed.
1398 chunks = []
1399 length = 0
1400 ch = self.peek(length)
1401 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1402 or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
1403 if ch == u'%':
1404 chunks.append(self.prefix(length))
1405 self.forward(length)
1406 length = 0
1407 chunks.append(self.scan_uri_escapes(name, start_mark))
1408 else:
1409 length += 1
1410 ch = self.peek(length)
1411 if length:
1412 chunks.append(self.prefix(length))
1413 self.forward(length)
1414 length = 0
1415 if not chunks:
1416 raise ScannerError("while parsing a %s" % name, start_mark,
1417 "expected URI, but found %r" % ch.encode('utf-8'),
1418 self.get_mark())
1419 return u''.join(chunks)
    def scan_uri_escapes(self, name, start_mark):
        # Decode a run of '%XX' escapes in a tag URI into a unicode
        # string; raises ScannerError on malformed hex digits or invalid
        # UTF-8.  NOTE: Python 2 only (`unicode`, `except E, exc`).
        bytes = []
        mark = self.get_mark()
        while self.peek() == u'%':
            self.forward()
            for k in range(2):
                if self.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_mark,
                            "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                                (self.peek(k).encode('utf-8')), self.get_mark())
            bytes.append(chr(int(self.prefix(2), 16)))
            self.forward(2)
        try:
            # The collected bytes must form valid UTF-8.
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
        return value
1440 def scan_line_break(self):
1441 # Transforms:
1442 # '\r\n' : '\n'
1443 # '\r' : '\n'
1444 # '\n' : '\n'
1445 # '\x85' : '\n'
1446 # '\u2028' : '\u2028'
1447 # '\u2029 : '\u2029'
1448 # default : ''
1449 ch = self.peek()
1450 if ch in u'\r\n\x85':
1451 if self.prefix(2) == u'\r\n':
1452 self.forward(2)
1453 else:
1454 self.forward()
1455 return u'\n'
1456 elif ch in u'\u2028\u2029':
1457 self.forward()
1458 return ch
1459 return u''
1461 #try:
1462 # import psyco
1463 # psyco.bind(Scanner)
1464 #except ImportError:
1465 # pass