# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Defines the parser for MapReduce FileInputReader's file format string."""
import re
import tokenize

from google.appengine.ext.mapreduce import file_formats
def parse(format_string):
  """Parses format string.

  Args:
    format_string: format_string from MapReduce FileInputReader.

  Returns:
    a list of file_formats._FileFormat objects.

  Raises:
    ValueError: when format_string parsing fails because of invalid syntax
      or semantics.
  """
  tokenizer = _Tokenizer(format_string)
  return _Parser(tokenizer).formats
61 class _Parser(object):
62 """Parses a format string according to the following grammar.
64 In Python's modified BNF notation.
65 format_string ::= parameterized_format ( "[" parameterized_format "]" )*
66 parameterized_format ::= format [ format_parameters ]
67 format_parameters ::= "(" format_paramter ("," format_parameter )* ")"
68 format_parameter ::= format_specific_parameter "=" parameter_value
69 format ::= (<letter>|<number>)+
70 parameter_value ::= (<letter>|<number>|<punctuation>)+
71 format_specific_parameter ::= (<letter>|<number>)+
74 def __init__(self
, tokenizer
):
78 tokenizer: an instance of _Tokenizer.
81 ValueError: when parser couldn't consume all format_string.
84 self
._tokenizer
= tokenizer
85 self
._parse
_format
_string
()
86 if tokenizer
.remainder():
87 raise ValueError('Extra chars after index -%d' % tokenizer
.remainder())
89 def _add_format(self
, format_name
, kwargs
):
90 """Add a format to result list.
92 The format name will be resolved to its corresponding _FileFormat class.
93 kwargs will be passed to the class's __init___.
96 format_name: name of the parsed format in str.
97 kwargs: a dict containing key word arguments for the format.
100 ValueError: when format_name is not supported or the kwargs are not
101 supported by the format.
103 if format_name
not in file_formats
.FORMATS
:
104 raise ValueError('Invalid format %s.' % format_name
)
105 format_cls
= file_formats
.FORMATS
[format_name
]
107 if k
not in format_cls
.ARGUMENTS
:
108 raise ValueError('Invalid argument %s for format %s' %
110 self
.formats
.append(format_cls
.default_instance(**kwargs
))
112 def _parse_format_string(self
):
113 """Parses format_string."""
114 self
._parse
_parameterized
_format
()
115 if self
._tokenizer
.consume_if('['):
116 self
._parse
_format
_string
()
117 self
._tokenizer
.consume(']')
119 def _validate_string(self
, text
):
120 """Validates a string is composed of valid characters.
123 text: any str to validate.
126 ValueError: when text contains illegal characters.
128 if not re
.match(tokenize
.Name
, text
):
129 raise ValueError('%s should only contain ascii letters or digits.' %
132 def _parse_parameterized_format(self
):
133 """Parses parameterized_format."""
134 format_name
= self
._tokenizer
.next()
135 self
._validate
_string
(format_name
)
139 if self
._tokenizer
.consume_if('('):
140 arguments
= self
._parse
_format
_parameters
()
141 self
._tokenizer
.consume(')')
143 self
._add
_format
(format_name
, arguments
)
145 def _parse_format_parameters(self
):
146 """Parses format_parameters.
149 a dict of parameter names to their values for this format.
152 ValueError: when the format_parameters have illegal syntax or semantics.
156 while self
._tokenizer
.peek() not in ')]':
158 raise ValueError('Arguments should be separated by comma at index %d.'
159 % self
._tokenizer
.index
)
160 key
= self
._tokenizer
.next()
161 self
._validate
_string
(key
)
162 self
._tokenizer
.consume('=')
163 value
= self
._tokenizer
.next()
164 comma_exist
= self
._tokenizer
.consume_if(',')
166 raise ValueError('Argument %s defined more than once.' % key
)
167 arguments
[key
] = value
171 class _Tokenizer(object):
172 """Tokenizes a user supplied format string.
174 A token is either a special character or a group of characters between
175 two special characters or the beginning or the end of format string.
176 Escape character can be used to escape special characters and itself.
179 SPECIAL_CHARS
= '[]()=,'
182 def __init__(self
, format_string
):
186 format_string: user supplied format string for MapReduce InputReader.
189 self
._format
_string
= format_string
192 """Returns the next token with surrounding white spaces stripped.
194 This method does not advance underlying buffer.
197 the next token with surrounding whitespaces stripped.
199 return self
.next(advance
=False)
201 def next(self
, advance
=True):
202 """Returns the next token with surrounding white spaces stripped.
205 advance: boolean. True if underlying buffer should be advanced.
208 the next token with surrounding whitespaces stripped.
212 previous_index
= self
.index
213 while self
.remainder():
214 char
= self
._format
_string
[self
.index
]
215 if char
== self
.ESCAPE_CHAR
:
223 elif char
in self
.SPECIAL_CHARS
and not escaped
:
224 if not token
.strip():
234 self
.index
= previous_index
238 def consume(self
, expected_token
):
239 """Consumes the next token which must match expectation.
242 expected_token: the expected value of the next token.
245 ValueError: raised when the next token doesn't match expected_token.
248 if token
!= expected_token
:
249 raise ValueError('Expect "%s" but got "%s" at offset %d' %
250 (expected_token
, token
, self
.index
))
252 def consume_if(self
, token
):
253 """Consumes the next token when it matches expectation.
256 token: the expected next token.
259 True when next token matches the argument and is consumed.
262 if self
.peek() == token
:
268 """Returns the number of bytes left to be processed."""
269 return len(self
._format
_string
) - self
.index