App Engine Python SDK version 1.7.7
python/google/appengine/ext/mapreduce/file_format_parser.py
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
31 """Defines the parser for MapReduce FileInputReader's file format string."""
36 __all__ = ['parse']
38 import re
39 import tokenize
41 from google.appengine.ext.mapreduce import file_formats

def parse(format_string):
  """Parses format string.

  Args:
    format_string: format_string from MapReduce FileInputReader.

  Returns:
    a list of file_formats._FileFormat objects.

  Raises:
    ValueError: when format_string parsing fails because of invalid syntax
      or semantics.
  """
  tokenizer = _Tokenizer(format_string)
  return _Parser(tokenizer).formats
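
# A minimal usage sketch (format names are hypothetical; the real names are
# the keys of file_formats.FORMATS and may differ across SDK versions):
#
#   formats = parse('zip[lines]')
#   # formats is a list of file_formats._FileFormat instances, appended
#   # outermost format first (here: the zip format, then the lines format).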


class _Parser(object):
  """Parses a format string according to the following grammar.

  In Python's modified BNF notation:

    format_string ::= parameterized_format ( "[" parameterized_format "]" )*
    parameterized_format ::= format [ format_parameters ]
    format_parameters ::= "(" format_parameter ("," format_parameter )* ")"
    format_parameter ::= format_specific_parameter "=" parameter_value
    format ::= (<letter>|<number>)+
    parameter_value ::= (<letter>|<number>|<punctuation>)+
    format_specific_parameter ::= (<letter>|<number>)+
  """

  def __init__(self, tokenizer):
    """Initialize.

    Args:
      tokenizer: an instance of _Tokenizer.

    Raises:
      ValueError: when the parser couldn't consume the entire format_string.
    """
    self.formats = []
    self._tokenizer = tokenizer
    self._parse_format_string()
    if tokenizer.remainder():
      raise ValueError('Extra chars after index -%d' % tokenizer.remainder())

  def _add_format(self, format_name, kwargs):
    """Adds a format to the result list.

    The format name will be resolved to its corresponding _FileFormat class.
    kwargs will be passed to the class's __init__.

    Args:
      format_name: name of the parsed format in str.
      kwargs: a dict containing keyword arguments for the format.

    Raises:
      ValueError: when format_name is not supported or the kwargs are not
        supported by the format.
    """
    if format_name not in file_formats.FORMATS:
      raise ValueError('Invalid format %s.' % format_name)
    format_cls = file_formats.FORMATS[format_name]
    for k in kwargs:
      if k not in format_cls.ARGUMENTS:
        raise ValueError('Invalid argument %s for format %s' %
                         (k, format_name))
    self.formats.append(format_cls.default_instance(**kwargs))

  def _parse_format_string(self):
    """Parses format_string."""
    self._parse_parameterized_format()
    if self._tokenizer.consume_if('['):
      self._parse_format_string()
      self._tokenizer.consume(']')
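
  # Bracket nesting recurses: 'a[b[c]]' (hypothetical names) parses as
  # format 'a', "[", the nested format_string 'b[c]', and "]", appending
  # three formats to self.formats, outermost first.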

  def _validate_string(self, text):
    """Validates that a string is composed of valid characters.

    Args:
      text: any str to validate.

    Raises:
      ValueError: when text contains illegal characters.
    """
    if not re.match(tokenize.Name, text):
      raise ValueError('%s should only contain ascii letters or digits.' %
                       text)

  def _parse_parameterized_format(self):
    """Parses parameterized_format."""
    format_name = self._tokenizer.next()
    self._validate_string(format_name)

    arguments = {}

    if self._tokenizer.consume_if('('):
      arguments = self._parse_format_parameters()
      self._tokenizer.consume(')')

    self._add_format(format_name, arguments)

  def _parse_format_parameters(self):
    """Parses format_parameters.

    Returns:
      a dict of parameter names to their values for this format.

    Raises:
      ValueError: when the format_parameters have illegal syntax or semantics.
    """
    arguments = {}
    comma_exist = True
    while self._tokenizer.peek() not in ')]':
      if not comma_exist:
        raise ValueError('Arguments should be separated by comma at index %d.'
                         % self._tokenizer.index)
      key = self._tokenizer.next()
      self._validate_string(key)
      self._tokenizer.consume('=')
      value = self._tokenizer.next()
      comma_exist = self._tokenizer.consume_if(',')
      if key in arguments:
        raise ValueError('Argument %s defined more than once.' % key)
      arguments[key] = value
    return arguments
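
  # For instance, when the remaining input is 'encoding=utf8,separator=.)'
  # this method returns {'encoding': 'utf8', 'separator': '.'} and leaves
  # the ')' unconsumed (parameter names are hypothetical; values always
  # stay raw strings).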


class _Tokenizer(object):
  """Tokenizes a user supplied format string.

  A token is either a special character or a group of characters between
  two special characters or the beginning or the end of format string.
  The escape character can be used to escape special characters and itself.
  """

  SPECIAL_CHARS = '[]()=,'
  ESCAPE_CHAR = '\\'
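
  # Escaping sketch: in the raw format string csv(delimiter=\,) the
  # backslash escapes the comma, so the parameter value becomes ','
  # rather than acting as an argument separator ('csv' and 'delimiter'
  # are hypothetical names).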

  def __init__(self, format_string):
    """Initialize.

    Args:
      format_string: user supplied format string for MapReduce InputReader.
    """
    self.index = 0
    self._format_string = format_string

  def peek(self):
    """Returns the next token with surrounding whitespace stripped.

    This method does not advance the underlying buffer.

    Returns:
      the next token with surrounding whitespace stripped.
    """
    return self.next(advance=False)

  def next(self, advance=True):
    """Returns the next token with surrounding whitespace stripped.

    Args:
      advance: boolean. True if the underlying buffer should be advanced.

    Returns:
      the next token with surrounding whitespace stripped.
    """
    escaped = False
    token = ''
    previous_index = self.index
    while self.remainder():
      char = self._format_string[self.index]
      if char == self.ESCAPE_CHAR:
        if escaped:
          # An escaped escape character is a literal escape character.
          token += char
          self.index += 1
          escaped = False
        else:
          self.index += 1
          escaped = True
      elif char in self.SPECIAL_CHARS and not escaped:
        if not token.strip():
          # No pending token: the special character is itself the token.
          self.index += 1
          token += char
        break
      else:
        escaped = False
        self.index += 1
        token += char

    if not advance:
      self.index = previous_index

    return token.strip()
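
  # E.g. successive next() calls on 'zip[lines]' return 'zip', '[',
  # 'lines', ']' in turn: a special character terminates the pending
  # token, or becomes a token by itself when no other characters
  # precede it.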

  def consume(self, expected_token):
    """Consumes the next token, which must match the expectation.

    Args:
      expected_token: the expected value of the next token.

    Raises:
      ValueError: raised when the next token doesn't match expected_token.
    """
    token = self.next()
    if token != expected_token:
      raise ValueError('Expect "%s" but got "%s" at offset %d' %
                       (expected_token, token, self.index))

  def consume_if(self, token):
    """Consumes the next token when it matches the expectation.

    Args:
      token: the expected next token.

    Returns:
      True when the next token matches the argument and is consumed.
      False otherwise.
    """
    if self.peek() == token:
      self.consume(token)
      return True
    return False

  def remainder(self):
    """Returns the number of bytes left to be processed."""
    return len(self._format_string) - self.index
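
# End-to-end sketch (hypothetical names): parse('zip[lines(encoding=utf8)]')
# tokenizes to 'zip', '[', 'lines', '(', 'encoding', '=', 'utf8', ')', ']'
# and, assuming both formats and the 'encoding' argument are registered in
# file_formats.FORMATS, yields the zip format followed by the lines format.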