# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Defines the parser for MapReduce FileInputReader's file format string."""
import re
import tokenize

from google.appengine.ext.mapreduce import file_formats
def parse(format_string):
  """Parses format string.

  Args:
    format_string: format_string from MapReduce FileInputReader.

  Returns:
    a list of file_formats._FileFormat objects.

  Raises:
    ValueError: when format_string parsing fails because of invalid syntax
      or semantics.
  """
  tokenizer = _Tokenizer(format_string)
  return _Parser(tokenizer).formats
61 class _Parser(object):
62 """Parses a format string according to the following grammar.
64 In Python's modified BNF notation.
65 format_string ::= parameterized_format ( "[" parameterized_format "]" )*
66 parameterized_format ::= format [ format_parameters ]
67 format_parameters ::= "(" format_paramter ("," format_parameter )* ")"
68 format_parameter ::= format_specific_parameter "=" parameter_value
69 format ::= (<letter>|<number>)+
70 parameter_value ::= (<letter>|<number>|<punctuation>)+
71 format_specific_parameter ::= (<letter>|<number>)+
74 def __init__(self
, tokenizer
):
78 tokenizer: an instance of _Tokenizer.
81 ValueError: when parser couldn't consume all format_string.
84 self
._tokenizer
= tokenizer
85 self
._parse
_format
_string
()
86 if tokenizer
.remainder():
87 raise ValueError('Extra chars after index -%d' % tokenizer
.remainder())
89 def _add_format(self
, format_name
, kwargs
):
90 """Add a format to result list.
92 The format name will be resolved to its corresponding _FileFormat class.
93 kwargs will be passed to the class's __init___.
96 format_name: name of the parsed format in str.
97 kwargs: a dict containing key word arguments for the format.
100 ValueError: when format_name is not supported or the kwargs are not
101 supported by the format.
103 if format_name
not in file_formats
.FORMATS
:
104 raise ValueError('Invalid format %s.' % format_name
)
105 format_cls
= file_formats
.FORMATS
[format_name
]
107 if k
not in format_cls
.ARGUMENTS
:
108 raise ValueError('Invalid argument %s for format %s' %
110 self
.formats
.append(format_cls
.default_instance(**kwargs
))
112 def _parse_format_string(self
):
113 """Parses format_string."""
114 self
._parse
_parameterized
_format
()
115 if self
._tokenizer
.consume_if('['):
116 self
._parse
_format
_string
()
117 self
._tokenizer
.consume(']')
119 def _validate_string(self
, text
):
120 """Validates a string is composed of valid characters.
123 text: any str to validate.
126 ValueError: when text contains illegal characters.
128 if not re
.match(tokenize
.Name
, text
):
129 raise ValueError('%s should only contain ascii letters or digits.' %
132 def _parse_parameterized_format(self
):
133 """Parses parameterized_format."""
134 format_name
= self
._tokenizer
.next()
135 self
._validate
_string
(format_name
)
139 if self
._tokenizer
.consume_if('('):
140 arguments
= self
._parse
_format
_parameters
()
141 self
._tokenizer
.consume(')')
143 self
._add
_format
(format_name
, arguments
)
145 def _parse_format_parameters(self
):
146 """Parses format_parameters.
149 a dict of parameter names to their values for this format.
152 ValueError: when the format_parameters have illegal syntax or semantics.
156 while self
._tokenizer
.peek() not in ')]':
158 raise ValueError('Arguments should be separated by comma at index %d.'
159 % self
._tokenizer
.index
)
160 key
= self
._tokenizer
.next()
161 self
._validate
_string
(key
)
162 self
._tokenizer
.consume('=')
163 value
= self
._tokenizer
.next()
164 comma_exist
= self
._tokenizer
.consume_if(',')
166 raise ValueError('Argument %s defined more than once.' % key
)
167 arguments
[key
] = value
171 class _Tokenizer(object):
172 """Tokenizes a user supplied format string.
174 A token is either a special character or a group of characters between
175 two special characters or the beginning or the end of format string.
176 Escape character can be used to escape special characters and itself.
179 SPECIAL_CHARS
= '[]()=,'
182 def __init__(self
, format_string
):
186 format_string: user supplied format string for MapReduce InputReader.
189 self
._format
_string
= format_string
192 """Returns the next token with surrounding white spaces stripped.
194 This method does not advance underlying buffer.
197 the next token with surrounding whitespaces stripped.
199 return self
.next(advance
=False)
201 def next(self
, advance
=True):
202 """Returns the next token with surrounding white spaces stripped.
205 advance: boolean. True if underlying buffer should be advanced.
208 the next token with surrounding whitespaces stripped.
212 previous_index
= self
.index
213 while self
.remainder():
214 char
= self
._format
_string
[self
.index
]
215 if char
== self
.ESCAPE_CHAR
:
223 elif char
in self
.SPECIAL_CHARS
and not escaped
:
224 if not token
.strip():
234 self
.index
= previous_index
238 def consume(self
, expected_token
):
239 """Consumes the next token which must match expectation.
242 expected_token: the expected value of the next token.
245 ValueError: raised when the next token doesn't match expected_token.
248 if token
!= expected_token
:
249 raise ValueError('Expect "%s" but got "%s" at offset %d' %
250 (expected_token
, token
, self
.index
))
252 def consume_if(self
, token
):
253 """Consumes the next token when it matches expectation.
256 token: the expected next token.
259 True when next token matches the argument and is consumed.
262 if self
.peek() == token
:
268 """Returns the number of bytes left to be processed."""
269 return len(self
._format
_string
) - self
.index