# cvs2svn_rcsparse/texttools.py
#
# Copyright (C) 2000-2002 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewCVS
# distribution or at http://viewcvs.sourceforge.net/license-1.html.
#
# Contact information:
#   Greg Stein, PO Box 760, Palo Alto, CA, 94302
#   gstein@lyra.org, http://viewcvs.sourceforge.net/
#
# -----------------------------------------------------------------------
#
# This software is being maintained as part of the ViewCVS project.
# Information is available at:
#    http://viewcvs.sourceforge.net/
#
# -----------------------------------------------------------------------

import string

# note: this will raise an ImportError if it isn't available. the rcsparse
# package will recognize this and switch over to the default parser.
from mx import TextTools

import common
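
# A sketch of the fallback mentioned above, for context only: the dispatch
# lives in the rcsparse package __init__, which presumably looks something
# like this (assumed, not part of this module):
#
#   try:
#     from texttools import Parser
#   except ImportError:
#     from default import Parser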


# for convenience
_tt = TextTools

_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.')   leave as part of 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')
_idchar = string.join(_idchar_list, '')
_idchar_set = _tt.set(_idchar)
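
# (In other words: an "id" character is any printable byte except the
# delimiters '$', ',', ':', ';', and '@'; '.' is deliberately kept so that
# revision numbers such as "1.2.3.4" lex as a single token. This reading of
# the RCS grammar is inferred from the removals above.)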

_onechar_token_set = _tt.set(':;')

_not_at_set = _tt.invset('@')
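
# Markers used as tagobj values in the tag tables below: the _T_* constants
# tag pieces of a taglist produced by _tt.tag(), and one _E_* constant is
# appended as the final taglist entry to record how the chunk ended (these
# descriptions are inferred from the code below).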

_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70

_E_COMPLETE = 100     # ended on a complete token
_E_TOKEN = 110        # ended mid-token
_E_STRING_SPAN = 130  # ended within a string
_E_STRING_END = 140   # ended with string-end ('@') (could be mid-@@)

_SUCCESS = +100

_EOF = 'EOF'
_CONTINUE = 'CONTINUE'
_UNUSED = 'UNUSED'


# continuation of a token over a chunk boundary
_c_token_table = (
  (_T_TOKEN, _tt.AllInSet, _idchar_set),
  )

class _mxTokenStream:

  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  # grab a good-sized chunk, but not too large to overwhelm memory.
  # note: we use a multiple of a standard block size
  CHUNK_SIZE = 192 * 512  # about 100k

#  CHUNK_SIZE = 5  # for debugging, make the function grind...

  def __init__(self, file):
    self.rcsfile = file
    self.tokens = [ ]
    self.partial = None

    self.string_end = None
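
  # Bookkeeping (inferred from the methods below): 'tokens' holds parsed
  # tokens in reverse order, so get() pops the next token off the end;
  # 'partial' is a (token-type, [text chunks]) pair for a token or string
  # that straddles a chunk boundary; 'string_end' is set by _set_end() when
  # a string's closing '@' is found.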

  def _parse_chunk(self, buf, start=0):
    "Get the next token from the RCS file."

    buflen = len(buf)

    assert start < buflen

    # construct a tag table which refers to the buffer we need to parse.
    table = (
      # ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),

      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      # accumulate token text and exit, or move to the next rule.
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),

      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),

      # single character tokens exit immediately, or move to the next rule
      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),

      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),

      # if this isn't an '@' symbol, then we have a syntax error (go to a
      # negative index to indicate that condition). otherwise, suck it up
      # and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),

      (None, _tt.Is, '@', +4, +1),
      (buf, _tt.Is, '@', +1, -1),
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),

      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. go to next rule to look for EOF
      (buf, _tt.AllInSet, _not_at_set, 0, +1),

      # go back to look for double AT if we aren't at the end of the string
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )

    success, taglist, idx = _tt.tag(buf, table, start)

    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    assert idx == buflen

    # pop off the last item
    last_which = taglist.pop()

    i = 0
    tlen = len(taglist)
    while i < tlen:
      if taglist[i] == _T_STRING_START:
        j = i + 1
        while j < tlen:
          if taglist[j] == _T_STRING_END:
            s = _tt.join(taglist, '', i+1, j)
            del taglist[i:j]
            tlen = len(taglist)
            taglist[i] = s
            break
          j = j + 1
        else:
          assert last_which == _E_STRING_SPAN
          s = _tt.join(taglist, '', i+1)
          del taglist[i:]
          self.partial = (_T_STRING_SPAN, [ s ])
          break
      i = i + 1

    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      pass
    elif last_which == _E_STRING_SPAN:
      assert self.partial
    else:
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])

    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist
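
  # A worked illustration (assumed mx.TextTools behavior, shown only to make
  # the post-processing above concrete): tagging a chunk like "head\t1.1;\n"
  # should leave ['head', '1.1', ';', _E_COMPLETE] in the taglist. The pop()
  # strips the end-state; an '@'-quoted string arrives as _T_STRING_START,
  # matched fragments, then _T_STRING_END, which the loop rejoins via
  # _tt.join() into a single token.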

  def _set_end(self, taglist, text, l, r, subtags):
    self.string_end = l

  def _handle_partial(self, buf):
    which, chunks = self.partial
    if which == _T_TOKEN:
      success, taglist, idx = _tt.tag(buf, _c_token_table)
      if not success:
        # The start of this buffer was not a token. So the end of the
        # prior buffer was a complete token.
        self.tokens.insert(0, string.join(chunks, ''))
      else:
        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
               and taglist[0][1] == 0 and taglist[0][2] == idx
        if idx == len(buf):

          # The whole buffer was one huge token, so we may have a
          # partial token again.

          # Note: this modifies the list of chunks in self.partial

          chunks.append(buf)

          # consumed the whole buffer
          return len(buf)

        # got the rest of the token.
        chunks.append(buf[:idx])
        self.tokens.insert(0, string.join(chunks, ''))

      # no more partial token
      self.partial = None

      return idx

    if which == _T_STRING_END:
      if buf[0] != '@':
        self.tokens.insert(0, string.join(chunks, ''))
        return 0
      chunks.append('@')
      start = 1
    else:
      start = 0

    self.string_end = None
    string_table = (
      (None, _tt.Is, '@', +3, +1),
      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),

      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. move to next rule to look
      # for EOF
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),

      # go back to look for double AT if we aren't at the end of the string
      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
      )

    success, unused, idx = _tt.tag(buf, string_table,
                                   start, len(buf), chunks)

    # must have matched at least one item
    assert success

    if self.string_end is None:
      assert idx == len(buf)
      self.partial = (_T_STRING_SPAN, chunks)
    elif self.string_end < len(buf):
      self.partial = None
      self.tokens.insert(0, string.join(chunks, ''))
    else:
      self.partial = (_T_STRING_END, chunks)

    return idx

  def _parse_more(self):
    buf = self.rcsfile.read(self.CHUNK_SIZE)
    if not buf:
      return _EOF

    if self.partial:
      idx = self._handle_partial(buf)
      if idx is None:
        return _CONTINUE
      if idx < len(buf):
        self._parse_chunk(buf, idx)
    else:
      self._parse_chunk(buf)

    return _CONTINUE

  def get(self):
    try:
      return self.tokens.pop()
    except IndexError:
      pass

    while not self.tokens:
      action = self._parse_more()
      if action == _EOF:
        return None

    return self.tokens.pop()


#  _get = get
#  def get(self):
#    token = self._get()
#    print 'T:', `token`
#    return token

  def match(self, match):
    if self.tokens:
      token = self.tokens.pop()
    else:
      token = self.get()

    if token != match:
      raise RuntimeError, ('Unexpected parsing error in RCS file.\n'
                           'Expected token: %s, but saw: %s'
                           % (match, token))

  def unget(self, token):
    self.tokens.append(token)

  def mget(self, count):
    "Return multiple tokens. 'next' is at the end."
    while len(self.tokens) < count:
      action = self._parse_more()
      if action == _EOF:
        ### fix this
        raise RuntimeError, 'EOF hit while expecting tokens'
    result = self.tokens[-count:]
    del self.tokens[-count:]
    return result
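
#  Usage sketch for the raw token stream (hypothetical; normally only the
#  Parser class below drives it):
#
#    ts = _mxTokenStream(open('file,v', 'rb'))
#    while 1:
#      token = ts.get()
#      if token is None:
#        break
#      print token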


class Parser(common._Parser):
  stream_class = _mxTokenStream
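
# A minimal usage sketch, assuming the common._Parser.parse(file, sink)
# interface and the Sink base class from common.py (names assumed; see
# common.py for the real contract):
#
#   import sys
#   class DumpSink(common.Sink):          # hypothetical sink subclass
#     def set_head_revision(self, revision):
#       print 'head:', revision
#   Parser().parse(open(sys.argv[1], 'rb'), DumpSink())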