cvs2svn_rcsparse/texttools.py

   1 # -*-python-*-
   2 #
   3 # Copyright (C) 1999-2008 The ViewCVS Group. All Rights Reserved.
   4 #
   5 # By using this file, you agree to the terms and conditions set forth in
   6 # the LICENSE.html file which can be found at the top level of the ViewVC
   7 # distribution or at http://viewvc.org/license-1.html.
   8 #
   9 # For more information, visit http://viewvc.org/
  10 #
  11 # -----------------------------------------------------------------------
  12
  13 import string
  14
  15 # note: this will raise an ImportError if it isn't available. the rcsparse
  16 # package will recognize this and switch over to the default parser.
  17 from mx import TextTools
  18
  19 import common
  20
  21
  22 # for convenience
  23 _tt = TextTools
  24
  25 _idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
  26 _idchar_list.remove('$')
  27 _idchar_list.remove(',')
  28 #_idchar_list.remove('.')   # leave as part of 'num' symbol
  29 _idchar_list.remove(':')
  30 _idchar_list.remove(';')
  31 _idchar_list.remove('@')
  32 _idchar = string.join(_idchar_list, '')
  33 _idchar_set = _tt.set(_idchar)
  34
# characters that form a complete token all by themselves
_onechar_token_set = _tt.set(':;')

# every character except '@' (the delimiter of RCS @-strings)
_not_at_set = _tt.invset('@')

# Markers describing what kind of item is being assembled. _T_STRING_START
# and _T_STRING_END are appended to the tag list by the tag table in
# _mxTokenStream._parse_chunk(); all but _T_STRING_START are also stored as
# the first element of _mxTokenStream.partial to describe what was left
# unfinished at a chunk boundary.
_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70

# Exit codes: the tag table in _parse_chunk() appends exactly one of these
# as the final tag-list item, recording the state in which the chunk ended.
_E_COMPLETE = 100       # ended on a complete token
_E_TOKEN = 110          # ended mid-token
_E_STRING_SPAN = 130    # ended within a string
_E_STRING_END = 140     # ended with string-end ('@') (could be mid-@@)

# Jump offset used by tag-table entries that should terminate the scan.
# NOTE(review): presumably this works because it jumps past the end of the
# table, which mxTextTools treats as a successful match -- confirm against
# the mxTextTools documentation.
_SUCCESS = +100

# return values of _mxTokenStream._parse_more()
_EOF = 'EOF'
_CONTINUE = 'CONTINUE'

# placeholder tag object for table entries that use AppendMatch
# (presumably only the matched text is recorded there, not the tag object)
_UNUSED = 'UNUSED'
  54
  55
# continuation of a token over a chunk boundary
#
# A one-rule tag table: match a run of identifier characters at the start
# of the new chunk and tag it with _T_TOKEN. _mxTokenStream._handle_partial()
# uses it to find where a token that was cut off by the previous chunk
# actually ends.
_c_token_table = (
  (_T_TOKEN,      _tt.AllInSet, _idchar_set),
  )
  60
  61 class _mxTokenStream:
  62
  63   # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  64   # grab a good-sized chunk, but not too large to overwhelm memory.
  65   # note: we use a multiple of a standard block size
  66   CHUNK_SIZE  = 192 * 512  # about 100k
  67
  68 # CHUNK_SIZE  = 5   # for debugging, make the function grind...
  69
  70   def __init__(self, file):
  71     self.rcsfile = file
  72     self.tokens = [ ]
  73     self.partial = None
  74
  75     self.string_end = None
  76
  def _parse_chunk(self, buf, start=0):
    """Tokenize BUF from offset START and queue the resulting tokens.

    The whole remainder of BUF is scanned in a single mxTextTools pass.
    Complete tokens and complete @-strings are added to self.tokens
    (behind any tokens that are already queued). If BUF ends in the
    middle of a token or an @-string, the unfinished piece is stored in
    self.partial so that _handle_partial() can complete it using the
    next chunk.

    START must be less than len(BUF).

    Raises common.RCSIllegalCharacter if the tag table fails to match.
    """

    buflen = len(buf)

    assert start < buflen

    # construct a tag table which refers to the buffer we need to parse.
    # (entries #9 and #13 use BUF itself as their tag object, which is why
    # the table has to be rebuilt for every chunk.)
    #
    # NOTE: the jump offsets below are relative to each entry's position;
    # adding, removing or reordering entries requires re-deriving them.
    table = (
      #1: ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),

      #2: at end of buffer, exit with _E_COMPLETE; otherwise try rule #3.
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #3: accumulate token text and exit, or move to the next rule.
      (_UNUSED,      _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),

      #4: token ran up to the end of buffer: exit with _E_TOKEN (it may
      # continue in the next chunk); otherwise go back to rule #1.
      (_E_TOKEN,  _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),

      #5: single character tokens exit immediately, or move to the next rule
      (_UNUSED,    _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),

      #6: at end of buffer, exit with _E_COMPLETE; otherwise back to rule #1.
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),

      #7: if this isn't an '@' symbol, then we have a syntax error (go to a
      # negative index to indicate that condition). otherwise, suck it up
      # and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),

      #8: an '@' inside a string: go on to rule #9. anything else: rule #12.
      (None, _tt.Is, '@', +4, +1),
      #9: a second '@' is an escaped '@': record it and go back to rule #8.
      # otherwise fall through to rule #10.
      (buf, _tt.Is, '@', +1, -1),
      #10: log the end of the string.
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
      #11: at end of buffer, exit with _E_STRING_END; otherwise back to #1.
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),

      #12: at end of buffer (still inside the string), exit with
      # _E_STRING_SPAN; otherwise move to rule #13.
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #13: suck up everything that isn't an AT. go to next rule to look for EOF
      (buf,  _tt.AllInSet, _not_at_set, 0, +1),

      #14: go back to look for double AT if we aren't at the end of the string
      (_E_STRING_SPAN,   _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )

    # Fast, texttools may be, but it's somewhat lacking in clarity.
    # Here's an attempt to document the logic encoded in the table above:
    #
    # Flowchart:
    #                                   _____
    #                                  /    /\
    # 1 -> 2 ->  3 ->  5 ->  7 ->     8  ->  9 -> 10 -> 11
    # |         \/    \/           \/  /\               \/
    #  \         4     6          12    14              /
    #   \_______/_____/            \    /              /
    #    \                           13               /
    #     \__________________________________________/
    #
    # #1: Skip over any whitespace.
    # #2: If now EOF, exit with code _E_COMPLETE.
    # #3: If we have a series of characters in _idchar_set, then:
    #     #4: Output them as a token, and go back to #1.
    # #5: If we have a character in _onechar_token_set, then:
    #     #6: Output it as a token, and go back to #1.
    # #7: If we do not have an '@', then error.
    #     If we do, then log a _T_STRING_START and continue.
    # #8: If we have another '@', continue on to #9. Otherwise:
    #     #12: If now EOF, exit with code _E_STRING_SPAN.
    #     #13: Record the slice up to the next '@' (or EOF).
    #     #14: If now EOF, exit with code _E_STRING_SPAN.
    #          Otherwise, go back to #8.
    # #9: If we have another '@', then we've just seen an escaped
    #     (by doubling) '@' within an @-string.  Record a slice including
    #     just one '@' character, and jump back to #8.
    #     Otherwise, we've *either* seen the terminating '@' of an @-string,
    #     *or* we've seen one half of an escaped @@ sequence that just
    #     happened to be split over a chunk boundary - in either case,
    #     we continue on to #10.
    # #10: Log a _T_STRING_END.
    # #11: If now EOF, exit with _E_STRING_END. Otherwise, go back to #1.

    success, taglist, idx = _tt.tag(buf, table, start)

    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    # every successful exit from the table is taken at end of buffer
    assert idx == buflen

    # pop off the last item
    # (this is the _E_* exit code telling us how the chunk ended)
    last_which = taglist.pop()

    # Collapse each _T_STRING_START ... _T_STRING_END run of the tag list
    # into a single string item holding the text of that @-string.
    i = 0
    tlen = len(taglist)
    while i < tlen:
      if taglist[i] == _T_STRING_START:
        j = i + 1
        while j < tlen:
          if taglist[j] == _T_STRING_END:
            s = _tt.join(taglist, '', i+1, j)
            # removing items i..j-1 leaves the _T_STRING_END marker at
            # index i; it is then overwritten with the joined string.
            del taglist[i:j]
            tlen = len(taglist)
            taglist[i] = s
            break
          j = j + 1
        else:
          # no _T_STRING_END found: the string runs past the end of this
          # chunk. remember what we have so far and drop it from the list.
          assert last_which == _E_STRING_SPAN
          s = _tt.join(taglist, '', i+1)
          del taglist[i:]
          self.partial = (_T_STRING_SPAN, [ s ])
          break
      i = i + 1

    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      # the final token touched the end of the buffer; it might continue
      # in the next chunk, so hold it back.
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      pass
    elif last_which == _E_STRING_SPAN:
      # already recorded by the loop above
      assert self.partial
    else:
      # the final '@' may be the string terminator or the first half of
      # an '@@' split across chunks; hold the string back until we know.
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])

    # self.tokens is consumed from its end, so store the new tokens in
    # reverse order, placed so that the previously queued tokens are
    # still handed out first.
    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist
 209
 210   def _set_end(self, taglist, text, l, r, subtags):
 211     self.string_end = l
 212
 213   def _handle_partial(self, buf):
 214     which, chunks = self.partial
 215     if which == _T_TOKEN:
 216       success, taglist, idx = _tt.tag(buf, _c_token_table)
 217       if not success:
 218         # The start of this buffer was not a token. So the end of the
 219         # prior buffer was a complete token.
 220         self.tokens.insert(0, string.join(chunks, ''))
 221       else:
 222         assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
 223                and taglist[0][1] == 0 and taglist[0][2] == idx
 224         if idx == len(buf):
 225           #
 226           # The whole buffer was one huge token, so we may have a
 227           # partial token again.
 228           #
 229           # Note: this modifies the list of chunks in self.partial
 230           #
 231           chunks.append(buf)
 232
 233           # consumed the whole buffer
 234           return len(buf)
 235
 236         # got the rest of the token.
 237         chunks.append(buf[:idx])
 238         self.tokens.insert(0, string.join(chunks, ''))
 239
 240       # no more partial token
 241       self.partial = None
 242
 243       return idx
 244
 245     if which == _T_STRING_END:
 246       if buf[0] != '@':
 247         self.tokens.insert(0, string.join(chunks, ''))
 248         return 0
 249       chunks.append('@')
 250       start = 1
 251     else:
 252       start = 0
 253
 254     self.string_end = None
 255     string_table = (
 256       (None,    _tt.Is, '@', +3, +1),
 257       (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
 258       (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),
 259
 260       (None,    _tt.EOF, _tt.Here, +1, _SUCCESS),
 261
 262       # suck up everything that isn't an AT. move to next rule to look
 263       # for EOF
 264       (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),
 265
 266       # go back to look for double AT if we aren't at the end of the string
 267       (None,    _tt.EOF, _tt.Here, -5, _SUCCESS),
 268       )
 269
 270     success, unused, idx = _tt.tag(buf, string_table,
 271                                    start, len(buf), chunks)
 272
 273     # must have matched at least one item
 274     assert success
 275
 276     if self.string_end is None:
 277       assert idx == len(buf)
 278       self.partial = (_T_STRING_SPAN, chunks)
 279     elif self.string_end < len(buf):
 280       self.partial = None
 281       self.tokens.insert(0, string.join(chunks, ''))
 282     else:
 283       self.partial = (_T_STRING_END, chunks)
 284
 285     return idx
 286
 287   def _parse_more(self):
 288     buf = self.rcsfile.read(self.CHUNK_SIZE)
 289     if not buf:
 290       return _EOF
 291
 292     if self.partial:
 293       idx = self._handle_partial(buf)
 294       if idx is None:
 295         return _CONTINUE
 296       if idx < len(buf):
 297         self._parse_chunk(buf, idx)
 298     else:
 299       self._parse_chunk(buf)
 300
 301     return _CONTINUE
 302
 303   def get(self):
 304     try:
 305       return self.tokens.pop()
 306     except IndexError:
 307       pass
 308
 309     while not self.tokens:
 310       action = self._parse_more()
 311       if action == _EOF:
 312         return None
 313
 314     return self.tokens.pop()
 315
 316
 317 #  _get = get
 318 #  def get(self):
 319     token = self._get()
 320     print 'T:', `token`
 321     return token
 322
 323   def match(self, match):
 324     if self.tokens:
 325       token = self.tokens.pop()
 326     else:
 327       token = self.get()
 328
 329     if token != match:
 330       raise common.RCSExpected(token, match)
 331
 332   def unget(self, token):
 333     self.tokens.append(token)
 334
 335   def mget(self, count):
 336     "Return multiple tokens. 'next' is at the end."
 337     while len(self.tokens) < count:
 338       action = self._parse_more()
 339       if action == _EOF:
 340         ### fix this
 341         raise RuntimeError, 'EOF hit while expecting tokens'
 342     result = self.tokens[-count:]
 343     del self.tokens[-count:]
 344     return result
 345
 346
class Parser(common._Parser):
  # RCS parser that tokenizes its input with the mxTextTools-based
  # _mxTokenStream defined in this module.
  stream_class = _mxTokenStream