# cvs2svn_rcsparse/texttools.py
# -*-python-*-
#
# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewVC
# distribution or at http://viewvc.org/license-1.html.
#
# For more information, visit http://viewvc.org/
#
# -----------------------------------------------------------------------

import string

# note: this will raise an ImportError if it isn't available. the rcsparse
# package will recognize this and switch over to the default parser.
from mx import TextTools

import common


# for convenience
_tt = TextTools

_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.')   # leave as part of 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')
_idchar = string.join(_idchar_list, '')
_idchar_set = _tt.set(_idchar)

_onechar_token_set = _tt.set(':;')

_not_at_set = _tt.invset('@')
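
# Illustrative note (not in the original file): with these sets, an RCS
# fragment like "symbols\n\trelease-1_0:1.3;" tokenizes as 'symbols',
# 'release-1_0', ':', '1.3', ';' -- identifier and revision-number
# characters come from _idchar_set, while ':' and ';' are emitted as
# standalone one-character tokens.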

_T_TOKEN        =  30
_T_STRING_START =  40
_T_STRING_SPAN  =  60
_T_STRING_END   =  70

_E_COMPLETE    = 100   # ended on a complete token
_E_TOKEN       = 110   # ended mid-token
_E_STRING_SPAN = 130   # ended within a string
_E_STRING_END  = 140   # ended with string-end ('@') (could be mid-@@)

_SUCCESS = +100

_EOF = 'EOF'
_CONTINUE = 'CONTINUE'
_UNUSED = 'UNUSED'

# continuation of a token over a chunk boundary
_c_token_table = (
  (_T_TOKEN, _tt.AllInSet, _idchar_set),
  )


class _mxTokenStream:

  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  # grab a good-sized chunk, but not too large to overwhelm memory.
  # note: we use a multiple of a standard block size
  CHUNK_SIZE  = 192 * 512  # about 100k

#  CHUNK_SIZE  = 5   # for debugging, make the function grind...

  def __init__(self, file):
    self.rcsfile = file
    self.tokens = [ ]
    self.partial = None

    self.string_end = None

  def _parse_chunk(self, buf, start=0):
    "Get the next token from the RCS file."

    buflen = len(buf)

    assert start < buflen

    # construct a tag table which refers to the buffer we need to parse.
    table = (
      #1: ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),

      #2: if we hit EOF here, the buffer ended on a complete token.
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #3: accumulate token text and exit, or move to the next rule.
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),

      #4: if we hit EOF here, the buffer ended mid-token; otherwise go
      # back to #1 for the next token.
      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),

      #5: single character tokens exit immediately, or move to the next rule
      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),

      #6: if we hit EOF here, the buffer ended on a complete token;
      # otherwise go back to #1 for the next token.
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),

      #7: if this isn't an '@' symbol, then we have a syntax error (go to a
      # negative index to indicate that condition). otherwise, suck it up
      # and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),

      #8: an '@' here is either the string terminator or half of an escaped
      # '@@': consume it and check the next character at #9. any other
      # character is string text: jump ahead to #12.
      (None, _tt.Is, '@', +4, +1),

      #9: a second '@' means an escaped '@@': record a slice holding one
      # '@' and go back to #8. otherwise fall through to #10.
      (buf, _tt.Is, '@', +1, -1),

      #10: log the (possible) end of the string.
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),

      #11: if we hit EOF here, the buffer ended on a string-end; otherwise
      # go back to #1 for the next token.
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),

      #12: if we hit EOF here, the buffer ended within a string.
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #13: suck up everything that isn't an AT. go to next rule to look for EOF
      (buf, _tt.AllInSet, _not_at_set, 0, +1),

      #14: go back to look for double AT if we aren't at the end of the string
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )

    # Fast, texttools may be, but it's somewhat lacking in clarity.
    # Here's an attempt to document the logic encoded in the table above:
    #
    # Flowchart:
    #                                  _____
    #                                 /     \
    #   1 --> 2 --> 3 --> 5 --> 7 --> 8 --> 9 --> 10 --> 11
    #   ^           |     |           ^\                 |
    #   |           4     6           | 12 --> 13 --> 14 |
    #   |           |     |           |               |  |
    #   |           |     |           +---------------+  |
    #   +-----------+-----+------------------------------+
    #
    # #1:  Skip over any whitespace.
    # #2:  If now EOF, exit with code _E_COMPLETE.
    # #3:  If we have a series of characters in _idchar_set, then:
    # #4:     Output them as a token, and go back to #1.
    # #5:  If we have a character in _onechar_token_set, then:
    # #6:     Output it as a token, and go back to #1.
    # #7:  If we do not have an '@', then error.
    #      If we do, then log a _T_STRING_START and continue.
    # #8:  If we have another '@', continue on to #9.  Otherwise:
    # #12:    If now EOF, exit with code _E_STRING_SPAN.
    # #13:    Record the slice up to the next '@' (or EOF).
    # #14:    If now EOF, exit with code _E_STRING_SPAN.
    #         Otherwise, go back to #8.
    # #9:  If we have another '@', then we've just seen an escaped
    #      (by doubling) '@' within an @-string.  Record a slice including
    #      just one '@' character, and jump back to #8.
    #      Otherwise, we've *either* seen the terminating '@' of an @-string,
    #      *or* we've seen one half of an escaped @@ sequence that just
    #      happened to be split over a chunk boundary - in either case,
    #      we continue on to #10.
    # #10: Log a _T_STRING_END.
    # #11: If now EOF, exit with _E_STRING_END.  Otherwise, go back to #1.
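    #
    # A worked example (added for clarity; not in the original comments):
    # parsing the chunk "head 1.2;\n@a@@b@ ;" appends to the taglist the
    # tokens 'head', '1.2' and ';', then _T_STRING_START, slices for 'a',
    # '@' and 'b', then _T_STRING_END, then ';' and finally _E_COMPLETE.
    # The loop below joins the pieces between the string markers into the
    # single string token 'a@b' (the doubled '@@' collapses to one '@').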

    success, taglist, idx = _tt.tag(buf, table, start)

    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    assert idx == buflen

    # pop off the last item
    last_which = taglist.pop()

    i = 0
    tlen = len(taglist)
    while i < tlen:
      if taglist[i] == _T_STRING_START:
        j = i + 1
        while j < tlen:
          if taglist[j] == _T_STRING_END:
            s = _tt.join(taglist, '', i+1, j)
            del taglist[i:j]
            tlen = len(taglist)
            taglist[i] = s
            break
          j = j + 1
        else:
          assert last_which == _E_STRING_SPAN
          s = _tt.join(taglist, '', i+1)
          del taglist[i:]
          self.partial = (_T_STRING_SPAN, [ s ])
          break
      i = i + 1

    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      pass
    elif last_which == _E_STRING_SPAN:
      assert self.partial
    else:
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])

    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist
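
    # Note (added for clarity; not in the original file): self.tokens is
    # kept in reverse order so that get() can pop() the next token off the
    # end of the list cheaply -- e.g. after parsing "head 1.2;" the list
    # is [';', '1.2', 'head'].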

  def _set_end(self, taglist, text, l, r, subtags):
    self.string_end = l

  def _handle_partial(self, buf):
    which, chunks = self.partial
    if which == _T_TOKEN:
      success, taglist, idx = _tt.tag(buf, _c_token_table)
      if not success:
        # The start of this buffer was not a token. So the end of the
        # prior buffer was a complete token.
        self.tokens.insert(0, string.join(chunks, ''))
      else:
        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
               and taglist[0][1] == 0 and taglist[0][2] == idx
        if idx == len(buf):
          # The whole buffer was one huge token, so we may have a
          # partial token again.
          #
          # Note: this modifies the list of chunks in self.partial
          chunks.append(buf)

          # consumed the whole buffer
          return len(buf)

        # got the rest of the token.
        chunks.append(buf[:idx])
        self.tokens.insert(0, string.join(chunks, ''))

      # no more partial token
      self.partial = None

      return idx

    if which == _T_STRING_END:
      if buf[0] != '@':
        self.tokens.insert(0, string.join(chunks, ''))
        return 0
      chunks.append('@')
      start = 1
    else:
      start = 0

    self.string_end = None
    string_table = (
      # look for a terminating (or escaped) '@'
      (None, _tt.Is, '@', +3, +1),

      # an '@@' pair is an escaped '@': append one '@' and keep scanning
      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),

      # a lone '@' ends the string: record where it ended
      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),

      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. move to next rule to look
      # for EOF
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),

      # go back to look for double AT if we aren't at the end of the string
      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
      )

    success, unused, idx = _tt.tag(buf, string_table,
                                   start, len(buf), chunks)

    # must have matched at least one item
    assert success

    if self.string_end is None:
      assert idx == len(buf)
      self.partial = (_T_STRING_SPAN, chunks)
    elif self.string_end < len(buf):
      self.partial = None
      self.tokens.insert(0, string.join(chunks, ''))
    else:
      self.partial = (_T_STRING_END, chunks)

    return idx
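
  # Illustrative note (not from the original file): if one chunk ends with
  # "...@some str" and the next begins "ing@ ...", _parse_chunk leaves
  # self.partial = (_T_STRING_SPAN, ['some str']) and _handle_partial above
  # stitches the pieces back together, emitting the single string token
  # 'some string'.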

  def _parse_more(self):
    buf = self.rcsfile.read(self.CHUNK_SIZE)
    if not buf:
      return _EOF

    if self.partial:
      idx = self._handle_partial(buf)
      if idx is None:
        return _CONTINUE
      if idx < len(buf):
        self._parse_chunk(buf, idx)
    else:
      self._parse_chunk(buf)

    return _CONTINUE

  def get(self):
    try:
      return self.tokens.pop()
    except IndexError:
      pass

    while not self.tokens:
      action = self._parse_more()
      if action == _EOF:
        return None

    return self.tokens.pop()

#  _get = get
#  def get(self):
#    token = self._get()
#    print 'T:', `token`
#    return token

  def match(self, match):
    if self.tokens:
      token = self.tokens.pop()
    else:
      token = self.get()

    if token != match:
      raise common.RCSExpected(token, match)

  def unget(self, token):
    self.tokens.append(token)

  def mget(self, count):
    "Return multiple tokens. 'next' is at the end."
    while len(self.tokens) < count:
      action = self._parse_more()
      if action == _EOF:
        ### fix this
        raise RuntimeError, 'EOF hit while expecting tokens'
    result = self.tokens[-count:]
    del self.tokens[-count:]
    return result


class Parser(common._Parser):
  stream_class = _mxTokenStream
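

# A minimal usage sketch (illustrative, not part of the original module).
# It assumes a working mx.TextTools installation and an RCS ',v' file on
# disk; MySink stands in for any common.Sink subclass the caller defines:
#
#   import common
#   import texttools
#
#   class MySink(common.Sink):
#     def set_head_revision(self, revision):
#       print 'head revision:', revision
#
#   texttools.Parser().parse(open('example,v', 'rb'), MySink())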