# cvs2svn_rcsparse/texttools.py
#
# Copyright (C) 2000-2002 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewCVS
# distribution or at http://viewcvs.sourceforge.net/license-1.html.
#
# Contact information:
#   Greg Stein, PO Box 760, Palo Alto, CA, 94302
#   gstein@lyra.org, http://viewcvs.sourceforge.net/
#
# -----------------------------------------------------------------------
#
# This software is being maintained as part of the ViewCVS project.
# Information is available at:
#    http://viewcvs.sourceforge.net/
#
# -----------------------------------------------------------------------

import string

# note: this will raise an ImportError if it isn't available. the rcsparse
# package will recognize this and switch over to the default parser.
from mx import TextTools

import common
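
# A sketch of the fallback mentioned above, for context only: the dispatch
# lives in the rcsparse package __init__, which presumably looks something
# like this (assumed, not part of this module):
#
#   try:
#     from texttools import Parser
#   except ImportError:
#     from default import Parser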


# for convenience
_tt = TextTools

_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.')   leave as part of 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')
_idchar = string.join(_idchar_list, '')
_idchar_set = _tt.set(_idchar)
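
# (In other words: an "id" character is any printable byte except the
# delimiters '$', ',', ':', ';', and '@'; '.' is deliberately kept so that
# revision numbers such as "1.2.3.4" lex as a single token. This reading of
# the RCS grammar is inferred from the removals above.)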

_onechar_token_set = _tt.set(':;')

_not_at_set = _tt.invset('@')
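
# Markers used as tagobj values in the tag tables below: the _T_* constants
# tag pieces of a taglist produced by _tt.tag(), and one _E_* constant is
# appended as the final taglist entry to record how the chunk ended (these
# descriptions are inferred from the code below).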

_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70

_E_COMPLETE = 100     # ended on a complete token
_E_TOKEN = 110        # ended mid-token
_E_STRING_SPAN = 130  # ended within a string
_E_STRING_END = 140   # ended with string-end ('@') (could be mid-@@)

_SUCCESS = +100

_EOF = 'EOF'
_CONTINUE = 'CONTINUE'
_UNUSED = 'UNUSED'


# continuation of a token over a chunk boundary
_c_token_table = (
  (_T_TOKEN, _tt.AllInSet, _idchar_set),
  )

class _mxTokenStream:

  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  # grab a good-sized chunk, but not too large to overwhelm memory.
  # note: we use a multiple of a standard block size
  CHUNK_SIZE = 192 * 512  # about 100k

#  CHUNK_SIZE = 5  # for debugging, make the function grind...

  def __init__(self, file):
    self.rcsfile = file
    self.tokens = [ ]
    self.partial = None

    self.string_end = None
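
  # Bookkeeping (inferred from the methods below): 'tokens' holds parsed
  # tokens in reverse order, so get() pops the next token off the end;
  # 'partial' is a (token-type, [text chunks]) pair for a token or string
  # that straddles a chunk boundary; 'string_end' is set by _set_end() when
  # a string's closing '@' is found.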

  def _parse_chunk(self, buf, start=0):
    "Get the next token from the RCS file."

    buflen = len(buf)

    assert start < buflen

    # construct a tag table which refers to the buffer we need to parse.
    table = (
      # ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),

      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      # accumulate token text and exit, or move to the next rule.
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),

      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),

      # single character tokens exit immediately, or move to the next rule
      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),

      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),

      # if this isn't an '@' symbol, then we have a syntax error (go to a
      # negative index to indicate that condition). otherwise, suck it up
      # and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),

      (None, _tt.Is, '@', +4, +1),
      (buf, _tt.Is, '@', +1, -1),
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),

      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. go to next rule to look for EOF
      (buf, _tt.AllInSet, _not_at_set, 0, +1),

      # go back to look for double AT if we aren't at the end of the string
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )

    success, taglist, idx = _tt.tag(buf, table, start)

    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    assert idx == buflen

    # pop off the last item
    last_which = taglist.pop()

    i = 0
    tlen = len(taglist)
    while i < tlen:
      if taglist[i] == _T_STRING_START:
        j = i + 1
        while j < tlen:
          if taglist[j] == _T_STRING_END:
            s = _tt.join(taglist, '', i+1, j)
            del taglist[i:j]
            tlen = len(taglist)
            taglist[i] = s
            break
          j = j + 1
        else:
          assert last_which == _E_STRING_SPAN
          s = _tt.join(taglist, '', i+1)
          del taglist[i:]
          self.partial = (_T_STRING_SPAN, [ s ])
          break
      i = i + 1

    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      pass
    elif last_which == _E_STRING_SPAN:
      assert self.partial
    else:
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])

    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist
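
  # A worked illustration (assumed mx.TextTools behavior, shown only to make
  # the post-processing above concrete): tagging a chunk like "head\t1.1;\n"
  # should leave ['head', '1.1', ';', _E_COMPLETE] in the taglist. The pop()
  # strips the end-state; an '@'-quoted string arrives as _T_STRING_START,
  # matched fragments, then _T_STRING_END, which the loop rejoins via
  # _tt.join() into a single token.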

  def _set_end(self, taglist, text, l, r, subtags):
    self.string_end = l

  def _handle_partial(self, buf):
    which, chunks = self.partial
    if which == _T_TOKEN:
      success, taglist, idx = _tt.tag(buf, _c_token_table)
      if not success:
        # The start of this buffer was not a token. So the end of the
        # prior buffer was a complete token.
        self.tokens.insert(0, string.join(chunks, ''))
      else:
        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
               and taglist[0][1] == 0 and taglist[0][2] == idx
        if idx == len(buf):

          # The whole buffer was one huge token, so we may have a
          # partial token again.

          # Note: this modifies the list of chunks in self.partial

          chunks.append(buf)

          # consumed the whole buffer
          return len(buf)

        # got the rest of the token.
        chunks.append(buf[:idx])
        self.tokens.insert(0, string.join(chunks, ''))

      # no more partial token
      self.partial = None

      return idx

    if which == _T_STRING_END:
      if buf[0] != '@':
        self.tokens.insert(0, string.join(chunks, ''))
        return 0
      chunks.append('@')
      start = 1
    else:
      start = 0

    self.string_end = None
    string_table = (
      (None, _tt.Is, '@', +3, +1),
      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),

      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. move to next rule to look
      # for EOF
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),

      # go back to look for double AT if we aren't at the end of the string
      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
      )

    success, unused, idx = _tt.tag(buf, string_table,
                                   start, len(buf), chunks)

    # must have matched at least one item
    assert success

    if self.string_end is None:
      assert idx == len(buf)
      self.partial = (_T_STRING_SPAN, chunks)
    elif self.string_end < len(buf):
      self.partial = None
      self.tokens.insert(0, string.join(chunks, ''))
    else:
      self.partial = (_T_STRING_END, chunks)

    return idx

  def _parse_more(self):
    buf = self.rcsfile.read(self.CHUNK_SIZE)
    if not buf:
      return _EOF

    if self.partial:
      idx = self._handle_partial(buf)
      if idx is None:
        return _CONTINUE
      if idx < len(buf):
        self._parse_chunk(buf, idx)
    else:
      self._parse_chunk(buf)

    return _CONTINUE

  def get(self):
    try:
      return self.tokens.pop()
    except IndexError:
      pass

    while not self.tokens:
      action = self._parse_more()
      if action == _EOF:
        return None

    return self.tokens.pop()


#  _get = get
#  def get(self):
#    token = self._get()
#    print 'T:', `token`
#    return token

  def match(self, match):
    if self.tokens:
      token = self.tokens.pop()
    else:
      token = self.get()

    if token != match:
      raise RuntimeError, ('Unexpected parsing error in RCS file.\n'
                           'Expected token: %s, but saw: %s'
                           % (match, token))

  def unget(self, token):
    self.tokens.append(token)

  def mget(self, count):
    "Return multiple tokens. 'next' is at the end."
    while len(self.tokens) < count:
      action = self._parse_more()
      if action == _EOF:
        ### fix this
        raise RuntimeError, 'EOF hit while expecting tokens'
    result = self.tokens[-count:]
    del self.tokens[-count:]
    return result
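
#  Usage sketch for the raw token stream (hypothetical; normally only the
#  Parser class below drives it):
#
#    ts = _mxTokenStream(open('file,v', 'rb'))
#    while 1:
#      token = ts.get()
#      if token is None:
#        break
#      print token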


class Parser(common._Parser):
  stream_class = _mxTokenStream
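
# A minimal usage sketch, assuming the common._Parser.parse(file, sink)
# interface and the Sink base class from common.py (names assumed; see
# common.py for the real contract):
#
#   import sys
#   class DumpSink(common.Sink):          # hypothetical sink subclass
#     def set_head_revision(self, revision):
#       print 'head:', revision
#   Parser().parse(open(sys.argv[1], 'rb'), DumpSink())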