# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewVC
# distribution or at http://viewvc.org/license-1.html.
#
# For more information, visit http://viewvc.org/
#
# -----------------------------------------------------------------------
import string

import common

# note: this will raise an ImportError if it isn't available. the rcsparse
# package will recognize this and switch over to the default parser.
from mx import TextTools

# for convenience
_tt = TextTools
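
# (The switch-over itself happens in the rcsparse package, not here; the
# idea is roughly
#
#     try:
#       from texttools import Parser
#     except ImportError:
#       from default import Parser
#
# -- a sketch of the mechanism only, not necessarily the package's exact
# code.)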
_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.')   # leave as part of 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')
_idchar = string.join(_idchar_list, '')
_idchar_set = _tt.set(_idchar)
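
# The net effect: every printable character except '$', ',', ':', ';', and
# '@' (plus the 160-255 range for non-ASCII text) counts as token text.
# '.' stays in the set, so revision numbers such as '1.2.3.4' come through
# as single tokens.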
_onechar_token_set = _tt.set(':;')
_not_at_set = _tt.invset('@')
_E_COMPLETE = 100       # ended on a complete token
_E_TOKEN = 110          # ended mid-token
_E_STRING_SPAN = 130    # ended within a string
_E_STRING_END = 140     # ended with string-end ('@') (could be mid-@@)
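
# In rough terms, _parse_chunk() reacts to these end-states as follows:
#
#   _E_COMPLETE    -- nothing carries over into the next chunk
#   _E_TOKEN       -- the trailing token text is stashed in self.partial
#   _E_STRING_SPAN -- the pieces of the still-open @-string are stashed in
#                     self.partial
#   _E_STRING_END  -- the string looks closed, but its final '@' could turn
#                     out to be the first half of an '@@' escape split
#                     across two chunks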
_CONTINUE = 'CONTINUE'
# continuation of a token over a chunk boundary
_c_token_table = (
  (_T_TOKEN, _tt.AllInSet, _idchar_set),
  )
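
# For example, if one chunk ends with 'sym' and the next begins with 'bols;',
# this table matches the leading 'bols' so that _handle_partial() can glue
# the two pieces back into the single token 'symbols'.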
class _mxTokenStream:

  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  # grab a good-sized chunk, but not too large to overwhelm memory.
  # note: we use a multiple of a standard block size
  CHUNK_SIZE = 192 * 512    # about 100k

  # CHUNK_SIZE = 5   # for debugging, make the function grind...
  def __init__(self, file):
    self.rcsfile = file
    self.tokens = []
    self.partial = None

    self.string_end = None
  def _parse_chunk(self, buf, start=0):
    "Parse a chunk of the RCS file into tokens, adding them to self.tokens."
    # construct a tag table which refers to the buffer we need to parse.
    table = (
      #1: ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),
      #2: if we are now at EOF, record _E_COMPLETE and stop.
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
      #3: accumulate token text and exit, or move to the next rule.
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),
      #4: at EOF the token may continue into the next chunk: record _E_TOKEN
      # and stop. otherwise, go back to #1.
      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),
      #5: single character tokens exit immediately, or move to the next rule
      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),
      #6: at EOF, record _E_COMPLETE and stop. otherwise, go back to #1.
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),
      #7: if this isn't an '@' symbol, then we have a syntax error (go to a
      # negative index to indicate that condition). otherwise, suck it up
      # and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),
      #8: if the next character is an '@', it is either the closing '@' or
      # half of an escaped '@@': consume it and move to #9 to decide.
      # otherwise there is more string content: skip ahead to #12.
      (None, _tt.Is, '@', +4, +1),
      #9: a second '@' means an escaped '@@': record a single '@' and go back
      # to #8. otherwise the previous '@' closed the string (or was half of
      # an '@@' split across chunks): fall through to #10.
      (buf, _tt.Is, '@', +1, -1),
      #10: log a _T_STRING_END and continue.
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
      #11: at EOF, record _E_STRING_END and stop. otherwise, go back to #1.
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),
      #12: at EOF inside the string, record _E_STRING_SPAN and stop.
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
      #13: suck up everything that isn't an AT. go to next rule to look for EOF
      (buf, _tt.AllInSet, _not_at_set, 0, +1),
      #14: go back to look for double AT if we aren't at the end of the string
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )
    # Fast, texttools may be, but it's somewhat lacking in clarity.
    # Here's an attempt to document the logic encoded in the table above:
    #
    # The main forward path through the rules is:
    #
    #   1 -> 2 -> 3 -> 5 -> 7 -> 8 -> 9 -> 10 -> 11
    #
    # (rules #4, #6 and #11 loop back to #1, #9 and #14 loop back to #8,
    # and #8 skips ahead to #12 when the next character is not an '@')
    #
    # #1:  Skip over any whitespace.
    # #2:  If now EOF, exit with code _E_COMPLETE.
    # #3:  If we have a series of characters in _idchar_set, then:
    # #4:    Output them as a token, and go back to #1.
    # #5:  If we have a character in _onechar_token_set, then:
    # #6:    Output it as a token, and go back to #1.
    # #7:  If we do not have an '@', then error.
    #      If we do, then log a _T_STRING_START and continue.
    # #8:  If we have another '@', continue on to #9.  Otherwise:
    # #12:   If now EOF, exit with code _E_STRING_SPAN.
    # #13:   Record the slice up to the next '@' (or EOF).
    # #14:   If now EOF, exit with code _E_STRING_SPAN.
    #        Otherwise, go back to #8.
    # #9:  If we have another '@', then we've just seen an escaped
    #      (by doubling) '@' within an @-string.  Record a slice including
    #      just one '@' character, and jump back to #8.
    #      Otherwise, we've *either* seen the terminating '@' of an @-string,
    #      *or* we've seen one half of an escaped @@ sequence that just
    #      happened to be split over a chunk boundary - in either case,
    #      we continue on to #10.
    # #10: Log a _T_STRING_END.
    # #11: If now EOF, exit with _E_STRING_END.  Otherwise, go back to #1.
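
    # As a concrete illustration (not exercised by the code itself), a chunk
    # containing:
    #
    #   head     1.1;
    #   @one @@ two@
    #
    # tokenizes to 'head', '1.1', ';' plus a string whose recorded pieces,
    # once joined below, give the single token 'one @ two' -- the doubled
    # '@@' is how RCS escapes a literal '@' inside an @-string.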
    success, taglist, idx = _tt.tag(buf, table, start)
    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    # pop off the last item (the _E_* code that tells us how the chunk ended)
    last_which = taglist.pop()
      if taglist[i] == _T_STRING_START:
        if taglist[j] == _T_STRING_END:
          s = _tt.join(taglist, '', i+1, j)
        assert last_which == _E_STRING_SPAN
        s = _tt.join(taglist, '', i+1)
        self.partial = (_T_STRING_SPAN, [ s ])
    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      # the chunk ended cleanly on a token boundary; nothing carries over
      pass
    elif last_which == _E_STRING_SPAN:
      # the partial string was already recorded above; nothing more to do
      pass
    else:
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])
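
    # In every carry-over case, self.partial is a (token-type, [piece, ...])
    # pair; _handle_partial() below glues those pieces onto the start of the
    # next chunk.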
    # tokens are kept in reverse order so that get() can pop() off the end
    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist
  def _set_end(self, taglist, text, l, r, subtags):
    # CallTag callback: remember where the string terminator was seen
    self.string_end = l
  def _handle_partial(self, buf):
    "Deal with a token that was split across a chunk boundary."
    which, chunks = self.partial
    if which == _T_TOKEN:
      success, taglist, idx = _tt.tag(buf, _c_token_table)
      if not success:
        # The start of this buffer was not a token. So the end of the
        # prior buffer was a complete token.
        self.tokens.insert(0, string.join(chunks, ''))
      else:
        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
               and taglist[0][1] == 0 and taglist[0][2] == idx
        if idx == len(buf):
          # The whole buffer was one huge token, so we may have a
          # partial token again.
          #
          # Note: this modifies the list of chunks in self.partial
          chunks.append(buf)
          # consumed the whole buffer
          return len(buf)
        # got the rest of the token.
        chunks.append(buf[:idx])
        self.tokens.insert(0, string.join(chunks, ''))

      # no more partial token
      self.partial = None

      return idx
    if which == _T_STRING_END:
      if buf[0] != '@':
        # the '@' that ended the prior chunk really did terminate the string
        self.tokens.insert(0, string.join(chunks, ''))
        return 0
      # otherwise it was the first half of an escaped '@@'
      chunks.append('@')
      start = 1
    else:
      start = 0

    self.string_end = None
    string_table = (
      # look for a terminating '@' (or the '@' of an escaped '@@')
      (None, _tt.Is, '@', +3, +1),
      # a second '@' means an escaped '@@': record one '@' and keep scanning
      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
      # a single '@' terminates the string: remember where it ended and stop
      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),
      # at EOF, the string spans into the next chunk
      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),
      # suck up everything that isn't an AT. move to next rule to look
      # for EOF
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),
      # go back to look for double AT if we aren't at the end of the string
      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
      )
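
    # For example, if the previous chunk ended somewhere inside
    # '@some log text@', chunks already holds the early pieces of the string;
    # this table appends the remainder (collapsing any '@@' to a single '@')
    # and _set_end() notes the position just past the closing '@'.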
    success, unused, idx = _tt.tag(buf, string_table,
                                   start, len(buf), chunks)
    # must have matched at least one item
    assert success
    if self.string_end is None:
      # no terminating '@' seen: the string continues into the next chunk
      assert idx == len(buf)
      self.partial = (_T_STRING_SPAN, chunks)
    elif self.string_end < len(buf):
      # the terminating '@' was found inside this chunk: the string is done
      self.partial = None
      self.tokens.insert(0, string.join(chunks, ''))
    else:
      # the '@' was the last character of the chunk: it might be the
      # terminator, or the first half of an escaped '@@'
      self.partial = (_T_STRING_END, chunks)

    return idx
  def _parse_more(self):
    buf = self.rcsfile.read(self.CHUNK_SIZE)
    if not buf:
      return _EOF
    if self.partial:
      idx = self._handle_partial(buf)
      if idx < len(buf):
        self._parse_chunk(buf, idx)
    else:
      self._parse_chunk(buf)

    return _CONTINUE
  def get(self):
    "Return the next token from the RCS file."
    try:
      return self.tokens.pop()
    except IndexError:
      pass
    while not self.tokens:
      action = self._parse_more()
      if action == _EOF:
        return None
    return self.tokens.pop()
  def match(self, match):
    "Verify that the next token is the given 'match' string."
    token = self.tokens.pop()
    if token != match:
      raise common.RCSExpected(token, match)
  def unget(self, token):
    "Put this token back, for the next get() to return."
    self.tokens.append(token)
  def mget(self, count):
    "Return multiple tokens. 'next' is at the end."
    while len(self.tokens) < count:
      action = self._parse_more()
      if action == _EOF:
        raise RuntimeError, 'EOF hit while expecting tokens'
    result = self.tokens[-count:]
    del self.tokens[-count:]
    return result
class Parser(common._Parser):
  stream_class = _mxTokenStream
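
# common._Parser supplies the actual RCS grammar handling; pointing
# stream_class at _mxTokenStream simply swaps in the mx.TextTools-backed
# tokenizer above, which the parser drives through the token-stream
# interface (get/match/unget/mget) defined in this module.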