remove generated file flinkspkg/readme.py from version control
[flinks.git] / flinkspkg / IndexedHypertext.py
blob5f4e092af8b6d79400930e84dae7563d06c578c1
1 # Part of flinks
2 # (C) Martin Bays 2008
3 # Released under the terms of the GPLv3
5 from __future__ import unicode_literals
7 import sys, os
9 from string import *
# Each character of string.whitespace as its own encoded byte string, so the
# parser can test the single bytes it reads with `c in bwhitespace`.
bwhitespace = [ch.encode() for ch in whitespace]
12 import re
13 from io import BytesIO
14 from select import select
16 from .lynxDump import lynxDump
18 from .constants import USER_AGENT
def placeInList(ref, list, hint=0):
    """Return the index just before the first element greater than `ref`.

    For a sequence sorted in increasing order this is the index of the last
    element that is <= ref, or -1 when ref is smaller than every element.

    ref -- value to locate.
    list -- sequence to search; only len() and integer indexing are used,
        so any sequence-like object works and nothing is copied.
    hint -- index at which to start looking.  It is clamped into range, so
        an out-of-date hint can no longer raise IndexError.

    Returns -1 for an empty sequence.
    """
    size = len(list)
    if size == 0:
        return -1

    # Start at the hint (clamped), then back up while we are past `ref`.
    lowerBound = min(max(0, hint), size - 1)
    while lowerBound > 0 and ref < list[lowerBound]:
        lowerBound -= 1

    # Scan forward for the first element that exceeds `ref`.
    for i in range(lowerBound, size):
        if ref < list[i]:
            return i - 1
    return size - 1
def getItemCapped(list, i):
    """Return list[i], with i first clamped into the range 0..len(list)-1."""
    clamped = min(max(i, 0), len(list) - 1)
    return list[clamped]
# States of the character-level parser, numbered 0..4 in this order.
(STATE_WHITESPACE,
 STATE_NEWLINE,
 STATE_WORD,
 STATE_LINKNUM,
 STATE_HYPHEN) = range(5)
class BlockException(BaseException):
    """Raised by parseMore() when the parser reports that reading the input
    would block, so no further word is available yet."""
46 class IndexedHypertext:
def __init__(self, url, lynxArgs=[], forcehtml=False, notSentenceEnder=""):
    """Open `url` and set up lazy, incremental parsing of its text.

    url -- what to read.  A leading '!' means the remainder is a shell
        command whose standard output is read.  Otherwise, unless
        `forcehtml` is set or the name ends in .htm/.html, it is first
        tried as a local plain-text file; if that fails (or is skipped)
        the text and link list come from lynxDump().
    lynxArgs -- extra arguments passed through to lynxDump() unchanged
        (the shared default list is never modified here).
    forcehtml -- skip the local plain-text attempt.
    notSentenceEnder -- regular expression stored for the parser, which
        matches it against words ending in '.', '!' or '?'.

    No parsing happens here: words and index entries appear only as
    parseMore() is called, directly or through the Index objects.
    """
    class Index:
        """An increasing list of points (word numbers) in the text. 'at'
        is maintained as the index of the last indexed point before the
        current word position."""
        def __init__(iself):
            iself.list = []
            iself.at = -1

        def findIndex(iself, ref, near=False):
            """Returns last index for which list[index] <= ref, returning
            -1 if none.

            With near=True the search starts from the current 'at';
            otherwise a coarse binary search picks the starting point.
            Either way a linear pass then settles on the exact answer.
            Looking at entries beyond those known so far makes the
            document parse further.
            """
            if not iself.list:
                return -1

            if near:
                i = max(0, iself.at)
            else:
                # binary search
                i = len(iself) // 2
                di = i // 2 + i % 2
                while di > 1:
                    n = iself[i]
                    # Unparseable or too large: the answer lies to the
                    # left; otherwise it lies at i or to the right.
                    if n is None or ref < n:
                        i -= di
                    else:
                        i += di
                    i = max(0, i)
                    di = di // 2 + di % 2

            # Back up past entries greater than ref.  Going down to -1
            # (rather than stopping at 0) is what makes "-1 if none"
            # true when even the first entry exceeds ref; iself[-1]
            # is the sentinel -1, so the loop cannot run away.
            while i >= 0 and (iself[i] is None or ref < iself[i]):
                i -= 1
            # Advance while the following entry is still <= ref.
            while True:
                following = iself[i + 1]
                if following is None or ref < following:
                    break
                i += 1
            return i

        def setPos(iself, to, near=False):
            """Point 'at' at the last entry <= `to`."""
            iself.at = iself.findIndex(to, near)

        def pos(iself):
            """Word number of the entry at 'at' (-1 if 'at' is -1)."""
            return iself[iself.at]

        def nextPos(iself):
            """Word number of the entry after 'at', or None if there is
            none available."""
            return iself[iself.at + 1]

        def append(iself, v):
            """Add v unless it equals the most recent entry."""
            if not iself.list or iself.list[-1] != v:
                iself.list.append(v)

        def __getitem__(iself, n):
            """Entry n, parsing more of the document if needed.

            Negative n gives -1 (it does NOT index from the end); None
            is returned when the entry cannot be produced because the
            input ended or reading would block."""
            if n < 0:
                return -1
            try:
                while len(iself.list) <= n:
                    self.parseMore()
                return iself.list[n]
            except (StopIteration, BlockException):
                return None

        def __len__(iself):
            """Number of entries found so far."""
            return len(iself.list)

    self.url = url
    self.notSentenceEnder = notSentenceEnder

    self.text = None
    self.lynxErr = ""
    self.links = None
    if url and url[0] == '!':
        # command
        # NOTE(review): url[1:] is executed by the shell as-is; callers
        # must not pass untrusted input here.
        from subprocess import Popen, PIPE
        sub = Popen(url[1:], shell=True, stdin=PIPE, stdout=PIPE,
                stderr=PIPE, bufsize=0)
        sub.stdin.close()
        self.text = sub.stdout
    elif not (forcehtml or re.match(r'.*\.html?$', url)):
        # first try opening as a local plaintext file (not attempted for
        # commands, whose output stream must not be replaced)
        try: self.text = open(os.path.expanduser(url), 'rb', 0)
        except IOError: pass
    if self.text is None:
        dumped, linkUrls, self.lynxErr = lynxDump(url, lynxArgs=lynxArgs)
        self.text = BytesIO(dumped.encode("utf8"))
        self.links = [ {"url": linkUrl, "word": ''} for linkUrl in linkUrls ]

    self.knownAnchors = {}

    self.words = []

    self.marks = {}

    # index word position of start of each sentence, paragraph, link
    self.sentenceIndex = Index()
    self.sentenceIndex.append(0)
    self.paraIndex = Index()
    self.paraIndex.append(0)
    self.linkIndex = Index()
    self.startQuoteIndex = Index()
    self.endQuoteIndex = Index()
    self.indices = ( self.sentenceIndex, self.paraIndex, self.linkIndex,
            self.startQuoteIndex, self.endQuoteIndex )

    self.parserGenerator = self.getParserGenerator()
    self.parsed = False
    self.atWord = -1
    self.nextWord = 0
    # Nothing has been parsed yet, so linkIndex is still empty here and
    # this check does not fire; it is kept as in the original.
    if len(self.linkIndex) and self.linkIndex[0] == 0:
        self.linkIndex.at = 0
def readWouldBlock(self):
    """Return True if reading from self.text right now would block.

    Uses a zero-timeout select() on the stream's file descriptor.  If the
    stream has no descriptor or select() cannot handle it, the read is
    assumed not to block and False is returned.
    """
    try:
        readable = select([self.text.fileno()], [], [], 0)[0]
    except Exception:
        # select will raise an exception on some platforms (e.g. windows),
        # and in-memory streams have no fileno().  Just assume
        # non-blocking.  KeyboardInterrupt/SystemExit are deliberately
        # left to propagate.
        return False
    return readable == []
def parseMore(self):
    """Advance the parser generator by one step.

    Raises BlockException if the generator yields None, and lets the
    generator's StopIteration propagate once it is exhausted.
    """
    produced = next(self.parserGenerator)
    if produced is None:
        raise BlockException
def getParserGenerator(self):
    """Build the generator that incrementally parses self.text.

    The returned generator reads self.text one byte at a time.  It yields
    None whenever readWouldBlock() says a read would block, and otherwise
    yields each word as it is completed (after appending it to self.words
    and updating the sentence, paragraph, link and quote indices).  When
    the input is exhausted it sets self.parsed and finishes.
    """
    # Bytes that end a word while inside one; built once rather than for
    # every byte read.
    wordEnders = bwhitespace + [b'"']

    class Parser:
        def __init__(parseSelf):
            parseSelf.wordNum = 0        # number of words emitted so far
            parseSelf.maybeLink = False  # a '[' was seen in the current word
            parseSelf.inQuote = False    # inside a double-quoted span

        def endsSentence(parseSelf, word):
            """True if `word` ends in '.', '!' or '?' (ignoring one
            trailing double quote) and is not excluded by the
            notSentenceEnder pattern."""
            if word and word[-1] == '"': word = word[:-1]
            if word and word[-1] in ".!?":
                # An empty pattern would match every word and so suppress
                # every sentence break; treat it as "no exceptions".
                if not self.notSentenceEnder:
                    return True
                return not re.match(self.notSentenceEnder, word)
            return False

        def wordCompleted(parseSelf, word):
            """Record the raw bytes `word`, yielding each resulting word.

            A word containing lynx-style '[n]' link markers, where n is
            the next expected link number, is split at each marker; the
            piece following a marker is recorded as that link's word.
            """
            word = word.decode('utf8', 'replace')
            if parseSelf.endsSentence(word):
                self.sentenceIndex.append(parseSelf.wordNum+1)

            def handleSubword(word, linking):
                if linking:
                    self.linkIndex.append(parseSelf.wordNum)
                    self.links[len(self.linkIndex)-1]["word"] = word
                self.words.append(word)
                parseSelf.wordNum += 1
                return word

            linking = False
            if parseSelf.maybeLink:
                parseSelf.maybeLink = False
                while True:
                    # If a link is pending but not yet recorded, the
                    # marker to look for is one further on.
                    nextLinkNum = len(self.linkIndex) + 1 + linking
                    m = re.match(
                            r'(.*?)\[' + repr(nextLinkNum) + r'\](.*)$',
                            word)
                    if not m:
                        break
                    wordstart, wordend = m.groups()
                    if wordstart:
                        yield handleSubword(wordstart, linking)
                    if ( self.links and
                            len(self.linkIndex) < len(self.links) ):
                        linking = True
                    word = wordend
                    if word == "":
                        # marker with nothing after it: use a placeholder
                        word = "{}"

            yield handleSubword(word, linking)

        def paraEnded(parseSelf):
            """Record a paragraph (and sentence) start at the next word,
            closing any open quote."""
            self.paraIndex.append(parseSelf.wordNum)
            # Index.append already skips a value equal to the last entry,
            # so no separate duplicate check is needed.
            self.sentenceIndex.append(parseSelf.wordNum)
            if parseSelf.inQuote:
                self.endQuoteIndex.append(parseSelf.wordNum)
                parseSelf.inQuote = False

        def quoteMark(parseSelf):
            """Toggle quote state, recording where the quote starts/ends."""
            if parseSelf.inQuote:
                self.endQuoteIndex.append(parseSelf.wordNum)
            else:
                self.startQuoteIndex.append(parseSelf.wordNum)
            parseSelf.inQuote = not parseSelf.inQuote

        def gen(parseSelf):
            state = STATE_WHITESPACE
            wordacc = b''
            while True:
                while self.readWouldBlock():
                    yield None

                c = self.text.read(1)

                if c == b'': break

                if state in [STATE_WHITESPACE, STATE_NEWLINE]:
                    if c == b'\n':
                        # two newlines in a row end a paragraph
                        if state == STATE_NEWLINE:
                            parseSelf.paraEnded()
                        state = STATE_NEWLINE
                    elif c in bwhitespace:
                        state = STATE_WHITESPACE
                    else:
                        state = STATE_WORD

                        wordacc = c
                        if c == b'[':
                            parseSelf.maybeLink = True
                        if c == b'"':
                            parseSelf.quoteMark()

                elif state == STATE_WORD:
                    if c in wordEnders:
                        if c == b'\n': state = STATE_NEWLINE
                        else: state = STATE_WHITESPACE
                        if c == b'"':
                            # a closing quote stays attached to its word
                            for w in parseSelf.wordCompleted(wordacc+b'"'):
                                yield w
                            parseSelf.quoteMark()
                        else:
                            for w in parseSelf.wordCompleted(wordacc):
                                yield w
                        wordacc = b''
                    elif c == b'-':
                        state = STATE_HYPHEN
                    else:
                        wordacc += c
                        if c == b'[':
                            parseSelf.maybeLink = True

                elif state == STATE_HYPHEN:
                    if c == b'-':
                        # "--" is emitted as a word of its own
                        if wordacc:
                            for w in parseSelf.wordCompleted(wordacc):
                                yield w
                            wordacc = b''
                        for w in parseSelf.wordCompleted(b'--'):
                            yield w
                        state = STATE_WHITESPACE
                    elif c in bwhitespace:
                        wordacc += b'-'
                        for w in parseSelf.wordCompleted(wordacc):
                            yield w
                        wordacc = b''
                        state = STATE_WHITESPACE
                    else:
                        # hyphen inside a word: split after the hyphen
                        # once the first part is longer than 4 bytes, or
                        # when a '[' follows
                        wordacc += b'-'
                        if len(wordacc) > 4 or c == b'[':
                            for w in parseSelf.wordCompleted(wordacc):
                                yield w
                            wordacc = b''
                        wordacc += c
                        state = STATE_WORD
                        if c == b'[':
                            parseSelf.maybeLink = True
                        if c == b'"':
                            parseSelf.quoteMark()

            # flush whatever was being accumulated at end of input
            if wordacc:
                for w in parseSelf.wordCompleted(wordacc):
                    yield w
            self.parsed = True

    parser = Parser()
    return parser.gen()
def getWord(self, n):
    """Return word number n, parsing further if it is not yet known.

    Returns '' for negative n, and when parsing stops (end of input, or a
    read would block) before word n has been produced.
    """
    if n < 0:
        return ''
    if n < len(self.words):
        return self.words[n]

    # One parseMore() call per word still missing, up to and including n.
    missing = n - len(self.words) + 1
    try:
        for _ in range(missing):
            self.parseMore()
    except (StopIteration, BlockException):
        return ''
    return self.words[n]
def currentLink(self):
    """Return the link index's current position if the current word is
    exactly the word recorded there, else None."""
    links = self.linkIndex
    try:
        onLink = (self.atWord == links.pos())
    except IndexError:
        return None
    if onLink:
        return links.at
    return None
def readWord(self):
    """Advance to the next word and return it, or return None (leaving the
    position untouched) if no next word is available."""
    upcoming = self.getWord(self.nextWord)
    if upcoming == '':
        return None

    self.atWord = self.nextWord
    self.nextWord = self.atWord + 1

    # Keep every index's 'at' in step with the new position.
    for idx in self.indices:
        idx.setPos(self.atWord, True)

    return upcoming
def currentWord(self):
    """Return the word at the current position, or '' before any word has
    been read (atWord == -1)."""
    return '' if self.atWord == -1 else self.getWord(self.atWord)
def isEmpty(self):
    """True if the first word cannot be obtained (getWord(0) is '')."""
    firstWord = self.getWord(0)
    return firstWord == ''
def atEnd(self):
    """Return True if the current word is the last one available.

    If the current word is not the last parsed word the answer is False.
    Otherwise one more parse step is attempted: success means there is
    more text (False); end of input or a read that would block counts as
    being at the end (True).  Always returns a bool.
    """
    if self.atWord != len(self.words) - 1:
        return False
    try:
        self.parseMore()
    except (StopIteration, BlockException):
        return True
    return False
def seekWord(self, toWord, setMark=False):
    """Move the current position to word `toWord`.

    If setMark is true the position being left is first saved under the
    mark "'".  When getWord(toWord) yields '' the target is replaced by
    the last word parsed so far.  Every index is then repositioned.
    """
    if setMark:
        self.mark('\'')

    # parse to the point, and don't go beyond the end:
    if self.getWord(toWord) == '':
        toWord = len(self.words) - 1

    cameFrom = self.atWord
    self.atWord = toWord
    self.nextWord = max(0, toWord)

    # Short hops let the indices search from where they already are.
    near = abs(cameFrom - toWord) < 100
    for idx in self.indices:
        idx.setPos(toWord, near)
def seekWordRel(self, rel):
    """Seek `rel` words from the current position, never before word 0."""
    target = self.atWord + rel
    if target < 0:
        target = 0
    self.seekWord(target)
def parseAll(self):
    """Keep parsing until the input ends or a read would block."""
    parsing = True
    while parsing:
        try:
            self.parseMore()
        except (StopIteration, BlockException):
            parsing = False
def linkOfWord(self, n):
    """Return the number of the link whose word is word n, or None.

    Only links found so far are considered; no further parsing is
    triggered.  The lookup works on the link index's underlying list
    directly, because the Index wrapper does not support slicing.
    """
    from bisect import bisect_left

    known = self.linkIndex.list  # increasing word numbers of known links
    i = bisect_left(known, n)
    if i < len(known) and known[i] == n:
        return i
    return None
def seekEnd(self):
    """Parse everything available, then seek to the last word, saving the
    position being left as a mark."""
    self.parseAll()
    lastWord = len(self.words) - 1
    self.seekWord(lastWord, setMark=True)
def seekIndex(self, index, n):
    """Seek to the word recorded as entry n of `index`.

    n is clamped at both ends: a negative n means the first entry, and
    an n beyond the last obtainable entry means the last entry found.
    Nothing happens if the index has no entries at all.
    """
    if n < 0: n = 0
    i = index[n]
    if i is not None:
        self.seekWord(i)
    elif len(index) != 0:
        # index[-1] would NOT give the last entry: the indices built in
        # __init__ return the sentinel -1 for any negative subscript.
        self.seekWord(index[len(index) - 1])
def seekIndexRel(self, index, rel):
    """Seek to the entry `rel` places away from the index's current one."""
    target = index.at + rel
    self.seekIndex(index, target)
def atSentenceStart(self, n=None):
    """True if word n (default: the current word) starts a sentence."""
    if n is None:
        return self.atWord == self.sentenceIndex.pos()
    i = self.sentenceIndex.findIndex(n, True)
    return i != -1 and self.sentenceIndex[i] == n
def atParaStart(self, n=None):
    """True if word n (default: the current word) starts a paragraph."""
    if n is None:
        return self.atWord == self.paraIndex.pos()
    i = self.paraIndex.findIndex(n, True)
    return i != -1 and self.paraIndex[i] == n
def atSentenceEnd(self, n=None):
    """True if word n (default: the current word) is the last word of a
    sentence, i.e. the following word starts one."""
    if n is None:
        return self.atWord + 1 == self.sentenceIndex.nextPos()
    return self.atSentenceStart(n + 1)
def atParaEnd(self, n=None):
    """True if word n (default: the current word) is the last word of a
    paragraph, i.e. the following word starts one."""
    if n is None:
        return self.atWord + 1 == self.paraIndex.nextPos()
    return self.atParaStart(n + 1)
def inQuote(self, point=None):
    """Return True if word `point` (default: the current word) lies inside
    a quotation.

    A word is inside a quotation when some quote starts at or before it
    and no quote end recorded at or before it comes after that start.
    """
    # `is None`, not truthiness: word 0 is a legitimate point to ask about.
    if point is None:
        point = self.atWord

    sq = self.startQuoteIndex.findIndex(point, True)
    if sq < 0:
        return False
    start = self.startQuoteIndex[sq]
    # Guard against an entry that lies after `point`: only a quote that
    # has already started can contain the word.
    if start is None or start > point:
        return False

    eq = self.endQuoteIndex.findIndex(point, True)
    if eq < 0:
        return True
    end = self.endQuoteIndex[eq]
    # Likewise, an end that lies after `point` has not happened yet.
    if end is None or end > point:
        return True

    # Equal values mean one quote ended and another began at this word.
    return start >= end
def search(self, pattern, dir=1, matchCase=False, wrap=False):
    """Search for `pattern` and seek to the first match.

    pattern -- whitespace-separated words.  A single word matches any
        word containing it.  With several words, the first must match
        the end of a word and each later one must match at the start of
        the word that follows.
    dir -- 1 to search forwards from the word after the current one;
        anything else searches backwards from the word before it.
    matchCase -- compare case-sensitively if true.
    wrap -- continue from the other end of the text on reaching an end.

    On success the position being left is saved as a mark, the current
    position moves to the first matched word and True is returned.
    Returns False if nothing matches or the pattern has no words.
    """
    patternWords = pattern.split()
    if not patternWords:
        return False
    numWords = len(patternWords)

    # Fold the pattern's case once instead of once per word compared.
    if matchCase:
        needles = patternWords
    else:
        needles = [p.lower() for p in patternWords]

    if dir == 1:
        initial = self.atWord + 1
    else:
        initial = self.atWord - 1

    i = initial

    while True:
        if i < 0:
            if not wrap:
                return False
            # wrap backwards to the very last word
            self.parseAll()
            i = len(self.words) - 1

        matched = 0
        for pi in range(numWords):
            word = self.getWord(i + pi)
            if word == '' and dir == 1:
                # reached end
                if not wrap:
                    return False
                # restart from word 0 (the increment below makes it 0)
                i = -1
                break
            pword = patternWords[pi]
            if matchCase:
                f = word.find(needles[pi])
            else:
                f = word.lower().find(needles[pi])
            if (f == -1 or
                    (pi > 0 and f > 0) or
                    (pi == 0 and numWords > 1 and
                        len(word) - f != len(pword) # doesn't match to end
                        )):
                break
            matched += 1

        if matched == numWords:
            self.seekWord(i, setMark=True)
            return True

        if dir == 1:
            i += 1
        else:
            i -= 1
        # back where we began: every word has been tried
        if wrap and i == initial:
            return False
def mark(self, char):
    """Remember the current word position under the name `char`."""
    self.marks.update({char: self.atWord})
def goMark(self, char):
    """Seek to the position saved under `char`, saving the position being
    left as a mark; do nothing if no such mark exists."""
    if char not in self.marks:
        return
    self.seekWord(self.marks[char], setMark=True)
def findAnchor(self, anchor):
    """findAnchor: return index of a named anchor, or None on failure.

    XXX: the implementation here is rather hacky. It is unlikely to be
    very robust.

    We retrieve a copy of the html, search and replace to insert a marker
    at an appropriate point, then process the result via lynx and search
    for the marker.

    Anchor names are compared case-insensitively.  Every anchor met while
    searching is cached in self.knownAnchors, and the marked-up copy of
    the document is kept for later calls.
    """
    anchor = anchor.lower()

    try:
        return self.knownAnchors[anchor]
    except KeyError:
        pass

    if not hasattr(self, "anchorTempIndexed"):
        from tempfile import mkstemp

        # NOTE(review): URLopener is deprecated in Python 3; consider
        # moving to urllib.request.urlopen with an explicit User-Agent.
        try: from urllib import URLopener
        except ImportError:
            from urllib.request import URLopener

        class myURLopener(URLopener):
            version = USER_AGENT

        fd, tempname = mkstemp()
        # Only the name is needed; close the descriptor mkstemp opened
        # so that it is not leaked.
        os.close(fd)
        try:
            try:
                # fetch a copy - note that urllib keeps a cache, so this only
                # actually retrieves the url the first time.
                opener = myURLopener()
                opener.retrieve(self.url, tempname)

                with open(tempname) as htmlFile:
                    temphtml = htmlFile.read()

                def getRep(m):
                    tag = m.group(0)
                    element, attribute, value = m.groups()

                    # don't want to pick up meta elements, since they
                    # are in the header. Of course this is rather
                    # hacky.
                    if element == "meta":
                        return tag

                    return '%s__FLINKS_ANCHOR_%s__' % (tag, value)

                temphtml = re.sub(
                        r'<\s*(\w+)\s[^>]*?(name|id)\s*=\s*"?([\w-]+)"?[^>]*>',
                        getRep, temphtml)
                with open(tempname, 'w') as htmlFile:
                    htmlFile.write(temphtml)

                # forcehtml=True is required: the temporary name has no
                # .html suffix, so without it the file would be read as
                # raw plain text and never rendered through lynx.
                self.anchorTempIndexed = IndexedHypertext(tempname,
                        lynxArgs=["-force-html"], forcehtml=True)
                self.anchorFakeWordCount = 0
            except (IOError, UnicodeDecodeError):
                return None
        finally:
            os.remove(tempname)

    while True:
        if not self.anchorTempIndexed.search('__FLINKS_ANCHOR_'):
            return None

        current = self.anchorTempIndexed.currentWord()
        m = re.search(r'__FLINKS_ANCHOR_([\w-]+)__', current)
        if not m:
            continue

        name = m.group(1).lower()
        # Markers that became whole words of their own do not exist in
        # the real document, so discount those seen so far.
        n = self.anchorTempIndexed.atWord - self.anchorFakeWordCount
        if re.match(r'^__FLINKS_ANCHOR_([\w-]+)__$', current):
            self.anchorFakeWordCount += 1
        self.knownAnchors[name] = n
        if name == anchor:
            return n