3 # Released under the terms of the GPLv3
5 from __future__
import unicode_literals
# Each whitespace character as a one-byte bytes object, for comparing against
# bytes read one at a time from the input stream.  Kept as a list (not a set
# or tuple) because it is concatenated with another list elsewhere in the
# file (`bwhitespace + [...]`).
bwhitespace = [c.encode() for c in whitespace]
13 from io
import BytesIO
14 from select
import select
16 from .lynxDump
import lynxDump
18 from .constants
import USER_AGENT
20 def placeInList(ref
, list, hint
=0):
21 if list == []: return -1
23 lowerBound
= max(0, hint
)
24 while lowerBound
> 0 and ref
< list[lowerBound
]:
27 for i
,e
in enumerate( list[lowerBound
:] ):
29 return lowerBound
+ i
- 1
32 def getItemCapped(list, i
):
34 if i
>= len(list): i
= len(list)-1
class BlockException(BaseException):
    """Exception caught alongside StopIteration by this module's callers.

    It derives from BaseException rather than Exception, so a plain
    ``except Exception`` handler will not intercept it.
    NOTE(review): presumably raised when reading the input would block --
    confirm against the raise site.
    """
46 class IndexedHypertext
:
47 def __init__(self
, url
, lynxArgs
=[], forcehtml
=False, notSentenceEnder
=""):
49 """An increasing list of points (word numbers) in the text. 'at'
50 is maintained as the index of the last indexed point before the
51 current word position."""
55 def findIndex(iself
, ref
, near
=False):
56 """Returns last index for which list[index] <= ref, returning
59 if iself
.list == []: return -1
69 i
+= di
* (-1)**(n
is None or n
<= ref
)
73 while i
> 0 and (iself
[i
] is None or ref
< iself
[i
]):
77 if next
is None or ref
< iself
[i
+1]:
def setPos(iself, to, near=False):
    """Point `at` to the index entry that findIndex reports for `to`.

    `near` is forwarded unchanged to findIndex.  Nothing is returned.
    """
    located = iself.findIndex(to, near)
    iself.at = located
85 return iself
[iself
.at
]
87 return iself
[iself
.at
+1]
89 if iself
.list == [] or iself
.list[-1] != v
:
91 def __getitem__(iself
, n
):
95 while len(iself
.list) <= n
:
98 except (StopIteration, BlockException
):
101 return len(iself
.list)
104 self
.notSentenceEnder
=notSentenceEnder
109 if (url
and url
[0] == '!'):
111 from subprocess
import Popen
, PIPE
112 sub
=Popen(url
[1:], shell
=True, stdin
=PIPE
, stdout
=PIPE
, stderr
=PIPE
, bufsize
=0)
114 self
.text
= sub
.stdout
115 if not (forcehtml
or re
.match('.*\.html?$', url
)):
116 # first try opening as a local plaintext file
117 try: self
.text
= open(os
.path
.expanduser(url
), 'rb', 0)
120 dumped
, linkUrls
, self
.lynxErr
= lynxDump(url
, lynxArgs
=lynxArgs
)
121 self
.text
= BytesIO(dumped
.encode("utf8"))
122 self
.links
= [ {"url": url
, "word": ''} for url
in linkUrls
]
124 self
.knownAnchors
= {}
130 # index word position of start of each sentence, paragraph, link
131 self
.sentenceIndex
= Index()
132 self
.sentenceIndex
.append(0)
133 self
.paraIndex
= Index()
134 self
.paraIndex
.append(0)
135 self
.linkIndex
= Index()
136 self
.startQuoteIndex
= Index()
137 self
.endQuoteIndex
= Index()
138 self
.indices
= ( self
.sentenceIndex
, self
.paraIndex
, self
.linkIndex
,
139 self
.startQuoteIndex
, self
.endQuoteIndex
)
141 self
.parserGenerator
= self
.getParserGenerator()
145 if len(self
.linkIndex
) and self
.linkIndex
[0] == 0:
146 self
.linkIndex
.at
= 0
148 def readWouldBlock(self
):
149 try: return select([self
.text
.fileno()],[],[],0)[0] == []
151 # select will raise an exception on some platforms (e.g. windows).
152 # Just assume non-blocking.
156 if next(self
.parserGenerator
) is None:
159 def getParserGenerator(self
):
161 def __init__(parseSelf
):
162 parseSelf
.wordNum
= 0
163 parseSelf
.maybeLink
= False
164 parseSelf
.inQuote
= False
165 def endsSentence(parseSelf
, word
):
166 if word
and word
[-1] == '"': word
= word
[:-1]
167 if word
and word
[-1] in ".!?":
168 return not re
.match(self
.notSentenceEnder
, word
)
171 def wordCompleted(parseSelf
, word
):
172 word
= word
.decode('utf8', 'replace')
174 if parseSelf
.endsSentence(word
):
175 self
.sentenceIndex
.append(parseSelf
.wordNum
+1)
180 def handleSubword(word
, linking
):
182 self
.linkIndex
.append(parseSelf
.wordNum
)
183 self
.links
[len(self
.linkIndex
)-1]["word"] = word
184 self
.words
.append(word
)
185 parseSelf
.wordNum
+= 1
188 if parseSelf
.maybeLink
:
189 parseSelf
.maybeLink
= False
192 '\[' + repr(len(self
.linkIndex
)+1+linking
) + '\]' +
195 m
= re
.match(regexp(), word
)
197 wordstart
, wordend
= m
.groups()
199 yield handleSubword(wordstart
, linking
)
201 len(self
.linkIndex
) < len(self
.links
) ):
208 yield handleSubword(word
, linking
)
def paraEnded(parseSelf):
    """Record a paragraph boundary at the current word number.

    Appends the current word number to the paragraph index, also to the
    sentence index unless that index already ends with it, and closes any
    quote still open by appending to the end-quote index.
    """
    here = parseSelf.wordNum
    self.paraIndex.append(here)

    sentences = self.sentenceIndex
    # A paragraph break also starts a sentence, unless one is already
    # recorded at this word.
    if not sentences or sentences[-1] != here:
        sentences.append(here)

    if parseSelf.inQuote:
        self.endQuoteIndex.append(here)
        parseSelf.inQuote = False
219 def quoteMark(parseSelf
):
220 if parseSelf
.inQuote
:
221 self
.endQuoteIndex
.append(parseSelf
.wordNum
)
223 self
.startQuoteIndex
.append(parseSelf
.wordNum
)
224 parseSelf
.inQuote
= not parseSelf
.inQuote
227 state
= STATE_WHITESPACE
230 while self
.readWouldBlock():
233 c
= self
.text
.read(1)
237 if state
in [STATE_WHITESPACE
,STATE_NEWLINE
]:
239 if state
== STATE_NEWLINE
:
240 parseSelf
.paraEnded()
241 state
= STATE_NEWLINE
242 elif c
in bwhitespace
:
243 state
= STATE_WHITESPACE
249 parseSelf
.maybeLink
= True
251 parseSelf
.quoteMark()
254 elif state
== STATE_WORD
:
255 if c
in bwhitespace
+[b
'\"']:
256 if c
== b
'\n': state
= STATE_NEWLINE
257 else: state
= STATE_WHITESPACE
259 for w
in parseSelf
.wordCompleted(wordacc
+b
'\"'):
261 parseSelf
.quoteMark()
263 for w
in parseSelf
.wordCompleted(wordacc
):
271 parseSelf
.maybeLink
= True
272 elif state
== STATE_HYPHEN
:
275 for w
in parseSelf
.wordCompleted(wordacc
):
278 for w
in parseSelf
.wordCompleted(b
'--'):
280 state
= STATE_WHITESPACE
281 elif c
in bwhitespace
:
283 for w
in parseSelf
.wordCompleted(wordacc
):
286 state
= STATE_WHITESPACE
289 if len(wordacc
) > 4 or c
== b
'[':
290 for w
in parseSelf
.wordCompleted(wordacc
):
296 parseSelf
.maybeLink
= True
298 parseSelf
.quoteMark()
301 for w
in parseSelf
.wordCompleted(wordacc
):
308 def getWord(self
, n
):
314 for i
in range(n
- len(self
.words
) + 1):
316 except (StopIteration, BlockException
):
320 def currentLink(self
):
322 if self
.atWord
== self
.linkIndex
.pos():
323 return self
.linkIndex
.at
324 except IndexError: pass
328 word
= self
.getWord(self
.nextWord
)
332 self
.atWord
= self
.nextWord
335 for index
in self
.indices
:
336 index
.setPos(self
.atWord
, True)
340 def currentWord(self
):
341 if self
.atWord
== -1:
344 return self
.getWord(self
.atWord
)
347 return self
.getWord(0) == ''
349 if self
.atWord
== len(self
.words
) - 1:
353 except (StopIteration, BlockException
):
356 def seekWord(self
, toWord
, setMark
=False):
359 # parse to the point, and don't go beyond the end:
360 if self
.getWord(toWord
) == '':
361 toWord
= len(self
.words
)-1
363 near
= abs(self
.atWord
- toWord
) < 100
365 self
.nextWord
= max(0,toWord
)
367 for index
in self
.indices
:
368 index
.setPos(toWord
,near
)
def seekWordRel(self, rel):
    """Seek `rel` words away from the current word, never before word 0."""
    target = self.atWord + rel
    if target < 0:
        target = 0
    self.seekWord(target)
377 except (StopIteration, BlockException
):
380 def linkOfWord(self
, n
):
381 i
= placeInList(n
, self
.linkIndex
)
382 if i
!= -1 and n
== self
.linkIndex
[i
]:
389 self
.seekWord(len(self
.words
)-1, setMark
=True)
391 def seekIndex(self
, index
, n
):
396 elif len(index
) != 0:
397 self
.seekWord(index
[-1])
def seekIndexRel(self, index, rel):
    """Seek within `index` to the entry `rel` places from its current `at`."""
    entry = index.at + rel
    self.seekIndex(index, entry)
401 def atSentenceStart(self
, n
=None):
403 return self
.atWord
== self
.sentenceIndex
.pos()
405 i
= self
.sentenceIndex
.findIndex(n
, True)
406 return i
!= -1 and self
.sentenceIndex
[i
] == n
407 def atParaStart(self
, n
=None):
409 return self
.atWord
== self
.paraIndex
.pos()
411 i
= self
.paraIndex
.findIndex(n
, True)
412 return i
!= -1 and self
.paraIndex
[i
] == n
413 def atSentenceEnd(self
, n
=None):
415 return self
.atWord
+ 1 == self
.sentenceIndex
.nextPos()
417 return self
.atSentenceStart(n
+1)
418 def atParaEnd(self
, n
=None):
420 return self
.atWord
+ 1 == self
.paraIndex
.nextPos()
422 return self
.atParaStart(n
+1)
424 def inQuote(self
, point
=None):
427 sq
= self
.startQuoteIndex
.findIndex(point
, True)
428 eq
= self
.endQuoteIndex
.findIndex(point
, True)
429 return ( (sq
>=0 and not eq
>=0) or (sq
>=0 and self
.startQuoteIndex
[sq
]
430 >= self
.endQuoteIndex
[eq
]) )
432 def search(self
, pattern
, dir=1, matchCase
=False, wrap
=False):
434 patternWords
= pattern
.split()
436 if len(patternWords
) == 0:
440 initial
= self
.atWord
+1
442 initial
= self
.atWord
-1
450 i
= len(self
.words
) - 1
454 for pi
in range(len(patternWords
)):
455 word
= self
.getWord(i
+pi
)
456 if word
== '' and dir == 1:
463 pword
= patternWords
[pi
]
467 f
= word
.lower().find(pword
.lower())
469 (pi
> 0 and f
> 0) or
470 (pi
== 0 and len(patternWords
) > 1 and
471 len(word
) - f
!= len(pword
) # doesn't match to end
476 if matched
== len(patternWords
):
477 self
.seekWord(i
, setMark
=True)
483 if wrap
and i
== initial
:
def mark(self, char):
    """Remember the current word position under the mark named `char`."""
    position = self.atWord
    self.marks[char] = position
488 def goMark(self
, char
):
489 if char
in self
.marks
:
491 self
.seekWord(n
, setMark
=True)
492 def findAnchor(self
, anchor
):
493 """findAnchor: return index of a named anchor, or None on failure.
495 XXX: the implementation here is rather hacky. It is unlikely to be
498 We retrieve a copy of the html, search and replace to insert a marker
499 at an appropriate point, then process the result via lynx and search
502 anchor
= anchor
.lower()
505 return self
.knownAnchors
[anchor
]
507 if not hasattr(self
, "anchorTempIndexed"):
508 from tempfile
import mkstemp
510 try: from urllib
import URLopener
512 from urllib
.request
import URLopener
514 class myURLopener(URLopener
):
517 (_
, tempname
) = mkstemp()
520 # fetch a copy - note that urllib keeps a cache, so this only
521 # actually retrieves the url the first time.
522 opener
= myURLopener()
523 opener
.retrieve(self
.url
, tempname
)
525 temphtml
= open(tempname
).read()
529 element
, attribute
, value
= m
.groups()
531 # don't want to pick up meta elements, since they
532 # are in the header. Of course this is rather
534 if element
== "meta":
537 return '%s__FLINKS_ANCHOR_%s__' % (tag
, value
)
540 '<\s*(\w+)\s[^>]*?(name|id)\s*=\s*"?([\w-]+)"?[^>]*>',
542 open(tempname
, 'w').write(temphtml
)
544 self
.anchorTempIndexed
= IndexedHypertext(tempname
,
545 lynxArgs
=["-force-html"])
546 self
.anchorFakeWordCount
= 0
547 except (IOError, UnicodeDecodeError):
553 if not self
.anchorTempIndexed
.search('__FLINKS_ANCHOR_'):
556 m
= re
.search('__FLINKS_ANCHOR_([\w-]+)__',
557 self
.anchorTempIndexed
.currentWord())
559 name
= m
.group(1).lower()
560 n
= self
.anchorTempIndexed
.atWord
- self
.anchorFakeWordCount
561 whole
= re
.match('^__FLINKS_ANCHOR_([\w-]+)__$',
562 self
.anchorTempIndexed
.currentWord())
564 self
.anchorFakeWordCount
+= 1
565 self
.knownAnchors
[name
] = n