remove generated file flinkspkg/readme.py from version control
[flinks.git] / flinkspkg / lynxDump.py
blob2121f69f7682ab178df0369688e9d64e177b264c
1 # Part of flinks
2 # (C) Martin Bays 2008
3 # Released under the terms of the GPLv3
5 import sys, os
6 from string import *
8 import re
9 import codecs
11 from subprocess import Popen, PIPE
13 from .constants import USER_AGENT
14 from .readme import README
def lynxDump(url, lynxArgs=None):
    """Dump the page at `url` as text using lynx and collect its link targets.

    Runs ``lynx -dump -force_html`` on `url` and separates the output into
    the rendered text and the trailing 'References' section.

    Args:
        url: address to fetch. The special value "special:README" returns
            the bundled README without running lynx.
        lynxArgs: optional sequence of extra command-line arguments, placed
            before the url on the lynx command line.

    Returns:
        A tuple ``(dumped, linkUrls, lynxErr)``: the page text without the
        references section, the list of urls parsed from the references
        section (in order of appearance), and whatever lynx wrote to stderr.
        If lynx cannot be started, the text is empty, the list is empty and
        the error string describes the failure.
    """
    if url == "special:README":
        return README, [], ''

    # No shell is involved, so the argument must not carry quote characters
    # of its own: they would end up inside the User-Agent string.
    command = ['lynx', '-dump', '-force_html',
               '-useragent=%s via lynx' % USER_AGENT]
    command += list(lynxArgs or [])
    command.append(url)

    try:
        proc = Popen(command, stdin=None, stdout=PIPE, stderr=PIPE)
    except OSError:
        return "", [], "Fatal error - lynx execution failed. Is it installed?"

    # communicate() drains both pipes together (no deadlock when lynx writes
    # a lot to stderr), closes them and reaps the child process.
    rawOut, rawErr = proc.communicate()

    # TODO: work out the encoding somehow? For now we assume it's latin1...
    text = rawOut.decode('latin1', 'replace')

    refPattern = re.compile(r'\s*\d+\. (.*)\n')
    dumpedParts = []
    refParts = []
    linkUrls = []
    readingRefs = False
    for line in text.splitlines(True):
        if line == 'References\n':
            if readingRefs:
                # The previous matched 'References' was part of the
                # document...
                dumpedParts += refParts
                refParts = []
                linkUrls = []
            readingRefs = True

        if readingRefs:
            m = refPattern.match(line)
            if m:
                linkUrls.append(m.group(1))
            refParts.append(line)
        else:
            dumpedParts.append(line)

    lynxErr = rawErr.decode('utf8', 'replace')
    return ''.join(dumpedParts), linkUrls, lynxErr