flinkspkg/lynxDump.py

   1 # Part of flinks
   2 # (C) Martin Bays 2008
   3 # Released under the terms of the GPLv3
   4
   5 import sys, os
   6 from string import *
   7
   8 import re
   9 import codecs
  10
  11 from subprocess import Popen, PIPE
  12
  13 from .constants import USER_AGENT
  14 from .readme import README
  15
  16 def lynxDump(url, lynxArgs=[]):
  17     if url == "special:README":
  18         return README, [], ''
  19
  20     try:
  21         p = Popen(['lynx', '-dump', '-force_html', '-useragent="%s via lynx"' % USER_AGENT] +
  22                 lynxArgs + [url],
  23                 stdin=None, stdout=PIPE, stderr=PIPE)
  24         (lynxStdout, lynxErrout) = (p.stdout, p.stderr)
  25     except OSError:
  26         return "", [], "Fatal error - lynx execution failed. Is it installed?"
  27
  28     # TODO: work out the encoding somehow? For now we assume it's latin1...
  29     lynxDecoded = codecs.EncodedFile(lynxStdout, 'utf8', 'latin1', 'replace')
  30
  31     dumped = ''
  32     refdumped = ''
  33     linkUrls = []
  34     readingRefs = False
  35     for binaryline in lynxDecoded:
  36         line = lynxDecoded.decode(binaryline)[0]
  37         if line == 'References\n':
  38             if readingRefs:
  39                 # The previous matched 'References' was part of the
  40                 # document...
  41                 dumped += refdumped
  42                 refdumped = ''
  43                 linkUrls = []
  44             readingRefs = True
  45
  46         if readingRefs:
  47             m = re.match(r'\s*\d+\. (.*)\n', line)
  48             if m:
  49                 linkUrls += [m.groups()[0]]
  50             refdumped += line
  51         else:
  52             dumped += line
  53
  54     lynxDecoded.close()
  55     lynxErr = lynxErrout.read().decode('utf8', 'replace')
  56     return dumped, linkUrls, lynxErr