#! /usr/bin/env python

# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors.  A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned.  Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked.  In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Report printed:

When done, it reports pages with bad links within the subweb.  When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase.  By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered.  Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file, and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed).  Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only.  In this case, the checkpoint file is not written
again.  The checkpoint file can be set with the -d option.
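
For example (hypothetical checkpoint file name and URL, for illustration
only), a run started with

    webchecker.py -d mysite.pickle http://www.example.com/docs/

can later be resumed, reusing the work already done, with

    webchecker.py -R -d mysite.pickle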

The checkpoint file is written as a Python pickle.  Remember that
Python's pickle module is currently quite slow.  Give it the time it
needs to load and save the checkpoint file.  When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

Miscellaneous:

- You may find the (Tk-based) GUI version easier to use.  See wcgui.py.

- Webchecker honors the "robots.txt" convention.  Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker".  URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped.  The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix.  The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags.  We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature.  External links are now checked during normal
processing.  (XXX The status of a checked link could be categorized
better.  Later...)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
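
Example invocations (the URLs below are placeholders, for illustration only):

    webchecker.py -v http://www.example.com/docs/
    webchecker.py -x -t http://www.example.com/other/ http://www.example.com/docs/

The second form disables external link checking (-x) but still treats the
extra root given with -t as internal.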

"""


__version__ = "$Revision$"


import sys
import os
from types import *
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib
import cgi

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Skip checking of name anchors (-a)


# Global variables


def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # The extra_roots variable collects extra roots.
    extra_roots = []
    nonames = NONAMES

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = int(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = int(a)
        if o == '-t':
            extra_roots.append(a)
        if o == '-a':
            nonames = not nonames
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames
               )

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # The -t flag is only needed if external links are not to be
    # checked.  So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", "\n ".join(c.roots)
    return c


class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    nonames = NONAMES

    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Add a name table, so that the name URLs can be checked.  Also
        # serves as an implicit cache for which URLs are done.
        self.name_table = {}

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = path.rfind("/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs.  The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        try:
            page = self.getpage(url_pair)
        except sgmllib.SGMLParseError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error parsing %s: %s",
                      self.format_url(url_pair), msg)
            # Don't actually mark the URL as bad -- it exists, we
            # just can't parse it!
            page = None
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except (OSError, IOError), msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = cgi.parse_header(info['content-type'])[0].lower()
            if ';' in ctype:
                # handle content-type: text/html; charset=iso8859-1 :
                ctype = ctype.split(';', 1)[0].strip()
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list.  The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                text = f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1
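
# A minimal sketch (not part of the original tool) of driving Checker from
# Python code instead of the command line; the root URL is a placeholder:
#
#   c = Checker()
#   c.setflags(verbose=2, checkext=0)
#   c.addroot("http://www.example.com/docs/")
#   c.run()
#   c.report()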


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file contains.
        # The parser is stored in an instance variable, and the URL is
        # passed to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # The parsing was done in __init__(); the parser stored there
        # indicates whether it succeeded.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned.  See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))

        return infos
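
# Illustrative note (example URL is a placeholder): for a page at
# http://www.example.com/dir/ containing <A HREF="a.html#sec1">, the
# getlinkinfos() method above would yield the tuple
# ('http://www.example.com/dir/a.html', 'a.html', 'sec1').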


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                exc_type, exc_value, exc_tb = sys.exc_info()
                raise IOError, msg, exc_tb
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def check_name_id(self, attributes):
        """ Check the name or id attributes on an element.
        """
        # We must rescue the NAME or id (name is deprecated in XHTML)
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name" or name == "id":
                if value in self.names:
                    self.checker.message("WARNING: duplicate ID name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def unknown_starttag(self, tag, attributes):
        """ In XHTML, you can have id attributes on any element.
        """
        self.check_name_id(attributes)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')
        self.check_name_id(attributes)

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')
        self.check_name_id(attributes)

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')
        self.check_name_id(attributes)

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')
        self.check_name_id(attributes)

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')
        self.check_name_id(attributes)

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')
        self.check_name_id(attributes)

    def do_link(self, attributes):
        for name, value in attributes:
            if name == "rel":
                parts = value.lower().split()
                if (parts == ["stylesheet"]
                        or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")
                    break
        self.check_name_id(attributes)

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')
        self.check_name_id(attributes)

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')
        self.check_name_id(attributes)

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = value.strip()
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = value.strip()
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value
        self.check_name_id(attributes)

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base
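
# A small illustrative sketch (not part of the original module) of using
# MyHTMLParser stand-alone to collect links and anchor names; the URL and
# markup are made up:
#
#   p = MyHTMLParser("http://www.example.com/")
#   p.feed('<A HREF="sub/page.html" NAME="top">x</A>')
#   p.close()
#   p.getlinks()    # -> ['sub/page.html']
#   p.names         # -> ['top']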


if __name__ == '__main__':
    main()