Issue #4969: The mimetypes module now reads the MIME database from
[python.git] / Tools / webchecker / websucker.py
blobef2fa44d7e08a321a4094b310cebf4cbb1789403
1 #! /usr/bin/env python
3 """A variant on webchecker that creates a mirror copy of a remote site."""
5 __version__ = "$Revision$"
7 import os
8 import sys
9 import urllib
10 import getopt
12 import webchecker
# __version__ starts out as a revision-control keyword.  While the keyword
# is unexpanded it is a single "$...$" token and is left alone; once it has
# been expanded into three whitespace-separated fields, keep only the
# middle field as the version string.
if __version__.startswith('$'):
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
def main():
    """Parse the command line and mirror every root URL given on it.

    Options:
        -q  set the verbosity to 0
        -v  raise the verbosity by one (may be repeated)

    Returns 2 after printing a usage message when the options cannot be
    parsed; otherwise returns None once the Sucker has finished running.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    for opt, _ in opts:
        if opt == "-q":
            verbose = 0
        if opt == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify this tool (and its version) in the User-agent header.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % (arg,))
        c.addroot(arg)
    print("Run...")
    c.run()
class Sucker(webchecker.Checker):
    """A webchecker.Checker that keeps a local copy of every page it reads.

    Pages are stored under a relative path built from the URL's host and
    path (see savefilename); a page that already exists on disk is read
    from there instead of being fetched again.
    """

    # Settings consumed by webchecker.Checker.
    # NOTE(review): per the comment below, nonames suppresses name anchor
    # checking; the meaning of checkext is defined in webchecker -- confirm.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for the page named by url_pair.

        Only url_pair[0] (the URL) is used to locate the page.  If a saved
        copy exists it is read from disk; otherwise the page is fetched
        with openpage(), saved with savefile(), and url is updated to the
        address reported by geturl().  text is None when the page could
        not be opened or checkforhtml() rejects it.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No readable local copy: fetch the page and mirror it.
            f = self.openpage(url_pair)
            if f:
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # The reported URL differs from the requested one;
                        # store the page under the reported URL instead.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    # Close the page object even if reading fails.
                    f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # A local copy exists; there are no headers for it, so an
            # empty mapping is passed to checkforhtml().
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating parent directories as needed.

        An IOError while opening or writing is reported through
        self.message() rather than raised.  Errors raised by makedirs()
        are not caught here.
        """
        makedirs(os.path.dirname(path))
        try:
            # The with-statement closes the file even when write() fails.
            with open(path, "wb") as f:
                f.write(text)
        except IOError as err:
            self.message("didn't save %s: %s", path, str(err))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Map url to the relative local path "<host>/<path>".

        The host is lower-cased with any user info and port removed.  An
        empty path, or one ending in "/", gets "index.html" appended.
        """
        rest = urllib.splittype(url)[1]
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        host = urllib.splituser(host)[1]
        host = urllib.splitnport(host)[0]
        host = host.lower()
        if not path or path.endswith("/"):
            path = path + "index.html"
        # NOTE(review): ".." segments in the URL path are carried into the
        # local path unchanged, so a crafted URL could name a file outside
        # the host directory -- consider rejecting or normalising them.
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                # presumably a leading separator keeps the path relative
                # on classic Mac OS -- confirm before changing.
                path = os.sep + path
        return os.path.join(host, path)
def makedirs(dir):
    """Create directory dir, including any missing parent directories.

    Does nothing when dir is empty or already a directory.  When dir
    exists but is not a directory, it is turned into one: the existing
    file is moved to "index.html" inside the newly created directory.
    That conversion is best-effort and OS errors during it are ignored.

    Raises OSError if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            backup = dir + ".bak"
            try:
                # Move the file aside, create the directory in its place,
                # then move the file back in as the directory's index page.
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error:
                # Deliberately ignored: the caller's later attempt to
                # write into dir will report the failure.
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Nothing left to split off (e.g. a path ending in a separator).
        print("Huh? Don't know how to make dir %s" % (dir,))
        return
    makedirs(head)
    # 0o777 is the octal spelling accepted by Python 2.6+ and Python 3.
    os.mkdir(dir, 0o777)
# Script entry point: main() returns None on success, which is mapped to
# exit status 0; any other return value becomes the exit status.
if __name__ == '__main__':
    status = main()
    sys.exit(status or 0)