3 """A variant on webchecker that creates a mirror copy of a remote site."""
5 __version__
= "$Revision$"
14 # Extract real version number if necessary
15 if __version__
[0] == '$':
16 _v
= __version__
.split()
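
# Command-line interface: -q silences output, -v raises the verbosity level,
# and any remaining arguments are added as root URLs to mirror.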
def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
        ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()
class Sucker(webchecker.Checker):

    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        # Reuse a previously saved copy if one exists; otherwise fetch the
        # page and save it.
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url
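
    # Write the page text to its local mirror path, creating any missing
    # directories first; failures are reported but not fatal.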
    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError, msg:
            self.message("didn't save %s: %s", path, str(msg))
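
    # Map a URL to a local file path: strip the scheme, user info and port,
    # use the host name as the top-level directory, and append "index.html"
    # when the URL ends in "/" (or has an empty path).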
    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = host.lower()
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
        path = os.path.join(host, path)
        return path
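
# Recursively create the directories leading up to dir.  If a plain file is
# in the way (a page saved earlier at what is now a directory URL), it is
# moved to index.html inside the newly created directory.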
def makedirs(dir):
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)

if __name__ == '__main__':
    sys.exit(main() or 0)
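
# Example invocation (assumes webchecker.py from the same Tools directory is
# importable; the only flags handled here are -q and -v):
#
#     python websucker.py -v http://www.example.com/
#
# Pages are saved under ./www.example.com/, mirroring the remote path layout.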