import re

from .basecrawler import BaseSiteCrawler
from .parsers import RobotsParser


class GenericSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Set up robots.txt handling, excludes and the initial URL queue
        # before the crawl starts.
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
            'site': self.siteid,
        })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
        # Honour the site's robots.txt
        if self.robots and self.robots.block_url(url):
            return True
        # Then check the site-specific exclude patterns
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False

    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
        # Queue any link found on the page that hasn't been crawled yet
        # and isn't excluded.
        for l in self.resolve_links(self.page.links, url):
            if l in self.pages_crawled or l + "/" in self.pages_crawled:
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)