Update syntax for relative imports
tools/search/crawler/lib/genericsite.py
import re

from .basecrawler import BaseSiteCrawler
from .parsers import RobotsParser


class GenericSiteCrawler(BaseSiteCrawler):
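    # Crawls a generic external site: honours the site's robots.txt, applies
    # any extra exclude regexps configured for the site, and seeds the queue
    # with the root page plus every URL seen in previous crawls.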
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
            'site': self.siteid,
        })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
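        # A URL is excluded if it contains "..", if robots.txt blocks it, or
        # if it matches one of the site's extra exclude regexps.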
        if ".." in url:
            return True
        if self.robots and self.robots.block_url(url):
            return True
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False

    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
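        # Resolve all links found on the page just crawled, and queue any
        # that have not been crawled yet and are not excluded.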
        for l in self.resolve_links(self.page.links, url):
            if l in self.pages_crawled or l + "/" in self.pages_crawled:
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
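

# Usage sketch (not part of the original module; the connection setup and the
# name of the crawl entry point below are assumptions for illustration only):
# a caller constructs the crawler with an open database connection and the
# site's id, then runs the crawl loop inherited from BaseSiteCrawler, e.g.:
#
#     import psycopg2
#     conn = psycopg2.connect("dbname=search")   # connection details assumed
#     crawler = GenericSiteCrawler("www.example.org", conn, siteid=1, https=True)
#     crawler.crawl()                             # entry point name assumed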