From 2c84a8ec63d14738e5db65083940441a87c0e248 Mon Sep 17 00:00:00 2001
From: Magnus Hagander
Date: Wed, 23 Jan 2019 22:14:56 +0100
Subject: [PATCH] Update search crawler for python3

This includes the switch to requests, but also a number of other
changes. While at it, fix it so we can do proper https validation:
instead of connecting to the server IP and overriding the Host header,
the crawler now overrides urllib3's create_connection, so requests
still sees the real hostname (and can validate the certificate) while
the connection goes to the configured IP.
---
 tools/search/crawler/lib/basecrawler.py | 83 +++++++++++++++++++--------------
 tools/search/crawler/lib/parsers.py     | 20 ++------
 tools/search/crawler/lib/sitemapsite.py | 21 +++++----
 3 files changed, 61 insertions(+), 63 deletions(-)

diff --git a/tools/search/crawler/lib/basecrawler.py b/tools/search/crawler/lib/basecrawler.py
index c7579cd3..e97b4c75 100644
--- a/tools/search/crawler/lib/basecrawler.py
+++ b/tools/search/crawler/lib/basecrawler.py
@@ -1,15 +1,28 @@
 import datetime
-import httplib
 import time
 from email.utils import formatdate, parsedate
-import urlparse
-import ssl
+import urllib.parse
+import requests
+import urllib3
 
-from Queue import Queue
+from queue import Queue
 import threading
 
 from lib.log import log
-from lib.parsers import GenericHtmlParser, lossy_unicode
+from lib.parsers import GenericHtmlParser
+
+
+_orig_create_connection = urllib3.util.connection.create_connection
+
+
+def override_create_connection(hostname, ipaddr):
+    def _override(address, *args, **kwargs):
+        host, port = address
+        if host == hostname:
+            return _orig_create_connection((ipaddr, port), *args, **kwargs)
+        else:
+            return _orig_create_connection(address, *args, **kwargs)
+    urllib3.util.connection.create_connection = _override
 
 
 class BaseSiteCrawler(object):
@@ -25,6 +38,9 @@ class BaseSiteCrawler(object):
         self.pages_deleted = 0
         self.status_interval = 5
 
+        if serverip:
+            override_create_connection(hostname, serverip)
+
         curs = dbconn.cursor()
         curs.execute("SELECT suburl, lastscanned FROM webpages WHERE site=%(id)s AND lastscanned IS NOT NULL", {'id': siteid})
         self.scantimes = dict(curs.fetchall())
@@ -124,7 +140,6 @@ class BaseSiteCrawler(object):
             return
 
         # Try to convert pagedata to a unicode string
-        pagedata = lossy_unicode(pagedata)
         try:
             self.page = self.parse_html(pagedata)
         except Exception as e:
@@ -167,46 +182,42 @@ class BaseSiteCrawler(object):
 
     def fetch_page(self, url):
        try:
-            # Unfortunatley, persistent connections seem quite unreliable,
-            # so create a new one for each page.
-            if self.serverip:
-                if not self.https:
-                    h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10)
-                else:
-                    h = httplib.HTTPSConnection(host=self.serverip, port=443, strict=True, timeout=10, context=ssl._create_unverified_context())
-                h.putrequest("GET", url, skip_host=1)
-                h.putheader("Host", self.hostname)
-            else:
-                if not self.https:
-                    h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10)
-                else:
-                    h = httplib.HTTPSConnection(host=self.hostname, port=443, strict=True, timeout=10, context=ssl._create_unverified_context())
-                h.putrequest("GET", url)
-            h.putheader("User-agent", "pgsearch/0.2")
-            h.putheader("Connection", "close")
+            headers = {
+                'User-agent': 'pgsearch/0.2',
+            }
 
             if url in self.scantimes:
-                h.putheader("If-Modified-Since", formatdate(time.mktime(self.scantimes[url].timetuple())))
-            h.endheaders()
-            resp = h.getresponse()
+                headers["If-Modified-Since"] = formatdate(time.mktime(self.scantimes[url].timetuple()))
+
+            if self.serverip and False:
+                connectto = self.serverip
+                headers['Host'] = self.hostname
+            else:
+                connectto = self.hostname
+
+            resp = requests.get(
+                '{}://{}{}'.format(self.https and 'https' or 'http', connectto, url),
+                headers=headers,
+                timeout=10,
+            )
 
-            if resp.status == 200:
-                if not self.accept_contenttype(resp.getheader("content-type")):
+            if resp.status_code == 200:
+                if not self.accept_contenttype(resp.headers["content-type"]):
                     # Content-type we're not interested in
                     return (2, None, None)
-                return (0, resp.read(), self.get_date(resp.getheader("last-modified")))
-            elif resp.status == 304:
+                return (0, resp.text, self.get_date(resp.headers.get("last-modified", None)))
+            elif resp.status_code == 304:
                 # Not modified, so no need to reprocess, but also don't
                 # give an error message for it...
                 return (0, None, None)
-            elif resp.status == 301:
+            elif resp.status_code == 301:
                 # A redirect... So try again with the redirected-to URL
                 # We send this through our link resolver to deal with both
                 # absolute and relative URLs
-                if resp.getheader('location', '') == '':
+                if resp.headers.get('location', '') == '':
                     log("Url %s returned empty redirect" % url)
                     return (2, None, None)
-                for tgt in self.resolve_links([resp.getheader('location', '')], url):
+                for tgt in self.resolve_links([resp.headers['location'], ], url):
                     return (1, tgt, None)
                 # No redirect at all found, becaue it was invalid?
                 return (2, None, None)
@@ -233,7 +244,7 @@ class BaseSiteCrawler(object):
 
     def resolve_links(self, links, pageurl):
         for x in links:
-            p = urlparse.urlsplit(x)
+            p = urllib.parse.urlsplit(x)
             if p.scheme in ("http", "https"):
                 if p.netloc != self.hostname:
                     # Remote link
@@ -252,10 +263,10 @@ class BaseSiteCrawler(object):
 
                 if p[2][0] == "/":
                     # Absolute link on this host, so just return it
-                    yield urlparse.urlunsplit(p)
+                    yield urllib.parse.urlunsplit(p)
                 else:
                     # Relative link
-                    yield urlparse.urljoin(pageurl, urlparse.urlunsplit(p))
+                    yield urllib.parse.urljoin(pageurl, urllib.parse.urlunsplit(p))
             else:
                 # Ignore unknown url schemes like mailto
                 pass
diff --git a/tools/search/crawler/lib/parsers.py b/tools/search/crawler/lib/parsers.py
index c19bf932..8ea56eaf 100644
--- a/tools/search/crawler/lib/parsers.py
+++ b/tools/search/crawler/lib/parsers.py
@@ -1,5 +1,5 @@
 import re
-import urllib
+import requests
 from io import StringIO
 import dateutil.parser
 from datetime import timedelta
@@ -61,12 +61,10 @@ class GenericHtmlParser(HTMLParser):
 class RobotsParser(object):
     def __init__(self, url):
         try:
-            u = urllib.urlopen(url)
-            txt = u.read()
-            u.close()
+            r = requests.get(url)
             self.disallows = []
             activeagent = False
-            for l in txt.splitlines():
+            for l in r.text.splitlines():
                 if l.lower().startswith("user-agent: ") and len(l) > 12:
                     if l[12] == "*" or l[12:20] == "pgsearch":
                         activeagent = True
@@ -83,15 +81,3 @@ class RobotsParser(object):
             if url.startswith(d):
                 return True
         return False
-
-
-# Convert a string to unicode, try utf8 first, then latin1, then give
-# up and do a best-effort utf8.
-def lossy_unicode(s):
-    try:
-        return str(s, 'utf8')
-    except UnicodeDecodeError:
-        try:
-            return str(s, 'latin1')
-        except UnicodeDecodeError:
-            return str(s, 'utf8', 'replace')
diff --git a/tools/search/crawler/lib/sitemapsite.py b/tools/search/crawler/lib/sitemapsite.py
index 4e98cfd1..b4574457 100644
--- a/tools/search/crawler/lib/sitemapsite.py
+++ b/tools/search/crawler/lib/sitemapsite.py
@@ -1,6 +1,6 @@
-import urllib
 import xml.parsers.expat
 import dateutil.parser
+import requests
 
 from lib.log import log
 from lib.basecrawler import BaseSiteCrawler
@@ -10,7 +10,7 @@ class SitemapParser(object):
     def __init__(self):
         self.urls = []
 
-    def parse(self, f, internal=False):
+    def parse(self, data, internal=False):
         self.parser = xml.parsers.expat.ParserCreate()
         self.currenturl = ""
         self.currentprio = 0
@@ -25,7 +25,7 @@ class SitemapParser(object):
         self.parser.CharacterDataHandler = lambda data: self.processcharacterdata(data)
         self.internal = internal
 
-        self.parser.ParseFile(f)
+        self.parser.Parse(data)
 
     def processelement(self, name, attrs):
         if name == "url":
@@ -67,19 +67,20 @@ class SitemapSiteCrawler(BaseSiteCrawler):
     def init_crawl(self):
         # Fetch the sitemap. We ignore robots.txt in this case, and
         # assume it's always under /sitemap.xml
-        u = urllib.urlopen("https://%s/sitemap.xml" % self.hostname)
+        r = requests.get("https://%s/sitemap.xml" % self.hostname)
+        if r.status_code != 200:
+            raise Exception("Could not load sitemap: %s" % r.status_code)
+
         p = SitemapParser()
-        p.parse(u)
-        u.close()
+        p.parse(r.text)
 
         # Attempt to fetch a sitempa_internal.xml. This is used to index
         # pages on our internal search engine that we don't want on
         # Google. They should also be excluded from default search
         # results (unless searching with a specific suburl)
-        u = urllib.urlopen("https://%s/sitemap_internal.xml" % self.hostname)
-        if u.getcode() == 200:
-            p.parse(u, True)
-            u.close()
+        r = requests.get("https://%s/sitemap_internal.xml" % self.hostname)
+        if r.status_code == 200:
+            p.parse(r.text, True)
 
         for url, prio, lastmod, internal in p.urls:
             # Advance 8 characters - length of https://.
-- 
2.11.4.GIT
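
For reference, a minimal standalone sketch (not taken from the patch) of the connection-override technique the crawler now uses for https validation: urllib3's create_connection is wrapped so that connections for one hostname are routed to a pinned IP address, while requests keeps seeing the real hostname and can therefore validate the server certificate. The hostname and IP in the usage part below are hypothetical placeholders.

    # Sketch of the create_connection override added in basecrawler.py.
    # The hostname/IP used here are made-up examples, not patch values.
    import requests
    import urllib3

    _orig_create_connection = urllib3.util.connection.create_connection


    def override_create_connection(hostname, ipaddr):
        # Route connections for `hostname` to `ipaddr`; all other hosts
        # are untouched. TLS validation still happens against `hostname`,
        # because requests never sees the substituted IP.
        def _override(address, *args, **kwargs):
            host, port = address
            if host == hostname:
                return _orig_create_connection((ipaddr, port), *args, **kwargs)
            return _orig_create_connection(address, *args, **kwargs)
        urllib3.util.connection.create_connection = _override


    if __name__ == '__main__':
        # Hypothetical usage: pin www.example.org to one backend address.
        override_create_connection('www.example.org', '192.0.2.10')
        r = requests.get('https://www.example.org/', timeout=10)
        print(r.status_code, r.headers.get('content-type'))

This mirrors what BaseSiteCrawler.__init__ does when a serverip is configured: it calls override_create_connection(hostname, serverip), replacing the old httplib approach of connecting to the IP and overriding the Host header.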