"""
Copyright (c) 2007, Botnode.com (Berlin Brown)
http://www.opensource.org/licenses/bsd-license.php

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
* Neither the name of the Newspiritcompany.com (Berlin Brown) nor
  the names of its contributors may be used to endorse or promote
  products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Save spider database format in big endian format (network format).
"""
__author__ = "Berlin Brown"
__copyright__ = "Copyright (c) 2006-2008 Berlin Brown"
__license__ = "New BSD"
import sys
import socket
import urllib2

from soup.BeautifulSoup import *
from urlparse import urlparse
from optparse import OptionParser

from database.spiderdb import create_database
from spiderbot_util import DEFAULT_REQUEST_TIMEOUT, FF_USER_AGENT, \
     LINK_SET_INDICATOR, URLField, buildOpener, validateSubLink, convertStrAscii
from content.spiderbot_content import doc_ignore_content, \
     clean_content, build_page_info
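# The module docstring notes that spider records are saved in big-endian
# (network) byte order.  The real format lives in database.spiderdb
# (create_database); the helper below is only a minimal sketch of what
# network-order framing looks like with the stdlib, using a hypothetical
# length-prefixed record, and is not part of the actual spiderdb layout.
import struct

def _pack_record_sketch(text):
    """Sketch: length-prefixed record in network (big-endian) byte order."""
    # '>' forces big-endian; 'I' is an unsigned 32-bit length prefix.
    return struct.pack('>I', len(text)) + text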
def processSubLink(link_tag):
    """Process each link, ensure that a 'href' value is available,
    also convert relative URIs to full URLs"""
    # TODO: BUG, currently ignoring all internal links (don't have http)
    link_val = link_tag['href']
    # If full URL found, use it; if relative then attempt to build URL
    if link_val.lower().startswith('http'):
        return link_val
    return None
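# A possible fix for the TODO above (a sketch, not the project's current
# behavior): resolve relative hrefs against the page URL with urljoin.
# 'base_url' is a hypothetical extra parameter; processSubLink as written
# never receives the page URL.
from urlparse import urljoin

def processSubLinkWithBase(link_tag, base_url):
    """Sketch: like processSubLink, but builds full URLs from relative hrefs."""
    link_val = link_tag['href']
    if link_val.lower().startswith('http'):
        return link_val
    # e.g. urljoin('http://example.com/a/', 'b.html') -> 'http://example.com/a/b.html'
    return urljoin(base_url, link_val)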
def get_meta_content(meta_data_arr):
    """ Use with soup, in the following manner:
    <code>meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
    meta_data_descr = soup.findAll('meta', {'name':'description'})</code>
    keywords = get_meta_content(meta_data_keywords)"""
    if meta_data_arr and len(meta_data_arr) > 0:
        content_data = [el['content'] for el in meta_data_arr]
        if content_data and len(content_data) > 0:
            return content_data[0]
    return None
def crawlSingleURL(link, idx, total_links):
    opener = buildOpener()
    try:
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
        meta_data_descr = soup.findAll('meta', {'name':'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = soup.html.head.title
        titleTag = str(titleTag.string)
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL but only extract the content for content
    analysis. A more extensive model than crawlSingleURL"""
    opener = buildOpener()
    try:
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
        meta_data_descr = soup.findAll('meta', {'name':'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = soup.html.head.title
        titleTag = str(titleTag.string)
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)
        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        return field
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: a bare 'pass' here would silently swallow all errors
        # (even name errors), so report them instead.
        print "ERR<crawlSingleURLForContent>: %s" % e
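# Usage sketch (hypothetical; not part of the original module): the URLField
# returned by crawlSingleURLForContent carries the extra content fields set
# above, beyond what crawlSingleURL populates.
def _print_content_field_sketch(link):
    field = crawlSingleURLForContent(link, 0, 1)
    if field is not None:
        print field.extract_content   # filtered page text from doc_ignore_content
        print field.info_stats        # page statistics from build_page_info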
def crawlForURLContentDump(link_list):
    """ Iterate through list and dump data"""
    dump_data = []
    for index, link in enumerate(link_list):
        data_field = crawlSingleURLForContent(link, index, len(link_list))
        dump_data.append(data_field)
    return dump_data
def crawlBuildLinks(link_list):
    """ Iterate through the list of links and collect links found
    on each page through the use of the beautiful soup lib."""
    opener = buildOpener()
    total_links = 0
    total_links_tag = 0
    sub_links = None
    for link in link_list:
        try:
            data = opener.open(link).read()
            soup = BeautifulSoup(data)
            sub_links_tag = soup.findAll('a')
            total_links_tag = total_links_tag + len(sub_links_tag)
            sub_links = [processSubLink(el) for el in sub_links_tag if validateSubLink(el)]
            # Filter out duplicates with set
            sub_links = set(sub_links)
            total_links = total_links + len(sub_links)
        except Exception, e:
            print "ERR <crawlBuildLinks>: %s" % e
            print "  <crawlBuildLinks>: url=[%s]" % link

    if total_links_tag != 0:
        valid_ratio = float(total_links) / total_links_tag
        print "INFO: valid links ratio: %s, max=%s/%s" % \
            (valid_ratio, total_links, total_links_tag)
    # Return an empty list or valid content
    if sub_links is None:
        return ([], total_links)
    return (sub_links, total_links)
def buildURLPool(self, link_list):
    links, total_links = crawlBuildLinks(link_list)
    for index, link_proc in enumerate(links):
        url_info = crawlSingleURL(link_proc, index, total_links)
        if url_info:
            self.url_pool.append(url_info)
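# Hypothetical entry point (a sketch; the original driver is not shown in
# this excerpt). It wires up OptionParser, imported above but otherwise
# unused here, to the crawl functions.
if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-u', '--url', dest='url',
                      help='seed URL to start the crawl from')
    (options, args) = parser.parse_args()
    if options.url:
        sub_links, total = crawlBuildLinks([options.url])
        print "INFO: found %s links" % total
        crawlForURLContentDump(list(sub_links))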