1 """
2 File: spiderdb.py
4 Copyright (c) 2007, Botnode.com (Berlin Brown)
5 http://www.opensource.org/licenses/bsd-license.php
7 All rights reserved.
9 Redistribution and use in source and binary forms, with or without modification,
10 are permitted provided that the following conditions are met:
12 * Redistributions of source code must retain the above copyright notice,
13 this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 * Neither the name of the Newspiritcompany.com (Berlin Brown) nor
18 the names of its contributors may be used to endorse or promote
19 products derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
25 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 Description:
35 Save spider database format in big endian format (network format).
37 """
__author__ = "Berlin Brown"
__copyright__ = "Copyright (c) 2006-2008 Berlin Brown"
__license__ = "New BSD"

import sys
import time, datetime
import socket

from soup.BeautifulSoup import *
import urllib2
from urlparse import urlparse
from optparse import OptionParser
import glob

from database.spiderdb import create_database
from spiderbot_util import DEFAULT_REQUEST_TIMEOUT, FF_USER_AGENT, \
     LINK_SET_INDICATOR, URLField, buildOpener, validateSubLink, convertStrAscii
from content.spiderbot_content import doc_ignore_content, \
     clean_content, build_page_info

def processSubLink(link_tag):
    """Process each anchor tag, ensure that an 'href' value is available,
    and pass the link value through for the crawl."""
    # TODO: BUG, relative links (no 'http' prefix) are currently passed
    # through unchanged instead of being resolved against the page URL.
    link_val = link_tag['href']
    link = None
    # If an absolute URL is found, use it as-is; otherwise return the
    # relative value untouched for now.
    if link_val.lower().startswith('http'):
        link = link_val
    else:
        link = link_val
    return link
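
# A minimal sketch of how the relative-link case above could be resolved,
# assuming the caller can pass along the URL of the page that was crawled.
# processSubLinkWithBase and its base_url parameter are illustrative and
# not part of the original spiderbot API.
def processSubLinkWithBase(link_tag, base_url):
    from urlparse import urljoin
    link_val = link_tag['href']
    if link_val.lower().startswith('http'):
        return link_val
    # urljoin resolves a relative href against the page URL, e.g.
    # ('http://example.com/a/', 'b.html') -> 'http://example.com/a/b.html'
    return urljoin(base_url, link_val)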

def get_meta_content(meta_data_arr):
    """ Use with soup, in the following manner:
    <code>meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
    meta_data_descr = soup.findAll('meta', {'name':'description'})</code>
    keywords = get_meta_content(meta_data_keywords)"""
    try:
        if meta_data_arr and len(meta_data_arr) > 0:
            content_data = [el['content'] for el in meta_data_arr]
            if content_data and len(content_data) > 0:
                return content_data[0]
    except:
        # Tags without a 'content' attribute fall through to the default
        pass
    return ""

def crawlSingleURL(link, idx, total_links):
    """ Crawl a single URL and return a URLField with the page title,
    meta description, and meta keywords."""
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
        meta_data_descr = soup.findAll('meta', {'name':'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = None
        try:
            titleTag = soup.html.head.title
            titleTag = str(titleTag.string)
        except:
            titleTag = ""
        end = time.time()

        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.populate()
        if ((idx % LINK_SET_INDICATOR) == 0):
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # Any other error results in an implicit None return
        pass

def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL and also extract the page content for content
    analysis; a more extensive variant of crawlSingleURL."""
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
        meta_data_descr = soup.findAll('meta', {'name':'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)

        # Extract the title tag
        titleTag = None
        try:
            titleTag = soup.html.head.title
            titleTag = str(titleTag.string)
        except:
            titleTag = ""
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)

        end = time.time()
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)

        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)

        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()
        if ((idx % LINK_SET_INDICATOR) == 0):
            sys.stdout.write("[%s/%s] " % (idx, total_links))

        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: a bare pass here would hide errors, so report them first.
        print "ERR<crawlSingleURLForContent>: %s" % e
        pass
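
# A minimal usage sketch, assuming a hypothetical URL and a single-link
# crawl; the returned URLField carries the cleaned page in full_content
# along with the descr, extract_content, and info_stats fields populated
# above:
#
#   field = crawlSingleURLForContent("http://www.example.com", 0, 1)
#   if field:
#       print field.descr
#       print field.extract_content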

def crawlForURLContentDump(link_list):
    """ Iterate through the link list and collect the content data
    for each URL."""
    dump_data = []
    for index, link in enumerate(link_list):
        data_field = crawlSingleURLForContent(link, index, len(link_list))
        dump_data.append(data_field)
    return dump_data

def crawlBuildLinks(link_list):
    """ Iterate through the list of links and collect the links found
    on each page through the use of the beautiful soup lib."""
    opener = buildOpener()
    total_links = 0
    total_links_tag = 0
    sub_links = set()
    for link in link_list:
        try:
            data = opener.open(link).read()
            soup = BeautifulSoup(data)
            sub_links_tag = soup.findAll('a')
            total_links_tag = total_links_tag + len(sub_links_tag)
            page_links = [processSubLink(el) for el in sub_links_tag if validateSubLink(el)]
            # Filter out duplicates with set
            page_links = set(page_links)
            total_links = total_links + len(page_links)
            # Accumulate the links found across all of the seed pages
            sub_links.update(page_links)
        except Exception, e:
            print "ERR <crawlBuildLinks>: %s" % e
            print "    <crawlBuildLinks>: url=[%s]" % link

    if total_links_tag != 0:
        valid_ratio = float(total_links) / total_links_tag
        print "INFO: valid links ratio: %s, max=%s/%s" % \
              (valid_ratio,
               total_links,
               total_links_tag)

    # Return an empty set or the collected links
    return (sub_links, total_links)
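
# A minimal usage sketch, assuming a hypothetical seed URL; crawlBuildLinks
# returns the de-duplicated set of outbound links plus a count of the valid
# links that were found:
#
#   seed_links = ["http://www.example.com"]
#   found_links, link_count = crawlBuildLinks(seed_links)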

class URLInfoPool:
    """ Pool of URLField records built from the links found by the crawl."""
    def __init__(self):
        self.url_pool = []

    def buildURLPool(self, link_list):
        links, total_links = crawlBuildLinks(link_list)
        for index, link_proc in enumerate(links):
            # DEBUG: limit the crawl to the first few links while testing
            if index > 10:
                break
            url_info = crawlSingleURL(link_proc, index, total_links)
            if url_info:
                self.url_pool.append(url_info)
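
# A minimal driver sketch, assuming this module is run directly and that
# 'http://www.botnode.com' stands in for a real seed URL; the pool is built
# from the links found on the seed page.
if __name__ == "__main__":
    seed_links = ["http://www.botnode.com"]
    info_pool = URLInfoPool()
    info_pool.buildURLPool(seed_links)
    print "INFO: url pool size=%s" % len(info_pool.url_pool)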