"""
Copyright (c) 2007, Botnode.com (Berlin Brown)
http://www.opensource.org/licenses/bsd-license.php

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
* Neither the name of the Newspiritcompany.com (Berlin Brown) nor
  the names of its contributors may be used to endorse or promote
  products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Save spider database format in big endian format (network format).
"""
__author__ = "Berlin Brown"
__copyright__ = "Copyright (c) 2006-2008 Berlin Brown"
__license__ = "New BSD"
import sys
import socket
import urllib2

from soup.BeautifulSoup import *
from urlparse import urlparse
from optparse import OptionParser

from database.spiderdb import create_database
from spiderbot_util import DEFAULT_REQUEST_TIMEOUT, FF_USER_AGENT, \
     LINK_SET_INDICATOR, URLField, buildOpener, validateSubLink, convertStrAscii
from content.spiderbot_content import doc_ignore_content, \
     clean_content, build_page_info
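# The module docstring notes that spider records are saved in big-endian
# (network) byte order.  The real format lives in database.spiderdb
# (create_database); the helper below is only a minimal sketch of what
# network-order framing looks like with the stdlib, using a hypothetical
# length-prefixed record, and is not part of the actual spiderdb layout.
import struct

def _pack_record_sketch(text):
    """Sketch: length-prefixed record in network (big-endian) byte order."""
    # '>' forces big-endian; 'I' is an unsigned 32-bit length prefix.
    return struct.pack('>I', len(text)) + text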
def processSubLink(link_tag):
    """Process each link, ensure that a 'href' value is available,
    also convert relative URIs to full URLs"""
    # TODO: BUG, currently ignoring all internal links (don't have http)
    link_val = link_tag['href']
    # If full URL found, use it; if relative then attempt to build URL
    if link_val.lower().startswith('http'):
        return link_val
    return None
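# A possible fix for the TODO above (a sketch, not the project's current
# behavior): resolve relative hrefs against the page URL with urljoin.
# 'base_url' is a hypothetical extra parameter; processSubLink as written
# never receives the page URL.
from urlparse import urljoin

def processSubLinkWithBase(link_tag, base_url):
    """Sketch: like processSubLink, but builds full URLs from relative hrefs."""
    link_val = link_tag['href']
    if link_val.lower().startswith('http'):
        return link_val
    # e.g. urljoin('http://example.com/a/', 'b.html') -> 'http://example.com/a/b.html'
    return urljoin(base_url, link_val)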
def get_meta_content(meta_data_arr):
    """ Use with soup, in the following manner:
    <code>meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
    meta_data_descr = soup.findAll('meta', {'name':'description'})</code>
    keywords = get_meta_content(meta_data_keywords)"""
    if meta_data_arr and len(meta_data_arr) > 0:
        content_data = [el['content'] for el in meta_data_arr]
        if content_data and len(content_data) > 0:
            return content_data[0]
    return None
def crawlSingleURL(link, idx, total_links):
    opener = buildOpener()
    try:
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
        meta_data_descr = soup.findAll('meta', {'name':'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = soup.html.head.title
        titleTag = str(titleTag.string)
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL but only extract the content for content
    analysis. A more extensive model than crawlSingleURL"""
    opener = buildOpener()
    try:
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
        meta_data_descr = soup.findAll('meta', {'name':'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = soup.html.head.title
        titleTag = str(titleTag.string)
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)
        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        return field
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: a bare 'pass' here would silently swallow all errors
        # (even name errors), so report them instead.
        print "ERR<crawlSingleURLForContent>: %s" % e
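# Usage sketch (hypothetical; not part of the original module): the URLField
# returned by crawlSingleURLForContent carries the extra content fields set
# above, beyond what crawlSingleURL populates.
def _print_content_field_sketch(link):
    field = crawlSingleURLForContent(link, 0, 1)
    if field is not None:
        print field.extract_content   # filtered page text from doc_ignore_content
        print field.info_stats        # page statistics from build_page_info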
def crawlForURLContentDump(link_list):
    """ Iterate through list and dump data"""
    dump_data = []
    for index, link in enumerate(link_list):
        data_field = crawlSingleURLForContent(link, index, len(link_list))
        dump_data.append(data_field)
    return dump_data
def crawlBuildLinks(link_list):
    """ Iterate through the list of links and collect links found
    on each page through the use of the beautiful soup lib."""
    opener = buildOpener()
    total_links = 0
    total_links_tag = 0
    sub_links = None
    for link in link_list:
        try:
            data = opener.open(link).read()
            soup = BeautifulSoup(data)
            sub_links_tag = soup.findAll('a')
            total_links_tag = total_links_tag + len(sub_links_tag)
            sub_links = [processSubLink(el) for el in sub_links_tag if validateSubLink(el)]
            # Filter out duplicates with set
            sub_links = set(sub_links)
            total_links = total_links + len(sub_links)
        except Exception, e:
            print "ERR <crawlBuildLinks>: %s" % e
            print "  <crawlBuildLinks>: url=[%s]" % link

    if total_links_tag != 0:
        valid_ratio = float(total_links) / total_links_tag
        print "INFO: valid links ratio: %s, max=%s/%s" % \
            (valid_ratio, total_links, total_links_tag)
    # Return an empty list or valid content
    if sub_links is None:
        return ([], total_links)
    return (sub_links, total_links)
def buildURLPool(self, link_list):
    links, total_links = crawlBuildLinks(link_list)
    for index, link_proc in enumerate(links):
        url_info = crawlSingleURL(link_proc, index, total_links)
        if url_info:
            self.url_pool.append(url_info)
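# Hypothetical entry point (a sketch; the original driver is not shown in
# this excerpt). It wires up OptionParser, imported above but otherwise
# unused here, to the crawl functions.
if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-u', '--url', dest='url',
                      help='seed URL to start the crawl from')
    (options, args) = parser.parse_args()
    if options.url:
        sub_links, total = crawlBuildLinks([options.url])
        print "INFO: found %s links" % total
        crawlForURLContentDump(list(sub_links))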