adding all of botlist, initial add
[botlist.git] / botlistprojects / botspider / spider / lib / python / content / spiderbot_content.py
blob51ad6d06bba1a967f5738179f17636069fdaf8ff
1 """
File: spiderbot_content.py
4 Copyright (c) 2007, Botnode.com (Berlin Brown)
5 http://www.opensource.org/licenses/bsd-license.php
7 All rights reserved.
9 Redistribution and use in source and binary forms, with or without modification,
10 are permitted provided that the following conditions are met:
12 * Redistributions of source code must retain the above copyright notice,
13 this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 * Neither the name of the Newspiritcompany.com (Berlin Brown) nor
18 the names of its contributors may be used to endorse or promote
19 products derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
25 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Description:
Collect per-page HTML tag counts and extract cleaned, ASCII-only text
content from fetched pages using BeautifulSoup.
37 """
39 __author__ = "Berlin Brown"
40 __copyright__ = "Copyright (c) 2006-2008 Berlin Brown"
41 __license__ = "New BSD"
43 from soup.BeautifulSoup import *
44 from spiderbot_util import convertStrAscii, ignoreHtmlEntity, PageInfoStats
45 from spiderbot_const import KEY_HTML_TAGS, HTML_TAG_MAP
# Index of each tracked tag inside KEY_HTML_TAGS.  These positions must stay
# in step with the order of the list below.
TAG_a = 0
TAG_b = 1
TAG_bq = 2
TAG_div = 3
TAG_h1 = 4
TAG_h2 = 5
TAG_i = 6
TAG_img = 7
TAG_p = 8
TAG_span = 9
TAG_strong = 10
TAG_table = 11

# HTML tag names whose occurrences are counted for every page.
# NOTE(review): this rebinds the KEY_HTML_TAGS name that is also imported
# from spiderbot_const at the top of the file -- confirm the local copy is
# the one that is meant to win.
KEY_HTML_TAGS = [
    "a",
    "b",
    "blockquote",
    "div",
    "h1",
    "h2",
    "i",
    "img",
    "p",
    "span",
    "strong",
    "table",
]
def set_stats_prop(stats_class, prop_attr, val):
    """Set the attribute named ``prop_attr`` on ``stats_class`` to ``val``.

    ``setattr`` is used instead of writing into ``__dict__`` directly so
    that the assignment goes through the normal attribute protocol and
    also works for objects that declare ``__slots__`` or properties.
    """
    setattr(stats_class, prop_attr, val)
def build_page_info(page_url, data):
    """Build page statistics by counting key HTML tags with BeautifulSoup.

    ``data`` is parsed into a fresh BeautifulSoup tree here, independent of
    any parse the caller may already hold.  For every tag name in
    KEY_HTML_TAGS the number of matching elements is stored on a new
    ``PageInfoStats(page_url)`` object under the matching ``*_ct``
    attribute, and that object is returned.

    See http://www.w3schools.com/tags/default.asp for HTML tag references.
    """
    soup = BeautifulSoup(data)
    stats = PageInfoStats(page_url)

    # Tag name -> statistics attribute.  Built once, outside the loop, since
    # it does not depend on the tag currently being counted.
    stat_attr_by_tag = {
        KEY_HTML_TAGS[TAG_a]: 'anchor_ct',
        KEY_HTML_TAGS[TAG_b]: 'bold_ct',
        KEY_HTML_TAGS[TAG_bq]: 'block_ct',
        KEY_HTML_TAGS[TAG_div]: 'div_ct',
        KEY_HTML_TAGS[TAG_h1]: 'h1_ct',
        KEY_HTML_TAGS[TAG_h2]: 'h2_ct',
        KEY_HTML_TAGS[TAG_i]: 'italic_ct',
        KEY_HTML_TAGS[TAG_img]: 'img_ct',
        KEY_HTML_TAGS[TAG_p]: 'para_ct',
        KEY_HTML_TAGS[TAG_span]: 'span_ct',
        KEY_HTML_TAGS[TAG_strong]: 'strong_ct',
        KEY_HTML_TAGS[TAG_table]: 'table_ct',
    }

    for info_tag in KEY_HTML_TAGS:
        tag_count = len(soup.findAll(info_tag))
        set_stats_prop(stats, stat_attr_by_tag[info_tag], tag_count)
    return stats
def doc_ignore_content(soup):
    """Drop content we are not interested in and return the remaining text.

    Comment nodes, ``script`` tags and ``style`` tags are extracted from
    ``soup`` (the tree passed in is modified).  The remaining text nodes
    whose stripped length is greater than one character are passed through
    ``convertStrAscii`` and joined with newlines into a single string,
    which is returned.
    """
    # Remove comment nodes.
    strip_invalids = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in strip_invalids:
        comment.extract()

    # Remove SCRIPT and STYLE tags.  Extract each tag that was actually
    # found instead of re-resolving soup.script / soup.style on every pass,
    # which would fail if that attribute lookup ever returned None.
    for tag_name in ("script", "style"):
        for tag in soup(tag_name):
            tag.extract()

    # Only extract text content; fragments of one character or less after
    # stripping are skipped.
    txt_lst = [convertStrAscii(text)
               for text in soup.findAll(text=True)
               if len(text.strip()) > 1]
    return '\n'.join(txt_lst)
def clean_content(content):
    """Apply the additional filters and cleanups to ``content``.

    ``content`` is passed through ``convertStrAscii`` and then
    ``ignoreHtmlEntity``; the result of the latter is returned.

    Returns an empty string when ``content`` is None, or when either
    helper raises ``UnicodeError`` (the error is printed, not re-raised).
    """
    if content is None:
        return ""

    try:
        # Encode to simple ascii format.
        content = convertStrAscii(content)
        return ignoreHtmlEntity(content)
    except UnicodeError as e:
        # Best effort: report the problem and fall back to empty content.
        print(e)
    return ""