4 Copyright (c) 2007, Botnode.com (Berlin Brown)
5 http://www.opensource.org/licenses/bsd-license.php
9 Redistribution and use in source and binary forms, with or without modification,
10 are permitted provided that the following conditions are met:
12 * Redistributions of source code must retain the above copyright notice,
13 this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 * Neither the name of the Newspiritcompany.com (Berlin Brown) nor
18 the names of its contributors may be used to endorse or promote
19 products derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
25 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 Save spider database format in big endian format (network format).
39 __author__
= "Berlin Brown"
40 __copyright__
= "Copyright (c) 2006-2008 Berlin Brown"
41 __license__
= "New BSD"
43 from soup
.BeautifulSoup
import *
44 from spiderbot_util
import convertStrAscii
, ignoreHtmlEntity
, PageInfoStats
45 from spiderbot_const
import KEY_HTML_TAGS
, HTML_TAG_MAP
def set_stats_prop(stats_class, prop_attr, val):
    """Set the attribute named *prop_attr* on *stats_class* to *val*.

    stats_class -- object that receives the attribute
    prop_attr   -- attribute name, as a string
    val         -- value to store

    Returns None.
    """
    # setattr goes through the normal attribute protocol, so this also works
    # for objects without a writable instance __dict__ (classes, __slots__)
    # and respects properties; for plain instances the result is the same as
    # writing into __dict__ directly.
    setattr(stats_class, prop_attr, val)
# Gather per-tag information for one page: the markup in `data` is parsed with
# BeautifulSoup and a PageInfoStats object is created for `page_url`.
# NOTE(review): this view of the function appears to be missing lines (the
# statement that opens the handler mapping below, and whatever follows the
# mapping) -- confirm the details against the complete file.
78 def build_page_info(page_url
, data
):
79 """ Build page statistics based on beautiful soup invoke,
80 note: this may reload the data content again in order have a fresh start.
81 See http://www.w3schools.com/tags/default.asp
82 for HTML tag references."""
# Parse the raw page data into a BeautifulSoup tree.
83 soup
= BeautifulSoup(data
)
# Stats holder for this page; the handlers below store values on it.
84 stats
= PageInfoStats(page_url
)
# Visit every entry of KEY_HTML_TAGS and collect the matching elements.
85 for info_tag
in KEY_HTML_TAGS
:
86 tag_arr
= soup
.findAll(info_tag
)
88 # Simple switch statement, change handler depending on tag type
# Each entry pairs one KEY_HTML_TAGS value (indexed by a TAG_* constant) with
# a one-argument handler that stores its argument on `stats` under the quoted
# attribute name via set_stats_prop.
# NOTE(review): what is passed as `x` is not visible here; the `_ct` suffix
# suggests an element count -- TODO confirm.
90 KEY_HTML_TAGS
[TAG_a
]: lambda x
: set_stats_prop(stats
, 'anchor_ct', x
),
91 KEY_HTML_TAGS
[TAG_b
]: lambda x
: set_stats_prop(stats
, 'bold_ct', x
),
92 KEY_HTML_TAGS
[TAG_bq
]: lambda x
: set_stats_prop(stats
, 'block_ct', x
),
93 KEY_HTML_TAGS
[TAG_div
]: lambda x
: set_stats_prop(stats
, 'div_ct', x
),
94 KEY_HTML_TAGS
[TAG_h1
]: lambda x
: set_stats_prop(stats
, 'h1_ct', x
),
95 KEY_HTML_TAGS
[TAG_h2
]: lambda x
: set_stats_prop(stats
, 'h2_ct', x
),
96 KEY_HTML_TAGS
[TAG_i
]: lambda x
: set_stats_prop(stats
, 'italic_ct', x
),
97 KEY_HTML_TAGS
[TAG_img
]: lambda x
: set_stats_prop(stats
, 'img_ct', x
),
98 KEY_HTML_TAGS
[TAG_p
]: lambda x
: set_stats_prop(stats
, 'para_ct', x
),
99 KEY_HTML_TAGS
[TAG_span
]: lambda x
: set_stats_prop(stats
, 'span_ct', x
),
100 KEY_HTML_TAGS
[TAG_strong
]: lambda x
: set_stats_prop(stats
, 'strong_ct', x
),
101 KEY_HTML_TAGS
[TAG_table
]: lambda x
: set_stats_prop(stats
, 'table_ct', x
)
def doc_ignore_content(soup):
    """Strip non-content nodes from *soup* and return the remaining text.

    Comment nodes and SCRIPT/STYLE elements are removed from the tree in
    place.  Of the text nodes that remain, those whose stripped length is
    greater than one character are converted with convertStrAscii and joined
    with newlines.

    soup -- parsed BeautifulSoup document; it is modified by this call

    Returns the joined text as a single string.
    """
    # Detach HTML comments from the tree.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove SCRIPT and STYLE tags.  Extract the tag that was actually found
    # rather than soup.script / soup.style: those are looked up again on
    # every pass and yield None when no such tag is left in the tree, which
    # would make the .extract() call fail.
    for tag_name in ("script", "style"):
        for tag in soup(tag_name):
            tag.extract()

    # Only extract text content; whitespace-only and single-character nodes
    # are skipped.
    txt_lst = [convertStrAscii(n)
               for n in soup.findAll(text=True)
               if len(n.strip()) > 1]
    doc_str = '\n'.join(txt_lst)
    # Hand the cleaned text back to the caller.
    return doc_str
124 def clean_content(content
):
126 #*****************************************
127 # Additional filters and cleanups
128 #*****************************************
129 if content
is not None:
130 # Encode to simple ascii format.
132 content
= convertStrAscii(content
)
133 content
= ignoreHtmlEntity(content
)
135 except UnicodeError, e
: