botlistprojects/botspider/spider/lib/python/content/spiderbot_content.py

   1 """
   2 File: spiderbot_content.py
   3
   4 Copyright (c) 2007, Botnode.com (Berlin Brown)
   5 http://www.opensource.org/licenses/bsd-license.php
   6
   7 All rights reserved.
   8
   9 Redistribution and use in source and binary forms, with or without modification,
  10 are permitted provided that the following conditions are met:
  11
  12     * Redistributions of source code must retain the above copyright notice,
  13     this list of conditions and the following disclaimer.
  14     * Redistributions in binary form must reproduce the above copyright notice,
  15     this list of conditions and the following disclaimer in the documentation
  16     and/or other materials provided with the distribution.
  17     * Neither the name of the Newspiritcompany.com (Berlin Brown) nor
  18     the names of its contributors may be used to endorse or promote
  19     products derived from this software without specific prior written permission.
  20
  21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  25 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32
  33 Description:
  34
  35 Build per-page HTML tag statistics and extract cleaned text content from parsed pages.
  36
  37 """
  38
# Module metadata: author, copyright notice and license name.
__author__ = "Berlin Brown"
__copyright__ = "Copyright (c) 2006-2008 Berlin Brown"
__license__ = "New BSD"
  42
  43 from soup.BeautifulSoup import *
  44 from spiderbot_util import convertStrAscii, ignoreHtmlEntity, PageInfoStats
  45 from spiderbot_const import KEY_HTML_TAGS, HTML_TAG_MAP
  46
  47 TAG_a = 0
  48 TAG_b = 1
  49 TAG_bq = 2
  50 TAG_div = 3
  51 TAG_h1 = 4
  52 TAG_h2 = 5
  53 TAG_i = 6
  54 TAG_img = 7
  55 TAG_p = 8
  56 TAG_span = 9
  57 TAG_strong = 10
  58 TAG_table = 11
  59
  60 KEY_HTML_TAGS = [
  61         "a",
  62         "b",
  63         "blockquote",
  64         "div",
  65         "h1",
  66         "h2",
  67         "i",
  68         "img",
  69         "p",
  70         "span",
  71         "strong",
  72         "table",
  73 ];
  74
  75 def set_stats_prop(stats_class, prop_attr, val):
  76         stats_class.__dict__[prop_attr] = val
  77
  78 def build_page_info(page_url, data):
  79         """ Build page statistics based on beautiful soup invoke,
  80         note: this may reload the data content again in order have a fresh start.
  81         See http://www.w3schools.com/tags/default.asp
  82         for HTML tag references."""
  83         soup = BeautifulSoup(data)
  84         stats = PageInfoStats(page_url)
  85         for info_tag in KEY_HTML_TAGS:
  86                 tag_arr = soup.findAll(info_tag)
  87                 n = len(tag_arr)
  88                 # Simple switch statement, change handler depending on tag type
  89                 page_info_switch = {
  90                         KEY_HTML_TAGS[TAG_a]: lambda x: set_stats_prop(stats, 'anchor_ct', x),
  91                         KEY_HTML_TAGS[TAG_b]: lambda x: set_stats_prop(stats, 'bold_ct', x),
  92                         KEY_HTML_TAGS[TAG_bq]: lambda x: set_stats_prop(stats, 'block_ct', x),
  93                         KEY_HTML_TAGS[TAG_div]: lambda x: set_stats_prop(stats, 'div_ct', x),
  94                         KEY_HTML_TAGS[TAG_h1]: lambda x: set_stats_prop(stats, 'h1_ct', x),
  95                         KEY_HTML_TAGS[TAG_h2]: lambda x: set_stats_prop(stats, 'h2_ct', x),
  96                         KEY_HTML_TAGS[TAG_i]: lambda x: set_stats_prop(stats, 'italic_ct', x),
  97                         KEY_HTML_TAGS[TAG_img]: lambda x: set_stats_prop(stats, 'img_ct', x),
  98                         KEY_HTML_TAGS[TAG_p]: lambda x: set_stats_prop(stats, 'para_ct', x),
  99                         KEY_HTML_TAGS[TAG_span]: lambda x: set_stats_prop(stats, 'span_ct', x),
 100                         KEY_HTML_TAGS[TAG_strong]: lambda x: set_stats_prop(stats, 'strong_ct', x),
 101                         KEY_HTML_TAGS[TAG_table]: lambda x: set_stats_prop(stats, 'table_ct', x)
 102                         } [info_tag](n)
 103         return stats
 104
 105 def doc_ignore_content(soup):
 106         """ With beautiful soup's api, ignore content
 107         we are not interested in like comments"""
 108
 109         # Attempt to extract script data
 110         strip_invalids = soup.findAll(text=lambda text:isinstance(text, Comment))
 111         [comment.extract() for comment in strip_invalids]
 112
 113         # Remove SCRIPT and STYLE tags.
 114         [soup.script.extract() for script in soup("script")]
 115         [soup.style.extract() for style in soup("style")]
 116
 117         # Only extract text content.
 118         txt_lst = soup.findAll(text=True)
 119         txt_lst = [ convertStrAscii(n) \
 120                                 for n in txt_lst if len(n.strip()) > 1 ]
 121         doc_str = '\n'.join(txt_lst)
 122         return doc_str
 123
 124 def clean_content(content):
 125
 126         #*****************************************
 127         # Additional filters and cleanups
 128         #*****************************************
 129         if content is not None:
 130                 # Encode to simple ascii format.
 131                 try:
 132                         content = convertStrAscii(content)
 133                         content = ignoreHtmlEntity(content)
 134                         return content
 135                 except UnicodeError, e:
 136                         print e
 137
 138         return ""