# things almost work
# [encyclr.git] / dur / summarize.py
# blob c56daddbda5d2bf178f80de0d543b401796d0243
1 import sys
2 import re
3 import pycurl
4 import StringIO
5 from mwlib.uparser import simpleparse
6 import mwlib
7 import xml.dom.minidom
8 from xml.dom.minidom import Node
def lookup(title, maxredirects=5):
    """Fetch *title* from Wikipedia via Special:Export and return a short
    plain-text summary.

    The summary is roughly the first 140 characters of the article text,
    with inline article links rewritten as "@Target_Name " mentions.
    Wiki #REDIRECT pages are followed recursively, up to *maxredirects*
    hops.  Category links seen while scanning are tallied and printed
    (debug output).

    Returns None (after printing a notice) when the article does not
    exist; raises Exception when the redirect chain is too long.
    """
    if maxredirects < 0:
        # BUG FIX: the original message interpolated sys.argv[1], which is
        # wrong whenever lookup() is called with any other title.
        raise Exception("too many redirects (at %s)" % title)

    curl = pycurl.Curl()
    b = StringIO.StringIO()
    try:
        curl.setopt(pycurl.URL,
                    (u'http://en.wikipedia.org/wiki/Special:Export/%s'
                     % title).encode('utf8'))
        curl.setopt(pycurl.WRITEFUNCTION, b.write)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.perform()
    finally:
        # FIX: release the libcurl handle instead of leaking it.
        curl.close()
    xmlcontents = b.getvalue()

    doc = xml.dom.minidom.parseString(xmlcontents)
    if not doc.getElementsByTagName("text"):
        # BUG FIX: report the title actually looked up, not sys.argv[1].
        print("%s not found" % title)
        return
    textnode = doc.getElementsByTagName("text")[0]
    if len(textnode.childNodes) != 0:
        contents = textnode.childNodes[0].nodeValue
        match = re.match(r"#REDIRECT\s*\[\[(.*)\]\]", contents)
        if match:
            # Follow the wiki redirect, consuming one hop of the budget.
            return lookup(match.group(1), maxredirects=maxredirects - 1)
    else:
        contents = textnode.nodeValue

    # Strip {{...}} templates; a second identical pass handles one level
    # of template nesting left behind by the first.
    contents = re.sub(r'\{\{[^\}\{]*\}\}', '', contents)
    contents = re.sub(r'\{\{[^\}\{]*\}\}', '', contents)
    # Drop simple <tag>...</tag> pairs, then any remaining lone tags.
    contents = re.sub(r'<[^><]*>[^<>]*</[^<>]*>', '', contents)
    contents = re.sub(r'<[^><]*>', '', contents)

    parsed = simpleparse(contents)
    categories = {}
    contents = ''
    for node in parsed.allchildren():
        if len(contents) < 140:
            # NOTE(review): exact type checks kept deliberately — mwlib's
            # link classes may subclass one another, so isinstance() could
            # match more than the original did.
            if type(node) == mwlib.parser.nodes.Text:
                contents += node.text
            if type(node) == mwlib.parser.nodes.ArticleLink:
                contents += "@" + re.sub(' ', '_', node.target) + " "
            if type(node) == mwlib.parser.nodes.CategoryLink:
                # "Category:Foo" -> "Foo"; count occurrences.
                link = node.target.split(u':')[1]
                categories.setdefault(link, 0)
                categories[link] += 1

    print(categories)
    return contents
def find_categories(contents):
    """Stub: intended to extract category links from article wikitext.

    Currently unimplemented (returns None); category counting actually
    happens inline in lookup().
    """
    pass
63 #print xmlcontents
def rec(node, deep=0):
    """Debug helper: recursively print the DOM tree under *node*,
    one nodeName per line, indented one space per nesting level.
    """
    for child in node.childNodes:
        # FIX: dropped the dead `child != node` guard — a DOM node never
        # appears in its own childNodes list.
        # Parenthesized single-argument print works under Python 2 and 3.
        print(' ' * deep + child.nodeName)
        rec(child, deep + 1)
70 #rec(doc)
71 #page = sys.argv[1]
72 #print lookup(page)
73 #lookup(page)