# things almost work
# [encyclr.git] / dur / summarize.py
# blob c56daddbda5d2bf178f80de0d543b401796d0243
1 import sys
2 import re
3 import pycurl
4 import StringIO
5 from mwlib.uparser import simpleparse
6 import mwlib
7 import xml.dom.minidom
8 from xml.dom.minidom import Node
def lookup(title, maxredirects=5):
    """Fetch *title* from Wikipedia via Special:Export and return a short
    plain-text summary.

    The summary is roughly the first 140 characters of the article text,
    with inline article links rewritten as "@Target_Name " mentions.
    Wiki #REDIRECT pages are followed recursively, up to *maxredirects*
    hops.  Category links seen while scanning are tallied and printed
    (debug output).

    Returns None (after printing a notice) when the article does not
    exist; raises Exception when the redirect chain is too long.
    """
    if maxredirects < 0:
        # BUG FIX: the original message interpolated sys.argv[1], which is
        # wrong whenever lookup() is called with any other title.
        raise Exception("too many redirects (at %s)" % title)

    curl = pycurl.Curl()
    b = StringIO.StringIO()
    try:
        curl.setopt(pycurl.URL,
                    (u'http://en.wikipedia.org/wiki/Special:Export/%s'
                     % title).encode('utf8'))
        curl.setopt(pycurl.WRITEFUNCTION, b.write)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.perform()
    finally:
        # FIX: release the libcurl handle instead of leaking it.
        curl.close()
    xmlcontents = b.getvalue()

    doc = xml.dom.minidom.parseString(xmlcontents)
    if not doc.getElementsByTagName("text"):
        # BUG FIX: report the title actually looked up, not sys.argv[1].
        print("%s not found" % title)
        return
    textnode = doc.getElementsByTagName("text")[0]
    if len(textnode.childNodes) != 0:
        contents = textnode.childNodes[0].nodeValue
        match = re.match(r"#REDIRECT\s*\[\[(.*)\]\]", contents)
        if match:
            # Follow the wiki redirect, consuming one hop of the budget.
            return lookup(match.group(1), maxredirects=maxredirects - 1)
    else:
        contents = textnode.nodeValue

    # Strip {{...}} templates; a second identical pass handles one level
    # of template nesting left behind by the first.
    contents = re.sub(r'\{\{[^\}\{]*\}\}', '', contents)
    contents = re.sub(r'\{\{[^\}\{]*\}\}', '', contents)
    # Drop simple <tag>...</tag> pairs, then any remaining lone tags.
    contents = re.sub(r'<[^><]*>[^<>]*</[^<>]*>', '', contents)
    contents = re.sub(r'<[^><]*>', '', contents)

    parsed = simpleparse(contents)
    categories = {}
    contents = ''
    for node in parsed.allchildren():
        if len(contents) < 140:
            # NOTE(review): exact type checks kept deliberately — mwlib's
            # link classes may subclass one another, so isinstance() could
            # match more than the original did.
            if type(node) == mwlib.parser.nodes.Text:
                contents += node.text
            if type(node) == mwlib.parser.nodes.ArticleLink:
                contents += "@" + re.sub(' ', '_', node.target) + " "
            if type(node) == mwlib.parser.nodes.CategoryLink:
                # "Category:Foo" -> "Foo"; count occurrences.
                link = node.target.split(u':')[1]
                categories.setdefault(link, 0)
                categories[link] += 1

    print(categories)
    return contents
def find_categories(contents):
    """Stub: intended to extract category links from article wikitext.

    Currently unimplemented (returns None); category counting actually
    happens inline in lookup().
    """
    pass
63 #print xmlcontents
def rec(node, deep=0):
    """Debug helper: recursively print the DOM tree under *node*,
    one nodeName per line, indented one space per nesting level.
    """
    for child in node.childNodes:
        # FIX: dropped the dead `child != node` guard — a DOM node never
        # appears in its own childNodes list.
        # Parenthesized single-argument print works under Python 2 and 3.
        print(' ' * deep + child.nodeName)
        rec(child, deep + 1)
70 #rec(doc)
71 #page = sys.argv[1]
72 #print lookup(page)
73 #lookup(page)