5 from mwlib
.uparser
import simpleparse
8 from xml
.dom
.minidom
import Node
def lookup(title, maxredirects=5):
    # Fetch the wikitext of a Wikipedia article via Special:Export, follow
    # #REDIRECT pages recursively (at most `maxredirects` hops), strip
    # templates and markup, then walk the parse tree collecting article
    # links and category links.
    #
    # NOTE(review): this chunk is an extraction with interior source lines
    # missing; gap comments below mark where original lines are absent.
    # As reconstructed here the function is syntactically/semantically
    # incomplete — do not assume it runs as shown.
    #
    # :param title: article title (unicode) to look up
    # :param maxredirects: remaining redirect hops before giving up
    if (maxredirects < 0):
        # Redirect chain exhausted.  NOTE(review): message reports
        # sys.argv[1] (the original CLI argument), not the current `title`
        # — presumably intentional, but verify against the caller.
        raise Exception("too many redirects: %s (at %s)" % (sys.argv[1], title))
    # -- gap: original lines 13-14 missing (presumably blank line and/or
    #    the creation of the module-level `curl` handle used below) --
    # Point curl at the Special:Export endpoint for this title; URL must be
    # a byte string, hence the explicit utf8 encode.
    curl.setopt(pycurl.URL, (u'http://en.wikipedia.org/wiki/Special:Export/%s' % (title)).encode('utf8'))
    b = StringIO.StringIO()  # response body accumulates here
    curl.setopt(pycurl.WRITEFUNCTION, b.write)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)  # follow HTTP-level redirects
    curl.setopt(pycurl.MAXREDIRS, 5)
    # -- gap: original line 20 missing — presumably curl.perform(), which
    #    actually executes the request and fills `b` --
    xmlcontents = b.getvalue()
    # -- gap: original line 22 missing --
    # Parse the exported XML document.
    doc = xml.dom.minidom.parseString(xmlcontents)
    if not doc.getElementsByTagName("text"):
        # No <text> element: the article does not exist.
        print "%s not found" % (sys.argv[1])
        # -- gap: original line 26 missing — presumably an early
        #    return/exit, since the code below would raise IndexError --
    textnode = doc.getElementsByTagName("text")[0]
    if len(textnode.childNodes) != 0:
        # Check whether the article body is a redirect page.
        # NOTE(review): pattern is not a raw string; \s and \[ happen to
        # survive as intended, but r'...' would be safer.
        match = re.match("#REDIRECT\s*\[\[(.*)\]\]", textnode.childNodes[0].nodeValue)
        # -- gap: original line 30 missing — presumably `if match:`
        #    guarding the recursion below; as reconstructed the return is
        #    unconditional, which cannot be the intended behavior --
        return lookup(match.group(1), maxredirects=maxredirects-1)
        # -- gap: original line 32 missing — presumably the `else:` (or
        #    dedent) under which the plain-article branch ran --
        contents = textnode.childNodes[0].nodeValue
    # -- gap: original line 34 missing — presumably `else:` of the
    #    childNodes check; this assignment likely belonged to that branch --
    contents = textnode.nodeValue
    # -- gap: original line 36 missing --
    # Strip {{...}} templates.  Applied twice: the second pass removes
    # templates that only became innermost after the first pass (one level
    # of nesting).
    contents = re.sub('\{\{[^\}\{]*\}\}', '', contents)
    contents = re.sub('\{\{[^\}\{]*\}\}', '', contents)
    # Strip simple <tag>...</tag> pairs, then any remaining lone tags.
    contents = re.sub('<[^><]*>[^<>]*</[^<>]*>', '', contents)
    contents = re.sub('<[^><]*>', '', contents)
    # -- gap: original line 41 missing --
    # Parse the cleaned wikitext with mwlib's simple parser.
    parsed = simpleparse(contents)
    # -- gap: original lines 43-44 missing — presumably `contents` is
    #    reset and `categories` initialised before this loop; as shown,
    #    `categories` is never defined in this function --
    for node in parsed.allchildren():
        # NOTE(review): `type(x) == T` comparisons throughout; isinstance
        # would be idiomatic, but kept byte-identical here.
        if len(contents) < 140:
            if type(node) == mwlib.parser.nodes.Text:
                # -- gap: original lines 48-49 missing — presumably
                #    append the text node's caption to `contents`;
                #    syntactically incomplete as reconstructed --
        if type(node) == mwlib.parser.nodes.ArticleLink:
            # Record linked articles as "@Some_Title " tokens (spaces in
            # the target replaced by underscores).
            contents += "@" + re.sub(' ', '_', node.target) + " "
        if type(node) == mwlib.parser.nodes.CategoryLink:
            # Target looks like u"Category:Name"; keep the part after the
            # first colon and seed its counter.
            link = node.target.split(u':')[1]
            categories.setdefault(link, 0)
            # -- gap: original lines 55+ missing — presumably
            #    `categories[link] += 1` and the function's return value --
def find_categories(contents):
    # NOTE(review): this definition runs past the end of the visible chunk
    # and has interior lines missing (original lines 61-63 and 66) —
    # documentation below is best-effort on the visible part only.
    #
    # -- gap: original lines 61-63 missing — presumably parse `contents`
    #    into a DOM document on which `rec` is then invoked --
    def rec(node, deep=0):
        # Recursively walk a DOM subtree, printing each child's nodeName
        # indented by one space per level of depth.
        for child in node.childNodes:
            # -- gap: original line 66 missing — possibly a nodeType
            #    filter (the file imports xml.dom.minidom.Node); confirm --
            print ' ' * deep + child.nodeName
            # (the recursive rec(child, deep+1) call presumably follows,
            #  past the end of this chunk)