Adding a bit
[apertium.git] / apertium-en-de / wordmirror / WordMirror.py
blob22f851dba3cef291a38e8c1efea5d625c0ef4704
1 # WordMirror scraper
2 # Stephen Paulger <stephen.paulger@gmail.com>
4 # Usage:
5 # Command line test
6 # python WordMirror.py haus
7 # nb. there is an issue in the command line test output with umlauts
8 # this is not a problem if you access the string objects rather
9 # than printing the result dict directly
11 # Within a program
12 # import WordMirror
13 # result = WordMirror.WordMirrorNounInflection("Ecke")
14 # print result["Gender"]
15 # print result["Indefinite Article"]["Genitive"]
17 from urllib2 import urlopen
def WordMirrorNounInflection(noun, html=None):
    """Scrape the WordMirror inflection page of a German noun into a dict.

    The page is processed line by line.  Two regions are recognised by the
    HTML comments that delimit them:

    * Introductory table (between the two ``introductory table``
      comments): a line ending in ``:`` names a property, and a following
      line containing ``<td>`` supplies its value (everything after the
      ``<td>``) as a plain string.
    * Main table (from the second ``main table`` comment up to the third;
      the opening comment is duplicated on the page): every
      ``tableheading`` line after the first opens a new column, and the
      lines ending in ``</td>`` that follow are stored under that column
      as Nominative, Accusative, Dative and Genitive, in that order.

    Lines that do not fit this layout -- a value with no preceding label,
    a cell outside any column, or more than four cells in one column --
    are skipped instead of raising.

    Args:
        noun: The noun to look up.  It is percent-encoded before being put
            into the query string.
        html: Optional page source to parse instead of downloading it.
            When omitted (the default) the page is fetched from
            wordmirror.com.

    Returns:
        A dict.  Introductory entries map a label to a string; main-table
        entries map a column heading (``<br>`` replaced by a space) to a
        dict keyed by case name, e.g.
        ``result["Indefinite Article"]["Genitive"]``.
    """
    if html is None:
        # Imported here so the file's top-level imports stay untouched.
        from urllib import quote

        # Escape the noun so spaces, reserved characters and non-ASCII
        # bytes cannot corrupt the query string.
        url = ("http://www.wordmirror.com/inflect.php"
               "?lang=en&lp=de_en&q=%s&version=0&pos=noun" % quote(noun))
        html = urlopen(url).read()

    cases = ["Nominative", "Accusative", "Dative", "Genitive"]

    readingIntro = False
    mainCount = 0
    mainTableHeading = 0
    caseNum = 0
    key = None
    inflect = {}

    for line in html.splitlines():
        # The opening and the closing comment both contain this text, so
        # each occurrence simply toggles the state.
        if "introductory table" in line:
            readingIntro = not readingIntro

        if readingIntro and line.endswith(":"):
            # Label: text after the first '>' without the trailing ':'.
            key = line[line.find(">") + 1:-1]
        elif readingIntro and "<td>" in line and key is not None:
            inflect[key] = line[line.find("<td>") + 4:]

        if "main table" in line:
            mainCount += 1

        # Stop once the closing comment of the main table has been seen;
        # until the second opening comment there is nothing more to do.
        if mainCount > 2:
            break
        if mainCount < 2:
            continue

        if "tableheading" in line:
            mainTableHeading += 1
            # The first heading is of no use; every later one opens a
            # column.  The slice drops the 5-character closing tag.
            if mainTableHeading >= 2:
                key = line[line.find(">") + 1:-5].replace("<br>", " ")
                inflect[key] = {}
                caseNum = 0
            continue

        if line.endswith("</td>"):
            column = inflect.get(key)
            # Only store the cell when a column is open and still has a
            # free case slot; anything else is unexpected markup.
            if isinstance(column, dict) and caseNum < len(cases):
                column[cases[caseNum]] = line[line.find(">") + 1:-5]
                caseNum += 1

    return inflect
def getParadigms(dixFile, wordType):
    """Return the paradigm definitions of one word type from a dictionary.

    Selects every ``<pardef>`` that is a child of a ``<pardefs>`` child of
    the root ``<dictionary>`` element and whose ``n`` attribute, taken
    after its first ``__``, equals ``wordType`` -- the same nodes as the
    XPath ``/dictionary/pardefs/pardef[substring-after(@n,"__")="..."]``.

    Args:
        dixFile: Path of the dictionary XML file.  A false value (``None``
            or ``""``) falls back to ``"apertium-en-de.de.dix"``.
        wordType: Text expected after the ``__`` in the ``n`` attribute.

    Returns:
        A list of ``xml.dom.minidom`` element nodes in document order;
        empty when nothing matches.
    """
    from xml.dom import minidom

    # The DOM is walked by hand so that only the standard library is
    # needed; xml.xpath is provided by the separate PyXML package.
    root = minidom.parse(dixFile or "apertium-en-de.de.dix").documentElement
    if root.tagName != "dictionary":
        return []

    matches = []
    for group in root.childNodes:
        if group.nodeType != group.ELEMENT_NODE or group.tagName != "pardefs":
            continue
        for pardef in group.childNodes:
            if (pardef.nodeType != pardef.ELEMENT_NODE
                    or pardef.tagName != "pardef"):
                continue
            # partition() gives "" when "__" is absent (and getAttribute()
            # gives "" for a missing attribute), as substring-after() does.
            if pardef.getAttribute("n").partition("__")[2] == wordType:
                matches.append(pardef)
    return matches
def matchParadigm(dixFile, wordType, paradigm):
    """Print the XML of every paradigm definition of ``wordType``.

    The definitions are looked up with ``getParadigms(dixFile, wordType)``
    and each node's ``toxml()`` output is printed on its own line.
    ``paradigm`` is accepted but not used, and nothing is returned.
    """
    for node in getParadigms(dixFile, wordType):
        # With a single argument this prints exactly what the print
        # statement does.
        print(node.toxml())
if __name__ == "__main__":
    # Command line test: look up the noun given as the first argument and
    # print the resulting dict.
    import sys

    if len(sys.argv) < 2:
        # A missing argument would otherwise surface as a bare IndexError.
        sys.exit("Usage: python WordMirror.py <noun>")

    inflect = WordMirrorNounInflection(sys.argv[1])
    print(inflect)