apertium-en-de/wordmirror/WordMirror.py

   1 # WordMirror scraper
   2 # Stephen Paulger <stephen.paulger@gmail.com>
   3 #
   4 # Usage:
   5 # Command line test
   6 # python WordMirror.py haus
   7 # nb. there is an issue in the command line test output with umlauts
   8 #     this is not a problem if you access the string objects rather
   9 #     than printing the result dict directly
  10 #
  11 # Within a program
  12 # import WordMirror
  13 # result = WordMirror.WordMirrorNounInflection("Ecke")
  14 # print result["Gender"]
  15 # print result["Indefinite Article"]["Genitive"]
  16
  17 from urllib2 import urlopen
  18
  19 def WordMirrorNounInflection(noun):
  20
  21         url = "http://www.wordmirror.com/inflect.php?lang=en&lp=de_en&q=%s&version=0&pos=noun" % (noun)
  22
  23         webdoc = urlopen(url).read()
  24         doclines = webdoc.splitlines()
  25
  26         readingIntro = False
  27         readingMain = False
  28
  29         mainCount = 0
  30         mainTableHeading = 0
  31         caseNum = 0
  32
  33         inflect = {}
  34
  35         for line in doclines:
  36                 # Introduction begins with <!-- introductory table -->
  37                 # and ends with <!-- end of introductory table -->
  38                 if line.find("introductory table") >= 0:
  39                         readingIntro = not readingIntro
  40
  41                 if readingIntro and line.endswith(":"):
  42                         key = line[line.find(">")+1:-1]
  43                 elif readingIntro and line.find("<td>") >= 0:
  44                         inflect[key] = line[line.find("<td>")+4:]
  45
  46                 # Main table begins with <!-- main table -->
  47                 # and ends with <!-- end of main table -->
  48                 # except the beginning line is duplicated so we must ignore the first.
  49                 if line.find("main table") >= 0:
  50                         mainCount += 1
  51
  52                 # Exit the loop after the main table is read
  53                 if mainCount > 2:
  54                         break
  55                 elif mainCount <2:
  56                         continue
  57
  58                 # We read the table headings, the first one isn't any use to us
  59                 # so we skip it
  60                 if line.find("tableheading") >= 0:
  61                         mainTableHeading += 1
  62                         if mainTableHeading < 2:
  63                                 continue
  64
  65                         # If we got here, then the table heading is one we're
  66                         # interested in
  67                         key = line[line.find(">")+1:-5].replace("<br>"," ")
  68                         inflect[key] = {}
  69                         caseNum = 0
  70                         continue
  71
  72                 cases = ["Nominative", "Accusative", "Dative", "Genitive"]
  73
  74                 if line.endswith("</td>"):
  75                         inflect[key][cases[caseNum]] = line[line.find(">")+1:-5]
  76                         caseNum +=1
  77
  78         return inflect
  79
  80 def getParadigms(dixFile, wordType):
  81         from xml.dom import minidom
  82         from xml import xpath
  83
  84         doc = minidom.parse("apertium-en-de.de.dix").documentElement
  85         xpath = '/dictionary/pardefs/pardef[substring-after(@n,"__")="%s"]' % (wordType)
  86         return xpath.Evaluate(xpath, doc)
  87
  88 def matchParadigm(dixFile, wordType, paradigm):
  89         pardefs = getParadigms(dixFile, wordType)
  90         for pardef in pardefs:
  91                 print pardef.toxml()
  92
  93
  94 if __name__ == "__main__":
  95         import sys
  96         inflect = WordMirrorNounInflection(sys.argv[1])
  97         print inflect
  98