From 13c61e7038ae38ab8bd8b5098462037985fee8a7 Mon Sep 17 00:00:00 2001
From: Paul Goins
Date: Fri, 12 Mar 2010 21:41:22 +0900
Subject: [PATCH] Fleshed out cElementTree-based Kanjidic2 parser.

---
 jbparse/jbparse/kanjidic2.py | 170 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 142 insertions(+), 28 deletions(-)

diff --git a/jbparse/jbparse/kanjidic2.py b/jbparse/jbparse/kanjidic2.py
index 103834e..3690b60 100644
--- a/jbparse/jbparse/kanjidic2.py
+++ b/jbparse/jbparse/kanjidic2.py
@@ -46,6 +46,103 @@ def jis_kuten_to_hex(kuten):
     return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
 
 
+class Kanjidic2Node(object):
+
+    def __init__(self, xml_node):
+        self.xml = xml_node
+        self.literal = self._get_literal()
+
+    def _get_literal(self):
+        literal = self.xml.find("literal").text.strip()
+        assert len(literal) == 1, "Literal has more than one character!"
+        return literal
+
+    def _get_grade(self):
+        o = self.xml.find("misc/grade")
+        return int(o.text)
+
+    def _get_freq(self):
+        # By the spec, it seems like multiple freqs are possible??
+        # So... let's get all entries and assert.
+        o = self.xml.findall("misc/freq")
+        if not o:
+            return None
+        assert len(o) == 1, (
+            u"Character %s: Expected 1 freq entry, found %d" %
+            (self._get_literal(), len(o)))
+        return int(o[0].text)
+
+    def _get_jlpt(self):
+        o = self.xml.find("misc/jlpt")
+        return int(o.text)
+
+    def _get_nanori(self):
+        nodes = self.xml.findall("reading_meaning/nanori")
+        if not nodes:
+            return None
+        nanori = [o.text for o in nodes]
+        return nanori
+
+    def _get_attrdict(self, path, attr_name):
+        """Helper: stores elements on path in dict, keyed by attribute."""
+        d = {}
+        nodes = self.xml.findall(path)
+        attrs = set(o.attrib.get(attr_name) for o in nodes)
+        for attr in attrs:
+            d[attr] = [o.text for o in nodes
+                       if o.attrib.get(attr_name) == attr]
+        return d
+
+    def _get_readings(self):
+        """Returns dictionary of reading lists, keyed by type."""
+        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")
+
+    def _get_meanings(self):
+        """Returns dictionary of gloss lists, keyed by language prefix."""
+        meaning_d = self._get_attrdict(
+            "reading_meaning/rmgroup/meaning", "m_lang")
+        if None in meaning_d:
+            meaning_d['en'] = meaning_d[None]
+            del meaning_d[None]
+        return meaning_d
+
+    def __unicode__(self):
+        readings = self._get_readings()
+        meanings = self._get_meanings()
+        nanori = self._get_nanori()
+        grade = self._get_grade()
+        jlpt = self._get_jlpt()
+        freq = self._get_freq()
+
+        pieces = []
+        pieces.append(u"Literal: %s" % self.literal)
+
+        pieces.append(u"On-yomi: %s" % u"、".join(readings['ja_on']))
+        pieces.append(u"Kun-yomi: %s" % u"、".join(readings['ja_kun']))
+        pieces.append(u"Nanori: %s" % u"、".join(nanori))
+
+        pieces.append(u"Korean (Hangul): %s" %
+                      u", ".join(readings['korean_h']))
+        pieces.append(u"Korean (Romanized): %s" %
+                      u", ".join(readings['korean_r']))
+        pieces.append(u"Pinyin: %s" % u", ".join(readings['pinyin']))
+
+        for lang in sorted(meanings):
+            pieces.append(u"Meanings (%s): %s" %
+                          (lang, "; ".join(meanings[lang])))
+
+        if jlpt:
+            pieces.append(u"JLPT grade level: %d" % jlpt)
+        if grade:
+            pieces.append(u"Jouyou grade level: %d" % grade)
+        if freq:
+            pieces.append(u"Newspaper frequency: %d" % freq)
+
+        pieces.append(u"Unicode value: %04X" % ord(self.literal))
+
+        return u"\n".join(pieces)
+
+
 class Parser(object):
 
     def __init__(self, filename, encoding="utf-8"):
@@ -61,7 +158,7 @@ class Parser(object):
             raise Exception("Dictionary file does not exist.")
         self.filename = filename
         self.encoding = encoding
-
+        self.indexed = False
         self.header, self.characters = self.load_via_etree()
 
     def load_via_etree(self):
@@ -73,6 +170,7 @@ class Parser(object):
         f.close()
         nodes = et.getroot().getchildren()
         header, characters = nodes[0], nodes[1:]
+        characters = [Kanjidic2Node(char) for char in characters]
         return header, characters
 
     def get_header(self):
@@ -82,6 +180,41 @@ class Parser(object):
             d[o.tag] = cdata
         return "\n".join("%s: %s" % (k, d[k]) for k in sorted(d))
 
+    def search(self, query):
+        self.create_indices()
+        for u in query:
+            c = self.by_kanji.get(u)
+            if c:
+                yield c
+
+    def create_indices(self):
+        if self.indexed:
+            return
+        print "Creating indices..."
+        self.indexed = True
+
+        self.by_kanji = {}
+        for char in self.characters:
+            literal = char.xml.find("literal").text.strip()
+            self.by_kanji[literal] = char
+        print "Done creating indices!"
+
+
+def encode_or_else(s):
+    if os.name == "nt":
+        charset = "cp932"
+    else:
+        charset = "utf-8"
+    lines = s.split(u"\n")
+    out = []
+    for line in lines:
+        try:
+            val = line.encode(charset)
+            out.append(line)
+        except:
+            pass
+    return u"\n".join(out)
+
 if __name__ == "__main__":
 
     import sys, os
@@ -104,31 +237,12 @@ if __name__ == "__main__":
     else:
         charset = "utf-8"
 
-    print "HEADER:\n{\n%s\n}" % p.get_header()
+    print "HEADER"
+    print "======"
+    print p.get_header()
+    print
     print "%d characters found" % len(p.characters)
-    for i, elem in enumerate(p.characters):
-        print
-        print "TAG:", elem.tag
-        print "TEXT:", "".join((elem.text, elem.tail)).strip()
-        print "CHILDREN:", elem.getchildren()
-        print "LITERAL:", elem.find("literal").text
-        skip_codes = [e for e in elem.findall("query_code/q_code")
-                      if e.attrib['qc_type'] == 'skip']
-        print "SKIP CODES:", ", ".join(e.text for e in skip_codes)
-        if i >= 2: break
-
-    exit(0)
-
-    #for i, kanji in enumerate(kp.search(sys.argv[2].decode(charset))):
-    #    lines = kanji.to_string().split(u'\n')
-    #    def encode_or_else(s):
-    #        try:
-    #            val = s.encode("cp932")
-    #            val = s
-    #        except:
-    #            val = None
-    #        return val
-    #    xlines = map(encode_or_else, lines)
-    #    xlines = [l for l in xlines if l]
-    #    xlines = u"\n".join(list(xlines))
-    #    print _(u"Entry %d:\n%s\n") % (i+1, xlines)
+
+    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
+        kstr = encode_or_else(unicode(kanji))
+        print _(u"Entry %d:\n%s\n") % (i+1, kstr)
-- 
2.11.4.GIT
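
Reviewer note, not part of the patch: a minimal usage sketch of the new
Parser.search() generator and the Kanjidic2Node.__unicode__() formatting added
above. The import path is inferred from the repo layout
(jbparse/jbparse/kanjidic2.py); the dictionary file name and the query string
are hypothetical, and whether the loader handles gzipped files is not shown
here, so a plain kanjidic2.xml is assumed.

    # -*- coding: utf-8 -*-
    # Usage sketch only (Python 2, matching the module); paths are assumptions.
    from jbparse.kanjidic2 import Parser

    p = Parser("kanjidic2.xml")   # raises if the file does not exist
    for i, kanji in enumerate(p.search(u"日本")):
        # search() builds its kanji index on first use and yields one
        # Kanjidic2Node per query character found; unicode() calls
        # __unicode__ to produce the readable summary.
        print "Entry %d:" % (i + 1)
        print unicode(kanji).encode("utf-8")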