From ccaa7b781d739c579b2cf56d4fb2afa70d5e2e0a Mon Sep 17 00:00:00 2001 From: Paul Goins Date: Thu, 16 Jul 2009 02:03:43 +0900 Subject: [PATCH] Finished basic versions of KANJIDIC and KANJIDIC2 parsers. --- parsers/kanjidic.py | 61 ++++++++++++++----------- parsers/kanjidic2.py | 113 ++++++++++++++++++---------------------------- parsers/tests/kanjidic.py | 2 +- 3 files changed, 81 insertions(+), 95 deletions(-) diff --git a/parsers/kanjidic.py b/parsers/kanjidic.py index 68b59fa..08ccfad 100644 --- a/parsers/kanjidic.py +++ b/parsers/kanjidic.py @@ -104,7 +104,6 @@ class KanjidicEntry(object): def __init__(self): # Key info self.literal = None - self.jis = None self.meanings = [] self.kunyomi = [] self.onyomi = [] @@ -118,6 +117,7 @@ class KanjidicEntry(object): self.jlpt = None # Info of low importance for most target users + self.jis = None self.radical = None self.radical_c = None # "Classic" KangXi Zidian radical self.radname = None @@ -130,7 +130,9 @@ class KanjidicEntry(object): self.qcodes = {} # Dictionary codes - # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn, Lnnnn, Onnnn + + # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, + # Knnnn, Lnnnn, Onnnn # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT, DM self.dcodes = {} @@ -243,23 +245,11 @@ class KanjidicEntry(object): lines.append(_(u"Unicode: 0x%04X") % ord(self.literal)) if self.jis: kuten = jis_hex_to_kuten(self.jis) - lines.append(_(u"JIS code: Kuten = %s, Hex = 0x%04X") - % (kuten, self.jis)) + jis_set = u"208" # For now, hard-code it. + lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X") + % (jis_set, kuten, self.jis)) - #self.xref = [] if self.xref: - # FIXME/TODO: Finish this section! - # From KANJIDIC documentation: - # - # Xxxxxxx -- a cross-reference code. An entry of, say, - # XN1234 will mean that the user is referred to the kanji - # with the (unique) Nelson index of 1234. 
XJ0xxxx and - # XJ1xxxx are cross-references to the kanji with the JIS - # hexadecimal code of xxxx. The `0' means the reference is - # to a JIS X 0208 kanji, and the `1' references a JIS X - # 0212 kanji. - # - for ref in self.xref: if ref[0] == 'J': # JIS crossrefs @@ -470,25 +460,44 @@ class KanjidicParser(object): return entry def search(self, query): - if self.use_cache and self.cache: - for char in query: - kanji = self.cache.get(char) - if kanji: yield kanji - else: + """Returns a list of kanji entries matching kanji in the query. + + Note: Previous versions implemented this as a generator. + While I liked that solution, it did not maintain the order of + kanji in the query. Since the KANJIDIC2 parser does this, + I've done it here as well for consistency. + + """ + results = [] + + data = None + if self.use_cache: data = self.cache + + if not data: if len(self.filename) >= 3 and self.filename[-3:] == ".gz": f = gzip.open(self.filename) else: f = open(self.filename, "rb") - data = f.read() + fdata = f.read() f.close() - data = data.decode(self.encoding) - self.data = data.splitlines() + fdata = fdata.decode(self.encoding) + # seld.data is needed by self.get_entry() + self.data = fdata.splitlines() + + data = {} entry = self.get_entry() while entry: if self.use_cache: self.cache[entry.literal] = entry - if entry.literal in query: yield entry + if entry.literal in query: data[entry.literal] = entry entry = self.get_entry() + pass + + for char in query: + kanji = data.get(char) + if kanji: results.append(kanji) + + return results if __name__ == "__main__": import sys, os diff --git a/parsers/kanjidic2.py b/parsers/kanjidic2.py index c26905c..d7b5330 100644 --- a/parsers/kanjidic2.py +++ b/parsers/kanjidic2.py @@ -41,12 +41,17 @@ gettext.install('pyjben', unicode=True) from parsers.kanjidic_common \ import jstring_convert, kanjidic2_key_to_str, qcode_to_desc +def jis_kuten_to_hex(kuten): + """Kuten string to hex conversion""" + pieces = map(int, 
kuten.split(u'-')) + print "DEBUG: kuten: %s, pieces: %s" % (kuten, str(pieces)) + return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20) + class Kanjidic2Entry(object): def __init__(self): # Key info self.literal = None - self.jis = None self.meanings = {} self.ja_kun = [] self.ja_on = [] @@ -60,6 +65,7 @@ class Kanjidic2Entry(object): self.jlpt = None # Info of low importance for most target users + self.cps = [] # JIS codepoints self.radical = None self.radical_c = None # "Classic" KangXi Zidian radical self.radname = None @@ -73,7 +79,8 @@ class Kanjidic2Entry(object): self.qcodes = {} # Dictionary codes - # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn, Lnnnn, Onnnn + # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, + # Knnnn, Lnnnn, Onnnn # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT, DM self.dcodes = {} @@ -180,59 +187,33 @@ class Kanjidic2Entry(object): # "self.unicode" is always present. ;) lines.append(_(u"Unicode: 0x%04X") % ord(self.literal)) - # ***FIXME: The below entries have not been tested since they - # are not yet properly - if self.jis: - def jis_hex_to_kuten(hex_code): - """KANJIDIC2-style kuten string""" - return u"%s-%s" % ( - (((hex_code >> 8) & 0xFF) - 0x20), - ((hex_code & 0xFF) - 0x20)) - - kuten = jis_hex_to_kuten(self.jis) - lines.append(_(u"JIS code: Kuten = %s, Hex = 0x%04X") - % (kuten, self.jis)) + if self.cps: + for jis_set, kuten in self.cps: + # jis_set == "jis###" - we'll just splice the last 3 digits in + hexcode = jis_kuten_to_hex(kuten) + lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X") + % (jis_set[3:], kuten, hexcode)) - #self.xref = [] if self.xref: - # From KANJIDIC documentation: - # - # Xxxxxxx -- a cross-reference code. An entry of, say, - # XN1234 will mean that the user is referred to the kanji - # with the (unique) Nelson index of 1234. XJ0xxxx and - # XJ1xxxx are cross-references to the kanji with the JIS - # hexadecimal code of xxxx. 
The `0' means the reference is - # to a JIS X 0208 kanji, and the `1' references a JIS X - # 0212 kanji. - # - - # For now, just dump to the console. - lines.append(_(u"Crossref codes: %s") % ", ".join(self.xref)) - - # From J-Ben 1: - #/* Crossref codes */ - #if(!k.var_j208.empty()) - #result << "
<li>JIS-208: " << k.var_j208 << "</li>"; - #if(!k.var_j212.empty()) - #result << "<li>JIS-212: " << k.var_j212 << "</li>"; - #if(!k.var_j213.empty()) - #result << "<li>JIS-213: " << k.var_j213 << "</li>"; - #if(!k.var_ucs.empty()) - #result << "<li>Unicode: " << k.var_ucs << "</li>"; - #if(!k.var_deroo.empty()) - #result << "<li>De Roo code: " << k.var_deroo << "</li>"; - #if(!k.var_nelson_c.empty()) - #result << "<li>Modern Reader's Japanese-English Character " - #"Dictionary (Nelson): " << k.var_nelson_c << "</li>"; - #if(!k.var_njecd.empty()) - #result << "<li>New Japanese-English Character Dictionary " - #"(Halpern): " << k.var_njecd << "</li>"; - #if(!k.var_oneill.empty()) - #result << "<li>Japanese Names (O'Neill): " << k.var_oneill - #<< "</li>"; - #if(!k.var_s_h.empty()) - #result << "<li>Spahn/Hadamitzky Kanji Dictionary code: " - #<< k.var_s_h << "</li>";
+ for var_type, code in self.xref: + d = { + 'jis208': u'JIS X 0208', + 'jis212': u'JIS X 0212', + 'jis213': u'JIS X 0213', + 'deroo': u'De Roo', + 'njecd': u'Halpern NJECD', + 's_h': u'Kanji Dictionary (Spahn/Hadamitzky)', + 'nelson_c': u'"Classic" Nelson', + 'oneill': u"Japanese Names (O'Neill)", + 'ucs': u'Unicode hex' + } + s = d.get(var_type, var_type) + if var_type[:3] == u'jis': + hexcode = jis_kuten_to_hex(code) + lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, " + u"Hex = 0x%04X") % (code, hexcode)) + else: + lines.append(_(u"Crossref: %s code: %s") % (s, code)) return u"\n".join(lines) @@ -301,15 +282,6 @@ class KD2SAXHandler(xml.sax.handler.ContentHandler): def characters(self, content): content = content.strip() if content and self.parsing: - # Sanity check: see if the current node type is already - # included under a different full path. - #path = self.get_path() - #self.full_keys.add(path) - # - #keys = [k for k in self.full_keys if k[-(len(node)):] == node] - #if len(keys) != 1: - # print "CHECKME: Node: %s, Keys: %s" % (node, str(keys)) - node, attrs = self.path[-1] # I am exploiting the fact that any given element type can @@ -368,15 +340,21 @@ class KD2SAXHandler(xml.sax.handler.ContentHandler): except ValueError: self.kanji.dcodes[attr] = content - # FIXME/TODO: These still need to be implemented! elif node == u"cp_value": # codepoint/cp_value - pass + cp_type = attrs[u'cp_type'] + if cp_type != u'ucs': + self.kanji.cps.append((cp_type, content)) elif node == u"rad_value": # radical/rad_value - pass + rad_type = attrs[u'rad_type'] + if rad_type == "classical": + self.kanji.radical_c = int(content) + else: # nelson_c + self.kanji.radical = int(content) elif node == u"variant": # misc/variant - pass + var_type = attrs[u'var_type'] + self.kanji.xref.append((var_type, content)) elif node == u"rad_name": # misc/rad_name - pass + self.kanji.radname = content else: # Anything unhandled...
try: path = self.get_path() @@ -384,7 +362,6 @@ class KD2SAXHandler(xml.sax.handler.ContentHandler): % (path, self.get_attr_str(), content) - # Do some stuff based upon the current path and content except UnicodeEncodeError: pass # Can't display code on console; just squelch the output. except Exception, e: diff --git a/parsers/tests/kanjidic.py b/parsers/tests/kanjidic.py index 378a279..76078ce 100644 --- a/parsers/tests/kanjidic.py +++ b/parsers/tests/kanjidic.py @@ -64,7 +64,7 @@ class KanjidicTest(unittest.TestCase): self.assertTrue(second_t <= first_t) def test_no_cache(self): - """KANJIDIC2: Check that parser works without caching.""" + """KANJIDIC: Check that parser works without caching.""" self.kp = kanjidic.KanjidicParser(SRC_NAME, use_cache=False) self.assertFalse(self.kp.cache) -- 2.11.4.GIT