parser_kanjidic.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gettext
   5 gettext.install('pyjben', unicode=True)
   6
   7 # Copied from J-Ben 1.x and modified using Gnome Character Map's
   8 # "Unicode Block" information.
   9 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
  10
  11 def is_hiragana(uc):
  12     # 3040..309F; Hiragana
  13     o = ord(uc)
  14     return o >= 0x3040 and o <= 0x309F
  15
  16 def is_katakana(uc):
  17     # 30A0..30FF; Katakana
  18     # 31F0..31FF; Katakana Phonetic Extensions (Not currently used in J-Ben)
  19     o = ord(uc)
  20     return o >= 0x30A0 and o <= 0x30FF
  21
  22 def is_furigana(uc):
  23     return is_hiragana(uc) or is_katakana(uc)
  24
  25
  26 class KanjidicEntry(object):
  27
  28     def __init__(self):
  29         # Key info
  30         self.literal = None
  31         self.meanings = []
  32         self.kunyomi = []
  33         self.onyomi = []
  34         self.nanori = []
  35
  36         # Secondary info
  37         self.strokes = None
  38         self.strokes_alt = []
  39         self.freq = None
  40         self.grade = None
  41         self.jlpt = None
  42
  43         # Info of low importance for most target users
  44         self.radical = None
  45         self.radical_c = None  # "Classic" KangXi Zidian radical
  46         self.radname = None
  47         self.pinyin = []
  48         self.korean = []
  49
  50         # "Query codes": Pattern-based lookup
  51         # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
  52         # Codes: P, DRnnnn, Inxnn.n, Qnnnn.n
  53         self.skip = []
  54         self.deroo = None
  55         self.sh_desc = None
  56         self.fc = None
  57
  58         # Dictionary codes
  59         # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn, Lnnnn, Onnnn
  60         # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT, DM
  61         self.dcodes = {}
  62
  63         # Dictionary-related metadata
  64         self.xref = []
  65         self.misclass = []
  66
  67         self.unparsed = []
  68
  69     def __unicode__(self):
  70         """Dummy string dumper"""
  71         strs = [self.literal]
  72         for l in [self.kunyomi, self.onyomi, self.nanori, self.meanings]:
  73             strs.extend(l)
  74         if self.radname:
  75             strs.insert(3, self.radname)
  76
  77         return u", ".join(strs)
  78
  79 class ParserState(object):
  80     def __init__(self):
  81         self.t_class = 0
  82
  83 class KanjidicParser(object):
  84
  85     def __init__(self, filename, encoding="EUC-JP"):
  86         f = open(filename, "rb")
  87         data = f.read()
  88         f.close()
  89         data = data.decode(encoding)
  90         self.data = data.splitlines()
  91
  92     def get_entry(self):
  93         line = None
  94         while self.data and (not line or line[0] == u"#"):
  95             line = self.data.pop(0).strip()
  96         return self.parse_line(line)
  97
  98
  99     def _parse_japanese(self, entry, state, data):
 100         if not state.t_class:
 101             # Check hiragana/katakana
 102             for c in data:
 103                 if is_hiragana(c):
 104                     entry.kunyomi.append(data)
 105                     break
 106                 elif is_katakana(c):
 107                     entry.onyomi.append(data)
 108                     break
 109         elif state.t_class == 1:
 110             entry.nanori.append(data)
 111         elif state.t_class == 2:
 112             entry.radname = data
 113
 114     def _parse_info(self, entry, state, data):
 115         try:
 116             c = data[0]
 117             if c == 'U':
 118                 # Unicode value - we alread store the literal as unicode, so let's
 119                 # use this as our encoding sanity check!
 120                 assert ord(entry.literal) == int(data[1:], 16), \
 121                     "Encoding error detected"
 122             elif c == 'B':
 123                 entry.radical = int(data[1:])
 124             elif c == 'C':
 125                 entry.radical_c = int(data[1:])
 126             elif c == 'F':
 127                 entry.freq = int(data[1:])
 128             elif c == 'G':
 129                 entry.grade = int(data[1:])
 130             elif c == 'J':
 131                 entry.jlpt = int(data[1:])
 132             elif c == 'S':
 133                 i = int(data[1:])
 134                 if not entry.strokes:
 135                     entry.strokes = i
 136                 else:
 137                     entry.strokes_alt.append(i)
 138             elif c == 'W':
 139                 entry.korean.append(data[1:])
 140             elif c == 'Y':
 141                 entry.pinyin.append(data[1:])
 142             elif c == 'X':
 143                 entry.xref.append(data[1:])
 144             elif c == 'Z':
 145                 entry.misclass.append(data[1:])
 146             elif c == 'T':
 147                 state.t_class = int(data[1:])
 148             # Below this point is dictionary/query codes.
 149             # Much of this is copied and modified from J-Ben 1's source code.
 150             elif c == 'H':
 151                 # New Japanese-English Character Dictionary (Halpern)
 152                 entry.dcodes["halpern_njecd"] = data[1:]
 153             elif c == 'N':
 154                 # Modern Reader's Japanese-English Character Dictionary (Nelson)
 155                 entry.dcodes["nelson_c"] = data[1:]
 156             elif c == 'V':
 157                 # The New Nelson's Japanese-English Character Dictionary
 158                 entry.dcodes["nelson_n"] = data[1:]
 159             elif c == 'P':
 160                 # SKIP codes.
 161                 # Thanks to changes in permissible SKIP code usage (change to
 162                 # Creative Commons licensing in January 2008), we can now use
 163                 # this without problems.
 164                 entry.skip.append(data[1:]);
 165             elif c == 'I':  # Spahn/Hadamitzky dictionaries
 166                 if data[1] =='N':
 167                     # Kanji & Kana (Spahn, Hadamitzky)
 168                     entry.dcodes["sh_kk"] = data[2:]
 169                 else:
 170                     # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
 171                     entry.sh_desc = data[1:]
 172             elif c == 'Q':
 173                 # Four Corner code
 174                 entry.fc = data[1:]
 175             elif c == 'M':
 176                 if data[1] == 'N':
 177                     # Morohashi Daikanwajiten Index
 178                     #entry.dcodes["moro"].insert(0,"] ps->substr(2));
 179                     pass
 180                 elif data[1] == 'P':
 181                     # Morohashi Daikanwajiten Volume/Page
 182                     #entry.dcodes["moro"] \
 183                     #    .append(1, '/').append(ps->substr(2));
 184                     pass
 185             elif c == 'E':
 186                 # A Guide to Remembering Japanese Characters (Henshall)
 187                 entry.dcodes["henshall"] = data[1:]
 188             elif c == 'K':
 189                 # Gakken Kanji Dictionary ("A New Dictionary of Kanji Usage")
 190                 entry.dcodes["gakken"] = data[1:]
 191             elif c == 'L':
 192                 # Remembering the Kanji (Heisig)
 193                 entry.dcodes["heisig"] = data[1:]
 194             elif c == 'O':
 195                 # Japanese Names (O'Neill)
 196                 entry.dcodes["oneill_names"] = data[1:]
 197             elif c == 'D':
 198                 c = data[1]
 199                 if c == 'B':
 200                     # Japanese for Busy People (AJLT)
 201                     entry.dcodes["busy_people"] = data[2:]
 202                 elif c == 'C':
 203                     # The Kanji Way to Japanese Language Power (Crowley)
 204                     entry.dcodes["crowley"] = int(data[2:])
 205                 elif c == 'F':
 206                     # Japanese Kanji Flashcards (White Rabbit Press)
 207                     entry.dcodes["jf_cards"] = int(data[2:])
 208                 elif c == 'G':
 209                     # Kodansha Compact Kanji Guide
 210                     entry.dcodes["kodansha_compact"] = int(data[2:])
 211                 elif c == 'H':
 212                     # A Guide To Reading and Writing Japanese (Henshall)
 213                     entry.dcodes["henshall3"] = int(data[2:])
 214                 elif c == 'J':
 215                     # Kanji in Context (Nishiguchi and Kono)
 216                     entry.dcodes["kanji_in_context"] = int(data[2:])
 217                 elif c == 'K':
 218                     # Kodansha Kanji Learner's Dictionary (Halpern)
 219                     entry.dcodes["halpern_kkld"] = int(data[2:])
 220                 elif c == 'O':
 221                     # Essential Kanji (O'Neill)
 222                     entry.dcodes["oneill_kk"] = int(data[2:])
 223                 elif c == 'R':
 224                     # Query Code: 2001 Kanji (De Roo)
 225                     entry.deroo = int(data[2:])
 226                 elif c == 'S':
 227                     # A Guide to Reading and Writing Japanese (Sakade)
 228                     entry.dcodes["sakade"] = int(data[2:])
 229                 elif c == 'T':
 230                     # Tuttle Kanji Cards (Kask)
 231                     entry.dcodes["tutt_cards"] = int(data[2:])
 232                 elif c == 'M':
 233                     # Yves Maniette's French adaption of Heisig
 234                     entry.dcodes["maniette"] = int(data[2:])
 235                 else:
 236                     entry.unparsed.append(data)
 237             else:
 238                 entry.unparsed.append(data)
 239         except:
 240             entry.unparsed.append(data)
 241
 242     def parse_line(self, line):
 243         if not line:
 244             return None
 245         entry = KanjidicEntry()
 246         state = ParserState()  # Holds "t class"
 247
 248         # First 2 fields are always the same
 249         pieces = line.split(None, 2)
 250         entry.literal = pieces.pop(0)
 251         entry.jis = int(pieces.pop(0), 16)
 252         misc = pieces.pop()
 253
 254         # Parse the remainder
 255         si = ei = 0
 256         while si < len(misc):
 257             c = misc[si]
 258             i = ord(c)
 259             if c == u' ':
 260                 si += 1
 261                 continue
 262             if i > 0xFF or c in (u'-', u'.'):
 263                 # Parse Japanese
 264                 ei = misc.find(u' ', si+1)
 265                 if ei == -1:
 266                     ei = len(misc) + 1
 267                 sub = misc[si:ei]
 268
 269                 self._parse_japanese(entry, state, sub)
 270             elif c == u'{':
 271                 # Parse Translation
 272                 si += 1  # Move si inside of {
 273                 ei = misc.find(u'}', si+1)
 274                 if ei == -1:
 275                     ei = len(misc) + 1
 276                 sub = misc[si:ei]
 277                 ei += 1  # Move ei past }
 278
 279                 entry.meanings.append(sub)
 280             else:
 281                 # Parse info field
 282                 ei = misc.find(u' ', si+1)
 283                 if ei == -1:
 284                     ei = len(misc) + 1
 285                 sub = misc[si:ei]
 286
 287                 self._parse_info(entry, state, sub)
 288
 289             si = ei + 1
 290
 291         return entry
 292
 293
 294 if __name__ == "__main__":
 295     import sys
 296
 297     if len(sys.argv) < 2:
 298         print _("Please specify a dictionary file.")
 299         exit(-1)
 300     try:
 301         kp = KanjidicParser(sys.argv[1])
 302     except Exception, e:
 303         print _("Could not create KanjidicParser: %s") % str(e)
 304         exit(-1)
 305
 306     err_count = 0
 307     entry = kp.get_entry()
 308     while entry:
 309         try:
 310             if entry.unparsed:
 311                 lines = []
 312                 lines.append(_(u"[%s] Unparsed: [%s]")
 313                              % (entry.literal, ", ".join(entry.unparsed)))
 314                 print u"\n".join(lines)
 315         except UnicodeEncodeError, e:
 316             err_count += 1
 317         entry = kp.get_entry()
 318     if err_count:
 319         print _("Warning: could not print %d entries, since they could not be "
 320                 "properly displayed on your terminal.") % err_count