Python-Skript Update.
[wortliste.git] / skripte / python / hyphenation.py
blob50e5c964796ef6433a2b42c860e89f39f2de8749
1 #!/usr/bin/env python
2 # -*- coding: utf8 -*-
4 # Original from http://nedbatchelder.com/code/modules/hyphenate.py
6 u"""Hyphenation using a pure Python implementation of Frank Liang's algorithm.
8 This module provides a class to hyphenate words.
10 `Hyphenator.split_word(word)` takes a string (the word), and returns a
11 list of parts that can be separated by hyphens.
13 hyphenator = Hyphenator(pattern_file)
14 >>> hyphenator.split_word(u"hyphenation")
15 [u'hy', u'phen', u'ation']
16 >>> hyphenator.hyphenate_word(u"supercalifragilisticexpialidocious", '-')
17 u'su-per-cal-ifrag-ilis-tic-ex-pi-ali-do-cious'
18 >>> hyphenator.hyphenate_word(u"project")
19 u'project'
21 version 1: Ned Batchelder, July 2007.
22 This Python code is in the public domain.
24 version 2: Internationalization (external pattern files, Unicode)
25 © 2013 Günter Milde
26 """
28 import re, optparse
30 __version__ = '2.0 2014-07-04'
class Hyphenator:
    """Hyphenate words with Frank Liang's pattern-matching algorithm.

    The patterns are read from a TeX-style pattern file and stored in a
    character trie (`self.tree`).  Words listed as exceptions are looked
    up first and bypass the patterns.
    """

    def __init__(self, pattern_file, exceptions=''):
        """Load the patterns and register the exception words.

        Arguments:

        pattern_file
            path of the pattern file (format: see `yield_patterns`).
        exceptions
            string of whitespace separated words with a ``-`` at every
            permitted break point, e.g. ``u"ta-ble project"``.
        """
        self.tree = {}
        for pattern in self.yield_patterns(pattern_file):
            self._insert_pattern(pattern)

        self.exceptions = {}
        for ex in exceptions.split():
            # `split_word` looks words up by their lower-case form, so the
            # key must be lower case too (otherwise a capitalised exception
            # could never match).
            key = ex.replace(u'-', u'').lower()
            # Convert the hyphenated word into a point array for use later:
            # splitting at every non-hyphen character leaves '-' exactly in
            # the gaps where a break is allowed.
            self.exceptions[key] = [0] + [int(gap == u'-')
                                          for gap in re.split(u'[^-]', ex)]

    def yield_patterns(self, path, invalid_chars = '%\\{}', encoding='utf8'):
        """
        Yield hyphenation patterns from a file.

        Pattern file format: As used by TeX

        * one pattern per line,
        * every line containing one of the characters in
          the string `invalid_chars` (comments and TeX macros) is discarded,
        * file encoding in argument `encoding` (default 'utf8').

        The file is closed when the generator is exhausted or discarded.
        """
        # TODO: process OpenOffice hyphenation files?
        #       (Suffix '.dic', encoding specified in first line of file).

        # Lines are filtered as raw bytes, so discarded lines are never
        # decoded; encode the marker characters once for that comparison.
        markers = [c if isinstance(c, bytes) else c.encode(encoding)
                   for c in invalid_chars]
        with open(path, 'rb') as lines:
            for raw_line in lines:
                if any(marker in raw_line for marker in markers):
                    continue
                line = raw_line.decode(encoding).strip()
                if line:
                    yield line

    def _insert_pattern(self, pattern):
        """Store one pattern in the trie.

        A pattern like 'a1bc3d4' is converted into the string of chars
        'abcd' and the list of points [0, 1, 0, 3, 4] (one value for the
        position before the first char and one after every char).
        """
        chars = re.sub(u'[0-9]', u'', pattern)
        points = [int(d or 0) for d in re.split(u'[^0-9]', pattern)]

        # Insert the pattern into the tree.  Each character finds a dict
        # another level down in the tree, and leaf nodes have the list of
        # points stored under the key `None`.
        node = self.tree
        for c in chars:
            node = node.setdefault(c, {})
        node[None] = points

    def split_word(self, word, lmin=2, rmin=2):
        """Return a list of pieces of `word`, broken at the possible
        hyphenation points.

        Words with at most ``lmin + rmin`` characters are returned
        unsplit.  Exception words use their stored break points as given;
        `lmin` and `rmin` (minimal number of characters before the first
        and after the last break) are applied to pattern results only.
        """
        # Short words aren't hyphenated.
        if len(word) <= (lmin + rmin):
            return [word]

        key = word.lower()
        if key in self.exceptions:
            # If the word is an exception, get the stored points.
            points = self.exceptions[key]
        else:
            # The dots mark the word boundaries in the patterns.
            work = '.' + key + '.'
            points = [0] * (len(work) + 1)
            for i in range(len(work)):
                node = self.tree
                for c in work[i:]:
                    if c not in node:
                        break
                    node = node[c]
                    if None in node:
                        # A pattern ends here: keep the highest value
                        # seen so far for every position it covers.
                        for j, value in enumerate(node[None]):
                            if value > points[i + j]:
                                points[i + j] = value
            # No hyphens in the first `lmin` chars or the last `rmin` ones:
            for i in range(lmin):
                points[i + 1] = 0
            for i in range(rmin):
                points[-2 - i] = 0

        # Examine the points to build the pieces list:
        # an odd value after a character allows a break there.
        pieces = ['']
        for c, p in zip(word, points[2:]):
            pieces[-1] += c
            if p % 2:
                pieces.append('')
        return pieces

    def hyphenate_word(self, word, hyphen=u'\u00ad', lmin=2, rmin=2):
        """Return `word` with (soft-)hyphens at the possible
        hyphenation points.

        `hyphen` is the string inserted at every break; the default is
        the soft hyphen character (U+00AD).
        """
        return hyphen.join(self.split_word(word, lmin, rmin))
# Default pattern file (also used by the example in the module docstring).
pattern_file = 'en-US.pat'
# Alternative pattern files:
# pattern_file = '../../dehyphn-x/dehyphn-x-2014-06-25.pat'
# pattern_file = '../../dehyphn-x-fugen/dehyphn-x-fugen-2014-07-01.pat'


# Command line interface:
# hyphenate the words given as arguments and/or listed in the input file;
# without any words run the doctests of the module as self test.
if __name__ == '__main__':

    usage = u'%prog [options] [words to be hyphenated]\n\n' + __doc__

    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-f', '--pattern-file', dest='pattern_file',
                      help='Pattern file, Default "en-US.pat"',
                      default='en-US.pat')
    parser.add_option('-e', '--exception-file', dest='exception_file',
                      help='File of hyphenated words (exceptions), '
                      'Default None', default='')
    parser.add_option('-i', '--input-file', dest='input_file',
                      help='Eingabedatei (ein Wort/Zeile)',
                      default='')
    # type='int' lets optparse report non-numeric values as usage errors.
    parser.add_option('', '--lmin', type='int',
                      help='Unhyphenated characters at start of word, default 2',
                      default=2)
    parser.add_option('', '--rmin', type='int',
                      help='Unhyphenated characters at end of word, default 2',
                      default=2)
    (options, args) = parser.parse_args()

    lmin = options.lmin
    rmin = options.rmin

    # Words to hyphenate: command line arguments (byte strings are decoded,
    # text strings are used as they are) ...
    words = [word.decode('utf8') if isinstance(word, bytes) else word
             for word in args]
    # ... followed by the words from the input file (one word per line).
    if options.input_file:
        with open(options.input_file, 'rb') as in_file:
            for line in in_file:
                word = line.decode('utf8').strip()
                if word:
                    words.append(word)

    # Without command line words and without input file: self test.
    self_test = not args and not options.input_file

    if options.exception_file:
        with open(options.exception_file, 'rb') as ex_file:
            exceptions = ex_file.read().decode('utf8')
    else:
        exceptions = ''
    if self_test:
        # The doctests in the module docstring rely on these exceptions.
        exceptions = u"""
        as-so-ciate as-so-ciates dec-li-na-tion oblig-a-tory
        phil-an-thropic present presents project projects reci-procity
        re-cog-ni-zance ref-or-ma-tion ret-ri-bu-tion ta-ble
        """

    # `hyphenator` must be a module level name: the doctests use it.
    hyphenator = Hyphenator(options.pattern_file, exceptions)
    del exceptions

    if self_test:
        import doctest
        doctest.testmod(verbose=True)
    else:
        for word in words:
            print(hyphenator.hyphenate_word(word, lmin=lmin, rmin=rmin))