4 # Original from http://nedbatchelder.com/code/modules/hyphenate.py
6 u
"""Hyphenation using a pure Python implementation of Frank Liang's algorithm.
8 This module provides a class to hyphenate words.
10 `Hyphenator.split_word(word)` takes a string (the word), and returns a
11 list of parts that can be separated by hyphens.
13 hyphenator = Hyphenator(pattern_file)
14 >>> hyphenator.split_word(u"hyphenation")
15 [u'hy', u'phen', u'ation']
16 >>> hyphenator.hyphenate_word(u"supercalifragilisticexpialidocious", '-')
17 u'su-per-cal-ifrag-ilis-tic-ex-pi-ali-do-cious'
18 >>> hyphenator.hyphenate_word(u"project")
21 version 1: Ned Batchelder, July 2007.
22 This Python code is in the public domain.
24 version 2: Internationalization (external pattern files, Unicode)
# Version of this module; the string looks like "<version number> <release
# date>" (the module docstring's history lists "version 2").
__version__ = '2.0 2014-07-04'
33 def __init__(self
, pattern_file
, exceptions
=''):
35 for pattern
in self
.yield_patterns(pattern_file
):
36 self
._insert
_pattern
(pattern
)
39 for ex
in exceptions
.split():
40 # Convert the hyphenated pattern into a point array for use later.
41 self
.exceptions
[ex
.replace(u
'-', u
'')] = [0] + [ int(h
== u
'-')
42 for h
in re
.split(ur
"[^-]", ex
) ]
44 def yield_patterns(self
, path
, invalid_chars
= '%\\{}', encoding
='utf8'):
46 Yield hyphenation patterns from a file.
Pattern file format: as used by TeX
49 * one pattern per line,
50 * every line containing one of the characters in
51 the string `invalid_chars` (comments and TeX macros) is discarded,
52 * file encoding in argument `encoding` (default 'utf8').
54 # TODO: process OpenOffice hyphenation files?
55 # (Suffix '.dic', encoding specified in first line of file).
58 for c
in invalid_chars
:
62 line
= line
.decode(encoding
).strip()
67 def _insert_pattern(self
, pattern
):
# Convert a pattern like 'a1bc3d4' into a string of chars 'abcd'
# and a list of points [ 0, 1, 0, 3, 4 ].
70 chars
= re
.sub(u
'[0-9]', u
'', pattern
)
71 points
= [ int(d
or 0) for d
in re
.split(u
'[^0-9]', pattern
) ]
73 # Insert the pattern into the tree. Each character finds a dict
74 # another level down in the tree, and leaf nodes have the list of
83 def split_word(self
, word
, lmin
=2, rmin
=2):
84 """ Given a word, returns a list of pieces, broken at the possible
87 # Short words aren't hyphenated.
88 if len(word
) <= (lmin
+ rmin
):
90 # If the word is an exception, get the stored points.
91 if word
.lower() in self
.exceptions
:
92 points
= self
.exceptions
[word
.lower()]
94 work
= '.' + word
.lower() + '.'
95 points
= [0] * (len(work
)+1)
96 for i
in range(len(work
)):
103 for j
in range(len(p
)):
104 points
[i
+j
] = max(points
[i
+j
], p
[j
])
107 # No hyphens in the first `lmin` chars or the last `rmin` ones:
108 for i
in range(lmin
):
110 for i
in range(rmin
):
112 # points[1] = points[2] = points[-2] = points[-3] = 0
114 # Examine the points to build the pieces list.
116 for c
, p
in zip(word
, points
[2:]):
def hyphenate_word(self, word, hyphen=u'\xad', lmin=2, rmin=2):
    """Return `word` with `hyphen` inserted at the possible break points.

    The word is cut into pieces by `split_word(word, lmin, rmin)` and the
    pieces are joined again with `hyphen` between them.

    word   -- the word to hyphenate.
    hyphen -- separator placed between the pieces.  Defaults to the soft
              hyphen (U+00AD), spelled as an escape so that the otherwise
              invisible character cannot get lost when the source is
              edited or copied.
    lmin   -- passed on to `split_word`: no hyphens within the first
              `lmin` characters.
    rmin   -- passed on to `split_word`: no hyphens within the last
              `rmin` characters.
    """
    pieces = self.split_word(word, lmin, rmin)
    return hyphen.join(pieces)
# Name of the default hyphenation pattern file.  The examples in the module
# docstring build their `Hyphenator` from this module-level name.
pattern_file = 'en-US.pat'
# Alternative pattern files, left disabled:
# pattern_file = '../../dehyphn-x/dehyphn-x-2014-06-25.pat'
# pattern_file = '../../dehyphn-x-fugen/dehyphn-x-fugen-2014-07-01.pat'
134 if __name__
== '__main__':
136 usage
= u
'%prog [options] [words to be hyphenated]\n\n' + __doc__
138 parser
= optparse
.OptionParser(usage
=usage
)
139 parser
.add_option('-f', '--pattern-file', dest
='pattern_file',
140 help='Pattern file, Default "en-US.pat"',
142 parser
.add_option('-e', '--exception-file', dest
='exception_file',
143 help='File of hyphenated words (exceptions), '
144 'Default None', default
='')
145 parser
.add_option('-i', '--input-file', dest
='input_file',
146 help='Eingabedatei (ein Wort/Zeile)',
148 parser
.add_option('', '--lmin',
149 help='Unhyphenated characters at start of word, default 2',
151 parser
.add_option('', '--rmin',
152 help='Unhyphenated characters at end of word, default 2',
154 (options
, args
) = parser
.parse_args()
156 lmin
= int(options
.lmin
)
157 rmin
= int(options
.rmin
)
if options.exception_file:
    # Read the user-supplied list of pre-hyphenated words (e.g. "ta-ble"),
    # stored as UTF-8 text.  The path is an attribute of the parsed
    # `options` object; no bare `exception_file` name is bound in the
    # visible script, so opening that name would raise a NameError.
    # The `with` block closes the file even if reading or decoding fails;
    # `ex_file` stays bound afterwards, so a later explicit close is a
    # harmless no-op.
    with open(options.exception_file) as ex_file:
        exceptions = ex_file.read().decode('utf8')
165 if len(args
) == 0: # self test
167 as-so-ciate as-so-ciates dec-li-na-tion oblig-a-tory
168 phil-an-thropic present presents project projects reci-procity
169 re-cog-ni-zance ref-or-ma-tion ret-ri-bu-tion ta-ble
171 hyphenator
= Hyphenator(options
.pattern_file
, exceptions
)
175 words
= [word
.decode('utf8') for word
in args
]
177 print hyphenator
.hyphenate_word(word
, lmin
=lmin
, rmin
=rmin
)
180 doctest
.testmod(verbose
=True)