1 # -*- coding: utf-8 -*-
2 import re
, sys
, argparse
6 Represent a syllable constituent (a single alphabetical character),
7 with other text (mute characters, punctuation, spaces, etc.) attached
sign.get_char() gives the alphabetical syllable constituent.
11 sign.get_text() gives the whole text attached to the sign
13 def __init__(self
, c
):
16 self
._word
_end
= False
17 self
._word
_start
= False
18 self
._forced
_syllable
_end
= False
19 self
._forced
_syllable
_start
= False
def add_text(self, str):
    """Append ``str`` to the text attached to this sign."""
    pieces = (self._text, str)
    self._text = "".join(pieces)
def set_forced_syllable_end(self):
    """Raise the forced-syllable-end flag of this sign."""
    self._forced_syllable_end = True
def forced_syllable_end(self):
    """Return the current value of the forced-syllable-end flag."""
    flag = self._forced_syllable_end
    return flag
def set_forced_syllable_start(self):
    """Raise the forced-syllable-start flag of this sign."""
    self._forced_syllable_start = True
def forced_syllable_start(self):
    """Return the current value of the forced-syllable-start flag."""
    flag = self._forced_syllable_start
    return flag
39 def set_word_end(self
):
43 return self
._word
_start
def set_word_start(self):
    """Raise the word-start flag of this sign."""
    self._word_start = True
54 class SignTokenizer():
Provides a method for building a list of signs from a decorated verse string.
58 sign_tokenizer = SignTokenizer()
59 signs = sign_tokenizer.tokenize("Un ver avec des décorations")
60 signs being a list of Sign objects
62 The decorations can be:
63 - "°" for grouping 'empty' words to 'full' words.
65 En°vain j'ay respecté la°celebre memoire
66 Des°Heros des°siecles passez ;
Can be overridden with word_separator_markers constructor keyword
69 - "*" for marking a mute letter (e.g. a 'h').
71 Et c'est l'*Hyver qui les°rassemble.
Can be overridden with mute_character_marker constructor keyword
74 - "=" for forcing syllable ends, e.g. for marking a diaeresis.
76 Trop *heureux Phrygi=ens, venez icy l'attendre.
Can be overridden with forced_syllable_end_marker constructor keyword
79 - other unused markers: < > { }
Can be overridden with ignored_markers constructor keyword
83 word_separators
= " -",
84 word_separator_markers
= "°",
85 simple_punctuations
= ".,",
86 double_punctuations
= ":;?!",
88 forced_syllable_end_marker
= "=",
89 mute_character_marker
= "*",
90 ignored_markers
= "<>{}",
91 ignored_characters
= "[]()|/~_"
93 self
.word_separators
= word_separators
94 self
.word_separator_markers
= word_separator_markers
95 self
.all_word_separators
= "".join((word_separators
,
96 word_separator_markers
))
97 self
.simple_punctuations
= simple_punctuations
98 self
.double_punctuations
= double_punctuations
99 self
.apostrophes
= apostrophes
100 self
.forced_syllable_end_marker
= forced_syllable_end_marker
101 self
.mute_character_marker
= mute_character_marker
102 self
.ignored_markers
= ignored_markers
103 self
.ignored_characters
= ignored_characters
104 self
.punctuation_re
= re
.compile(
105 " *([{}{}])".format(self
.simple_punctuations
,
106 self
.double_punctuations
))
107 self
.et_re
= re
.compile("([Ee]t)({})".format(
108 "|".join(self
.all_word_separators
)))
112 self
._current
_sign
= None
115 def _add_sign(self
, c
):
116 self
._current
_sign
= Sign(c
.lower())
117 self
._signs
.append(self
._current
_sign
)
118 if self
._prefix
!= "":
119 self
._current
_sign
.add_text(self
._prefix
)
def _add_prefix(self, prefix):
    """Append ``prefix`` to the pending prefix kept on the tokenizer."""
    pieces = (self._prefix, prefix)
    self._prefix = "".join(pieces)
def _add_text(self, text):
    """Forward ``text`` to ``add_text`` of the sign currently being built."""
    target = self._current_sign
    target.add_text(text)
def _set_forced_syllable_end(self):
    """Call ``set_forced_syllable_end`` on the sign currently being built."""
    target = self._current_sign
    target.set_forced_syllable_end()
def _set_word_end(self):
    """Call ``set_word_end`` on the sign currently being built."""
    target = self._current_sign
    target.set_word_end()
134 def tokenize(self
, verse_text
):
136 sign_count
= len(verse_text
)
140 while (i
< sign_count
):
142 punctuation_match
= self
.punctuation_re
.match(verse_text
[i
:])
143 ## Markers: they are not real text
144 # forced syllable end marker
145 if c
== self
.forced_syllable_end_marker
:
146 self
._set
_forced
_syllable
_end
()
148 # mute character marker
149 elif c
== self
.mute_character_marker
:
153 elif c
in self
.ignored_markers
:
157 elif c
in self
.apostrophes
:
161 elif punctuation_match
:
162 punct
= punctuation_match
.group(1)
163 if punct
in self
.double_punctuations
:
164 self
._add
_text
("\u00A0")
165 self
._add
_text
(punct
)
166 i
+= len(punctuation_match
.group(0))
170 elif c
in self
.all_word_separators
:
173 if c
in self
.word_separator_markers
:
179 elif c
in self
.ignored_characters
:
189 m
= word_start
and self
.et_re
.match(verse_text
[i
:])
191 # special case: et -> &
193 self
._add
_text
(m
.group(1))
204 # the last character is at word end and syllable end
206 self
._set
_forced
_syllable
_end
()
207 # set word_start and forced_syllable_start for characters
208 # following a word end or forced_syllable_end
210 at_syllable_start
= True
211 for sign
in self
._signs
:
213 sign
.set_word_start()
214 if at_syllable_start
:
215 sign
.set_forced_syllable_start()
216 at_word_start
= sign
.word_end()
217 at_syllable_start
= sign
.forced_syllable_end()
221 return "".join([c
.get_char() for c
in self
._signs
])
def get_full_verse(self):
    """Return the concatenation of ``get_text()`` over all stored signs."""
    texts = [sign.get_text() for sign in self._signs]
    return "".join(texts)
229 Represents a syllable, consisting in a list of signs.
def add_sign(self, sign):
    """Append a single ``sign`` to the signs held by this syllable."""
    members = self._signs
    members.append(sign)
def add_signs(self, signs):
    """Append every item of ``signs``, in order, to this syllable's signs."""
    members = self._signs
    members.extend(signs)
243 def set_signs(self
, signs
):
247 return "".join([sign
.get_text() for sign
in self
._signs
])
250 return "".join([sign
.get_char() for sign
in self
._signs
])
253 return not self
._signs
def at_word_start(self):
    """Return ``word_start()`` of the first sign of this syllable.

    Assumes the syllable holds at least one sign.
    """
    first_sign = self._signs[0]
    return first_sign.word_start()
def at_word_end(self):
    """Return ``word_end()`` of the last sign of this syllable.

    Assumes the syllable holds at least one sign.
    """
    last_sign = self._signs[-1]
    return last_sign.word_end()
261 def is_feminine(self
):
263 A syllable is feminine iff:
264 - it is placed at word end
265 - it contains exactly one vowel, which is 'e' or 'ë', at the end
266 (with possibly a final s)
269 if self
.at_word_end():
270 chars
= "".join([sign
.get_char() for sign
in self
._signs
])
272 # exact words: ces, mes, ses, tes, les, des, es
274 if (self
.at_word_start()
275 and re
.match("^[cmstld]?es$", chars
)):
279 if char
in "aàâäeëéèêœiìïîoôòuùûüy&":
280 vowels
= "".join((vowels
, char
))
282 # only one vowel: e or ë, and word ends with -e or -es
283 ((vowels
== "e" or vowels
== "ë")
284 and (vowels
== chars
[-1] or (vowels
+ "s") == chars
[-2:]))
285 # two vowels: "que?" or "gues?"
286 or ((vowels
== "ue" or vowels
== "uë")
287 and re
.search("[qg]u[eë]s?", chars
)))
291 class SyllableTokenizer():
Provides a method for building a list of syllables from a list of signs.
295 sign_tokenizer = SignTokenizer()
296 syllable_tokenizer = SyllableTokenizer()
297 signs = sign_tokenizer.tokenize("Un ver avec des décorations")
298 syllables = syllable_tokenizer.tokenize(signs)
299 syllables being a list of Syllable objects
303 other_vowels
= "aàâäéèêœiìïîoôòuùûüy&",
304 consonants_sonority_levels
= { 'liquid' : "lrh",
306 'constrictive' : "çfjsvxz",
307 'occlusive' : "bcdgkpqt" }
309 self
.e_vowels
= e_vowels
310 self
.other_vowels
= other_vowels
311 self
.vowels
= "".join((e_vowels
, other_vowels
))
312 self
.consonants_sonority_levels
= consonants_sonority_levels
313 self
.consonants
= "".join(consonants_sonority_levels
.values())
316 # [something][vowel (no feminine e)]<space>[vowel]
317 'hiatus' : ".[{}][{}]".format(self
.other_vowels
, self
.vowels
),
318 # <word start>s[cçpt][vowel]
319 '^sca' : "s[cçpt][{}]".format(self
.vowels
),
320 # <word start>s[cp][lr][vowel]
321 '^scla' : "s[cp][lr][{}]".format(self
.vowels
),
322 # <word start>ps[vowel]
323 '^psa' : "ps[{}]".format(self
.vowels
),
325 'gna' : "gn[{}]".format(self
.vowels
),
326 # [occlusive bcdgkpqt or constrictive çfjvxz][liquid lrh][vowel]
327 'bla' : "[{}{}][{}][{}]".format(
328 self
.consonants_sonority_levels
['occlusive'],
329 self
.consonants_sonority_levels
['constrictive'].replace("s", ""),
330 self
.consonants_sonority_levels
['liquid'],
333 'thra' : "[tpc]h[rl][{}]".format(self
.vowels
),
335 'ba' : "[{}][{}]".format(self
.consonants
, self
.vowels
),
337 self
.compiled_re
= {}
338 for (key
, string
) in self
.re
.items():
339 self
.compiled_re
[key
] = re
.compile(string
)
340 self
._match
_data
= None
def _match(self, re_key, text):
    """Match ``text`` against the compiled pattern stored under ``re_key``.

    The match object (or None) is remembered in ``self._match_data`` and
    also returned, so the call can be used directly inside a condition.
    """
    pattern = self.compiled_re[re_key]
    found = pattern.match(text)
    self._match_data = found
    return found
def _get_match_data(self):
    """Return the value currently stored in ``self._match_data``."""
    remembered = self._match_data
    return remembered
351 self
._current
_syllable
= None
352 self
._first
_syllable
= Syllable()
354 def _start_new_syllable(self
):
355 if (self
._first
_syllable
and not self
._first
_syllable
.is_empty()):
356 self
._syllables
.append(self
._first
_syllable
)
357 if not (self
._current
_syllable
358 and self
._current
_syllable
.is_empty()):
359 self
._current
_syllable
= Syllable()
360 self
._syllables
.append(self
._current
_syllable
)
361 self
._first
_syllable
= None
363 def _add_sign(self
, text
):
364 if self
._first
_syllable
:
365 self
._first
_syllable
.add_sign(text
)
367 self
._current
_syllable
.add_sign(text
)
def get_syllables(self):
    """Return the syllable list held by the tokenizer (not a copy)."""
    collected = self._syllables
    return collected
372 def tokenize(self
, signs
):
374 verse_text
= "".join([sign
.get_char() for sign
in signs
])
375 sign_count
= len(signs
)
377 while (i
< sign_count
):
378 word_start
= signs
[i
].word_start()
379 # forced syllable ends
380 if (i
> 0 and signs
[i
].forced_syllable_start()):
381 self
._start
_new
_syllable
()
386 and verse_text
[i
] in self
.vowels
387 and signs
[i
].word_end()):
388 self
._add
_sign
(signs
[i
])
390 self
._start
_new
_syllable
()
391 # [something][vowel (no feminine e)]<space>[vowel]
392 elif (self
._match
('hiatus', verse_text
[i
:])
393 and signs
[i
+1].word_end()):
394 self
._add
_sign
(signs
[i
])
395 self
._add
_sign
(signs
[i
+1])
396 self
._start
_new
_syllable
()
397 self
._add
_sign
(signs
[i
+2])
400 # <word start>s[cçpt][vowel]
401 (word_start
and self
._match
('^sca', verse_text
[i
:])
402 and not signs
[i
].word_end())
403 # <word start>s[cp][lr][vowel]
404 or (word_start
and self
._match
('^scla', verse_text
[i
:])
405 and not signs
[i
].word_end()
406 and not signs
[i
+1].word_end())
407 # <word start>ps[vowel]
408 or (word_start
and self
._match
('^psa', verse_text
[i
:]))
410 or (self
._match
('gna', verse_text
[i
:])
411 and not signs
[i
].word_end())
412 # [bcdgkpqtçfjvxz][lrh][vowel]
413 or (self
._match
('bla', verse_text
[i
:])
414 and not signs
[i
].word_end())
416 or (self
._match
('thra', verse_text
[i
:])
417 and not signs
[i
+1].word_end())
419 or self
._match
('ba', verse_text
[i
:])
421 match
= self
._get
_match
_data
().group(0)
422 self
._start
_new
_syllable
()
424 self
._add
_sign
(signs
[i
])
427 self
._add
_sign
(signs
[i
])
429 return self
.get_syllables()
432 class SyllableTokenizerWithWordSeparation(SyllableTokenizer
):
A specialized SyllableTokenizer which prefers syllable
435 breaking between words when possible. For instance:
438 gives: tant / at / ten / du
439 iso: tan / t at / ten / du
441 This is useful when breaking verses for lyrics.
444 sign_tokenizer = SignTokenizer()
445 syllable_tokenizer = SyllableTokenizerWithWordSeparation()
446 signs = sign_tokenizer.tokenize("Un ver avec des décorations")
447 syllables = syllable_tokenizer.tokenize(signs)
448 syllables being a list of Syllable objects
450 def force_word_separation(self
, syllables
= None):
451 syllables
= syllables
or self
._syllables
452 syllable_count
= len(syllables
)
453 prev_syllable
= syllables
[0]
454 for this_syllable
in syllables
[1:]:
455 signs
= this_syllable
.get_signs()
456 if not signs
[0].word_start() and signs
[1:]:
457 tokens_count
= len(signs
)
459 while (not signs
[i
].word_start()
460 or not signs
[i
].get_char() in self
.vowels
):
462 if i
== tokens_count
:
465 # we found a vowel at word start at index i
466 # signs from indices 0 to i-1 go to the previous syllable
467 prev_syllable
.add_signs(signs
[0:i
])
468 this_syllable
.set_signs(signs
[i
:])
469 prev_syllable
= this_syllable
def tokenize(self, signs):
    """Tokenize ``signs`` with the base tokenizer, then regroup by words.

    Returns whatever ``force_word_separation()`` returns when called
    without arguments.
    """
    # The base result is not used directly here: the follow-up call falls
    # back to the syllables kept on the instance.
    SyllableTokenizer.tokenize(self, signs)
    regrouped = self.force_word_separation()
    return regrouped
481 verse = Verse("Un ver avec des décorations")
482 # possible pass sign and syllable tokenizers to split:
484 verse.get_syllables()
485 => ["Un ", "ve", "r a", "vec ", "des ", "dé", "co", "ra", "tions"]
488 def __init__(self
, text
, lineno
= None):
491 self
._lineno
= lineno
def get_syllables(self):
    """Return a list with the ``get_text()`` of each syllable of the verse."""
    texts = []
    for syllable in self._syllables:
        texts.append(syllable.get_text())
    return texts
497 return "".join([syll
.get_text() for syll
in self
._syllables
])
500 sign_tokenizer
= SignTokenizer(),
501 syllable_tokenizer
= SyllableTokenizer()
503 self
._syllables
= syllable_tokenizer
.tokenize(
504 sign_tokenizer
.tokenize(self
._text
))
def get_metric(self):
    """Return the metric of the verse.

    The metric is the number of syllables, minus one when the last
    syllable reports itself as feminine. A verse holding no syllables
    has a metric of 0 (previously this raised IndexError).
    """
    syllables = self._syllables
    if not syllables:
        # Nothing to index: avoid syllables[-1] on an empty sequence.
        return 0
    feminine_ending = 1 if syllables[-1].is_feminine() else 0
    return len(syllables) - feminine_ending
509 def hyphenate(self
, hyphen
= "-", add_space
= False):
512 count
= len(self
._syllables
)
513 for syllable
in self
._syllables
:
514 if (i
> 0) and not syllable
.at_word_start():
515 syllables
.append(hyphen
)
516 text
= syllable
.get_text()
517 syllables
.append(text
)
519 verse_end
= (i
== count
- 1)
520 # if syllable is word end and do not end with a space,
521 # add it (unless at verse end)
523 and syllable
.at_word_end()
524 and text
[-1] != " "):
525 syllables
.append(" ")
527 return "".join(syllables
)
531 A corpus, consisting of verses.
534 To generate LilyPond lyrics (where syllables in a word are separated
538 corpus.add_verse(["premier ver", "second ver..."])
539 corpus.syllabify(syllable_tokenizer = SyllableTokenizerWithWordSeparation())
540 corpus.get_hyphenated_verses(hyphen = " -- ")
541 => ["pre -- mier ver", "se -- cond ver..."]
543 def __init__(self
, filename
= None):
545 self
._filename
= filename
547 def add_verse(self
, verse
, lineno
= None):
549 Add verse (a string) to the corpus.
551 self
._verses
.append(Verse(verse
, lineno
))
553 def get_verses(self
):
557 sign_tokenizer
= SignTokenizer(),
558 syllable_tokenizer
= SyllableTokenizer()):
560 Syllabify all the corpus verses.
562 for verse
in self
._verses
:
563 verse
.syllabify(sign_tokenizer
, syllable_tokenizer
)
565 def get_hyphenated_verses(self
, hyphen
= "-", add_space
= False):
567 Return the hyphenated verses (list of strings) contained in the
569 Corpus.syllabify() is supposed to have been called before.
571 return [verse
.hyphenate(hyphen
, add_space
)
572 for verse
in self
._verses
]
574 class CorpusReader():
576 def read(self
, filename
= "-"):
578 Read a corpus file (or stdin if filename is "-")
579 and produce a Corpus object.
581 file = open(filename
, 'r') if (filename
!= "-") else sys
.stdin
582 corpus
= Corpus(filename
)
591 elif re
.match(r
"^//", line
):
594 # TODO: titling directives
595 elif re
.match(r
"^#", line
):
600 # verse text TAB+ [properties]
601 # where properties can be:
602 # [LB]+ breve/long syllables indicators
606 # other lilypond code
607 # for now, we only keep the verse text itself
608 text
= re
.sub(r
"([^\t]+)\t.*$", r
"\1", line
)
609 corpus
.add_verse(text
, lineno
)
615 Syllabify and print verses.
617 parser
= argparse
.ArgumentParser(
618 description
='Verse syllabication.',
619 formatter_class
=argparse
.ArgumentDefaultsHelpFormatter
)
624 help='verse words to syllabify (if no corpus is provided)')
627 help="Corpus file to syllabify. Use - for reading from stdin")
631 help="String to be used when hyphenating a verse.")
634 default
="{hyphenated_verse}",
635 help="""Python format string for outputing the verse.
636 Possible keywords, to be used between curly braces in the format string,
638 *) hyphenated_verse: the verse after applying hyphenation
639 *) verse: the verse without hyphenation
640 *) metric: the verse metric (a number).""")
641 args
= vars(parser
.parse_args())
645 reader
= CorpusReader()
646 corpus
= reader
.read(args
['corpus'])
648 syllable_tokenizer
= SyllableTokenizerWithWordSeparation())
649 for verse
in corpus
.get_verses():
650 hyphenated_verse
= verse
.hyphenate(hyphen
= args
['hyphen'],
652 print(args
['format'].format(verse
= verse
.get_text(),
653 hyphenated_verse
= hyphenated_verse
,
654 metric
= verse
.get_metric()))
656 # read verse on command line arguments
657 verse
= Verse(" ".join(args
['verse']))
659 syllable_tokenizer
= SyllableTokenizerWithWordSeparation())
660 hyphenated_verse
= verse
.hyphenate(hyphen
= args
['hyphen'], add_space
= True)
661 print(args
['format'].format(verse
= verse
.get_text(),
662 hyphenated_verse
= hyphenated_verse
,
663 metric
= verse
.get_metric()))
667 if __name__
== '__main__':