# Armide : acte 5 scène 2 [1/3]
# [nenuvar.git] / scripts / syllabify.py
# blob 749773749df360c12ebc2f482520174e8c88fbbd
# -*- coding: utf-8 -*-
import argparse
import re
import sys
class Sign():
    """
    One syllable constituent: a single alphabetical character, together
    with the other text (mute characters, punctuation, spaces, etc.)
    attached to it.

    sign.get_char() gives the alphabetical syllable constituent.
    sign.get_text() gives the whole text attached to the sign.

    A sign also carries four boolean flags (word start, word end, forced
    syllable start, forced syllable end).  They are all False on creation
    and can only be switched on, through the matching set_*() method.
    """

    def __init__(self, c):
        self._sign = c
        self._text = ""
        self._word_start = self._word_end = False
        self._forced_syllable_start = self._forced_syllable_end = False

    # --- constituent character and attached text ---

    def get_char(self):
        return self._sign

    def get_text(self):
        return self._text

    def add_text(self, str):
        # The parameter name shadows the builtin; it is kept as is so that
        # keyword callers keep working.
        self._text += str

    # --- word boundaries ---

    def word_start(self):
        return self._word_start

    def set_word_start(self):
        self._word_start = True

    def word_end(self):
        return self._word_end

    def set_word_end(self):
        self._word_end = True

    # --- forced syllable boundaries ---

    def forced_syllable_start(self):
        return self._forced_syllable_start

    def set_forced_syllable_start(self):
        self._forced_syllable_start = True

    def forced_syllable_end(self):
        return self._forced_syllable_end

    def set_forced_syllable_end(self):
        self._forced_syllable_end = True
class SignTokenizer():
    """
    Provides a method for building a list of signs from a decorated verse
    string.

    Usage:
      sign_tokenizer = SignTokenizer()
      signs = sign_tokenizer.tokenize("Un ver avec des décorations")
    signs being a list of Sign objects

    The decorations can be:
    - "°" for grouping 'empty' words to 'full' words.
      Example:
        En°vain j'ay respecté la°celebre memoire
        Des°Heros des°siecles passez ;
      Can be overridden with word_separator_markers constructor keyword

    - "*" for marking a mute letter (e.g. a 'h').
      Example:
        Et c'est l'*Hyver qui les°rassemble.
      Can be overridden with mute_character_marker constructor keyword

    - "=" for forcing syllable ends, e.g. for marking a diaeresis.
      Example:
        Trop *heureux Phrygi=ens, venez icy l'attendre.
      Can be overridden with forced_syllable_end_marker constructor keyword

    - other unused markers: < > { }
      Can be overridden with ignored_markers constructor keyword

    Text met before the first sign of a verse (leading punctuation,
    spaces, ignored characters...) is kept and attached to the first
    sign.  A verse without any sign tokenizes to an empty list.
    """

    def __init__(self,
                 word_separators=" -",
                 word_separator_markers="°",
                 simple_punctuations=".,",
                 double_punctuations=":;?!",
                 apostrophes="'’",
                 forced_syllable_end_marker="=",
                 mute_character_marker="*",
                 ignored_markers="<>{}",
                 ignored_characters="[]()|/~_"
                 ):
        self.word_separators = word_separators
        self.word_separator_markers = word_separator_markers
        self.all_word_separators = "".join((word_separators,
                                            word_separator_markers))
        self.simple_punctuations = simple_punctuations
        self.double_punctuations = double_punctuations
        self.apostrophes = apostrophes
        self.forced_syllable_end_marker = forced_syllable_end_marker
        self.mute_character_marker = mute_character_marker
        self.ignored_markers = ignored_markers
        self.ignored_characters = ignored_characters
        # The configurable characters are escaped before being put in a
        # pattern, so that e.g. "]", "-" or "|" can be used safely.
        punctuations = "".join((simple_punctuations, double_punctuations))
        if punctuations:
            # optional spaces followed by one punctuation character
            punctuation_pattern = " *([{}])".format(re.escape(punctuations))
        else:
            # no punctuation configured: a pattern that never matches
            punctuation_pattern = "(?!)"
        self.punctuation_re = re.compile(punctuation_pattern)
        # "et" / "Et" immediately followed by a word separator
        self.et_re = re.compile("([Ee]t)({})".format(
            "|".join(re.escape(sep) for sep in self.all_word_separators)))
        self._reset()

    def _reset(self):
        self._prefix = ""
        self._current_sign = None
        self._signs = []

    def _add_sign(self, c):
        """Start a new sign for character c (lower-cased); any pending
        prefix text becomes the beginning of its text."""
        self._current_sign = Sign(c.lower())
        self._signs.append(self._current_sign)
        if self._prefix != "":
            self._current_sign.add_text(self._prefix)
            self._prefix = ""

    def _add_prefix(self, prefix):
        """Store text to be attached to the next sign."""
        self._prefix = "".join((self._prefix, prefix))

    def _add_text(self, text):
        """Attach text to the current sign, or keep it for the first
        sign when no sign has been created yet."""
        if self._current_sign is None:
            self._add_prefix(text)
        else:
            self._current_sign.add_text(text)

    def _set_forced_syllable_end(self):
        # nothing to mark before the first sign
        if self._current_sign is not None:
            self._current_sign.set_forced_syllable_end()

    def _set_word_end(self):
        # nothing to mark before the first sign
        if self._current_sign is not None:
            self._current_sign.set_word_end()

    def tokenize(self, verse_text):
        """
        Split verse_text (a decorated verse string) into Sign objects and
        return them as a list.  The list is also kept for get_chars() and
        get_full_verse().
        """
        self._reset()
        char_count = len(verse_text)
        i = 0
        mute_next = False
        word_start = True
        while i < char_count:
            c = verse_text[i]
            # matching at position i avoids copying the rest of the verse
            punctuation_match = self.punctuation_re.match(verse_text, i)
            ## Markers: they are not real text
            # forced syllable end marker
            if c == self.forced_syllable_end_marker:
                self._set_forced_syllable_end()
                i += 1
            # mute character marker
            elif c == self.mute_character_marker:
                i += 1
                mute_next = True
            # ignored markers
            elif c in self.ignored_markers:
                i += 1
            ## Actual text
            # apostrophe (always rendered as a typographic apostrophe)
            elif c in self.apostrophes:
                self._add_text("’")
                i += 1
            # punctuation
            elif punctuation_match:
                punct = punctuation_match.group(1)
                if punct in self.double_punctuations:
                    # double punctuations get a no-break space before them
                    self._add_text("\u00A0")
                self._add_text(punct)
                # skip the punctuation and the spaces found before it
                i += len(punctuation_match.group(0))
                self._set_word_end()
                word_start = True
            # word separator
            elif c in self.all_word_separators:
                self._set_word_end()
                word_start = True
                if c in self.word_separator_markers:
                    # the marker itself is rendered as a plain space
                    self._add_text(" ")
                else:
                    self._add_text(c)
                i += 1
            # ignored characters
            elif c in self.ignored_characters:
                self._add_text(c)
                i += 1
            # consonant or vowel
            else:
                if mute_next:
                    # mute character: text only, attached to the next sign
                    self._add_prefix(c)
                    mute_next = False
                    i += 1
                else:
                    m = word_start and self.et_re.match(verse_text, i)
                    if m:
                        # special case: et -> &
                        self._add_sign("&")
                        self._add_text(m.group(1))
                        self._add_text(" ")
                        self._set_word_end()
                        word_start = True
                        i += len(m.group(0))
                    else:
                        # consonant or vowel
                        self._add_sign(c)
                        self._add_text(c)
                        word_start = False
                        i += 1
        # mute characters left after the last sign would otherwise be lost
        if self._prefix != "" and self._current_sign is not None:
            self._current_sign.add_text(self._prefix)
            self._prefix = ""
        # the last character is at word end and syllable end
        self._set_word_end()
        self._set_forced_syllable_end()
        # set word_start and forced_syllable_start for characters
        # following a word end or forced_syllable_end
        at_word_start = True
        at_syllable_start = True
        for sign in self._signs:
            if at_word_start:
                sign.set_word_start()
            if at_syllable_start:
                sign.set_forced_syllable_start()
            at_word_start = sign.word_end()
            at_syllable_start = sign.forced_syllable_end()
        return self._signs

    def get_chars(self):
        """Return the sign characters of the last tokenized verse."""
        return "".join([c.get_char() for c in self._signs])

    def get_full_verse(self):
        """Return the whole text of the last tokenized verse."""
        return "".join([c.get_text() for c in self._signs])
class Syllable():
    """
    Represents a syllable, consisting in a list of signs.

    The signs are expected to provide get_char(), get_text(),
    word_start() and word_end().
    """

    def __init__(self):
        self._signs = []

    def add_sign(self, sign):
        self._signs.append(sign)

    def add_signs(self, signs):
        self._signs.extend(signs)

    def get_signs(self):
        return self._signs

    def set_signs(self, signs):
        self._signs = signs

    def get_text(self):
        """Return the whole text of the syllable."""
        return "".join([sign.get_text() for sign in self._signs])

    def get_chars(self):
        """Return the sign characters of the syllable."""
        return "".join([sign.get_char() for sign in self._signs])

    def is_empty(self):
        return not self._signs

    def at_word_start(self):
        """True if the first sign starts a word (False when empty)."""
        # an empty syllable has no first sign to look at
        return bool(self._signs) and self._signs[0].word_start()

    def at_word_end(self):
        """True if the last sign ends a word (False when empty)."""
        # an empty syllable has no last sign to look at
        return bool(self._signs) and self._signs[-1].word_end()

    def is_feminine(self):
        """
        A syllable is feminine iff:
        - it is placed at word end
        - it contains exactly one vowel, which is 'e' or 'ë', at the end
          (with possibly a final s)

        "que(s)" and "gue(s)" syllables are accepted too.
        An empty syllable is never feminine.
        """
        if not self.at_word_end():
            return False
        chars = self.get_chars()
        # special cases:
        # exact words: ces, mes, ses, tes, les, des, es
        # have no feminine e
        if self.at_word_start() and re.match("^[cmstld]?es$", chars):
            return False
        vowels = "".join(
            [char for char in chars if char in "aàâäeëéèêœiìïîoôòuùûüy&"])
        # only one vowel: e or ë, and word ends with -e or -es
        if vowels in ("e", "ë"):
            return chars.endswith(vowels) or chars.endswith(vowels + "s")
        # two vowels: "que?" or "gues?"
        if vowels in ("ue", "uë"):
            return bool(re.search("[qg]u[eë]s?", chars))
        return False
class SyllableTokenizer():
    """
    Provides a method for building a list of syllables from a list of
    signs.

    Usage:
      sign_tokenizer = SignTokenizer()
      syllable_tokenizer = SyllableTokenizer()
      signs = sign_tokenizer.tokenize("Un ver avec des décorations")
      syllables = syllable_tokenizer.tokenize(signs)
    syllables being a list of Syllable objects

    Constructor keywords:
    - e_vowels: vowels which can be a feminine e
    - other_vowels: the other vowels ('&' stands for the word "et")
    - consonants_sonority_levels: dictionary with 'liquid', 'nasal',
      'constrictive' and 'occlusive' keys, giving the consonants of each
      kind.  When None (the default), the built-in table is used.
    """

    def __init__(self,
                 e_vowels="eë",
                 other_vowels="aàâäéèêœiìïîoôòuùûüy&",
                 consonants_sonority_levels=None
                 ):
        if consonants_sonority_levels is None:
            # built per instance, so that instances never share the table
            consonants_sonority_levels = {'liquid': "lrh",
                                          'nasal': "mn",
                                          'constrictive': "çfjsvxz",
                                          'occlusive': "bcdgkpqt"}
        self.e_vowels = e_vowels
        self.other_vowels = other_vowels
        self.vowels = "".join((e_vowels, other_vowels))
        self.consonants_sonority_levels = consonants_sonority_levels
        self.consonants = "".join(consonants_sonority_levels.values())
        self._reset()
        # Character sets are escaped before being used inside [...], so
        # that custom sets containing e.g. "-" or "]" stay valid.
        vowels = re.escape(self.vowels)
        self.re = {
            # [something][vowel (no feminine e)]<space>[vowel]
            'hiatus': ".[{}][{}]".format(re.escape(self.other_vowels),
                                          vowels),
            # <word start>s[cçpt][vowel]
            '^sca': "s[cçpt][{}]".format(vowels),
            # <word start>s[cp][lr][vowel]
            '^scla': "s[cp][lr][{}]".format(vowels),
            # <word start>ps[vowel]
            '^psa': "ps[{}]".format(vowels),
            # gn[vowel]
            'gna': "gn[{}]".format(vowels),
            # [occlusive bcdgkpqt or constrictive çfjvxz][liquid lrh][vowel]
            'bla': "[{}{}][{}][{}]".format(
                re.escape(self.consonants_sonority_levels['occlusive']),
                re.escape(self.consonants_sonority_levels['constrictive']
                          .replace("s", "")),
                re.escape(self.consonants_sonority_levels['liquid']),
                vowels),
            # [tpc]h[rl][vowel]
            'thra': "[tpc]h[rl][{}]".format(vowels),
            # [consonant][vowel]
            'ba': "[{}][{}]".format(re.escape(self.consonants), vowels),
        }
        self.compiled_re = {}
        for (key, string) in self.re.items():
            self.compiled_re[key] = re.compile(string)
        self._match_data = None

    def _match(self, re_key, text, pos=0):
        """Match the pattern registered under re_key against text,
        starting at index pos; the result is kept for _get_match_data()."""
        self._match_data = self.compiled_re[re_key].match(text, pos)
        return self._match_data

    def _get_match_data(self):
        return self._match_data

    def _reset(self):
        self._syllables = []
        self._current_syllable = None
        # signs met before the first syllable break are collected here
        self._first_syllable = Syllable()

    def _start_new_syllable(self):
        if (self._first_syllable and not self._first_syllable.is_empty()):
            self._syllables.append(self._first_syllable)
        # do not pile up empty syllables
        if not (self._current_syllable
                and self._current_syllable.is_empty()):
            self._current_syllable = Syllable()
            self._syllables.append(self._current_syllable)
        self._first_syllable = None

    def _add_sign(self, text):
        if self._first_syllable:
            self._first_syllable.add_sign(text)
        else:
            self._current_syllable.add_sign(text)

    def get_syllables(self):
        return self._syllables

    def _onset_match(self, signs, verse_text, i, word_start):
        """Return the match of the first syllable-start pattern which
        applies at index i, or None.  Patterns are tried from the most
        specific to the most general."""
        if word_start:
            # <word start>s[cçpt][vowel]
            m = self._match('^sca', verse_text, i)
            if m and not signs[i].word_end():
                return m
            # <word start>s[cp][lr][vowel]
            m = self._match('^scla', verse_text, i)
            if (m and not signs[i].word_end()
                    and not signs[i+1].word_end()):
                return m
            # <word start>ps[vowel]
            m = self._match('^psa', verse_text, i)
            if m:
                return m
        # gn[vowel]
        m = self._match('gna', verse_text, i)
        if m and not signs[i].word_end():
            return m
        # [bcdgkpqtçfjvxz][lrh][vowel]
        m = self._match('bla', verse_text, i)
        if m and not signs[i].word_end():
            return m
        # [tpc]h[rl][vowel]
        m = self._match('thra', verse_text, i)
        if m and not signs[i+1].word_end():
            return m
        # [consonant][vowel]
        return self._match('ba', verse_text, i)

    def tokenize(self, signs):
        """
        Group signs (a list of Sign objects) into syllables and return
        the list of Syllable objects.  Every sign ends up in exactly one
        syllable, and no empty syllable is returned.
        """
        self._reset()
        verse_text = "".join([sign.get_char() for sign in signs])
        sign_count = len(signs)
        i = 0
        while i < sign_count:
            word_start = signs[i].word_start()
            # forced syllable ends
            if (i > 0 and signs[i].forced_syllable_start()):
                self._start_new_syllable()

            # Hiatus
            # ^[vowel]<space>
            if (i == 0
                    and verse_text[i] in self.vowels
                    and signs[i].word_end()):
                self._add_sign(signs[i])
                i += 1
                self._start_new_syllable()
                continue
            # [something][vowel (no feminine e)]<space>[vowel]
            if (self._match('hiatus', verse_text, i)
                    and signs[i+1].word_end()):
                self._add_sign(signs[i])
                self._add_sign(signs[i+1])
                self._start_new_syllable()
                self._add_sign(signs[i+2])
                i += 3
                continue
            onset = self._onset_match(signs, verse_text, i, word_start)
            if onset:
                # the matched signs start a new syllable
                self._start_new_syllable()
                for _ in onset.group(0):
                    self._add_sign(signs[i])
                    i += 1
            else:
                self._add_sign(signs[i])
                i += 1
        # When no syllable break was met, the signs are still waiting in
        # the first syllable: do not lose them.
        if self._first_syllable and not self._first_syllable.is_empty():
            self._syllables.append(self._first_syllable)
            self._first_syllable = None
        # A break after the very last sign leaves an empty syllable.
        self._syllables = [syllable for syllable in self._syllables
                           if not syllable.is_empty()]
        return self.get_syllables()
class SyllableTokenizerWithWordSeparation(SyllableTokenizer):
    """
    A specialized SyllableTokenizer which prefers syllable
    breaking between words when possible.  For instance:

      "tant attendu"
      gives:  tant / at / ten / du
      instead of: tan / t at / ten / du

    This is useful when breaking verses for lyrics.

    Usage:
      sign_tokenizer = SignTokenizer()
      syllable_tokenizer = SyllableTokenizerWithWordSeparation()
      signs = sign_tokenizer.tokenize("Un ver avec des décorations")
      syllables = syllable_tokenizer.tokenize(signs)
    syllables being a list of Syllable objects
    """

    def force_word_separation(self, syllables=None):
        """
        Move the signs which end a word at the beginning of a syllable
        to the previous syllable, when the next word starts with a vowel.

        syllables defaults to the result of the last tokenization.
        The syllables are modified in place; the list is returned.
        """
        syllables = syllables or self._syllables
        if not syllables:
            # nothing to rearrange
            return syllables
        prev_syllable = syllables[0]
        for this_syllable in syllables[1:]:
            signs = this_syllable.get_signs()
            # only syllables starting inside a word and having more than
            # one sign can give their first signs away
            if len(signs) > 1 and not signs[0].word_start():
                for i in range(1, len(signs)):
                    if (signs[i].word_start()
                            and signs[i].get_char() in self.vowels):
                        # we found a vowel at word start at index i
                        # signs from indices 0 to i-1 go to the previous
                        # syllable
                        prev_syllable.add_signs(signs[0:i])
                        this_syllable.set_signs(signs[i:])
                        break
            prev_syllable = this_syllable
        return syllables

    def tokenize(self, signs):
        """Tokenize like SyllableTokenizer, then force word separation."""
        SyllableTokenizer.tokenize(self, signs)
        return self.force_word_separation()
class Verse():
    """
    A verse

    Usage:
      verse = Verse("Un ver avec des décorations")
      # possibly pass sign and syllable tokenizers to syllabify:
      verse.syllabify()
      verse.get_syllables()
      => ["Un ", "ve", "r a", "vec ", "des ", "dé", "co", "ra", "tions"]

    Until syllabify() is called the verse has no syllable: get_syllables()
    gives [], get_text() and hyphenate() give "" and get_metric() gives 0.
    """

    def __init__(self, text, lineno=None):
        self._text = text
        self._syllables = []
        self._lineno = lineno

    def get_syllables(self):
        """Return the text of each syllable (list of strings)."""
        return [syll.get_text() for syll in self._syllables]

    def get_text(self):
        """Return the verse text, rebuilt from its syllables."""
        return "".join([syll.get_text() for syll in self._syllables])

    def syllabify(self,
                  sign_tokenizer=None,
                  syllable_tokenizer=None
                  ):
        """
        Split the verse text into syllables.

        sign_tokenizer defaults to a new SignTokenizer and
        syllable_tokenizer to a new SyllableTokenizer.
        """
        # Defaults are created here, on demand, rather than once when the
        # method is defined.
        if sign_tokenizer is None:
            sign_tokenizer = SignTokenizer()
        if syllable_tokenizer is None:
            syllable_tokenizer = SyllableTokenizer()
        self._syllables = syllable_tokenizer.tokenize(
            sign_tokenizer.tokenize(self._text))

    def _non_empty_syllables(self):
        # empty syllables carry no text and cannot be queried for word
        # boundaries
        return [syll for syll in self._syllables if not syll.is_empty()]

    def get_metric(self):
        """
        Return the number of syllables of the verse, a final feminine
        syllable not being counted (0 when there is no syllable).
        """
        syllables = self._non_empty_syllables()
        if not syllables:
            return 0
        return len(syllables) - (1 if syllables[-1].is_feminine() else 0)

    def hyphenate(self, hyphen="-", add_space=False):
        """
        Return the verse text, with hyphen inserted before each syllable
        which does not start a word.

        If add_space is true, a space is added after a syllable ending a
        word when its text does not already end with one (unless at verse
        end).
        """
        syllables = self._non_empty_syllables()
        last_index = len(syllables) - 1
        parts = []
        for i, syllable in enumerate(syllables):
            if i > 0 and not syllable.at_word_start():
                parts.append(hyphen)
            text = syllable.get_text()
            parts.append(text)
            if (add_space
                    and i != last_index
                    and syllable.at_word_end()
                    and not text.endswith(" ")):
                parts.append(" ")
        return "".join(parts)
class Corpus():
    """
    A corpus, consisting of verses.

    Example:
    To generate LilyPond lyrics (where syllables in a word are separated
    with " -- ")

      corpus = Corpus()
      corpus.add_verse("premier ver")
      corpus.add_verse("second ver...")
      corpus.syllabify(syllable_tokenizer = SyllableTokenizerWithWordSeparation())
      corpus.get_hyphenated_verses(hyphen = " -- ")
      => ["pre -- mier ver", "se -- cond ver..."]
    """

    def __init__(self, filename=None):
        self._verses = []
        self._filename = filename

    def add_verse(self, verse, lineno=None):
        """
        Add verse (a string) to the corpus.
        """
        self._verses.append(Verse(verse, lineno))

    def get_verses(self):
        """Return the verses of the corpus (list of Verse objects)."""
        return self._verses

    def syllabify(self,
                  sign_tokenizer=None,
                  syllable_tokenizer=None):
        """
        Syllabify all the corpus verses.

        sign_tokenizer defaults to a new SignTokenizer and
        syllable_tokenizer to a new SyllableTokenizer; the same pair is
        used for every verse.
        """
        # Defaults are created here, on demand, rather than once when the
        # method is defined.
        if sign_tokenizer is None:
            sign_tokenizer = SignTokenizer()
        if syllable_tokenizer is None:
            syllable_tokenizer = SyllableTokenizer()
        for verse in self._verses:
            verse.syllabify(sign_tokenizer, syllable_tokenizer)

    def get_hyphenated_verses(self, hyphen="-", add_space=False):
        """
        Return the hyphenated verses (list of strings) contained in the
        corpus.
        Corpus.syllabify() is supposed to have been called before.
        """
        return [verse.hyphenate(hyphen, add_space)
                for verse in self._verses]
class CorpusReader():
    """
    Reads a corpus file, one verse per line, into a Corpus object.
    """

    def read(self, filename="-"):
        """
        Read a corpus file (or stdin if filename is "-")
        and produce a Corpus object.

        The file is closed when reading is over, even on error.
        Standard input is left open.
        """
        corpus = Corpus(filename)
        if filename == "-":
            # stdin is not ours to close
            self._read_lines(sys.stdin, corpus)
        else:
            with open(filename, 'r') as file:
                self._read_lines(file, corpus)
        return corpus

    def _read_lines(self, lines, corpus):
        """Add to corpus the verses found in lines (an iterable of
        strings); verses are registered with their 1-based line number."""
        for lineno, line in enumerate(lines, 1):
            line = line.strip()
            # skip empty lines
            if line == "":
                continue
            # skip comments
            # TODO: do something
            if line.startswith("//"):
                continue
            # TODO: titling directives
            if line.startswith("#"):
                continue
            # a verse
            # verse format:
            #   verse text TAB+ [properties]
            # where properties can be:
            #   [LB]+  breve/long syllables indicators
            #   [AT]+  schema (?)
            #   R      "refrain"
            #   D      "double"
            #   other lilypond code
            # for now, we only keep the verse text itself,
            # i.e. what comes before the first TAB
            text = line.split("\t", 1)[0]
            corpus.add_verse(text, lineno)
def main():
    """
    Syllabify and print verses.

    The verses come from the corpus file given with --corpus or, failing
    that, from the words given with --verse.  Each verse is printed using
    the --format string.  Without any of these options, the help message
    is printed.
    """
    parser = argparse.ArgumentParser(
        description='Verse syllabication.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--verse',
        metavar="words",
        nargs='+',
        help='verse words to syllabify (if no corpus is provided)')
    parser.add_argument(
        '--corpus',
        help="Corpus file to syllabify. Use - for reading from stdin")
    parser.add_argument(
        '--hyphen',
        default=" -- ",
        help="String to be used when hyphenating a verse.")
    # NOTE(review): the help text below is reproduced as found; confirm it
    # against the upstream file.
    parser.add_argument(
        '--format',
        default="{hyphenated_verse}",
        help="""Python format string for outputing the verse.
Possible keywords, to be used between curly braces in the format string,
*) hyphenated_verse: the verse after applying hyphenation
*) verse: the verse without hyphenation
*) metric: the verse metric (a number).""")
    options = parser.parse_args()

    def print_verse(verse):
        # hyphenate a syllabified verse and print it with the user format
        hyphenated_verse = verse.hyphenate(hyphen=options.hyphen,
                                           add_space=True)
        print(options.format.format(verse=verse.get_text(),
                                    hyphenated_verse=hyphenated_verse,
                                    metric=verse.get_metric()))

    if options.corpus:
        # Syllabify a corpus
        corpus = CorpusReader().read(options.corpus)
        corpus.syllabify(
            syllable_tokenizer=SyllableTokenizerWithWordSeparation())
        for verse in corpus.get_verses():
            print_verse(verse)
    elif options.verse:
        # read verse on command line arguments
        verse = Verse(" ".join(options.verse))
        verse.syllabify(
            syllable_tokenizer=SyllableTokenizerWithWordSeparation())
        print_verse(verse)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()