scripts/syllabify.py

   1 # -*- coding: utf-8 -*-
   2 import re, sys, argparse
   3
   4 class Sign():
   5     """
   6     Represent a syllable constituent (a single alphabetical character),
   7     with other text (mute characters, punctuation, spaces, etc.) attached
   8     to it.
   9
  10     sign.get_char() gives the alphabetical syllable consituent.
  11     sign.get_text() gives the whole text attached to the sign
  12     """
  13     def __init__(self, c):
  14         self._sign = c
  15         self._text = ""
  16         self._word_end = False
  17         self._word_start = False
  18         self._forced_syllable_end = False
  19         self._forced_syllable_start = False
  20
  21     def add_text(self, str):
  22         self._text = "".join((self._text, str))
  23
  24     def set_forced_syllable_end(self):
  25         self._forced_syllable_end = True
  26
  27     def forced_syllable_end(self):
  28         return self._forced_syllable_end
  29
  30     def set_forced_syllable_start(self):
  31         self._forced_syllable_start = True
  32
  33     def forced_syllable_start(self):
  34         return self._forced_syllable_start
  35
  36     def word_end(self):
  37         return self._word_end
  38
  39     def set_word_end(self):
  40         self._word_end = True
  41
  42     def word_start(self):
  43         return self._word_start
  44
  45     def set_word_start(self):
  46         self._word_start = True
  47
  48     def get_char(self):
  49         return self._sign
  50
  51     def get_text(self):
  52         return self._text
  53
  54 class SignTokenizer():
  55     """
  56     Provides a method for build a list of signs from a decorated verse string.
  57     Usage:
  58       sign_tokenizer = SignTokenizer()
  59       signs = sign_tokenizer.tokenize("Un ver avec des décorations")
  60     signs being a list of Sign objects
  61
  62     The decorations can be:
  63      - "°" for grouping 'empty' words to 'full' words.
  64        Example:
  65          En°vain j'ay respecté la°celebre memoire
  66          Des°Heros des°siecles passez ;
  67        Can be overriden with word_separator_markers constructor keyword
  68
  69      - "*" for marking a mute letter (e.g. a 'h').
  70        Example:
  71          Et c'est l'*Hyver qui les°rassemble.
  72        Can be overriden with mute_character_marker constructor keyword
  73
  74      - "=" for forcing syllable ends, e.g. for marking a diaeresis.
  75        Example:
  76          Trop *heureux Phrygi=ens, venez icy l'attendre.
  77        Can be overriden with forced_syllable_end_marker constructor keyword
  78
  79      - other unused markers: < > { }
  80        Can be overriden with ignored_markers constructor keyword
  81     """
  82     def __init__(self,
  83                  word_separators = " -",
  84                  word_separator_markers = "°",
  85                  simple_punctuations = ".,",
  86                  double_punctuations = ":;?!",
  87                  apostrophes = "'’",
  88                  forced_syllable_end_marker = "=",
  89                  mute_character_marker = "*",
  90                  ignored_markers = "<>{}",
  91                  ignored_characters = "[]()|/~_"
  92                  ):
  93         self.word_separators = word_separators
  94         self.word_separator_markers = word_separator_markers
  95         self.all_word_separators = "".join((word_separators,
  96                                             word_separator_markers))
  97         self.simple_punctuations = simple_punctuations
  98         self.double_punctuations = double_punctuations
  99         self.apostrophes = apostrophes
 100         self.forced_syllable_end_marker = forced_syllable_end_marker
 101         self.mute_character_marker = mute_character_marker
 102         self.ignored_markers = ignored_markers
 103         self.ignored_characters = ignored_characters
 104         self.punctuation_re = re.compile(
 105             " *([{}{}])".format(self.simple_punctuations,
 106                               self.double_punctuations))
 107         self.et_re = re.compile("([Ee]t)({})".format(
 108                 "|".join(self.all_word_separators)))
 109
 110     def _reset(self):
 111         self._prefix = ""
 112         self._current_sign = None
 113         self._signs = []
 114
 115     def _add_sign(self, c):
 116         self._current_sign = Sign(c.lower())
 117         self._signs.append(self._current_sign)
 118         if self._prefix != "":
 119             self._current_sign.add_text(self._prefix)
 120             self._prefix = ""
 121
 122     def _add_prefix(self, prefix):
 123         self._prefix = "".join((self._prefix, prefix))
 124
 125     def _add_text(self, text):
 126         self._current_sign.add_text(text)
 127
 128     def _set_forced_syllable_end(self):
 129         self._current_sign.set_forced_syllable_end()
 130
 131     def _set_word_end(self):
 132         self._current_sign.set_word_end()
 133
 134     def tokenize(self, verse_text):
 135         self._reset()
 136         sign_count = len(verse_text)
 137         i = 0
 138         mute_next = False
 139         word_start = True
 140         while (i < sign_count):
 141             c = verse_text[i]
 142             punctuation_match = self.punctuation_re.match(verse_text[i:])
 143             ## Markers: they are not real text
 144             # forced syllable end marker
 145             if c == self.forced_syllable_end_marker:
 146                 self._set_forced_syllable_end()
 147                 i += 1
 148             # mute character marker
 149             elif c == self.mute_character_marker:
 150                 i += 1
 151                 mute_next = True
 152             # ignored markers
 153             elif c in self.ignored_markers:
 154                 i += 1
 155             ## Actual text
 156             # apostroph
 157             elif c in self.apostrophes:
 158                 self._add_text("’")
 159                 i += 1
 160             # punctuation
 161             elif punctuation_match:
 162                 punct = punctuation_match.group(1)
 163                 if punct in self.double_punctuations:
 164                     self._add_text("\u00A0")
 165                 self._add_text(punct)
 166                 i += len(punctuation_match.group(0))
 167                 self._set_word_end()
 168                 word_start = True
 169             # word separator
 170             elif c in self.all_word_separators:
 171                 self._set_word_end()
 172                 word_start = True
 173                 if c in self.word_separator_markers:
 174                     self._add_text(" ")
 175                 else:
 176                     self._add_text(c)
 177                 i += 1
 178             # ignored characters
 179             elif c in self.ignored_characters:
 180                 self._add_text(c)
 181                 i += 1
 182             # consonant or vowel
 183             else:
 184                 if mute_next:
 185                     self._add_prefix(c)
 186                     mute_next = False
 187                     i += 1
 188                 else:
 189                     m = word_start and self.et_re.match(verse_text[i:])
 190                     if m:
 191                         # special case: et -> &
 192                         self._add_sign("&")
 193                         self._add_text(m.group(1))
 194                         self._add_text(" ")
 195                         self._set_word_end()
 196                         word_start = True
 197                         i += len(m.group(0))
 198                     else:
 199                         # consonant or vowel
 200                         self._add_sign(c)
 201                         self._add_text(c)
 202                         word_start = False
 203                         i += 1
 204         # the last character is at word end and syllable end
 205         self._set_word_end()
 206         self._set_forced_syllable_end()
 207         # set word_start and forced_syllable_start for characters
 208         # following a word end or forced_syllable_end
 209         at_word_start = True
 210         at_syllable_start = True
 211         for sign in self._signs:
 212             if at_word_start:
 213                 sign.set_word_start()
 214             if at_syllable_start:
 215                 sign.set_forced_syllable_start()
 216             at_word_start = sign.word_end()
 217             at_syllable_start = sign.forced_syllable_end()
 218         return self._signs
 219
 220     def get_chars(self):
 221         return "".join([c.get_char() for c in self._signs])
 222
 223     def get_full_verse(self):
 224         return "".join([c.get_text() for c in self._signs])
 225
 226
 227 class Syllable():
 228     """
 229     Represents a syllable, consisting in a list of signs.
 230     """
 231     def __init__(self):
 232         self._signs = []
 233
 234     def add_sign(self, sign):
 235         self._signs.append(sign)
 236
 237     def add_signs(self, signs):
 238         self._signs.extend(signs)
 239
 240     def get_signs(self):
 241         return self._signs
 242
 243     def set_signs(self, signs):
 244         self._signs = signs
 245
 246     def get_text(self):
 247         return "".join([sign.get_text() for sign in self._signs])
 248
 249     def get_chars(self):
 250         return "".join([sign.get_char() for sign in self._signs])
 251
 252     def is_empty(self):
 253         return not self._signs
 254
 255     def at_word_start(self):
 256         return self._signs[0].word_start()
 257
 258     def at_word_end(self):
 259         return self._signs[-1].word_end()
 260
 261     def is_feminine(self):
 262         """
 263         A syllable is feminine iff:
 264         - it is placed at word end
 265         - it contains exactly one vowel, which is 'e' or 'ë', at the end
 266         (with possibly a final s)
 267         -
 268         """
 269         if self.at_word_end():
 270             chars = "".join([sign.get_char() for sign in self._signs])
 271             # special cases:
 272             # exact words: ces, mes, ses, tes, les, des, es
 273             # have no feminine e
 274             if (self.at_word_start()
 275                 and re.match("^[cmstld]?es$", chars)):
 276                 return False
 277             vowels = ""
 278             for char in chars:
 279                 if char in "aàâäeëéèêœiìïîoôòuùûüy&":
 280                     vowels = "".join((vowels, char))
 281             return not not (
 282                 # only one vowel: e or ë, and word ends with -e or -es
 283                 ((vowels == "e" or vowels == "ë")
 284                  and (vowels == chars[-1] or (vowels + "s") == chars[-2:]))
 285                 # two vowels: "que?" or "gues?"
 286                 or ((vowels == "ue" or vowels == "uë")
 287                     and re.search("[qg]u[eë]s?", chars)))
 288         return False
 289
 290
 291 class SyllableTokenizer():
 292     """
 293     Provides a method for build a list of syllables from a list of signs.
 294     Usage:
 295       sign_tokenizer = SignTokenizer()
 296       syllable_tokenizer = SyllableTokenizer()
 297       signs = sign_tokenizer.tokenize("Un ver avec des décorations")
 298       syllables = syllable_tokenizer.tokenize(signs)
 299     syllables being a list of Syllable objects
 300     """
 301     def __init__(self,
 302                  e_vowels = "eë",
 303                  other_vowels = "aàâäéèêœiìïîoôòuùûüy&",
 304                  consonants_sonority_levels = { 'liquid' : "lrh",
 305                                                 'nasal' : "mn",
 306                                                 'constrictive' : "çfjsvxz",
 307                                                 'occlusive' : "bcdgkpqt" }
 308                  ):
 309         self.e_vowels = e_vowels
 310         self.other_vowels = other_vowels
 311         self.vowels = "".join((e_vowels, other_vowels))
 312         self.consonants_sonority_levels = consonants_sonority_levels
 313         self.consonants = "".join(consonants_sonority_levels.values())
 314         self._reset()
 315         self.re = {
 316             # [something][vowel (no feminine e)]<space>[vowel]
 317             'hiatus' : ".[{}][{}]".format(self.other_vowels, self.vowels),
 318             # <word start>s[cçpt][vowel]
 319             '^sca' : "s[cçpt][{}]".format(self.vowels),
 320             # <word start>s[cp][lr][vowel]
 321             '^scla' : "s[cp][lr][{}]".format(self.vowels),
 322             # <word start>ps[vowel]
 323             '^psa' : "ps[{}]".format(self.vowels),
 324             # gn[vowel]
 325             'gna' : "gn[{}]".format(self.vowels),
 326             # [occlusive bcdgkpqt or constrictive çfjvxz][liquid lrh][vowel]
 327             'bla' : "[{}{}][{}][{}]".format(
 328                 self.consonants_sonority_levels['occlusive'],
 329                 self.consonants_sonority_levels['constrictive'].replace("s", ""),
 330                 self.consonants_sonority_levels['liquid'],
 331                 self.vowels),
 332             # [tpc]h[rl][vowel]
 333             'thra' : "[tpc]h[rl][{}]".format(self.vowels),
 334             # [consonant][vowel]
 335             'ba' : "[{}][{}]".format(self.consonants, self.vowels),
 336             }
 337         self.compiled_re = {}
 338         for (key, string) in self.re.items():
 339             self.compiled_re[key] = re.compile(string)
 340         self._match_data = None
 341
 342     def _match(self, re_key, text):
 343         self._match_data = self.compiled_re[re_key].match(text)
 344         return self._match_data
 345
 346     def _get_match_data(self):
 347         return self._match_data
 348
 349     def _reset(self):
 350         self._syllables = []
 351         self._current_syllable = None
 352         self._first_syllable = Syllable()
 353
 354     def _start_new_syllable(self):
 355         if (self._first_syllable and not self._first_syllable.is_empty()):
 356             self._syllables.append(self._first_syllable)
 357         if not (self._current_syllable
 358                 and self._current_syllable.is_empty()):
 359             self._current_syllable = Syllable()
 360             self._syllables.append(self._current_syllable)
 361         self._first_syllable = None
 362
 363     def _add_sign(self, text):
 364         if self._first_syllable:
 365             self._first_syllable.add_sign(text)
 366         else:
 367             self._current_syllable.add_sign(text)
 368
 369     def get_syllables(self):
 370         return self._syllables
 371
 372     def tokenize(self, signs):
 373         self._reset()
 374         verse_text = "".join([sign.get_char() for sign in signs])
 375         sign_count = len(signs)
 376         i = 0
 377         while (i < sign_count):
 378             word_start = signs[i].word_start()
 379             # forced syllable ends
 380             if (i > 0 and signs[i].forced_syllable_start()):
 381                 self._start_new_syllable()
 382
 383             # Hiatus
 384             # ^[vowel]<space>
 385             if (i == 0
 386                   and verse_text[i] in self.vowels
 387                   and signs[i].word_end()):
 388                 self._add_sign(signs[i])
 389                 i += 1
 390                 self._start_new_syllable()
 391             # [something][vowel (no feminine e)]<space>[vowel]
 392             elif (self._match('hiatus', verse_text[i:])
 393                   and signs[i+1].word_end()):
 394                 self._add_sign(signs[i])
 395                 self._add_sign(signs[i+1])
 396                 self._start_new_syllable()
 397                 self._add_sign(signs[i+2])
 398                 i += 3
 399             elif (
 400                 # <word start>s[cçpt][vowel]
 401                 (word_start and self._match('^sca', verse_text[i:])
 402                  and not signs[i].word_end())
 403                 # <word start>s[cp][lr][vowel]
 404                 or (word_start and self._match('^scla', verse_text[i:])
 405                     and not signs[i].word_end()
 406                     and not signs[i+1].word_end())
 407                 # <word start>ps[vowel]
 408                 or (word_start and self._match('^psa', verse_text[i:]))
 409                 # gn[vowel]
 410                 or (self._match('gna', verse_text[i:])
 411                     and not signs[i].word_end())
 412                 # [bcdgkpqtçfjvxz][lrh][vowel]
 413                 or (self._match('bla', verse_text[i:])
 414                     and not signs[i].word_end())
 415                 # [tpc]h[rl][vowel]
 416                 or (self._match('thra', verse_text[i:])
 417                     and not signs[i+1].word_end())
 418                 # [consonant][vowel]
 419                 or self._match('ba', verse_text[i:])
 420                 ):
 421                 match = self._get_match_data().group(0)
 422                 self._start_new_syllable()
 423                 for x in match:
 424                     self._add_sign(signs[i])
 425                     i += 1
 426             else:
 427                 self._add_sign(signs[i])
 428                 i += 1
 429         return self.get_syllables()
 430
 431
 432 class SyllableTokenizerWithWordSeparation(SyllableTokenizer):
 433     """
 434     A specialized SyllableTokenizer which preferes syllable
 435     breaking between words when possible.  For instance:
 436
 437       "tant attendu"
 438       gives:  tant / at / ten / du
 439       iso:    tan / t at / ten / du
 440
 441     This is useful when breaking verses for lyrics.
 442
 443     Usage:
 444       sign_tokenizer = SignTokenizer()
 445       syllable_tokenizer = SyllableTokenizerWithWordSeparation()
 446       signs = sign_tokenizer.tokenize("Un ver avec des décorations")
 447       syllables = syllable_tokenizer.tokenize(signs)
 448     syllables being a list of Syllable objects
 449     """
 450     def force_word_separation(self, syllables = None):
 451         syllables = syllables or self._syllables
 452         syllable_count = len(syllables)
 453         prev_syllable = syllables[0]
 454         for this_syllable in syllables[1:]:
 455             signs = this_syllable.get_signs()
 456             if not signs[0].word_start() and signs[1:]:
 457                 tokens_count = len(signs)
 458                 i = 1
 459                 while (not signs[i].word_start()
 460                        or not signs[i].get_char() in self.vowels):
 461                     i += 1
 462                     if i == tokens_count:
 463                         break
 464                 else:
 465                     # we found a vowel at word start at index i
 466                     # signs from indices 0 to i-1 go to the previous syllable
 467                     prev_syllable.add_signs(signs[0:i])
 468                     this_syllable.set_signs(signs[i:])
 469             prev_syllable = this_syllable
 470         return syllables
 471
 472     def tokenize(self, signs):
 473         SyllableTokenizer.tokenize(self, signs)
 474         return self.force_word_separation()
 475
 476 class Verse():
 477     """
 478     A verse
 479
 480     Usage:
 481       verse = Verse("Un ver avec des décorations")
 482       # possible pass sign and syllable tokenizers to split:
 483       verse.split()
 484       verse.get_syllables()
 485       => ["Un ", "ve", "r a", "vec ", "des ", "dé", "co", "ra", "tions"]
 486
 487     """
 488     def __init__(self, text, lineno = None):
 489         self._text = text
 490         self._syllables = []
 491         self._lineno = lineno
 492
 493     def get_syllables(self):
 494         return [syll.get_text() for syll in self._syllables]
 495
 496     def get_text(self):
 497         return "".join([syll.get_text() for syll in self._syllables])
 498
 499     def syllabify(self,
 500               sign_tokenizer = SignTokenizer(),
 501               syllable_tokenizer = SyllableTokenizer()
 502               ):
 503         self._syllables = syllable_tokenizer.tokenize(
 504             sign_tokenizer.tokenize(self._text))
 505
 506     def get_metric(self):
 507         return len(self._syllables) - (1 if self._syllables[-1].is_feminine() else 0)
 508
 509     def hyphenate(self, hyphen = "-", add_space = False):
 510         syllables = []
 511         i = 0
 512         count = len(self._syllables)
 513         for syllable in self._syllables:
 514             if (i > 0) and not syllable.at_word_start():
 515                 syllables.append(hyphen)
 516             text = syllable.get_text()
 517             syllables.append(text)
 518             if add_space:
 519                 verse_end = (i == count - 1)
 520                 # if syllable is word end and do not end with a space,
 521                 # add it (unless at verse end)
 522                 if (not verse_end
 523                     and syllable.at_word_end()
 524                     and text[-1] != " "):
 525                     syllables.append(" ")
 526             i += 1
 527         return "".join(syllables)
 528
 529 class Corpus():
 530     """
 531     A corpus, consisting of verses.
 532
 533     Example:
 534     To generate LilyPond lyrics (where syllables in a word are separated
 535     with " -- ")
 536
 537       corpus = Corpus()
 538       corpus.add_verse(["premier ver", "second ver..."])
 539       corpus.syllabify(syllable_tokenizer = SyllableTokenizerWithWordSeparation())
 540       corpus.get_hyphenated_verses(hyphen = " -- ")
 541       => ["pre -- mier ver", "se -- cond ver..."]
 542     """
 543     def __init__(self, filename = None):
 544         self._verses = []
 545         self._filename = filename
 546
 547     def add_verse(self, verse, lineno = None):
 548         """
 549         Add verse (a string) to the corpus.
 550         """
 551         self._verses.append(Verse(verse, lineno))
 552
 553     def get_verses(self):
 554         return self._verses
 555
 556     def syllabify(self,
 557                   sign_tokenizer = SignTokenizer(),
 558                   syllable_tokenizer = SyllableTokenizer()):
 559         """
 560         Syllabify all the corpus verses.
 561         """
 562         for verse in self._verses:
 563             verse.syllabify(sign_tokenizer, syllable_tokenizer)
 564
 565     def get_hyphenated_verses(self, hyphen = "-", add_space = False):
 566         """
 567         Return the hyphenated verses (list of strings) contained in the
 568         corpus.
 569         Corpus.syllabify() is supposed to have been called before.
 570         """
 571         return [verse.hyphenate(hyphen, add_space)
 572                 for verse in self._verses]
 573
 574 class CorpusReader():
 575
 576     def read(self, filename = "-"):
 577         """
 578         Read a corpus file (or stdin if filename is "-")
 579         and produce a Corpus object.
 580         """
 581         file = open(filename, 'r') if (filename != "-") else sys.stdin
 582         corpus = Corpus(filename)
 583         lineno = 0
 584         for line in file:
 585             line = line.strip()
 586             lineno += 1
 587             # skip empty lines
 588             if line == "":
 589                 pass
 590             # skip comments
 591             elif re.match(r"^//", line):
 592                 # TODO: do something
 593                 pass
 594             # TODO: titling directives
 595             elif re.match(r"^#", line):
 596                 pass
 597             # a verse
 598             else:
 599                 # verse format:
 600                 # verse text TAB+ [properties]
 601                 # where properties can be:
 602                 #   [LB]+  breve/long syllables indicators
 603                 #   [AT]+  schema (?)
 604                 #   R      "refrain"
 605                 #   D      "double"
 606                 #   other  lilypond code
 607                 # for now, we only keep the verse text itself
 608                 text = re.sub(r"([^\t]+)\t.*$", r"\1", line)
 609                 corpus.add_verse(text, lineno)
 610         file.close()
 611         return corpus
 612
 613 def main():
 614     """
 615     Syllabify and print verses.
 616     """
 617     parser = argparse.ArgumentParser(
 618         description='Verse syllabication.',
 619         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 620     parser.add_argument(
 621         '--verse',
 622         metavar="words",
 623         nargs='+',
 624         help='verse words to syllabify (if no corpus is provided)')
 625     parser.add_argument(
 626         '--corpus',
 627         help="Corpus file to syllabify.  Use - for reading from stdin")
 628     parser.add_argument(
 629         '--hyphen',
 630         default=" -- ",
 631         help="String to be used when hyphenating a verse.")
 632     parser.add_argument(
 633         '--format',
 634         default="{hyphenated_verse}",
 635         help="""Python format string for outputing the verse.
 636 Possible keywords, to be used between curly braces in the format string,
 637 are
 638 *) hyphenated_verse: the verse after applying hyphenation
 639 *) verse: the verse without hyphenation
 640 *) metric: the verse metric (a number).""")
 641     args = vars(parser.parse_args())
 642
 643     if args['corpus']:
 644         # Syllabify a corpus
 645         reader = CorpusReader()
 646         corpus = reader.read(args['corpus'])
 647         corpus.syllabify(
 648             syllable_tokenizer = SyllableTokenizerWithWordSeparation())
 649         for verse in corpus.get_verses():
 650             hyphenated_verse = verse.hyphenate(hyphen = args['hyphen'],
 651                                                add_space = True)
 652             print(args['format'].format(verse = verse.get_text(),
 653                                         hyphenated_verse = hyphenated_verse,
 654                                         metric = verse.get_metric()))
 655     elif args['verse']:
 656         # read verse on command line arguments
 657         verse = Verse(" ".join(args['verse']))
 658         verse.syllabify(
 659             syllable_tokenizer = SyllableTokenizerWithWordSeparation())
 660         hyphenated_verse = verse.hyphenate(hyphen = args['hyphen'], add_space = True)
 661         print(args['format'].format(verse = verse.get_text(),
 662                                     hyphenated_verse = hyphenated_verse,
 663                                     metric = verse.get_metric()))
 664     else:
 665         parser.print_help()
 666
 667 if __name__ == '__main__':
 668     main()