tools/voice.pl

   1 #!/usr/bin/perl -s
   2 #             __________               __   ___.
   3 #   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4 #   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5 #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6 #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7 #                     \/            \/     \/    \/            \/
   8 # $Id:
   9 #
  10 # Copyright (C) 2007 Jonas Häggqvist
  11 #
  12 # All files in this archive are subject to the GNU General Public License.
  13 # See the file COPYING in the source tree root for full license agreement.
  14 #
  15 # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  16 # KIND, either express or implied.
  17
  18 use strict;
  19 use warnings;
  20 use File::Basename;
  21 use File::Copy;
  22 use Switch;
  23 use vars qw($V $C $t $l $e $E $s $S $i $v);
  24 use IPC::Open2;
  25 use IPC::Open3;
  26 use Digest::MD5 qw(md5_hex);
  27 use DirHandle;
  28
  # Print the command line help for voice.pl on STDOUT.
  # Takes no arguments; returns whatever print() returns.
  sub printusage {
      # One element per output line; the leading empty element produces the
      # blank line the help text starts with.
      my @help_lines = (
          '',
          'Usage: voice.pl [options] [path to dir]',
          ' -V',
          '    Create voice file. You must also specify -t and -l.',
          '',
          ' -C',
          '    Create .talk clips.',
          '',
          ' -t=<target>',
          '    Specify which target you want to build voicefile for. Must include',
          '    any features that target supports.',
          '',
          ' -i=<target_id>',
          '    Numeric target id. Needed for voice building.',
          '',
          ' -l=<language>',
          '    Specify which language you want to build. Without .lang extension.',
          '',
          ' -e=<encoder>',
          '    Which encoder to use for voice strings',
          '',
          ' -E=<encoder options>',
          '    Which encoder options to use when compressing voice strings. Enclose',
          '    in double quotes if the options include spaces.',
          '',
          ' -s=<TTS engine>',
          '    Which TTS engine to use.',
          '',
          ' -S=<TTS engine options>',
          '    Options to pass to the TTS engine. Enclose in double quotes if the',
          '    options include spaces.',
          '',
          ' -v',
          '    Be verbose',
      );
      print(join("\n", @help_lines) . "\n");
  }
  68
  # Initialize the TTS engine.
  #
  # Parameters:
  #   $tts_engine      - engine name. "festival" and "sapi" need start-up work,
  #                      any other name is only recorded.
  #   $tts_engine_opts - extra command line options for the engine
  #   $language        - language name, handed to the SAPI helper script
  #
  # Returns a hash reference which is later passed to voicestring and
  # shutdown_tts. It always holds "name"; festival adds "pid"; sapi adds
  # "stdin", "stdout" and "vendor".
  # Dies if the festival server pipe cannot be opened.
  sub init_tts {
      our $verbose;
      my ($tts_engine, $tts_engine_opts, $language) = @_;
      my %ret = ("name" => $tts_engine);
      if ($tts_engine eq "festival") {
          print("> festival $tts_engine_opts --server\n") if $verbose;
          # The handle is a package global on purpose: the pipe has to stay
          # open after this sub returns.
          my $pid = open(FESTIVAL_SERVER, "|-", "festival $tts_engine_opts --server > /dev/null 2>&1")
              or die "Unable to start festival server: $!\n";
          my $dummy = *FESTIVAL_SERVER; #suppress warning
          # SIGKILL cannot be trapped, so only SIGINT gets a handler
          $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
          $ret{"pid"} = $pid;
      }
      elsif ($tts_engine eq "sapi") {
          # Build the Windows path of sapi_voice.vbs, which lives next to this script
          my $toolsdir = dirname($0);
          my $path = `cygpath $toolsdir -a -w`;
          chomp($path);
          $path = $path . '\\';
          my $cmd = $path . "sapi_voice.vbs /language:$language $tts_engine_opts";
          # Double every backslash so they survive the shell
          $cmd =~ s/\\/\\\\/g;
          print("> cscript //nologo $cmd\n") if $verbose;
          my $pid = open2(*CMD_OUT, *CMD_IN, "cscript //nologo $cmd");
          # SIGKILL cannot be trapped, so only SIGINT gets a handler
          $SIG{INT} = sub { print(CMD_IN "QUIT\r\n"); panic_cleanup(); };
          # Ask the helper which vendor's voice is in use; correct_string
          # keys some corrections on it
          print(CMD_IN "QUERY\tVENDOR\r\n");
          my $vendor = readline(CMD_OUT);
          # readline yields undef if the helper exited without answering
          $vendor = "" unless defined($vendor);
          $vendor =~ s/\r\n//;
          %ret = (%ret,
                  "stdin" => *CMD_IN,
                  "stdout" => *CMD_OUT,
                  "vendor" => $vendor);
      }
      return \%ret;
  }
 106
 # Shutdown TTS engine if necessary.
 #
 # Parameters:
 #   $tts_object - hash reference as returned by init_tts
 #
 # festival: sends SIGTERM to the recorded server pid.
 # sapi:     sends QUIT to the helper script and closes both pipe ends.
 # Any other engine needs no shutdown.
 sub shutdown_tts {
     my ($tts_object) = @_;
     my $engine = $$tts_object{"name"};
     return unless defined($engine);
     if ($engine eq "festival") {
         # Send SIGTERM to festival server. Without a pid we must not call
         # kill at all: an undefined pid counts as 0, which addresses our
         # own process group.
         kill TERM => $$tts_object{"pid"} if $$tts_object{"pid"};
     }
     elsif ($engine eq "sapi") {
         print({$$tts_object{"stdin"}} "QUIT\r\n");
         close($$tts_object{"stdin"});
         close($$tts_object{"stdout"}) if defined($$tts_object{"stdout"});
     }
 }
 121
 # Apply corrections to a voice-string to make it sound better.
 #
 # Parameters:
 #   $string     - the text that is about to be spoken
 #   $language   - language name; "english", "deutsch", "svenska" and
 #                 "italiano" have specific corrections
 #   $tts_object - hash reference from init_tts; its "name" and (for SAPI)
 #                 "vendor" entries select engine specific corrections
 #
 # Returns the corrected string. In verbose mode every change is printed.
 sub correct_string {
     our $verbose;
     my ($string, $language, $tts_object) = @_;
     my $orig = $string;
     # Missing values are compared as empty strings, so no branch matches
     my $engine = defined($$tts_object{"name"}) ? $$tts_object{"name"} : "";
     my $vendor = defined($$tts_object{"vendor"}) ? $$tts_object{"vendor"} : "";
     $language = "" unless defined($language);

     # General for all engines and languages. These run first; the german
     # "I D 3 tag" rule below relies on ID3 already being spelled out.
     $string =~ s/USB/U S B/g;
     $string =~ s/ID3/I D 3/g;

     if ($language eq "english") {
         if ($engine eq "sapi" || $engine eq "festival") {
             $string =~ s/plugin(s?)/plug-in$1/ig;
         }
     }
     elsif ($language eq "deutsch") {
         # for all german engines (e.g. for english words)
         $string =~ s/alkaline/alkalein/ig;
         $string =~ s/byte(s?)/beit$1/ig;
         $string =~ s/clip(s?)/klipp$1/ig;
         $string =~ s/cuesheet/kjuschiet/ig;
         $string =~ s/dither/didder/ig;
         $string =~ s/equalizer/iquileiser/ig;
         $string =~ s/\bflash\b/fläsh/ig;
         $string =~ s/\bfirmware(s?)\b/firmwer$1/ig;
         $string =~ s/\bI D 3 tag\b/I D 3 täg/ig; # can't just use "tag" here
         $string =~ s/\bloudness\b/laudness/ig;
         $string =~ s/\bunicode\b/unikod/ig;
         # just for SAPI with the AT&T voices
         if ($engine eq "sapi" && $vendor eq "AT&T Labs") {
             $string =~ s/alphabet/alfabet/ig;
             $string =~ s/ampere/amper/ig;
             $string =~ s/\bdezibel\b/de-zibell/ig;
             $string =~ s/diddering/didde-ring/ig;
             $string =~ s/energie\b/ener-gie/ig;
             $string =~ s/\Blauf\b/-lauf/ig;
             $string =~ s/\bnumerisch\b/numehrisch/ig;
         }
     }
     elsif ($language eq "svenska") {
         # for all swedish engines (e.g. for english words)
         $string =~ s/kilobyte/kilobajt/ig;
         $string =~ s/megabyte/megabajt/ig;
         $string =~ s/gigabyte/gigabajt/ig;
         $string =~ s/\bloudness\b/laudness/ig;

         if ($engine eq "espeak") {   # just for eSpeak
             $string =~ s/ampere/ampär/ig;
             $string =~ s/bokmärken/bok-märken/ig;
             $string =~ s/generella/schenerella/ig;
             $string =~ s/dithering/diddering/ig;
             $string =~ s/\bunicode\b/jynikod/ig;
             $string =~ s/uttoning/utoning/ig;
             $string =~ s/procent/pro-cent/ig;
             $string =~ s/spellistor/spelistor/ig;
             $string =~ s/cuesheet/qjyschiit/ig;
         }
     }
     elsif ($language eq "italiano") {
         # for all italian engines (e.g. for english words)
         $string =~ s/Replaygain/Ripleyghein/ig;
         $string =~ s/Crossfade/Crossfeid/ig;
         $string =~ s/beep/Bip/ig;
         $string =~ s/cuesheet/chiushit/ig;
         $string =~ s/fade/feid/ig;
         $string =~ s/Crossfeed/crossfid/ig;
         $string =~ s/Cache/chash/ig;
         $string =~ s/\bfirmware(s?)\b/firmuer$1/ig;
         $string =~ s/\bFile(s?)\b/fail$1/ig;
         $string =~ s/\bloudness\b/laudness/ig;
         $string =~ s/\bunicode\b/unikod/ig;
         $string =~ s/Playlist/pleylist/ig;
         $string =~ s/WavPack/wave pak/ig;
         $string =~ s/BITRATE/bit reit/ig;
         $string =~ s/Codepage/cod page/ig;
         $string =~ s/PCM Wave/pcm ueiv/ig;
         # NOTE(review): the next three look like repairs for double-encoded
         # accented letters; "(s*)" may have been meant as "(\s*)" - confirm
         $string =~ s/Ã¨/è/ig;
         $string =~ s/\b(s*)Ã¹\b/$1ù/ig;
         $string =~ s/\b(s*)Ã\b/$1à/ig;
         if ($engine eq "sapi") {   # just for SAPI
             if ($vendor eq "Loquendo") {
                 $string =~ s/Inizializza/inizializa/ig;
             }
             elsif ($vendor eq "ScanSoft, Inc") {
                 $string =~ s/V/v/ig;
                 $string =~ s/X/x/ig;
                 $string =~ s/stop/stohp/ig;
             }
         }
     }

     if ($orig ne $string) {
         printf("%s -> %s\n", $orig, $string) if $verbose;
     }
     return $string;
 }
 231
 # Quote a value so that /bin/sh hands it on as exactly one literal argument.
 sub _shell_quote {
     my ($text) = @_;
     # Inside single quotes only the single quote itself needs care:
     # close the quotes, add an escaped quote, reopen.
     $text =~ s/'/'\\''/g;
     return "'" . $text . "'";
 }

 # Produce a wav file of the text given.
 #
 # Parameters:
 #   $string          - text to speak
 #   $output          - name of the wav file to write
 #   $tts_engine_opts - extra command line options for the engine
 #   $tts_object      - hash reference from init_tts; "name" selects the
 #                      engine, "stdin" is the pipe to the SAPI helper
 #
 # Handles festival, flite, espeak, sapi and swift; any other engine name
 # does nothing.
 sub voicestring {
     our $verbose;
     my ($string, $output, $tts_engine_opts, $tts_object) = @_;
     my $engine = $$tts_object{"name"};
     my $cmd;
     printf("Generate \"%s\" with %s in file %s\n", $string, $engine, $output) if $verbose;
     if ($engine eq "festival") {
         # festival_client lies to us, so we have to do awful soul-eating
         # work with IPC::open3()
         $cmd = "festival_client --server localhost --otype riff --ttw --output " . _shell_quote($output);
         print("> $cmd\n") if $verbose;
         # Open command, and filehandles for STDIN, STDOUT, STDERR
         my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
         # Put the string to speak into STDIN and close it
         print(CMD_IN $string);
         close(CMD_IN);
         # Read all output from festival_client (because it LIES TO US)
         while (<CMD_ERR>) {
         }
         close(CMD_OUT);
         close(CMD_ERR);
         # open3 never reaps its child; without this every clip leaves a zombie
         waitpid($pid, 0);
     }
     elsif ($engine eq "flite") {
         $cmd = "flite $tts_engine_opts -t " . _shell_quote($string) . " " . _shell_quote($output);
         print("> $cmd\n") if $verbose;
         # backticks keep the tool's stdout off our own output
         my $ignored = `$cmd`;
     }
     elsif ($engine eq "espeak") {
         $cmd = "espeak $tts_engine_opts -w " . _shell_quote($output);
         print("> $cmd\n") if $verbose;
         # espeak reads the text to speak from its standard input
         if (open(my $espeak, "|-", $cmd)) {
             print {$espeak} $string . "\n";
             close($espeak);
         }
         else {
             warn "Unable to run espeak: $!\n";
         }
     }
     elsif ($engine eq "sapi") {
         # The helper script does the work; it gets a tab separated command
         print({$$tts_object{"stdin"}} "SPEAK\t$output\t$string\r\n");
     }
     elsif ($engine eq "swift") {
         $cmd = "swift $tts_engine_opts -o " . _shell_quote($output) . " " . _shell_quote($string);
         print("> $cmd\n") if $verbose;
         system($cmd);
     }
 }
 278
 # Strip leading / trailing silence from a clip with the wavtrim tool.
 #
 # Parameters:
 #   $file       - wav file to trim
 #   $threshold  - threshold value handed to wavtrim
 #   $tts_object - hash reference from init_tts; for the sapi engine the
 #                 command is sent to the helper script instead of being
 #                 run here
 sub wavtrim {
     our $verbose;
     my ($file, $threshold, $tts_object) = @_;
     if ($verbose) {
         printf("Trim \"%s\"\n", $file);
     }
     my $cmd = sprintf('wavtrim "%s" %s', $file, $threshold);
     unless ($$tts_object{"name"} eq "sapi") {
         if ($verbose) {
             print("> $cmd\n");
         }
         return `$cmd`;
     }
     return print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 }
 293
 # Run the encoder on a wav file to produce the destination file.
 #
 # Parameters:
 #   $input        - wav file to read
 #   $output       - file to create
 #   $encoder      - encoder program
 #   $encoder_opts - options placed between the program and the file names
 #   $tts_object   - hash reference from init_tts; for the sapi engine the
 #                   command is sent to the helper script instead of being
 #                   run here
 sub encodewav {
     our $verbose;
     my ($input, $output, $encoder, $encoder_opts, $tts_object) = @_;
     if ($verbose) {
         printf("Encode \"%s\" with %s in file %s\n", $input, $encoder, $output);
     }
     my $cmd = join(' ', $encoder, $encoder_opts, "\"$input\"", "\"$output\"");
     unless ($$tts_object{"name"} eq "sapi") {
         if ($verbose) {
             print("> $cmd\n");
         }
         return `$cmd`;
     }
     return print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 }
 308
 # Wait until the external helper has worked off everything sent so far.
 # Only the sapi engine runs in another process; for every other engine
 # this does nothing.
 #
 # Parameters:
 #   $tts_object - hash reference from init_tts ("stdin" / "stdout" are the
 #                 pipes to and from the helper)
 sub synchronize {
     my ($tts_object) = @_;
     return unless $$tts_object{"name"} eq "sapi";
     my ($to_helper, $from_helper) = @$tts_object{"stdin", "stdout"};
     print {$to_helper} "SYNC\t42\r\n";
     # Block until the helper answers; the content of the answer is ignored
     my $answer = readline($from_helper);
     return $answer;
 }
 318
 # Run genlang and create voice clips for each string.
 #
 # Parameters:
 #   $language        - language to build (name of the .lang file, no extension)
 #   $target          - target specification handed to genlang
 #   $encoder         - encoder program
 #   $encoder_opts    - encoder options
 #   $tts_engine      - TTS engine name
 #   $tts_engine_opts - TTS engine options
 #
 # Writes every genlang output line to the file "voicefontids" and leaves
 # one <id>.mp3 per voice string in the current directory. If the
 # environment variable POOL is set it names a directory used as a cache
 # of already encoded clips.
 # Dies if "voicefontids" cannot be written.
 sub generateclips {
     our $verbose;
     my ($language, $target, $encoder, $encoder_opts, $tts_engine, $tts_engine_opts) = @_;
     my $english = dirname($0) . '/../apps/lang/english.lang';
     my $langfile = dirname($0) . '/../apps/lang/' . $language . '.lang';
     my $id = '';
     my $voice = '';
     my $cmd = "genlang -o -t=$target -e=$english $langfile 2>/dev/null";
     my $pool_file;
     my $count = 0;
     local $| = 1; # make progress indicator work reliably

     # Fail before any TTS process is started if the id list can't be written
     open(my $idlist, '>', 'voicefontids')
         or die "Unable to write voicefontids: $!\n";

     my $tts_object = init_tts($tts_engine, $tts_engine_opts, $language);
     print("Generating voice clips");
     print("\n") if $verbose;
     for my $line (`$cmd`) {
         print {$idlist} $line;
         if ($line =~ /^id: (.*)$/) {
             $id = $1;
         }
         elsif ($line =~ /^voice: "(.*)"$/) {
             $voice = $1;
             if ($id !~ /^NOT_USED_.*$/ && $voice ne "") {
                 my $wav = $id . '.wav';
                 my $mp3 = $id . '.mp3';

                 # Print some progress information
                 if (++$count % 10 == 0 and !$verbose) {
                     print(".");
                 }

                 # Apply corrections to the string
                 $voice = correct_string($voice, $language, $tts_object);

                 # If we have a pool of snippets, see if the string exists there first
                 if (defined($ENV{'POOL'})) {
                     # The pool key covers everything that influences the sound
                     $pool_file = sprintf("%s/%s-%s.mp3", $ENV{'POOL'},
                                          md5_hex("$voice $tts_engine $tts_engine_opts $encoder_opts"),
                                          $language);
                     if (-f $pool_file) {
                         printf("Re-using %s (%s) from pool\n", $id, $voice) if $verbose;
                         copy($pool_file, $mp3);
                     }
                 }

                 # Don't generate MP3 if it already exists (probably from the POOL)
                 if (! -f $mp3) {
                     if ($id eq "VOICE_PAUSE") {
                         print("Use distributed $wav\n") if $verbose;
                         copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
                     }
                     else {
                         voicestring($voice, $wav, $tts_engine_opts, $tts_object);
                         wavtrim($wav, 500, $tts_object);
                         # 500 seems to be a reasonable default for now
                     }

                     encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
                     synchronize($tts_object);
                     if (defined($ENV{'POOL'})) {
                         copy($mp3, $pool_file);
                     }
                     unlink($wav);
                 }
                 $voice = "";
                 $id = "";
             }
         }
     }
     print("\n");
     # Buffered write errors only show up at close time. Remember the result
     # so the TTS engine is still shut down before we give up.
     my $closed = close($idlist);
     my $close_error = $!;
     shutdown_tts($tts_object);
     die "Unable to finish writing voicefontids: $close_error\n" unless $closed;
 }
 395
 # Build the voice file by running the voicefont tool on "voicefontids".
 #
 # Parameters:
 #   $language  - language name; the result is written to <language>.voice
 #   $target_id - target id handed to voicefont
 sub createvoice {
     our $verbose;
     my ($language, $target_id) = @_;
     my $outfile = $language . ".voice";
     my $cmd = "voicefont 'voicefontids' " . $target_id . " ./ " . $outfile;
     if ($verbose) {
         printf("Saving voice file to %s\n", $outfile);
         print("> $cmd\n");
     }
     # The tool's output is only shown in verbose mode
     my $output = `$cmd`;
     print($output) if $verbose;
 }
 408
 # Remove every *.mp3 and *.wav file from the current directory.
 sub deletemp3s() {
     foreach my $pattern ('*.mp3', '*.wav') {
         unlink($_) for glob($pattern);
     }
 }
 417
 # Abort handler: delete the clips generated so far in the current
 # directory, then terminate via die.
 sub panic_cleanup {
     deletemp3s();
     die("moo");
 }
 422
 # Generate .talk clips for every entry below a directory.
 #
 # Parameters:
 #   $dir             - directory to process; subdirectories are handled
 #                      recursively
 #   $tts_object      - hash reference from init_tts
 #   $encoder         - encoder program
 #   $encoder_opts    - encoder options
 #   $tts_engine_opts - TTS engine options
 #   $i               - start value of the progress counter
 #
 # A file "name.ext" gets "name.ext.talk" speaking "name"; a directory gets
 # "_dirname.talk" inside it speaking the directory name. Existing .talk
 # files are skipped. A directory that cannot be read is reported with a
 # warning and skipped.
 sub gentalkclips {
     our $verbose;
     my ($dir, $tts_object, $encoder, $encoder_opts, $tts_engine_opts, $i) = @_;
     my $d = DirHandle->new($dir);
     unless (defined($d)) {
         warn "Unable to read directory $dir: $!\n";
         return;
     }
     # defined() matters: an entry named "0" is false but perfectly valid
     while (defined(my $file = $d->read)) {
         # Print some progress information
         if (++$i % 10 == 0 and !$verbose) {
             print(".");
         }

         # Ignore dot-dirs and talk files
         next if $file eq '.' || $file eq '..' || $file =~ /\.talk$/;

         # Convert to a complete path
         my $path = sprintf("%s/%s", $dir, $file);
         my $voice = $file;
         my $wav = sprintf("%s.talk.wav", $path);
         my $mp3;

         if (-d $path) {
             # Element is a dir: handle its content first, then the dir itself
             gentalkclips($path, $tts_object, $encoder, $encoder_opts, $tts_engine_opts, $i);
             $mp3 = sprintf("%s/_dirname.talk", $path);
         }
         else {
             # Element is a file
             $mp3 = sprintf("%s.talk", $path);
             $voice =~ s/\.[^\.]*$//; # Trim extension
         }

         printf("Talkclip %s: %s\n", $mp3, $voice) if $verbose;

         voicestring($voice, $wav, $tts_engine_opts, $tts_object);
         wavtrim($wav, 500, $tts_object);
         # 500 seems to be a reasonable default for now
         encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
         synchronize($tts_object);
         unlink($wav);
     }
 }
 466
 467
 # Check parameters. The option variables ($V, $C, $t, ...) are set by
 # perl's -s switch from the command line.
 my $printusage = 0;
 unless (defined($V) or defined($C)) { print("Missing either -V or -C\n"); $printusage = 1; }
 if (defined($V)) {
     unless (defined($t)) { print("Missing -t argument\n"); $printusage = 1; }
     unless (defined($l)) { print("Missing -l argument\n"); $printusage = 1; }
     unless (defined($i)) { print("Missing -i argument\n"); $printusage = 1; }
 }
 elsif (defined($C)) {
     unless (defined($ARGV[0])) { print "Missing path argument\n"; $printusage = 1; }
 }
 unless (defined($e)) { print("Missing -e argument\n"); $printusage = 1; }
 unless (defined($E)) { print("Missing -E argument\n"); $printusage = 1; }
 unless (defined($s)) { print("Missing -s argument\n"); $printusage = 1; }
 unless (defined($S)) { print("Missing -S argument\n"); $printusage = 1; }
 if ($printusage == 1) { printusage(); exit 1; }

 # Verbose output can be requested with -v or through the environment
 if (defined($v) or defined($ENV{'V'})) {
     our $verbose = 1;
 }

 # add the tools dir to the path temporarily, for calling various tools
 $ENV{'PATH'} = dirname($0) . ':' . $ENV{'PATH'};


 # Do what we're told
 if (defined($V) and $V == 1) {
     # Only do the panic cleanup for voicefiles.
     # (SIGKILL cannot be trapped, so only SIGINT gets a handler.)
     $SIG{INT} = \&panic_cleanup;

     printf("Generating voice\n  Target: %s\n  Language: %s\n  Encoder (options): %s (%s)\n  TTS Engine (options): %s (%s)\n",
         $t, $l, $e, $E, $s, $S);
     generateclips($l, $t, $e, $E, $s, $S);
     createvoice($l, $i);
     deletemp3s();
 }
 elsif ($C) {
     # -l is optional for talk clips; use an empty name instead of undef
     my $language = defined($l) ? $l : "";
     printf("Generating .talk clips\n  Path: %s\n  Language: %s\n  Encoder (options): %s (%s)\n  TTS Engine (options): %s (%s)\n", $ARGV[0], $language, $e, $E, $s, $S);
     my $tts_object = init_tts($s, $S, $language);
     gentalkclips($ARGV[0], $tts_object, $e, $E, $S, 0);
     shutdown_tts($tts_object);
 }
 else {
     printusage();
     exit 1;
 }