tools/voice.pl

   1 #!/usr/bin/perl -s
   2 #             __________               __   ___.
   3 #   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4 #   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5 #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6 #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7 #                     \/            \/     \/    \/            \/
   8 # $Id:
   9 #
  10 # Copyright (C) 2007 Jonas Häggqvist
  11 #
  12 # All files in this archive are subject to the GNU General Public License.
  13 # See the file COPYING in the source tree root for full license agreement.
  14 #
  15 # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  16 # KIND, either express or implied.
  17
  18 use strict;
  19 use warnings;
  20 use File::Basename;
  21 use File::Copy;
  22 use Switch;
  23 use vars qw($V $C $t $l $e $E $s $S $i $v);
  24 use IPC::Open2;
  25 use IPC::Open3;
  26 use Digest::MD5 qw(md5_hex);
  27
  28 sub printusage {
  29     print <<USAGE
  30
  31 Usage: voice.pl [options] [path to dir]
  32  -V
  33     Create voice file. You must also specify -t and -l.
  34
  35  -C
  36     Create .talk clips.
  37
  38  -t=<target>
  39     Specify which target you want to build voicefile for. Must include
  40     any features that target supports.
  41
  42  -i=<target_id>
  43     Numeric target id. Needed for voice building.
  44
  45  -l=<language>
  46     Specify which language you want to build. Without .lang extension.
  47
  48  -e=<encoder>
  49     Which encoder to use for voice strings
  50
  51  -E=<encoder options>
  52     Which encoder options to use when compressing voice strings. Enclose
  53     in double quotes if the options include spaces.
  54
  55  -s=<TTS engine>
  56     Which TTS engine to use.
  57
  58  -S=<TTS engine options>
  59     Options to pass to the TTS engine. Enclose in double quotes if the
  60     options include spaces.
  61
  62  -v
  63     Be verbose
  64 USAGE
  65 ;
  66 }
  67
  68 # Initialize TTS engine. May return an object or value which will be passed
  69 # to voicestring and shutdown_tts
  70 sub init_tts {
  71     our $verbose;
  72     my ($tts_engine, $tts_engine_opts, $language) = @_;
  73     my %ret = ("name" => $tts_engine);
  74     switch($tts_engine) {
  75         case "festival" {
  76             print("> festival $tts_engine_opts --server\n") if $verbose;
  77             my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
  78             my $dummy = *FESTIVAL_SERVER; #suppress warning
  79             $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
  80             $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
  81             $ret{"pid"} = $pid;
  82         }
  83         case "sapi" {
  84             my $toolsdir = dirname($0);
  85             my $path = `cygpath $toolsdir -a -w`;
  86             chomp($path);
  87             $path = $path . '\\';
  88             my $cmd = $path . "sapi_voice.vbs /language:$language $tts_engine_opts";
  89             $cmd =~ s/\\/\\\\/g;
  90             print("> cscript //nologo $cmd\n") if $verbose;
  91             my $pid = open2(*CMD_OUT, *CMD_IN, "cscript //nologo $cmd");
  92             $SIG{INT} = sub { print(CMD_IN "QUIT\r\n"); panic_cleanup(); };
  93             $SIG{KILL} = sub { print(CMD_IN "QUIT\r\n"); panic_cleanup(); };
  94             print(CMD_IN "QUERY\tVENDOR\r\n");
  95             my $vendor = readline(CMD_OUT);
  96             $vendor =~ s/\r\n//;
  97             %ret = (%ret,
  98                     "stdin" => *CMD_IN,
  99                     "stdout" => *CMD_OUT,
 100                     "toolspath" => $path,
 101                     "vendor" => $vendor);
 102         }
 103     }
 104     return \%ret;
 105 }
 106
 107 # Shutdown TTS engine if necessary.
 108 sub shutdown_tts {
 109     my ($tts_object) = @_;
 110     switch($$tts_object{"name"}) {
 111         case "festival" {
 112             # Send SIGTERM to festival server
 113             kill TERM => $$tts_object{"pid"};
 114         }
 115         case "sapi" {
 116             print({$$tts_object{"stdin"}} "QUIT\r\n");
 117             close($$tts_object{"stdin"});
 118         }
 119     }
 120 }
 121
 122 # Apply corrections to a voice-string to make it sound better
 123 sub correct_string {
 124     our $verbose;
 125     my ($string, $language, $tts_object) = @_;
 126     my $orig = $string;
 127     switch($language) {
 128         # General for all engines and languages
 129         $string =~ s/USB/U S B/g;
 130         $string =~ s/ID3/I D 3/g;
 131
 132         case "english" {
 133             switch($$tts_object{"name"}) {
 134                 case ["sapi","festival"] {
 135                     $string =~ s/plugin(s?)/plug-in$1/ig;
 136                 }
 137             }
 138         }
 139         case "deutsch" {
 140             # for all german engines (e.g. for english words)
 141             $string =~ s/alkaline/alkalein/ig;
 142             $string =~ s/byte(s?)/beit$1/ig;
 143             $string =~ s/clip(s?)/klipp$1/ig;
 144             $string =~ s/cuesheet/kjuschiet/ig;
 145             $string =~ s/dither/didder/ig;
 146             $string =~ s/equalizer/iquileiser/ig;
 147             $string =~ s/\bflash\b/fläsh/ig;
 148             $string =~ s/\bfirmware(s?)\b/firmwer$1/ig;
 149             $string =~ s/\bI D 3 tag\b/I D 3 täg/ig; # can't just use "tag" here
 150             $string =~ s/\bloudness\b/laudness/ig;
 151             $string =~ s/\bunicode\b/unikod/ig;
 152             switch($$tts_object{"name"}) {
 153                  case "sapi" {   # just for SAPI
 154                     switch($$tts_object{"vendor"}) {
 155                         case "AT&T Labs" {
 156                             $string =~ s/alphabet/alfabet/ig;
 157                             $string =~ s/ampere/amper/ig;
 158                             $string =~ s/\bdezibel\b/de-zibell/ig;
 159                             $string =~ s/diddering/didde-ring/ig;
 160                             $string =~ s/energie\b/ener-gie/ig;
 161                             $string =~ s/\bnumerisch\b/numehrisch/ig;
 162                             $string =~ s/\brücklauf\b/rück-lauf/ig;
 163                             $string =~ s/\bsuchlauf\b/such-lauf/ig;
 164                         }
 165                     }
 166                 }
 167             }
 168         }
 169     }
 170     if ($orig ne $string) {
 171         printf("%s -> %s\n", $orig, $string) if $verbose;
 172     }
 173     return $string;
 174 }
 175
 176 # Produce a wav file of the text given
 177 sub voicestring {
 178     our $verbose;
 179     my ($string, $output, $tts_engine_opts, $tts_object) = @_;
 180     my $cmd;
 181     printf("Generate \"%s\" with %s in file %s\n", $string, $$tts_object{"name"}, $output) if $verbose;
 182     switch($$tts_object{"name"}) {
 183         case "festival" {
 184             # festival_client lies to us, so we have to do awful soul-eating
 185             # work with IPC::open3()
 186             $cmd = "festival_client --server localhost --otype riff --ttw --output \"$output\"";
 187             print("> $cmd\n") if $verbose;
 188             # Open command, and filehandles for STDIN, STDOUT, STDERR
 189             my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
 190             # Put the string to speak into STDIN and close it
 191             print(CMD_IN $string);
 192             close(CMD_IN);
 193             # Read all output from festival_client (because it LIES TO US)
 194             while (<CMD_ERR>) {
 195             }
 196             close(CMD_OUT);
 197             close(CMD_ERR);
 198         }
 199         case "flite" {
 200             $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
 201             print("> $cmd\n") if $verbose;
 202             `$cmd`;
 203         }
 204         case "espeak" {
 205             # xxx: $tts_engine_opts isn't used
 206             $cmd = "espeak $tts_engine_opts -w $output";
 207             print("> $cmd\n") if $verbose;
 208             open(ESPEAK, "| $cmd");
 209             print ESPEAK $string . "\n";
 210             close(ESPEAK);
 211         }
 212         case "sapi" {
 213             print({$$tts_object{"stdin"}} "SPEAK\t$output\t$string\r\n");
 214         }
 215         case "swift" {
 216             $cmd = "swift $tts_engine_opts -o $output \"$string\"";
 217             print("> $cmd\n") if $verbose;
 218             system($cmd);
 219         }
 220     }
 221 }
 222
 223 # trim leading / trailing silence from the clip
 224 sub wavtrim {
 225     our $verbose;
 226     my ($file, $threshold, $tts_object) = @_;
 227     printf("Trim \"%s\"\n", $file) if $verbose;
 228     if ($$tts_object{"name"} eq "sapi") {
 229         my $cmd = $$tts_object{"toolspath"}."wavtrim $file $threshold";
 230         print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 231     }
 232     else {
 233         my $cmd = dirname($0) . "/wavtrim $file $threshold";
 234         print("> $cmd\n") if $verbose;
 235         `$cmd`;
 236     }
 237 }
 238
 239 # Encode a wav file into the given destination file
 240 sub encodewav {
 241     our $verbose;
 242     my ($input, $output, $encoder, $encoder_opts, $tts_object) = @_;
 243     my $cmd = '';
 244     printf("Encode \"%s\" with %s in file %s\n", $input, $encoder, $output) if $verbose;
 245     switch ($encoder) {
 246         case 'lame' {
 247             $cmd = "lame $encoder_opts \"$input\" \"$output\"";
 248         }
 249         case 'vorbis' {
 250             $cmd = "oggenc $encoder_opts \"$input\" -o \"$output\"";
 251         }
 252         case 'speexenc' {
 253             $cmd = "speexenc $encoder_opts \"$input\" \"$output\"";
 254         }
 255     }
 256     if ($$tts_object{"name"} eq "sapi") {
 257         print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 258     }
 259     else {
 260         print("> $cmd\n") if $verbose;
 261         `$cmd`;
 262     }
 263 }
 264
 265 # synchronize the clip generation / processing if it's running in another process
 266 sub synchronize {
 267     my ($tts_object) = @_;
 268     if ($$tts_object{"name"} eq "sapi") {
 269         print({$$tts_object{"stdin"}} "SYNC\t42\r\n");
 270         my $wait = readline($$tts_object{"stdout"});
 271         #ignore what's actually returned
 272     }
 273 }
 274
 275 # Run genlang and create voice clips for each string
 276 sub generateclips {
 277     our $verbose;
 278     my ($language, $target, $encoder, $encoder_opts, $tts_engine, $tts_engine_opts) = @_;
 279     my $genlang = dirname($0) . '/genlang';
 280     my $english = dirname($0) . '/../apps/lang/english.lang';
 281     my $langfile = dirname($0) . '/../apps/lang/' . $language . '.lang';
 282     my $id = '';
 283     my $voice = '';
 284     my $cmd = "$genlang -o -t=$target -e=$english $langfile 2>/dev/null";
 285     my $pool_file;
 286     open(VOICEFONTIDS, "> voicefontids");
 287     my $i = 0;
 288     local $| = 1; # make progress indicator work reliably
 289
 290     my $tts_object = init_tts($tts_engine, $tts_engine_opts, $language);
 291     print("Generating voice clips");
 292     print("\n") if $verbose;
 293     for (`$cmd`) {
 294         my $line = $_;
 295         print(VOICEFONTIDS $line);
 296         if ($line =~ /^id: (.*)$/) {
 297             $id = $1;
 298         }
 299         elsif ($line =~ /^voice: "(.*)"$/) {
 300             $voice = $1;
 301             if ($id !~ /^NOT_USED_.*$/ && $voice ne "") {
 302                 my $wav = $id . '.wav';
 303                 my $mp3 = $id . '.mp3';
 304
 305                 # Print some progress information
 306                 if (++$i % 10 == 0 and !$verbose) {
 307                     print(".");
 308                 }
 309
 310                 # Apply corrections to the string
 311                 $voice = correct_string($voice, $language, $tts_object);
 312
 313                 # If we have a pool of snippets, see if the string exists there first
 314                 if (defined($ENV{'POOL'})) {
 315                     $pool_file = sprintf("%s/%s-%s.mp3", $ENV{'POOL'},
 316                                          md5_hex("$voice $tts_engine $tts_engine_opts $encoder_opts"),
 317                                          $language);
 318                     if (-f $pool_file) {
 319                         printf("Re-using %s (%s) from pool\n", $id, $voice) if $verbose;
 320                         copy($pool_file, $mp3);
 321                     }
 322                 }
 323
 324                 # Don't generate MP3 if it already exists (probably from the POOL)
 325                 if (! -f $mp3) {
 326                     if ($id eq "VOICE_PAUSE") {
 327                         print("Use distributed $wav\n") if $verbose;
 328                         copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
 329                     }
 330                     else {
 331                         voicestring($voice, $wav, $tts_engine_opts, $tts_object);
 332                         wavtrim($wav, 500, $tts_object);
 333                         # 500 seems to be a reasonable default for now
 334                     }
 335
 336                     encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
 337                     synchronize($tts_object);
 338                     if (defined($ENV{'POOL'})) {
 339                         copy($mp3, $pool_file);
 340                     }
 341                     unlink($wav);
 342                 }
 343                 $voice = "";
 344                 $id = "";
 345             }
 346         }
 347     }
 348     print("\n");
 349     close(VOICEFONTIDS);
 350     shutdown_tts($tts_object);
 351 }
 352
 353 # Assemble the voicefile
 354 sub createvoice {
 355     our $verbose;
 356     my ($language, $target_id) = @_;
 357     my $voicefont = dirname($0) . '/voicefont';
 358     my $outfile = "";
 359     my $i = 0;
 360     do {
 361         $outfile = sprintf("%s%s.voice", $language, ($i++ == 0 ? '' : '-'.$i));
 362     } while (-f $outfile);
 363     printf("Saving voice file to %s\n", $outfile) if $verbose;
 364     my $cmd = "$voicefont 'voicefontids' $target_id ./ $outfile";
 365     print("> $cmd\n") if $verbose;
 366     my $output = `$cmd`;
 367     print($output) if $verbose;
 368 }
 369
 370 sub deletemp3s() {
 371     for (glob('*.mp3')) {
 372         unlink($_);
 373     }
 374     for (glob('*.wav')) {
 375         unlink($_);
 376     }
 377 }
 378
 379 sub panic_cleanup {
 380     deletemp3s();
 381     die "moo";
 382 }
 383
 384 # Check parameters
 385 my $printusage = 0;
 386 unless (defined($V) or defined($C)) { print("Missing either -V or -C\n"); $printusage = 1; }
 387 if (defined($V)) {
 388     unless (defined($t)) { print("Missing -t argument\n"); $printusage = 1; }
 389     unless (defined($l)) { print("Missing -l argument\n"); $printusage = 1; }
 390     unless (defined($i)) { print("Missing -i argument\n"); $printusage = 1; }
 391 }
 392 elsif (defined($C)) {
 393     unless (defined($ARGV[0])) { print "Missing path argument\n"; $printusage = 1; }
 394 }
 395 unless (defined($e)) { print("Missing -e argument\n"); $printusage = 1; }
 396 unless (defined($E)) { print("Missing -E argument\n"); $printusage = 1; }
 397 unless (defined($s)) { print("Missing -s argument\n"); $printusage = 1; }
 398 unless (defined($S)) { print("Missing -S argument\n"); $printusage = 1; }
 399 if ($printusage == 1) { printusage(); exit 1; }
 400
 401 $SIG{INT} = \&panic_cleanup;
 402 $SIG{KILL} = \&panic_cleanup;
 403
 404 if (defined($v) or defined($ENV{'V'})) {
 405     our $verbose = 1;
 406 }
 407
 408
 409 # Do what we're told
 410 if ($V == 1) {
 411     printf("Generating voice\n  Target: %s\n  Language: %s\n  Encoder (options): %s (%s)\n  TTS Engine (options): %s (%s)\n",
 412         $t, $l, $e, $E, $s, $S);
 413     generateclips($l, $t, $e, $E, $s, $S);
 414     createvoice($l, $i);
 415     deletemp3s();
 416 }
 417 elsif ($C) {
 418     # xxx: Implement .talk clip generation
 419 }
 420 else {
 421     printusage();
 422     exit 1;
 423 }