tools/voice.pl

   1 #!/usr/bin/perl -s
   2 #             __________               __   ___.
   3 #   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4 #   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5 #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6 #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7 #                     \/            \/     \/    \/            \/
   8 # $Id:
   9 #
  10 # Copyright (C) 2007 Jonas Häggqvist
  11 #
  12 # All files in this archive are subject to the GNU General Public License.
  13 # See the file COPYING in the source tree root for full license agreement.
  14 #
  15 # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  16 # KIND, either express or implied.
  17
  18 use strict;
  19 use warnings;
  20 use File::Basename;
  21 use File::Copy;
  22 use Switch;
  23 use vars qw($V $C $t $l $e $E $s $S $i $v);
  24 use IPC::Open2;
  25 use IPC::Open3;
  26 use Digest::MD5 qw(md5_hex);
  27
  28 sub printusage {
  29     print <<USAGE
  30
  31 Usage: voice.pl [options] [path to dir]
  32  -V
  33     Create voice file. You must also specify -t and -l.
  34
  35  -C
  36     Create .talk clips.
  37
  38  -t=<target>
  39     Specify which target you want to build voicefile for. Must include
  40     any features that target supports.
  41
  42  -i=<target_id>
  43     Numeric target id. Needed for voice building.
  44
  45  -l=<language>
  46     Specify which language you want to build. Without .lang extension.
  47
  48  -e=<encoder>
  49     Which encoder to use for voice strings
  50
  51  -E=<encoder options>
  52     Which encoder options to use when compressing voice strings. Enclose
  53     in double quotes if the options include spaces.
  54
  55  -s=<TTS engine>
  56     Which TTS engine to use.
  57
  58  -S=<TTS engine options>
  59     Options to pass to the TTS engine. Enclose in double quotes if the
  60     options include spaces.
  61
  62  -v
  63     Be verbose
  64 USAGE
  65 ;
  66 }
  67
  68 # Initialize TTS engine. May return an object or value which will be passed
  69 # to voicestring and shutdown_tts
  70 sub init_tts {
  71     our $verbose;
  72     my ($tts_engine, $tts_engine_opts, $language) = @_;
  73     my %ret = ("name" => $tts_engine);
  74     switch($tts_engine) {
  75         case "festival" {
  76             print("> festival $tts_engine_opts --server\n") if $verbose;
  77             my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
  78             my $dummy = *FESTIVAL_SERVER; #suppress warning
  79             $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
  80             $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
  81             $ret{"pid"} = $pid;
  82         }
  83         case "sapi" {
  84             my $toolsdir = dirname($0);
  85             my $path = `cygpath $toolsdir -a -w`;
  86             chomp($path);
  87             $path = $path . '\\';
  88             my $cmd = $path . "sapi_voice.vbs /language:$language $tts_engine_opts";
  89             $cmd =~ s/\\/\\\\/g;
  90             print("> cscript //nologo $cmd\n") if $verbose;
  91             my $pid = open2(*CMD_OUT, *CMD_IN, "cscript //nologo $cmd");
  92             $SIG{INT} = sub { print(CMD_IN "QUIT\r\n"); panic_cleanup(); };
  93             $SIG{KILL} = sub { print(CMD_IN "QUIT\r\n"); panic_cleanup(); };
  94             print(CMD_IN "QUERY\tVENDOR\r\n");
  95             my $vendor = readline(CMD_OUT);
  96             $vendor =~ s/\r\n//;
  97             %ret = (%ret,
  98                     "stdin" => *CMD_IN,
  99                     "stdout" => *CMD_OUT,
 100                     "toolspath" => $path,
 101                     "vendor" => $vendor);
 102         }
 103     }
 104     return \%ret;
 105 }
 106
 107 # Shutdown TTS engine if necessary.
 108 sub shutdown_tts {
 109     my ($tts_object) = @_;
 110     switch($$tts_object{"name"}) {
 111         case "festival" {
 112             # Send SIGTERM to festival server
 113             kill TERM => $$tts_object{"pid"};
 114         }
 115         case "sapi" {
 116             print({$$tts_object{"stdin"}} "QUIT\r\n");
 117             close($$tts_object{"stdin"});
 118         }
 119     }
 120 }
 121
 122 # Apply corrections to a voice-string to make it sound better
 123 sub correct_string {
 124     our $verbose;
 125     my ($string, $language, $tts_object) = @_;
 126     my $orig = $string;
 127     switch($language) {
 128         # General for all engines and languages
 129         $string =~ s/USB/U S B/g;
 130         $string =~ s/ID3/I D 3/g;
 131
 132         case "english" {
 133             $string =~ s/plugin(s?)/plug-in$1/ig;
 134         }
 135         case "deutsch" {
 136             # for all german engines (e.g. for english words)
 137             $string =~ s/alkaline/alkalein/ig;
 138             $string =~ s/byte(s?)/beit$1/ig;
 139             $string =~ s/clip(s?)/klipp$1/ig;
 140             $string =~ s/cuesheet/kjuschiet/ig;
 141             $string =~ s/dither/didder/ig;
 142             $string =~ s/equalizer/iquileiser/ig;
 143             $string =~ s/\bflash\b/fläsh/ig;
 144             $string =~ s/\bfirmware(s?)\b/firmwer$1/ig;
 145             $string =~ s/\bI D 3 tag\b/I D 3 täg/ig; # can't just use "tag" here
 146             $string =~ s/\bloudness\b/laudness/ig;
 147             $string =~ s/\bunicode\b/unikod/ig;
 148             switch($$tts_object{"name"}) {
 149                  case "sapi" {   # just for SAPI
 150                     switch($$tts_object{"vendor"}) {
 151                         case "AT&T Labs" {
 152                             $string =~ s/alphabet/alfabet/ig;
 153                             $string =~ s/ampere/amper/ig;
 154                             $string =~ s/\bdezibel\b/de-zibell/ig;
 155                             $string =~ s/diddering/didde-ring/ig;
 156                             $string =~ s/energie\b/ener-gie/ig;
 157                             $string =~ s/\bnumerisch\b/numehrisch/ig;
 158                             $string =~ s/\brücklauf\b/rück-lauf/ig;
 159                             $string =~ s/\bsuchlauf\b/such-lauf/ig;
 160                         }
 161                     }
 162                 }
 163             }
 164         }
 165     }
 166     if ($orig ne $string) {
 167         printf("%s -> %s\n", $orig, $string) if $verbose;
 168     }
 169     return $string;
 170 }
 171
 172 # Produce a wav file of the text given
 173 sub voicestring {
 174     our $verbose;
 175     my ($string, $output, $tts_engine_opts, $tts_object) = @_;
 176     my $cmd;
 177     printf("Generate \"%s\" with %s in file %s\n", $string, $$tts_object{"name"}, $output) if $verbose;
 178     switch($$tts_object{"name"}) {
 179         case "festival" {
 180             # festival_client lies to us, so we have to do awful soul-eating
 181             # work with IPC::open3()
 182             $cmd = "festival_client --server localhost --otype riff --ttw --output \"$output\"";
 183             print("> $cmd\n") if $verbose;
 184             # Open command, and filehandles for STDIN, STDOUT, STDERR
 185             my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
 186             # Put the string to speak into STDIN and close it
 187             print(CMD_IN $string);
 188             close(CMD_IN);
 189             # Read all output from festival_client (because it LIES TO US)
 190             while (<CMD_ERR>) {
 191             }
 192             close(CMD_OUT);
 193             close(CMD_ERR);
 194         }
 195         case "flite" {
 196             $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
 197             print("> $cmd\n") if $verbose;
 198             `$cmd`;
 199         }
 200         case "espeak" {
 201             # xxx: $tts_engine_opts isn't used
 202             $cmd = "espeak $tts_engine_opts -w $output";
 203             print("> $cmd\n") if $verbose;
 204             open(ESPEAK, "| $cmd");
 205             print ESPEAK $string . "\n";
 206             close(ESPEAK);
 207         }
 208         case "sapi" {
 209             print({$$tts_object{"stdin"}} "SPEAK\t$output\t$string\r\n");
 210         }
 211         case "swift" {
 212             $cmd = "swift $tts_engine_opts -o $output \"$string\"";
 213             print("> $cmd\n") if $verbose;
 214             system($cmd);
 215         }
 216     }
 217 }
 218
 219 # trim leading / trailing silence from the clip
 220 sub wavtrim {
 221     our $verbose;
 222     my ($file, $threshold, $tts_object) = @_;
 223     printf("Trim \"%s\"\n", $file) if $verbose;
 224     if ($$tts_object{"name"} eq "sapi") {
 225         my $cmd = $$tts_object{"toolspath"}."wavtrim $file $threshold";
 226         print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 227     }
 228     else {
 229         my $cmd = dirname($0) . "/wavtrim $file $threshold";
 230         print("> $cmd\n") if $verbose;
 231         `$cmd`;
 232     }
 233 }
 234
 235 # Encode a wav file into the given destination file
 236 sub encodewav {
 237     our $verbose;
 238     my ($input, $output, $encoder, $encoder_opts, $tts_object) = @_;
 239     my $cmd = '';
 240     printf("Encode \"%s\" with %s in file %s\n", $input, $encoder, $output) if $verbose;
 241     switch ($encoder) {
 242         case 'lame' {
 243             $cmd = "lame $encoder_opts \"$input\" \"$output\"";
 244         }
 245         case 'vorbis' {
 246             $cmd = "oggenc $encoder_opts \"$input\" -o \"$output\"";
 247         }
 248         case 'speexenc' {
 249             $cmd = "speexenc $encoder_opts \"$input\" \"$output\"";
 250         }
 251     }
 252     if ($$tts_object{"name"} eq "sapi") {
 253         print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 254     }
 255     else {
 256         print("> $cmd\n") if $verbose;
 257         `$cmd`;
 258     }
 259 }
 260
 261 # synchronize the clip generation / processing if it's running in another process
 262 sub synchronize {
 263     my ($tts_object) = @_;
 264     if ($$tts_object{"name"} eq "sapi") {
 265         print({$$tts_object{"stdin"}} "SYNC\t42\r\n");
 266         my $wait = readline($$tts_object{"stdout"});
 267         #ignore what's actually returned
 268     }
 269 }
 270
 271 # Run genlang and create voice clips for each string
 272 sub generateclips {
 273     our $verbose;
 274     my ($language, $target, $encoder, $encoder_opts, $tts_engine, $tts_engine_opts) = @_;
 275     my $genlang = dirname($0) . '/genlang';
 276     my $english = dirname($0) . '/../apps/lang/english.lang';
 277     my $langfile = dirname($0) . '/../apps/lang/' . $language . '.lang';
 278     my $id = '';
 279     my $voice = '';
 280     my $cmd = "$genlang -o -t=$target -e=$english $langfile 2>/dev/null";
 281     my $pool_file;
 282     open(VOICEFONTIDS, "> voicefontids");
 283     my $i = 0;
 284     local $| = 1; # make progress indicator work reliably
 285
 286     my $tts_object = init_tts($tts_engine, $tts_engine_opts, $language);
 287     print("Generating voice clips");
 288     print("\n") if $verbose;
 289     for (`$cmd`) {
 290         my $line = $_;
 291         print(VOICEFONTIDS $line);
 292         if ($line =~ /^id: (.*)$/) {
 293             $id = $1;
 294         }
 295         elsif ($line =~ /^voice: "(.*)"$/) {
 296             $voice = $1;
 297             if ($id !~ /^NOT_USED_.*$/ && $voice ne "") {
 298                 my $wav = $id . '.wav';
 299                 my $mp3 = $id . '.mp3';
 300
 301                 # Print some progress information
 302                 if (++$i % 10 == 0 and !$verbose) {
 303                     print(".");
 304                 }
 305
 306                 # Apply corrections to the string
 307                 $voice = correct_string($voice, $language, $tts_object);
 308
 309                 # If we have a pool of snippets, see if the string exists there first
 310                 if (defined($ENV{'POOL'})) {
 311                     $pool_file = sprintf("%s/%s-%s.mp3", $ENV{'POOL'},
 312                                          md5_hex("$voice $tts_engine $tts_engine_opts $encoder_opts"),
 313                                          $language);
 314                     if (-f $pool_file) {
 315                         printf("Re-using %s (%s) from pool\n", $id, $voice) if $verbose;
 316                         copy($pool_file, $mp3);
 317                     }
 318                 }
 319
 320                 # Don't generate MP3 if it already exists (probably from the POOL)
 321                 if (! -f $mp3) {
 322                     if ($id eq "VOICE_PAUSE") {
 323                         print("Use distributed $wav\n") if $verbose;
 324                         copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
 325                     }
 326                     else {
 327                         voicestring($voice, $wav, $tts_engine_opts, $tts_object);
 328                         wavtrim($wav, 500, $tts_object);
 329                         # 500 seems to be a reasonable default for now
 330                     }
 331
 332                     encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
 333                     synchronize($tts_object);
 334                     if (defined($ENV{'POOL'})) {
 335                         copy($mp3, $pool_file);
 336                     }
 337                     unlink($wav);
 338                 }
 339                 $voice = "";
 340                 $id = "";
 341             }
 342         }
 343     }
 344     print("\n");
 345     close(VOICEFONTIDS);
 346     shutdown_tts($tts_object);
 347 }
 348
 349 # Assemble the voicefile
 350 sub createvoice {
 351     our $verbose;
 352     my ($language, $target_id) = @_;
 353     my $voicefont = dirname($0) . '/voicefont';
 354     my $outfile = "";
 355     my $i = 0;
 356     do {
 357         $outfile = sprintf("%s%s.voice", $language, ($i++ == 0 ? '' : '-'.$i));
 358     } while (-f $outfile);
 359     printf("Saving voice file to %s\n", $outfile) if $verbose;
 360     my $cmd = "$voicefont 'voicefontids' $target_id ./ $outfile";
 361     print("> $cmd\n") if $verbose;
 362     my $output = `$cmd`;
 363     print($output) if $verbose;
 364 }
 365
 366 sub deletemp3s() {
 367     for (glob('*.mp3')) {
 368         unlink($_);
 369     }
 370     for (glob('*.wav')) {
 371         unlink($_);
 372     }
 373 }
 374
 375 sub panic_cleanup {
 376     deletemp3s();
 377     die "moo";
 378 }
 379
 380 # Check parameters
 381 my $printusage = 0;
 382 unless (defined($V) or defined($C)) { print("Missing either -V or -C\n"); $printusage = 1; }
 383 if (defined($V)) {
 384     unless (defined($t)) { print("Missing -t argument\n"); $printusage = 1; }
 385     unless (defined($l)) { print("Missing -l argument\n"); $printusage = 1; }
 386     unless (defined($i)) { print("Missing -i argument\n"); $printusage = 1; }
 387 }
 388 elsif (defined($C)) {
 389     unless (defined($ARGV[0])) { print "Missing path argument\n"; $printusage = 1; }
 390 }
 391 unless (defined($e)) { print("Missing -e argument\n"); $printusage = 1; }
 392 unless (defined($E)) { print("Missing -E argument\n"); $printusage = 1; }
 393 unless (defined($s)) { print("Missing -s argument\n"); $printusage = 1; }
 394 unless (defined($S)) { print("Missing -S argument\n"); $printusage = 1; }
 395 if ($printusage == 1) { printusage(); exit 1; }
 396
 397 $SIG{INT} = \&panic_cleanup;
 398 $SIG{KILL} = \&panic_cleanup;
 399
 400 if (defined($v) or defined($ENV{'V'})) {
 401     our $verbose = 1;
 402 }
 403
 404
 405 # Do what we're told
 406 if ($V == 1) {
 407     printf("Generating voice\n  Target: %s\n  Language: %s\n  Encoder (options): %s (%s)\n  TTS Engine (options): %s (%s)\n",
 408         $t, $l, $e, $E, $s, $S);
 409     generateclips($l, $t, $e, $E, $s, $S);
 410     createvoice($l, $i);
 411     deletemp3s();
 412 }
 413 elsif ($C) {
 414     # xxx: Implement .talk clip generation
 415 }
 416 else {
 417     printusage();
 418     exit 1;
 419 }