tools/voice.pl

   1 #!/usr/bin/perl -s
   2 #             __________               __   ___.
   3 #   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4 #   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5 #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6 #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7 #                     \/            \/     \/    \/            \/
   8 # $Id$
   9 #
  10 # Copyright (C) 2007 Jonas Häggqvist
  11 #
  12 # All files in this archive are subject to the GNU General Public License.
  13 # See the file COPYING in the source tree root for full license agreement.
  14 #
  15 # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  16 # KIND, either express or implied.
  17
  18 use strict;
  19 use warnings;
  20 use feature 'switch';
  21 use File::Basename;
  22 use File::Copy;
  23 use vars qw($V $C $t $l $e $E $s $S $i $v);
  24 use IPC::Open2;
  25 use IPC::Open3;
  26 use Digest::MD5 qw(md5_hex);
  27 use DirHandle;
  28 use open ':encoding(utf8)';
  29 use open ':std';
  30
  31 sub printusage {
  32     print <<USAGE
  33
  34 Usage: voice.pl [options] [path to dir]
  35  -V
  36     Create voice file. You must also specify -t and -l.
  37
  38  -C
  39     Create .talk clips.
  40
  41  -t=<target>
  42     Specify which target you want to build voicefile for. Must include
  43     any features that target supports.
  44
  45  -i=<target_id>
  46     Numeric target id. Needed for voice building.
  47
  48  -l=<language>
  49     Specify which language you want to build. Without .lang extension.
  50
  51  -e=<encoder>
  52     Which encoder to use for voice strings
  53
  54  -E=<encoder options>
  55     Which encoder options to use when compressing voice strings. Enclose
  56     in double quotes if the options include spaces.
  57
  58  -s=<TTS engine>
  59     Which TTS engine to use.
  60
  61  -S=<TTS engine options>
  62     Options to pass to the TTS engine. Enclose in double quotes if the
  63     options include spaces.
  64
  65  -v
  66     Be verbose
  67 USAGE
  68 ;
  69 }
  70
  71 # Initialize TTS engine. May return an object or value which will be passed
  72 # to voicestring and shutdown_tts
  73 sub init_tts {
  74     our $verbose;
  75     my ($tts_engine, $tts_engine_opts, $language) = @_;
  76     my %ret = ("name" => $tts_engine);
  77     given ($tts_engine) {
  78         when ("festival") {
  79             print("> festival $tts_engine_opts --server\n") if $verbose;
  80             my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
  81             my $dummy = *FESTIVAL_SERVER; #suppress warning
  82             $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
  83             $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
  84             $ret{"pid"} = $pid;
  85         }
  86         when ("sapi") {
  87             my $toolsdir = dirname($0);
  88             my $path = `cygpath $toolsdir -a -w`;
  89             chomp($path);
  90             $path = $path . '\\';
  91             my $cmd = $path . "sapi_voice.vbs /language:$language $tts_engine_opts";
  92             $cmd =~ s/\\/\\\\/g;
  93             print("> cscript //nologo $cmd\n") if $verbose;
  94             my $pid = open2(*CMD_OUT, *CMD_IN, "cscript //nologo $cmd");
  95             binmode(*CMD_IN, ':encoding(utf16le)');
  96             binmode(*CMD_OUT, ':encoding(utf16le)');
  97             $SIG{INT} = sub { print(CMD_IN "QUIT\r\n"); panic_cleanup(); };
  98             $SIG{KILL} = sub { print(CMD_IN "QUIT\r\n"); panic_cleanup(); };
  99             print(CMD_IN "QUERY\tVENDOR\r\n");
 100             my $vendor = readline(*CMD_OUT);
 101             $vendor =~ s/\r\n//;
 102             %ret = (%ret,
 103                     "stdin" => *CMD_IN,
 104                     "stdout" => *CMD_OUT,
 105                     "vendor" => $vendor);
 106         }
 107     }
 108     return \%ret;
 109 }
 110
 111 # Shutdown TTS engine if necessary.
 112 sub shutdown_tts {
 113     my ($tts_object) = @_;
 114     given ($$tts_object{"name"}) {
 115         when ("festival") {
 116             # Send SIGTERM to festival server
 117             kill TERM => $$tts_object{"pid"};
 118         }
 119         when ("sapi") {
 120             print({$$tts_object{"stdin"}} "QUIT\r\n");
 121             close($$tts_object{"stdin"});
 122         }
 123     }
 124 }
 125
 126 # Apply corrections to a voice-string to make it sound better
 127 sub correct_string {
 128     our $verbose;
 129     my ($string, $language, $tts_object) = @_;
 130     my $orig = $string;
 131     my $corrections = $tts_object->{"corrections"};
 132
 133     foreach (@$corrections) {
 134         my $r = "s" . $_->{separator} . $_->{search} . $_->{separator}
 135                 . $_->{replace} . $_->{separator} . $_->{modifier};
 136         eval ('$string =~' . "$r;");
 137     }
 138     if ($orig ne $string) {
 139         printf("%s -> %s\n", $orig, $string) if $verbose;
 140     }
 141     return $string;
 142 }
 143
 144 # Produce a wav file of the text given
 145 sub voicestring {
 146     our $verbose;
 147     my ($string, $output, $tts_engine_opts, $tts_object) = @_;
 148     my $cmd;
 149     printf("Generate \"%s\" with %s in file %s\n", $string, $$tts_object{"name"}, $output) if $verbose;
 150     given ($$tts_object{"name"}) {
 151         when ("festival") {
 152             # festival_client lies to us, so we have to do awful soul-eating
 153             # work with IPC::open3()
 154             $cmd = "festival_client --server localhost --otype riff --ttw --output \"$output\"";
 155             # Use festival-prolog.scm if it's there (created by user of tools/configure)
 156             if (-f "festival-prolog.scm") {
 157                 $cmd .= " --prolog festival-prolog.scm";
 158             }
 159             print("> $cmd\n") if $verbose;
 160             # Open command, and filehandles for STDIN, STDOUT, STDERR
 161             my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
 162             # Put the string to speak into STDIN and close it
 163             print(CMD_IN $string);
 164             close(CMD_IN);
 165             # Read all output from festival_client (because it LIES TO US)
 166             while (<CMD_ERR>) {
 167             }
 168             close(CMD_OUT);
 169             close(CMD_ERR);
 170         }
 171         when ("flite") {
 172             $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
 173             print("> $cmd\n") if $verbose;
 174             `$cmd`;
 175         }
 176         when ("espeak") {
 177             $cmd = "espeak $tts_engine_opts -w \"$output\"";
 178             print("> $cmd\n") if $verbose;
 179             open(ESPEAK, "| $cmd");
 180             print ESPEAK $string . "\n";
 181             close(ESPEAK);
 182         }
 183         when ("sapi") {
 184             print({$$tts_object{"stdin"}} "SPEAK\t$output\t$string\r\n");
 185         }
 186         when ("swift") {
 187             $cmd = "swift $tts_engine_opts -o \"$output\" \"$string\"";
 188             print("> $cmd\n") if $verbose;
 189             system($cmd);
 190         }
 191     }
 192 }
 193
 194 # trim leading / trailing silence from the clip
 195 sub wavtrim {
 196     our $verbose;
 197     my ($file, $threshold, $tts_object) = @_;
 198     printf("Trim \"%s\"\n", $file) if $verbose;
 199     my $cmd = "wavtrim \"$file\" $threshold";
 200     if ($$tts_object{"name"} eq "sapi") {
 201         print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 202     }
 203     else {
 204         print("> $cmd\n") if $verbose;
 205         `$cmd`;
 206     }
 207 }
 208
 209 # Encode a wav file into the given destination file
 210 sub encodewav {
 211     our $verbose;
 212     my ($input, $output, $encoder, $encoder_opts, $tts_object) = @_;
 213     printf("Encode \"%s\" with %s in file %s\n", $input, $encoder, $output) if $verbose;
 214     my $cmd = "$encoder $encoder_opts \"$input\" \"$output\"";
 215     if ($$tts_object{"name"} eq "sapi") {
 216         print({$$tts_object{"stdin"}} "EXEC\t$cmd\r\n");
 217     }
 218     else {
 219         print("> $cmd\n") if $verbose;
 220         `$cmd`;
 221     }
 222 }
 223
 224 # synchronize the clip generation / processing if it's running in another process
 225 sub synchronize {
 226     my ($tts_object) = @_;
 227     if ($$tts_object{"name"} eq "sapi") {
 228         print({$$tts_object{"stdin"}} "SYNC\t42\r\n");
 229         my $wait = readline($$tts_object{"stdout"});
 230         #ignore what's actually returned
 231     }
 232 }
 233
 234 # Run genlang and create voice clips for each string
 235 sub generateclips {
 236     our $verbose;
 237     my ($language, $target, $encoder, $encoder_opts, $tts_engine, $tts_engine_opts) = @_;
 238     my $english = dirname($0) . '/../apps/lang/english.lang';
 239     my $langfile = dirname($0) . '/../apps/lang/' . $language . '.lang';
 240     my $correctionsfile = dirname($0) . '/voice-corrections.txt';
 241     my $id = '';
 242     my $voice = '';
 243     my $cmd = "genlang -o -t=$target -e=$english $langfile 2>/dev/null";
 244     my $pool_file;
 245     open(VOICEFONTIDS, "> voicefontids");
 246     my $i = 0;
 247     local $| = 1; # make progress indicator work reliably
 248
 249     my $tts_object = init_tts($tts_engine, $tts_engine_opts, $language);
 250     # add string corrections to tts_object.
 251     my @corrects = ();
 252     open(VOICEREGEXP, "<$correctionsfile") or die "Can't open corrections file!\n";
 253     while(<VOICEREGEXP>) {
 254         # get first character of line
 255         my $line = $_;
 256         my $separator = substr($_, 0, 1);
 257         if($separator =~ m/\s+/) {
 258             next;
 259         }
 260         chomp($line);
 261         $line =~ s/^.//g; # remove separator at beginning
 262         my ($lang, $engine, $vendor, $search, $replace, $modifier) = split(/$separator/, $line);
 263
 264         # does language match?
 265         if($language !~ m/$lang/) {
 266             next;
 267         }
 268         if($$tts_object{"name"} !~ m/$engine/) {
 269             next;
 270         }
 271         my $v = $$tts_object{"vendor"} || ""; # vendor might be empty in $tts_object
 272         if($v !~ m/$vendor/) {
 273             next;
 274         }
 275         push @corrects, {separator => $separator, search => $search, replace => $replace, modifier => $modifier};
 276
 277     }
 278     close(VOICEREGEXP);
 279     $tts_object->{corrections} = [@corrects];
 280
 281     print("Generating voice clips");
 282     print("\n") if $verbose;
 283     for (`$cmd`) {
 284         my $line = $_;
 285         print(VOICEFONTIDS $line);
 286         if ($line =~ /^id: (.*)$/) {
 287             $id = $1;
 288         }
 289         elsif ($line =~ /^voice: "(.*)"$/) {
 290             $voice = $1;
 291             if ($id !~ /^NOT_USED_.*$/ && $voice ne "") {
 292                 my $wav = $id . '.wav';
 293                 my $mp3 = $id . '.mp3';
 294
 295                 # Print some progress information
 296                 if (++$i % 10 == 0 and !$verbose) {
 297                     print(".");
 298                 }
 299
 300                 # Apply corrections to the string
 301                 $voice = correct_string($voice, $language, $tts_object);
 302
 303                 # If we have a pool of snippets, see if the string exists there first
 304                 if (defined($ENV{'POOL'})) {
 305                     $pool_file = sprintf("%s/%s-%s.mp3", $ENV{'POOL'},
 306                                          md5_hex("$voice $tts_engine $tts_engine_opts $encoder_opts"),
 307                                          $language);
 308                     if (-f $pool_file) {
 309                         printf("Re-using %s (%s) from pool\n", $id, $voice) if $verbose;
 310                         copy($pool_file, $mp3);
 311                     }
 312                 }
 313
 314                 # Don't generate MP3 if it already exists (probably from the POOL)
 315                 if (! -f $mp3) {
 316                     if ($id eq "VOICE_PAUSE") {
 317                         print("Use distributed $wav\n") if $verbose;
 318                         copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
 319                     }
 320                     else {
 321                         voicestring($voice, $wav, $tts_engine_opts, $tts_object);
 322                         wavtrim($wav, 500, $tts_object);
 323                         # 500 seems to be a reasonable default for now
 324                     }
 325
 326                     encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
 327                     synchronize($tts_object);
 328                     if (defined($ENV{'POOL'})) {
 329                         copy($mp3, $pool_file);
 330                     }
 331                     unlink($wav);
 332                 }
 333                 $voice = "";
 334                 $id = "";
 335             }
 336         }
 337     }
 338     print("\n");
 339     close(VOICEFONTIDS);
 340     shutdown_tts($tts_object);
 341 }
 342
 343 # Assemble the voicefile
 344 sub createvoice {
 345     our $verbose;
 346     my ($language, $target_id) = @_;
 347     my $outfile = "";
 348     $outfile = sprintf("%s.voice", $language);
 349     printf("Saving voice file to %s\n", $outfile) if $verbose;
 350     my $cmd = "voicefont 'voicefontids' $target_id ./ $outfile";
 351     print("> $cmd\n") if $verbose;
 352     my $output = `$cmd`;
 353     print($output) if $verbose;
 354 }
 355
 356 sub deletemp3s() {
 357     for (glob('*.mp3')) {
 358         unlink($_);
 359     }
 360     for (glob('*.wav')) {
 361         unlink($_);
 362     }
 363 }
 364
 365 sub panic_cleanup {
 366     deletemp3s();
 367     die "moo";
 368 }
 369
 370 # Generate .talk clips
 371 sub gentalkclips {
 372     our $verbose;
 373     my ($dir, $tts_object, $encoder, $encoder_opts, $tts_engine_opts, $i) = @_;
 374     my $d = new DirHandle $dir;
 375     while (my $file = $d->read) {
 376         my ($voice, $wav, $mp3);
 377         # Print some progress information
 378         if (++$i % 10 == 0 and !$verbose) {
 379             print(".");
 380         }
 381
 382         # Convert to a complete path
 383         my $path = sprintf("%s/%s", $dir, $file);
 384
 385         $voice = $file;
 386         $wav = sprintf("%s.talk.wav", $path);
 387
 388         # Ignore dot-dirs and talk files
 389         if ($file eq '.' || $file eq '..' || $file =~ /\.talk$/) {
 390             next;
 391         }
 392         # Element is a dir
 393         if ( -d $path) {
 394             gentalkclips($path, $tts_object, $encoder, $encoder_opts, $tts_engine_opts, $i);
 395             $mp3 = sprintf("%s/_dirname.talk", $path);
 396         }
 397         # Element is a file
 398         else {
 399             $mp3 = sprintf("%s.talk", $path);
 400             $voice =~ s/\.[^\.]*$//; # Trim extension
 401         }
 402
 403         printf("Talkclip %s: %s", $mp3, $voice) if $verbose;
 404
 405         voicestring($voice, $wav, $tts_engine_opts, $tts_object);
 406         wavtrim($wav, 500, $tts_object);
 407         # 500 seems to be a reasonable default for now
 408         encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
 409         synchronize($tts_object);
 410         unlink($wav);
 411     }
 412 }
 413
 414
 415 # Check parameters
 416 my $printusage = 0;
 417 unless (defined($V) or defined($C)) { print("Missing either -V or -C\n"); $printusage = 1; }
 418 if (defined($V)) {
 419     unless (defined($t)) { print("Missing -t argument\n"); $printusage = 1; }
 420     unless (defined($l)) { print("Missing -l argument\n"); $printusage = 1; }
 421     unless (defined($i)) { print("Missing -i argument\n"); $printusage = 1; }
 422 }
 423 elsif (defined($C)) {
 424     unless (defined($ARGV[0])) { print "Missing path argument\n"; $printusage = 1; }
 425 }
 426 unless (defined($e)) { print("Missing -e argument\n"); $printusage = 1; }
 427 unless (defined($E)) { print("Missing -E argument\n"); $printusage = 1; }
 428 unless (defined($s)) { print("Missing -s argument\n"); $printusage = 1; }
 429 unless (defined($S)) { print("Missing -S argument\n"); $printusage = 1; }
 430 if ($printusage == 1) { printusage(); exit 1; }
 431
 432 if (defined($v) or defined($ENV{'V'})) {
 433     our $verbose = 1;
 434 }
 435
 436 # add the tools dir to the path temporarily, for calling various tools
 437 $ENV{'PATH'} = dirname($0) . ':' . $ENV{'PATH'};
 438
 439
 440 # Do what we're told
 441 if ($V == 1) {
 442     # Only do the panic cleanup for voicefiles
 443     $SIG{INT} = \&panic_cleanup;
 444     $SIG{KILL} = \&panic_cleanup;
 445
 446     printf("Generating voice\n  Target: %s\n  Language: %s\n  Encoder (options): %s (%s)\n  TTS Engine (options): %s (%s)\n",
 447         $t, $l, $e, $E, $s, $S);
 448     generateclips($l, $t, $e, $E, $s, $S);
 449     createvoice($l, $i);
 450     deletemp3s();
 451 }
 452 elsif ($C) {
 453     printf("Generating .talk clips\n  Path: %s\n  Language: %s\n  Encoder (options): %s (%s)\n  TTS Engine (options): %s (%s)\n", $ARGV[0], $l, $e, $E, $s, $S);
 454     my $tts_object = init_tts($s, $S, $l);
 455     gentalkclips($ARGV[0], $tts_object, $e, $E, $S, 0);
 456     shutdown_tts($tts_object);
 457 }
 458 else {
 459     printusage();
 460     exit 1;
 461 }