genlang: Remove unused subs. Use precompiled regex in parsetarget(). Minor tweak...
[maemo-rb.git] / tools / genlang
blob6a63f8004efee9f72e42aca2db88ce76a5ad2baa
1 #!/usr/bin/perl -s
2 # __________ __ ___.
3 # Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 # Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 # Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 # Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 # \/ \/ \/ \/ \/
8 # $Id$
10 # Copyright (C) 2006 - 2008 by Daniel Stenberg
13 # See apps/language.c (TODO: Use common include for both)
14 # Cookie and binary version for the binary lang file
15 my $LANGUAGE_COOKIE = 0x1a;
16 my $LANGUAGE_VERSION = 0x06;
17 my $LANGUAGE_FLAG_RTL = 0x01;
19 my $HEADER_SIZE = 4;
20 my $SUBHEADER_SIZE = 6;
22 # A note for future users and readers: The original v1 language system allowed
23 # the build to create and use a different language than english built-in. We
24 # removed that feature from our build-system, but the build scripts still had
25 # the ability. But, starting now, this ability is no longer provided since I
26 # figured it was boring and unnecessary to write support for now since we
27 # don't use it anymore.
# Print the usage text and stop when no language file was given as the first
# non-switch argument. The bare "exit" below ends the script with status 0.
# NOTE(review): this listing seems to have lost a few very short lines of the
# here-document (presumably the -u, -s, -o and -v option headings) - confirm
# against the upstream file before editing the text.
# NOTE(review): the -p help text speaks of [prefix].c and [prefix].h, while the
# generator code in this file opens "$prefix/lang.h" and "$prefix/lang_core.c".
30 if(!$ARGV[0]) {
31 print <<MOO
32 Usage: genlang [options] <langv2 file>
34 -p=<prefix>
35 Make the tool create a [prefix].c and [prefix].h file.
37 -b=<outfile>
38 Make the tool create a binary language (.lng) file named [outfile].
39 The use of this option requires that you also use -e, -t and -i.
42 Update language file. Given the translated file and the most recent english
43 file, you\'ll get an updated version sent to stdout. Suitable action to do
44 when you intend to update a translation.
46 -e=<english lang file>
47 Point out the english (original source) file, to use that as master
48 language template. Used in combination with -b, -u or -s.
51 Sort the Update language file in the same order as the strings in the
52 English file.
54 -t=<target>
55 Specify which target you want the translations/phrases for. Required when
56 -b or -p is used.
58 The target can in fact be specified as numerous different strings,
59 separated with colons. This will make genlang to use all the specified
60 strings when searching for a matching phrase.
62 -i=<target id>
63 The target id number, needed for -b.
66 Voice mode output. Outputs all id: and voice: lines for the given target!
69 Enables verbose (debug) output.
70 MOO
72 exit;
75 # How update works:
77 # 1) scan the english file, keep the whole <phrase> for each phrase.
78 # 2) read the translated file, for each end of phrase, compare:
79 # A) all source strings; if there's any change, a comment about it should
80 # be output
81 # B) the desc fields
83 # 3) output the phrase with the comments from above
84 # 4) check which phrases the translated version didn't have, and spit out
85 # the english version of those
# Command line switches. They arrive as package variables ($p, $b, ...)
# because the script is run with "perl -s" (see the shebang line).
my $prefix = $p;
my $binary = $b;
my $update = $u;
my $sortfile = $s;

my $english = $e;
my $voiceout = $o;

# Exactly one operating mode has to be selected.
my $check = ($binary?1:0) + ($prefix?1:0) + ($update?1:0) + ($voiceout?1:0) + ($sortfile?1:0);

# Every usage error below exits with a non-zero status so that a calling
# build system can tell that nothing was generated.
if($check > 1) {
    print STDERR "Please use only one of -p, -u, -o, -b and -s\n";
    exit 1;
}
if(!$check) {
    print STDERR "Please use at least one of -p, -u, -o, -b and -s\n";
    exit 1;
}

if(($binary || $update || $voiceout || $sortfile) && !$english) {
    print STDERR "Please use -e too when you use -b, -o, -u or -s\n";
    exit 1;
}

my $target_id = $i;
if($binary && !$target_id) {
    print STDERR "Please specify a target id number (with -i)!\n";
    exit 1;
}

my $target = $t;
if(!$target && !$update && !$sortfile) {
    print STDERR "Please specify a target (with -t)!\n";
    exit 1;
}
# -t may hold several colon separated target strings; parsetarget() tries
# each of them.
my @target_parts = split ':', $target;

my $verbose=$v;

my %id; # string to num hash
my @idnum; # num to string array

my %allphrases; # For sorting - id string to complete <phrase> text hash
my %source; # id string to source phrase hash
my %dest; # id string to dest phrase hash
my %voice; # id string to voice phrase hash

# user name to user index hash; "core" always has index 0
my %users =
    ('core' => 0);

my $input = $ARGV[0];

# Tag nesting state of the parser: $m holds the name of the tag we are
# currently inside and @m the stack of enclosing tag names.
my @m;
my $m="blank";
# Return a copy of the given string with all leading and trailing
# whitespace removed. The caller's value is left untouched.
sub trim {
    my $text = shift;
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
# "name: value" pairs collected from the <header> section, keyed by name.
my %head;

# Line handler for the <header> section. The main loop calls the sub named
# after the tag it is currently inside, passing (whole line, name, value).
sub header {
    my (undef, $field, $value) = @_;
    $head{$field} = $value;
}
# "name: value" pairs of the <phrase> currently being read (id, desc, user,
# ...), keyed by name.
my %phrase;

# Line handler for lines directly inside <phrase>. The main loop calls the
# sub named after the current tag, passing (whole line, name, value).
sub phrase {
    my (undef, $field, $value) = @_;
    $phrase{$field} = $value;
}
# "name: value" pairs collected from the <options> section, keyed by name.
my %options;

# Line handler for the <options> section. The main loop calls the sub named
# after the current tag, passing (whole line, name, value).
sub options {
    my (undef, $field, $value) = @_;
    $options{$field} = $value;
}
# Decide whether a "pattern[,pattern...]: value" line applies to one of the
# targets selected with -t and, if so, store its value.
#
#   $debug  - label of the calling section ("src", "dest", "voice"); unused
#   $strref - reference to the scalar that receives the value on a match
#   $full   - the complete input line; unused
#   $n      - comma separated list of target patterns; "*" matches any run
#             of characters and "?" exactly one character
#   $v      - the value belonging to these patterns
#
# Returns $v when any pattern matches any entry of @target_parts, and
# nothing otherwise (in which case $$strref is left untouched).
sub parsetarget {
    my ($debug, $strref, $full, $n, $v) = @_;

    for my $glob (split(/ *, */, $n)) {
        # Translate the wildcard pattern into a regex one character at a
        # time. Everything except the two wildcards is quoted, so that a
        # literal "." or "+" in a pattern cannot act as a regex operator and
        # a stray "(" or "[" cannot make qr// die.
        my $pattern = '';
        for my $char (split(//, $glob)) {
            $pattern .= $char eq '*' ? '.*'
                      : $char eq '?' ? '.'
                      :                quotemeta($char);
        }
        my $regexp = qr/^$pattern\z/;

        for my $part (@target_parts) {
            if($part =~ $regexp) {
                $$strref = $v;
                return $v;
            }
        }
    }
    return;
}
# The <source> string that matched one of our targets in the current phrase.
my $src;

# Line handler for the <source> section: hands the line to parsetarget(),
# which stores the value in $src when the target pattern matches.
sub source {
    my @line_fields = @_;
    return parsetarget("src", \$src, @line_fields);
}
# The <dest> string that matched one of our targets in the current phrase.
my $dest;

# Line handler for the <dest> section: hands the line to parsetarget(),
# which stores the value in $dest when the target pattern matches.
sub dest {
    my @line_fields = @_;
    return parsetarget("dest", \$dest, @line_fields);
}
# The <voice> string that matched one of our targets in the current phrase.
my $voice;

# Line handler for the <voice> section: hands the line to parsetarget(),
# which stores the value in $voice when the target pattern matches.
sub voice {
    my @line_fields = @_;
    return parsetarget("voice", \$voice, @line_fields);
}
# Pre-scan of the English master file given with -e. It fills:
#   %english   - id string => complete text of the phrase (lines joined)
#   @idmap     - per user index, id string => ID number in English order
#   %sortorder - id string => position of the phrase in the English file
#   %users     - user name => user index, for users named in "user:" lines
# NOTE(review): the assignments below write to the array @idmap
# ($idmap[$user]{$id}); the hash %idmap declared here is never indexed in the
# visible code.
206 my %idmap;
207 my %english;
208 if($english) {
209 # For the cases where the english file needs to be scanned/read, we do
210 # it before we read the translated file. For -b it isn't necessary, but for
211 # -u it is convenient.
# Next free ID number per user index. Only index 0 gets an explicit start
# value; further users start counting from an undefined slot.
213 my @idnum = ((0)); # start with a true number
214 my @vidnum = ((0x8000)); # first voice id
215 open(ENG, "<$english") || die "Error: can't open $english";
216 my @phrase;
217 my $id;
218 my $maybeid;
219 my $user;
220 my $withindest;
221 my $numphrases = 0;
# NOTE(review): new users get ++$numusers, so the first user after "core"
# receives index 2 and index 1 looks unused - confirm this is intended.
222 my $numusers = 1; # core is already in the users map
224 while(<ENG>) {
226 # get rid of DOS newlines
227 $_ =~ tr/\r//d;
229 if($_ =~ /^ *\<phrase\>/) {
230 # this is the start of a phrase
232 elsif($_ =~ /^ *\<\/phrase\>/) {
234 # if id is something, then we count and store this phrase
235 if($id) {
236 # voice-only entries get a different range
237 if($id =~ /^VOICE_/) {
238 # Assign an ID number to this entry
239 $idmap[$user]{$id}=$vidnum[$user];
240 $vidnum[$user]++;
242 else {
243 # Assign an ID number to this entry
244 $idmap[$user]{$id}=$idnum[$user];
245 $idnum[$user]++;
246 # print STDERR "DEST: bumped idnum to $idnum[$user]\n";
249 # this is the end of a phrase, add it to the english hash
250 $english{$id}=join("", @phrase);
252 undef @phrase;
253 $id="";
255 elsif($_ ne "\n") {
256 # gather everything related to this phrase
257 push @phrase, $_;
# $deststr collects the dest value that applies to our target; it is not
# declared with "my" in the visible code.
258 if($_ =~ /^ *\<dest\>/i) {
259 $withindest=1;
260 $deststr="";
262 elsif($withindest && ($_ =~ /^ *\<\/dest\>/i)) {
263 $withindest=0;
# Outside update mode a phrase only counts when its dest for our target is
# non-empty and not the word "none".
265 if($update || ($deststr && ($deststr !~ /^none\z/i))) {
266 # we unconditionally always use all IDs when the "update"
267 # feature is used
268 $id = $maybeid;
269 # print "DEST: use this id $id\n";
271 else {
272 # print "skip $maybeid for $name\n";
275 elsif($withindest && ($_ =~ / *([^:]+): *(.*)/)) {
276 my ($name, $val)=($1, $2);
277 $dest=""; # in case it is left untouched for when the
278 # model name isn't "our"
# dest() sets $dest only when $name matches one of the -t targets.
279 dest($_, $name, $val);
281 if($dest) {
282 # Store the current dest string. If this target matches
283 # multiple strings, it will get updated several times.
284 $deststr = $dest;
# Remember the id of the phrase being read and its position in the file.
289 if($_ =~ /^ *id: ([^ \t\n]+)/i) {
290 $maybeid=$1;
291 $sortorder{$maybeid}=$numphrases++;
# Map the user name to its index, registering unknown names on the fly.
293 if($_ =~ /^ *user: ([^ \t\n]+)/i) {
294 $user = $users{$1};
295 if(!(defined $user)) {
296 $user = ++$numusers;
297 $users{$1} = $user;
301 close(ENG);
304 # a function that compares the english phrase with the translated one.
305 # compare source strings and desc
307 # Then output the updated version!
#
# Arguments:
#   $idstr  - id of the phrase (not used in the visible body)
#   $engref - reference to the lines of the English phrase; the caller splits
#             the stored text on "\n", so these lines carry no newline
#   $locref - reference to the lines of the translated phrase as read from
#             the file, newlines included
# The result is printed to standard output; nothing useful is returned.
308 sub compare {
309 my ($idstr, $engref, $locref)=@_;
310 my ($edesc, $ldesc);
311 my ($esource, $lsource);
312 my $mode=0;
# Pass 1: pick the desc text and the body of <source> out of the English
# phrase. $mode is 1 while we are between <source> and </source>.
314 for my $l (@$engref) {
315 if($l =~ /^ *#/) {
316 # comment
317 next;
319 if($l =~ /^ *desc: (.*)/) {
320 $edesc=$1;
322 elsif($l =~ / *\<source\>/i) {
323 $mode=1;
325 elsif($mode) {
326 if($l =~ / *\<\/source\>/i) {
327 last;
329 $esource .= "$l\n";
# @show buffers the translated lines until they are printed.
333 my @show;
334 my @source;
# Pass 2: walk the translated phrase, replacing desc and <source> with the
# English versions when they differ and keeping the old text as ### comments.
336 $mode = 0;
337 for my $l (@$locref) {
338 if($l =~ /^ *desc: (.*)/) {
339 $ldesc=$1;
340 if(trim($edesc) ne trim($ldesc)) {
# $l aliases the element of the caller's array, so this assignment rewrites
# that element as well.
341 $l = "### The 'desc' field differs from the english!\n### the previously used desc is commented below:\n### desc: $ldesc\n desc: $edesc\n";
343 push @show, $l;
345 elsif($l =~ / *\<source\>/i) {
346 $mode=1;
347 push @show, $l;
349 elsif($mode) {
350 if($l =~ / *\<\/source\>/i) {
351 $mode = 0;
# Flush everything up to and including the <source> line, then emit the
# source body that should be used.
352 print @show;
353 if(trim($esource) ne trim($lsource)) {
354 print "### The <source> section differs from the english!\n",
355 "### the previously used one is commented below:\n";
356 for(split("\n", $lsource)) {
357 print "### $_\n";
359 print $esource;
361 else {
362 print $lsource;
364 undef @show; # start over
366 push @show, $l;
368 else {
369 $lsource .= "$l";
372 else {
373 push @show, $l;
# Print whatever followed </source>.
378 print @show;
# One counter slot per user that is known at this point (%users holds "core"
# plus any users found while scanning the English file).
my @idcount = (0) x scalar(keys %users);      # counter for lang ID numbers
my @voiceid = (0x8001) x scalar(keys %users); # counter for voice-only ID numbers
390 # Now start the scanning of the selected language string
# The loop reads the language file line by line. Tag lines push/pop the tag
# name on @m/$m; "name: value" lines are handed to the sub named after the
# current tag. At every </phrase> the collected phrase is stored, compared
# (update mode) or remembered for sorting (sort mode).
393 open(LANG, "<$input") || die "Error: couldn't read language file named $input\n";
394 my @phrase;
395 my $header = 1;
396 my $langoptions = 0;
398 while(<LANG>) {
# $line is the current line number for warnings; it is not declared with
# "my" in the visible code.
400 $line++;
402 # get rid of DOS newlines
403 $_ =~ tr/\r//d;
405 if($_ =~ /^( *\#|[ \t\n\r]*\z)/) {
406 # comment or empty line - output it if it's part of the header
407 if ($header and ($update || $sortfile)) {
408 print($_);
410 next;
# The first line that is neither comment nor blank ends the file header.
412 $header = 0;
414 my $ll = $_;
416 # print "M: $m\n";
418 push @phrase, $ll;
420 # this is an XML-lookalike tag
421 if (/^(<|[^\"<]+<)([^>]*)>/) {
422 my $part = $2;
423 # print "P: $part\n";
425 if($part =~ /^\//) {
426 # this was a closing tag
428 if($part eq "/phrase") {
429 # closing the phrase
431 my $idstr = $phrase{'id'};
432 my $idnum;
434 if($binary && !$english{$idstr}) {
435 # $idstr doesn't exist for english, skip it\n";
437 elsif($dest =~ /^none\z/i) {
438 # "none" as dest (without quotes) means that this entire
439 # phrase is to be ignored
441 elsif($sortfile) {
442 $allphrases{$idstr}=join('',@phrase);
444 elsif(!$update) {
445 # we don't do the fully detailed analysis when we "update"
446 # since we don't do it for a particular target etc
448 # allow the keyword 'deprecated' to be used on dest and
449 # voice strings to mark that as deprecated. It will then
450 # be replaced with "".
452 $dest =~ s/^deprecate(|d)\z/\"\"/i;
453 $voice =~ s/^deprecate(|d)\z/\"\"/i;
455 # basic syntax error alerts, if there are no quotes we
456 # will assume an empty string was intended
457 if($dest !~ /^\"/) {
458 print STDERR "$input:$line:1: warning: dest before line lacks quotes ($dest)!\n";
459 $dest='""';
461 if($src !~ /^\"/) {
462 print STDERR "$input:$line:1: warning: source before line lacks quotes ($src)!\n";
463 $src='""';
465 if($voice !~ /^\"/ and $voice !~ /^none\z/i) {
466 print STDERR "$input:$line:1: warning: voice before line lacks quotes ($voice)!\n";
467 $voice='""';
469 if($dest eq '""' && $phrase{'desc'} !~ /deprecated/i && $idstr !~ /^VOICE/) {
470 print STDERR "$input:$line:1: warning: empty dest before line in non-deprecated phrase!\n";
# Resolve the phrase's "user:" field to a user index; a missing field falls
# back to "core".
473 my $userstr = trim($phrase{'user'});
474 my $user = $users{$userstr};
475 if ($userstr eq "") {
476 print STDERR "$input:$line:1: warning: missing user!\n";
477 $user = $users{"core"};
479 elsif(!(defined $user)) {
480 if($english) {
481 print STDERR "$input:$line:1: warning: user was not found in $english!\n";
482 $user = keys %users; # set to an invalid user so it won't be added
484 else {
485 # we found a new user, add it to the usermap
# NOTE(review): the English pre-scan declares its own "my $numusers"; if that
# declaration is scoped to the pre-scan block, this $numusers is a different,
# initially undefined variable - confirm.
486 $user = ++$numusers;
487 $users{$userstr} = $user;
491 # Use the ID name to figure out which id number range we
492 # should use for this phrase. Voice-only strings are
493 # separated.
495 if($idstr =~ /^VOICE/) {
496 $idnum = $voiceid[$user]++;
498 else {
499 $idnum = $idcount[$user]++;
# Record the phrase: id <-> number in both directions plus its strings.
502 $id{$idstr} = $idnum;
503 $idnum[$user][$idnum]=$idstr;
505 $source{$idstr}=$src;
506 $dest{$idstr}=$dest;
507 $voice{$idstr}=$voice;
509 if($verbose) {
510 print "id: $phrase{id} ($idnum)\n";
511 print "source: $src\n";
512 print "dest: $dest\n";
513 print "voice: $voice\n";
514 print "user: $user\n";
# Reset the per-phrase state before the next <phrase> starts.
517 undef $src;
518 undef $dest;
519 undef $voice;
520 undef $user;
521 undef %phrase;
524 if($update) {
525 my $e = $english{$idstr};
527 if($e) {
528 # compare original english with this!
529 my @eng = split("\n", $english{$idstr});
531 compare($idstr, \@eng, \@phrase);
533 $english{$idstr}=""; # clear it
535 else {
536 print "### $idstr: The phrase is not used. Skipped\n";
539 undef @phrase;
540 } # end of </phrase>
541 elsif($part eq "/options") {
542 # closing the options
543 if ($options{'rtl'}) {
544 $langoptions |= $LANGUAGE_FLAG_RTL;
546 } # end of </options>
548 # starts with a slash, this _ends_ this section
549 $m = pop @m; # get back old value, the previous level's tag
550 next;
551 } # end of tag close
553 # This is an opening (sub) tag
555 push @m, $m; # store old value
556 $m = $part;
557 next;
# A "name: value" line: call the sub whose name equals the current tag name
# (header, phrase, options, source, dest or voice).
# NOTE(review): a tag name without a matching sub presumably makes this call
# die - confirm how unknown tags should be handled.
560 if(/^ *([^:]+): *(.*)/) {
561 my ($name, $val)=($1, $2);
562 &$m($_, $name, $val);
565 close(LANG);
# Update mode: append every English phrase that the translated file did not
# contain. The main loop clears $english{$id} for each phrase it has already
# handled, so only entries that are still non-empty are missing.
if($update) {
    # Walk the ids in English file order (recorded in %sortorder while the
    # English file was scanned) so that the output is reproducible instead of
    # following Perl's hash ordering.
    for my $missing (sort { $sortorder{$a} <=> $sortorder{$b} } keys %english) {
        next if !$english{$missing};
        print "###\n",
              "### This phrase below was not present in the translated file\n",
              "<phrase>\n";
        print $english{$missing};
        print "</phrase>\n";
    }
}
# Sort mode: print the collected phrases in the order in which their ids
# appear in the English file.
if ($sortfile) {
    my @ordered_ids = sort { $sortorder{$a} <=> $sortorder{$b} } keys %allphrases;
    print @allphrases{@ordered_ids};
}
# -p mode: write the enum of string ids to "$prefix/lang.h" and the built-in
# strings of user "core" to "$prefix/lang_core.c".
586 if($prefix) {
587 # We create a .c and .h file
589 open(HFILE_CORE, ">$prefix/lang.h") ||
590 die "Error: couldn't create file $prefix/lang.h\n";
591 open(CFILE_CORE, ">$prefix/lang_core.c") ||
592 die "Error: couldn't create file $prefix/lang_core.c\n";
594 # get header file name
# The substitution strips everything up to the last "/", leaving the bare
# file name for the #include line. $headername is not declared with "my" in
# the visible code.
595 $headername = "$prefix/lang.h";
596 $headername =~ s/(.*\/)*//;
598 print HFILE_CORE <<MOO
599 /* This file was automatically generated using genlang */
601 * The str() macro/functions is how to access strings that might be
602 * translated. Use it like str(MACRO) and expect a string to be
603 * returned!
605 #define str(x) language_strings[x]
607 /* this is the array for holding the string pointers.
608 It will be initialized at runtime. */
609 extern unsigned char *language_strings[];
610 /* this contains the concatenation of all strings, separated by \\0 chars */
611 extern const unsigned char core_language_builtin[];
613 /* The enum below contains all available strings */
614 enum \{
618 print CFILE_CORE <<MOO
619 /* This file was automatically generated using genlang, the strings come
620 from "$input" */
622 #include "$headername"
624 unsigned char *language_strings[LANG_LAST_INDEX_IN_ARRAY];
625 const unsigned char core_language_builtin[] =
629 # Output the ID names for the enum in the header file
630 my $i;
631 for $i (0 .. $idcount[$users{"core"}]-1) {
632 my $name=$idnum[$users{"core"}][$i]; # get the ID name
634 $name =~ s/\"//g; # cut off the quotes
636 printf HFILE_CORE (" %s, /* %d */\n", $name, $i);
639 # Output separation marker for last string ID and the upcoming voice IDs
641 print HFILE_CORE <<MOO
642 LANG_LAST_INDEX_IN_ARRAY, /* this is not a string, this is a marker */
643 /* --- below this follows voice-only strings --- */
644 VOICEONLY_DELIMITER = 0x8000,
648 # Output the ID names for the enum in the header file
# Voice-only ids are numbered from 0x8001 up to the last one handed out.
649 for $i (0x8001 .. ($voiceid[$users{"core"}]-1)) {
650 my $name=$idnum[$users{"core"}][$i]; # get the ID name
652 $name =~ s/\"//g; # cut off the quotes
654 printf HFILE_CORE (" %s, /* 0x%x */\n", $name, $i);
657 # Output end of enum
658 print HFILE_CORE "\n};\n/* end of generated enum list */\n";
660 # Output the target phrases for the source file
661 for $i (0 .. $idcount[$users{"core"}]-1) {
662 my $name=$idnum[$users{"core"}][$i]; # get the ID
663 my $dest = $dest{$name}; # get the destination phrase
665 $dest =~ s:\"$:\\0\":; # insert a \0 before the second quote
667 if(!$dest) {
668 # this is just to be on the safe side
669 $dest = '"\0"';
672 printf CFILE_CORE (" %s\n", $dest);
675 # Output end of string chunk
676 print CFILE_CORE <<MOO
678 /* end of generated string list */
682 close(HFILE_CORE);
683 close(CFILE_CORE);
684 } # end of the c/h file generation
# -b mode: write the binary language file. Layout as produced below: a 4 byte
# header (cookie, version, target id, option flags), one 6 byte subheader per
# entry of %users (string count, total size, file offset, each as two bytes,
# high byte first), then for every string its 2 byte ID number followed by
# the text and a terminating zero byte.
685 elsif($binary) {
686 # Creation of a binary lang file was requested
688 # We must first scan the english file to get the correct order of the id
689 # numbers used there, as that is what sets the id order for all language
690 # files. The english file is scanned before the translated file was
691 # scanned.
693 open(OUTF, ">$binary") or die "Error: Can't create $binary";
694 binmode OUTF;
695 printf OUTF ("%c%c%c%c", $LANGUAGE_COOKIE, $LANGUAGE_VERSION, $target_id,
696 $langoptions); # magic lang file header
698 # output the number of strings for each user
699 my $foffset = $HEADER_SIZE + $SUBHEADER_SIZE * keys(%users);
# NOTE(review): $_ is a key of %users, i.e. a user name such as "core", yet it
# is used as an array index in $idcount[$_] and $idnum[$_]; a non-numeric name
# evaluates to index 0. Presumably $users{$_} was meant - confirm before
# relying on this with more than one user.
# NOTE(review): counts, sizes and offsets are written as two "%c" bytes, so
# values are presumably expected to stay below 0x10000 - confirm.
700 for (keys %users) {
701 my $size;
702 for $n (0 .. $idcount[$_]-1) {
703 $size += length(trim($dest{$idnum[$_][$n]})) + 1;
705 printf OUTF ("%c%c%c%c%c%c", ($idcount[$_] >> 8), ($idcount[$_] & 0xff),
706 ($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
707 $foffset += $size;
710 for (keys %users) {
711 # loop over the target phrases
712 for $n (0 .. $idcount[$_]-1) {
713 my $name=$idnum[$_][$n]; # get the ID
714 my $dest = $dest{$name}; # get the destination phrase
716 if($dest) {
717 $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
719 # Now, make sure we get the number from the english sort order:
# $idnum here is a plain scalar (not declared with "my" in the visible code),
# distinct from the array @idnum indexed above.
720 $idnum = $idmap[$_]{$name};
722 printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest);
# -o mode: print "id:" / "voice:" pairs for user "core", ordered by the ID
# numbers taken from the English file. Slots without a usable voice string
# are printed as NOT_USED_<n> placeholders.
727 elsif($voiceout) {
728 # voice output requested, display id: and voice: strings in a v1-like
729 # fashion
# @engl maps an English ID number to the index of the same phrase in the
# translated file, or -1 when the phrase has no voice string.
731 my @engl;
733 # This loops over the strings in the translated language file order
# NOTE(review): the voice range starts at 0x8000 here, while the voice
# counters elsewhere in this file start at 0x8001 - confirm that index 0x8000
# is meant to be visited.
734 my @ids = ((0 .. ($idcount[$users{"core"}]-1)));
735 push @ids, (0x8000 .. ($voiceid[$users{"core"}]-1));
737 #for my $id (@ids) {
738 # print "$id\n";
741 for $i (@ids) {
742 my $name=$idnum[$users{"core"}][$i]; # get the ID
743 my $dest = $voice{$name}; # get the destination voice string
745 if($dest) {
746 $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
748 # Now, make sure we get the number from the english sort order:
749 $idnum = $idmap[$users{"core"}]{$name};
751 if(length($idnum)) {
752 $engl[$idnum] = $i;
754 #print "Input index $i output index $idnum\n";
756 else {
757 # not used, mark it so
758 $engl[$i] = -1
# Second pass: emit one entry per ID number, in numeric order.
763 for my $i (@ids) {
765 my $o = $engl[$i];
767 if(($o < 0) || !length($o)) {
768 print "#$i\nid: NOT_USED_$i\nvoice: \"\"\n";
769 next;
772 my $name=$idnum[$users{"core"}][$o]; # get the ID
773 my $dest = $voice{$name}; # get the destination voice string
775 print "#$i ($o)\nid: $name\nvoice: $dest\n";
781 if($verbose) {
782 my $num_str = 0;
784 for (keys %users) {
785 $num_str += $idcount[$_];
788 printf("%d ID strings scanned\n", $num_str);
790 print "* head *\n";
791 for(keys %head) {
792 printf "$_: %s\n", $head{$_};