3 # langen2kvtml - Converts langenscheidt *.voc to kvoctrain's *.kvtml
5 # analyse general header
6 # remove langenscheidt specific information
7 # continuously rewrite the first of three lines giving:
11 # while guessing word type information for entries in:
16 # rewrite lesson names
17 # append tables for conjugation and articles in:
28 # Version 0.1 22/08/01 - Andreas Neuper
31 # Version 0.2 20/09/01 - Andreas Neuper
33 # language recognition added (derived from input filename)
34 # added (guessed) type information for English and Spanish
35 # modifying kde-config file to add language
37 # resolved difficulties (CR left) with Lesson information
39 # Version 0.3 22/09/01 - Andreas Neuper
41 # added (guessed) type information for French
42 # Article and conjugation tables added for:
43 # French, English, Italian, and Czech
44 # added Verb recognition by local ending (95%)
45 # some more language definition lines
47 # Spanish words ending with -mente are adverbs
48 # Spanish pronomina in conjugation table completed
49 # Adding a space between concatenated lines
50 # empty article and conj. definitions for other languages
51 # avoid rc-file update while kvoctrain is running
52 # some minor or less obvious corrections
54 # Version 0.4 22/09/01 - Andreas Neuper
56 # first version of automated internet download
58 # instead of "en" was "it" used for English articles
60 # Updated 22/11/02 - Ben Burton
62 # default proxy is now no proxy, not http://proxy:8080/
63 # proxy option is now --proxy=http://..., not --proxy="-p http://..."
64 # changed location of rcfile from $HOME/.kde2/... to $HOME/.kde/...
65 # updated usage string
67 # Updated 05/05/03 - Ewald Arnold
68 # corrections: XML header in uppercase where needed
70 ####################################################################
73 # User configurable variables (defaults; several have a matching command-line option)
75 $myopt = ""; # options (not used yet)
77 # command-line options exist for:
79 $author = "http://www.Vokabeln.de"; # written to the author attribute of <kvtml> (--author); alternative: "Langenscheidt"
80 $outdir = $cwd; # directory the generated .kvtml files go to (--outdir)
81 $langdefault = "en"; # fallback language code when none is derived from the file name
82 $trans = "de"; # language code of the translation column (--trans); default is German
83 $longlesson = 0; # 0: lesson descriptions are written as "name NNN"; non-zero appears to select the full lesson name (--longlesson)
86 #######################################################################
87 # No User configurable parameters below here
88 #######################################################################
92 $/="\r\n"; # we work with dos files
95 # Initialise internal global variables
97 $rcfile = "$ENV{HOME}/.kde/share/config/kvoctrainrc" ;
98 $vocfile = ""; # Passed from command line
99 $kvtfile = ""; # adopt to command line
100 $lang = ""; # initially unset
101 $filestage = 0; # file stage
# Maps the prefix of a .voc file name (the text before the first "_")
# to the language code that is written into the generated file.
102 %VOCLANG = ( "GB", "en",
105 # "PL", "pl", # Polish
106 # "TR", "tr", # Turkish
107 # "SK", "sk", # Slovak
108 # "RO", "ru", # Romanian - NOTE(review): "ru" is the ISO 639-1 code for Russian; Romanian is "ro"
109 # "GR", "el", # Greek
110 # "IL", "he", # Hebrew
111 # "J", "ja", # Japanese
128 "Esp", "us_eu", # Esperanto
129 "D-Fr", "de", # Franconian (German dialect)
130 "D-Platt", "pd", # Low German (Plattdeutsch)
131 "D-Hes", "hs", # Hessian (German dialect)
132 "D-Schw", "sw", # Swabian (German dialect)
133 "D-Bay", "by", # Bavarian (German dialect)
134 "GB-Alt", "en", # Shakespearean (early modern) English
135 "GB-Sco", "sc", # Scottish
138 ####################################################
139 ####################################################
141 # Get the parameters from the command line
144 "trans=s" => \
$trans,
145 "proxy=s" => \
$proxy,
146 "author=s" => \
$author,
147 "outdir=s" => \
$outdir,
148 "country=s" => \
$country,
149 "longlesson" => \
$longlesson) ) {
150 print "Usage:\tlangen2kvtml [--lang=xx] [--trans=xx] [--author=<name>]\n";
151 print "\t\t [--longlesson] [--outdir=<dir>] [--proxy=<proxy>]\n";
152 print "\t\t { --country=x | vocfile }\n";
156 # Fix the proxy option if it has been given
158 $proxy = "-p $proxy";
161 # Get input file name if it has been given
162 if ($ARGV[0] eq '' && $country eq "") {
163 print STDERR
"Error: no input filename.\n";
164 print STDERR
"Note: Trying to take --country=GB Samples from the internet.\n";
168 # All logging information goes into this file
169 $logfile = "/tmp/langen2kvtml.log";
170 open(LOG
, ">$logfile") || die "Cannot create $logfile: $!";
172 &printflush
(STDOUT
,"Waiting for generating files ...\n");
174 &printflush
(STDOUT
,"... $tmp1 files given via command line ...\n");
176 if ( $country ne "" ) {
177 &printflush
(STDOUT
,"... fetching http://www.vokabeln.de/files/Voc-$country.zip ...\n");
179 &printflush
(STDOUT
,"... using proxy service $proxy ...\n");
181 `lwp-request $proxy http://www.vokabeln.de/files/Voc-$country.zip >/tmp/Voc-$country.zip`;
182 # unzip -u update only!
183 # unzip -o overwrite!
184 `unzip -u /tmp/Voc-$country.zip >/tmp/unzip.log`;
185 &printflush
(STDOUT
,"... updating Voc-$country.zip ...\n");
186 $/="\n"; # we work with a unix file
187 open(ZIP
,"</tmp/unzip.log");
197 unlink("/tmp/unzip.log");
198 $/="\r\n"; # we work with a dos file
201 for my $file (@res, @ARGV) {
203 &printflush
(LOG
,"... generating \"$kvtfile\"...\n");
204 $lang = ""; # initially unset
205 $filestage = 0; # file stage
206 &process_vocfile
($vocfile);
208 print STDERR
"...\tAll Complete.\n";
209 &printflush
(LOG
,"\nAll Complete.\n");
# process_vocfile: convert one Langenscheidt .voc file into a .kvtml file.
# Nothing is read from @_; the sub works on the globals $vocfile, $outdir,
# $lang and $langdefault, and it sets $kvtfile, $voclang, $lang and $title.
# It calls get_info() and add2rcfile() and reports progress on STDERR.
217 sub process_vocfile
() {
218 &printflush
(STDERR
,"Converting $vocfile\t ...");
# Output name: $outdir plus the input name with its last extension replaced by .kvtml.
219 $kvtfile = $outdir."/".substr($vocfile, 0, rindex($vocfile, ".")).".kvtml";
# Language key: the part of the file name in front of the first "_".
# NOTE(review): index() returns -1 when there is no "_"; substr() with a
# length of -1 then yields the name minus its last character instead of "",
# so the default-language branch below looks hard to reach - confirm.
220 $voclang = substr($vocfile, 0, index($vocfile, "_"));
221 if("$lang" eq "") { # derive the language only if none has been set yet
222 if("$voclang" ne "") { # a valid language selection was found
223 $lang = $VOCLANG{$voclang};
224 } else { # use default language
225 print STDERR
"WARNING: language guessed to be \"$langdefault\".\n";
226 $lang = "$langdefault";
# Default title: the input file name without its last extension.
229 $title = substr($vocfile, 0, rindex($vocfile, "."));
230 &get_info
(); # Extract Information
231 if("$lang" eq "") { # still empty: the lookup above produced no language code
232 print STDERR
"WARNING: language completely unsupported... but trying...\n";
234 &add2rcfile
(); # add language info to rcfile
236 print STDERR
"\"$kvtfile\" generated.\n";
239 #_____________________________________________________________________________
240 #_____________________________________________________________________________>
241 #_____________________________________________________________________________>
245 # This subroutine works as a state machine
246 # which increments state after each section:
247 # - 1 - general header containing:
250 # - 2 - langenscheidt specific information (removed)
251 # - 3 - Vocabulary in the first of 3 line blocks containing:
256 # - 5 - sub for articles
257 # - 6 - sub for conjugation
258 # All the named information is retrieved.
263 local $privateline = 0;
264 $/="\r\n"; # we work with dos files
267 open(VOC
, "$vocfile") || die "Cannot open $vocfile: $!";
269 # Generate output file
270 open(KVT
, ">$kvtfile") || die "Cannot create $kvtfile: $!";
271 print KVT
'<?xml version="1.0"?>'."\n".'<!DOCTYPE kvtml SYSTEM "kvoctrain.dtd">'."\n\n";
276 # chop twice since it is a dos file
279 if (/^([^"]*"[^"]*"[^"]*)*"[^"]*$/) {
285 if ($privateline ne 0) {
289 if ($entrynum eq 1) {
290 ($tmp1,$title,$tmp2) = split("\"");
292 if ($entrynum eq 4) {
293 ($tmp1,$tlang,$tmp2,$ttrans) = split("\"");
295 if (/^.[sS]tandard.,. ./) {
297 print KVT
'<kvtml generator="langen2kvtml v0.4.1"'."\n";
298 print KVT
'title="'."$title".'" author="'."$author".'">'."\n\n";
299 # print "$tlang $ttrans\n";
301 if ($filestage eq 0) {
304 if ($filestage eq 1) {
305 if (/^[^,]*,[^,]*$/) {
306 $privateline=15; # lines to follow (16 in total)
311 if ($filestage eq 2) {
312 ($foreign,$local,$lesson) = split(/\",\"?/) ;
313 $foreign=substr($foreign,1);
314 if (length($foreign) eq 0) {
316 print KVT
'<lesson width="200">';
320 print KVT
"<e m=\"$lesson\"".&type_specific
().">\t<o l=\"$lang\">$foreign</o>\t<t l=\"$trans\">$local</t>\t</e>\n";
322 $privateline = 2; # lines to follow
324 if ($filestage eq 3) {
325 ($tmp1,$fullname,$tmp2) = split("\"");
326 ($name,$lesson) = split(/\\/,$fullname);
327 ($tmp1,$lesson) = split(/[ "]/,$lesson);
328 if (length($lesson) ne 0) {
329 print KVT
"\n\t".'<desc no="'."$lesson".'">';
330 if ($longlesson eq 0) {
331 print KVT
sprintf("%s %03d</desc>", $name, $lesson);
333 print KVT
"$fullname</desc>";
336 $privateline = 1; # lines to follow
339 print KVT
"\n\t</lesson>\n\n";
342 print KVT
"</kvtml>\n\n";
# Writes the <options> block and the <article> table to the KVT handle.
# One <e> element is emitted for the language in $lang, followed by the
# German one. Judging by the German values, fd/md/nd hold the feminine,
# masculine and neuter definite articles and fi/mi/ni the indefinite ones.
349 print KVT
"<options>\n\t<sort on=".'"1"'."/>\n\t</options>\n\n<article>\n";
350 if ( $lang eq "es" ) {
351 print KVT
"\t<e l=\"es\">\t<fd>la</fd> <md>el</md> <nd>el</nd>\n";
352 print KVT
"\t\t\t<fi>una</fi> <mi>uno</mi> <ni>un</ni> </e>\n";
353 } elsif ($lang eq "fr" ) {
354 print KVT
"\t<e l=\"fr\">\t<fd>la</fd> <md>le</md> \n";
# NOTE(review): fi/mi look swapped here ("une" is the feminine form);
# compare the Spanish and Italian rows - confirm before changing.
355 print KVT
"\t\t\t<fi>un</fi> <mi>une</mi> </e>\n";
356 } elsif ($lang eq "en" ) {
357 print KVT
"\t<e l=\"en\">\t<fd>the</fd> <md>the</md> <nd>the</nd>\n";
358 print KVT
"\t\t\t<fi>a</fi> <mi>a</mi> <ni>a</ni> </e>\n";
359 } elsif ($lang eq "it" ) {
360 print KVT
"\t<e l=\"it\">\t<fd>la</fd> <md>il</md> <nd>il</nd>\n";
361 print KVT
"\t\t\t<fi>una</fi> <mi>un</mi> <ni>un</ni> </e>\n";
# NOTE(review): "cz" is used as the Czech language code; ISO 639-1 has "cs" - confirm what kvoctrain expects.
362 } elsif ($lang eq "cz" ) {
363 print KVT
"\t<e l=\"cz\"> </e>\n"; # left empty: only the noun is modified
# Any other language gets an empty article entry.
365 print KVT
"\t<e l=\"$lang\"> </e>\n";
366 } # and always add the German definitions
367 print KVT
"\t<e l=\"de\">\t<fd>die</fd> <md>der</md> <nd>das</nd>\n";
368 print KVT
"\t\t\t<fi>eine</fi> <mi>ein</mi> <ni>ein</ni> </e>\n";
369 print KVT
"</article>\n\n";
# add_conjugation: write the <conjugation> table to the KVT handle.
# Takes no arguments; reads the global $lang. One <e> element with the
# personal pronouns is emitted for $lang, followed by the German one.
# Judging by the values, s1..s3 are the singular persons, p1..p3 the plural
# ones, and the m/f/n suffix marks the gender of the third person.
372 sub add_conjugation
() {
373 print KVT
"<conjugation>\n";
374 if ( $lang eq "es" ) {
375 print KVT
"\t<e l=\"es\"> <s1>yo</s1> <s2>tu</s2>\n";
376 print KVT
"\t\t<s3m>el</s3m> <s3f>ella</s3f> <s3n>usted</s3n>\n";
377 print KVT
"\t\t<p1>nosotros</p1> <p2>vosotros</p2> <p3n>ustedes</p3n>\n";
378 print KVT
"\t\t<p3m>ellos</p3m> <p3f>ellas</p3f> </e>\n";
379 } elsif ($lang eq "fr" ) {
380 print KVT
"\t<e l=\"fr\"> <s1>je</s1> <s2>tu</s2>\n";
381 print KVT
"\t\t<s3m>il</s3m> <s3f>elle</s3f>\n";
382 print KVT
"\t\t<p1>nous</p1> <p2>vous</p2>\n";
383 print KVT
"\t\t<p3m>ils</p3m> <p3f>elles</p3f> </e>\n";
384 } elsif ($lang eq "en" ) {
385 print KVT
"\t<e l=\"en\"> <s1>I</s1> <s2>you</s2>\n";
386 print KVT
"\t\t<s3m>he</s3m> <s3f>she</s3f>\n";
387 print KVT
"\t\t<p1>we</p1> <p2>you</p2>\n";
# NOTE(review): every other entry is a subject pronoun, so "them" is
# probably meant to be "they" - confirm before changing the output.
388 print KVT
"\t\t<p3f common=\"1\">them</p3f> </e>\n";
389 } elsif ($lang eq "it" ) {
390 print KVT
"\t<e l=\"it\"> <s1>io</s1> <s2>tu</s2>\n";
391 print KVT
"\t\t<s3m>egli</s3m> <s3f>ella</s3f> <s3n>esso</s3n>\n";
392 print KVT
"\t\t<p1>noi</p1> <p2>voi</p2>\n";
393 print KVT
"\t\t<p3f common=\"1\">essi</p3f> </e>\n";
# NOTE(review): "cz" is used as the Czech language code; ISO 639-1 has "cs" - confirm what kvoctrain expects.
394 } elsif ($lang eq "cz" ) {
395 print KVT
"\t<e l=\"cz\"> <s1>já</s1> <s2>ty</s2>\n";
396 print KVT
"\t\t<s3m>on</s3m> <s3f>ona</s3f> <s3n>ono</s3n>\n";
397 print KVT
"\t\t<p1>my</p1> <p2>vy</p2>\n";
# NOTE(review): <p3f> is emitted twice in this row; the second one
# ("ona") is presumably meant to be <p3n> - confirm.
398 print KVT
"\t\t<p3m>oni</p3m> <p3f>ony</p3f> <p3f>ona</p3f> </e>\n";
# Any other language gets an empty conjugation entry.
400 print KVT
"\t<e l=\"$lang\"> </e>\n";
# The German pronouns are always appended.
402 print KVT
"\t<e l=\"de\"> <s1>ich</s1> <s2>du</s2>\n";
403 print KVT
"\t\t<s3m>er</s3m> <s3f>sie</s3f> <s3n>es</s3n>\n";
404 print KVT
"\t\t<p1>wir</p1> <p2>ihr</p2> <p3f common=\"1\">sie</p3f> </e>\n";
405 print KVT
"</conjugation>\n";
409 # This subroutine compares all entries one by one
410 # - There is an initial section comparing the local
411 # translation (assuming it to be German).
412 # - There are foreign sections for the languages
416 # None of the rules here claims to be perfect, but
417 # there should be much less "false indications" than
418 # correct type selections.
421 sub type_specific
() {
422 if( $local =~ /^eins$/
423 || $local =~ /^zwei$/
424 || $local =~ /^drei$/
425 || $local =~ /^vier$/
426 || $local =~ /^fünf$/
427 || $local =~ /^sechs$/
428 || $local =~ /^sieben$/
429 || $local =~ /^acht$/
430 || $local =~ /^neun$/
431 || $local =~ /^zehn$/
433 || $local =~ /^zwölf$/
434 || $local =~ /^dreizehn$/
435 || $local =~ /^vierzehn$/
436 || $local =~ /^fünfzehn$/
437 || $local =~ /^sechzehn$/
438 || $local =~ /^siebzehn$/
439 || $local =~ /^achtzehn$/
440 || $local =~ /^neunzehn$/
441 || $local =~ /^zwanzig$/
442 || $local =~ /^dreißig$/
443 || $local =~ /^vierzig$/
444 || $local =~ /^fünfzig$/
445 || $local =~ /^sechzig$/
446 || $local =~ /^siebzig$/
447 || $local =~ /^achtzig$/
448 || $local =~ /^neunzig$/
449 || $local =~ /^hundert$/
450 || $local =~ /^einhundert$/
451 || $local =~ /^zweihundert$/
452 || $local =~ /^dreihundert$/
453 || $local =~ /^vierhundert$/
454 || $local =~ /^fünfhundert$/
455 || $local =~ /^sechshundert$/
456 || $local =~ /^siebenhundert$/
457 || $local =~ /^achthundert$/
458 || $local =~ /^neunhundert$/
459 || $local =~ /^tausend$/){
460 $type=" t=\"num:crd\"";
461 } elsif( $local =~ /weil([,;].*)?$/
462 || $local =~ /^und([,;].*)?$/
463 || $local =~ /\sund([,;].*)?$/
464 || $local =~ /\soder([,;].*)?$/
465 || $local =~ /\saber([,;].*)?$/
466 || $local =~ /^oder([,;].*)?$/
467 || $local =~ /^aber([,;].*)?$/
468 || $local =~ /daß([,;].*)?$/
469 || $local =~ /^als([,;].*)?$/
470 || $local =~ /\sals([,;].*)?$/
471 || $local =~ /als ob([,;].*)?$/
472 || $local =~ /obwohl([,;].*)?$/
473 || $local =~ /trotzdem([,;].*)?$/
474 || $local =~ /dennoch([,;].*)?$/
475 || $local =~ /indem([,;].*)?$/
476 || $local =~ /darum([,;].*)?$/
477 || $local =~ /wobei([,;].*)?$/
478 || $local =~ /^doch([,;].*)?$/
479 || $local =~ /\sdoch([,;].*)?$/
480 || $local =~ /damit([,;].*)?$/
481 || $local =~ /während([,;].*)?$/
482 || $local =~ /^falls([,;].*)?$/
483 || $local =~ /\sfalls([,;].*)?$/){
485 } elsif( $local =~ /^zweite[rs]?$/
486 || $local =~ /dritte[rs]?$/
487 || $local =~ /vierte[rs]?$/
488 || $local =~ /fünfte[rs]?$/
489 || $local =~ /sechste[rs]?$/
490 || $local =~ /siebente[rs]?$/
491 || $local =~ /achte[rs]?$/
492 || $local =~ /neunte[rs]?$/
493 || $local =~ /zehnte[rs]?$/
494 || $local =~ /elfte[rs]?$/
495 || $local =~ /zwölfte[rs]?$/
496 || $local =~ /zwanzigste[rs]?$/
497 || $local =~ /hundertste[rs]?$/
498 || $local =~ /tausendste[rs]?$/){
499 $type=" t=\"num:ord\"";
500 } elsif( $local =~ /\?$/){
502 } elsif( $local =~ /^[A-ZÄÜÖ].*[\.\?!]$/){
504 } elsif( $local =~ /^[A-ZÄÜÖ]/){
506 } elsif( $local =~ /en$/){
509 # maybe some lines are left - that's OK
513 # Now for the language specific terms
517 # some VERY simple strategy to recognize the
519 # (known things first - least likely guesses last)
521 if( $foreign =~ s/^to\s+°?//){
523 } elsif( $foreign =~ /ous$/){
525 } elsif( $foreign =~ /ble$/){
527 } elsif( $foreign =~ /less$/){
529 } elsif( $foreign =~ /ic$/){
531 } elsif( $foreign =~ /ive$/){
533 } elsif( $foreign =~ /ly$/){
535 } elsif( $foreign =~ /nt$/){
538 } elsif( $lang eq "fr") {
540 # Not perfect but a starting point
542 if( $foreign =~ s/^la\s+°?([^(]*)(\s\(m\))?/\1/){
544 } elsif( $foreign =~ s/^le\s+°?([^(]*)(\s\(f\))?/\1/){
546 } elsif( $foreign =~ s/^les\s+°?([^(]*)(\s\(m\))?/\1/){
548 } elsif( $foreign =~ s/^les\s+°?([^(]*)(\s\(f\))?/\1/){
550 } elsif( $foreign =~ s/^l[´']°?([^(]*)\s\(f\)/\1/){
552 } elsif( $foreign =~ s/^l[´']°?([^(]*)\s\(m\)/\1/){
554 } elsif( $foreign =~ s/^l[´']°?//){
556 } elsif( $foreign =~ s/^une°?([^(]*)(\s\(f\))?/\1/){
558 } elsif( $foreign =~ s/^un°?([^(]*)(\s\(m\))?/\1/){
561 } elsif( $lang eq "es") {
563 # some VERY simple strategy to recognize the
565 # (known things first - least likely guesses last)
567 if( $foreign =~ /[iae]rse$/){
569 } elsif( $foreign =~ /[iae]r(\s.*)?$/){
571 } elsif( $foreign =~ s/^las?\s+//){
573 } elsif( $foreign =~ s/^los\s+//){
575 } elsif( $foreign =~ s/^el\s+//){
577 } elsif( $foreign =~ /mente$/){
579 } elsif( $foreign =~ /ión$/){
581 } elsif( $type =~ " t=\"n\"" && $foreign =~ /as?$/){
583 } elsif( $type =~ " t=\"n\"" && $foreign =~ /os?$/){
591 # This subroutine rewrites the rc-file for kvoctrain
592 # (only for the executing user) to add language information
593 # to it. [not perfect but for convenience]
597 $/="\n"; # we work with a unix file
598 $rnfile = "$rcfile.new" ;
599 chomp($running = `ps ax | grep kvoctrain | grep -v grep`);
600 open(RC
, "<$rcfile" ) || die "kvoctrainrc preferences file not open: $!";
601 open(RCNEW
, ">$rnfile" ) || die "kvoctrainrc replacement preferences file not open: $!";
602 $replace=1; # i.e. language is not known yet
605 ($vockey,$vocval) = split("=");
609 print RCNEW
"$vockey=$vocval\n";
610 } elsif(/ShortId\d*=$lang/) {
611 print STDERR
"\nLanguage \"$lang\" already configured.\n";
613 } elsif(/PasteOrder/) {
614 print RCNEW
"$vockey=$vocval$lang,\n";
615 } elsif(/RecentFiles/) {
616 print RCNEW
"$vockey=$kvtfile,$vocval\n";
622 print RCNEW
"LongId$counter=$tlang\nShortId$counter=$lang\nShort2Id$counter=$voclang\n";
623 print RCNEW
"Pixmap$counter=/opt/kde/share/apps/kcmlocale/pics/flag_$lang.gif\n";
626 if( "$running" ne "" ) {
627 print STDERR
"\nWARNING: Please close kvoctrain and rerun this program.\n";
631 rename $rnfile, $rcfile;