Language translation update
[openemr.git] / contrib / util / language_translations / buildLanguageDatabase.pl
blob2e92e318750174a27575991d122c4399d33a09d4
1 #!/usr/bin/perl
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # author Brady Miller
9 # email brady@sparmy.com
10 # date 04/03/09
12 # This is a perl script that will build the language translation sql
13 # dumpfiles from the tab delimited language translation spreadsheet.
14 # It will create two output dumpfiles:
15 # languageTranslations_utf8.sql
16 # languageTranslations_latin1.sql (minus utf languages)
17 # It will also validate the spreadsheet and create a new spreadsheet
18 # that can be used for further downstream editing and re-importing
19 # back into Google Docs. It also outputs a logfile log.txt with
20 # errors in validation and database statistics.
22 # Example command:
24 # -Below command will build the sql dumpfile from given tsv
25 # language spreadsheet and compare with a constants list to
26 # ensure constants didn't get edited (output will go to
27 # the log file), and will also fix limited issues. In this case
28 # a new spreadsheet file will also be created with the corrected
29 # constants to allow downstream modification and re-importing of
30 # file into Google Docs:
31 # ./buildLanguageDatabase.pl openemr_language_table.tsv constants.txt
35 use strict;
37 # Array to hold languages to skip for the latin1 translation file
38 # (pending)
40 # Put current known constant mismatches here, which are basically
41 # constants that get modified during the pipeline and don't look
42 # like originals in the end. If this number increases, a english constant
43 # was likely modified in the spreadsheet, and can then use log output
44 # to localize and fix the problem. As of list of 3.0.1 constants
45 # the known number of mismatched constants is 57 .
46 my $mismatchesKnown = 97;
48 # Hold variables to calculate language database statistics
49 my $totalConstants;
50 my $totalDefinitions;
51 my @languages;
52 my @numberConstantsLanguages;
54 # Main variables
55 my $de = "\t";
56 my $filenameOut;
57 my $inputFilename;
58 my $logFile = "log.txt";
59 my $stats = "stats.txt";
60 my $constantIdColumn = 0; # 0 is lowest
61 my $constantColumn = 1; # 0 is lowest
62 my $constantRow = 5; # 0 is lowest
63 my $languageNumRow = 0; # 0 is lowest
64 my $languageIdRow = 1; # 0 is lowest
65 my $languageNameRow = 2; # 0 is lowest
67 # variables for checking/fixing constants application
68 my $checkFilename; # holds list of constants if checking
69 my $filenameOut_revised = "revisedSpreadsheet.tsv";
70 my $flagCheck = 0;
71 my @previousConstants;
72 my @inputFile;
73 my @revisedFile;
74 my @inputFileProcessed;
76 # to hold utf8 flag
77 my $utf8;
79 # open output file
80 open(LOGFILE, ">$logFile") or die "unable to open log file";
82 # open output file
83 open(STATFILE, ">$stats") or die "unable to open stats file";
85 # collect parameters
86 if (@ARGV > 2) {
87 die "\nERROR: Too many parameters. Follow instructions found in buildLanguageDatabase.pl file.\n\n";
89 elsif (@ARGV < 2) {
90 die "\nERROR: Need more parameter(s). Follow instructions found in buildLanguageDatabase.pl file.\n\n";
92 elsif (@ARGV == 2) {
93 $flagCheck = 1;
94 $checkFilename = $ARGV[1];
95 $inputFilename = $ARGV[0];
97 else {
98 print LOGFILE "ERROR: with parameters\n\n";
101 # if checking, then open check file and store in array
102 if ($flagCheck) {
103 open(MYINPUTFILE, "<$checkFilename") or die "unable to open file";
104 @previousConstants = <MYINPUTFILE>;
105 close(MYINPUTFILE);
108 # place spreadsheet into array
109 open(MYINPUTFILE2, "<$inputFilename") or die "unable to open file";
110 @inputFile = <MYINPUTFILE2>;
111 close(MYINPUTFILE2);
113 # Clean up spreadsheet
114 # FIRST, remove newlines, blank lines, escape characters, and windows returns
115 # SECOND, place the escape characters in all required sql characters
116 foreach my $tempLine (@inputFile) {
117 chomp($tempLine);
118 if ($tempLine !~ /^\s*$/) {
119 # remove ^M characters (windows line feeds)
120 $tempLine =~ s/\r//g;
122 # remove all escape characters
123 $tempLine =~ s/\\//g;
125 # place all required escape characters
126 $tempLine =~ s/\'/\\\'/g;
127 $tempLine =~ s/\"/\\\"/g;
129 # push into new array
130 push (@inputFileProcessed,$tempLine);
134 # check spreadsheet for rogue tabs and newlines
135 # (the last column needs to be full for this to work
136 # correctly, such as the dummy language)
137 quickCheckStructure(@inputFileProcessed);
139 # check and fix modified constants (and constant id's)
140 if ($flagCheck) {
141 # first create data for replacement spreadsheet if needed
142 @revisedFile = checkConstants("special",@inputFileProcessed);
143 # then clean data to create mysql dumpfiles
144 @inputFileProcessed = checkConstants("normal",@inputFileProcessed);
147 # run through twice to make a utf8 table and a latin1 table
148 # revised spreadsheet. Build statistics and revised
149 # spreadsheet during utf8 run.
150 for (my $i=0;$i<2;$i++) {
152 # set utf flag
153 if ($i == 0) {
154 # build utf8 table
155 $filenameOut = "languageTranslations_utf8.sql";
156 $utf8 = 1;
158 else {
159 # build latin1 table
160 $filenameOut = "languageTranslations_latin1.sql";
161 $utf8 = 0
164 # open output file
165 open(OUTPUTFILE, ">$filenameOut") or die "unable to open output file";
167 my $outputString = "";
169 # add UTF8 set names for both utf8 and latin1 encoding, since
170 # the dumpfile is encoded in UTF8
171 $outputString .= "\
173 -- Ensure correct encoding
176 $outputString .= "SET NAMES utf8;\n\n";
178 # parse lang_languages
179 $outputString .= createLanguages($utf8, @inputFileProcessed);
181 # parse lang_constants
182 $outputString .= createConstants($utf8, @inputFileProcessed);
184 # parse lang_definitions
185 $outputString .= createDefinitions($utf8, @inputFileProcessed);
187 print OUTPUTFILE $outputString;
189 # calculate statistics
190 if ($utf8) {
191 my $count = 0;
192 my $countLanguages = 0;
193 my $subtractDefinitions = 0;
194 my @tempArray;
195 my @statArray;
196 foreach my $var (@languages) {
197 # push all info into the log file
198 push (@tempArray, $var.": ".fstr((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
199 if ($var eq "dummy") {
200 # do not count dummy language or dummy language constants
201 $subtractDefinitions += $numberConstantsLanguages[$count];
203 else {
204 if ($numberConstantsLanguages[$count] > 0) {
205 # only count non-empty languages in total count
206 $countLanguages += 1;
207 # only include non-empty and non-dummy languages in stats
208 push (@statArray, $var.": ".fstr((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
211 $count += 1;
213 print LOGFILE "\nLanguage Statistics:\n";
214 print STATFILE "\nLanguage Statistics:\n";
216 # Report total number of real non empty languages
217 print LOGFILE "Total number of languages with translations: ".$countLanguages."\n";
218 print STATFILE "Total number of languages with translations: ".$countLanguages."\n";
220 # Report total number of constants
221 print LOGFILE "Total number of constants: ".$totalConstants."\n";
222 print STATFILE "Total number of constants: ".$totalConstants."\n";
224 # Report total number of real definitions
225 print LOGFILE "Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
226 print STATFILE "Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
228 # Send log stat info
229 my @sorted_tempArray = sort { lc($a) cmp lc($b) } @tempArray;
230 foreach my $var (@sorted_tempArray) {
231 print LOGFILE $var;
234 # Send official stat info
235 my @sorted_statArray = sort { lc($a) cmp lc($b) } @statArray;
236 foreach my $var (@sorted_statArray) {
237 print STATFILE $var;
241 # send the processed spreadsheet to file to allow downstream modifications
242 # if checking and fixing modified constants
243 if ($flagCheck && $utf8) {
244 open(MYOUTPUTFILE2, ">$filenameOut_revised") or die "unable to open file";
245 foreach my $var (@revisedFile) {
246 print MYOUTPUTFILE2 $var."\n";
248 close(MYOUTPUTFILE2)
251 # close files
252 close(OUTPUTFILE);
255 close(LOGFILE);
256 close(STATFILE);
260 # FUNCTIONS
265 # function to check spreadsheet for rogue tabs
266 # to work, the last column needs to be filled (such as a dummy language)
267 # will output errors to LOGFILE
268 # param - @arr array of spreadsheet
269 # globals - @inputFile, LOGFILE, $de, $languageNumRow
271 sub quickCheckStructure() {
272 my (@arr) = @_;
274 # use the languagNumRow as the standard for number of tabs
275 # on each row
276 my $numberColumns = split($de,$arr[$languageNumRow]);
277 my $numberTabs = $numberColumns - 1;
279 # ensure every row on spreadsheet has equal number of tabs
280 my $counter = 1;
281 foreach my $var (@arr) {
282 my $tempNumber = split($de,$var);
283 my $tempTabs = $tempNumber - 1;
284 if ($numberTabs != $tempTabs) {
285 print LOGFILE "\nERROR: $counter row with incorrect number of tabs. There are $tempTabs in this row and should be $numberTabs.\n";
286 if ($tempTabs > $numberTabs) {
287 # too many tabs
288 print LOGFILE "\t(This is likely secondary to a rogue tab character(s) on row $counter.)\n";
290 else {
291 # not enough tabs
292 print LOGFILE "\t(This is likely secondary to a rogue newline character(s) on row $counter or one row above.)\n";
295 $counter += 1;
298 return;
302 # function to compare to original constants
303 # normal flag will fix constants escape characters to prepare for mysql dumpfile
304 # special flag will not fix escape characters to prepare for spreadsheet revision file
305 # param - flag (normal or special), array of processed file
306 # globals - @previousConstants, $constantRow, $de, LOGFILE,
307 # $constantIdColumn, $constantColumn
308 # return - none
310 sub checkConstants () {
311 my ($flag, @page) = @_;
313 print LOGFILE "Checking constants:\n\n";
314 my $counter = $constantRow;
315 my $badCount = 0;
316 my $idErrorFlag = 0;
317 foreach my $var (@previousConstants) {
318 chomp($var);
319 my @tempRow = split($de,$page[$counter]);
320 my $tempId = $tempRow[$constantIdColumn];
321 my $tempConstant = $tempRow[$constantColumn];
323 # ensure constant has not been altered
324 if ($var ne $tempConstant) {
325 print LOGFILE "Following constant not same:\n";
326 print LOGFILE "\toriginal- val:$var\n";
327 print LOGFILE "\tspreadsheet- ID:$tempId val:$tempConstant\n";
328 $badCount += 1;
330 # apply fix
331 my $fixedVar = $var;
332 if ($flag eq "normal") {
333 $fixedVar =~ s/\\//g;
334 $fixedVar =~ s/\'/\\\'/g;
335 $fixedVar =~ s/\"/\\\"/g;
337 $tempRow[$constantColumn] = $fixedVar;
338 $page[$counter] = join($de,@tempRow);
341 # ensure constant id number has not been altered
342 my $realID = ($counter - $constantRow + 1);
343 if ($realID != $tempId) {
344 $idErrorFlag = 1;
345 print LOGFILE "\nERROR: Constant ID number ".$realID." has been modified to ".$tempId."!!!\n\n";
347 # apply fix (replace with original after reset escape characters)
348 $tempRow[$constantIdColumn] = $realID;
349 $page[$counter] = join($de,@tempRow);
352 # increment counter
353 $counter += 1;
356 print LOGFILE "\nDone checking constants:\n";
357 print LOGFILE "\t".$badCount." mismatches found (known is ".$mismatchesKnown.")\n";
358 if ($badCount == $mismatchesKnown) {
359 print LOGFILE "Good, constants weren't modified by translators\n\n";
361 else {
362 print LOGFILE "ERROR: Constant(s) have been modified by translators\n\n";
364 if ($idErrorFlag) {
365 print LOGFILE "ERROR: Constant ID number(s) have been modified by translators\n\n";
368 return @page;
372 # function to build lang_languages dumpfile
373 # param - integer flag for utf8, array of processed file
374 # globals - $constantColumn, $constantRow,
375 # $languageNumRow, $languageIdRow, $languageNameRow,
376 # @numberConstantsLanguages, @languages
377 # return - output string
379 sub createLanguages() {
380 my ($flag, @page) = @_;
381 my $charset;
382 if ($flag) {
383 $charset = "utf8";
385 else {
386 $charset = "latin1";
389 # create table input
390 my $tempReturn;
391 my $tempCounter;
392 my @numberRow = split($de,$page[$languageNumRow]);
393 my @idRow = split($de,$page[$languageIdRow]);
394 my @nameRow = split($de,$page[$languageNameRow]);
395 for (my $i = $constantColumn; $i < @numberRow; $i++) {
396 $tempReturn .= "INSERT INTO `lang_languages` VALUES (".$numberRow[$i].", '".$idRow[$i]."', '".$nameRow[$i]."');\n";
397 $tempCounter = $numberRow[$i];
399 # set up for statistics later
400 push (@languages, $nameRow[$i]);
401 $numberConstantsLanguages[$numberRow[$i]-1] = 0;
403 $tempCounter += 1;
405 # create header
406 my $return = "\
408 -- Table structure for table `lang_languages`
411 DROP TABLE IF EXISTS `lang_languages`;
412 CREATE TABLE `lang_languages` (
413 `lang_id` int(11) NOT NULL auto_increment,
414 `lang_code` char(2) NOT NULL default '',
415 `lang_description` varchar(100) default NULL,
416 UNIQUE KEY `lang_id` (`lang_id`)
417 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
420 -- Dumping data for table `lang_languages`
421 --\n\n";
423 # insert table input
424 $return .= $tempReturn;
426 # create footer
427 $return .= "
428 --\n\n";
430 return $return;
434 # function to build lang_constants dumpfile
435 # param - integer flag for utf, array of processed file
436 # globals - $constantColumn, $constantRow, $constantIdColumn, $totalConstants
437 # return - nothing
439 sub createConstants() {
440 my (@page) = @_;
441 my ($flag, @page) = @_;
442 my $charset;
443 if ($flag) {
444 $charset = "utf8";
446 else {
447 $charset = "latin1";
450 # create table input
451 my $tempReturn;
452 my $tempCounter;
453 for (my $i = $constantRow; $i < @page; $i++) {
454 my @tempRow = split($de,$page[$i]);
455 my $tempId = $tempRow[$constantIdColumn];
456 my $tempConstant = $tempRow[$constantColumn];
457 $tempReturn .= "INSERT INTO `lang_constants` VALUES (".$tempId.", '".$tempConstant."');\n";
458 $tempCounter = $tempId;
460 $tempCounter += 1;
462 # create header
463 my $return = "\
465 -- Table structure for table `lang_constants`
468 DROP TABLE IF EXISTS `lang_constants`;
469 CREATE TABLE `lang_constants` (
470 `cons_id` int(11) NOT NULL auto_increment,
471 `constant_name` varchar(255) BINARY default NULL,
472 UNIQUE KEY `cons_id` (`cons_id`),
473 KEY `constant_name` (`constant_name`)
474 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
477 -- Dumping data for table `lang_constants`
478 --\n\n";
480 # insert table input
481 $return .= $tempReturn;
483 # create footer
484 $return .= "
485 --\n\n";
487 # fill total constants for statistics later
488 $totalConstants = $tempCounter - 1;
490 return $return;
494 # function to build lang_definitions dumpfile
495 # param - integer flag for utf8, array of processed file
496 # globals - $constantColumn, $constantRow,
497 # $languageNumRow, $constantIdColumn, @numberConstantsLanguages,
498 # $totalDefinitions
499 # return - nothing
501 sub createDefinitions() {
502 my (@page) = @_;
503 my ($flag, @page) = @_;
504 my $charset;
505 if ($flag) {
506 $charset = "utf8";
508 else {
509 $charset = "latin1";
512 # create table input
513 my $tempReturn;
514 my $tempCounter;
515 my @numberRow = split($de,$page[$languageNumRow]);
516 my $counter = 1;
517 for (my $i = $constantColumn + 1; $i < @numberRow; $i++) {
518 for (my $j = $constantRow; $j < @page; $j++) {
519 my @tempRow = split($de,$page[$j]);
520 my $tempId = $tempRow[$constantIdColumn];
521 my $tempDefinition = $tempRow[$i];
522 my $tempLangNumber = $numberRow[$i];
523 if ($tempDefinition !~ /^\s*$/) {
524 $tempReturn .= "INSERT INTO `lang_definitions` VALUES (".$counter.", ".$tempId.", ".$tempLangNumber.", '".$tempDefinition."');\n";
525 $tempCounter = $counter;
526 $counter += 1;
528 # set up for statistics
529 $numberConstantsLanguages[($tempLangNumber - 1)] += 1;
533 $tempCounter += 1;
535 # create header
536 my $return = "\
538 -- Table structure for table `lang_definitions`
541 DROP TABLE IF EXISTS `lang_definitions`;
542 CREATE TABLE `lang_definitions` (
543 `def_id` int(11) NOT NULL auto_increment,
544 `cons_id` int(11) NOT NULL default '0',
545 `lang_id` int(11) NOT NULL default '0',
546 `definition` mediumtext,
547 UNIQUE KEY `def_id` (`def_id`),
548 KEY `cons_id` (`cons_id`)
549 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
552 -- Dumping data for table `lang_definitions`
553 --\n\n";
555 # insert table input
556 $return .= $tempReturn;
558 # create footer
559 $return .= "
560 --\n\n";
562 # fill total definitions for statistics later
563 $totalDefinitions = $tempCounter - 1;
565 return $return;
568 # Function to drop decimals
569 # param: 1st is number, 2nd is nubmer of desired decimals
570 sub fstr () {
571 my ($value,$precision) = @_;
572 if ($value == 0) {
573 return "0";
575 my $s = sprintf("%.${precision}f", $value);
576 $s =~ s/\.?0*$//;
577 return $s;