bb51ce87ec5b4f1979880180555fa4940aa7dcf2
[openemr.git] / contrib / util / language_translations / buildLanguageDatabase.pl
blobbb51ce87ec5b4f1979880180555fa4940aa7dcf2
1 #!/usr/bin/perl
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # author Brady Miller
9 # email brady@sparmy.com
10 # date 04/03/09
12 # This is a perl script that will build the language translation sql
13 # dumpfiles from the tab delimited language translation spreadsheet.
14 # It will create two output dumpfiles:
15 # languageTranslations_utf8.sql
16 # languageTranslations_latin1.sql (minus utf languages)
17 # It will also validate the spreadsheet and create a new spreadsheet
18 # that can be used for further downstream editing and re-importing
19 # back into Google Docs. It also outputs a logfile log.txt with
20 # errors in validation and database statistics.
22 # Example command:
24 # -Below command will build the sql dumpfile from given tsv
25 # language spreadsheet and compare with a constants list to
26 # ensure constants didn't get edited (output will go to
27 # the log file), and will also fix limited issues. In this case
28 # a new spreadsheet file will also be created with the corrected
29 # constants to allow downstream modification and re-importing of
30 # file into Google Docs:
31 # ./buildLanguageDatabase.pl openemr_language_table.tsv constants.txt
35 use strict;
37 # Array to hold languages to skip for the latin1 translation file
38 # (pending)
40 # Put current known constant mismatches here, which are basically
41 # constants that get modified during the pipeline and don't look
42 # like originals in the end. If this number increases, a english constant
43 # was likely modified in the spreadsheet, and can then use log output
44 # to localize and fix the problem. As of list of 3.0.1 constants
45 # the known number of mismatched constants is 57 .
46 my $mismatchesKnown = 95;
48 # Hold variables to calculate language database statistics
49 my $totalConstants;
50 my $totalDefinitions;
51 my @languages;
52 my @numberConstantsLanguages;
54 # Main variables
55 my $de = "\t";
56 my $filenameOut;
57 my $inputFilename;
58 my $logFile = "log.txt";
59 my $stats = "stats.txt";
60 my $constantIdColumn = 0; # 0 is lowest
61 my $constantColumn = 1; # 0 is lowest
62 my $constantRow = 5; # 0 is lowest
63 my $languageNumRow = 0; # 0 is lowest
64 my $languageIdRow = 1; # 0 is lowest
65 my $languageNameRow = 2; # 0 is lowest
67 # variables for checking/fixing constants application
68 my $checkFilename; # holds list of constants if checking
69 my $filenameOut_revised = "revisedSpreadsheet.tsv";
70 my $flagCheck = 0;
71 my @previousConstants;
72 my @inputFile;
73 my @revisedFile;
74 my @inputFileProcessed;
76 # to hold utf8 flag
77 my $utf8;
79 # open output file
80 open(LOGFILE, ">$logFile") or die "unable to open log file";
82 # open output file
83 open(STATFILE, ">$stats") or die "unable to open stats file";
85 # collect parameters
86 if (@ARGV > 2) {
87 die "\nERROR: Too many parameters. Follow instructions found in buildLanguageDatabase.pl file.\n\n";
89 elsif (@ARGV < 2) {
90 die "\nERROR: Need more parameter(s). Follow instructions found in buildLanguageDatabase.pl file.\n\n";
92 elsif (@ARGV == 2) {
93 $flagCheck = 1;
94 $checkFilename = $ARGV[1];
95 $inputFilename = $ARGV[0];
97 else {
98 print LOGFILE "ERROR: with parameters\n\n";
101 # if checking, then open check file and store in array
102 if ($flagCheck) {
103 open(MYINPUTFILE, "<$checkFilename") or die "unable to open file";
104 @previousConstants = <MYINPUTFILE>;
105 close(MYINPUTFILE);
108 # place spreadsheet into array
109 open(MYINPUTFILE2, "<$inputFilename") or die "unable to open file";
110 @inputFile = <MYINPUTFILE2>;
111 close(MYINPUTFILE2);
113 # Clean up spreadsheet
114 # FIRST, remove newlines, blank lines, escape characters, and windows returns
115 # SECOND, place the escape characters in all required sql characters
116 foreach my $tempLine (@inputFile) {
117 chomp($tempLine);
118 if ($tempLine !~ /^\s*$/) {
119 # remove ^M characters (windows line feeds)
120 $tempLine =~ s/\r//g;
122 # remove all escape characters
123 $tempLine =~ s/\\//g;
125 # place all required escape characters
126 $tempLine =~ s/\'/\\\'/g;
127 $tempLine =~ s/\"/\\\"/g;
129 # push into new array
130 push (@inputFileProcessed,$tempLine);
134 # check spreadsheet for rogue tabs and newlines
135 # (the last column needs to be full for this to work
136 # correctly, such as the dummy language)
137 quickCheckStructure(@inputFileProcessed);
139 # check and fix modified constants (and constant id's)
140 if ($flagCheck) {
141 # first create data for replacement spreadsheet if needed
142 @revisedFile = checkConstants("special",@inputFileProcessed);
143 # then clean data to create mysql dumpfiles
144 @inputFileProcessed = checkConstants("normal",@inputFileProcessed);
147 # run through twice to make a utf8 table and a latin1 table
148 # revised spreadsheet. Build statistics and revised
149 # spreadsheet during utf8 run.
150 for (my $i=0;$i<2;$i++) {
152 # set utf flag
153 if ($i == 0) {
154 # build utf8 table
155 $filenameOut = "languageTranslations_utf8.sql";
156 $utf8 = 1;
158 else {
159 # build latin1 table
160 $filenameOut = "languageTranslations_latin1.sql";
161 $utf8 = 0
164 # open output file
165 open(OUTPUTFILE, ">$filenameOut") or die "unable to open output file";
167 my $outputString = "";
169 # add UTF8 set names for both utf8 and latin1 encoding, since
170 # the dumpfile is encoded in UTF8
171 $outputString .= "\
173 -- Ensure correct encoding
176 $outputString .= "SET NAMES utf8;\n\n";
178 # parse lang_languages
179 $outputString .= createLanguages($utf8, @inputFileProcessed);
181 # parse lang_constants
182 $outputString .= createConstants($utf8, @inputFileProcessed);
184 # parse lang_definitions
185 $outputString .= createDefinitions($utf8, @inputFileProcessed);
187 print OUTPUTFILE $outputString;
189 # calculate statistics
190 if ($utf8) {
191 my $count = 0;
192 my $countLanguages = 0;
193 my $subtractDefinitions = 0;
194 my @tempArray;
195 my @statArray;
196 foreach my $var (@languages) {
197 # push all info into the log file
198 push (@tempArray, $var.": ".fstr((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
199 if ($var eq "dummy") {
200 # do not count dummy language or dummy language constants
201 $subtractDefinitions += $numberConstantsLanguages[$count];
203 else {
204 if ($numberConstantsLanguages[$count] > 0) {
205 # only count non-empty languages in total count
206 $countLanguages += 1;
207 # only include non-empty and non-dummy languages in stats
208 push (@statArray, $var.": ".fstr((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
211 $count += 1;
213 print LOGFILE "\nLanguage Statistics:\n";
214 print STATFILE "\nLanguage Statistics:\n";
216 # Report total number of real non empty languages
217 print LOGFILE "Total number of languages with translations: ".$countLanguages."\n";
218 print STATFILE "Total number of languages with translations: ".$countLanguages."\n";
220 # Report total number of constants
221 print LOGFILE "Total number of constants: ".$totalConstants."\n";
222 print STATFILE "Total number of constants: ".$totalConstants."\n";
224 # Report total number of real definitions
225 print LOGFILE "Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
226 print STATFILE "Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
228 # Send log stat info
229 my @sorted_tempArray = sort { lc($a) cmp lc($b) } @tempArray;
230 foreach my $var (@sorted_tempArray) {
231 print LOGFILE $var;
234 # Send official stat info
235 my @sorted_statArray = sort { lc($a) cmp lc($b) } @statArray;
236 foreach my $var (@sorted_statArray) {
237 print STATFILE $var;
241 # send the processed spreadsheet to file to allow downstream modifications
242 # if checking and fixing modified constants
243 if ($flagCheck && $utf8) {
244 open(MYOUTPUTFILE2, ">$filenameOut_revised") or die "unable to open file";
245 foreach my $var (@revisedFile) {
246 print MYOUTPUTFILE2 $var."\n";
248 close(MYOUTPUTFILE2)
251 # close files
252 close(OUTPUTFILE);
255 close(LOGFILE);
256 close(STATFILE);
260 # FUNCTIONS
265 # function to check spreadsheet for rogue tabs
266 # to work, the last column needs to be filled (such as a dummy language)
267 # will output errors to LOGFILE
268 # param - @arr array of spreadsheet
269 # globals - @inputFile, LOGFILE, $de, $languageNumRow
271 sub quickCheckStructure() {
272 my (@arr) = @_;
274 # use the languagNumRow as the standard for number of tabs
275 # on each row
276 my $numberColumns = split($de,$arr[$languageNumRow]);
277 my $numberTabs = $numberColumns - 1;
279 # ensure every row on spreadsheet has equal number of tabs
280 my $counter = 1;
281 foreach my $var (@arr) {
282 my $tempNumber = split($de,$var);
283 my $tempTabs = $tempNumber - 1;
284 if ($numberTabs != $tempTabs) {
285 print LOGFILE "\nERROR: $counter row with incorrect number of tabs. There are $tempTabs in this row and should be $numberTabs.\n";
286 if ($tempTabs > $numberTabs) {
287 # too many tabs
288 print LOGFILE "\t(This is likely secondary to a rogue tab character(s) on row $counter.)\n";
290 else {
291 # not enough tabs
292 print LOGFILE "\t(This is likely secondary to a rogue newline character(s) on row $counter or one row above.)\n";
295 $counter += 1;
298 return;
302 # function to compare to original constants
303 # normal flag will fix constants escape characters to prepare for mysql dumpfile
304 # special flag will not fix escape characters to prepare for spreadsheet revision file
305 # param - flag (normal or special), array of processed file
306 # globals - @previousConstants, $constantRow, $de, LOGFILE,
307 # $constantIdColumn, $constantColumn
308 # return - none
310 sub checkConstants () {
311 my ($flag, @page) = @_;
313 print LOGFILE "Checking constants:\n\n";
314 my $counter = $constantRow;
315 my $badCount = 0;
316 my $idErrorFlag = 0;
317 foreach my $var (@previousConstants) {
318 chomp($var);
319 my @tempRow = split($de,$page[$counter]);
320 my $tempId = $tempRow[$constantIdColumn];
321 my $tempConstant = $tempRow[$constantColumn];
323 # ensure constant has not been altered
324 if ($var ne $tempConstant) {
325 print LOGFILE "Following constant not same:\n";
326 print LOGFILE "\toriginal- val:$var\n";
327 print LOGFILE "\tspreadsheet- ID:$tempId val:$tempConstant\n";
328 $badCount += 1;
330 # apply fix
331 my $fixedVar = $var;
332 if ($flag eq "normal") {
333 $fixedVar =~ s/\\//g;
334 $fixedVar =~ s/\'/\\\'/g;
335 $fixedVar =~ s/\"/\\\"/g;
337 $tempRow[$constantColumn] = $fixedVar;
338 $page[$counter] = join($de,@tempRow);
341 # ensure constant id number has not been altered
342 my $realID = ($counter - $constantRow + 1);
343 if ($realID != $tempId) {
344 $idErrorFlag = 1;
345 print LOGFILE "\nERROR: Constant ID number ".$realID." has been modified to ".$tempId."!!!\n\n";
347 # apply fix (replace with original after reset escape characters)
348 $tempRow[$constantIdColumn] = $realID;
349 $page[$counter] = join($de,@tempRow);
352 # increment counter
353 $counter += 1;
356 print LOGFILE "\nDone checking constants:\n";
357 print LOGFILE "\t".$badCount." mismatches found (known is ".$mismatchesKnown.")\n";
358 if ($badCount == $mismatchesKnown) {
359 print LOGFILE "Good, constants weren't modified by translators\n\n";
361 else {
362 print LOGFILE "ERROR: Constant(s) have been modified by translators\n\n";
364 if ($idErrorFlag) {
365 print LOGFILE "ERROR: Constant ID number(s) have been modified by translators\n\n";
368 return @page;
372 # function to build lang_languages dumpfile
373 # param - integer flag for utf8, array of processed file
374 # globals - $constantColumn, $constantRow,
375 # $languageNumRow, $languageIdRow, $languageNameRow,
376 # @numberConstantsLanguages, @languages
377 # return - output string
379 sub createLanguages() {
380 my ($flag, @page) = @_;
381 my $charset;
382 if ($flag) {
383 $charset = "utf8";
385 else {
386 $charset = "latin1";
389 # create table input
390 my $tempReturn;
391 my $tempCounter;
392 my @numberRow = split($de,$page[$languageNumRow]);
393 my @idRow = split($de,$page[$languageIdRow]);
394 my @nameRow = split($de,$page[$languageNameRow]);
395 $tempReturn .= "INSERT INTO `lang_languages` (`lang_id`, `lang_code`, `lang_description`) VALUES\n";
396 for (my $i = $constantColumn; $i < @numberRow; $i++) {
397 $tempReturn .= "(".$numberRow[$i].", '".$idRow[$i]."', '".$nameRow[$i]."'),\n";
398 $tempCounter = $numberRow[$i];
400 # set up for statistics later
401 push (@languages, $nameRow[$i]);
402 $numberConstantsLanguages[$numberRow[$i]-1] = 0;
404 $tempReturn =~ s/,\n$/;\n/;
405 $tempCounter += 1;
407 # create header
408 my $return = "\
410 -- Table structure for table `lang_languages`
413 DROP TABLE IF EXISTS `lang_languages`;
414 CREATE TABLE `lang_languages` (
415 `lang_id` int(11) NOT NULL auto_increment,
416 `lang_code` char(2) NOT NULL default '',
417 `lang_description` varchar(100) default NULL,
418 UNIQUE KEY `lang_id` (`lang_id`)
419 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
422 -- Dumping data for table `lang_languages`
423 --\n\n";
425 # insert table input
426 $return .= $tempReturn;
428 # create footer
429 $return .= "
430 --\n\n";
432 return $return;
436 # function to build lang_constants dumpfile
437 # param - integer flag for utf, array of processed file
438 # globals - $constantColumn, $constantRow, $constantIdColumn, $totalConstants
439 # return - nothing
441 sub createConstants() {
442 my (@page) = @_;
443 my ($flag, @page) = @_;
444 my $charset;
445 if ($flag) {
446 $charset = "utf8";
448 else {
449 $charset = "latin1";
452 # create table input
453 my $tempReturn;
454 my $tempCounter;
455 $tempReturn .= "INSERT INTO `lang_constants` (`cons_id`, `constant_name`) VALUES\n";
456 for (my $i = $constantRow; $i < @page; $i++) {
457 my @tempRow = split($de,$page[$i]);
458 my $tempId = $tempRow[$constantIdColumn];
459 my $tempConstant = $tempRow[$constantColumn];
460 $tempReturn .= "(".$tempId.", '".$tempConstant."'),\n";
461 $tempCounter = $tempId;
463 $tempReturn =~ s/,\n$/;\n/;
464 $tempCounter += 1;
466 # create header
467 my $return = "\
469 -- Table structure for table `lang_constants`
472 DROP TABLE IF EXISTS `lang_constants`;
473 CREATE TABLE `lang_constants` (
474 `cons_id` int(11) NOT NULL auto_increment,
475 `constant_name` varchar(255) BINARY default NULL,
476 UNIQUE KEY `cons_id` (`cons_id`),
477 KEY `constant_name` (`constant_name`)
478 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
481 -- Dumping data for table `lang_constants`
482 --\n\n";
484 # insert table input
485 $return .= $tempReturn;
487 # create footer
488 $return .= "
489 --\n\n";
491 # fill total constants for statistics later
492 $totalConstants = $tempCounter - 1;
494 return $return;
498 # function to build lang_definitions dumpfile
499 # param - integer flag for utf8, array of processed file
500 # globals - $constantColumn, $constantRow,
501 # $languageNumRow, $constantIdColumn, @numberConstantsLanguages,
502 # $totalDefinitions
503 # return - nothing
505 sub createDefinitions() {
506 my (@page) = @_;
507 my ($flag, @page) = @_;
508 my $charset;
509 if ($flag) {
510 $charset = "utf8";
512 else {
513 $charset = "latin1";
516 # create table input
517 my $tempReturn;
518 my $tempCounter;
519 my @numberRow = split($de,$page[$languageNumRow]);
520 my $counter = 1;
521 $tempReturn .= "INSERT INTO `lang_definitions` (`def_id`, `cons_id`, `lang_id`, `definition`) VALUES\n";
522 for (my $i = $constantColumn + 1; $i < @numberRow; $i++) {
523 for (my $j = $constantRow; $j < @page; $j++) {
524 my @tempRow = split($de,$page[$j]);
525 my $tempId = $tempRow[$constantIdColumn];
526 my $tempDefinition = $tempRow[$i];
527 my $tempLangNumber = $numberRow[$i];
528 if ($tempDefinition !~ /^\s*$/) {
529 $tempReturn .= "(".$counter.", ".$tempId.", ".$tempLangNumber.", '".$tempDefinition."'),\n";
530 $tempCounter = $counter;
531 $counter += 1;
533 # set up for statistics
534 $numberConstantsLanguages[($tempLangNumber - 1)] += 1;
538 $tempReturn =~ s/,\n$/;\n/;
539 $tempCounter += 1;
541 # create header
542 my $return = "\
544 -- Table structure for table `lang_definitions`
547 DROP TABLE IF EXISTS `lang_definitions`;
548 CREATE TABLE `lang_definitions` (
549 `def_id` int(11) NOT NULL auto_increment,
550 `cons_id` int(11) NOT NULL default '0',
551 `lang_id` int(11) NOT NULL default '0',
552 `definition` mediumtext CHARACTER SET utf8,
553 UNIQUE KEY `def_id` (`def_id`),
554 KEY `cons_id` (`cons_id`)
555 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
558 -- Dumping data for table `lang_definitions`
559 --\n\n";
561 # insert table input
562 $return .= $tempReturn;
564 # create footer
565 $return .= "
566 --\n\n";
568 # fill total definitions for statistics later
569 $totalDefinitions = $tempCounter - 1;
571 return $return;
574 # Function to drop decimals
575 # param: 1st is number, 2nd is nubmer of desired decimals
576 sub fstr () {
577 my ($value,$precision) = @_;
578 if ($value == 0) {
579 return "0";
581 my $s = sprintf("%.${precision}f", $value);
582 $s =~ s/\.?0*$//;
583 return $s;