3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
9 # email brady@sparmy.com
12 # This is a perl script that will build the language translation sql
13 # dumpfiles from the tab delimited language translation spreadsheet.
14 # It will create two output dumpfiles:
15 # languageTranslations_utf8.sql
16 # languageTranslations_latin1.sql (minus utf languages)
17 # It will also validate the spreadsheet and create a new spreadsheet
18 # that can be used for further downstream editing and re-importing
19 # back into Google Docs. It also outputs a logfile log.txt with
20 # errors in validation and database statistics.
24 # -Below command will build the sql dumpfile from given tsv
25 # language spreadsheet and compare with a constants list to
26 # ensure constants didn't get edited (output will go to
27 # the log file), and will also fix limited issues. In this case
28 # a new spreadsheet file will also be created with the corrected
29 # constants to allow downstream modification and re-importing of
30 # file into Google Docs:
31 # ./buildLanguageDatabase.pl openemr_language_table.tsv constants.txt
37 # Array to hold languages to skip for the latin1 translation file
40 # Put current known constant mismatches here, which are basically
41 # constants that get modified during the pipeline and don't look
42 # like originals in the end. If this number increases, a english constant
43 # was likely modified in the spreadsheet, and can then use log output
44 # to localize and fix the problem. As of list of 3.0.1 constants
45 # the known number of mismatched constants is 57 .
46 my $mismatchesKnown = 95;
48 # Hold variables to calculate language database statistics
52 my @numberConstantsLanguages;
58 my $logFile = "log.txt";
59 my $stats = "stats.txt";
60 my $constantIdColumn = 0; # 0 is lowest
61 my $constantColumn = 1; # 0 is lowest
62 my $constantRow = 5; # 0 is lowest
63 my $languageNumRow = 0; # 0 is lowest
64 my $languageIdRow = 1; # 0 is lowest
65 my $languageNameRow = 2; # 0 is lowest
67 # variables for checking/fixing constants application
68 my $checkFilename; # holds list of constants if checking
69 my $filenameOut_revised = "revisedSpreadsheet.tsv";
71 my @previousConstants;
74 my @inputFileProcessed;
80 open(LOGFILE
, ">$logFile") or die "unable to open log file";
83 open(STATFILE
, ">$stats") or die "unable to open stats file";
87 die "\nERROR: Too many parameters. Follow instructions found in buildLanguageDatabase.pl file.\n\n";
90 die "\nERROR: Need more parameter(s). Follow instructions found in buildLanguageDatabase.pl file.\n\n";
94 $checkFilename = $ARGV[1];
95 $inputFilename = $ARGV[0];
98 print LOGFILE
"ERROR: with parameters\n\n";
101 # if checking, then open check file and store in array
103 open(MYINPUTFILE
, "<$checkFilename") or die "unable to open file";
104 @previousConstants = <MYINPUTFILE
>;
108 # place spreadsheet into array
109 open(MYINPUTFILE2
, "<$inputFilename") or die "unable to open file";
110 @inputFile = <MYINPUTFILE2
>;
113 # Clean up spreadsheet
114 # FIRST, remove newlines, blank lines, escape characters, and windows returns
115 # SECOND, place the escape characters in all required sql characters
116 foreach my $tempLine (@inputFile) {
118 if ($tempLine !~ /^\s*$/) {
119 # remove ^M characters (windows line feeds)
120 $tempLine =~ s/\r//g;
122 # remove all escape characters
123 $tempLine =~ s/\\//g;
125 # place all required escape characters
126 $tempLine =~ s/\'/\\\'/g;
127 $tempLine =~ s/\"/\\\"/g;
129 # push into new array
130 push (@inputFileProcessed,$tempLine);
134 # check spreadsheet for rogue tabs and newlines
135 # (the last column needs to be full for this to work
136 # correctly, such as the dummy language)
137 quickCheckStructure
(@inputFileProcessed);
139 # check and fix modified constants (and constant id's)
141 # first create data for replacement spreadsheet if needed
142 @revisedFile = checkConstants
("special",@inputFileProcessed);
143 # then clean data to create mysql dumpfiles
144 @inputFileProcessed = checkConstants
("normal",@inputFileProcessed);
147 # run through twice to make a utf8 table and a latin1 table
148 # revised spreadsheet. Build statistics and revised
149 # spreadsheet during utf8 run.
150 for (my $i=0;$i<2;$i++) {
155 $filenameOut = "languageTranslations_utf8.sql";
160 $filenameOut = "languageTranslations_latin1.sql";
165 open(OUTPUTFILE
, ">$filenameOut") or die "unable to open output file";
167 my $outputString = "";
169 # add UTF8 set names for both utf8 and latin1 encoding, since
170 # the dumpfile is encoded in UTF8
173 -- Ensure correct encoding
176 $outputString .= "SET NAMES utf8;\n\n";
178 # parse lang_languages
179 $outputString .= createLanguages
($utf8, @inputFileProcessed);
181 # parse lang_constants
182 $outputString .= createConstants
($utf8, @inputFileProcessed);
184 # parse lang_definitions
185 $outputString .= createDefinitions
($utf8, @inputFileProcessed);
187 print OUTPUTFILE
$outputString;
189 # calculate statistics
192 my $countLanguages = 0;
193 my $subtractDefinitions = 0;
196 foreach my $var (@languages) {
197 # push all info into the log file
198 push (@tempArray, $var.": ".fstr
((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
199 if ($var eq "dummy") {
200 # do not count dummy language or dummy language constants
201 $subtractDefinitions += $numberConstantsLanguages[$count];
204 if ($numberConstantsLanguages[$count] > 0) {
205 # only count non-empty languages in total count
206 $countLanguages += 1;
207 # only include non-empty and non-dummy languages in stats
208 push (@statArray, $var.": ".fstr
((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
213 print LOGFILE
"\nLanguage Statistics:\n";
214 print STATFILE
"\nLanguage Statistics:\n";
216 # Report total number of real non empty languages
217 print LOGFILE
"Total number of languages with translations: ".$countLanguages."\n";
218 print STATFILE
"Total number of languages with translations: ".$countLanguages."\n";
220 # Report total number of constants
221 print LOGFILE
"Total number of constants: ".$totalConstants."\n";
222 print STATFILE
"Total number of constants: ".$totalConstants."\n";
224 # Report total number of real definitions
225 print LOGFILE
"Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
226 print STATFILE
"Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
229 my @sorted_tempArray = sort { lc($a) cmp lc($b) } @tempArray;
230 foreach my $var (@sorted_tempArray) {
234 # Send official stat info
235 my @sorted_statArray = sort { lc($a) cmp lc($b) } @statArray;
236 foreach my $var (@sorted_statArray) {
241 # send the processed spreadsheet to file to allow downstream modifications
242 # if checking and fixing modified constants
243 if ($flagCheck && $utf8) {
244 open(MYOUTPUTFILE2
, ">$filenameOut_revised") or die "unable to open file";
245 foreach my $var (@revisedFile) {
246 print MYOUTPUTFILE2
$var."\n";
265 # function to check spreadsheet for rogue tabs
266 # to work, the last column needs to be filled (such as a dummy language)
267 # will output errors to LOGFILE
268 # param - @arr array of spreadsheet
269 # globals - @inputFile, LOGFILE, $de, $languageNumRow
271 sub quickCheckStructure
() {
274 # use the languagNumRow as the standard for number of tabs
276 my $numberColumns = split($de,$arr[$languageNumRow]);
277 my $numberTabs = $numberColumns - 1;
279 # ensure every row on spreadsheet has equal number of tabs
281 foreach my $var (@arr) {
282 my $tempNumber = split($de,$var);
283 my $tempTabs = $tempNumber - 1;
284 if ($numberTabs != $tempTabs) {
285 print LOGFILE
"\nERROR: $counter row with incorrect number of tabs. There are $tempTabs in this row and should be $numberTabs.\n";
286 if ($tempTabs > $numberTabs) {
288 print LOGFILE
"\t(This is likely secondary to a rogue tab character(s) on row $counter.)\n";
292 print LOGFILE
"\t(This is likely secondary to a rogue newline character(s) on row $counter or one row above.)\n";
302 # function to compare to original constants
303 # normal flag will fix constants escape characters to prepare for mysql dumpfile
304 # special flag will not fix escape characters to prepare for spreadsheet revision file
305 # param - flag (normal or special), array of processed file
306 # globals - @previousConstants, $constantRow, $de, LOGFILE,
307 # $constantIdColumn, $constantColumn
310 sub checkConstants
() {
311 my ($flag, @page) = @_;
313 print LOGFILE
"Checking constants:\n\n";
314 my $counter = $constantRow;
317 foreach my $var (@previousConstants) {
319 my @tempRow = split($de,$page[$counter]);
320 my $tempId = $tempRow[$constantIdColumn];
321 my $tempConstant = $tempRow[$constantColumn];
323 # ensure constant has not been altered
324 if ($var ne $tempConstant) {
325 print LOGFILE
"Following constant not same:\n";
326 print LOGFILE
"\toriginal- val:$var\n";
327 print LOGFILE
"\tspreadsheet- ID:$tempId val:$tempConstant\n";
332 if ($flag eq "normal") {
333 $fixedVar =~ s/\\//g;
334 $fixedVar =~ s/\'/\\\'/g;
335 $fixedVar =~ s/\"/\\\"/g;
337 $tempRow[$constantColumn] = $fixedVar;
338 $page[$counter] = join($de,@tempRow);
341 # ensure constant id number has not been altered
342 my $realID = ($counter - $constantRow + 1);
343 if ($realID != $tempId) {
345 print LOGFILE
"\nERROR: Constant ID number ".$realID." has been modified to ".$tempId."!!!\n\n";
347 # apply fix (replace with original after reset escape characters)
348 $tempRow[$constantIdColumn] = $realID;
349 $page[$counter] = join($de,@tempRow);
356 print LOGFILE
"\nDone checking constants:\n";
357 print LOGFILE
"\t".$badCount." mismatches found (known is ".$mismatchesKnown.")\n";
358 if ($badCount == $mismatchesKnown) {
359 print LOGFILE
"Good, constants weren't modified by translators\n\n";
362 print LOGFILE
"ERROR: Constant(s) have been modified by translators\n\n";
365 print LOGFILE
"ERROR: Constant ID number(s) have been modified by translators\n\n";
372 # function to build lang_languages dumpfile
373 # param - integer flag for utf8, array of processed file
374 # globals - $constantColumn, $constantRow,
375 # $languageNumRow, $languageIdRow, $languageNameRow,
376 # @numberConstantsLanguages, @languages
377 # return - output string
379 sub createLanguages
() {
380 my ($flag, @page) = @_;
392 my @numberRow = split($de,$page[$languageNumRow]);
393 my @idRow = split($de,$page[$languageIdRow]);
394 my @nameRow = split($de,$page[$languageNameRow]);
395 $tempReturn .= "INSERT INTO `lang_languages` (`lang_id`, `lang_code`, `lang_description`) VALUES\n";
396 for (my $i = $constantColumn; $i < @numberRow; $i++) {
397 $tempReturn .= "(".$numberRow[$i].", '".$idRow[$i]."', '".$nameRow[$i]."'),\n";
398 $tempCounter = $numberRow[$i];
400 # set up for statistics later
401 push (@languages, $nameRow[$i]);
402 $numberConstantsLanguages[$numberRow[$i]-1] = 0;
404 $tempReturn =~ s/,\n$/;\n/;
410 -- Table structure for table `lang_languages`
413 DROP TABLE IF EXISTS `lang_languages`;
414 CREATE TABLE `lang_languages` (
415 `lang_id` int(11) NOT NULL auto_increment,
416 `lang_code` char(2) NOT NULL default '',
417 `lang_description` varchar(100) default NULL,
418 UNIQUE KEY `lang_id` (`lang_id`)
419 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
422 -- Dumping data for table `lang_languages`
426 $return .= $tempReturn;
436 # function to build lang_constants dumpfile
437 # param - integer flag for utf, array of processed file
438 # globals - $constantColumn, $constantRow, $constantIdColumn, $totalConstants
441 sub createConstants
() {
443 my ($flag, @page) = @_;
455 $tempReturn .= "INSERT INTO `lang_constants` (`cons_id`, `constant_name`) VALUES\n";
456 for (my $i = $constantRow; $i < @page; $i++) {
457 my @tempRow = split($de,$page[$i]);
458 my $tempId = $tempRow[$constantIdColumn];
459 my $tempConstant = $tempRow[$constantColumn];
460 $tempReturn .= "(".$tempId.", '".$tempConstant."'),\n";
461 $tempCounter = $tempId;
463 $tempReturn =~ s/,\n$/;\n/;
469 -- Table structure for table `lang_constants`
472 DROP TABLE IF EXISTS `lang_constants`;
473 CREATE TABLE `lang_constants` (
474 `cons_id` int(11) NOT NULL auto_increment,
475 `constant_name` varchar(255) BINARY default NULL,
476 UNIQUE KEY `cons_id` (`cons_id`),
477 KEY `constant_name` (`constant_name`)
478 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
481 -- Dumping data for table `lang_constants`
485 $return .= $tempReturn;
491 # fill total constants for statistics later
492 $totalConstants = $tempCounter - 1;
498 # function to build lang_definitions dumpfile
499 # param - integer flag for utf8, array of processed file
500 # globals - $constantColumn, $constantRow,
501 # $languageNumRow, $constantIdColumn, @numberConstantsLanguages,
505 sub createDefinitions
() {
507 my ($flag, @page) = @_;
519 my @numberRow = split($de,$page[$languageNumRow]);
521 $tempReturn .= "INSERT INTO `lang_definitions` (`def_id`, `cons_id`, `lang_id`, `definition`) VALUES\n";
522 for (my $i = $constantColumn + 1; $i < @numberRow; $i++) {
523 for (my $j = $constantRow; $j < @page; $j++) {
524 my @tempRow = split($de,$page[$j]);
525 my $tempId = $tempRow[$constantIdColumn];
526 my $tempDefinition = $tempRow[$i];
527 my $tempLangNumber = $numberRow[$i];
528 if ($tempDefinition !~ /^\s*$/) {
529 $tempReturn .= "(".$counter.", ".$tempId.", ".$tempLangNumber.", '".$tempDefinition."'),\n";
530 $tempCounter = $counter;
533 # set up for statistics
534 $numberConstantsLanguages[($tempLangNumber - 1)] += 1;
538 $tempReturn =~ s/,\n$/;\n/;
544 -- Table structure for table `lang_definitions`
547 DROP TABLE IF EXISTS `lang_definitions`;
548 CREATE TABLE `lang_definitions` (
549 `def_id` int(11) NOT NULL auto_increment,
550 `cons_id` int(11) NOT NULL default '0',
551 `lang_id` int(11) NOT NULL default '0',
552 `definition` mediumtext CHARACTER SET utf8,
553 UNIQUE KEY `def_id` (`def_id`),
554 KEY `cons_id` (`cons_id`)
555 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
558 -- Dumping data for table `lang_definitions`
562 $return .= $tempReturn;
568 # fill total definitions for statistics later
569 $totalDefinitions = $tempCounter - 1;
574 # Function to drop decimals
575 # param: 1st is number, 2nd is nubmer of desired decimals
577 my ($value,$precision) = @_;
581 my $s = sprintf("%.${precision}f", $value);