Additional mods to previous commit (still discussing UTF8 enforcing)
[openemr.git] / contrib / util / language_translations / buildLanguageDatabase.pl
blob7589b58d7e8dada4281785ca237c17f1bfc67dda
1 #!/usr/bin/perl
3 # Copyright (C) 2009-2013 Brady Miller <brady@sparmy.com>
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # Author Brady Miller <brady@sparmy.com>
11 # Author Ramin Moshiri <raminmoshiri@gmail.com>
13 # This is a perl script that will build the language translation sql
14 # dumpfiles from the tab delimited language translation spreadsheet.
15 # It will create two output dumpfiles:
16 # languageTranslations_utf8.sql
17 # languageTranslations_latin1.sql (minus utf languages)
18 # It will also validate the spreadsheet and create a new spreadsheet
19 # that can be used for further downstream editing and re-importing
20 # back into Google Docs. It also outputs a logfile log.txt with
21 # errors in validation and database statistics.
23 # Example command:
25 # -Below command will build the sql dumpfile from given tsv
26 # language spreadsheet and compare with a constants list to
27 # ensure constants didn't get edited (output will go to
28 # the log file), and will also fix limited issues. In this case
29 # a new spreadsheet file will also be created with the corrected
30 # constants to allow downstream modification and re-importing of
31 # file into Google Docs:
32 # ./buildLanguageDatabase.pl openemr_language_table.tsv constants.txt
36 use strict;
38 # Array to hold languages to skip for the latin1 translation file
39 # (pending)
41 # Put current known constant mismatches here, which are basically
42 # constants that get modified during the pipeline and don't look
43 # like originals in the end. If this number increases, a english constant
44 # was likely modified in the spreadsheet, and can then use log output
45 # to localize and fix the problem. As of list of 3.0.1 constants
46 # the known number of mismatched constants is 57 .
47 my $mismatchesKnown = 95;
49 # Hold variables to calculate language database statistics
50 my $totalConstants;
51 my $totalDefinitions;
52 my @languages;
53 my @numberConstantsLanguages;
55 # Main variables
56 my $de = "\t";
57 my $filenameOut;
58 my $inputFilename;
59 my $logFile = "log.txt";
60 my $stats = "stats.txt";
61 my $constantIdColumn = 0; # 0 is lowest
62 my $constantColumn = 1; # 0 is lowest
63 my $constantRow = 5; # 0 is lowest
64 my $languageNumRow = 0; # 0 is lowest
65 my $languageIdRow = 1; # 0 is lowest
66 my $languageNameRow = 2; # 0 is lowest
68 # variables for checking/fixing constants application
69 my $checkFilename; # holds list of constants if checking
70 my $filenameOut_revised = "revisedSpreadsheet.tsv";
71 my $flagCheck = 0;
72 my @previousConstants;
73 my @inputFile;
74 my @revisedFile;
75 my @inputFileProcessed;
77 # to hold utf8 flag
78 my $utf8;
80 # open output file
81 open(LOGFILE, ">$logFile") or die "unable to open log file";
83 # open output file
84 open(STATFILE, ">$stats") or die "unable to open stats file";
86 # collect parameters
87 if (@ARGV > 2) {
88 die "\nERROR: Too many parameters. Follow instructions found in buildLanguageDatabase.pl file.\n\n";
90 elsif (@ARGV < 2) {
91 die "\nERROR: Need more parameter(s). Follow instructions found in buildLanguageDatabase.pl file.\n\n";
93 elsif (@ARGV == 2) {
94 $flagCheck = 1;
95 $checkFilename = $ARGV[1];
96 $inputFilename = $ARGV[0];
98 else {
99 print LOGFILE "ERROR: with parameters\n\n";
102 # if checking, then open check file and store in array
103 if ($flagCheck) {
104 open(MYINPUTFILE, "<$checkFilename") or die "unable to open file";
105 @previousConstants = <MYINPUTFILE>;
106 close(MYINPUTFILE);
109 # place spreadsheet into array
110 open(MYINPUTFILE2, "<$inputFilename") or die "unable to open file";
111 @inputFile = <MYINPUTFILE2>;
112 close(MYINPUTFILE2);
114 # Clean up spreadsheet
115 # FIRST, remove newlines, blank lines, escape characters, and windows returns
116 # SECOND, place the escape characters in all required sql characters
117 foreach my $tempLine (@inputFile) {
118 chomp($tempLine);
119 if ($tempLine !~ /^\s*$/) {
120 # remove ^M characters (windows line feeds)
121 $tempLine =~ s/\r//g;
123 # remove all escape characters
124 $tempLine =~ s/\\//g;
126 # place all required escape characters
127 $tempLine =~ s/\'/\\\'/g;
128 $tempLine =~ s/\"/\\\"/g;
130 # push into new array
131 push (@inputFileProcessed,$tempLine);
135 # check spreadsheet for rogue tabs and newlines
136 # (the last column needs to be full for this to work
137 # correctly, such as the dummy language)
138 quickCheckStructure(@inputFileProcessed);
140 # check and fix modified constants (and constant id's)
141 if ($flagCheck) {
142 # first create data for replacement spreadsheet if needed
143 @revisedFile = checkConstants("special",@inputFileProcessed);
144 # then clean data to create mysql dumpfiles
145 @inputFileProcessed = checkConstants("normal",@inputFileProcessed);
148 # run through twice to make a utf8 table and a latin1 table
149 # revised spreadsheet. Build statistics and revised
150 # spreadsheet during utf8 run.
151 for (my $i=0;$i<2;$i++) {
153 # set utf flag
154 if ($i == 0) {
155 # build utf8 table
156 $filenameOut = "languageTranslations_utf8.sql";
157 $utf8 = 1;
159 else {
160 # build latin1 table
161 $filenameOut = "languageTranslations_latin1.sql";
162 $utf8 = 0
165 # open output file
166 open(OUTPUTFILE, ">$filenameOut") or die "unable to open output file";
168 my $outputString = "";
170 # add UTF8 set names for both utf8 and latin1 encoding, since
171 # the dumpfile is encoded in UTF8
172 $outputString .= "\
174 -- Ensure correct encoding
177 $outputString .= "SET NAMES utf8;\n\n";
179 # parse lang_languages
180 $outputString .= createLanguages($utf8, @inputFileProcessed);
182 # parse lang_constants
183 $outputString .= createConstants($utf8, @inputFileProcessed);
185 # parse lang_definitions
186 $outputString .= createDefinitions($utf8, @inputFileProcessed);
188 print OUTPUTFILE $outputString;
190 # calculate statistics
191 if ($utf8) {
192 my $count = 0;
193 my $countLanguages = 0;
194 my $subtractDefinitions = 0;
195 my @tempArray;
196 my @statArray;
197 foreach my $var (@languages) {
198 # push all info into the log file
199 push (@tempArray, $var.": ".fstr((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
200 if ($var eq "dummy") {
201 # do not count dummy language or dummy language constants
202 $subtractDefinitions += $numberConstantsLanguages[$count];
204 else {
205 if ($numberConstantsLanguages[$count] > 0) {
206 # only count non-empty languages in total count
207 $countLanguages += 1;
208 # only include non-empty and non-dummy languages in stats
209 push (@statArray, $var.": ".fstr((($numberConstantsLanguages[$count]/$totalConstants)*100),2)."% (".$numberConstantsLanguages[$count]." definitions)\n");
212 $count += 1;
214 print LOGFILE "\nLanguage Statistics:\n";
215 print STATFILE "\nLanguage Statistics:\n";
217 # Report total number of real non empty languages
218 print LOGFILE "Total number of languages with translations: ".$countLanguages."\n";
219 print STATFILE "Total number of languages with translations: ".$countLanguages."\n";
221 # Report total number of constants
222 print LOGFILE "Total number of constants: ".$totalConstants."\n";
223 print STATFILE "Total number of constants: ".$totalConstants."\n";
225 # Report total number of real definitions
226 print LOGFILE "Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
227 print STATFILE "Total number of real definitions: ".($totalDefinitions-$subtractDefinitions)."\n";
229 # Send log stat info
230 my @sorted_tempArray = sort { lc($a) cmp lc($b) } @tempArray;
231 foreach my $var (@sorted_tempArray) {
232 print LOGFILE $var;
235 # Send official stat info
236 my @sorted_statArray = sort { lc($a) cmp lc($b) } @statArray;
237 foreach my $var (@sorted_statArray) {
238 print STATFILE $var;
242 # send the processed spreadsheet to file to allow downstream modifications
243 # if checking and fixing modified constants
244 if ($flagCheck && $utf8) {
245 open(MYOUTPUTFILE2, ">$filenameOut_revised") or die "unable to open file";
246 foreach my $var (@revisedFile) {
247 print MYOUTPUTFILE2 $var."\n";
249 close(MYOUTPUTFILE2)
252 # close files
253 close(OUTPUTFILE);
256 close(LOGFILE);
257 close(STATFILE);
261 # FUNCTIONS
266 # function to check spreadsheet for rogue tabs
267 # to work, the last column needs to be filled (such as a dummy language)
268 # will output errors to LOGFILE
269 # param - @arr array of spreadsheet
270 # globals - @inputFile, LOGFILE, $de, $languageNumRow
272 sub quickCheckStructure() {
273 my (@arr) = @_;
275 # use the languagNumRow as the standard for number of tabs
276 # on each row
277 my $numberColumns = split($de,$arr[$languageNumRow]);
278 my $numberTabs = $numberColumns - 1;
280 # ensure every row on spreadsheet has equal number of tabs
281 my $counter = 1;
282 foreach my $var (@arr) {
283 my $tempNumber = split($de,$var);
284 my $tempTabs = $tempNumber - 1;
285 if ($numberTabs != $tempTabs) {
286 print LOGFILE "\nERROR: $counter row with incorrect number of tabs. There are $tempTabs in this row and should be $numberTabs.\n";
287 if ($tempTabs > $numberTabs) {
288 # too many tabs
289 print LOGFILE "\t(This is likely secondary to a rogue tab character(s) on row $counter.)\n";
291 else {
292 # not enough tabs
293 print LOGFILE "\t(This is likely secondary to a rogue newline character(s) on row $counter or one row above.)\n";
296 $counter += 1;
299 return;
303 # function to compare to original constants
304 # normal flag will fix constants escape characters to prepare for mysql dumpfile
305 # special flag will not fix escape characters to prepare for spreadsheet revision file
306 # param - flag (normal or special), array of processed file
307 # globals - @previousConstants, $constantRow, $de, LOGFILE,
308 # $constantIdColumn, $constantColumn
309 # return - none
311 sub checkConstants () {
312 my ($flag, @page) = @_;
314 print LOGFILE "Checking constants:\n\n";
315 my $counter = $constantRow;
316 my $badCount = 0;
317 my $idErrorFlag = 0;
318 foreach my $var (@previousConstants) {
319 chomp($var);
320 my @tempRow = split($de,$page[$counter]);
321 my $tempId = $tempRow[$constantIdColumn];
322 my $tempConstant = $tempRow[$constantColumn];
324 # ensure constant has not been altered
325 if ($var ne $tempConstant) {
326 print LOGFILE "Following constant not same:\n";
327 print LOGFILE "\toriginal- val:$var\n";
328 print LOGFILE "\tspreadsheet- ID:$tempId val:$tempConstant\n";
329 $badCount += 1;
331 # apply fix
332 my $fixedVar = $var;
333 if ($flag eq "normal") {
334 $fixedVar =~ s/\\//g;
335 $fixedVar =~ s/\'/\\\'/g;
336 $fixedVar =~ s/\"/\\\"/g;
338 $tempRow[$constantColumn] = $fixedVar;
339 $page[$counter] = join($de,@tempRow);
342 # ensure constant id number has not been altered
343 my $realID = ($counter - $constantRow + 1);
344 if ($realID != $tempId) {
345 $idErrorFlag = 1;
346 print LOGFILE "\nERROR: Constant ID number ".$realID." has been modified to ".$tempId."!!!\n\n";
348 # apply fix (replace with original after reset escape characters)
349 $tempRow[$constantIdColumn] = $realID;
350 $page[$counter] = join($de,@tempRow);
353 # increment counter
354 $counter += 1;
357 print LOGFILE "\nDone checking constants:\n";
358 print LOGFILE "\t".$badCount." mismatches found (known is ".$mismatchesKnown.")\n";
359 if ($badCount == $mismatchesKnown) {
360 print LOGFILE "Good, constants weren't modified by translators\n\n";
362 else {
363 print LOGFILE "ERROR: Constant(s) have been modified by translators\n\n";
365 if ($idErrorFlag) {
366 print LOGFILE "ERROR: Constant ID number(s) have been modified by translators\n\n";
369 return @page;
373 # function to build lang_languages dumpfile
374 # param - integer flag for utf8, array of processed file
375 # globals - $constantColumn, $constantRow,
376 # $languageNumRow, $languageIdRow, $languageNameRow,
377 # @numberConstantsLanguages, @languages
378 # return - output string
380 sub createLanguages() {
381 my ($flag, @page) = @_;
382 my $charset;
383 if ($flag) {
384 $charset = "utf8";
386 else {
387 $charset = "latin1";
390 # create table input
391 my $tempReturn;
392 my $tempCounter;
393 my @numberRow = split($de,$page[$languageNumRow]);
394 my @idRow = split($de,$page[$languageIdRow]);
395 my @nameRow = split($de,$page[$languageNameRow]);
396 $tempReturn .= "INSERT INTO `lang_languages` (`lang_id`, `lang_code`, `lang_description`) VALUES\n";
397 for (my $i = $constantColumn; $i < @numberRow; $i++) {
398 $tempReturn .= "(".$numberRow[$i].", '".$idRow[$i]."', '".$nameRow[$i]."'),\n";
399 $tempCounter = $numberRow[$i];
401 # set up for statistics later
402 push (@languages, $nameRow[$i]);
403 $numberConstantsLanguages[$numberRow[$i]-1] = 0;
405 $tempReturn =~ s/,\n$/;\n/;
406 $tempCounter += 1;
408 # create header
409 my $return = "\
411 -- Table structure for table `lang_languages`
414 DROP TABLE IF EXISTS `lang_languages`;
415 CREATE TABLE `lang_languages` (
416 `lang_id` int(11) NOT NULL auto_increment,
417 `lang_code` char(2) NOT NULL default '',
418 `lang_description` varchar(100) default NULL,
419 UNIQUE KEY `lang_id` (`lang_id`)
420 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
423 -- Dumping data for table `lang_languages`
424 --\n\n";
426 # insert table input
427 $return .= $tempReturn;
429 # create footer
430 $return .= "
431 --\n\n";
433 return $return;
437 # function to build lang_constants dumpfile
438 # param - integer flag for utf, array of processed file
439 # globals - $constantColumn, $constantRow, $constantIdColumn, $totalConstants
440 # return - nothing
442 sub createConstants() {
443 my (@page) = @_;
444 my ($flag, @page) = @_;
445 my $charset;
446 if ($flag) {
447 $charset = "utf8";
449 else {
450 $charset = "latin1";
453 # create table input
454 my $tempReturn;
455 my $tempCounter;
456 $tempReturn .= "INSERT INTO `lang_constants` (`cons_id`, `constant_name`) VALUES\n";
457 for (my $i = $constantRow; $i < @page; $i++) {
458 my @tempRow = split($de,$page[$i]);
459 my $tempId = $tempRow[$constantIdColumn];
460 my $tempConstant = $tempRow[$constantColumn];
461 $tempReturn .= "(".$tempId.", '".$tempConstant."'),\n";
462 $tempCounter = $tempId;
464 $tempReturn =~ s/,\n$/;\n/;
465 $tempCounter += 1;
467 # create header
468 my $return = "\
470 -- Table structure for table `lang_constants`
473 DROP TABLE IF EXISTS `lang_constants`;
474 CREATE TABLE `lang_constants` (
475 `cons_id` int(11) NOT NULL auto_increment,
476 `constant_name` varchar(255) BINARY default NULL,
477 UNIQUE KEY `cons_id` (`cons_id`),
478 KEY `constant_name` (`constant_name`)
479 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
482 -- Dumping data for table `lang_constants`
483 --\n\n";
485 # insert table input
486 $return .= $tempReturn;
488 # create footer
489 $return .= "
490 --\n\n";
492 # fill total constants for statistics later
493 $totalConstants = $tempCounter - 1;
495 return $return;
499 # function to build lang_definitions dumpfile
500 # param - integer flag for utf8, array of processed file
501 # globals - $constantColumn, $constantRow,
502 # $languageNumRow, $constantIdColumn, @numberConstantsLanguages,
503 # $totalDefinitions
504 # return - nothing
506 sub createDefinitions() {
507 my (@page) = @_;
508 my ($flag, @page) = @_;
509 my $charset;
510 if ($flag) {
511 $charset = "utf8";
513 else {
514 $charset = "latin1";
517 # create table input
518 my $tempReturn;
519 my $tempCounter;
520 my @numberRow = split($de,$page[$languageNumRow]);
521 my $counter = 1;
522 $tempReturn .= "INSERT INTO `lang_definitions` (`def_id`, `cons_id`, `lang_id`, `definition`) VALUES\n";
523 for (my $i = $constantColumn + 1; $i < @numberRow; $i++) {
524 for (my $j = $constantRow; $j < @page; $j++) {
525 my @tempRow = split($de,$page[$j]);
526 my $tempId = $tempRow[$constantIdColumn];
527 my $tempDefinition = $tempRow[$i];
528 my $tempLangNumber = $numberRow[$i];
529 if ($tempDefinition !~ /^\s*$/) {
530 $tempReturn .= "(".$counter.", ".$tempId.", ".$tempLangNumber.", '".$tempDefinition."'),\n";
531 $tempCounter = $counter;
532 $counter += 1;
534 # set up for statistics
535 $numberConstantsLanguages[($tempLangNumber - 1)] += 1;
539 $tempReturn =~ s/,\n$/;\n/;
540 $tempCounter += 1;
542 # create header
543 my $return = "\
545 -- Table structure for table `lang_definitions`
548 DROP TABLE IF EXISTS `lang_definitions`;
549 CREATE TABLE `lang_definitions` (
550 `def_id` int(11) NOT NULL auto_increment,
551 `cons_id` int(11) NOT NULL default '0',
552 `lang_id` int(11) NOT NULL default '0',
553 `definition` mediumtext,
554 UNIQUE KEY `def_id` (`def_id`),
555 KEY `cons_id` (`cons_id`)
556 ) ENGINE=MyISAM AUTO_INCREMENT=".$tempCounter." ;
559 -- Dumping data for table `lang_definitions`
560 --\n\n";
562 # insert table input
563 $return .= $tempReturn;
565 # create footer
566 $return .= "
567 --\n\n";
569 # fill total definitions for statistics later
570 $totalDefinitions = $tempCounter - 1;
572 return $return;
575 # Function to drop decimals
576 # param: 1st is number, 2nd is nubmer of desired decimals
577 sub fstr () {
578 my ($value,$precision) = @_;
579 if ($value == 0) {
580 return "0";
582 my $s = sprintf("%.${precision}f", $value);
583 $s =~ s/\.?0*$//;
584 return $s;