Language trans.
[apertium.git] / apertium-transfer-tools / src / giza++A3-to-apertium.awk
blob86254dcab62b05980267f2f6cb70de51b3f89741
2 # This script reads GIZA++ alignments and processes them, giving
3 # as output a more human- (and machine-) readable format with the
4 # same information. From this new format it is easier to construct an
5 # alignment matrix between source and target sentences.
8 # Output format:
9 # alignment_score | source_sentence | target_sentence | alignment
11 # Example:
12 # 5.07501e-05 | Reanudación del período de sesiones | Resumption of the session | 0:0 1:2 3:1 4:3
14 # Open question: What do we do with NULL words?
15 # In this version they're ignored
17 # author: Felipe Sánchez-Martínez
# Parses one alignment line made of groups of the form
#     word ({ p1 p2 ... })
# and stores the result in two GLOBAL variables read by the caller:
#   source_sentence - the words, joined by single blanks
#   alignment       - blank-separated "s:t" pairs, where s is the 0-based
#                     index of the word among the kept words and t is each
#                     listed position minus one
#
# Parameters:
#   str - the raw alignment line
# The names after the gap in the parameter list are awk-style locals; callers
# pass only str. Keeping them local stops this function from clobbering
# same-named globals (e.g. the counter i).
#
# Groups whose word is "NULL" are ignored and do not advance the word index.
# NOTE(review): a real word spelled "NULL" is dropped as well - confirm that
# this is acceptable.
#
# On a group with fewer than two fields an error mentioning the global input
# line counter "line" is written to /dev/stderr and the program exits with
# status 1.
function process_alignment(str,    tokens, ntokens, wa, nwa, i, j, sl_pos) {
    source_sentence = "";
    alignment = "";

    # Make sure the closing "})" of the last group is followed by exactly one
    # blank, so that it is consumed as a separator even when the trailing
    # blank was stripped or the line ends in CR. Otherwise "})" would stay in
    # the last token and be emitted as a bogus "pos:-1" link.
    if (str ~ /[}][)][ \t\r]*$/)
        sub(/[ \t\r]*$/, " ", str);

    # A multi-character separator is a regular expression; bracket
    # expressions keep "}" and ")" literal in every awk implementation.
    ntokens = split(str, tokens, " [}][)] ");

    sl_pos = 0;

    for (i = 1; i <= ntokens; i++) {
        if (length(tokens[i]) == 0)
            continue;

        # Fields: wa[1] = word, wa[2] = group opener (presumably "({"),
        # wa[3..nwa] = aligned positions.
        nwa = split(tokens[i], wa, " ");
        if (nwa < 2) {
            print "Error while processing the alignment information at input line " line > "/dev/stderr";
            exit 1;
        }

        if (wa[1] == "NULL") # NULL is ignored
            continue;

        if (length(source_sentence) > 0)
            source_sentence = source_sentence " ";
        source_sentence = source_sentence wa[1];

        for (j = 3; j <= nwa; j++) {
            if (length(alignment) > 0)
                alignment = alignment " ";
            # Listed positions are 1-based; the output uses 0-based ones.
            alignment = alignment sl_pos ":" (wa[j] - 1);
        }

        sl_pos++;
    }
}
# Returns w with leading and trailing blanks, tabs, carriage returns and
# newlines removed. A string made only of such characters yields "".
#
# Parameters:
#   w - the string to trim; scalars are passed by value in awk, so the
#       caller's variable is not modified by the sub() calls below.
#
# Unlike a hand-written scan with loop counters, this version touches no
# global variables (in particular not the counter i used by other code).
function trim(w) {
    sub(/^[ \t\r\n]+/, "", w);
    sub(/[ \t\r\n]+$/, "", w);
    return w;
}
# Initialise the global state before any input is read.
BEGIN {
    # Numeric state: input line counter, score of the current record, and
    # the position (0, 1 or 2) of the next line inside a three-line record.
    line = alignment_score = reading_al = 0;

    # Text state, filled in while a record is being read.
    target_sentence = source_sentence = alignment = "";
}
# Main rule: input is consumed in groups of three lines, tracked by
# reading_al:
#   0 - the last field of the line is taken as the alignment score
#   1 - the whole line is the target sentence
#   2 - the line is handed to process_alignment(), then one output record is
#       printed as
#       alignment_score | source_sentence | target_sentence | alignment
{
    line += 1;

    if (reading_al == 1) {
        target_sentence = $0;
    } else if (reading_al == 0) {
        alignment_score = $NF;
    } else {
        process_alignment($0);
        print alignment_score " | " trim(source_sentence) " | " trim(target_sentence) " | " alignment;
    }

    # Advance to the next line of the record, wrapping after the third one.
    reading_al = (reading_al + 1 > 2) ? 0 : reading_al + 1;
}