2 # This script reads GIZA++ alignments and processes them, giving
3 # as output a more human- (and machine-) readable format with the
4 # same information. From this new format it is easier to construct an
5 # alignment matrix between source and target sentences.
9 # alignment_score | source_sentence | target_sentence | alignment
12 # 5.07501e-05 | Reanudación del período de sesiones | Resumption of the session | 0:0 1:2 3:1 4:3
14 # Open question: What do we do with NULL words?
15 # In this version they're ignored
17 # author: Felipe Sánchez-Martínez
# process_alignment(str): parse one GIZA++ alignment line.
#
# str is split on the separator " }) " into tokens[]; each token is then
# split on single spaces into wa[]. wa[1] is appended to the global
# source_sentence (space-separated), and for every wa[j] with j >= 3 the
# pair sl_pos ":" (wa[j] - 1) is appended to the global alignment string
# (space-separated).
#
# Notes:
#  - the loop starts at j = 3, so wa[2] is never read; presumably it holds
#    the "({" delimiter -- confirm against the GIZA++ output format.
#  - wa[j] - 1 presumably converts 1-based GIZA++ target positions into
#    0-based indices -- confirm.
#  - a token whose first word is "NULL" is ignored.
#  - errors are reported on /dev/stderr, citing the variable `line`.
#  - ntokens, tokens, nwa, wa, i and j are not declared as extra parameters
#    in the signature, so they are global variables in AWK.
20 function process_alignment
(str
) {
24 ntokens=
split(str
, tokens
, " }) ");
28 for(i=
1; i
<=ntokens
; i
++) {
29 if (length(tokens
[i
])==
0)
32 nwa=
split(tokens
[i
], wa
, " ");
34 print "Error while processing the alignment information at input line " line
> "/dev/stderr";
38 if (wa
[1] ==
"NULL") # NULL is ignored
41 if(length(source_sentence
)>0) {
42 source_sentence = source_sentence
" ";
44 source_sentence = source_sentence wa
[1];
46 for (j=
3; j
<=nwa
; j
++) {
47 if (length(alignment
)>0) {
48 alignment = alignment
" ";
50 alignment = alignment sl_pos
":" wa
[j
]-1;
58 for(i=
1;i
<=
length(w
);i
++){
59 if(substr(w
,i
,1)~
/[ \t\r\n]/);
64 for(i=
length(w
);i
>=
1;i
--){
65 if(substr(w
,i
,1)~
/[ \t\r\n]/);
71 return substr(w
,liminf
,limsup
-liminf
+1);
86 else if (reading_al==
1)
89 process_alignment
($
0);
90 print alignment_score
" | " trim
(source_sentence
) " | " trim
(target_sentence
) " | " alignment
;