2 # This script reads a corpus that has already been analysed. It preprocesses
3 # the corpus before the word alignments are trained with GIZA++.
5 # It removes all characters that do not belong to Apertium words and replaces
6 # each space within an Apertium word with the character '_'.
12 # author: Felipe Sánchez-Martínez
# trim(palabra)
#
# Returns a copy of `palabra` with every leading and trailing blank removed.
# A blank is any of: space, tab, carriage return, newline. Blanks in the
# middle of the string are left untouched.
#
#   palabra  the string to be trimmed
#
# Returns the trimmed string; an empty string if `palabra` is empty or
# consists only of blanks.
#
# AWK passes scalar arguments by value, so editing `palabra` in place with
# sub() does not modify the caller's variable. No global variable is written,
# which means calling trim() from inside a loop cannot clobber the caller's
# loop index.
function trim(palabra) {
  # Drop the run of blanks anchored at the start of the string, if any.
  sub(/^[ \t\r\n]+/, "", palabra);
  # Drop the run of blanks anchored at the end of the string, if any.
  sub(/[ \t\r\n]+$/, "", palabra);
  return palabra;
}
34 for (j=
1; j
<=
NF; j
++) {
36 w=
substr(w
,index(w
,"^"));
38 if ((length(w
)>0) && (index(w
,"^")>0)) {