Language trans.
[apertium.git] / apertium-transfer-tools / src / preprocess-corpus.awk
blob88dee8b443894be18222ce0946abe0b0fb88a9c0
2 # This script reads a corpus that has been analysed. It preprocesses the corpus
3 # before training the word alignments using GIZA++.
4 #
5 # It removes all characters not belonging to Apertium words and replaces each
6 # space within Apertium words with the character '_'.
7 #
8 # Example of input:
9 #
10 # Example of output:
12 # author: Felipe Sánchez-Martínez
# trim(palabra)
#
# Returns a copy of `palabra` with every leading and trailing space, tab,
# carriage return and newline removed. Interior whitespace is left untouched.
# An empty or all-whitespace argument yields the empty string.
#
# AWK passes scalar arguments by value, so editing `palabra` in place here
# does not modify the caller's variable or field. No global variable is
# written by this function, so callers may freely use any loop index.
function trim(palabra) {
    sub(/^[ \t\r\n]+/, "", palabra);   # strip the leading run of whitespace
    sub(/[ \t\r\n]+$/, "", palabra);   # strip the trailing run of whitespace
    return palabra;
}
# Split every input line on the literal '$' character, so that each field
# holds the text that precedes one '$' in the line.
BEGIN { FS = "\\$" }

# Main rule, run once per input line.
#
# For each field: trim surrounding whitespace, discard everything before the
# first '^', and, if a '^' was actually present, replace the spaces inside the
# remaining text by '_' and append it (followed by '$') to the output line.
# Kept units are separated by a single space.
#
# A line is always printed, even when it ends up empty.
# NOTE(review): presumably this keeps the output line-aligned with the input
# for the later word-alignment step -- confirm before changing.
{
    out = "";
    sep = "";
    for (k = 1; k <= NF; k++) {
        unit = trim($k);
        unit = substr(unit, index(unit, "^"));

        # Skip fields that are empty or contain no '^' at all.
        if (length(unit) == 0 || index(unit, "^") == 0)
            continue;

        gsub(" ", "_", unit);
        out = out sep unit "$";
        sep = " ";   # every unit after the first is preceded by one space
    }
    print out;
}