apertium-transfer-tools/src/giza++A3-to-apertium.awk

   1
   2 # This script reads GIZA++ alignments and processes them, giving
   3 # as output a more human- (and machine-) readable format with the
   4 # same information. From this new format it is easier to construct an
   5 # alignment matrix between source and target sentences.
   6 #
   7 #
   8 # Output format:
   9 #     alignment_score | source_sentence | target_sentence | alignment
  10 #
  11 # Example:
  12 #     5.07501e-05 | Reanudación del período de sesiones | Resumption of the session | 0:0 1:2 3:1 4:3
  13 #
  14 # Open question: What do we do with NULL words?
  15 #                In this version they're ignored
  16 #
  17 # author: Felipe Sánchez-Martínez
  18
  19
  20 function process_alignment(str) {
  21   source_sentence="";
  22   alignment="";
  23
  24   ntokens=split(str, tokens, " }) ");
  25
  26   sl_pos=0;
  27
  28   for(i=1; i<=ntokens; i++) {
  29     if (length(tokens[i])==0)
  30       continue;
  31
  32     nwa=split(tokens[i], wa, " ");
  33     if (nwa<2) {
  34       print "Error while processing the alignment information at input line " line > "/dev/stderr";
  35       exit 1;
  36     }
  37
  38     if (wa[1] == "NULL") # NULL is ignored
  39       continue;
  40
  41     if(length(source_sentence)>0) {
  42       source_sentence = source_sentence " ";
  43     }
  44     source_sentence = source_sentence wa[1];
  45
  46     for (j=3; j<=nwa; j++) {
  47       if (length(alignment)>0) {
  48         alignment = alignment " ";
  49       }
  50       alignment = alignment sl_pos ":" wa[j]-1;
  51     }
  52
  53     sl_pos++;
  54   }
  55 }
  56
  57 function trim(w) {
  58    for(i=1;i<=length(w);i++){
  59      if(substr(w,i,1)~/[ \t\r\n]/);
  60      else break;
  61    }
  62    liminf=i;
  63
  64    for(i=length(w);i>=1;i--){
  65      if(substr(w,i,1)~/[ \t\r\n]/);
  66      else break;
  67    }
  68
  69    limsup=i;
  70
  71    return substr(w,liminf,limsup-liminf+1);
  72 }
  73
  74 BEGIN {
  75   line=0;
  76   alignment_score=0;
  77   target_sentence="";
  78   source_sentence="";
  79   alignment="";
  80   reading_al=0;
  81 }
  82 {
  83   line++;
  84   if (reading_al==0)
  85     alignment_score=$NF;
  86   else if (reading_al==1)
  87     target_sentence=$0;
  88   else {
  89     process_alignment($0);
  90     print alignment_score " | " trim(source_sentence) " | " trim(target_sentence) " | " alignment;
  91   }
  92
  93   reading_al++;
  94   if (reading_al>2)
  95     reading_al=0;
  96 }