Add another
[apertium.git] / apertium-eval-translator-alternate / apertium-eval
blobe93b7bb0bbaf4454a90bd4083568a43d4c3411ae
1 #!/bin/sh
2 # (c) 2007 Mikel L. Forcada
4 # This script approximates the word error rate (WER) between
5 # a translation
6 # performed by the apertium MT system and a reference translation
7 # obtained by post-editing the system output; it also uses the original
8 # text to compute the coverage of the system's dictionaries.
9 #
10 # It "approximates" because it uses "diff -d", which is an approximation
11 # to the real edit distance.
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License (http://www.gnu.org/licenses/gpl.txt)
17 # for more details.
# percentage PART TOTAL
#
# Print PART/TOTAL as a percentage with exactly two decimal places,
# followed by "%"; for example "percentage 1 3" prints "33.33%".
# Only integer shell arithmetic is used, so the value is truncated rather
# than rounded.
#
# Arguments: $1 - numerator (integer)
#            $2 - denominator (integer)
# Outputs:   the formatted percentage on stdout
# Returns:   1, with a message on stderr and nothing on stdout, when the
#            denominator is missing or zero
percentage() {
  local whole hundredths

  if [ "${2:-0}" -eq 0 ]; then
    printf 'percentage: cannot divide %s by zero\n' "${1:-0}" >&2
    return 1
  fi

  # Integer part of the percentage.
  whole=$(( 100 * $1 / $2 ))
  # The two digits after the decimal point: the ratio expressed in
  # hundredths of a percent, minus the integer part computed above.
  hundredths=$(( 10000 * $1 / $2 - 100 * whole ))

  # %02d pads only single-digit fractions with a leading zero, so a
  # fraction of exactly 10 is printed as ".10" and 5 as ".05".
  printf '%d.%02d%%\n' "$whole" "$hundredths"
}
# diffprocessor
#
# Read the default ("normal" format) output of "diff -d" on stdin and
# print the total number of one-line edit operations it describes:
# substitutions + insertions + deletions.  The files being compared hold
# one word per line, so the result is a count of one-word edits.
#
# Only the change-command lines (such as "3d2", "0a1,2" or "4,6c4,5") are
# interpreted; every other input line is ignored.
#
# Arguments: none
# Outputs:   a single integer on stdout (0 for empty input)
diffprocessor() {
  awk '
    # Split each change command on its operation letter and on commas, so
    # that the fields are the line numbers: "3,5c7,9" gives 3, 5, 7, 9.
    BEGIN { FS = "[acd,]" }

    # nd  number of deletions
    # ni  number of insertions
    # ns  number of substitutions

    # Deletions: "Nd.." removes one line, "N,Md.." removes M-N+1 lines.
    /^[0-9]+d[0-9]+$/        { nd++ }
    /^[0-9]+,[0-9]+d[0-9]+$/ { nd += $2 - $1 + 1 }

    # Insertions: "..aN" adds one line, "..aN,M" adds M-N+1 lines.
    /^[0-9]+a[0-9]+$/        { ni++ }
    /^[0-9]+a[0-9]+,[0-9]+$/ { ni += $3 - $2 + 1 }

    # Changes: l lines on the left are replaced by r lines on the right.
    # Lines that pair up count as substitutions; the surplus on the left
    # counts as deletions and the surplus on the right as insertions.
    /^[0-9]+,[0-9]+c[0-9]+,[0-9]+$/ {
      l = $2 - $1 + 1
      r = $4 - $3 + 1
      if (l == r)     { ns += l }
      else if (l > r) { ns += r; nd += l - r }
      else            { ns += l; ni += r - l }
    }

    # Several left lines replaced by a single right line.
    /^[0-9]+,[0-9]+c[0-9]+$/ {
      l = $2 - $1 + 1
      if (l == 1)     { ns++ }
      else if (l > 1) { ns++; nd += l - 1 }
    }

    # A single left line replaced by several right lines.
    /^[0-9]+c[0-9]+,[0-9]+$/ {
      r = $3 - $2 + 1
      if (r == 1)     { ns++ }
      else if (r > 1) { ns++; ni += r - 1 }
    }

    # One line replaced by one line.
    /^[0-9]+c[0-9]+$/ { ns++ }

    # Unset counters evaluate to 0 in the sum, so empty input prints 0.
    END { print ns + ni + nd }
  '
}
# ---------------------------------------------------------------------
# Main script
#
# Usage:   <script> <basefilename>
# Reads:   <basefilename>.orig   original text
#          <basefilename>.raw    raw translation
#          <basefilename>.corr   corrected (post-edited) translation
# Outputs: a report on stdout; the figures are also appended, one per
#          line, to <basefilename>.aux
# Exits:   1 on wrong usage, unreadable input or input without words
# ---------------------------------------------------------------------

# split_words: copy stdin to stdout with every space turned into a
# newline and empty lines dropped, giving one word per line.
split_words() {
  tr ' ' '\012' | grep -v '^$'
}

# count_lines FILE: print the number of lines in FILE.  The arithmetic
# expansion strips the padding some wc implementations put before the
# number.
count_lines() {
  echo "$(( $(wc -l <"$1") ))"
}

case $# in
  1)
    FILENAME=$1
    ;;
  *)
    echo "USAGE: $(basename "$0") <basefilename>"
    echo "basefilename Base filename for evaluation (no extensions)"
    echo ".orig=original text "
    echo ".raw=raw translation "
    echo ".corr=corrected translation "
    exit 1
    ;;
esac

# Fail early with a clear message instead of reporting on missing input.
for ext in orig raw corr; do
  if [ ! -r "$FILENAME.$ext" ]; then
    printf '%s: cannot read %s\n' "$(basename "$0")" "$FILENAME.$ext" >&2
    exit 1
  fi
done

# Keep the word lists in a private scratch directory so that no file next
# to the inputs is overwritten or deleted, and remove it on every exit
# path.
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "${workdir:?}"' EXIT
trap 'exit 1' HUP INT TERM

orig_words=$workdir/orig.words
raw_starred=$workdir/raw.starred.words
raw_plain=$workdir/raw.plain.words
corr_words=$workdir/corr.words

# Generate original file with one word per line and no blank lines
split_words <"$FILENAME.orig" >"$orig_words"
norigwords=$(count_lines "$orig_words")

# Generate raw file with one word per line and no blank lines
split_words <"$FILENAME.raw" >"$raw_starred"
nrawwords=$(count_lines "$raw_starred")

# Remove stars (avoid second translation) and
# generate raw file with one word per line and no blank lines
sed 's/[*]//g' "$FILENAME.raw" | split_words >"$raw_plain"

# Convert corrected translation to 1 word per line, no blank lines
split_words <"$FILENAME.corr" >"$corr_words"

# Both counts are used as denominators below; refuse empty input before
# anything is reported or appended to the .aux file.
if [ "$norigwords" -eq 0 ] || [ "$(count_lines "$raw_plain")" -eq 0 ]; then
  printf '%s: %s or %s contains no words\n' \
    "$(basename "$0")" "$FILENAME.orig" "$FILENAME.raw" >&2
  exit 1
fi

# Count stars: words of the raw translation that contain "*"
norigunknown=$(grep -c '[*]' "$raw_starred")
printf '%s\n' "$norigunknown" >>"$FILENAME.aux"
printf '%s\n' "Number of words in raw translation to be corrected $FILENAME.raw : $nrawwords"
printf '%s\n' "Number of unknown words in $FILENAME.orig : $norigunknown"

# Compute percentage of unknown words, echo and store
forigunknown=$(percentage "$norigunknown" "$norigwords")
printf '%s\n' "$forigunknown" >>"$FILENAME.aux"
printf '%s\n' "Percentage of unknown words in $FILENAME.raw : $forigunknown"

# Count words in raw translation (stars removed)
nrawwords=$(count_lines "$raw_plain")

# Make difference of two files and compute number of 1-word edit
# operations (with stars)
nedits_stars=$(diff -d "$raw_starred" "$corr_words" | diffprocessor)

# Make difference of two files and compute number of 1-word edit
# operations (without stars)
nedits_nostars=$(diff -d "$raw_plain" "$corr_words" | diffprocessor)

# Start reporting
printf '%s\n' "Report for corrected file $FILENAME.corr"
printf '%s\n' "Number of words in raw translation : $nrawwords"
printf '\n' >>"$FILENAME.aux"
printf '%s\n' "$nrawwords" >>"$FILENAME.aux"
printf '%s\n' "Number of 1-word edit operations needed : $nedits_nostars"
printf '%s\n' "$nedits_nostars" >>"$FILENAME.aux"

# Compute "error rate"
error_rate=$(percentage "$nedits_nostars" "$nrawwords")
printf '%s\n' "Percent error rate : $error_rate"
printf '%s\n' "$error_rate" >>"$FILENAME.aux"

# Number of "free rides" (unknown words which came out right)
nfree=$(( nedits_stars - nedits_nostars ))
error_rate2=$(percentage "$nedits_stars" "$nrawwords")
printf '%s\n' "Number of unknown words which were free rides : $nfree"
printf '%s\n' "Number of 1-word edit operations needed (incl. unknown) :$nedits_stars"
printf '%s\n' "Percent error rate taking unknown words into account : $error_rate2"
printf '%s\n' "$nfree" >>"$FILENAME.aux"

# Clean up: the scratch directory is removed by the EXIT trap set above.