Moving more modules
[apertium.git] / trunk / apertium-tools / apertium-utils / cognate-indux / cognate.cpp
blob884340a4af5ca6882a21e45bebae3bb1db9143cf
1 /*
2 * Copyright (C) 2007 Francis Tyers
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * TODO:
22 * change format of arguments to:
24 * cognate [ -m | -v | -V ] [input_file1 input_file2 [transliteration1 transliteration2]]
27 #include <fstream>
28 #include <iostream>
29 #include <string>
30 #include <vector>
31 #include <cstdlib>
32 #include <getopt.h>
34 #include "similarity.h"
35 #include "transliterate.h"
36 #include "XMLParseUtil.h"
38 using namespace std;
40 void usage(void);
41 void find_cognates(string file1, string file2, string trant1, string trant2, string measure);
43 int main(int argc, char **argv)
45 wstring wtext;
46 string first_file, second_file, trf_first, trf_second, measure;
47 string line, text;
49 if (NULL == setlocale(LC_ALL,"")) {
50 wcerr << L"Couldn't set locale." << endl;
51 return 0;
54 if(argc < 4) {
55 usage();
58 static struct option long_options[] = {
59 { "first-file", 0, 0, 'f' },
60 { "second-file", 0, 0, 's' },
61 { "transliterate-first", 0, 0, 't' },
62 { "transliterate-second", 0, 0, 'r' },
63 { "measure", 0, 0, 'm' },
64 { "verbose", 0, 0, 'v' },
65 { "version", 0, 0, 'V' },
66 { 0, 0, 0, 0 }
69 while(true) {
70 int option_index;
71 int c = getopt_long(argc, argv, "f:s:t:r:m:vV", long_options, &option_index);
73 if(c < 0) {
74 break;
77 switch(c) {
78 case 'v':
79 wcout << L"Verbose" << endl;
80 break;
81 case 'V':
82 wcout << L"Version" << endl;
83 usage();
84 break;
85 case 'f':
86 wcout << L"First file: " << optarg << endl;
87 first_file = optarg;
88 break;
89 case 's':
90 wcout << L"Second file: " << optarg << endl;
91 second_file = optarg;
92 break;
93 case 't':
94 wcout << L"First transliteration: " << optarg << endl;
95 trf_first = optarg;
96 break;
97 case 'r':
98 wcout << L"Second transliteration: " << optarg << endl;
99 trf_second = optarg;
100 break;
101 case 'm':
102 wcout << L"Measure: " << optarg << endl;
103 measure = optarg;
104 break;
106 default:
107 break;
111 find_cognates(first_file, second_file, trf_first, trf_second, measure);
113 return 0;
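/*
 * file1 = path of the first input file (read line by line, one entry per line)
 * file2 = path of the second input file (read line by line, one entry per line)
 * trant1 = transliteration table file for first file
 * trant2 = transliteration table file for second file
 * measure = string similarity measure
 */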
124 void find_cognates(string file1, string file2, string trant1, string trant2, string measure)
126 ifstream file1stream(file1.c_str());
127 ifstream file2stream(file2.c_str());
129 vector<wstring> file1words;
130 vector<wstring> file2words;
131 vector<wstring>::iterator iter;
133 string buf1, buf2, line;
134 wstring wbuf1, wbuf2;
136 if(!file1stream.is_open()) {
137 cerr << "Could not open " << file1 << endl;
138 return;
140 if(!file2stream.is_open()) {
141 cerr << "Could not open " << file2 << endl;
142 return;
145 iter = file1words.begin();
146 while(getline(file1stream, line)) {
147 iter = file1words.insert(iter, XMLParseUtil::towstring((const xmlChar *)line.c_str()));
150 iter = file2words.begin();
151 while(getline(file2stream, line)) {
152 iter = file2words.insert(iter, XMLParseUtil::towstring((const xmlChar *)line.c_str()));
155 Transliterator ttable_first = Transliterator(trant1); // read the file :after: setting the locale
156 Transliterator ttable_second = Transliterator(trant2);
158 wcout << file1words.size() << endl;
159 wcout << file2words.size() << endl;
161 for(unsigned int i = 0; i < file1words.size(); i++) {
162 for(unsigned int j = 0; j < file2words.size(); j++) {
163 float d = 0.0;
164 wstring wword1 = ttable_first.transliterate(file1words[i]);
165 wstring wword2 = ttable_second.transliterate(file2words[j]);
166 wcout << file1words[i] << " : " << file2words[j] << " : ";
167 wcout << wword1 << " : " << wword2 << " : ";
169 if(measure == "dice") { // choose similarity measure
170 d = dice_coefficient(wword1, wword2);
171 } else if(measure == "levenshtein") {
172 d = levenshtein_distance(wword1, wword2);
173 } else if(measure == "xdice") {
174 d = xdice_coefficient(wword1, wword2);
175 } else if(measure == "xxdice") {
176 d = xxdice_coefficient(wword1, wword2);
177 } else if(measure == "lcs") {
178 d = longest_common_subsequence(wword1, wword2);
179 } else if(measure == "lcsc") {
180 d = longest_common_subsequence_coefficient(wword1, wword2);
181 } else {
182 wcout << L"Default (m: " << XMLParseUtil::towstring((const xmlChar *)measure.c_str()) << ")" << endl;
183 return;
185 wcout << d << endl;
189 file1stream.close();
190 file2stream.close();
194 void usage(void)
196 cout << "Usage: ./cognate [ -v | -V ] -f <first file> -s <second file> -t <first transliteration table> -r <second transliteration table>" << endl;
197 cout << "Options:" << endl;
198 cout << " -m, --measure: String similarity measure (levenshtein, dice, xdice, xxdice, lcs, lcsc)." << endl;
199 cout << " -v, --verbose: Display verbose output." << endl;
200 cout << " -V, --version: Display version information." << endl;
201 exit(0);