2 * Copyright (C) 2007 Francis Tyers
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * change format of arguments to:
24 * cognate [ -m | -v | -V ] [input_file1 input_file2 [transliteration1 transliteration2]]
34 #include "similarity.h"
35 #include "transliterate.h"
36 #include "XMLParseUtil.h"
// Forward declaration so that main() can call find_cognates().
//   file1, file2   : paths of the two input files
//   trant1, trant2 : transliteration table files for the first / second file
//   measure        : name of the string similarity measure to apply
void find_cognates(string file1, string file2, string trant1, string trant2, string measure);
43 int main(int argc
, char **argv
)
46 string first_file
, second_file
, trf_first
, trf_second
, measure
;
49 if (NULL
== setlocale(LC_ALL
,"")) {
50 wcerr
<< L
"Couldn't set locale." << endl
;
58 static struct option long_options
[] = {
59 { "first-file", 0, 0, 'f' },
60 { "second-file", 0, 0, 's' },
61 { "transliterate-first", 0, 0, 't' },
62 { "transliterate-second", 0, 0, 'r' },
63 { "measure", 0, 0, 'm' },
64 { "verbose", 0, 0, 'v' },
65 { "version", 0, 0, 'V' },
71 int c
= getopt_long(argc
, argv
, "f:s:t:r:m:vV", long_options
, &option_index
);
79 wcout
<< L
"Verbose" << endl
;
82 wcout
<< L
"Version" << endl
;
86 wcout
<< L
"First file: " << optarg
<< endl
;
90 wcout
<< L
"Second file: " << optarg
<< endl
;
94 wcout
<< L
"First transliteration: " << optarg
<< endl
;
98 wcout
<< L
"Second transliteration: " << optarg
<< endl
;
102 wcout
<< L
"Measure: " << optarg
<< endl
;
111 find_cognates(first_file
, second_file
, trf_first
, trf_second
, measure
);
119 * trant1 = transliteration table file for first file
120 * trant2 = transliteration table file for second file
121 * measure = string similarity measure
// Reads every line of file1 and file2, transliterates the lines with the
// tables built from trant1 / trant2, and writes each (file1 line, file2 line)
// pair to wcout together with both transliterations. `measure` selects which
// similarity function is applied to the transliterated pair; the result is
// stored in d.
124 void find_cognates(string file1
, string file2
, string trant1
, string trant2
, string measure
)
// Open both input files by path.
126 ifstream
file1stream(file1
.c_str());
127 ifstream
file2stream(file2
.c_str());
// Lines read from each file, converted to wide strings.
129 vector
<wstring
> file1words
;
130 vector
<wstring
> file2words
;
// Insertion position, shared by the two read loops below.
131 vector
<wstring
>::iterator iter
;
// NOTE(review): buf1, buf2, wbuf1 and wbuf2 are not referenced by any
// statement visible in this block -- candidates for removal; confirm.
133 string buf1
, buf2
, line
;
134 wstring wbuf1
, wbuf2
;
// Report on cerr when the first input file could not be opened.
136 if(!file1stream
.is_open()) {
137 cerr
<< "Could not open " << file1
<< endl
;
// Likewise for the second input file.
140 if(!file2stream
.is_open()) {
141 cerr
<< "Could not open " << file2
<< endl
;
// Each new line is inserted in front of the previously inserted one
// (insert() returns the position of the new element, which becomes the next
// insertion point), so file1words ends up in reverse file order.
145 iter
= file1words
.begin();
146 while(getline(file1stream
, line
)) {
147 iter
= file1words
.insert(iter
, XMLParseUtil::towstring((const xmlChar
*)line
.c_str()));
// Same reverse-order read for the second file.
150 iter
= file2words
.begin();
151 while(getline(file2stream
, line
)) {
152 iter
= file2words
.insert(iter
, XMLParseUtil::towstring((const xmlChar
*)line
.c_str()));
// Build one transliterator per table file.
155 Transliterator ttable_first
= Transliterator(trant1
); // read the file :after: setting the locale
156 Transliterator ttable_second
= Transliterator(trant2
);
// Print the number of lines read from each file.
158 wcout
<< file1words
.size() << endl
;
159 wcout
<< file2words
.size() << endl
;
// Compare every line of the first file with every line of the second.
161 for(unsigned int i
= 0; i
< file1words
.size(); i
++) {
162 for(unsigned int j
= 0; j
< file2words
.size(); j
++) {
// NOTE(review): the transliteration of file1words[i] does not depend on j
// but is recomputed on every inner iteration; hoisting it assumes
// transliterate() has no side effects -- confirm.
164 wstring wword1
= ttable_first
.transliterate(file1words
[i
]);
165 wstring wword2
= ttable_second
.transliterate(file2words
[j
]);
// Emit the original pair, then the transliterated pair, " : "-separated and
// without a line break.
166 wcout
<< file1words
[i
] << " : " << file2words
[j
] << " : ";
167 wcout
<< wword1
<< " : " << wword2
<< " : ";
// d receives the result of the selected measure.
// NOTE(review): the declaration of d is not visible in this block, so its
// type cannot be stated here.
169 if(measure
== "dice") { // choose similarity measure
170 d
= dice_coefficient(wword1
, wword2
);
171 } else if(measure
== "levenshtein") {
172 d
= levenshtein_distance(wword1
, wword2
);
173 } else if(measure
== "xdice") {
174 d
= xdice_coefficient(wword1
, wword2
);
175 } else if(measure
== "xxdice") {
176 d
= xxdice_coefficient(wword1
, wword2
);
177 } else if(measure
== "lcs") {
178 d
= longest_common_subsequence(wword1
, wword2
);
179 } else if(measure
== "lcsc") {
180 d
= longest_common_subsequence_coefficient(wword1
, wword2
);
// NOTE(review): presumably the fallback for an unrecognised measure name --
// the enclosing else is not visible here; confirm. Prints the measure name.
182 wcout
<< L
"Default (m: " << XMLParseUtil::towstring((const xmlChar
*)measure
.c_str()) << ")" << endl
;
196 cout
<< "Usage: ./cognate [ -v | -V ] -f <first file> -s <second file> -t <first transliteration table> -r <second transliteration table>" << endl
;
197 cout
<< "Options:" << endl
;
198 cout
<< " -m, --measure: String similarity measure (levenshtein, dice, xdice, xxdice, lcs, lcsc)." << endl
;
199 cout
<< " -v, --verbose: Display verbose output." << endl
;
200 cout
<< " -V, --version: Display version information." << endl
;