2 * Copyright (C) 2007 Francis Tyers
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 #include "similarity.h"
26 * Levenshtein Distance Algorithm: C++ Implementation
27 * by Anders Sewerin Johansen
31 int levenshtein_distance(wstring string1
, wstring string2
)
33 int n
= string1
.length();
34 int m
= string2
.length();
36 if(n
== 0 || m
== 0) {
40 vector
< vector
<int> > matrix(n
+ 1);
42 for(int i
= 0; i
<= n
; i
++) {
43 matrix
[i
].resize(m
+ 1);
46 for(int i
= 0; i
<= n
; i
++) {
50 for(int j
= 0; j
<= m
; j
++) {
54 for (int i
= 1; i
<= n
; i
++) {
55 const wchar_t s1i
= string1
[i
- 1];
56 for (int j
= 1; j
<= m
; j
++) {
57 const wchar_t s2j
= string2
[j
- 1];
64 const int above
= matrix
[i
- 1][j
];
65 const int left
= matrix
[i
][j
- 1];
66 const int diag
= matrix
[i
- 1][j
- 1];
67 const int cell
= min(above
+ 1, min(left
+ 1, diag
+ cost
));
77 * dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
82 float dice_coefficient(wstring string1
, wstring string2
)
86 vector
<wstring
> string1_bigrams
;
87 vector
<wstring
> string2_bigrams
;
89 vector
<wstring
>::iterator iter
;
91 iter
= string1_bigrams
.begin();
92 for(unsigned int i
= 0; i
< (string1
.length() - 1); i
++) { // extract character bigrams from string1
93 iter
= string1_bigrams
.insert(iter
, string1
.substr(i
, 2));
96 iter
= string2_bigrams
.begin();
97 for(unsigned int j
= 0; j
< (string2
.length() - 1); j
++) { // extract character bigrams from string2
98 iter
= string2_bigrams
.insert(iter
, string2
.substr(j
, 2));
101 // calculate character bigram overlap between two strings
102 for(unsigned int z
= 0; z
< string1_bigrams
.size(); z
++) {
103 for(unsigned int x
= 0; x
< string2_bigrams
.size(); x
++) {
104 if(string1_bigrams
.at(z
) == string2_bigrams
.at(x
)) {
110 // calculate dice coefficient
111 int total
= string1_bigrams
.size() + string2_bigrams
.size();
112 float dice
= (float)(overlap
* 2) / (float)total
;
118 * Same as dice_coefficient only with gappy bigrams.
121 float xdice_coefficient(wstring string1
, wstring string2
)
125 vector
<wstring
> string1_bigrams
;
126 vector
<wstring
> string2_bigrams
;
128 vector
<wstring
>::iterator iter
;
130 iter
= string1_bigrams
.begin();
131 for(unsigned int i
= 0; i
< (string1
.length() - 1); i
++) { // extract character bigrams from string1
132 iter
= string1_bigrams
.insert(iter
, string1
.substr(i
, 2));
135 for(unsigned int h
= 0; h
< (string1
.length() - 1); h
++) { // extract trigrams -- for gappy matching.
136 iter
= string1_bigrams
.insert(iter
, string1
.substr(h
, 3));
139 iter
= string2_bigrams
.begin();
140 for(unsigned int j
= 0; j
< (string2
.length() - 1); j
++) { // extract character bigrams from string2
141 iter
= string2_bigrams
.insert(iter
, string2
.substr(j
, 2));
144 for(unsigned int l
= 0; l
< (string2
.length() - 1); l
++) { // extract trigrams
145 iter
= string2_bigrams
.insert(iter
, string2
.substr(l
, 3));
148 // calculate character bigram overlap between two strings
149 for(unsigned int z
= 0; z
< string1_bigrams
.size(); z
++) {
150 for(unsigned int x
= 0; x
< string2_bigrams
.size(); x
++) {
151 if(string1_bigrams
.at(z
) == string2_bigrams
.at(x
)) {
152 overlap
++; // bar:bar = 1
153 } else if(string1_bigrams
.at(z
)[0] == string2_bigrams
.at(x
)[0] && string1_bigrams
.at(z
)[2] == string2_bigrams
.at(x
)[2]) {
154 overlap
++; // bar:bur = 1
155 } else if(string1_bigrams
.at(z
)[0] == string2_bigrams
.at(x
)[0] && string1_bigrams
.at(z
)[2] == string2_bigrams
.at(x
)[1]) {
156 overlap
++; // bar:br = 1
161 // calculate dice coefficient
162 int total
= string1_bigrams
.size() + string2_bigrams
.size();
163 float dice
= (float)(overlap
* 2) / (float)total
;
169 * xxdice, as xdice but includes adjustment based on
170 * bigram position offset.
173 float xxdice_coefficient(wstring string1
, wstring string2
)
177 vector
<wstring
> string1_bigrams
;
178 vector
<wstring
> string2_bigrams
;
180 vector
<wstring
>::iterator iter
;
182 iter
= string1_bigrams
.begin();
183 for(unsigned int i
= 0; i
< (string1
.length() - 1); i
++) { // extract character bigrams from string1
184 iter
= string1_bigrams
.insert(iter
, string1
.substr(i
, 2));
187 for(unsigned int h
= 0; h
< (string1
.length() - 1); h
++) { // extract trigrams -- for gappy matching.
188 iter
= string1_bigrams
.insert(iter
, string1
.substr(h
, 3));
191 iter
= string2_bigrams
.begin();
192 for(unsigned int j
= 0; j
< (string2
.length() - 1); j
++) { // extract character bigrams from string2
193 iter
= string2_bigrams
.insert(iter
, string2
.substr(j
, 2));
196 for(unsigned int l
= 0; l
< (string2
.length() - 1); l
++) { // extract trigrams
197 iter
= string2_bigrams
.insert(iter
, string2
.substr(l
, 3));
200 // calculate character bigram overlap between two strings
201 for(unsigned int z
= 0; z
< string1_bigrams
.size(); z
++) {
202 for(unsigned int x
= 0; x
< string2_bigrams
.size(); x
++) {
203 int idx1
= string1
.find_first_of(string1_bigrams
.at(z
));
204 int idx2
= string2
.find_first_of(string2_bigrams
.at(x
));
205 int offset
= idx1
- idx2
; // offset is the difference between the positions of the bigrams
206 float fiddle
= (float)(2 / (1 + (offset
* offset
))); // calculate fiddle factor
207 if(string1_bigrams
.at(z
) == string2_bigrams
.at(x
)) {
208 overlap
+= fiddle
; // bar:bar = 1
209 } else if(string1_bigrams
.at(z
)[0] == string2_bigrams
.at(x
)[0] && string1_bigrams
.at(z
)[2] == string2_bigrams
.at(x
)[2]) {
210 overlap
+= fiddle
; // bar:bur = 1
211 } else if(string1_bigrams
.at(z
)[0] == string2_bigrams
.at(x
)[0] && string1_bigrams
.at(z
)[2] == string2_bigrams
.at(x
)[1]) {
212 overlap
+= fiddle
; // bar:br = 1
217 // calculate dice coefficient
218 int total
= string1_bigrams
.size() + string2_bigrams
.size();
219 float dice
= (float)(overlap
* 2) / (float)total
;
225 * This is a hack of a hack of a hack.
226 * Kind of works though.
229 int longest_common_subsequence(wstring string1
, wstring string2
)
233 unsigned int n
= string1
.length();
234 unsigned int m
= string2
.length();
236 if(n
== 0 || m
== 0) {
240 vector
< vector
<int> > matrix(n
+ 2);
242 for(i
= 0; i
<= n
+1; i
++) {
243 matrix
[i
].resize(m
+ 2);
246 for(i
= 0; i
<= n
+1; i
++) {
250 for(j
= 0; j
<= m
+1; j
++) {
254 for(i
= n
+1; i
>= 1; i
--) {
255 for(j
= m
+1; j
>= 1; j
--) {
256 // wcout << L"n: " << n+1 << L" m: " << m+1 << L" i: " << i << L" j: " << j << endl;
257 if(i
== n
+1 || j
== m
+1) {
259 } else if(string1
.at(i
-1) == string2
.at(j
-1)) {
260 matrix
[i
][j
] = 1 + matrix
[i
+ 1][j
+ 1];
262 matrix
[i
][j
] = max(matrix
[i
+ 1][j
], matrix
[i
][j
+ 1]);
267 for(i = 0; i <= n+1; i++) { // print matrix
269 for(j = 0; j <= m+1; j++){
270 wcout << matrix[i][j] << " ";
278 float longest_common_subsequence_coefficient(wstring string1
, wstring string2
)
280 unsigned int n
= string1
.length();
281 unsigned int m
= string2
.length();
283 int lcs
= longest_common_subsequence(string1
, string2
);
285 return (float)lcs
/ (float)((n
+ m
) / 2);