1 // Implementation of “slice-and-count” algorithm
2 // Copyright © 2009 The University of Chicago
6 #include "WordCollection.h"
9 #include "CompareFunc.h"
12 snippet::snippet(QString s
)
18 StringInventory::StringInventory() { }
20 snippet
* StringInventory::getsnippetpointer (QString a
)
30 snippet
* StringInventory::operator<< (QString a
)
32 return getsnippetpointer(a
);
35 StringPtrList::StringPtrList(QString s
, StringInventory
* inventory
)
38 m_MyStringInventory
= inventory
;
41 Slice
* StringInventory::addword(QString s
)
45 Slice
* pslice
= new Slice(s
, this);
47 for (int start
= 0; start
< s
.length(); start
++)
49 for (int length
= 1; start
+ length
<= s
.length(); length
++)
51 pstring
= getsnippetpointer(s
.mid(start
,length
));
54 pslice
->append(pstring
);
57 pstring
= new snippet(s
.mid(start
,length
));
58 pslice
->append(pstring
);
66 // add word collection.
68 void StringInventory::addwordcollection(
69 CWordCollection
* Words
, SliceCollection
& Slices
,
70 linguistica::ui::status_user_agent
& status
)
72 status
.major_operation
= "Computing word slices.";
73 status
.progress
.clear();
74 status
.progress
.set_denominator(Words
->GetCount());
75 for (int i
= 0; i
< Words
->GetCount(); i
++) {
76 QString word
= Words
->GetAt(i
)->Display();
77 Slices
.append(addword(word
));
79 // XXX. not an operation.
80 status
.major_operation
= word
;
83 status
.progress
.clear();
85 // XXX. not an operation.
86 status
.major_operation
= "End of computing word slices.";
89 bool snippetSort(const snippet
* s1
, const snippet
* s2
)
91 if (s1
->getlength() < s2
->getlength())
93 if (s1
->getlength() > s2
->getlength())
95 if (s1
->getkey() < s2
->getkey())
102 qSort(begin(), end(), snippetSort
);
105 // Write to log file.
107 void SliceCollection::writetolog( QTextStream
& stream
)
110 stream
<< StartTable
<<
112 MakeTableHeader ("Word " ) <<
113 MakeTableHeader ("Word " ) <<
114 MakeTableHeader ("xxx") <<
118 for (int i
= 0; i
< size(); i
++)
120 stream
<< StartTableRow
<<
121 TableData( at(i
)->getkey() );
123 for (int j
= 0; j
< at(i
)->size(); j
++)
125 stream
<< TableData( at(i
)->at(j
)->getkey() ) ;
127 stream
<< EndTableRow
;
133 snippet
* largestcommonsubstring( Slice
* slice1
, Slice
* slice2
, uint minimum
)
136 int index1 (0), index2(0);
138 while (index1
< slice1
->size() && index2
< slice2
->size() )
140 s1
= slice1
->at(index1
);
141 s2
= slice2
->at(index2
);
142 if (s1
->getlength() < minimum
|| s2
->getlength() < minimum
) {return NULL
;}
143 if (s1
->getlength() > s2
->getlength() )
144 { index1
++; continue;}
146 if (s2
->getlength() > s1
->getlength() )
147 { index2
++; continue; }
149 if (s1
== s2
) {return s1
;}
151 if ( s2
->getkey() > s1
->getkey() )
163 int largestcommonsubstringlength( Slice
* slice1
, Slice
* slice2
, uint minimum
)
166 s
= largestcommonsubstring(slice1
, slice2
, minimum
);
168 else return s
->getlength();
171 // Slice pair collection.
173 void SliceCollection::FindWordPairs(
174 linguistica::ui::status_user_agent
& status
,
176 unsigned int minimumlength
)
178 const int THRESHOLD
= 4;
180 status
.details
= "Finding related word pairs.";
181 status
.progress
.clear();
183 if (lexicon
->LogFileOn())
184 *lexicon
->GetLogFileStream() <<
187 MakeTableHeader("Word 1") <<
188 MakeTableHeader("Word 2") <<
191 status
.progress
.set_denominator(size());
192 for (int i
= 0; i
< size(); i
++) {
193 Slice
* const s1
= at(i
);
194 // XXX. not an operation.
195 status
.major_operation
= s1
->getkey();
197 for (int j
= i
+ 1; j
< size(); j
++) {
198 Slice
* const s2
= at(j
);
199 const int length
= largestcommonsubstringlength(s1
, s2
, minimumlength
);
200 if (length
> THRESHOLD
)
201 if (lexicon
->LogFileOn())
202 *lexicon
->GetLogFileStream() <<
204 TableData(s1
->getkey()) <<
205 TableData(s2
->getkey()) <<
207 TableData(largestcommonsubstring(s1
,s2
)->getkey()) <<
211 status
.progress
.clear();
212 // XXX. not an operation.
213 status
.major_operation
= "End of computing word slices.";
214 status
.details
.clear();