CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Slice.cpp
blobab630fe66237fdbf7bf2dd194e6cb2efc9fdbe58
1 // Implementation of “slice-and-count” algorithm
2 // Copyright © 2009 The University of Chicago
3 #include "Slice.h"
4 #include <QString>
5 #include "ui/Status.h"
6 #include "WordCollection.h"
7 #include "Stem.h"
8 #include "Lexicon.h"
9 #include "CompareFunc.h"
10 #include "HTML.h"
12 snippet::snippet(QString s)
14 m_string = s;
15 m_count = 1;
18 StringInventory::StringInventory() { }
20 snippet* StringInventory::getsnippetpointer (QString a)
22 snippet* s;
23 s = find(a);
24 if (s) return s;
25 s = new snippet(a);
26 insert (a,s);
27 return s;
30 snippet* StringInventory::operator<< (QString a)
32 return getsnippetpointer(a);
35 StringPtrList::StringPtrList(QString s, StringInventory* inventory)
37 m_key = s;
38 m_MyStringInventory = inventory;
41 Slice* StringInventory::addword(QString s)
43 snippet* pstring;
44 s = "#"+ s + "#";
45 Slice* pslice = new Slice(s, this);
47 for (int start = 0; start < s.length(); start++)
49 for (int length = 1; start + length <= s.length(); length++)
51 pstring = getsnippetpointer(s.mid(start,length));
52 if (pstring)
54 pslice->append(pstring);
56 else {
57 pstring = new snippet(s.mid(start,length));
58 pslice->append(pstring);
62 pslice->sort();
63 return pslice;
66 // add word collection.
68 void StringInventory::addwordcollection(
69 CWordCollection* Words, SliceCollection& Slices,
70 linguistica::ui::status_user_agent& status)
72 status.major_operation = "Computing word slices.";
73 status.progress.clear();
74 status.progress.set_denominator(Words->GetCount());
75 for (int i = 0; i < Words->GetCount(); i++) {
76 QString word = Words->GetAt(i)->Display();
77 Slices.append(addword(word));
79 // XXX. not an operation.
80 status.major_operation = word;
81 status.progress = i;
83 status.progress.clear();
85 // XXX. not an operation.
86 status.major_operation = "End of computing word slices.";
89 bool snippetSort(const snippet* s1, const snippet* s2)
91 if (s1->getlength() < s2->getlength())
92 return FALSE;
93 if (s1->getlength() > s2->getlength())
94 return TRIE;
95 if (s1->getkey() < s2->getkey())
96 return TRUE;
97 return FALSE;
100 void Slice::sort()
102 qSort(begin(), end(), snippetSort);
105 // Write to log file.
107 void SliceCollection::writetolog( QTextStream& stream )
110 stream << StartTable <<
111 StartTableRow <<
112 MakeTableHeader ("Word " ) <<
113 MakeTableHeader ("Word " ) <<
114 MakeTableHeader ("xxx") <<
115 EndTableRow ;
118 for (int i = 0; i < size(); i++)
120 stream << StartTableRow <<
121 TableData( at(i)->getkey() );
123 for (int j = 0; j < at(i)->size(); j++)
125 stream << TableData( at(i)->at(j)->getkey() ) ;
127 stream << EndTableRow;
128 stream << endl;
130 stream << EndTable;
133 snippet* largestcommonsubstring( Slice* slice1, Slice* slice2, uint minimum)
135 snippet* s1, *s2;
136 int index1 (0), index2(0);
138 while (index1 < slice1->size() && index2 < slice2->size() )
140 s1 = slice1->at(index1);
141 s2 = slice2->at(index2);
142 if (s1->getlength() < minimum || s2->getlength() < minimum ) {return NULL;}
143 if (s1->getlength() > s2->getlength() )
144 { index1++; continue;}
146 if (s2->getlength() > s1->getlength() )
147 { index2++; continue; }
149 if (s1 == s2) {return s1;}
151 if ( s2->getkey() > s1->getkey() )
153 index1++;
155 else
157 index2++;
160 return NULL;
163 int largestcommonsubstringlength( Slice* slice1, Slice* slice2, uint minimum)
165 snippet* s;
166 s = largestcommonsubstring(slice1, slice2, minimum);
167 if (!s) return 0;
168 else return s->getlength();
171 // Slice pair collection.
173 void SliceCollection::FindWordPairs(
174 linguistica::ui::status_user_agent& status,
175 CLexicon* lexicon,
176 unsigned int minimumlength)
178 const int THRESHOLD = 4;
180 status.details = "Finding related word pairs.";
181 status.progress.clear();
183 if (lexicon->LogFileOn())
184 *lexicon->GetLogFileStream() <<
185 StartTable <<
186 StartTableRow <<
187 MakeTableHeader("Word 1") <<
188 MakeTableHeader("Word 2") <<
189 EndTableRow;
191 status.progress.set_denominator(size());
192 for (int i = 0; i < size(); i++) {
193 Slice* const s1 = at(i);
194 // XXX. not an operation.
195 status.major_operation = s1->getkey();
196 status.progress = i;
197 for (int j = i + 1; j < size(); j++) {
198 Slice* const s2 = at(j);
199 const int length = largestcommonsubstringlength(s1, s2, minimumlength);
200 if (length > THRESHOLD)
201 if (lexicon->LogFileOn())
202 *lexicon->GetLogFileStream() <<
203 StartTableRow <<
204 TableData(s1->getkey()) <<
205 TableData(s2->getkey()) <<
206 TableData(length) <<
207 TableData(largestcommonsubstring(s1,s2)->getkey()) <<
208 EndTableRow;
211 status.progress.clear();
212 // XXX. not an operation.
213 status.major_operation = "End of computing word slices.";
214 status.details.clear();