CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Stats.cpp
blob4bb644cffc4d9602a14bd3ba5cd508cb3f23b1c7
1 // Implementation of CStats methods
2 // Copyright © 2009 The University of Chicago
3 #include "Stats.h"
5 /* CSTATS (by Jeremy O'Brien)
6 ---------------------------------------------------------------------------------
7 CStats is an object for Text Statistics. Simply put, it reads in a corpus,
8 and outputs the frequently occurring words along with the number of times
9 they occur and the standard deviation for them. Words with low standard
10 deviation (relative to others) will be evenly distributed; words with high
11 standard deviation will be unevenly distributed. Likewise, if a word is
12 evenly distributed, then it is likely that it is a LOW-CONTENT word, while
13 a word that is unevenly distributed will be a HIGH-CONTENT word.
17 #include <cmath>
18 #include <QTextStream>
19 #include <Q3FileDialog>
20 #include <QIODevice>
21 #include <QFile>
22 #include "linguisticamainwindow.h"
23 #include "ui/Status.h"
25 typedef Q3ValueList<int> IntList;
26 typedef QMap<QString, IntList> StringToIntList;
28 /* THINGS TO DO
29 -------------------------------------------------------------------------------
31 (1) better output (dialog box?)
32 (2) getting better accuracy (not using unsigned long ints) while avoiding
33 -1.INF errors
34 (3) have the program read in the same corpus as the "reread corpus" corpus
35 (this might be too confusing for the user)
39 //////////////////////////////////////////////////////////////////////
40 // Construction/Destruction
41 //////////////////////////////////////////////////////////////////////
43 CStats::CStats() {}
45 CStats::CStats(LinguisticaMainWindow* parent)
47 m_parent = parent;
50 CStats::~CStats() {}
52 /// Reads the corpus. It makes m_StringMap into a map of words to
53 /// a list of indices of where these words occur in the corpus. The length
54 /// of this list is therefore the # of occurrences.
55 /// Note that words are make lowercase, commas and periods are removed,
56 /// as are the 's suffixes (temporary fix)
57 void CStats::readCorpus()
59 LinguisticaMainWindow& ui = *m_parent;
60 linguistica::ui::status_user_agent& status = ui.status_display();
62 m_StringMap.clear();
63 QString corpusFileName;
64 QString word;
66 corpusFileName = Q3FileDialog::getOpenFileName( corpusFileName,
67 "TXT Files (*.txt)",
68 NULL,
69 "open file dialog",
70 "Choose a corpus file to open" );
71 if ( corpusFileName.isEmpty() )
72 return;
74 QFile corpusFile(corpusFileName);
76 if ( corpusFile.open( QIODevice::ReadOnly ) )
79 Q3TextStream corpusStream( &corpusFile );
80 int wordNum = 0;
82 // we do stuff for each word
83 status.major_operation.clear();
84 while (!corpusStream.atEnd()) {
85 corpusStream >> word;
87 word = word.lower();
88 //QRegExp punctuation = QRegExp ( "\,|\.|\?|\'|\"");
89 word.remove( "." ); word.remove( "," );
90 word.remove( "'s" ); //temporary
92 // the map will map strings to the indices of where they appear in the corpus
93 m_StringMap[word].append(wordNum);
95 // the corpus will be a QStringList representation of the file
96 //m_Corpus.append(word);
98 wordNum++;
99 if (wordNum % 512 == 0)
100 // XXX. not an operation.
101 status.major_operation =
102 QString("reading word %1...")
103 .arg(wordNum);
105 status.major_operation.clear();
107 // big distinction: m_CorpusSize is the # of words (starting at 1)
108 // wordNum, or the index for an occurrence of a word, starts at 0
109 m_CorpusSize = wordNum;
111 // XXX. not an operation.
112 status.major_operation = "Reading complete";
113 corpusFile.close();
114 output();
/////////////////////////////////////////////////////////////////////
// Although not a member function, this is the heart of the standard deviation
// system. It is very general, taking an array of numbers and the size of the array.
//
// Returns the population standard deviation of array[0..N-1], truncated
// toward zero to fit the long int return type. Returns 0 when the array
// is null or N is not positive.
/////////////////////////////////////////////////////////////////////

long int standev( int* array, int N )
{
	/* formula is
		sigma = sqrt ( 1/N * sum (i = 0 to N-1) (xi - mu)^2 )

	   This is computed in two passes using doubles. The earlier one-pass
	   form (sum of xi^2 minus N * mu^2, all in integers) overflowed on
	   array[i] * array[i], truncated mu by integer division, and divided
	   by zero for N == 0.
	*/
	using std::sqrt;

	if (array == 0 || N <= 0)
		return 0;

	// First pass: the mean, kept as a double so it is not truncated.
	double sum = 0.0;
	for (int i = 0; i < N; i++)
		sum += array[i];
	const double mu = sum / N;

	// Second pass: sum of squared deviations; never negative, so the
	// square root below is always defined.
	double squaredDeviations = 0.0;
	for (int i = 0; i < N; i++)
	{
		const double deviation = array[i] - mu;
		squaredDeviations += deviation * deviation;
	}

	return static_cast<long int>( sqrt( squaredDeviations / N ) );
}
147 /////////////////////////////////////////////////////////////////////
148 // CorpusDistribution takes a word from the corpus, and outputs the standard deviation of
149 // that word. Basically, it finds the distances between each occurrence of the word,
150 // and puts that into standev
151 /////////////////////////////////////////////////////////////////////
153 long int CStats::CorpusDistribution(QString &word) const
156 IntList indices = m_StringMap[word];
157 int N = indices.size();
158 int* array = new int[N];
160 IntList::const_iterator it = indices.begin();
161 int last = indices.last();
162 // IntList::const_iterator end = indices.end();
164 int prevValue = 0;
165 for (int i = 0; i < N; i++)
167 array[i] = *it - prevValue - 1;
168 prevValue = *it;
170 it++;
172 array[0] += m_CorpusSize - last;
174 long int dist = standev(array,N);
175 delete[] array;
176 return dist;
180 /// output is a the glue that holds this class together.
181 /// This means it will probably be the first to go
182 /// when this class is modified.
183 /// It saves the statistics to a file "stats.txt",
184 /// in the form of word, # of occurrences, standard deviation.
185 /// It goes down the map, looking at all the unique words,
186 /// calculating corpus distribution for each one.
187 void CStats::output() const
189 LinguisticaMainWindow& ui = *m_parent;
190 linguistica::ui::status_user_agent& status = ui.status_display();
192 QFile out("stats.txt");
193 out.open(QIODevice::WriteOnly);
194 QTextStream outStream(&out);
196 StringToIntList::const_iterator it = m_StringMap.begin();
197 StringToIntList::const_iterator end = m_StringMap.end();
199 int validWordNum = 0;
200 int everyWordNum = 0;
201 // int cutoff = (double) m_CorpusSize / (double) 6000;
202 int cutoff = 6;
205 outStream << "Word\t\t\t\t# of Occurrences\tStandard Deviation\n"
206 << "__________________________________________________________________________\n";
207 status.major_operation.clear();
208 status.progress.clear();
209 status.progress.set_denominator(m_StringMap.keys().size());
210 while (it != end) {
211 QString word = it.key();
212 int wordLength = word.length();
213 int corpusCount = it.data().size();
215 if (corpusCount > cutoff)
217 double dist = CorpusDistribution(word);
219 outStream << word << "\t\t\t";
220 if (wordLength < 6) outStream << "\t";
221 outStream << corpusCount << "\t\t\t" << dist << "\n";
223 validWordNum++;
224 if (validWordNum % 16 == 0)
225 // XXX. not an operation
226 status.major_operation =
227 QString("writing word %1...")
228 .arg(validWordNum);
231 // else outStream << word << "\t\t\t\t" << m_StringMap[word] << "\n";
232 status.progress = ++everyWordNum;
233 it++;
235 status.progress.clear();
236 // XXX. not an operation
237 status.major_operation = "Output successful!";
238 out.close();