1 // Implementation of CStats methods
2 // Copyright © 2009 The University of Chicago
5 /* CSTATS (by Jeremy O'Brien)
6 ---------------------------------------------------------------------------------
7 CStats is an object for Text Statistics. Simply put, it reads in a corpus,
8 and outputs the frequently occurring words along with the number of times
9 they occur and the standard deviation for them. Words with low standard
10 deviation (relative to others) will be evenly distributed; words with high
11 standard deviation will be unevenly distributed. Likewise, if a word is
12 evenly distributed, then it is likely that it is a LOW-CONTENT word, while
13 a word that is unevenly distributed will be a HIGH-CONTENT word.
18 #include <QTextStream>
19 #include <Q3FileDialog>
22 #include "linguisticamainwindow.h"
23 #include "ui/Status.h"
25 typedef Q3ValueList
<int> IntList
;
26 typedef QMap
<QString
, IntList
> StringToIntList
;
29 -------------------------------------------------------------------------------
31 (1) better output (dialog box?)
32 (2) getting better accuracy (not using unsigned long ints) while avoiding
34 (3) have the program read in the same corpus as the "reread corpus" corpus
35 (this might be too confusing for the user)
39 //////////////////////////////////////////////////////////////////////
40 // Construction/Destruction
41 //////////////////////////////////////////////////////////////////////
45 CStats::CStats(LinguisticaMainWindow
* parent
)
52 /// Reads the corpus. It makes m_StringMap into a map of words to
53 /// a list of indices of where these words occur in the corpus. The length
54 /// of this list is therefore the # of occurrences.
55 /// Note that words are made lowercase, commas and periods are removed,
56 /// as are the 's suffixes (temporary fix)
57 void CStats::readCorpus()
59 LinguisticaMainWindow
& ui
= *m_parent
;
60 linguistica::ui::status_user_agent
& status
= ui
.status_display();
63 QString corpusFileName
;
66 corpusFileName
= Q3FileDialog::getOpenFileName( corpusFileName
,
70 "Choose a corpus file to open" );
71 if ( corpusFileName
.isEmpty() )
74 QFile
corpusFile(corpusFileName
);
76 if ( corpusFile
.open( QIODevice::ReadOnly
) )
79 Q3TextStream
corpusStream( &corpusFile
);
82 // we do stuff for each word
83 status
.major_operation
.clear();
84 while (!corpusStream
.atEnd()) {
88 //QRegExp punctuation = QRegExp ( "\,|\.|\?|\'|\"");
89 word
.remove( "." ); word
.remove( "," );
90 word
.remove( "'s" ); //temporary
92 // the map will map strings to the indices of where they appear in the corpus
93 m_StringMap
[word
].append(wordNum
);
95 // the corpus will be a QStringList representation of the file
96 //m_Corpus.append(word);
99 if (wordNum
% 512 == 0)
100 // XXX. not an operation.
101 status
.major_operation
=
102 QString("reading word %1...")
105 status
.major_operation
.clear();
107 // big distinction: m_CorpusSize is the # of words (starting at 1)
108 // wordNum, or the index for an occurrence of a word, starts at 0
109 m_CorpusSize
= wordNum
;
111 // XXX. not an operation.
112 status
.major_operation
= "Reading complete";
117 /////////////////////////////////////////////////////////////////////
118 // Although not a member function, this is the heart of the standard deviation
119 // system. It is very general, taking an array of numbers and the size of the array.
120 /////////////////////////////////////////////////////////////////////
/// Population standard deviation of an integer sample, truncated to an integer.
///
/// Evaluates
///     sigma = sqrt( (1/N) * sum_{i=0}^{N-1} x_i^2  -  mu^2 ),   mu = (1/N) * sum x_i
/// which is algebraically the same as sqrt( 1/N * [ sum x_i^2 - N * mu^2 ] ).
///
/// @param array  pointer to the first of N values
/// @param N      number of values stored in array
/// @return       the standard deviation truncated toward zero;
///               0 when array is null or N <= 0
long int standev( int* array, int N )
{
	// An empty sample has no spread; bailing out here also avoids the
	// division by zero that N == 0 would otherwise cause below.
	if (!array || N <= 0)
		return 0;

	// Accumulate in double. Squaring in int overflows as soon as a value
	// exceeds ~46340 (32-bit int); double is exact for integer totals up to
	// 2^53, far beyond what int accumulation can hold.
	double sum = 0.0;
	double sumOfSquares = 0.0;
	for (int i = 0; i < N; i++)
	{
		const double x = array[i];
		sum += x;
		sumOfSquares += x * x;
	}

	// Keep the mean fractional: truncating it to an integer before squaring
	// systematically inflates the variance.
	const double mu = sum / N;
	double variance = sumOfSquares / N - mu * mu;

	// Floating-point rounding can leave a tiny negative value when all the
	// inputs are (nearly) equal; sqrt of that would be NaN.
	if (variance < 0.0)
		variance = 0.0;

	// Only the final result is truncated, matching the long int return type.
	return static_cast<long int>( sqrt( variance ) );
}
147 /////////////////////////////////////////////////////////////////////
148 // CorpusDistribution takes a word from the corpus, and outputs the standard deviation of
149 // that word. Basically, it finds the distances between consecutive occurrences of the word,
150 // and passes them to standev.
151 /////////////////////////////////////////////////////////////////////
153 long int CStats::CorpusDistribution(QString
&word
) const
156 IntList indices
= m_StringMap
[word
];
157 int N
= indices
.size();
158 int* array
= new int[N
];
160 IntList::const_iterator it
= indices
.begin();
161 int last
= indices
.last();
162 // IntList::const_iterator end = indices.end();
165 for (int i
= 0; i
< N
; i
++)
167 array
[i
] = *it
- prevValue
- 1;
172 array
[0] += m_CorpusSize
- last
;
174 long int dist
= standev(array
,N
);
180 /// output is the glue that holds this class together.
181 /// This means it will probably be the first to go
182 /// when this class is modified.
183 /// It saves the statistics to a file "stats.txt",
184 /// in the form of word, # of occurrences, standard deviation.
185 /// It goes down the map, looking at all the unique words,
186 /// calculating corpus distribution for each one.
187 void CStats::output() const
189 LinguisticaMainWindow
& ui
= *m_parent
;
190 linguistica::ui::status_user_agent
& status
= ui
.status_display();
192 QFile
out("stats.txt");
193 out
.open(QIODevice::WriteOnly
);
194 QTextStream
outStream(&out
);
196 StringToIntList::const_iterator it
= m_StringMap
.begin();
197 StringToIntList::const_iterator end
= m_StringMap
.end();
199 int validWordNum
= 0;
200 int everyWordNum
= 0;
201 // int cutoff = (double) m_CorpusSize / (double) 6000;
205 outStream
<< "Word\t\t\t\t# of Occurrences\tStandard Deviation\n"
206 << "__________________________________________________________________________\n";
207 status
.major_operation
.clear();
208 status
.progress
.clear();
209 status
.progress
.set_denominator(m_StringMap
.keys().size());
211 QString word
= it
.key();
212 int wordLength
= word
.length();
213 int corpusCount
= it
.data().size();
215 if (corpusCount
> cutoff
)
217 double dist
= CorpusDistribution(word
);
219 outStream
<< word
<< "\t\t\t";
220 if (wordLength
< 6) outStream
<< "\t";
221 outStream
<< corpusCount
<< "\t\t\t" << dist
<< "\n";
224 if (validWordNum
% 16 == 0)
225 // XXX. not an operation
226 status
.major_operation
=
227 QString("writing word %1...")
231 // else outStream << word << "\t\t\t\t" << m_StringMap[word] << "\n";
232 status
.progress
= ++everyWordNum
;
235 status
.progress
.clear();
236 // XXX. not an operation
237 status
.major_operation
= "Output successful!";