Stats.cpp

   1 // Implementation of CStats methods
   2 // Copyright © 2009 The University of Chicago
   3 #include "Stats.h"
   4
   5 /*                                      CSTATS (by Jeremy O'Brien)
   6 ---------------------------------------------------------------------------------
   7         CStats is an object for Text Statistics. Simply put, it reads in a corpus,
   8         and outputs the frequently occurring words along with the number of times
   9         they occur and the standard deviation for them. Words with low standard
  10         deviation (relative to others) will be evenly distributed; words with high
  11         standard deviation will be unevenly distributed. Likewise, if a word is
  12         evenly distributed, then it is likely that it is a LOW-CONTENT word, while
  13         a word that is unevenly distributed will be a HIGH-CONTENT word.
  14
  15 */
  16
  17 #include <cmath>
  18 #include <QTextStream>
  19 #include <Q3FileDialog>
  20 #include <QIODevice>
  21 #include <QFile>
  22 #include "linguisticamainwindow.h"
  23 #include "ui/Status.h"
  24
// List of corpus positions; readCorpus() appends the 0-based word index of
// every occurrence of a word to one of these lists.
typedef Q3ValueList<int> IntList;
// Maps a word to the list of positions at which it occurs in the corpus.
typedef QMap<QString, IntList> StringToIntList;
  27
  28 /*                                      THINGS TO DO
  29 -------------------------------------------------------------------------------
  30
  31         (1)     better output (dialog box?)
  32         (2) getting better accuracy (not using unsigned long ints) while avoiding
  33                 -1.INF errors
  34         (3)     have the program read in the same corpus as the "reread corpus" corpus
  35                 (this might be too confusing for the user)
  36
  37 */
  38
  39 //////////////////////////////////////////////////////////////////////
  40 // Construction/Destruction
  41 //////////////////////////////////////////////////////////////////////
  42
  43 CStats::CStats() {}
  44
  45 CStats::CStats(LinguisticaMainWindow* parent)
  46 {
  47         m_parent = parent;
  48 }
  49
  50 CStats::~CStats() {}
  51
  52 /// Reads the corpus. It makes m_StringMap into a map of words to
  53 /// a list of indices of where these words occur in the corpus. The length
  54 /// of this list is therefore the # of occurrences.
  55 /// Note that words are make lowercase, commas and periods are removed,
  56 /// as are the 's suffixes (temporary fix)
  57 void CStats::readCorpus()
  58 {
  59         LinguisticaMainWindow& ui = *m_parent;
  60         linguistica::ui::status_user_agent& status = ui.status_display();
  61
  62         m_StringMap.clear();
  63         QString                                 corpusFileName;
  64         QString                                 word;
  65
  66         corpusFileName = Q3FileDialog::getOpenFileName( corpusFileName,
  67                                                                                                  "TXT Files (*.txt)",
  68                                                                                                  NULL,
  69                                                                                                  "open file dialog",
  70                                                                                                  "Choose a corpus file to open" );
  71         if ( corpusFileName.isEmpty() )
  72                 return;
  73
  74         QFile corpusFile(corpusFileName);
  75
  76         if ( corpusFile.open( QIODevice::ReadOnly ) )
  77         {
  78
  79                 Q3TextStream corpusStream( &corpusFile );
  80                 int wordNum = 0;
  81
  82                 // we do stuff for each word
  83                 status.major_operation.clear();
  84                 while (!corpusStream.atEnd()) {
  85                         corpusStream >> word;
  86
  87                         word = word.lower();
  88                         //QRegExp punctuation = QRegExp ( "\,|\.|\?|\'|\"");
  89                         word.remove( "." ); word.remove( "," );
  90                         word.remove( "'s" );    //temporary
  91
  92                         // the map will map strings to the indices of where they appear in the corpus
  93                         m_StringMap[word].append(wordNum);
  94
  95                         // the corpus will be a QStringList representation of the file
  96                         //m_Corpus.append(word);
  97
  98                         wordNum++;
  99                         if (wordNum % 512 == 0)
 100                                 // XXX. not an operation.
 101                                 status.major_operation =
 102                                         QString("reading word %1...")
 103                                         .arg(wordNum);
 104                 }
 105                 status.major_operation.clear();
 106
 107                 // big distinction: m_CorpusSize is the # of words (starting at 1)
 108                 // wordNum, or the index for an occurrence of a word, starts at 0
 109                 m_CorpusSize = wordNum;
 110         }
 111         // XXX. not an operation.
 112         status.major_operation = "Reading complete";
 113         corpusFile.close();
 114         output();
 115 }
 116
 117 /////////////////////////////////////////////////////////////////////
 118 // Although not a member function, this is the heart of the standard deviation
 119 // system. It is very general, taking an array of numbers and the size of the array.
 120 /////////////////////////////////////////////////////////////////////
 121
 122 long int standev( int* array, int N )
 123 {
 124         /* formula is
 125         sigma = sqrt ( 1/N * [ sum (0 to N) xi^2 - N * mu^2 ] )
 126   http://en.wikipedia.org/math/46d26dc566d3b4ed00e4c6158bdd253e.png
 127     */
 128         using std::sqrt;
 129
 130         long int                                xi2             = 0;
 131         long int                                sum             = 0;
 132         long int                                mu              = 0;
 133         long int                                sigma   = 0;
 134
 135         for (int i = 0; i < N; i++)
 136         {
 137                 xi2 += array[i] * array[i];
 138                 sum += array[i];
 139         }
 140
 141         mu = (long int) sum / (long int) N;
 142         sigma = sqrt( ( (long int) xi2 - (long int) N * mu * mu ) / (long int) N );
 143         return sigma;
 144 }
 145
 146
 147 /////////////////////////////////////////////////////////////////////
 148 // CorpusDistribution takes a word from the corpus, and outputs the standard deviation of
 149 // that word. Basically, it finds the distances between each occurrence of the word,
 150 // and puts that into standev
 151 /////////////////////////////////////////////////////////////////////
 152
 153 long int CStats::CorpusDistribution(QString &word) const
 154 {
 155
 156         IntList         indices         = m_StringMap[word];
 157         int                     N                       = indices.size();
 158         int*            array           = new int[N];
 159
 160         IntList::const_iterator         it                      = indices.begin();
 161         int                                             last            = indices.last();
 162 //      IntList::const_iterator         end                     = indices.end();
 163
 164         int prevValue = 0;
 165         for (int i = 0; i < N; i++)
 166         {
 167                 array[i] = *it - prevValue - 1;
 168                 prevValue = *it;
 169
 170                 it++;
 171         }
 172         array[0] += m_CorpusSize - last;
 173
 174         long int dist = standev(array,N);
 175         delete[] array;
 176         return dist;
 177
 178 }
 179
 180 /// output is a the glue that holds this class together.
 181 /// This means it will probably be the first to go
 182 /// when this class is modified.
 183 /// It saves the statistics to a file "stats.txt",
 184 /// in the form of word, # of occurrences, standard deviation.
 185 /// It goes down the map, looking at all the unique words,
 186 /// calculating corpus distribution for each one.
 187 void CStats::output() const
 188 {
 189         LinguisticaMainWindow& ui = *m_parent;
 190         linguistica::ui::status_user_agent& status = ui.status_display();
 191
 192         QFile out("stats.txt");
 193         out.open(QIODevice::WriteOnly);
 194         QTextStream outStream(&out);
 195
 196         StringToIntList::const_iterator                                 it                      = m_StringMap.begin();
 197         StringToIntList::const_iterator                                 end                     = m_StringMap.end();
 198
 199         int validWordNum = 0;
 200         int everyWordNum = 0;
 201 //      int cutoff = (double) m_CorpusSize / (double) 6000;
 202         int cutoff = 6;
 203
 204
 205         outStream << "Word\t\t\t\t# of Occurrences\tStandard Deviation\n"
 206                       << "__________________________________________________________________________\n";
 207         status.major_operation.clear();
 208         status.progress.clear();
 209         status.progress.set_denominator(m_StringMap.keys().size());
 210         while (it != end) {
 211                 QString         word                    = it.key();
 212                 int                     wordLength              = word.length();
 213                 int                     corpusCount             = it.data().size();
 214
 215                 if (corpusCount > cutoff)
 216                 {
 217                         double dist = CorpusDistribution(word);
 218
 219                         outStream << word << "\t\t\t";
 220                         if (wordLength < 6) outStream << "\t";
 221                         outStream << corpusCount << "\t\t\t" << dist << "\n";
 222
 223                         validWordNum++;
 224                         if (validWordNum % 16 == 0)
 225                                 // XXX. not an operation
 226                                 status.major_operation =
 227                                         QString("writing word %1...")
 228                                         .arg(validWordNum);
 229
 230                 }
 231 //              else outStream << word << "\t\t\t\t" << m_StringMap[word] << "\n";
 232                 status.progress = ++everyWordNum;
 233                 it++;
 234         }
 235         status.progress.clear();
 236         // XXX. not an operation
 237         status.major_operation = "Output successful!";
 238         out.close();
 239 }