HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / CorpusCount.h
blob17d97db1ba1f98f0b3ee8434d267ba48f453aae0
1 // Helper class with number-of-occurrences-in-corpus for a linguistic construct
2 // Copyright © 2009 The University of Chicago
3 #ifndef CORPUSCOUNT_H
4 #define CORPUSCOUNT_H
// Forward declaration, so the class can be defined below using its
// qualified name (linguistica::corpus_count).
namespace linguistica {
	class corpus_count;
}
#include <iostream>
#include <limits>
12 /// It is useful to remember how many times each morpheme, phoneme,
13 /// morphological signature, and part of speech occurs in the corpus
14 /// to be analyzed:
15 ///
16 /// * The information content of an instance of that construct is
17 /// -log2(corpus count of construct /
18 /// corpus count of genre of construct)
19 /// [So, for example, the encoding length of "hello" is
20 /// -log2(# of appearances of "hello" / total # of words in corpus)]
21 ///
22 /// * When displaying results, it is convenient to sort by corpus count.
23 ///
24 /// * If the corpus count is very low, we can skip some costly operations
25 /// on a construct and be reasonably sure we are not introducing too
26 /// much error.
27 ///
28 /// This class maintains a corpus count for a linguistic construct and
29 /// provides methods for accessing and modifying it.
30 ///
31 /// The count is a "token count" (total number of appearances), not
32 /// "use count" (number of distinct contexts in which this appears).
class linguistica::corpus_count {
	/// Token count for this construct.  Always nonnegative: every
	/// mutator below re-establishes that before returning.
	int m_corpus_count;
public:
	/// Meant to signal a corpus count dropping below zero.
	/// Nothing in this class throws it: a negative count is clamped
	/// to zero by check_underflow() instead.
	struct underflow { };
private:
	/// Restore the "nonnegative" invariant after a modification.
	///
	/// A negative count is reset to zero.  If the function-local
	/// check_enabled flag is set, a one-time diagnostic is written
	/// to std::cerr first; the flag is initialized to false, so by
	/// default the clamp is silent.
	void check_underflow()
	{
		// Shared by all corpus_count objects; cleared after the
		// first report so the warning is printed at most once.
		static bool check_enabled = false;

		if (m_corpus_count >= 0)
			return;

		if (check_enabled) {
			using std::cerr;
			using std::endl;

			// XXX. This shouldn't happen.
			cerr << "corpus count underflow for object ";
			cerr << this << ": ";
			cerr << m_corpus_count << endl;
			check_enabled = false;
		}
		m_corpus_count = 0;
	}
public:
	/// Start with a count of zero.
	corpus_count() : m_corpus_count(0) { }

	/// Start with a count of n; a negative n is clamped to zero.
	/// Deliberately not explicit, so an int converts implicitly.
	corpus_count(int n) : m_corpus_count(n)
		{ check_underflow(); }

	// copy constructor, assignment operator defined implicitly.
	virtual ~corpus_count() { }

	/// Same as SetCorpusCount(n).
	corpus_count& operator=(int n) { SetCorpusCount(n); return *this; }

	/// Current count (never negative).
	inline int GetCorpusCount() const { return m_corpus_count; }

	/// Add incr (which may be negative) to the count.
	///
	/// A result below zero is clamped to zero.  A result above the
	/// largest int saturates at that maximum: the unguarded addition
	/// would be signed overflow (undefined behavior), which in
	/// practice wrapped negative and silently reset the count to 0.
	void IncrementCorpusCount(int incr)
	{
		const int max_count = std::numeric_limits<int>::max();

		// m_corpus_count >= 0, so only a positive increment can
		// overflow, and max_count - incr cannot itself overflow.
		if (incr > 0 && m_corpus_count > max_count - incr) {
			m_corpus_count = max_count;
			return;
		}
		m_corpus_count += incr;
		check_underflow();
	}

	/// Replace the count with n; a negative n is clamped to zero.
	void SetCorpusCount(int n)
		{ m_corpus_count = n; check_underflow(); }
};
76 #endif // CORPUSCOUNT_H