Stem.h

   1 // Common type for stems to which to attach affixes and words to analyze
   2 // Copyright © 2009 The University of Chicago
   3 #ifndef STEM_H
   4 #define STEM_H
   5
   6 class CStem;
   7
   8 #include "LParse.h"
   9 #include <QString>
  10 #include <QMap>
  11 #include "SparseVector.h"
  12 #include "Parse.h"
  13 #include "StemListViewItem.h"
  14 template<class V> class QList;
  15
  16 /// CStem objects are the key to signature discovery.
  17 ///
  18 /// A CStem represents a morpheme to which an affix might be attached
  19 /// on the left or right.  The CStem itself may be further analyzed,
  20 /// so that in the representation as stem + affix each word has an
  21 /// implied parse tree.
  22 ///
  23 /// A single instance represents all uses of a stem, and its main
  24 /// purpose is to remember the list of affixes it appears with so
  25 /// they can be considered as a potential signature.  In the
  26 /// prefix/suffix-based morphology code, first the affix and stem
  27 /// collections are built together and then the signature collection is
  28 /// built using data stored with the stems.
  29 ///
  30 /// Once a signature is built, CStem instances maintain the information
  31 /// a signature needs to remember about each stem it appears with.  In
  32 /// particular, the corpus count of a stem is kept here.
  33 ///
  34 /// Compound discovery code uses CStem instances in a similar way:
  35 /// the various parses of a word, the likelihood that a parse is of the
  36 /// form stem + affix rather than stem + stem, and other relevant data
  37 /// are maintained in CStem instances before compound objects are built,
  38 /// and afterwards the compound objects defer to the underlying CStem
  39 /// for some relevant counts.
  40 ///
  41 /// Lastly, phonological information in Linguistica is stored per-word,
  42 /// in the hope that it can help improve the discovery of
  43 /// affix boundaries and compounds.  In particular, the representation
  44 /// length (information content) of a stem or word disregarding
  45 /// morphology can be used in calculating its description length
  46 /// for building more complex models.
  47 ///
  48 /// Sometimes a CStem is used in ways that emphasize its role as
  49 /// something to be analyzed (i.e., something to be split: an entire
  50 /// word or a stem from a MiniLexicon’s word collection), while at
  51 /// other times, a CStem is primarily a constituent for other stems or
  52 /// compounds.  We say in the former case that the CStem is “playing
  53 /// the role of a word”, and in the latter, “it plays the role of a
  54 /// stem”, even though in both cases the representation is the same.
  55 /// This distinction comes up when displaying a word collection or
  56 /// stem collection through the UI: only the properties relevant to
  57 /// the role considered are displayed.
  58 class CStem : public CLParse {
  59 protected:
  60         static class CLexicon* m_Lexicon; ///< Shared by every CStem; assigned through SetLexicon().
  61 public:
  62         enum type { // Classification stored in m_StemType; each value is a distinct power of two.
  63                 NORMAL            = 1,
  64                 STEM_PLUS_SUFFIX  = 2,
  65                 STEM_NORMAL       = 4,
  66                 STEM_COMPOUND     = 8,
  67                 BIWORD_COMPOUND   = 16,
  68                 MULTIPLE_COMPOUND = 32,
  69                 POSSIBLE_COMPOUND = 64,
  70                 NUMBER            = 128,
  71                 UNKNOWN           = 256,
  72                 ENDS_IN_HYPHEN    = 512,
  73                 POLYWORD_PIECE    = 1024,
  74         };
  75         enum eAddBoundarySymbols { // Argument type of CreatePhonologyFromOrthography().
  76                 NO_BOUNDARIES,
  77                 BOUNDARIES,
  78         };
  79         enum ePhonologySplitType { // Argument type of SplitPhonologyToTiers().
  80                 Split_LeaveSlot,
  81                 Split_LeaveCopy,
  82         };
  83 protected:
  84         int             m_WordCount; ///< Exposed through GetWordCount() / SetWordCount().
  85         CParse*         m_BrokenForm;
  86         CParse          m_SuffixList; //sister affixes, not daughter
  87         class CSignature* m_pSuffixSignature;
  88         class CSignature* m_pPrefixSignature;
  89         CParse          m_PrefixList; //sister affixes, not daughter
  90         int             m_Regular;
  91         bool            m_SimpleFlag; //if TRUE, then it's not further decomposable.
  92         enum type       m_StemType; ///< Returned by both GetStemType() and GetWordType().
  93         int             m_StemLoc;
  94         int             m_Stem2Loc;
  95         int             m_NumberOfStems;
  96         int             m_PrefixLoc;
  97         int             m_SuffixLoc;
  98         QString         m_Confidence; ///< Free-form text; see SetConfidence() / AppendToConfidence().
  99         CStem*          m_pStem;
 100         CParse          m_strStem;
 101         CParse          m_strSuffix;
 102         CParse          m_strPrefix;
 103         class CSuffix*  m_pSuffix;
 104         class CPrefix*  m_pPrefix;
 105         mutable double  m_LengthOfPointerToMe; ///< Based on corpus counts, and the Stem collection set.
 106
 107         QList<CStem*>* m_WordPtrList; ///< Nothing in this header guarantees it is non-null; GetWord()/GetNumberOfWords() check it.
 108         CSparseVector   m_LeftNeighbors;
 109         CSparseVector   m_RightNeighbors;
 110
 111         // compounding.
 112         class           CEarleyParser* m_MyEarleyParser;
 113         double          m_CompoundCount;
 114         double          m_Affixness;
 115
 116         // phonology.
 117         CParse          m_Phonology_Tier1;
 118         CParse          m_Phonology_Tier2;
 119         CParse          m_Phonology_Tier1_Skeleton;
 120
 121         double          m_UnigramLogProb;
 122         double          m_BigramLogProb;
 123
 124         double          m_BigramComplexity; // average log prob
 125         double          m_UnigramComplexity; // average log prob
 126         mutable double  m_PhonologicalContent; // value depends on what we have computed so far.
 127         double          m_HMM_LogProbability;
 128
 129         // First Boltzmann model: only MI between tier 2 neighbors:
 130         double          m_Tier2_LocalMI_Score; //Only the MI on tier 2
 131         double          m_LocalMI_TotalBoltzmannScore; // Total score, including tier 1 bigram score
 132         double          m_LocalMI_Plog;
 133
 134         // Second Boltzmann model: MI between more distant tier 2 elements
 135         double          m_Tier2_DistantMI_Score;
 136         double          m_DistantMI_TotalBoltzmannScore;
 137         double          m_DistantMI_Plog;
 138
 139 public:
 140         // Some Tier one Phonology Info for Graphica display
 141         QMap<int, QString> m_phonologies;
 142         QMap<int, double> m_unigrams;
 143         QMap<int, double> m_mis;
 144         int             m_countofunigrams;
 145         int             m_countofmis;
 146         double          m_maxpositive;
 147         double          m_maxnegative;
 148         bool            m_donephonology;
 149
 150 public:
 151         // construction/destruction.
 152
 153         CStem(class CMiniLexicon* mini = 0);
 154         CStem(const CStem& x);
 155         CStem(const class CStringSurrogate&, class CMiniLexicon* mini = 0);
 156         CStem(const CLParse&);
 157         CStem(const CParse&, class CMiniLexicon* mini = 0);
 158         virtual ~CStem();
 159
 160         // copy assignment.
 161
 162         void    operator=(const CStem &); // NOTE: returns void, so assignments cannot be chained.
 163         void    Copy(CStem&);
 164
 165         // description length.
 166         double GetLengthOfPointerToMe() const;
 167         double GetLengthOfPointerToMe_2();      ///< deprecated
 168         void SetLengthOfPointerToMe(double L) { m_LengthOfPointerToMe = L; }
 169         double CalculatePhonologicalInformationContent(class CLexicon*) const;
 170         double GetPhonologicalInformationContent(class CLexicon* = 0) const;
 171         float CalculateDL() const;
 172
 173         // phonology.
 174         CParse* GetPhonology_Tier1();
 175         CParse* GetPhonology_Tier2();
 176         CParse* GetPhonology_Tier1_Skeleton();
 177         double GetTier2_LocalMI_Score() const { return m_Tier2_LocalMI_Score; }
 178         double GetLocalMI_TotalBoltzmannScore();
 179         double GetLocalMI_Plog() const { return m_LocalMI_Plog; }
 180         double GetTier2_DistantMI_Score() const { return m_Tier2_DistantMI_Score; }
 181         double GetDistantMI_TotalBoltzmannScore();
 182         double GetDistantMI_Plog() const { return m_DistantMI_Plog; }
 183         double GetHMM_LogProbability() const { return m_HMM_LogProbability; }
 184         void ComputeProbabilities(class CWordCollection* words);
 185         void ComputeBoltzmannProbabilities(double Z, double ZStar);
 186         void GetPhonogyTier1InfoForGraph(class CWordCollection* words); // (sic) misspelled name kept for compatibility.
 187         QString GetProbabilityInformation();
 188         void SplitPhonologyToTiers(enum ePhonologySplitType type,
 189                 CParse& PhonesToMove);
 190         void CreateCVTemplate(CParse* Vowels);
 191         void CreatePhonologyFromOrthography(enum eAddBoundarySymbols = BOUNDARIES);
 192
 193         // queries about the current analysis.
 194         bool                    IsAnalyzed();
 195         bool                    ContainsPrefix(class CPrefix*) const;
 196         bool                    ContainsPrefix(const class CStringSurrogate&) const;
 197         bool                    ContainsSuffix(class CSuffix*) const;
 198
 199         CParse  DisplayBrokenForm();    // for MT, etc.
 200         QString DisplayStemType() const;
 201 //      TODO: get CRule  int FindRule (CStem*, CRule&) const;
 202
 203         // accessors
 204
 205         double                  GetAffixness()      const   { return m_Affixness; }
 206         double                  GetCompoundCount()  const   { return m_CompoundCount; }
 207         QString                 GetConfidence()     const   { return m_Confidence; }
 208         int                     GetCorpusCount()    const   { return linguistica::corpus_count::GetCorpusCount(); }
 209         class  CEarleyParser*   GetMyEarleyParser() const   { return m_MyEarleyParser;}
 210         int                     GetNumberOfPrefixes() const { return m_PrefixList.Size(); }
 211         int                     GetNumberOfStems() const;
 212         int                     GetNumberOfSuffixes() const { return m_SuffixList.Size(); }
 213         void                    GetPrefix(CParse&) const;
 214         class CStringSurrogate  GetPrefix()         const;
 215         CParse*                 GetPrefixList()             { return &m_PrefixList; }
 216         int                     GetPrefixLoc()      const   { return m_PrefixLoc; }
 217         class   CPrefix*        GetPrefixPtr()      const   { return m_pPrefix; }
 218         class   CSignature*     GetPrefixSignature() const  { return m_pPrefixSignature; }
 219         int                     GetRegular()        const   { return m_Regular; }
 220         bool                    GetSimpleFlag()     const   { return m_SimpleFlag; }
 221         float                   GetSortingQuantity() const; // TODO
 222         QString                 GetSortingString();
 223         void                    GetStem(CParse&)    const;
 224         class CStringSurrogate  GetStem();
 225         int                     GetStem2Loc()       const { return m_Stem2Loc; }
 226         int                     GetStemLoc()        const { return m_StemLoc; }
 227         CStem*                  GetStemPtr()        const { return m_pStem; }
 228         enum type               GetStemType()       const { return m_StemType; }
 229         void                    GetSuffix(CParse&)  const;
 230         class CStringSurrogate  GetSuffix()         const;
 231         CParse*                 GetSuffixList()             { return &m_SuffixList; }
 232         int                     GetSuffixLoc()      const { return m_SuffixLoc; }
 233         class CSuffix*          GetSuffixPtr()      const { return m_pSuffix; }
 234         class CSignature*       GetSuffixSignature() const  { return m_pSuffixSignature; }
 235         int                     GetWordCount() const { return m_WordCount; }
 236
 237
 238         //Phonology
 239         double                  GetUnigramLogProb() const   { return m_UnigramLogProb; }
 240         double                  GetBigramLogProb()  const   { return m_BigramLogProb; }
 241         CParse*                 GetTier1()                  { return &m_Phonology_Tier1; }
 242         const CParse*           GetTier1()          const   { return &m_Phonology_Tier1; }
 243         CParse*                 GetTier2()                  { return &m_Phonology_Tier2; }
 244         const CParse*           GetTier2()          const   { return &m_Phonology_Tier2; }
 245         CParse*                 GetTier1_Skeleton()         { return &m_Phonology_Tier1_Skeleton; }
 246         const   CParse*         GetTier1_Skeleton() const   { return &m_Phonology_Tier1_Skeleton; }
 247
 248         double                  GetUnigramComplexity() const { return m_UnigramComplexity; }
 249         double                  GetBigramComplexity() const { return m_BigramComplexity; }
 250         // Word list: GetNumberOfWords() is 0 and GetWord() is 0 when m_WordPtrList is null; GetWord() is also 0 for an out-of-range index.
 251         QList<CStem*>*          GetWordPtrList()                                { return m_WordPtrList; }
 252         CStem*                  GetWord(int wordno)                     const { return (m_WordPtrList && wordno >= 0 && wordno < m_WordPtrList->count()) ? m_WordPtrList->at(wordno) : 0; }
 253         int                     GetNumberOfWords()                     const { return m_WordPtrList ? m_WordPtrList->count() : 0; }
 254         enum type               GetWordType()                           const   { return m_StemType; }
 255
 256         bool                    HasAPrefix() const;
 257         bool                    HasASuffix() const;
 258
 259         bool                    IsValid() const;
 260         int                     SF(int) const; // SuccessorFrequency;
 261         void                    StemListDisplay(class Q3ListView* dest,  QMap<QString, QString>* filter = 0, int char_count = 27);
 262         void                    WordListDisplay(class Q3ListView* dest,
 263                 QMap<QString, QString>* filter = 0,
 264                 enum CWordListViewItem::display_mode =
 265                         CWordListViewItem::MiniLexicon_MorphologyStuffFirst,
 266                 int char_count = 27);
 267
 268         // typical filter: m_pMyMini->GetOutFilter()
 269         void OutputStem(class Q3TextStream& outf, int index,
 270                 QMap<QString, QString>* filter);
 271         void OutputWord(class Q3TextStream& outf, int index,
 272                 QMap<QString, QString>* filter);
 273
 274         // mutators.
 275
 276         void                    AddNULLPrefix();
 277         void                    AddNULLSuffix();
 278         void                    AddPrefix(class CPrefix*);
 279         void                    AddPrefix(const class CStringSurrogate&);
 280         void                    AddSuffix(class CSuffix*);
 281         void                    AddSuffix(const class CStringSurrogate&);
 282         bool                    AddWord(CStem*);
 283         void                    AppendToConfidence(const QString& string)       { m_Confidence += string; }
 284         void                    AttachPrefixSignature(class CSignature*);
 285         void                    AttachSuffixSignature(class CSignature*);
 286         void                    AttachWordAndSuffixalStem(CStem*);
 287         void                    AttachWordAndPrefixalStem(CStem*);
 288         void                    AttachWordStemAndPrefix(CStem*, class CPrefix*);
 289         void                    AttachWordStemAndSuffix(CStem*, class CSuffix*);
 290
 291         class CSignature*       ChangeSuffixSignature(class CSignature* pNewSig);
 292         void                    ClearPointers(); // to Stem, Suffix, Signature;
 293         void                    ClearPrefixStemSplit();
 294         void                    ClearRootSuffixSplit();
 295         void                    CopyStemInformation(CStem*);
 296         void                    CopySuffixList(CParse*);
 297
 298         void                    DeleteFactorization();
 299         void                    DeletePrefix();  // Arabic morphology
 300         void                    DetachPrefix(class CPrefix*);
 301         void                    DetachSuffix(class CSuffix*);
 302
 303         void                    IncrementSuffixLocs();
 304         void                    IncrementCompoundCount(double d = 1.0)            { m_CompoundCount += d; }
 305         void                    IncrementWordCount(int n = 1);
 306
 307
 308         void                    RepairSuffixList(const class CMiniLexicon*);
 309         void                    RemoveWordFromWordPtrList(CStem*);
 310
 311         void                    SetAffixness(double d)                          { m_Affixness = d; }
 312         void                    SetCompoundCount(double d)                      { m_CompoundCount = d; }
 313         void                    SetConfidence(const QString& conf)              { m_Confidence = conf; }
 314         static void             SetLexicon(CLexicon* Lex)                       { m_Lexicon = Lex; }
 315         void                    SetNumberOfStems(int n)                         { m_NumberOfStems = n; }
 316         void                    SetPrefixLoc(int n)                             { m_PrefixLoc = n; }
 317         void                    SetPrefixPtr(class CPrefix* pPre)               { m_pPrefix = pPre; }
 318         void                    SetPrefixSignature(class CSignature* pSig)      { m_pPrefixSignature = pSig; }
 319         void                    SetStem2Loc(int n)                              { m_Stem2Loc = n; }
 320         void                    SetStemLoc(int n)                               { m_StemLoc = n; }
 321         void                    SetStemPtr(CStem* pStm)                         { m_pStem = pStm; }
 322         void                    SetStemType(enum type e)                        { m_StemType = e; }
 323         void                    SetSuffixList(CParse* pParse)                   { m_SuffixList = pParse; }
 324         void                    SetSuffixLoc(int n)                             { m_SuffixLoc = n; }
 325         void                    SetSuffixPtr(class CSuffix* pSuf)               { m_pSuffix = pSuf; }
 326         void                    SetSuffixSignature(class CSignature* pSig)      { m_pSuffixSignature = pSig; }
 327         void                    SetWordCount(int n)                             { m_WordCount = n; }
 328         void                    SetWordType(enum type WT)                       { m_StemType = WT; } // Writes the same field as SetStemType().
 329         void                    SetMyEarleyParser(class CEarleyParser* parser)  { m_MyEarleyParser = parser; }
 330         void                    SetPhonology_Tier1(CParse*);
 331         void                    ShiftStemSuffixBoundary(int);
 332         void                    ShiftPrefixStemBoundary(int);
 333         void                    SetHMM_LogProbability(double logprob)           { m_HMM_LogProbability = logprob;}
 334 };
 335
 336 inline CParse* CStem::GetPhonology_Tier1() { return &m_Phonology_Tier1; } // Mutable pointer to the tier-1 member; same result as the non-const GetTier1().
 337 inline CParse* CStem::GetPhonology_Tier2() { return &m_Phonology_Tier2; } // Mutable pointer to the tier-2 member; same result as the non-const GetTier2().
 338 inline CParse* CStem::GetPhonology_Tier1_Skeleton() // Mutable pointer to the tier-1 skeleton member; same result as the non-const GetTier1_Skeleton().
 339 { return &m_Phonology_Tier1_Skeleton; }
 340 inline double CStem::GetLocalMI_TotalBoltzmannScore() // Returns the stored m_LocalMI_TotalBoltzmannScore by value; nothing is computed here.
 341 { return m_LocalMI_TotalBoltzmannScore; }
 342 inline double CStem::GetDistantMI_TotalBoltzmannScore() // Returns the stored m_DistantMI_TotalBoltzmannScore by value; nothing is computed here.
 343 { return m_DistantMI_TotalBoltzmannScore; }
 344
 345 #endif // STEM_H