CompoundCollection.cpp

   1 // Implementation of CCompoundCollection methods
   2 // Copyright © 2009 The University of Chicago
   3 #include "CompoundCollection.h"
   4
   5 #include <QList>
   6 #include "linguisticamainwindow.h"
   7 #include "ui/Status.h"
   8 #include "MiniLexicon.h"
   9 #include "Lexicon.h"
  10 #include "Linker.h"
  11 #include "Edge.h"
  12 #include "Stem.h"
  13 #include "LinkerCollection.h"
  14 #include "WordCollection.h"
  15 #include "log2.h"
  16
  17 class CPhoneCollection;
  18
  19 //////////////////////////////////////////////////////////////////////
  20 // Construction/Destruction
  21 //////////////////////////////////////////////////////////////////////
  22
  23 CCompoundCollection::CCompoundCollection( CMiniLexicon* Lex )
  24 {
  25         m_pMiniLex = Lex;
  26         if( m_pMiniLex ) m_pLexicon = m_pMiniLex->GetLexicon();
  27         m_ComponentCount = 0.0;
  28         m_LinkerCount = 0.0;
  29 }
  30
  31 CCompoundCollection::CCompoundCollection( CLexicon* Lex )
  32 {
  33         m_pMiniLex = NULL;
  34         m_pLexicon = Lex;
  35         m_ComponentCount = 0.0;
  36         m_LinkerCount = 0.0;
  37 }
  38
  39 CCompoundCollection::~CCompoundCollection()
  40 {
  41         m_pMiniLex = NULL;
  42         m_pLexicon = NULL;
  43         m_ComponentCount = 0.0;
  44         m_LinkerCount = 0.0;
  45 }
  46
  47 void CCompoundCollection::FindMostProbableParse()
  48 {
  49         QList<CStem*>*  stemSet;
  50         CMiniLexicon*           mini;
  51         mini = NULL;
  52
  53         CCompound*                      pCompound;
  54         CLinker*                        pLinker;
  55         CEdge*                          pEdge;
  56         CParse                          oneParse;
  57
  58         int j;
  59
  60         linguistica::ui::status_user_agent status = m_pLexicon->status_display();
  61
  62         status.major_operation = "Counting Components";
  63         status.progress.clear();
  64         status.progress.set_denominator(GetCount());
  65         Sort(KEY);
  66         for (int i = 0; i < GetCount(); i++) {
  67                 status.progress = i;
  68                 pCompound = GetAtSort(i);
  69
  70                 double component_count,
  71                            parse_log_prob,
  72                            smallest_plog = 0.0;
  73
  74                 uint most_probable_parse = 0;
  75
  76                 for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
  77                 //for (int z = 0; z < pCompound->GetParses()->size(); z++)
  78                 { //      pEdge = pCompound->GetParses()->at(z);
  79                         pEdge->GetParse( &oneParse );
  80                         parse_log_prob = 0.0;
  81                         for( j = 1; j <= oneParse.Size(); j++ )
  82                         {
  83                                 stemSet = m_pLexicon->GetStemSet( oneParse.GetPiece(j) );
  84                                 if( stemSet )
  85                                 {
  86                                         // TODO: we might be adding the same stem more than once, check and fix
  87                                         component_count = 0.0;
  88                                         if (stemSet->size() > 0) {
  89                                                 for (int y = 0; y < stemSet->size(); y++) {
  90                                                         CStem* pStem = stemSet->at(y);
  91                                                         component_count += pStem->GetCompoundCount();
  92                                                 }
  93
  94                                                 CStem* pStem = stemSet->at(stemSet->size() - 1);
  95                                                 parse_log_prob -= base2log( pStem->GetCompoundCount() / m_ComponentCount + m_LinkerCount );
  96                                         }
  97                                 }
  98                                 else
  99                                 {
 100                                         // Since this is not in the list of stems, it could be an
 101                                         // unanalyzed word. We should only look at the last mini-
 102                                         // lexicon
 103                                         for( i = m_pLexicon->GetMiniSize(); i >= 0; i++ )
 104                                         {
 105                                                 mini = m_pLexicon->GetMiniLexicon(i);
 106                                                 if( mini )
 107                                                 {
 108                                                         if (CStem* pStem = *mini->GetWords() ^= oneParse.GetPiece(j)) {
 109                                                                 parse_log_prob -= base2log( pStem->GetCompoundCount() / m_ComponentCount + m_LinkerCount );
 110                                                         }
 111                                                         break;
 112                                                 }
 113                                         }
 114
 115                                         // Otherwise must be a linker
 116                                         if( !mini )
 117                                         {
 118                                                 pLinker = *m_pLexicon->GetLinkers() ^= oneParse.GetPiece(j);
 119                                                 Q_ASSERT( pLinker );
 120                                                 if( pLinker )
 121                                                 {
 122                                                         parse_log_prob -= base2log( pLinker->GetCompoundCount() / m_ComponentCount + m_LinkerCount );
 123                                                 }
 124                                         }
 125                                 }
 126                         }
 127
 128                         if( parse_log_prob < smallest_plog || smallest_plog == 0.0 )
 129                         {
 130                                 smallest_plog = parse_log_prob;
 131                                 most_probable_parse = pCompound->GetParses()->find( pEdge );
 132                         }
 133                 }
 134                 pCompound->SetBestParse( most_probable_parse );
 135
 136                 m_pLexicon->UpdateCompound( pCompound->Display() );
 137         }
 138         status.progress.clear();
 139         status.major_operation.clear();
 140 }
 141
 142 void CCompoundCollection::CheckAndRecount()
 143 {
 144         CCompound* pCompound;
 145         CEdge* pParse;
 146
 147         CParse parse;
 148
 149         StemSet* pStemSet;
 150         CStem* pStem;
 151
 152         QList<CCompound*> invalidCmpds;
 153
 154         // Reset all stem's compound counts to 0.0
 155         Q3DictIterator<StemSet> it( *m_pLexicon->GetAllStems() );
 156         for( ; it.current(); ++it )
 157         {
 158                 //for( pStem = it.current()->first(); pStem; pStem = it.current()->next() )
 159                 for (int z= 0; z < it.current()->size(); z++)
 160                 {
 161                         pStem = it.current()->at(z);
 162                         pStem->SetCompoundCount( 0.0 );
 163                 }
 164         }
 165
 166         it = Q3DictIterator<StemSet>( *m_pLexicon->GetAllWords() );
 167         for( ; it.current(); ++it )
 168         {
 169                 //for( pStem = it.current()->first(); pStem; pStem = it.current()->next() )
 170                 for (int z = 0; z < it.current()->size(); z++)
 171                 {       pStem = it.current()->at(z);
 172                         pStem->SetCompoundCount( 0.0 );
 173                 }
 174         }
 175
 176         // Count components and remove compounds with missing components
 177         for (int i = 0; i < GetCount(); ++i) {
 178                 pCompound = GetAt(i);
 179
 180                 QList<CEdge*> invalidParses;
 181
 182                 for( pParse = pCompound->GetParses()->first(); pParse; pParse = pCompound->GetParses()->next() )
 183                 //for (int z = 0; z < pCompound->GetParses()->size(); z++)
 184                 {   //    pParse = pCompound->GetParses()->at(z);
 185                         pParse->GetParse( &parse );
 186
 187                         for( int j = 1; j <= parse.Size(); j++ )
 188                         {
 189                                 pStemSet = (*m_pLexicon->GetAllStems())[ parse.GetPiece(j).Display() ];
 190                                 if( !pStemSet ) pStemSet = (*m_pLexicon->GetAllWords())[ parse.GetPiece(j).Display() ];
 191                                 if( !pStemSet )
 192                                 {
 193                                         invalidParses.append( pParse );
 194                                         break;
 195                                 }
 196
 197                                 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
 198                                 for (int y = 0; y < pStemSet->size(); y++)
 199                                 {       pStem = pStemSet->at(y);
 200                                         pStem->IncrementCompoundCount(
 201                                                 double(pCompound->GetCorpusCount()) /
 202                                                 pCompound->GetParses()->count());
 203                                 }
 204                         }
 205                 }
 206
 207                 // Delete invalid parses
 208                 //for( pParse = invalidParses.first(); pParse; pParse = invalidParses.next() )
 209                 for (int x=0; x < invalidParses.size(); x++)
 210                 {       pParse=invalidParses.at(x);
 211                         pCompound->RemoveParse( pParse );
 212                         m_pLexicon->UpdateCompound( pCompound->Display() );
 213                 }
 214
 215                 if( pCompound->GetParses()->count() == 0 ) invalidCmpds.append( pCompound );
 216         }
 217
 218         // Delete invalid compounds
 219         //for( pCompound = invalidCmpds.first(); pCompound; pCompound = invalidCmpds.next() )
 220         for (int w = 0; w < invalidCmpds.size(); w++)
 221         {       pCompound = invalidCmpds.at(w);
 222                 m_pLexicon->UpdateCompound( pCompound->Display() );
 223                 RemoveMember( pCompound );
 224         }
 225 }