LParse.cpp

   1 // Implementation of CLParse methods
   2 // Copyright © 2009 The University of Chicago
   3 #include "LParse.h"
   4
   5 #include "StringFunc.h"
   6 using linguistica::corpus_count;
   7
   8 //--------------------------------------------------------------------
   9 // Construction/Destruction
  10 //--------------------------------------------------------------------
  11
  12
  13 /**
  14         Constructs an empty CLParse. <kbd>mini</kbd> is the mini-lexicon
  15         that contains this parse.
  16  */
  17 CLParse::CLParse( CMiniLexicon* mini )
  18         : CParse(), corpus_count(),
  19         m_DoNotParse(false),
  20         m_DoomFlag(false),
  21         m_Index(0),
  22         m_SortIndex(0),
  23         m_Trigrams(),
  24         m_AlphabetizedForm(),
  25         m_pMyMini(mini) { }
  26
  27 /**
  28         Constructs a CLParse copy of a single QChar. <kbd>c</kbd> is the QChar
  29         to be copied. <kbd>mini</kbd> is the mini-lexicon that contains this
  30         parse.
  31  */
  32 CLParse::CLParse(const QChar& c, CMiniLexicon* mini)
  33         : CParse(c), corpus_count(),
  34         m_DoNotParse(false),
  35         m_DoomFlag(false),
  36         m_Index(0),
  37         m_SortIndex(0),
  38         m_Trigrams(),
  39         m_AlphabetizedForm(),
  40         m_pMyMini(mini) { }
  41
  42 /**
  43         Constructs a CLParse copy of a CParse. <kbd>Parse</kbd> is the parse to be
  44         copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
  45  */
  46 CLParse::CLParse(const CParse& Parse, CMiniLexicon* mini)
  47         : CParse(Parse), corpus_count(),
  48         m_DoNotParse(false),
  49         m_DoomFlag(false),
  50         m_Index(0),
  51         m_SortIndex(0),
  52         m_Trigrams(),
  53         m_AlphabetizedForm(),
  54         m_pMyMini(mini) { }
  55
  56 /**
  57         Constructs a CLParse copy from a pointer to a CParse. <kbd>pParse</kbd> is the parse to be
  58         copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
  59  */
  60 CLParse::CLParse(const CParse* pParse, CMiniLexicon* mini)
  61         : CParse(*pParse), corpus_count(),
  62         m_DoNotParse(false),
  63         m_DoomFlag(false),
  64         m_Index(0),
  65         m_SortIndex(0),
  66         m_Trigrams(),
  67         m_AlphabetizedForm(),
  68         m_pMyMini(mini) { }
  69
  70 /**
  71         Constructs a CLParse copy of a CStringSurrogate. <kbd>SS</kbd> is the surrogate to be
  72         copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
  73  */
  74 CLParse::CLParse(const CStringSurrogate& SS, CMiniLexicon* mini)
  75         : CParse(SS), corpus_count(),
  76         m_DoNotParse(false),
  77         m_DoomFlag(false),
  78         m_Index(0),
  79         m_SortIndex(0),
  80         m_Trigrams(),
  81         m_AlphabetizedForm(),
  82         m_pMyMini(mini) { }
  83
  84 /**
  85         Constructs a copy of another CLParse. <kbd>LParse</kbd> is the other CLParse to be
  86         copied.
  87  */
  88 CLParse::CLParse(const CLParse& x)
  89         : CParse(x), corpus_count(x),
  90         m_DoNotParse(x.m_DoNotParse),
  91         m_DoomFlag(x.m_DoomFlag),
  92         m_Index(0),
  93         m_SortIndex(0),
  94         m_Trigrams(x.m_Trigrams != 0 ? new CParse(*x.m_Trigrams) : 0),
  95         m_AlphabetizedForm(),
  96         m_pMyMini(x.m_pMyMini) { }
  97
  98 /**
  99         Destroys this CLParse.
 100  */
 101 CLParse::~CLParse()
 102 { delete m_Trigrams; }
 103
 104 /**
 105         Constructs a copy of another CLParse. <kbd>LParse</kbd> is the other CLParse to be
 106         copied.
 107  */
 108 CLParse& CLParse::operator=(const CLParse& LParse)
 109 {
 110         if (&LParse == this) return *this;
 111
 112         CParse::operator=(LParse);
 113         corpus_count::operator=(LParse);
 114
 115         m_DoNotParse = LParse.m_DoNotParse;
 116         m_DoomFlag = LParse.m_DoomFlag;
 117
 118         Q_ASSERT(m_Trigrams == 0 ||
 119                 m_Trigrams != LParse.m_Trigrams);
 120         delete m_Trigrams;
 121         m_Trigrams = new CParse(*LParse.m_Trigrams);
 122
 123         m_AlphabetizedForm.clear();
 124         m_pMyMini = LParse.m_pMyMini;
 125
 126         return *this;
 127 }
 128
 129
 130 //-----------------------------------------------------------------
 131 // Other methods
 132 //-----------------------------------------------------------------
 133
 134
 135 // Prepare this word for input
 136 //
 137 // Parameters:
 138 //              LowerCaseFlag - if true, all characters
 139 //              are set to lower case
 140
  141 /**
  142         Prepares this parse for input into a collection by cutting its key into
  143         pieces around "..." runs, "--" runs and punctuation at the edges of the
  144         key (every cut goes through CutRightBeforeHere), and by optionally
  145         lower-casing the key.

              <kbd>LowerCaseFlag</kbd>: when true, the key is passed to LxStrLwr
              after the "..." and "--" passes and before the punctuation passes.

              <kbd>punctuation</kbd>: the characters to treat as punctuation. The
              literal string "EMPTY" is replaced by the empty string. When the
              list is empty, QChar::isPunct() decides instead. An apostrophe in
              one of the examined trailing positions is never cut off; the
              leading-character test has no such exemption.

              The key length is read once on entry and is not refreshed after
              any cut; all offsets below are computed from that value.
  146  */
  147
  148 void    CLParse::PrepareWordForInput( bool LowerCaseFlag, QString punctuation )
  149 {
  150         int             Length = GetKeyLength();   // read once; every offset below uses this value
  151 //      int             count = 0;
  152         int             i, z, loc;
  153         QString ellipsis = "...",
  154                         dash = "--";
              // NOTE(review): 'period' and 'space' are declared but never used
              // anywhere in this function.
  155         QChar   apostrophe = '\'',
  156                         period = '.',
  157                         space = ' ';
  158
              // "EMPTY" is a sentinel for "no explicit punctuation list".
  159         if( punctuation == "EMPTY" ) punctuation = "";
  160
  161         // Nov 4 1999: deal with 3+ periods at word edges, or between
  162         //      words without white spaces (Tom Sawyer has the former).
  163         //  Clearly we don't want to get rid of periods inside abbreviations
  164         //      like U.S.A., but 3+ periods will get treated as a single
  165         //      punctuation with no internal structure; let's call it
  166         //      '&'.
  167
  168         // if it's a pure number, just delete it.
  169         // This is a bad solution regarding syntax, of course,
  170         // so this is just temporary --
  171
  172
  173 /*      for( i=0; i < Length; i++ )
  174         {
  175
  176         }*/
  177
  178         //  dealing with "..."
              // NOTE(review): the LxStrCmp call below does not depend on i, so
              // every iteration performs the same comparison. When it yields 0,
              // loc is 0, so the first cut is CutRightBeforeHere(0), and the
              // inner test (loc < Length-3) is always true inside this loop
              // because the loop condition already guarantees 0 <= i < Length-3.
              // Only the second cut, at i+3, varies between iterations.
  179         for (i = 0; i < Length-3; i++)
  180         {
  181                 // TODO: not sure if this is the correct
  182                 // change, are we looking for any occurrence
  183                 // of "..." or just at the beginning of m_Key?
  184                 loc = LxStrCmp( m_Key, ellipsis, 3, 3);
  185                 if ( loc == 0  )
  186                 {
  187                         CutRightBeforeHere( loc );
  188                         if ( loc < Length-3 )
  189                         {
  190                                 CutRightBeforeHere( i+3 );
  191                         }
  192                 }
  193         }
  194         // dealing with Dash '--'
              // For each "--" found at offset i: cut before it (unless it starts
              // the key) and cut after it, at i+2.
              // NOTE(review): the loop stops at i == Length-3, so the inner test
              // (i < Length-2) is always true here, and a "--" occupying the last
              // two characters of the key is never examined by this loop. Looks
              // like an off-by-one in the loop bound -- confirm before changing.
  195
  196         for (i = 0; i < Length-2; i++)
  197         {
  198                 if ( m_Key[i] == '-' && m_Key[i+1] == '-' )
  199                 {
  200                         if ( i > 0  )
  201                         {
  202                                 CutRightBeforeHere( i );
  203                         }
  204                         if ( i < Length-2 )
  205                         {
  206                                 CutRightBeforeHere( i+2 );
  207                         }
  208                 }
  209
  210         }
  211
  212
  213
  214         if (LowerCaseFlag)
  215         {
  216                 // make lower case;
  217                 LxStrLwr ( m_Key, GetKeyLength() );
  218         }
  219
  220         // do nothing to a '--' (if that's all it is)
              // Note that this early return comes after the lower-casing step
              // above, and skips all of the punctuation passes below.
  221         if ( LxStrCmp( m_Key, dash, 2, 2 ) == 0 )
  222         {
  223                 return;
  224         }
  225
  226         // these wrongly break up numbers, don't forget, and currency amounts
  227
              // Second-to-last character: if it counts as punctuation and is not
              // an apostrophe, cut before it; then, for keys longer than three
              // characters, apply the same punctuation test (without the
              // apostrophe exemption) to the third-to-last character.
  228         if (Length > 2) {
  229                 if ( m_Key[Length-2] != apostrophe &&
  230                          ( punctuation.find( m_Key[Length-2] ) > -1 ||
  231                            ( punctuation.length() == 0 && m_Key[Length-2].isPunct() ) ) )
  232                 {
  233                         CutRightBeforeHere (Length-2);
  234                         if (Length > 3) {                                       // because the brown corpus has things like " jr.,"
  235                                 if ( ( punctuation.find( m_Key[Length-3] ) > -1 ||
  236                                    ( punctuation.length() == 0 && m_Key[Length-3].isPunct() ) ) )
  237                                 {
  238                                         CutRightBeforeHere (Length - 3);
  239                                 }
  240                         }
  241                 }
  242         }
  243
              // z = 1 examines the last character, z = 2 the second-to-last
              // (which the block above has already examined when Length > 2).
              // The leading-character test sits inside the loop, so it is
              // evaluated on both iterations.
  244         for ( z = 1; z < 3; z++)
  245         {
  246                 if (Length > 1) {
  247                         if ( m_Key[Length-z] != apostrophe &&
  248                                  ( punctuation.find( m_Key[Length-z] ) > -1 ||
  249                                ( punctuation.length() == 0 && m_Key[Length-z].isPunct() ) ) )
  250                         {
  251                                 CutRightBeforeHere (Length-z);
  252                         }
  253
  254
                              // Leading punctuation: cut right after the first character.
  255                         if ( punctuation.find( m_Key[0] ) > -1 ||
  256                              ( punctuation.length() == 0 && m_Key[0].isPunct() ) )
  257                         {
  258                                 CutRightBeforeHere( 1 );
  259                         }
  260                 }
  261         }
  262         // The following could be done better in an earlier loop, but it needs
  263         // to be done: if a Piece of the word now begins with a hyphen, cut the hyphen off:
              // Presumably m_Pieces[z-1] is the key offset at which piece z
              // starts -- TODO confirm against CParse. Size() is re-evaluated on
              // every iteration.
  264         for ( z = 1; z <= Size(); z++ ) //PieceCount; z++)
  265         {
  266                 if ( m_Key[m_Pieces[z-1]] == '-' && ThisPieceLength(z) > 1 )
  267                 {
  268                         CutRightBeforeHere (m_Pieces[z-1]+1);
  269                 }
  270         }
  271
  272         return ;
  273 }
 274
 275 // Get the trigrams for this LParse
 276 //
 277 // Returns:
 278 //              CParse* - a pointer to the trigrams
 279
 280 /**
 281         Returns the trigrams of this LParse.
 282  */
 283 CParse* CLParse::GetTrigrams()
 284 {
 285         return m_Trigrams;
 286 }
 287
 288 /**
 289         Calculates the trigrams of this LParse.
 290  */
 291 void CLParse::CalculateTrigrams()
 292 {
 293         if( m_Trigrams ) delete m_Trigrams;
 294         m_Trigrams = new CParse();
 295         m_Trigrams->FindAlphabetizedTrigrams( m_Key, m_KeyLength );
 296 }
 297
 298
 299 // Compare the number of matches between
 300 // the trigrams of this and those of another
 301 // LParse
 302 //
 303 // Parameters:
 304 //              Other - the other parse
 305 //
 306 // Returns:
 307 //              int - the number of matches
 308
 309 /**
 310         Returns the number of matches of trigrams in this
 311         CLParse and the trigrams of another. <kbd>Other</kbd> is
 312         the other CLParse whose trigrams will be compared.
 313  */
 314 int     CLParse::CompareTrigrams( CLParse& Other )
 315 {
 316         CalculateTrigrams();
 317         return  GetTrigrams()->CountOverlapAlphabetized( *Other.GetTrigrams() );
 318 }