HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / Word.cpp
blobb0e25d55cbc8fd1b48ecfd085ed08048eff026ad
1 // Implementation of the CStem class’s word functions.
2 // Copyright © 2009 The University of Chicago
3 #include "Stem.h"
5 #include <Q3TextStream>
6 #include "MiniLexicon.h"
7 #include "Signature.h"
8 #include "Prefix.h"
9 #include "Suffix.h"
10 #include "StringFunc.h"
12 // Get the number of stems
14 // Returns:
15 // int - 0, 1, or 2
17 int CStem::GetNumberOfStems() const
19 int count = m_NumberOfStems;
20 if (count < 2 && m_Stem2Loc ) { count = 2; }
21 return count;
25 // Find out if this word has a prefix
27 // Returns:
28 // bool - true if a prefix has been
29 // marked
31 bool CStem::HasAPrefix() const
33 if (m_PrefixLoc > 0) return true;
34 else return false;
38 // Increment the marked location of the suffix
40 void CStem::IncrementSuffixLocs()
42 if (m_SuffixLoc) m_SuffixLoc++;
46 // Find out if this word has a marked
47 // suffix
49 // Returns:
50 // bool - true if there is a
51 // marked suffix
53 bool CStem::HasASuffix() const
55 if (m_SuffixLoc > 0)
57 return true;
59 else
61 return false;
66 // Shift the stem/suffix boundary n spaces
67 // to the right
69 // Parameters:
70 // n - number of spaces to shift, a
71 // negative value is legal and shifts
72 // to the left
74 void CStem::ShiftStemSuffixBoundary (int n) // how many positions to the right
76 // QString f = "fixador";
77 // Q_ASSERT (CStringSurrogate(m_Key,0,GetKeyLength()) != CStringSurrogate(f.unicode(),0,f.length()) );
78 CStringSurrogate ssSuffix;
80 if ( n >= 0 )
82 if ( m_SuffixLoc == 0)
84 Q_ASSERT (FALSE);
85 return;
88 ssSuffix = GetPiece( m_SuffixLoc );
90 if ( ssSuffix.GetLength() < (int) n )
92 Q_ASSERT (FALSE);
93 return;
95 if ( ssSuffix.GetLength() == (int) n ) // changed Oct 2002
97 ClearRootSuffixSplit();
98 m_SuffixLoc = 0;
100 else
102 m_Pieces[m_StemLoc] += n;
106 else // shift to the left...
108 if ( m_SuffixLoc == 0 )
110 m_StemLoc = 1; // this is too simple -- prefixes --
111 m_SuffixLoc = m_StemLoc + 1;
112 CutRightBeforeHere ( GetKeyLength() + n ); // sept 28 2002
114 else
116 m_Pieces[m_StemLoc] += n;
120 // Shift the prefix/stem boundary n spaces
121 // to the left
123 // Parameters:
124 // n - number of spaces to shift, a
125 // negative value is legal and shifts
126 // to the right
128 void CStem::ShiftPrefixStemBoundary (int n) // how many positions to the left
131 CStringSurrogate ssPrefix;
133 if ( n >= 0 )
135 if ( m_PrefixLoc == 0)
137 Q_ASSERT (FALSE);
138 return;
141 ssPrefix = GetPiece( m_PrefixLoc );
143 if ( ssPrefix.GetLength() < (int) n )
145 Q_ASSERT (FALSE);
146 return;
148 if ( ssPrefix.GetLength() == (int) n ) // changed Oct 2002
150 ClearPrefixStemSplit();
151 m_PrefixLoc = 0;
153 else
155 m_Pieces[m_StemLoc] -= n;
159 else // negative number means shift to the right
161 if ( m_PrefixLoc == 0 )
163 m_StemLoc = 2;
164 m_PrefixLoc = 1;
165 CutRightBeforeHere ( -1 *n ); // sept 28 2002
167 else
169 m_Pieces[m_PrefixLoc] += -1 * n;
174 // Get the marked stem of the word
176 // Returns:
177 // CStringSurrogate - a surrogate string
178 // of the stem
180 CStringSurrogate CStem::GetStem( )
182 if (m_strStem.GetKeyLength() > 0) {
183 return CStringSurrogate(m_strStem);
185 if (m_StemLoc) {
186 return GetPiece( m_StemLoc );
188 else
189 return CStringSurrogate();
193 // Get the marked suffix of the word
195 // Returns:
196 // CStringSurrogate - a surrogate string
197 // of the suffix
199 CStringSurrogate CStem::GetSuffix() const
202 if (m_strSuffix.GetKeyLength() > 0) {
203 return CStringSurrogate (m_strSuffix);
205 if (m_SuffixLoc) {
206 return GetPiece( m_SuffixLoc );
208 else
209 return CStringSurrogate();
213 // Get the marked prefix of the word
215 // Returns:
216 // CStringSurrogate - a surrogate string
217 // of the prefix
219 CStringSurrogate CStem::GetPrefix() const
221 if (m_PrefixLoc)
222 return GetPiece( m_PrefixLoc );
223 else
224 return CStringSurrogate();
228 // Attach a new prefix signature
230 // Parameters:
231 // pSig = pointer to new signature
233 void CStem::AttachPrefixSignature(CSignature* pSig)
235 if (m_pPrefixSignature != 0 && m_pPrefixSignature != pSig)
236 m_pPrefixSignature->DetachWord(this,
237 CSignature::eDo_Not_Call_Words);
238 m_pPrefixSignature = pSig;
241 // Attach a new suffix signature
243 // Parameters:
244 // pSig = pointer to new signature
246 void CStem::AttachSuffixSignature(CSignature* pSig)
248 if (m_pSuffixSignature != 0 && m_pSuffixSignature != pSig)
249 m_pSuffixSignature->DetachWord(this,
250 CSignature::eDo_Not_Call_Words);
251 m_pSuffixSignature = pSig;
254 // Attach a new stem and attach this word
255 // to the stem
257 // Parameters:
258 // pStem - point to new stem
260 void CStem::AttachWordAndSuffixalStem(CStem* pStem)
262 if (m_pStem && m_pStem != pStem)
264 m_pStem->RemoveWordFromWordPtrList( this );
265 m_pStem->DetachSuffix ( m_pSuffix );
267 m_pStem = pStem;
268 if (pStem)
270 pStem->AddWord (this); // it checks whether this is on pStem's list yet.
271 m_pStem->IncrementWordCount();
272 m_pStem->IncrementCorpusCount( GetCorpusCount() - 1 );
278 void CStem::AttachWordAndPrefixalStem(CStem* pStem)
280 if (m_pStem && m_pStem != pStem)
282 m_pStem->RemoveWordFromWordPtrList( this );
283 m_pStem->DetachPrefix( m_pPrefix ); //todo this is causing a problem which
284 //I can't identify -- but I think it should be there. jg
287 m_pStem = pStem;
288 if (pStem)
290 if( pStem->AddWord (this) )
292 m_pStem->IncrementWordCount();
293 m_pStem->IncrementCorpusCount( GetCorpusCount() - 1 );
299 // Attach a new stem and new prefix and
300 // attach this word to both
302 // Parameters:
303 // pNewStem - the new stem
304 // pNewPrefix - the new prefix
306 void CStem::AttachWordStemAndPrefix(CStem* pNewStem, CPrefix* pNewPrefix)
309 // Sending message to the old Prefix that it's being dropped.
310 if (m_pPrefix)
312 m_pPrefix->IncrementCorpusCount ( -GetCorpusCount() );
313 m_pPrefix->RemoveFromStemPtrList ( m_pStem );
314 m_pPrefix->RemoveStemString ( m_pStem->GetKey() );
317 AttachWordAndPrefixalStem(pNewStem); // Also increments stem counts
319 m_pPrefix = pNewPrefix;
320 m_pStem->AddPrefix( m_pPrefix );
321 m_pPrefix->AddStem( m_pStem );
323 m_pPrefix->IncrementUseCount(); // July 2003
324 m_pPrefix->IncrementCorpusCount( GetCorpusCount() - 1 );
325 //***
330 // Attach a new stem and new suffix and
331 // attach this word to both
333 // Parameters:
334 // pNewStem - the new stem
335 // pNewSuffix - the new suffix
337 void CStem::AttachWordStemAndSuffix(CStem* pNewStem, CSuffix* pNewSuffix)
340 // Sending message to the old Suffix that it's being dropped.
341 if (m_pSuffix)
343 m_pSuffix->IncrementCorpusCount ( -GetCorpusCount() );
344 m_pSuffix->RemoveFromStemPtrList ( m_pStem );
345 m_pSuffix->RemoveStemString ( m_pStem->GetKey() );
348 AttachWordAndSuffixalStem(pNewStem); // Also increments stem counts
350 m_pSuffix = pNewSuffix;
351 m_pStem->AddSuffix( m_pSuffix );
352 m_pSuffix->AddStem( m_pStem );
354 m_pSuffix->IncrementUseCount(); // July 2003
355 m_pSuffix->IncrementCorpusCount( GetCorpusCount() - 1 );
356 //***
362 // Find out if this is a valid word
364 // Returns:
365 // bool - true if the word is valid
367 bool CStem::IsValid() const
369 Q_ASSERT (m_StemType >= 0);
370 Q_ASSERT (m_PieceCount < 2 || m_Pieces[1] > 0 );
371 Q_ASSERT ( m_Pieces[m_PieceCount] == GetKeyLength() );
372 if ( m_StemType < 0 ) return false;
375 QString test, appnd;
376 for (int i = 1; i <= m_PieceCount; i++)
378 CStringSurrogate SS = GetPiece(i);
379 LxStrCpy( &SS, appnd, SS.GetLength() );
380 test += appnd;
383 return true; // TODO: ???? what's being tested
387 // Delete the affix/stem separations
389 void CStem::DeleteFactorization()
391 SimplifyParseStructure();
392 m_PrefixLoc = 0;
393 m_StemLoc = 0;
394 m_SuffixLoc = 0;
398 // Clear the pointers to stem, suffix, and
399 // signatures
401 void CStem::ClearPointers()
403 m_pStem = NULL;
404 m_pSuffix = NULL;
405 m_pPrefix = NULL;
406 m_pSuffixSignature = NULL;
407 m_pPrefixSignature = NULL;
411 // Get the successor frequency
413 // Parameters:
414 // n - the position before the break
416 // Returns:
417 // int - the successor frequency
419 int CStem::SF(int n ) const // TODO: more descriptive name
421 CStringSurrogate ssPrefix;
422 if( n <= GetKeyLength() )
424 ssPrefix = Left(n);
425 // TODO: uncomment lines when CNode and CLexicon are available
426 // CNode* pNode = m_Lexicon->GetWords()->GetTrie()->SearchForPrefix ( ssPrefix, Result );
427 // return pNode->GetWidth( );
428 return 0;
430 else return -1; //TODO: is this change OK??
434 // Delete the prefix
436 void CStem::DeletePrefix()
439 Q_ASSERT ( m_PrefixLoc == 1 );
441 int PrefixLength = ThisPieceLength (m_PrefixLoc);
442 int Length = GetKeyLength() - PrefixLength;
444 m_AllocatedLength = GetKeyLength() - PrefixLength +1 ;
445 QChar* NewKey = new QChar[ m_AllocatedLength];
446 LxStrCpy(m_Key, NewKey, Length-PrefixLength, PrefixLength );
447 m_PieceCount--;
450 int* NewPieces = new int [ m_PieceCount - 1 ];
451 for (int i = 1; i <= m_LengthOfPieceVector; i++)
453 NewPieces[i-1] = m_Pieces[i] - m_Pieces[1];
456 delete m_Pieces; m_Pieces = NewPieces;
457 delete [] m_Key; m_Key = NewKey;
459 m_LengthOfPieceVector--;
460 m_StemLoc--;
461 m_SuffixLoc--;
462 if (m_Stem2Loc) { m_Stem2Loc--; }
463 m_pPrefix = NULL;
467 // Get a CParse copy of this word in broken
468 // form
470 // Returns:
471 // CParse - <stem>.+.<suffix>
473 CParse CStem::DisplayBrokenForm()
475 QString plus = "+";
476 if ( m_BrokenForm == NULL)
478 m_BrokenForm = new CParse();
479 if ( m_pSuffix )
481 m_BrokenForm->Append ( GetStem() );
482 m_BrokenForm->Append ( plus );
483 m_BrokenForm->Append ( GetSuffix() );
485 else
487 m_BrokenForm->Append ( CStringSurrogate(m_Key,0,m_KeyLength) );
491 return *m_BrokenForm;
495 void CStem::OutputWord(Q3TextStream& outf, int index,
496 QMap<QString, QString>* filter)
498 QString pieces = "";
500 for (int j = 1; j <= Size(); ++j) {
501 pieces += GetPiece(j).Display(filter);
502 if (j < Size())
503 pieces += ' ';
506 outf << " ";
508 outf.setf(2);
509 outf.width(5);
510 outf << index + 1;
511 outf << " ";
513 outf.unsetf(2);
514 outf.width(4);
515 outf << Size();
516 outf << " ";
518 outf.setf(2);
519 outf.width(40);
520 outf << pieces;
521 outf << " ";
523 outf.unsetf(2);
524 outf.width(12);
525 outf << GetCorpusCount();
526 outf << " ";
528 outf.width(10);
529 outf << GetPrefixLoc();
530 outf << " ";
532 outf.width(8);
533 outf << GetStemLoc();
534 outf << " ";
536 outf.width(10);
537 outf << GetSuffixLoc();
538 outf << " ";
540 if (GetSuffixLoc() == 0) {
541 if (GetPrefixLoc() == 0)
542 outf << "NONE";
543 else if (CSignature* sig = m_pPrefixSignature)
544 outf << sig->Display('.', filter) << ' ';
545 else
546 outf << "NONE";
547 } else if (CSignature* sig = m_pSuffixSignature) {
548 outf << sig->Display('.', filter) << ' ';
549 } else {
550 outf << "NONE";
553 outf << endl;
556 bool CStem::IsAnalyzed()
558 if (m_pSuffixSignature) return TRUE;
559 if (m_pPrefixSignature) return TRUE;
560 return FALSE;