HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / LParse.cpp
blob0d4b3f124ee9bc1f359f2ee086c69e90a610bdb6
1 // Implementation of CLParse methods
2 // Copyright © 2009 The University of Chicago
3 #include "LParse.h"
5 #include "StringFunc.h"
6 using linguistica::corpus_count;
8 //--------------------------------------------------------------------
9 // Construction/Destruction
10 //--------------------------------------------------------------------
13 /**
14 Constructs an empty CLParse. <kbd>mini</kbd> is the mini-lexicon
15 that contains this parse.
17 CLParse::CLParse( CMiniLexicon* mini )
18 : CParse(), corpus_count(),
19 m_DoNotParse(false),
20 m_DoomFlag(false),
21 m_Index(0),
22 m_SortIndex(0),
23 m_Trigrams(),
24 m_AlphabetizedForm(),
25 m_pMyMini(mini) { }
27 /**
28 Constructs a CLParse copy of a single QChar. <kbd>c</kbd> is the QChar
29 to be copied. <kbd>mini</kbd> is the mini-lexicon that contains this
30 parse.
32 CLParse::CLParse(const QChar& c, CMiniLexicon* mini)
33 : CParse(c), corpus_count(),
34 m_DoNotParse(false),
35 m_DoomFlag(false),
36 m_Index(0),
37 m_SortIndex(0),
38 m_Trigrams(),
39 m_AlphabetizedForm(),
40 m_pMyMini(mini) { }
42 /**
43 Constructs a CLParse copy of a CParse. <kbd>Parse</kbd> is the parse to be
44 copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
46 CLParse::CLParse(const CParse& Parse, CMiniLexicon* mini)
47 : CParse(Parse), corpus_count(),
48 m_DoNotParse(false),
49 m_DoomFlag(false),
50 m_Index(0),
51 m_SortIndex(0),
52 m_Trigrams(),
53 m_AlphabetizedForm(),
54 m_pMyMini(mini) { }
56 /**
57 Constructs a CLParse copy from a pointer to a CParse. <kbd>pParse</kbd> is the parse to be
58 copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
60 CLParse::CLParse(const CParse* pParse, CMiniLexicon* mini)
61 : CParse(*pParse), corpus_count(),
62 m_DoNotParse(false),
63 m_DoomFlag(false),
64 m_Index(0),
65 m_SortIndex(0),
66 m_Trigrams(),
67 m_AlphabetizedForm(),
68 m_pMyMini(mini) { }
70 /**
71 Constructs a CLParse copy of a CStringSurrogate. <kbd>SS</kbd> is the surrogate to be
72 copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
74 CLParse::CLParse(const CStringSurrogate& SS, CMiniLexicon* mini)
75 : CParse(SS), corpus_count(),
76 m_DoNotParse(false),
77 m_DoomFlag(false),
78 m_Index(0),
79 m_SortIndex(0),
80 m_Trigrams(),
81 m_AlphabetizedForm(),
82 m_pMyMini(mini) { }
84 /**
85 Constructs a copy of another CLParse. <kbd>LParse</kbd> is the other CLParse to be
86 copied.
88 CLParse::CLParse(const CLParse& x)
89 : CParse(x), corpus_count(x),
90 m_DoNotParse(x.m_DoNotParse),
91 m_DoomFlag(x.m_DoomFlag),
92 m_Index(0),
93 m_SortIndex(0),
94 m_Trigrams(x.m_Trigrams != 0 ? new CParse(*x.m_Trigrams) : 0),
95 m_AlphabetizedForm(),
96 m_pMyMini(x.m_pMyMini) { }
98 /**
99 Destroys this CLParse.
101 CLParse::~CLParse()
102 { delete m_Trigrams; }
105 Constructs a copy of another CLParse. <kbd>LParse</kbd> is the other CLParse to be
106 copied.
108 CLParse& CLParse::operator=(const CLParse& LParse)
110 if (&LParse == this) return *this;
112 CParse::operator=(LParse);
113 corpus_count::operator=(LParse);
115 m_DoNotParse = LParse.m_DoNotParse;
116 m_DoomFlag = LParse.m_DoomFlag;
118 Q_ASSERT(m_Trigrams == 0 ||
119 m_Trigrams != LParse.m_Trigrams);
120 delete m_Trigrams;
121 m_Trigrams = new CParse(*LParse.m_Trigrams);
123 m_AlphabetizedForm.clear();
124 m_pMyMini = LParse.m_pMyMini;
126 return *this;
130 //-----------------------------------------------------------------
131 // Other methods
132 //-----------------------------------------------------------------
// PrepareWordForInput: marks cut points in the key around "...", "--",
// trailing/leading punctuation and piece-initial hyphens by calling
// CutRightBeforeHere(), and optionally lower-cases the key.
//
// NOTE(review): Length is read once from GetKeyLength() at the top and is
// never reassigned; every later bound check uses that initial value.
// NOTE(review): the locals `period` and `space` are declared but not
// referenced anywhere in this function.
135 // Prepare this word for input
137 // Parameters:
138 // LowerCaseFlag - if true, all characters
139 // are set to lower case
142 Replaces and removes different punctuation marks and makes other
143 changes to prepare the parse for input into a collection. <kbd>LowerCaseFlag</kbd> should
144 be set to <i>TRUE</i> if the characters of the parse should be made lower case.
145 <kbd>punctuation</kbd> is a list of punctuation marks to be handled.
148 void CLParse::PrepareWordForInput( bool LowerCaseFlag, QString punctuation )
150 int Length = GetKeyLength();
151 // int count = 0;
152 int i, z, loc;
153 QString ellipsis = "...",
154 dash = "--";
155 QChar apostrophe = '\'',
156 period = '.',
157 space = ' ';
// The literal string "EMPTY" is a sentinel meaning "no punctuation list";
// with an empty list the checks below fall back to QChar::isPunct().
159 if( punctuation == "EMPTY" ) punctuation = "";
161 // Nov 4 1999: deal with 3+ periods at word edges, or between
162 // words without white spaces (Tom Sawyer has the former).
163 // Clearly we don't want to get rid of periods inside abbrevations
164 // like U.S.A., but 3+ periods will get treated as a single
165 // punctuation with no internal structure; let's call it
166 // '&'.
168 // if it's a pure number, just delete it.
169 // This is a bad solution regarding syntax, of course,
170 // so this is just temporary --
173 /* for( i=0; i < Length; i++ )
// NOTE(review): the LxStrCmp() call in this loop takes the same arguments
// on every iteration (it does not use i), so it appears to test only one
// fixed position of m_Key; the result `loc` is then only acted on when it
// is 0 -- see the TODO below. Confirm the intended behaviour against
// LxStrCmp()'s contract before changing anything.
178 // dealing with "..."
179 for (i = 0; i < Length-3; i++)
181 // TODO: not sure if this is the correct
182 // change, are we looking for any occurrence
183 // of "..." or just at the beginning of m_Key?
184 loc = LxStrCmp( m_Key, ellipsis, 3, 3);
185 if ( loc == 0 )
187 CutRightBeforeHere( loc );
188 if ( loc < Length-3 )
190 CutRightBeforeHere( i+3 );
// For every "--" found at index i: cut before it (unless it starts the key)
// and cut after it. The `i < Length-2` test repeats the loop condition.
194 // dealing with Dash '--'
196 for (i = 0; i < Length-2; i++)
198 if ( m_Key[i] == '-' && m_Key[i+1] == '-' )
200 if ( i > 0 )
202 CutRightBeforeHere( i );
204 if ( i < Length-2 )
206 CutRightBeforeHere( i+2 );
214 if (LowerCaseFlag)
216 // make lower case;
217 LxStrLwr ( m_Key, GetKeyLength() );
220 // do nothing to a '--' (if that's all it is)
221 if ( LxStrCmp( m_Key, dash, 2, 2 ) == 0 )
223 return;
// Trailing punctuation: a character counts as punctuation if it occurs in
// `punctuation`, or, when that list is empty, if QChar::isPunct() says so.
// An apostrophe at Length-2 is exempt; the Length-3 test has no such
// exemption.
226 // these wrongly break up numbers, don't forget, and currency amounts
228 if (Length > 2) {
229 if ( m_Key[Length-2] != apostrophe &&
230 ( punctuation.find( m_Key[Length-2] ) > -1 ||
231 ( punctuation.length() == 0 && m_Key[Length-2].isPunct() ) ) )
233 CutRightBeforeHere (Length-2);
234 if (Length > 3) { // because the brown corpus has things like " jr.,"
235 if ( ( punctuation.find( m_Key[Length-3] ) > -1 ||
236 ( punctuation.length() == 0 && m_Key[Length-3].isPunct() ) ) )
238 CutRightBeforeHere (Length - 3);
// Same punctuation test applied to the last (z == 1) and second-to-last
// (z == 2) characters, again exempting the apostrophe.
244 for ( z = 1; z < 3; z++)
246 if (Length > 1) {
247 if ( m_Key[Length-z] != apostrophe &&
248 ( punctuation.find( m_Key[Length-z] ) > -1 ||
249 ( punctuation.length() == 0 && m_Key[Length-z].isPunct() ) ) )
251 CutRightBeforeHere (Length-z);
// Leading punctuation: cut right after the first character.
// NOTE(review): no Length check is visible directly on this m_Key[0]
// access -- confirm an enclosing guard exists or that callers never pass
// an empty key.
255 if ( punctuation.find( m_Key[0] ) > -1 ||
256 ( punctuation.length() == 0 && m_Key[0].isPunct() ) )
258 CutRightBeforeHere( 1 );
262 // The following could be done better in an earlier loop, but it needs
263 // to be done: if a Piece of the word now begins with a hyphen, cut the hyphen off:
// Pieces are numbered from 1; m_Pieces[z-1] is used as the index in m_Key
// of the first character of piece z.
264 for ( z = 1; z <= Size(); z++ ) //PieceCount; z++)
266 if ( m_Key[m_Pieces[z-1]] == '-' && ThisPieceLength(z) > 1 )
268 CutRightBeforeHere (m_Pieces[z-1]+1);
272 return ;
275 // Get the trigrams for this LParse
277 // Returns:
278 // CParse* - a pointer to the trigrams
281 Returns the trigrams of this LParse.
283 CParse* CLParse::GetTrigrams()
285 return m_Trigrams;
289 Calculates the trigrams of this LParse.
291 void CLParse::CalculateTrigrams()
293 if( m_Trigrams ) delete m_Trigrams;
294 m_Trigrams = new CParse();
295 m_Trigrams->FindAlphabetizedTrigrams( m_Key, m_KeyLength );
299 // Compare the number of matches between
300 // the trigrams of this and those of another
301 // LParse
303 // Parameters:
304 // Other - the other parse
306 // Returns:
307 // int - the number of matches
310 Returns the number of matches of trigrams in this
311 CLParse and the trigrams of another. <kbd>Other</kbd> is
312 the other CLParse whose trigrams will be compared.
314 int CLParse::CompareTrigrams( CLParse& Other )
316 CalculateTrigrams();
317 return GetTrigrams()->CountOverlapAlphabetized( *Other.GetTrigrams() );