1 // Implementation of CLParse methods
2 // Copyright © 2009 The University of Chicago
5 #include "StringFunc.h"
6 using linguistica::corpus_count
;
8 //--------------------------------------------------------------------
9 // Construction/Destruction
10 //--------------------------------------------------------------------
14 Constructs an empty CLParse. <kbd>mini</kbd> is the mini-lexicon
15 that contains this parse.
// Base-class setup for the empty parse: both CParse and corpus_count are
// default-constructed here.
17 CLParse::CLParse( CMiniLexicon
* mini
)
18 : CParse(), corpus_count(),
28 Constructs a CLParse copy of a single QChar. <kbd>c</kbd> is the QChar
29 to be copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
// The CParse base is built from the single character c; the corpus_count
// base is default-constructed.
32 CLParse::CLParse(const QChar
& c
, CMiniLexicon
* mini
)
33 : CParse(c
), corpus_count(),
43 Constructs a CLParse copy of a CParse. <kbd>Parse</kbd> is the parse to be
44 copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
// The CParse base is copy-constructed from Parse; the corpus_count base is
// default-constructed.
46 CLParse::CLParse(const CParse
& Parse
, CMiniLexicon
* mini
)
47 : CParse(Parse
), corpus_count(),
57 Constructs a CLParse copy from a pointer to a CParse. <kbd>pParse</kbd> is the parse to be
58 copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
// The CParse base is copy-constructed from *pParse; the corpus_count base is
// default-constructed.
// NOTE(review): pParse is dereferenced unconditionally in the base
// initializer, so callers must pass a non-null pointer.
60 CLParse::CLParse(const CParse
* pParse
, CMiniLexicon
* mini
)
61 : CParse(*pParse
), corpus_count(),
71 Constructs a CLParse copy of a CStringSurrogate. <kbd>SS</kbd> is the surrogate to be
72 copied. <kbd>mini</kbd> is the mini-lexicon that contains this parse.
// The CParse base is built from the surrogate SS; the corpus_count base is
// default-constructed.
74 CLParse::CLParse(const CStringSurrogate
& SS
, CMiniLexicon
* mini
)
75 : CParse(SS
), corpus_count(),
85 Constructs a copy of another CLParse. <kbd>x</kbd> is the other CLParse to be copied.
// Copy constructor. Both bases (CParse and corpus_count) are copy-constructed
// from x and the flags are copied verbatim.
// The trigram parse is deep-copied (or left null when x has none), so the two
// objects never share an m_Trigrams allocation; each destructor deletes its own.
// m_pMyMini is copied as a plain pointer: both parses refer to the same
// mini-lexicon object.
88 CLParse::CLParse(const CLParse
& x
)
89 : CParse(x
), corpus_count(x
),
90 m_DoNotParse(x
.m_DoNotParse
),
91 m_DoomFlag(x
.m_DoomFlag
),
94 m_Trigrams(x
.m_Trigrams
!= 0 ? new CParse(*x
.m_Trigrams
) : 0),
96 m_pMyMini(x
.m_pMyMini
) { }
99 Destroys this CLParse.
// Releases the trigram parse held in m_Trigrams (allocated with new elsewhere
// in this class); deleting a null pointer is a no-op, so no guard is needed.
102 { delete m_Trigrams
; }
105 Constructs a copy of another CLParse. <kbd>LParse</kbd> is the other CLParse to be
108 CLParse
& CLParse::operator=(const CLParse
& LParse
)
110 if (&LParse
== this) return *this;
112 CParse::operator=(LParse
);
113 corpus_count::operator=(LParse
);
115 m_DoNotParse
= LParse
.m_DoNotParse
;
116 m_DoomFlag
= LParse
.m_DoomFlag
;
118 Q_ASSERT(m_Trigrams
== 0 ||
119 m_Trigrams
!= LParse
.m_Trigrams
);
121 m_Trigrams
= new CParse(*LParse
.m_Trigrams
);
123 m_AlphabetizedForm
.clear();
124 m_pMyMini
= LParse
.m_pMyMini
;
130 //-----------------------------------------------------------------
132 //-----------------------------------------------------------------
135 // Prepare this word for input
138 // LowerCaseFlag - if true, all characters
139 // are set to lower case
142 Replaces and removes different punctuation marks and makes other
143 changes to prepare the parse for input into a collection. <kbd>LowerCaseFlag</kbd> should
144 be set to <i>TRUE</i> if the characters of the parse should be made lower case.
145 <kbd>punctuation</kbd> is a list of punctuation marks to be handled.
// Prepares the key of this parse for input into a collection: marks cut
// points around ellipses, double dashes and edge punctuation through
// CutRightBeforeHere(), and lower-cases the key with LxStrLwr().
// LowerCaseFlag - presumably guards the LxStrLwr() call below (the guarding
//                 condition is not part of this listing; TODO confirm).
// punctuation   - characters to treat as punctuation; when it is empty,
//                 QChar::isPunct() is used instead.
148 void CLParse::PrepareWordForInput( bool LowerCaseFlag
, QString punctuation
)
// Length is captured once, before any cuts are made.
150 int Length
= GetKeyLength();
153 QString ellipsis
= "...",
155 QChar apostrophe
= '\'',
// The literal string "EMPTY" is a sentinel meaning "no punctuation list".
159 if( punctuation
== "EMPTY" ) punctuation
= "";
161 // Nov 4 1999: deal with 3+ periods at word edges, or between
162 // words without white spaces (Tom Sawyer has the former).
163 // Clearly we don't want to get rid of periods inside abbreviations
164 // like U.S.A., but 3+ periods will get treated as a single
165 // punctuation with no internal structure; let's call it
168 // if it's a pure number, just delete it.
169 // This is a bad solution regarding syntax, of course,
170 // so this is just temporary --
173 /* for( i=0; i < Length; i++ )
178 // dealing with "..."
// NOTE(review): the LxStrCmp() call below does not involve the loop index i,
// so every iteration performs the same comparison; see the TODO that follows.
179 for (i
= 0; i
< Length
-3; i
++)
181 // TODO: not sure if this is the correct
182 // change, are we looking for any occurrence
183 // of "..." or just at the beginning of m_Key?
184 loc
= LxStrCmp( m_Key
, ellipsis
, 3, 3);
187 CutRightBeforeHere( loc
);
188 if ( loc
< Length
-3 )
190 CutRightBeforeHere( i
+3 );
194 // dealing with Dash '--'
// Cut before a "--" pair and again just after it (i and i+2).
196 for (i
= 0; i
< Length
-2; i
++)
198 if ( m_Key
[i
] == '-' && m_Key
[i
+1] == '-' )
202 CutRightBeforeHere( i
);
206 CutRightBeforeHere( i
+2 );
217 LxStrLwr ( m_Key
, GetKeyLength() );
220 // do nothing to a '--' (if that's all it is)
221 if ( LxStrCmp( m_Key
, dash
, 2, 2 ) == 0 )
226 // these wrongly break up numbers, don't forget, and currency amounts
// NOTE(review): m_Key is indexed at Length-2 here; confirm that the enclosing
// condition guarantees Length >= 2.
// An apostrophe in that position is never treated as punctuation.
229 if ( m_Key
[Length
-2] != apostrophe
&&
230 ( punctuation
.find( m_Key
[Length
-2] ) > -1 ||
231 ( punctuation
.length() == 0 && m_Key
[Length
-2].isPunct() ) ) )
233 CutRightBeforeHere (Length
-2);
234 if (Length
> 3) { // because the brown corpus has things like " jr.,"
235 if ( ( punctuation
.find( m_Key
[Length
-3] ) > -1 ||
236 ( punctuation
.length() == 0 && m_Key
[Length
-3].isPunct() ) ) )
238 CutRightBeforeHere (Length
- 3);
// Examine the last two characters of the key (z = 1, 2 counted from the end).
244 for ( z
= 1; z
< 3; z
++)
247 if ( m_Key
[Length
-z
] != apostrophe
&&
248 ( punctuation
.find( m_Key
[Length
-z
] ) > -1 ||
249 ( punctuation
.length() == 0 && m_Key
[Length
-z
].isPunct() ) ) )
251 CutRightBeforeHere (Length
-z
);
// Leading punctuation: cut right after the first character.
255 if ( punctuation
.find( m_Key
[0] ) > -1 ||
256 ( punctuation
.length() == 0 && m_Key
[0].isPunct() ) )
258 CutRightBeforeHere( 1 );
262 // The following could be done better in an earlier loop, but it needs
263 // to be done: if a Piece of the word now begins with a hyphen, cut the hyphen off:
// Pieces are numbered from 1 in this loop; m_Pieces[z-1] is used as the
// index in m_Key of the first character of piece z.
264 for ( z
= 1; z
<= Size(); z
++ ) //PieceCount; z++)
266 if ( m_Key
[m_Pieces
[z
-1]] == '-' && ThisPieceLength(z
) > 1 )
268 CutRightBeforeHere (m_Pieces
[z
-1]+1);
275 // Get the trigrams for this LParse
278 // CParse* - a pointer to the trigrams
281 Returns the trigrams of this LParse.
// Accessor for the trigram parse of this CLParse.
// NOTE(review): the returned pointer presumably refers to m_Trigrams, which
// the destructor deletes, so callers should not delete it — TODO confirm.
283 CParse
* CLParse::GetTrigrams()
289 Calculates the trigrams of this LParse.
291 void CLParse::CalculateTrigrams()
293 if( m_Trigrams
) delete m_Trigrams
;
294 m_Trigrams
= new CParse();
295 m_Trigrams
->FindAlphabetizedTrigrams( m_Key
, m_KeyLength
);
299 // Compare the number of matches between
300 // the trigrams of this and those of another
304 // Other - the other parse
307 // int - the number of matches
310 Returns the number of matches of trigrams in this
311 CLParse and the trigrams of another. <kbd>Other</kbd> is
312 the other CLParse whose trigrams will be compared.
314 int CLParse::CompareTrigrams( CLParse
& Other
)
317 return GetTrigrams()->CountOverlapAlphabetized( *Other
.GetTrigrams() );