1 // Part of speech discovery
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
10 #include "SignatureCollection.h"
11 #include "Signature.h"
13 #include "POSCollection.h"
17 QString
HTMLfileStart("<html><head> "
18 "<link rel=\"stylesheet\" type=\"text/css\" href=\"lxa.css\"> "
22 void CMiniLexicon::FindMajorSignatures()
24 // XXX. add more options, for prefixes, etc.
25 // XXX. Make this a user-defined parameter
26 const int NumberOfTopSignatures
= 50;
27 const int MinimumSignatureSize
= 2;
28 const QString filename
= "c:\\PartsOfSpeech.html";
32 if (!file
.open(QIODevice::WriteOnly
))
35 QTextStream
outf(&file
);
37 if (m_pPOS
== 0) m_pPOS
= new LxPoSCollection
;
39 outf
<< HTMLfileStart
;
40 outf
<< StartTableRow
<< TableData ("Signature") << EndTableRow
;
43 // For each signature: if it's good enough, make it a part of speech
44 int TooBigToIgnoreRankCutoff
= 10;
45 m_pSignatures
->FindDisplayOrdering();
46 for (int signo
= 0, count
= 0;
47 signo
< m_pSignatures
->GetCount() &&
48 count
< NumberOfTopSignatures
;
50 CSignature
* pSig
= m_pSignatures
->GetAtSort(signo
);
52 outf
<< "<tr><td>" << pSig
->Display('-') << "</td>";
54 // hack: simulate labelled continue statement
55 struct not_eligible
{ };
57 // a sig P is a "mentor" to sig Q if P is more robust
58 // than Q and P contains Q
59 if (pSig
->GetMentor() != 0)
60 // not a part of speech: subsumed under mentor
63 if (pSig
->Size() < MinimumSignatureSize
)
64 // not a part of speech: too simple
67 if (pSig
->GetMentorList()->count() < 2 && signo
> TooBigToIgnoreRankCutoff
)
68 // it has no mentees, and it is not among the very biggest signatures
72 for (LxPoS
* qPOS
= m_pPOS
->first();
73 qPOS
!= 0; qPOS
=m_pPOS
->next()) {
74 // If pSig has exactly one more affix than a
75 // signature already in PoS, then pSig can't
76 // be a PoS, and its extra affix is entered as a satellite.
77 CSignature
* qSig
= qPOS
->GetSignature();
79 if (qSig
->Size() + 1 == pSig
->Size() &&
80 qSig
->Contains (pSig
)) {
81 CParse Suffix
= qSig
->Intersection(*pSig
);
82 qSig
->AppendSatelliteAffix (Suffix
);
83 outf
<< "<td>" << Suffix
.Display();
87 } catch (not_eligible
) { continue; }
89 std::auto_ptr
<LxPoS
> new_pos(new LxPoS(pSig
, this));
90 LxPoS
* pPOS
= new_pos
.get();
92 // Record affix frequencies
93 for (int k
= 1; k
<= pSig
->Size(); ++k
) {
94 pPOS
->SetPieceValue( k
, pSig
->GetNumberOfStems() );
95 outf
<< "<td>"<<pSig
->GetPiece(k
).Display() << "</td>";
98 // Calculate robustness
99 pPOS
->AddRobustness(pSig
->GetRobustness());
101 // Save part-of-speech
102 m_pPOS
->append(new_pos
.release());
105 outf
<< "<td>"<< pSig
->GetNumberOfStems() <<"</td></tr>";
108 for (int signo
= 0; signo
< pSig
->GetMentorList()->size(); signo
++)
109 { qSig
= pSig
->GetMentorList()->at(signo
);
110 outf
<< StartTableRow
<< TableData (qSig
->Display('-')) ;
111 pPOS
->AppendSignature( qSig
);
112 // Adjust robustness for mentee.
113 pPOS
->AddRobustness( qSig
->GetRobustness() );
116 for (int stemno
= 0; stemno
< qSig
->GetNumberOfStems(); stemno
++)
118 pPOS
->AddStem (qSig
->GetStem(stemno
));
122 // Adjust affix frequencies for mentee.
123 for (int affixno
= 1; affixno
<= qSig
->Size(); ++affixno
) {
124 pPOS
->IncrementPieceValue( qSig
->GetPiece(affixno
), qSig
->GetNumberOfStems() );
125 outf
<< "<td>" << qSig
->GetPiece(affixno
).Display() <<"</td><td>" <<qSig
->GetNumberOfStems() <<"</td><td>";
128 outf
<< EndTableRow
;
129 }// end of signo loop
132 // Write affix frequencies to log.
133 for (int m
= 1; m
<= pPOS
->Size(); m
++) {
134 outf
<< "<tr><td>" << pPOS
->GetPiece(m
).Display() <<"</td><td>"
135 << pPOS
->GetPieceValue(m
)<<"</td></tr>";
138 outf
<<"</table></body></html>";