HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / SignatureCollection_PartsOfSpeech.cpp
blob7e936e5bb2554193484654366560e4f32bd5c589
1 // Part of speech discovery
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
5 #include <memory>
6 #include <QTextStream>
7 #include <QIODevice>
8 #include <QFile>
9 #include <QString>
10 #include "SignatureCollection.h"
11 #include "Signature.h"
12 #include "POS.h"
13 #include "POSCollection.h"
14 #include "Parse.h"
15 #include "HTML.h"
17 QString HTMLfileStart("<html><head> "
18 "<link rel=\"stylesheet\" type=\"text/css\" href=\"lxa.css\"> "
19 "</head>"
20 "<body><table>");
22 void CMiniLexicon::FindMajorSignatures()
24 // XXX. add more options, for prefixes, etc.
25 // XXX. Make this a user-defined parameter
26 const int NumberOfTopSignatures = 50;
27 const int MinimumSignatureSize = 2;
28 const QString filename = "c:\\PartsOfSpeech.html";
30 // open log file
31 QFile file(filename);
32 if (!file.open(QIODevice::WriteOnly))
33 // open failed
34 return;
35 QTextStream outf(&file);
37 if (m_pPOS == 0) m_pPOS = new LxPoSCollection;
39 outf << HTMLfileStart;
40 outf << StartTableRow << TableData ("Signature") << EndTableRow;
43 // For each signature: if it's good enough, make it a part of speech
44 int TooBigToIgnoreRankCutoff = 10;
45 m_pSignatures->FindDisplayOrdering();
46 for (int signo = 0, count = 0;
47 signo < m_pSignatures->GetCount() &&
48 count < NumberOfTopSignatures;
49 ++signo) {
50 CSignature* pSig = m_pSignatures->GetAtSort(signo);
52 outf << "<tr><td>" << pSig->Display('-') << "</td>";
54 // hack: simulate labelled continue statement
55 struct not_eligible { };
56 try {
57 // a sig P is a "mentor" to sig Q if P is more robust
58 // than Q and P contains Q
59 if (pSig->GetMentor() != 0)
60 // not a part of speech: subsumed under mentor
61 throw not_eligible();
63 if (pSig->Size() < MinimumSignatureSize)
64 // not a part of speech: too simple
65 throw not_eligible();
67 if (pSig->GetMentorList()->count() < 2 && signo > TooBigToIgnoreRankCutoff )
68 // it has no mentees, and it is not among the very biggest signatures
69 throw not_eligible();
72 for (LxPoS* qPOS = m_pPOS->first();
73 qPOS != 0; qPOS=m_pPOS->next()) {
74 // If pSig has exactly one more affix than a
75 // signature already in PoS, then pSig can't
76 // be a PoS, and its extra affix is entered as a satellite.
77 CSignature* qSig = qPOS->GetSignature();
79 if (qSig->Size() + 1 == pSig->Size() &&
80 qSig->Contains (pSig)) {
81 CParse Suffix = qSig->Intersection(*pSig);
82 qSig->AppendSatelliteAffix (Suffix);
83 outf<< "<td>" << Suffix.Display();
84 throw not_eligible();
87 } catch (not_eligible) { continue; }
89 std::auto_ptr<LxPoS> new_pos(new LxPoS(pSig, this));
90 LxPoS* pPOS = new_pos.get();
92 // Record affix frequencies
93 for (int k = 1; k <= pSig->Size(); ++k) {
94 pPOS->SetPieceValue( k, pSig->GetNumberOfStems() );
95 outf << "<td>"<<pSig->GetPiece(k).Display() << "</td>";
98 // Calculate robustness
99 pPOS->AddRobustness(pSig->GetRobustness());
101 // Save part-of-speech
102 m_pPOS->append(new_pos.release());
103 ++count;
105 outf << "<td>"<< pSig->GetNumberOfStems() <<"</td></tr>";
106 CSignature *qSig;
107 // For each mentee:
108 for (int signo = 0; signo < pSig->GetMentorList()->size(); signo++)
109 { qSig = pSig->GetMentorList()->at(signo);
110 outf << StartTableRow << TableData (qSig->Display('-')) ;
111 pPOS->AppendSignature( qSig );
112 // Adjust robustness for mentee.
113 pPOS->AddRobustness( qSig->GetRobustness() );
115 //Adjust stem count
116 for (int stemno = 0; stemno < qSig->GetNumberOfStems(); stemno++)
118 pPOS->AddStem (qSig->GetStem(stemno));
122 // Adjust affix frequencies for mentee.
123 for (int affixno = 1; affixno <= qSig->Size(); ++affixno) {
124 pPOS->IncrementPieceValue( qSig->GetPiece(affixno), qSig->GetNumberOfStems() );
125 outf << "<td>" << qSig->GetPiece(affixno).Display() <<"</td><td>" <<qSig->GetNumberOfStems() <<"</td><td>";
128 outf << EndTableRow ;
129 }// end of signo loop
132 // Write affix frequencies to log.
133 for (int m = 1; m <= pPOS->Size(); m++) {
134 outf << "<tr><td>" << pPOS->GetPiece(m).Display() <<"</td><td>"
135 << pPOS->GetPieceValue(m)<<"</td></tr>";
138 outf <<"</table></body></html>";