Updated German translation
[dasher.git] / Src / DasherCore / AlphabetManager.cpp
bloba0f8ef339a12d3e7e9f16afd312bd16583970dd3
1 // AlphabetManager.cpp
2 //
3 // Copyright (c) 2007 The Dasher Team
4 //
5 // This file is part of Dasher.
6 //
7 // Dasher is free software; you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation; either version 2 of the License, or
10 // (at your option) any later version.
12 // Dasher is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
17 // You should have received a copy of the GNU General Public License
18 // along with Dasher; if not, write to the Free Software
19 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "../Common/Common.h"
23 #include "AlphabetManager.h"
24 #include "ConversionManager.h"
25 #include "DasherInterfaceBase.h"
26 #include "DasherNode.h"
27 #include "Event.h"
28 #include "Observable.h"
29 #include "NodeCreationManager.h"
30 #include "LanguageModelling/PPMLanguageModel.h"
31 #include "LanguageModelling/WordLanguageModel.h"
32 #include "LanguageModelling/DictLanguageModel.h"
33 #include "LanguageModelling/MixtureLanguageModel.h"
34 #include "LanguageModelling/PPMPYLanguageModel.h"
35 #include "LanguageModelling/CTWLanguageModel.h"
36 #include "FileWordGenerator.h"
38 #include <vector>
39 #include <sstream>
40 #include <iostream>
41 #include "string.h"
43 using namespace Dasher;
// Track memory leaks on Windows to the line that new'd the memory.
// Only active when both _WIN32 and _DEBUG_MEMLEAKS are defined: from here on,
// every `new` in this file expands to DEBUG_NEW, which passes THIS_FILE and
// __LINE__ along with the allocation.
#if defined(_WIN32) && defined(_DEBUG_MEMLEAKS)
#define DEBUG_NEW new( _NORMAL_BLOCK, THIS_FILE, __LINE__ )
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
55 CAlphabetManager::CAlphabetManager(CSettingsUser *pCreateFrom, CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet)
56 : CSettingsUser(pCreateFrom), m_pBaseGroup(NULL), m_pInterface(pInterface), m_pNCManager(pNCManager), m_pAlphabet(pAlphabet), m_pLastOutput(NULL) {
59 const string &CAlphabetManager::GetLabelText(symbol i) const {
60 return m_pAlphabet->GetDisplayText(i);
63 void CAlphabetManager::Setup() {
64 InitMap();
66 for (char c=33; (c&0x80)==0; c++) {
67 string s(&c,1);
68 if (m_map.Get(s)==0) {
69 m_sDelim = s;
70 break;
73 //else, if all single-octet chars are in alphabet - leave m_sDelim==""
74 // (and we'll find a delimiter for each context)
76 CreateLanguageModel();
79 void CAlphabetManager::InitMap() {
80 int iPara = m_pAlphabet->GetParagraphSymbol();
81 if (iPara) m_map.AddParagraphSymbol(iPara);
82 int i;
83 for(i = 1; i < m_pAlphabet->iEnd; i++) // 1-indexed
84 if (i!=iPara) m_map.Add(m_pAlphabet->GetText(i), i);
86 /*ACL I'm really not sure where conversion characters should/shouldn't be included.
87 They seemed to be included in the Alphabet Map, i.e. for reading training text via GetSymbols;
88 but a TODO comment suggested they should _not_ be included in GetNumberSymbols(),
89 and I couldn't find any code which would have called e.g. GetText on them.
90 Moreover, if these characters are put into the AlphabetMap, they'll be fed into the
91 LanguageModel just as any other "symbol", but with an out-of-bounds symbol number!
92 (So maybe the range of allowed symbol numbers is wrong?). Hence, not including them atm.
93 If they were needed, we could do something like the following:
94 if (StartConvertCharacter)
95 map->Add(StartConvertCharacter->Text, ++i);
96 if (EndConvertCharacter)
97 map->Add(EndConvertCharacter->Text, ++i);
101 void CAlphabetManager::CreateLanguageModel() {
102 // FIXME - return to using enum here
103 switch (GetLongParameter(LP_LANGUAGE_MODEL_ID)) {
104 default:
105 // If there is a bogus value for the language model ID, we'll default
106 // to our trusty old PPM language model.
107 case 0:
108 m_pLanguageModel = new CPPMLanguageModel(this, m_pAlphabet->iEnd-1);
109 break;
110 case 2:
111 m_pLanguageModel = new CWordLanguageModel(this, m_pAlphabet, &m_map);
112 break;
113 case 3:
114 m_pLanguageModel = new CMixtureLanguageModel(this, m_pAlphabet, &m_map);
115 break;
116 case 4:
117 m_pLanguageModel = new CCTWLanguageModel(m_pAlphabet->iEnd-1);
118 break;
122 CTrainer *CAlphabetManager::GetTrainer() {
123 return new CTrainer(m_pInterface, m_pLanguageModel, m_pAlphabet, &m_map);
126 void CAlphabetManager::MakeLabels(CDasherScreen *pScreen) {
127 m_pBaseGroup->RecursiveDelete();
128 for (vector<CDasherScreen::Label *>::iterator it=m_vLabels.begin(); it!=m_vLabels.end(); it++)
129 delete (*it);
130 m_vLabels.clear();
131 for (map<const SGroupInfo *,CDasherScreen::Label *>::iterator it=m_mGroupLabels.begin(); it!=m_mGroupLabels.end(); it++)
132 delete it->second;
133 m_mGroupLabels.clear();
134 m_pBaseGroup = copyGroups(m_pAlphabet,pScreen);
137 SGroupInfo *CAlphabetManager::copyGroups(const SGroupInfo *pBase, CDasherScreen *pScreen) {
138 if (pBase==NULL) return NULL;
139 DASHER_ASSERT(pBase->iNumChildNodes); //zero-element groups elided by CAlphIO
140 if (m_vLabels.size()<pBase->iEnd) m_vLabels.resize(pBase->iEnd);
141 string strGroupPrefix; int iBkgCol(-1);
142 SGroupInfo * const next=copyGroups(pBase->pNext, pScreen);
143 while (pBase->iNumChildNodes==1) {
144 //were about to create a group node, which would have only one child
145 // (eventually, if the group node were PopulateChildren'd).
146 // Such a child would entirely fill it's parent (the group), and thus,
147 // creation/destruction of the child would cause the node's colour to flash
148 // between that for parent group and child.
149 // Hence, instead we elide the group node and create the child _here_...
151 //1. however we also have to take account of the appearance of the elided group. Hence:
152 strGroupPrefix += pBase->strLabel;
153 if (pBase->bVisible) iBkgCol=pBase->iColour;
154 //2. group might contain a single subgroup, or a single symbol...
155 if (!pBase->pChild) {
156 //single symbol. Create its label, taking account of enclosing groups...
157 // (symbols are never transparent)
158 DASHER_ASSERT(pBase->iEnd == pBase->iStart+1);
159 string symLabel = strGroupPrefix + GetLabelText(pBase->iStart);
160 m_vLabels[pBase->iStart]=(symLabel.empty() ? NULL : pScreen->MakeLabel(symLabel));
161 //then skip this group, return any siblings
162 return next;
164 //...a subgroup, so go into it
165 pBase = pBase->pChild;
166 DASHER_ASSERT(pBase->pNext==NULL); //can't have siblings as parent has only one child
167 //hence, original 'next' pointer is still valid
168 //3. loop round...
170 //in or reached nontrivial subgroup - so make node for entire group
171 //First, make (unpefixed) labels for all children in (original) group
172 // (children of subgroups that are later elided, will have labels made at elision time)
174 SGroupInfo *pChild=pBase->pChild;
175 for (int i=pBase->iStart; i<pBase->iEnd;)
176 if (!pChild || i<pChild->iStart) {
177 const string &symLabel(GetLabelText(i));
178 m_vLabels[i] = (symLabel.empty() ? NULL : pScreen->MakeLabel(symLabel));
179 i++;
180 } else {
181 i=pChild->iEnd;
182 pChild = pChild->pNext;
185 SGroupInfo *pRes = new SGroupInfo(*pBase);
186 //apply properties of enclosing group(s)...
187 pRes->strLabel = strGroupPrefix + pRes->strLabel;
188 if (!pRes->bVisible)
189 if ((pRes->iColour = iBkgCol)!=-1) pRes->bVisible=true;
190 if (pRes->strLabel.length())
191 m_mGroupLabels[pRes] = pScreen->MakeLabel(pRes->strLabel);
192 //siblings (of this group or elided parent) copied already, from original
193 // (passed-in) pBase: if pBase unchanged, then still valid, whereas if pBase
194 // was changed by the above loop to be a subgroup of the original, then the subgroup
195 // has no children, so should be spliced in place of the original pBase.
196 pRes->pNext = next;
198 //recurse on children
199 pRes->pChild = copyGroups(pRes->pChild, pScreen);
200 DASHER_ASSERT(pRes->iNumChildNodes>1);
201 return pRes;
204 CWordGeneratorBase *CAlphabetManager::GetGameWords() {
205 CFileWordGenerator *pGen = new CFileWordGenerator(m_pInterface, m_pAlphabet, &m_map);
206 pGen->setAcceptUser(true);
207 if (!GetStringParameter(SP_GAME_TEXT_FILE).empty()) {
208 const string &gtf(GetStringParameter(SP_GAME_TEXT_FILE));
209 if (pGen->ParseFile(gtf,true)) return pGen;
210 ///TRANSLATORS: the string "GameTextFile" is the name of a setting in gsettings
211 /// (or equivalent), and should not be translated. The %s is the value of that
212 /// setting (this message displayed only if the user has provided a value)
213 m_pInterface->FormatMessageWithString(_("Note: GameTextFile setting specifies game sentences file '%s' but this does not exist"),gtf.c_str());
215 if (!m_pAlphabet->GetGameModeFile().empty()) {
216 //TODO, try user dir first / give one or other priority?
217 // This will concatenate all - which doesn't seem too bad...?
218 m_pInterface->ScanFiles(pGen, m_pAlphabet->GetGameModeFile());
219 if (pGen->HasLines()) return pGen;
221 pGen->setAcceptUser(false);
222 m_pInterface->ScanFiles(pGen, m_pAlphabet->GetTrainingFile());
223 if (pGen->HasLines()) return pGen;
224 delete pGen;
225 return NULL;
228 const CAlphInfo *CAlphabetManager::GetAlphabet() const {
229 return m_pAlphabet;
232 CAlphabetManager::~CAlphabetManager() {
233 //the alphabet belongs to the AlphIO, and may be reused later
234 delete m_pLanguageModel;
237 void CAlphabetManager::WriteTrainFileFull(CDasherInterfaceBase *pInterface) {
238 if (strTrainfileBuffer == "") return;
239 if (strTrainfileContext != "") {
240 //If context begins with the default, skip that - it'll be entered by Trainer 1st anyway
241 string defCtx(m_pAlphabet->GetDefaultContext());
242 if (strTrainfileContext.substr(0,defCtx.length()) == defCtx)
243 strTrainfileContext = strTrainfileContext.substr(defCtx.length());
244 string sDelim(m_sDelim);
245 if (sDelim == "") {
246 //find a character not in the context we want to write out
247 char c=33;
248 while (strTrainfileContext.find(c)!=strTrainfileContext.length()) c++; //will terminate, context is ~~5 chars
249 sDelim = string(&c,1);
251 strTrainfileBuffer = m_pAlphabet->GetContextEscapeChar() + sDelim + strTrainfileContext + sDelim + strTrainfileBuffer;
252 strTrainfileContext="";
254 pInterface->WriteTrainFile(m_pAlphabet->GetTrainingFile(), strTrainfileBuffer);
255 strTrainfileBuffer="";
258 int CAlphabetManager::GetColour(symbol sym, int iOffset) const {
259 int iColour = m_pAlphabet->GetColour(sym);
261 // This is for backwards compatibility with old alphabet files -
262 // ideally make this log a warning (unrelated TODO: automate
263 // validation of alphabet files, plus maintenance of repository
264 // etc.)
265 if(iColour == -1)
266 iColour = (sym % 3) + 10;
268 // Loop on low colours for nodes (TODO: go back to colour namespaces?)
269 // letters 10-109, alternate colours 140-239
270 if((iOffset&1) == 0 && iColour < 110)
271 iColour += 130;
273 return iColour;
277 CAlphabetManager::CAlphBase::CAlphBase(int iOffset, int iColour, CDasherScreen::Label *pLabel, CAlphabetManager *pMgr)
278 : CDasherNode(iOffset, iColour, pLabel), m_pMgr(pMgr) {
281 void CAlphabetManager::CAlphBase::Output() {
282 if (m_pMgr->m_pLastOutput && m_pMgr->m_pLastOutput == Parent())
283 m_pMgr->m_pLastOutput=this;
284 //Case where lastOutput != Parent to subclasses, if they want to.
285 //Note if lastOutput==NULL, we leave it - so the first letter written after startup,
286 // will register as a context switch and write out an empty/default context.
289 void CAlphabetManager::CAlphBase::Undo() {
290 if (m_pMgr->m_pLastOutput==this) m_pMgr->m_pLastOutput = Parent();
292 CAlphabetManager::CAlphNode::CAlphNode(int iOffset, int iColour, CDasherScreen::Label *pLabel, CAlphabetManager *pMgr)
293 : CAlphBase(iOffset, iColour, pLabel, pMgr), m_pProbInfo(NULL) {
296 CAlphabetManager::CSymbolNode::CSymbolNode(int iOffset, CDasherScreen::Label *pLabel, CAlphabetManager *pMgr, symbol _iSymbol)
297 : CAlphNode(iOffset, pMgr->GetColour(_iSymbol, iOffset), pLabel, pMgr), iSymbol(_iSymbol) {
300 CAlphabetManager::CSymbolNode::CSymbolNode(int iOffset, int iColour, CDasherScreen::Label *pLabel, CAlphabetManager *pMgr, symbol _iSymbol)
301 : CAlphNode(iOffset, iColour, pLabel, pMgr), iSymbol(_iSymbol) {
304 CAlphabetManager::CGroupNode::CGroupNode(int iOffset, CDasherScreen::Label *pLabel, int iBkgCol, CAlphabetManager *pMgr, const SGroupInfo *pGroup)
305 : CAlphNode(iOffset,
306 pGroup==pMgr->m_pBaseGroup ? ((iOffset&1) ? 7 : 137) //special case for root nodes
307 : (pGroup->bVisible ? pGroup->iColour : iBkgCol),
308 pLabel, pMgr), m_pGroup(pGroup) {
309 if (!m_pGroup->bVisible) SetFlag(NF_VISIBLE, false);
312 CAlphabetManager::CAlphNode *CAlphabetManager::GetRoot(CDasherNode *pParent, bool bEnteredLast, int iOffset) {
313 //pParent is not a parent, just for document/context.
314 int iNewOffset(max(-1,iOffset-1));
316 pair<symbol, CLanguageModel::Context> p = GetContextSymbols(pParent, iNewOffset, &m_map);
318 CAlphNode *pNewNode;
319 if(p.first==0 || !bEnteredLast) {
320 //couldn't extract last symbol (so probably using default context), or shouldn't
321 pNewNode = new CGroupNode(iNewOffset, NULL, 0, this, m_pBaseGroup); //default background colour
322 } else {
323 //new node represents a symbol that's already happened - i.e. user has already steered through it;
324 // so either we're rebuilding, or else creating a new root from existing text (in edit box)
325 DASHER_ASSERT(!pParent);
326 pNewNode = CreateSymbolRoot(iNewOffset, p.second, p.first);
327 pNewNode->SetFlag(NF_SEEN, true);
328 pNewNode->CDasherNode::SetFlag(NF_COMMITTED, true); //do NOT commit!
330 pNewNode->iContext = p.second;
331 return pNewNode;
334 CAlphabetManager::CAlphNode *CAlphabetManager::CreateSymbolRoot(int iOffset, CLanguageModel::Context ctx, symbol sym) {
335 return new CSymbolNode(iOffset, m_vLabels[sym], this, sym);
338 pair<symbol, CLanguageModel::Context> CAlphabetManager::GetContextSymbols(CDasherNode *pParent, int iRootOffset, const CAlphabetMap *pAlphMap) {
339 vector<symbol> vContextSymbols; bool bHaveFinalSymbol = true;
340 //no context is ever available at offset -1 (=choice between symbols with offset 0)
341 if (iRootOffset!=-1) {
342 // TODO: make the LM get the context, rather than force it to fix max context length as an int
343 int iStart = max(0, iRootOffset - m_pLanguageModel->GetContextLength());
344 if(pParent) {
345 pParent->GetContext(m_pInterface, pAlphMap, vContextSymbols, iStart, iRootOffset+1 - iStart);
346 } else {
347 pAlphMap->GetSymbols(vContextSymbols, m_pInterface->GetContext(iStart, iRootOffset+1 - iStart));
350 for (std::vector<symbol>::iterator it = vContextSymbols.end(); it!=vContextSymbols.begin();) {
351 if (*(--it) == 0) {
352 //found an impossible symbol! erase from beginning up to it (inclusive)
353 vContextSymbols.erase(vContextSymbols.begin(), ++it);
354 break;
358 if (vContextSymbols.empty()) {
359 bHaveFinalSymbol = false;
360 pAlphMap->GetSymbols(vContextSymbols, m_pAlphabet->GetDefaultContext());
363 CLanguageModel::Context iContext = m_pLanguageModel->CreateEmptyContext();
365 //enter the symbols we could make sense of, into the LM context...
366 for (vector<symbol>::iterator it=vContextSymbols.begin(); it != vContextSymbols.end(); it++) {
367 m_pLanguageModel->EnterSymbol(iContext, *it);
369 return pair<symbol,CLanguageModel::Context>(bHaveFinalSymbol ? vContextSymbols[vContextSymbols.size()-1] : 0, iContext);
372 bool CAlphabetManager::CSymbolNode::GameSearchNode(symbol sym) {
373 if (sym == iSymbol) {
374 SetFlag(NF_GAME, true);
375 return true;
377 return false;
379 bool CAlphabetManager::CGroupNode::GameSearchNode(symbol sym) {
380 if (sym >= m_pGroup->iStart && sym < m_pGroup->iEnd) {
381 if (GetFlag(NF_ALLCHILDREN)) {
382 if (!GameSearchChildren(sym)) //recurse, to mark game child also
383 DASHER_ASSERT(false); //sym within this group, should definitely be found!
385 SetFlag(NF_GAME, true);
386 return true;
388 DASHER_ASSERT(!GameSearchChildren(sym));
389 return false;
392 void CAlphabetManager::CSymbolNode::GetContext(CDasherInterfaceBase *pInterface, const CAlphabetMap *pAlphabetMap, vector<symbol> &vContextSymbols, int iOffset, int iLength) {
393 if (!GetFlag(NF_SEEN) && iOffset+iLength-1 == offset()) {
394 if (iLength > 1) Parent()->GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength-numChars());
395 vContextSymbols.push_back(iSymbol);
396 } else {
397 CDasherNode::GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength);
401 symbol CAlphabetManager::CSymbolNode::GetAlphSymbol() {
402 return iSymbol;
405 void CAlphabetManager::CSymbolNode::PopulateChildren() {
406 m_pMgr->IterateChildGroups(this, m_pMgr->m_pBaseGroup, NULL);
408 int CAlphabetManager::CAlphNode::ExpectedNumChildren() {
409 int i=m_pMgr->m_pBaseGroup->iNumChildNodes;
410 return (m_pMgr->GetBoolParameter(BP_CONTROL_MODE)) ? i+1 : i;
413 void CAlphabetManager::GetProbs(vector<unsigned int> *pProbInfo, CLanguageModel::Context context) {
414 const unsigned int iSymbols = m_pBaseGroup->iEnd-1;
416 // TODO - sort out size of control node - for the timebeing I'll fix the control node at 5%
417 // TODO: New method (see commented code) has been removed as it wasn' working.
419 const unsigned long iNorm(m_pNCManager->GetAlphNodeNormalization());
420 //the case for control mode on, generalizes to handle control mode off also,
421 // as then iNorm - control_space == iNorm...
422 const unsigned int iUniformAdd = max(1ul, ((iNorm * GetLongParameter(LP_UNIFORM)) / 1000) / iSymbols);
423 const unsigned long iNonUniformNorm = iNorm - iSymbols * iUniformAdd;
424 // m_pLanguageModel->GetProbs(context, Probs, iNorm, ((iNorm * uniform) / 1000));
426 //ACL used to test explicitly for MandarinDasher and if so called GetPYProbs instead
427 // (by statically casting to PPMPYLanguageModel). However, have renamed PPMPYLanguageModel::GetPYProbs
428 // to GetProbs as per ordinary language model, so no need to test....
429 m_pLanguageModel->GetProbs(context, *pProbInfo, iNonUniformNorm, 0);
431 DASHER_ASSERT(pProbInfo->size() == iSymbols+1);//initial 0
433 for(unsigned int k(1); k < pProbInfo->size(); ++k)
434 (*pProbInfo)[k] += iUniformAdd;
436 #ifdef DEBUG
438 unsigned long iTotal = 0;
439 for(unsigned int k = 0; k < pProbInfo->size(); ++k)
440 iTotal += (*pProbInfo)[k];
441 DASHER_ASSERT(iTotal == iNorm);
443 #endif
446 std::vector<unsigned int> *CAlphabetManager::CAlphNode::GetProbInfo() {
447 if (!m_pProbInfo) {
448 m_pProbInfo = new std::vector<unsigned int>();
449 m_pMgr->GetProbs(m_pProbInfo, iContext);
451 // work out cumulative probs in place
452 for(unsigned int i = 1; i < m_pProbInfo->size(); i++) {
453 (*m_pProbInfo)[i] += (*m_pProbInfo)[i - 1];
456 return m_pProbInfo;
459 std::vector<unsigned int> *CAlphabetManager::CGroupNode::GetProbInfo() {
460 if (Parent() && Parent()->mgr() == mgr() && Parent()->offset()==offset()) {
461 return (static_cast<CAlphNode *>(Parent()))->GetProbInfo();
463 //nope, no usable parent. compute here...
464 return CAlphNode::GetProbInfo();
467 void CAlphabetManager::CGroupNode::PopulateChildren() {
468 m_pMgr->IterateChildGroups(this, m_pGroup, NULL);
471 int CAlphabetManager::CGroupNode::ExpectedNumChildren() {
472 return m_pGroup->iNumChildNodes;
475 CAlphabetManager::CGroupNode *CAlphabetManager::CreateGroupNode(CAlphNode *pParent, int iBkgCol, const SGroupInfo *pInfo) {
477 // When creating a group node...
478 // ...the offset is the same as the parent...
480 CGroupNode *pNewNode = new CGroupNode(pParent->offset(), m_mGroupLabels[pInfo], iBkgCol, this, pInfo);
482 //...as is the context!
483 pNewNode->iContext = m_pLanguageModel->CloneContext(pParent->iContext);
485 return pNewNode;
488 CDasherNode *CAlphabetManager::CAlphBase::RebuildGroup(CAlphNode *pParent, int iBkgCol, const SGroupInfo *pInfo) {
489 CGroupNode *pRet=m_pMgr->CreateGroupNode(pParent, iBkgCol, pInfo);
490 if (isInGroup(pInfo)) {
491 //created group node should contain this one
492 m_pMgr->IterateChildGroups(pRet,pInfo,this);
494 return pRet;
497 CDasherNode *CAlphabetManager::CGroupNode::RebuildGroup(CAlphNode *pParent, int iBkgCol, const SGroupInfo *pInfo) {
498 if (pInfo == m_pGroup) {
499 //offset doesn't increase for groups...
500 DASHER_ASSERT (offset() == pParent->offset());
501 return this;
503 return CAlphBase::RebuildGroup(pParent, iBkgCol, pInfo);
506 bool CAlphabetManager::CGroupNode::isInGroup(const SGroupInfo *pInfo) {
507 return pInfo->iStart <= m_pGroup->iStart && pInfo->iEnd >= m_pGroup->iEnd;
510 bool CAlphabetManager::CSymbolNode::isInGroup(const SGroupInfo *pInfo) {
511 return (pInfo->iStart <= iSymbol && pInfo->iEnd > iSymbol);
514 CDasherNode *CAlphabetManager::CreateSymbolNode(CAlphNode *pParent, symbol iSymbol) {
516 // TODO: Exceptions / error handling in general
518 // Uniquely, a paragraph symbol can be two characters
519 // (and we can't call numChars() on the symbol before we've constructed it!)
520 int iNewOffset = pParent->offset()+1;
521 if (m_pAlphabet->GetText(iSymbol)=="\r\n") iNewOffset++;
522 CSymbolNode *pAlphNode = new CSymbolNode(iNewOffset, m_vLabels[iSymbol], this, iSymbol);
523 // std::stringstream ssLabel;
525 // ssLabel << GetLabelText(iSymbol) << ": " << pNewNode;
527 // pDisplayInfo->strDisplayText = ssLabel.str();
529 pAlphNode->iContext = m_pLanguageModel->CloneContext(pParent->iContext);
530 m_pLanguageModel->EnterSymbol(pAlphNode->iContext, iSymbol); // TODO: Don't use symbols?
532 return pAlphNode;
535 CDasherNode *CAlphabetManager::CAlphBase::RebuildSymbol(CAlphNode *pParent, symbol iSymbol) {
536 return m_pMgr->CreateSymbolNode(pParent, iSymbol);
539 CDasherNode *CAlphabetManager::CSymbolNode::RebuildSymbol(CAlphNode *pParent, symbol iSymbol) {
540 if(iSymbol == this->iSymbol) {
541 DASHER_ASSERT(offset() == pParent->offset() + numChars());
542 return this;
544 return CAlphBase::RebuildSymbol(pParent, iSymbol);
547 void CAlphabetManager::IterateChildGroups(CAlphNode *pParent, const SGroupInfo *pParentGroup, CAlphBase *buildAround) {
548 std::vector<unsigned int> *pCProb(pParent->GetProbInfo());
549 DASHER_ASSERT((*pCProb)[0] == 0);
550 const int iMin(pParentGroup->iStart);
551 const int iMax(pParentGroup->iEnd);
552 unsigned int iRange(pParentGroup == m_pBaseGroup ? CDasherModel::NORMALIZATION : ((*pCProb)[iMax-1] - (*pCProb)[iMin-1]));
554 // TODO: Think through alphabet file formats etc. to make this class easier.
555 // TODO: Throw a warning if parent node already has children
557 // Create child nodes and add them
559 int i(iMin); //lowest index of child which we haven't yet added
560 const SGroupInfo *pCurrentNode(pParentGroup->pChild);
561 // The SGroupInfo structure has something like linked list behaviour
562 // Each SGroupInfo contains a pNext, a pointer to a sibling group info
563 while (i < iMax) {
564 CDasherNode *pNewChild;
565 bool bSymbol = !pCurrentNode //gone past last subgroup
566 || i < pCurrentNode->iStart; //not reached next subgroup
567 const int iStart=i, iEnd = (bSymbol) ? i+1 : pCurrentNode->iEnd;
568 //uint64 is platform-dependently #defined in DasherTypes.h as an (unsigned) 64-bit int ("__int64" or "long long int")
569 unsigned int iLbnd = (((*pCProb)[iStart-1] - (*pCProb)[iMin-1]) *
570 static_cast<uint64>(CDasherModel::NORMALIZATION)) /
571 iRange;
572 unsigned int iHbnd = (((*pCProb)[iEnd-1] - (*pCProb)[iMin-1]) *
573 static_cast<uint64>(CDasherModel::NORMALIZATION)) /
574 iRange;
575 if (bSymbol) {
576 pNewChild = (buildAround) ? buildAround->RebuildSymbol(pParent, i) : CreateSymbolNode(pParent, i);
577 i++; //make one symbol at a time - move onto next symbol in next iteration of (outer) loop
578 } else {
579 DASHER_ASSERT(pCurrentNode->iNumChildNodes > 1);
580 pNewChild= (buildAround) ? buildAround->RebuildGroup(pParent, pParent->getColour(), pCurrentNode) : CreateGroupNode(pParent, pParent->getColour(), pCurrentNode);
581 i = pCurrentNode->iEnd; //make one group at a time - so move past entire group...
582 pCurrentNode = pCurrentNode->pNext; //next sibling of _original_ pCurrentNode (above)
583 // (maybe not of pCurrentNode now, which might be a subgroup filling the original)
585 //created a new node - symbol or (group which will have >1 child).
586 pNewChild->Reparent(pParent, iLbnd, iHbnd);
589 if (pParentGroup == m_pBaseGroup) m_pNCManager->AddExtras(pParent);
590 pParent->SetFlag(NF_ALLCHILDREN, true);
593 CAlphabetManager::CAlphNode::~CAlphNode() {
594 delete m_pProbInfo;
595 m_pMgr->m_pLanguageModel->ReleaseContext(iContext);
598 const std::string &CAlphabetManager::CSymbolNode::outputText() const {
599 if (iSymbol == m_pMgr->m_pAlphabet->GetParagraphSymbol() && GetFlag(NF_SEEN)) {
600 //Regardless of this particular platform's definition of a newline,
601 // which is what we'd _output_, when reversing back over text
602 // which may have been produced elsewhere, we represent occurrences
603 // of _either_ \n or \r\n by a single paragraph symbol.
604 //If the alphabet has a paragraph symbol, \r is not a symbol on its own
605 // (and \n isn't a symbol other than paragraph). So look for a
606 // \r before the \n.
607 DASHER_ASSERT(m_pMgr->m_pInterface->GetContext(offset(),1)=="\n");
608 static std::string rn("\r\n"),n("\n"); //must store strings somewhere to return by reference!
609 return (m_pMgr->m_pInterface->GetContext(offset()-1,2)=="\r\n") ? rn : n;
611 return mgr()->m_pAlphabet->GetText(iSymbol);
614 string CAlphabetManager::CSymbolNode::trainText() {
615 return m_pMgr->m_pAlphabet->escape(outputText());
618 int CAlphabetManager::CSymbolNode::numChars() {
619 return (outputText()=="\r\n") ? 2 : 1;
622 void CAlphabetManager::CSymbolNode::Output() {
623 if (m_pMgr->GetBoolParameter(BP_LM_ADAPTIVE)) {
624 if (m_pMgr->m_pLastOutput != Parent()) {
625 //Context changed. Flush to disk the old context + text written in it...
626 m_pMgr->WriteTrainFileFull(m_pMgr->m_pInterface);
628 ///Now extract the context in which this node was written.
629 /// Since this node is being output now, its parent must already have been,
630 /// so the simplest thing is to read from the edit buffer!
631 int iStart = max(0, offset() - m_pMgr->m_pLanguageModel->GetContextLength());
632 m_pMgr->strTrainfileContext = m_pMgr->m_pInterface->GetContext(iStart, offset()-iStart);
633 if (m_pMgr->strTrainfileContext=="") //Even the empty context (as for a new document)
634 m_pMgr->strTrainfileContext = m_pMgr->m_pAlphabet->GetDefaultContext(); //is a new ctx!
636 //Now handle outputting of this node
637 m_pMgr->m_pLastOutput = this;
638 string tr(trainText());
639 m_pMgr->strTrainfileBuffer += tr;
640 //an actual occurrence of the escape character, must be doubled (like \\)
641 if (tr == m_pMgr->m_pAlphabet->GetContextEscapeChar()) m_pMgr->strTrainfileBuffer+=tr;
643 //std::cout << this << " " << Parent() << ": Output at offset " << m_iOffset << " *" << m_pMgr->m_pAlphabet->GetText(t) << "* " << std::endl;
645 m_pMgr->m_pInterface->editOutput(outputText(), this);
648 SymbolProb CAlphabetManager::CSymbolNode::GetSymbolProb() const {
649 //TODO probability here not right - Range() is relative to parent, not prev symbol
650 return Dasher::SymbolProb(iSymbol, outputText(), Range() / (double)CDasherModel::NORMALIZATION);
653 void CAlphabetManager::CSymbolNode::Undo() {
654 DASHER_ASSERT(GetFlag(NF_SEEN));
655 if (m_pMgr->GetBoolParameter(BP_LM_ADAPTIVE)) {
656 if (m_pMgr->m_pLastOutput == this) {
657 //Erase from training buffer, and move lastOutput backwards,
658 // iff this node was actually written (i.e. not rebuilt _from_ context!)
659 std::string &buf(m_pMgr->strTrainfileBuffer);
660 std::string tr(trainText());
661 if (tr.length()<=buf.length()
662 && buf.substr(buf.length()-tr.length(),tr.length())==tr) {
663 buf=buf.substr(0,buf.length()-tr.length());
664 m_pMgr->m_pLastOutput = Parent();
667 } else CAlphBase::Undo();
668 m_pMgr->m_pInterface->editDelete(outputText(), this);
671 CDasherNode *CAlphabetManager::CGroupNode::RebuildParent() {
673 if (Parent()) return Parent();
675 if (m_pGroup == m_pMgr->m_pBaseGroup) {
676 //top level root node.
677 //if (offset()>0), there was _something_ before us, like
678 // a control node; but we no longer know what!
679 return NULL;
682 //All other CGroupNode's have a container i.e. the parent group
683 return CAlphBase::RebuildParent();
686 CDasherNode *CAlphabetManager::CAlphBase::RebuildParent() {
687 if (!Parent()) {
688 //Parent's offset usually one less than this, but can be two for the paragraph symbol.
689 int iNewOffset = offset()-numChars();
691 CAlphNode *pNewNode = m_pMgr->GetRoot(NULL, iNewOffset!=-1, iNewOffset+1);
693 RebuildForwardsFromAncestor(pNewNode);
695 if (int flags=(GetFlag(NF_SEEN) ? NF_SEEN : 0) | (GetFlag(NF_COMMITTED) ? NF_COMMITTED : 0)) {
696 for (CDasherNode *pNode=this; (pNode=pNode->Parent()); pNode->SetFlag(flags, true));
699 return Parent();
702 void CAlphabetManager::CAlphBase::RebuildForwardsFromAncestor(CAlphNode *pNewNode) {
703 //now fill in the new node - recursively - until it reaches us
704 m_pMgr->IterateChildGroups(pNewNode, m_pMgr->m_pBaseGroup, this);
707 // TODO: Shouldn't there be an option whether or not to learn as we write?
708 // For want of a better solution, game mode exemption explicit in this function
709 void CAlphabetManager::CSymbolNode::SetFlag(int iFlag, bool bValue) {
710 if ((iFlag & NF_COMMITTED) && bValue && !GetFlag(NF_COMMITTED | NF_GAME)
711 && m_pMgr->GetBoolParameter(BP_LM_ADAPTIVE)) {
712 //try to commit...if we have parent (else rebuilding (backwards) => don't)
713 if (Parent()) {
714 if (Parent()->mgr() != mgr()) return; //do not set flag
715 CLanguageModel *pLM(m_pMgr->m_pLanguageModel);
716 // (Note: for first symbol after startup: parent is (root) group node, which'll have the alphabet default context)
717 CLanguageModel::Context ctx = pLM->CloneContext(static_cast<CAlphabetManager::CAlphNode *>(Parent())->iContext);
718 pLM->LearnSymbol(ctx, iSymbol);
719 //could: pLM->ReleaseContext(ctx);
720 //however, seems better to replace this node's context (i.e. which it uses to create its own children)
721 // with the new (learned) context: the former was obtained by EnterSymbol rather than LearnSymbol, so
722 // will be different iff this node was the first time its symbol was entered into its parent context.
723 // (Yes, this node's context is unlikely to be used again, but not impossible...)
724 pLM->ReleaseContext(iContext);
725 iContext = ctx;
728 CDasherNode::SetFlag(iFlag, bValue);