Move tests
[lttoolbox.git] / lttoolbox / compiler.h
blob05fedb610c7ff84f6e7dfc625b35883b308e245d
1 /*
2 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 * 02111-1307, USA.
19 #ifndef _MYCOMPILER_
20 #define _MYCOMPILER_
22 #include <lttoolbox/alphabet.h>
23 #include <lttoolbox/regexp_compiler.h>
24 #include <lttoolbox/entry_token.h>
25 #include <lttoolbox/ltstr.h>
26 #include <lttoolbox/transducer.h>
28 #include <map>
29 #include <string>
30 #include <set>
31 #include <libxml/xmlreader.h>
33 using namespace std;
35 /**
36 * A compiler of dictionaries to letter transducers
38 class Compiler
40 private:
41 /**
42 * The libxml2 XML reader
44 xmlTextReaderPtr reader;
46 /**
47 * The paradigm being compiled
49 wstring current_paradigm;
51 /**
52 * The dictionary section being compiled
54 wstring current_section;
56 /**
57 * The direction of the compilation, 'lr' (left-to-right) or 'rl'
58 * (right-to-left)
60 wstring direction;
62 /**
63 * List of characters to be considered alphabetic
65 wstring letters;
67 /**
68 * Identifier of all the symbols during the compilation
70 Alphabet alphabet;
72 /**
73 * List of named transducers-paradigms
75 map<wstring, Transducer, Ltstr> paradigms;
77 /**
78 * List of named dictionary sections
80 map<wstring, Transducer, Ltstr> sections;
82 /**
83 * List of named prefix copies of a paradigm
85 map<wstring, map<wstring, int, Ltstr>, Ltstr> prefix_paradigms;
87 /**
88 * List of named suffix copies of a paradigm
90 map<wstring, map<wstring, int, Ltstr>, Ltstr> suffix_paradigms;
92 /**
93 * List of named endings of a suffix copy of a paradigm
95 map<wstring, map<wstring, int, Ltstr>, Ltstr> postsuffix_paradigms;
97 /**
98 * Mapping of aliases of characters specified in ACX files
100 map<int, set<int> > acx_map;
103 * Original char being mapped
105 int acx_current_char;
108 static string range(char const a, char const b);
109 string readAlphabet();
113 * Method to parse an XML Node
115 void procNode();
118 * Method to parse an XML Node in ACX files
120 void procNodeACX();
124 * Parse the &lt;alphabet&gt; element
126 void procAlphabet();
129 * Parse the &lt;sdef&gt; element
131 void procSDef();
134 * Parse the &lt;pardef&gt; element
136 void procParDef();
139 * Parse the &lt;e&gt; element
141 void procEntry();
144 * Parse the &lt;re&gt; element
145 * @return a list of tokens from the dictionary's entry
147 EntryToken procRegexp();
150 * Parse the &lt;section&gt; element
152 void procSection();
155 * Gets the value of an attribute, given its name, in the current context
156 * @param name the name of the attribute
157 * @return the value of the attribute
159 wstring attrib(wstring const &name);
162 * Construct symbol pairs by aligning the left side of both parts and
163 * insert them into a transducer
164 * @param lp left part of the transduction
165 * @param rp right part of the transduction
166 * @param state the state from which to insert the new transduction
167 * @param t the transducer
168 * @return the last state of the inserted transduction
170 int matchTransduction(list<int> const &lp, list<int> const &rp,
171 int state, Transducer &t);
173 * Parse the &lt;p&gt; element
174 * @return a list of tokens from the dictionary's entry
176 EntryToken procTransduction();
179 * Parse the &lt;i&gt; element
180 * @return a list of tokens from the dictionary's entry
182 EntryToken procIdentity();
185 * Parse the &lt;par&gt; element
186 * @return a list of tokens from the dictionary's entry
188 EntryToken procPar();
191 * Insert a list of tokens into the paradigm / section being processed
192 * @param elements the list
194 void insertEntryTokens(vector<EntryToken> const &elements);
197 * Skip all document #text nodes before "elem"
198 * @param name the name of the node
199 * @param elem the name of the expected node
201 void skip(wstring &name, wstring const &elem);
204 * Skip all blank #text nodes before "name"
205 * @param name the name of the node
207 void skipBlanks(wstring &name);
210 void readString(list<int> &result, wstring const &name);
213 * Force an element to be empty, and check for it
214 * @param name the element
216 void requireEmptyError(wstring const &name);
219 * Force an attribute to be specified, and check for it
220 * @param value the value of the attribute
221 * @param attrname the name of the attribute
222 * @param elemname the parent of the attribute
224 void requireAttribute(wstring const &value, wstring const &attrname,
225 wstring const &elemname);
228 * True if all the elements in the current node are blanks
229 * @return true if all are blanks
231 bool allBlanks();
233 public:
236 * Constants to represent the element and the attributes of
237 * dictionaries
239 static wstring const COMPILER_DICTIONARY_ELEM;
240 static wstring const COMPILER_ALPHABET_ELEM;
241 static wstring const COMPILER_SDEFS_ELEM;
242 static wstring const COMPILER_SDEF_ELEM;
243 static wstring const COMPILER_N_ATTR;
244 static wstring const COMPILER_PARDEFS_ELEM;
245 static wstring const COMPILER_PARDEF_ELEM;
246 static wstring const COMPILER_PAR_ELEM;
247 static wstring const COMPILER_ENTRY_ELEM;
248 static wstring const COMPILER_RESTRICTION_ATTR;
249 static wstring const COMPILER_RESTRICTION_LR_VAL;
250 static wstring const COMPILER_RESTRICTION_RL_VAL;
251 static wstring const COMPILER_PAIR_ELEM;
252 static wstring const COMPILER_LEFT_ELEM;
253 static wstring const COMPILER_RIGHT_ELEM;
254 static wstring const COMPILER_S_ELEM;
255 static wstring const COMPILER_REGEXP_ELEM;
256 static wstring const COMPILER_SECTION_ELEM;
257 static wstring const COMPILER_ID_ATTR;
258 static wstring const COMPILER_TYPE_ATTR;
259 static wstring const COMPILER_IDENTITY_ELEM;
260 static wstring const COMPILER_JOIN_ELEM;
261 static wstring const COMPILER_BLANK_ELEM;
262 static wstring const COMPILER_POSTGENERATOR_ELEM;
263 static wstring const COMPILER_GROUP_ELEM;
264 static wstring const COMPILER_LEMMA_ATTR;
265 static wstring const COMPILER_IGNORE_ATTR;
266 static wstring const COMPILER_IGNORE_YES_VAL;
270 * Constructor
272 Compiler();
275 * Destructor
277 ~Compiler();
280 * Compile dictionary to letter transducers
282 void parse(string const &fichero, wstring const &dir);
285 * Read ACX file
287 void parseACX(string const &fichero, wstring const &dir);
291 * Write the result of compilation
292 * @param fd the stream to write the result to
294 void write(FILE *fd);
298 #endif