2 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 #include <lttoolbox/alphabet.h>
23 #include <lttoolbox/regexp_compiler.h>
24 #include <lttoolbox/entry_token.h>
25 #include <lttoolbox/ltstr.h>
26 #include <lttoolbox/transducer.h>
31 #include <libxml/xmlreader.h>
36 * A compiler of dictionaries to letter transducers
42 * The libxml2's XML reader
44 xmlTextReaderPtr reader
;
47 * The paradigm being compiled
49 wstring current_paradigm
;
52 * The dictionary section being compiled
54 wstring current_section
;
57 * The direction of the compilation, 'lr' (left-to-right) or 'rl'
63 * List of characters to be considered alphabetic
68 * Identifier of all the symbols during the compilation
73 * List of named transducers-paradigms
75 map
<wstring
, Transducer
, Ltstr
> paradigms
;
78 * List of named dictionary sections
80 map
<wstring
, Transducer
, Ltstr
> sections
;
83 * List of named prefix copies of a paradigm
85 map
<wstring
, map
<wstring
, int, Ltstr
>, Ltstr
> prefix_paradigms
;
88 * List of named suffix copies of a paradigm
90 map
<wstring
, map
<wstring
, int, Ltstr
>, Ltstr
> suffix_paradigms
;
93 * List of named endings of a suffix copy of a paradigm
95 map
<wstring
, map
<wstring
, int, Ltstr
>, Ltstr
> postsuffix_paradigms
;
98 * Mapping of aliases of characters specified in ACX files
100 map
<int, set
<int> > acx_map
;
103 * Original char being mapped
105 int acx_current_char
;
// Static helper: takes two characters and returns a string.
// NOTE(review): presumably yields the run of characters from a to b; whether
// the bounds are inclusive is not visible here — confirm in the implementation.
108 static string range(char const a, char const b);
// Takes no arguments and returns a string.
// NOTE(review): presumably returns the characters listed in the <alphabet>
// element being read — confirm against the implementation.
109 string readAlphabet();
113 * Method to parse an XML Node
118 * Method to parse an XML Node in ACX files
124 * Parse the <alphabet> element
129 * Parse the <sdef> element
134 * Parse the <pardef> element
139 * Parse the <e> element
144 * Parse the <re> element
145 * @return a list of tokens from the dictionary's entry
147 EntryToken
procRegexp();
150 * Parse the <section> element
155 * Gets the value of an attribute, given its name, in the current context
156 * @param name the name of the attribute
157 * @return the value of the attribute
159 wstring
attrib(wstring
const &name
);
162 * Construct symbol pairs by aligning the left side of both parts and
163 * inserting them into a transducer
164 * @param lp left part of the transduction
165 * @param rp right part of the transduction
166 * @param state the state from which to insert the new transduction
167 * @param t the transducer
168 * @return the last state of the inserted transduction
170 int matchTransduction(list
<int> const &lp
, list
<int> const &rp
,
171 int state
, Transducer
&t
);
173 * Parse the <p> element
174 * @return a list of tokens from the dictionary's entry
176 EntryToken
procTransduction();
179 * Parse the <i> element
180 * @return a list of tokens from the dictionary's entry
182 EntryToken
procIdentity();
185 * Parse the <par> element
186 * @return a list of tokens from the dictionary's entry
188 EntryToken
procPar();
191 * Insert a list of tokens into the paradigm / section being processed
192 * @param elements the list
194 void insertEntryTokens(vector
<EntryToken
> const &elements
);
197 * Skip all document #text nodes before "elem"
198 * @param name the name of the node
199 * @param elem the name of the expected node
201 void skip(wstring
&name
, wstring
const &elem
);
204 * Skip all blank #text nodes before "name"
205 * @param name the name of the node
207 void skipBlanks(wstring
&name
);
// Returns nothing; takes a mutable list<int> (result) and a read-only
// element name.
// NOTE(review): presumably appends to `result` the symbol codes read for the
// element called `name` — confirm against the implementation.
210 void readString(list
<int> &result
, wstring
const &name
);
213 * Force an element to be empty, and check for it
214 * @param name the element
216 void requireEmptyError(wstring
const &name
);
219 * Force an attribute to be specified, and check for it
220 * @param value the value of the attribute
221 * @param attrname the name of the attribute
222 * @param elemname the parent of the attribute
224 void requireAttribute(wstring
const &value
, wstring
const &attrname
,
225 wstring
const &elemname
);
228 * True if all the elements in the current node are blanks
229 * @return true if all are blanks
236 * Constants to represent the element and the attributes of
239 static wstring
const COMPILER_DICTIONARY_ELEM
;
240 static wstring
const COMPILER_ALPHABET_ELEM
;
241 static wstring
const COMPILER_SDEFS_ELEM
;
242 static wstring
const COMPILER_SDEF_ELEM
;
243 static wstring
const COMPILER_N_ATTR
;
244 static wstring
const COMPILER_PARDEFS_ELEM
;
245 static wstring
const COMPILER_PARDEF_ELEM
;
246 static wstring
const COMPILER_PAR_ELEM
;
247 static wstring
const COMPILER_ENTRY_ELEM
;
248 static wstring
const COMPILER_RESTRICTION_ATTR
;
249 static wstring
const COMPILER_RESTRICTION_LR_VAL
;
250 static wstring
const COMPILER_RESTRICTION_RL_VAL
;
251 static wstring
const COMPILER_PAIR_ELEM
;
252 static wstring
const COMPILER_LEFT_ELEM
;
253 static wstring
const COMPILER_RIGHT_ELEM
;
254 static wstring
const COMPILER_S_ELEM
;
255 static wstring
const COMPILER_REGEXP_ELEM
;
256 static wstring
const COMPILER_SECTION_ELEM
;
257 static wstring
const COMPILER_ID_ATTR
;
258 static wstring
const COMPILER_TYPE_ATTR
;
259 static wstring
const COMPILER_IDENTITY_ELEM
;
260 static wstring
const COMPILER_JOIN_ELEM
;
261 static wstring
const COMPILER_BLANK_ELEM
;
262 static wstring
const COMPILER_POSTGENERATOR_ELEM
;
263 static wstring
const COMPILER_GROUP_ELEM
;
264 static wstring
const COMPILER_LEMMA_ATTR
;
265 static wstring
const COMPILER_IGNORE_ATTR
;
266 static wstring
const COMPILER_IGNORE_YES_VAL
;
280 * Compile dictionary to letter transducers
282 void parse(string
const &fichero
, wstring
const &dir
);
// Counterpart of parse() for ACX files; declared with the same
// (file, direction) parameter list.
// NOTE(review): `fichero` is presumably the path of the ACX file and `dir` the
// compilation direction ('lr' or 'rl'); the character aliases read are
// presumably stored in acx_map — confirm against the implementation.
287 void parseACX(string
const &fichero
, wstring
const &dir
);
291 * Write the result of compilation
292 * @param fd the stream to which the result is written
294 void write(FILE *fd
);