initial example sentences;added gUI::html_view;doxygen comments
[aoi.git] / src / parsers.cxx
blob60074c411b382258553e707a60425bca8a72a963
1 /*
2 Copyright 2013 Karel Matas
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 #include "parsers.hxx"
18 #include "utils.hxx"
19 #include "logger.hxx"
21 #include <cstring>
23 namespace parsers {
25 using utils::to_string;
27 //////////////////////////////////////////////////////////////////////////
28 // STATIC
30 vector<string> BaseParser::get_elements ( xml_node<> *parent, const char *element, bool unreference )
32 vector<string> out;
33 auto *elt = parent->first_node(element);
34 while ( elt != 0 ){
35 string s = elt->value();
36 if ( unreference ){
37 size_t start = (s.front()=='&'); // erase & from the beggining of elt
38 size_t end = s.size() - start - (s.back()==';'); // erase ; from the end of elt
39 s = string(s,start,end);
41 if ( !s.empty() )
42 out.push_back( s );
43 elt = elt->next_sibling(element);
45 return out;
48 //////////////////////////////////////////////////////////////////////////
50 BaseParser::BaseParser ( const char *filename )
52 file_.open( filename );
54 if ( !file_.is_open() )
55 throw utils::CantOpenFile(filename);
57 // size of file
58 file_.seekg( 0, file_.end );
59 int size = file_.tellg();
60 file_.seekg( 0, file_.beg );
62 if ( size == 0 ){
63 std::stringstream ss;
64 ss << "Source file has zero length. (" << filename << ")";
65 throw utils::ParsingError( ss.str() );
68 // read file
69 buffer_ = new char[size+1];
70 file_.read( buffer_, size );
71 int read = file_.gcount();
73 if ( read != size )
74 throw utils::ParsingError("Only " + std::to_string(read) + " bytes was read.");
76 // DOCTYPE node contains ENTITY definitions
77 doc_.parse<rapidxml::parse_doctype_node>( buffer_ );
80 BaseParser::~BaseParser ()
82 file_.close();
83 delete[] buffer_;
87 map<string,string> BaseParser::get_entities ()
89 map<string,string> m;
90 auto *node = doc_.first_node();
91 while ( node ){
92 if ( node->type() == rapidxml::node_doctype ){
93 char *pos = strstr( node->value(), "<!ENTITY " );
94 while ( pos ){
95 char buff[1024];
96 size_t i = 0;
97 pos += 9; // strlen("<!ENTITY ")
98 // entity name
99 while ( *pos != ' ' ) buff[i++] = *pos++;
100 buff[i] = '\0';
101 string abbr(buff);
102 // skip spaces
103 while ( *pos == ' ' ) pos++;
104 // entity description
105 i = 0;
106 while ( *pos != '>' ) buff[i++] = *pos++;
107 buff[i] = '\0';
108 m[abbr] = string(buff);
109 pos = strstr( pos, "<!ENTITY" );
112 node = node->next_sibling();
114 return m;
117 //////////////////////////////////////////////////////////////////////////
118 // JmdictParser
120 string JmdictParser::get_version ()
122 int n = 0;
123 string s = "";
124 file_.seekg( 0, file_.beg );
125 while ( file_.good() && n < 1000 ){
126 file_ >> s;
127 if ( s == "Rev" ){
128 string ver;
129 file_ >> ver;
130 return s + " " + ver;
132 n++;
134 return "NONE";
137 DicWord JmdictParser::get_entry ()
140 if ( !entry_ ){
141 return DicWord();
144 int did = -1;
145 vector<ElementKanji> vk_ele;
146 vector<ElementReading> vr_ele;
147 vector<ElementSense> vs_ele;
149 // dictionary id
150 did = std::stoi( entry_->first_node("ent_seq")->value() );
152 // KANJI (element k_ele)
153 // Elements of interest (name, count, description):
154 // keb 1 kanji
155 // ke_inf 0+ informations about keb
156 // ke_pri 0+ if not empty: keb can be considered "common" or "frequent"
157 auto k_ele = entry_->first_node("k_ele");
158 while ( k_ele != 0 ){
159 auto keb = k_ele->first_node("keb");
160 auto ke_inf = get_elements( k_ele, "ke_inf", true );
161 bool freq = ( k_ele->first_node("ke_pri") != 0 );
162 vk_ele.push_back( { n_kanji_++, keb->value(), ke_inf, freq } );
163 k_ele = k_ele->next_sibling("k_ele");
166 // READING (element r_ele)
167 // Elements of interest (name, count, description):
168 // reb 1 reading
169 // re_nokanji 0-1 if 1: reb cannot be regarded as a true reading of the kanji
170 // re_restr 0+ reb only applies for this keb
171 // re_inf 0+ informations about reb
172 // re_pri 0+ if not empty: reb can be considered "common" or "frequent"
173 auto r_ele = entry_->first_node("r_ele");
174 while ( r_ele != 0 ){
175 auto reb = r_ele->first_node("reb");
176 auto re_restr = get_elements( r_ele, "re_restr" );
177 auto re_inf = get_elements( r_ele, "re_inf", true );
178 bool re_nokanji = ( r_ele->first_node("re_nokanji") != 0 );
179 bool freq = ( r_ele->first_node("re_pri") != 0 );
180 vr_ele.push_back( {n_reading_++, reb->value(), re_nokanji, re_restr, re_inf, freq } );
181 r_ele = r_ele->next_sibling("r_ele");
184 // SENSE (element r_ele)
185 // Elements of interest (name, count, description):
186 // gloss 0+
187 // stagk 0+
188 // stagr 0+
189 // pos 0+
190 // xref 0+
191 // ant 0+
192 // field 0+
193 // misc 0+
194 // dial 0+
195 // s_inf 0+
196 auto *sense = entry_->first_node("sense");
197 while ( sense != 0 ){
198 auto gloss = get_elements( sense, "gloss" );
199 auto stagk = get_elements( sense, "stagk" );
200 auto stagr = get_elements( sense, "stagr" );
201 auto pos = get_elements( sense, "pos", true );
202 auto xref = get_elements( sense, "xref" );
203 auto ant = get_elements( sense, "ant" );
204 auto field = get_elements( sense, "field", true );
205 auto misc = get_elements( sense, "misc", true );
206 auto dial = get_elements( sense, "dial", true );
207 auto s_inf = get_elements( sense, "s_inf" );
208 vs_ele.push_back( {n_sense_++,gloss,stagk,stagr,pos,xref,ant,field,misc,dial,s_inf} );
209 n_gloss_ += gloss.size();
210 sense = sense->next_sibling("sense");
212 entry_ = entry_->next_sibling("entry");
213 n_entries_++;
215 return {did,vk_ele,vr_ele,vs_ele};
218 //////////////////////////////////////////////////////////////////////////
219 // KanjidicParser
221 string KanjidicParser::get_version ()
223 auto header = doc_.first_node("kanjidic2")->first_node("header");
224 const char *version = header->first_node("database_version")->value();
225 const char *date = header->first_node("date_of_creation")->value();
226 std::stringstream ss;
227 ss << version << " (" << date << ")";
228 return ss.str();
231 Kanji KanjidicParser::get_entry ()
233 Kanji k;
235 if ( !entry_ ){
236 return k;
239 k.kanji ( entry_->first_node("literal")->value() );
241 auto *cp_value = entry_->first_node("codepoint")->first_node("cp_value");
242 while ( cp_value ){
243 auto *type = cp_value->first_attribute("cp_type")->value();
244 if ( !strcmp( type, "ucs" ) )
245 k.ucs ( cp_value->value() );
246 else if ( !strncmp( type, "jis", 3 ) )
247 k.flags( type );
248 cp_value = cp_value->next_sibling();
251 auto *rad_value = entry_->first_node("radical")->first_node("rad_value");
252 while ( rad_value ){
253 auto *type = rad_value->first_attribute("rad_type")->value();
254 if ( !strcmp( type, "classical" ) )
255 k.rad_classic( rad_value->value() );
256 else if ( !strcmp( type, "nelson_c" ) )
257 k.rad_nelson( rad_value->value() );
258 rad_value = rad_value->next_sibling();
262 auto *query_code = entry_->first_node("query_code");
263 if ( query_code ) {
264 auto *q_code = query_code->first_node("q_code");
265 while ( q_code ){
266 auto *type = q_code->first_attribute("qc_type")->value();
267 auto *misclass = q_code->first_attribute("skip_misclass");
268 if ( !strcmp( type, "skip" ) ){
269 vector<int> v = utils::split_string_int(q_code->value(),"-");
270 if ( v.size() != 3 ){
271 char msg[512];
272 snprintf( msg, 512, "Kanjiparser: Wrong SKIP %s. (U+%s)",
273 q_code->value(), k.ucs().c_str());
274 throw utils::ParsingError(msg);
276 k.skip( v, misclass ? misclass->value():0 );
278 q_code = q_code->next_sibling();
282 auto *misc = entry_->first_node("misc");
283 auto *freq = misc->first_node("freq");
284 auto *jlpt = misc->first_node("jlpt");
285 auto *grade = misc->first_node("grade");
286 k.strokes( misc->first_node("stroke_count")->value() );
287 if ( freq ) k.freq( freq->value() );
288 if ( jlpt ) k.jlpt( atoi(jlpt->value()) );
289 if ( grade ) k.grade( atoi(grade->value()) );
291 auto *reading_meaning = entry_->first_node("reading_meaning");
292 if ( reading_meaning ){
293 auto *rmgroup = reading_meaning->first_node("rmgroup");
294 if ( rmgroup ){
295 auto *reading = rmgroup->first_node("reading");
296 while( reading ){
297 auto *type = reading->first_attribute("r_type");
298 if ( type ){
299 if ( !strcmp( type->value(), "ja_on") )
300 k.onyomi( reading->value() );
301 else if ( !strcmp( type->value(), "ja_kun") )
302 k.kunyomi( reading->value() );
304 reading = reading->next_sibling();
306 auto *meaning = rmgroup->first_node("meaning");
307 while ( meaning ){
308 auto *fattr = meaning->first_attribute("m_lang");
309 if ( !fattr ) // english meaning has no m_lang attribute
310 k.meaning( meaning->value() );
311 meaning = meaning->next_sibling();
314 k.nanori( get_elements( reading_meaning, "nanori") );
318 entry_ = entry_->next_sibling("character");
319 n_entries_++;
321 return k;
325 } // namespace parsers