initial example sentences; added gUI::html_view; doxygen comments
[aoi.git] / src / parsers.hxx
blob465b589939d6843ca5a10233da1c1b5c7cecc55a
1 /*
2 Copyright 2013 Karel Matas
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 #ifndef _PARSERS_HXX
18 #define _PARSERS_HXX
20 /*! \file parsers.hxx
21 * Parsers for JMdict and kanjidic2.
25 #include <vector>
26 #include <string>
27 #include <map>
28 #include <fstream>
29 #include "3rdparty/rapidxml.hpp"
30 #include "utils.hxx"
31 #include "datatypes.hxx"
33 using aoi::SEPARATOR_SQL;
34 using aoi::ElementKanji;
35 using aoi::ElementReading;
36 using aoi::ElementSense;
37 using aoi::DicWord;
38 using aoi::Kanji;
39 using std::vector;
40 using std::string;
41 using std::map;
42 using rapidxml::xml_node;
43 using rapidxml::xml_document;
45 namespace parsers {
48 /*!
49 * Base XML parser class. Loads XML file and build XML tree. Must be subclassed.
50 * \todo Parser should escape aoi::SEPARATOR_SQL character right after loading
51 * file into memory (i.e. before parsing).
53 class BaseParser
55 private:
56 char *buffer_;
58 protected:
59 xml_document<> doc_;
60 std::ifstream file_;
62 /*!
63 * Get values of all the elements of the type <i>element</i> in <i>node</i>.
64 * For example XML code:
65 \verbatim
66 <person>
67 <name>John Doe</name>
68 <phone>1232456789</phone>
69 <phone>987654321</phone>
70 </person>
71 \endverbatim
72 * get_elements( node, phone ) returns "{ "123456789", "987654321" }"
73 * \param parent parent node
74 * \param element what element to get
75 * \param unreference if true: remove '&' and ';' from the string borders
76 * \return values of all the elements <i>element</i> or empty vector
78 static vector<string> get_elements ( xml_node<> *parent, const char *element,
79 bool unreference=false );
81 public:
82 BaseParser ( const char *filename );
83 virtual ~BaseParser ();
85 /*!
86 * Scans first node of the document for the entities (<!ENTITY).
87 * \return map in format entity_name:entity_description
89 map<string,string> get_entities ();
93 //! Parser for JMDict_e XML file.
94 class JmdictParser : public BaseParser
96 private:
97 int n_entries_ = 0;
98 int n_reading_ = 0;
99 int n_kanji_ = 0;
100 int n_gloss_ = 0;
101 int n_sense_ = 0;
102 xml_node<> *entry_ = nullptr;
104 public:
105 JmdictParser( const char *filename ) : BaseParser(filename)
106 { entry_ = doc_.first_node("JMdict")->first_node("entry"); };
107 ~JmdictParser() {};
110 * Gets one entry from JMdict. Caller should call this function until
111 * Dicword.did() != -1
112 \verbatim
113 JmdictParser p("file.xml");
114 DicWord w = p.get_entry();
115 while ( w.did() != -1 ){
116 printf("Word ID: %d\n", w.did());
117 w = p.get_entry();
119 \endverbatim
120 * \return DicWord on succes, empty DicWord (did()=-1) otherwise
122 DicWord get_entry ();
124 //! Returns JMDict version.
125 string get_version ();
130 //! Parser for kanjidic2 XML file.
131 class KanjidicParser : public BaseParser
133 private:
134 int n_entries_ = 0;
135 xml_node<> *entry_ = nullptr;
137 public:
138 KanjidicParser( const char *filename ): BaseParser(filename)
139 { entry_ = doc_.first_node("kanjidic2")->first_node("character"); };
140 ~KanjidicParser(){};
143 * Gets one entry from kanjidic2. Caller should call this function until
144 * Kanji.kanji() != ""
145 \verbatim
146 KanjidicParser p("file.xml");
147 Kanji k = p.get_entry();
148 while ( !k.kanji().empty() ){
149 printf("Kanji: %s\n", k.kanji().c_str());
150 k = p.get_entry();
152 \endverbatim
153 * \return Kanji on success, empty Kanji (kanji()=="") otherwise
155 Kanji get_entry ();
157 //! Returns kanjidic2 version in format: "version (date)"
158 string get_version ();
162 } // namespace parsers
163 #endif // _PARSERS_HXX