1 /** @file termgenerator.h
2 * @brief parse free text and generate terms
4 /* Copyright (C) 2007,2009,2011,2012,2013,2014 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_TERMGENERATOR_H
22 #define XAPIAN_INCLUDED_TERMGENERATOR_H
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error "Never use <xapian/termgenerator.h> directly; include <xapian.h> instead."
28 #include <xapian/intrusive_ptr.h>
29 #include <xapian/types.h>
30 #include <xapian/unicode.h>
31 #include <xapian/visibility.h>
// Forward declaration: this header only refers to WritableDatabase by
// reference (see TermGenerator::set_database()).
class WritableDatabase;
/** Parses a piece of text and generates terms.
 *
 * This module takes a piece of text and parses it to produce words which are
 * then used to generate suitable terms for indexing.  The terms generated are
 * suitable for use with Query objects produced by the QueryParser class.
 */
48 class XAPIAN_VISIBILITY_DEFAULT TermGenerator
{
50 /// @private @internal Class representing the TermGenerator internals.
52 /// @private @internal Reference counted internals.
53 Xapian::Internal::intrusive_ptr
<Internal
> internal
;
56 TermGenerator(const TermGenerator
& o
);
59 TermGenerator
& operator=(const TermGenerator
& o
);
61 /// Default constructor.
67 /// Set the Xapian::Stem object to be used for generating stemmed terms.
68 void set_stemmer(const Xapian::Stem
& stemmer
);
70 /** Set the Xapian::Stopper object to be used for identifying stopwords.
72 * Stemmed forms of stopwords aren't indexed, but unstemmed forms still
73 * are so that searches for phrases including stop words still work.
75 * @param stop The Stopper object to set (default NULL, which means no
78 void set_stopper(const Xapian::Stopper
*stop
= NULL
);
80 /// Set the current document.
81 void set_document(const Xapian::Document
& doc
);
83 /// Get the current document.
84 const Xapian::Document
& get_document() const;
86 /// Set the database to index spelling data to.
87 void set_database(const Xapian::WritableDatabase
&db
);
89 /// For backward compatibility with Xapian 1.2
92 /// Flags to OR together and pass to TermGenerator::set_flags().
94 /// Index data required for spelling correction.
95 FLAG_SPELLING
= 128, // Value matches QueryParser flag.
97 /** Enable generation of n-grams from CJK text.
99 * With this enabled, spans of CJK characters are split into unigrams
100 * and bigrams, with the unigrams carrying positional information.
101 * Non-CJK characters are split into words as normal.
103 * The corresponding option needs to be passed to QueryParser.
105 * Flag added in Xapian 1.3.4 and 1.2.22, but this mode can be
106 * enabled in 1.2.8 and later by setting environment variable
109 FLAG_CJK_NGRAM
= 2048 // Value matches QueryParser flag.
/// Stemming strategies, for use with set_stemming_strategy().
typedef enum {
    /// Don't perform any stemming.
    STEM_NONE,
    /// Generate both stemmed (with a "Z" prefix) and unstemmed terms.
    STEM_SOME,
    /// Generate only stemmed terms (but without a "Z" prefix).
    STEM_ALL,
    /// Generate only stemmed terms (with a "Z" prefix).
    STEM_ALL_Z
} stem_strategy;
117 * The new value of flags is: (flags & mask) ^ toggle
119 * To just set the flags, pass the new flags in toggle and the
120 * default value for mask.
122 * @param toggle Flags to XOR.
123 * @param mask Flags to AND with first.
125 * @return The old flags setting.
127 flags
set_flags(flags toggle
, flags mask
= flags(0));
129 /** Set the stemming strategy.
131 * This method controls how the stemming algorithm is applied. It was
132 * new in Xapian 1.3.1.
134 * @param strategy The strategy to use - possible values are:
135 * - STEM_NONE: Don't perform any stemming - only unstemmed terms
137 * - STEM_SOME: Generate both stemmed (with a "Z" prefix) and unstemmed
138 * terms. This is the default strategy.
139 * - STEM_ALL: Generate only stemmed terms (but without a "Z" prefix).
140 * - STEM_ALL_Z: Generate only stemmed terms (with a "Z" prefix).
142 void set_stemming_strategy(stem_strategy strategy
);
/** Set the maximum length word to index.
 *
 * The limit is on the length of a word prior to stemming and prior to
 * adding any term prefix.
 *
 * The backends mostly impose a limit on the length of terms (often of
 * about 240 bytes), but it's generally useful to have a lower limit to
 * help prevent the index being bloated by useless junk terms from trying
 * to index things like binary data, uuencoded data, ASCII art, etc.
 *
 * This method was new in Xapian 1.3.1.
 *
 * @param max_word_length  The maximum length word to index, in bytes in
 *                         UTF-8 representation.  Default is 64.
 */
void set_max_word_length(unsigned max_word_length);
163 * @param itor Utf8Iterator pointing to the text to index.
164 * @param wdf_inc The wdf increment (default 1).
165 * @param prefix The term prefix to use (default is no prefix).
167 void index_text(const Xapian::Utf8Iterator
& itor
,
168 Xapian::termcount wdf_inc
= 1,
169 const std::string
& prefix
= std::string());
171 /** Index some text in a std::string.
173 * @param text The text to index.
174 * @param wdf_inc The wdf increment (default 1).
175 * @param prefix The term prefix to use (default is no prefix).
177 void index_text(const std::string
& text
,
178 Xapian::termcount wdf_inc
= 1,
179 const std::string
& prefix
= std::string()) {
180 return index_text(Utf8Iterator(text
), wdf_inc
, prefix
);
183 /** Index some text without positional information.
185 * Just like index_text, but no positional information is generated. This
186 * means that the database will be significantly smaller, but that phrase
187 * searching and NEAR won't be supported.
189 * @param itor Utf8Iterator pointing to the text to index.
190 * @param wdf_inc The wdf increment (default 1).
191 * @param prefix The term prefix to use (default is no prefix).
193 void index_text_without_positions(const Xapian::Utf8Iterator
& itor
,
194 Xapian::termcount wdf_inc
= 1,
195 const std::string
& prefix
= std::string());
197 /** Index some text in a std::string without positional information.
199 * Just like index_text, but no positional information is generated. This
200 * means that the database will be significantly smaller, but that phrase
201 * searching and NEAR won't be supported.
203 * @param text The text to index.
204 * @param wdf_inc The wdf increment (default 1).
205 * @param prefix The term prefix to use (default is no prefix).
207 void index_text_without_positions(const std::string
& text
,
208 Xapian::termcount wdf_inc
= 1,
209 const std::string
& prefix
= std::string()) {
210 return index_text_without_positions(Utf8Iterator(text
), wdf_inc
, prefix
);
213 /** Increase the term position used by index_text.
215 * This can be used between indexing text from different fields or other
216 * places to prevent phrase searches from spanning between them (e.g.
217 * between the title and body text, or between two chapters in a book).
219 * @param delta Amount to increase the term position by (default: 100).
221 void increase_termpos(Xapian::termcount delta
= 100);
223 /// Get the current term position.
224 Xapian::termcount
get_termpos() const;
226 /** Set the current term position.
228 * @param termpos The new term position to set.
230 void set_termpos(Xapian::termcount termpos
);
232 /// Return a string describing this object.
233 std::string
get_description() const;
238 #endif // XAPIAN_INCLUDED_TERMGENERATOR_H