Fix documentation comment typo
[xapian.git] / xapian-core / include / xapian / termgenerator.h
blobeb377a7899db035ef3e4593caf1a468034fa609c
1 /** @file termgenerator.h
2 * @brief parse free text and generate terms
3 */
4 /* Copyright (C) 2007,2009,2011,2012,2013,2014 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_TERMGENERATOR_H
22 #define XAPIAN_INCLUDED_TERMGENERATOR_H
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error "Never use <xapian/termgenerator.h> directly; include <xapian.h> instead."
26 #endif
28 #include <xapian/intrusive_ptr.h>
29 #include <xapian/types.h>
30 #include <xapian/unicode.h>
31 #include <xapian/visibility.h>
33 #include <string>
35 namespace Xapian {
37 class Document;
38 class Stem;
39 class Stopper;
40 class WritableDatabase;
42 /** Parses a piece of text and generates terms.
44 * This module takes a piece of text and parses it to produce words which are
45 * then used to generate suitable terms for indexing. The terms generated are
46 * suitable for use with Query objects produced by the QueryParser class.
48 class XAPIAN_VISIBILITY_DEFAULT TermGenerator {
49 public:
50 /// @private @internal Class representing the TermGenerator internals.
51 class Internal;
52 /// @private @internal Reference counted internals.
53 Xapian::Internal::intrusive_ptr<Internal> internal;
55 /// Copy constructor.
56 TermGenerator(const TermGenerator & o);
58 /// Assignment.
59 TermGenerator & operator=(const TermGenerator & o);
61 /// Default constructor.
62 TermGenerator();
64 /// Destructor.
65 ~TermGenerator();
67 /// Set the Xapian::Stem object to be used for generating stemmed terms.
68 void set_stemmer(const Xapian::Stem & stemmer);
70 /** Set the Xapian::Stopper object to be used for identifying stopwords.
72 * Stemmed forms of stopwords aren't indexed, but unstemmed forms still
73 * are, so that searches for phrases including stopwords still work.
75 * @param stop The Stopper object to set (default NULL, which means no
76 * stopwords).
78 void set_stopper(const Xapian::Stopper *stop = NULL);
80 /// Set the current document.
81 void set_document(const Xapian::Document & doc);
83 /// Get the current document.
84 const Xapian::Document & get_document() const;
86 /// Set the database to index spelling data to.
87 void set_database(const Xapian::WritableDatabase &db);
89 /// For backward compatibility with Xapian 1.2
90 typedef int flags;
92 /// Flags to OR together and pass to TermGenerator::set_flags().
93 enum {
94 /// Index data required for spelling correction.
95 FLAG_SPELLING = 128, // Value matches QueryParser flag.
97 /** Enable generation of n-grams from CJK text.
99 * With this enabled, spans of CJK characters are split into unigrams
100 * and bigrams, with the unigrams carrying positional information.
101 * Non-CJK characters are split into words as normal.
103 * The corresponding option needs to be passed to QueryParser.
105 * Flag added in Xapian 1.3.4 and 1.2.22, but this mode can be
106 * enabled in 1.2.8 and later by setting the environment variable
107 * XAPIAN_CJK_NGRAM.
109 FLAG_CJK_NGRAM = 2048 // Value matches QueryParser flag.
112 /// Stemming strategies, for use with set_stemming_strategy().
113 typedef enum { STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z } stem_strategy;
115 /** Set flags.
117 * The new value of flags is: (flags & mask) ^ toggle
119 * To just set the flags, pass the new flags in toggle and the
120 * default value for mask.
122 * @param toggle Flags to XOR.
123 * @param mask Flags to AND with first.
125 * @return The old flags setting.
127 flags set_flags(flags toggle, flags mask = flags(0));
129 /** Set the stemming strategy.
131 * This method controls how the stemming algorithm is applied. It was
132 * new in Xapian 1.3.1.
134 * @param strategy The strategy to use - possible values are:
135 * - STEM_NONE: Don't perform any stemming - only unstemmed terms
136 * are generated.
137 * - STEM_SOME: Generate both stemmed (with a "Z" prefix) and unstemmed
138 * terms. This is the default strategy.
139 * - STEM_ALL: Generate only stemmed terms (but without a "Z" prefix).
140 * - STEM_ALL_Z: Generate only stemmed terms (with a "Z" prefix).
142 void set_stemming_strategy(stem_strategy strategy);
144 /** Set the maximum length word to index.
146 * The limit is on the length of a word prior to stemming and prior to
147 * adding any term prefix.
149 * The backends mostly impose a limit on the length of terms (often of
150 * about 240 bytes), but it's generally useful to have a lower limit to
151 * help prevent the index being bloated by useless junk terms from trying
152 * to index things like binary data, uuencoded data, ASCII art, etc.
154 * This method was new in Xapian 1.3.1.
156 * @param max_word_length The maximum length word to index, in bytes in
157 * UTF-8 representation. Default is 64.
159 void set_max_word_length(unsigned max_word_length);
161 /** Index some text.
163 * @param itor Utf8Iterator pointing to the text to index.
164 * @param wdf_inc The wdf increment (default 1).
165 * @param prefix The term prefix to use (default is no prefix).
167 void index_text(const Xapian::Utf8Iterator & itor,
168 Xapian::termcount wdf_inc = 1,
169 const std::string & prefix = std::string());
171 /** Index some text in a std::string.
173 * @param text The text to index.
174 * @param wdf_inc The wdf increment (default 1).
175 * @param prefix The term prefix to use (default is no prefix).
177 void index_text(const std::string & text,
178 Xapian::termcount wdf_inc = 1,
179 const std::string & prefix = std::string()) {
180 return index_text(Utf8Iterator(text), wdf_inc, prefix);
183 /** Index some text without positional information.
185 * Just like index_text, but no positional information is generated. This
186 * means that the database will be significantly smaller, but that phrase
187 * searching and NEAR won't be supported.
189 * @param itor Utf8Iterator pointing to the text to index.
190 * @param wdf_inc The wdf increment (default 1).
191 * @param prefix The term prefix to use (default is no prefix).
193 void index_text_without_positions(const Xapian::Utf8Iterator & itor,
194 Xapian::termcount wdf_inc = 1,
195 const std::string & prefix = std::string());
197 /** Index some text in a std::string without positional information.
199 * Just like index_text, but no positional information is generated. This
200 * means that the database will be significantly smaller, but that phrase
201 * searching and NEAR won't be supported.
203 * @param text The text to index.
204 * @param wdf_inc The wdf increment (default 1).
205 * @param prefix The term prefix to use (default is no prefix).
207 void index_text_without_positions(const std::string & text,
208 Xapian::termcount wdf_inc = 1,
209 const std::string & prefix = std::string()) {
210 return index_text_without_positions(Utf8Iterator(text), wdf_inc, prefix);
213 /** Increase the term position used by index_text.
215 * This can be used between indexing text from different fields or other
216 * places to prevent phrase searches from spanning between them (e.g.
217 * between the title and body text, or between two chapters in a book).
219 * @param delta Amount to increase the term position by (default: 100).
221 void increase_termpos(Xapian::termcount delta = 100);
223 /// Get the current term position.
224 Xapian::termcount get_termpos() const;
226 /** Set the current term position.
228 * @param termpos The new term position to set.
230 void set_termpos(Xapian::termcount termpos);
232 /// Return a string describing this object.
233 std::string get_description() const;
238 #endif // XAPIAN_INCLUDED_TERMGENERATOR_H