Add README
[xapian-trec.git] / indextext.h
blobf6037d1bad5c3d9bce1d8aa117308ea154016c72
1 /* indextext.h: split text into terms
3 * ----START-LICENCE----
4 * Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21 * USA
22 * -----END-LICENCE-----
25 #ifndef _INDEXTEXT_H_
26 #define _INDEXTEXT_H_
28 #include <xapian.h>
30 #include <limits.h>
31 #include <ctype.h>
33 #include "symboltab.h"
35 using std::string;
// Put a limit on the size of terms to help prevent the index being bloated
// by useless junk terms.
// NOTE(review): nothing visible in this header uses the constant, and whether
// the limit counts bytes or characters is not shown here - confirm against the
// code which applies it.
static const unsigned int MAX_PROB_TERM_LENGTH = 64;
/** Convert every character of @a term to lower case, in place.
 *
 *  Each char is converted to unsigned char before being handed to tolower().
 *  The <ctype.h> functions have undefined behaviour when passed a negative
 *  value other than EOF, and plain char is signed on many platforms, so any
 *  byte >= 0x80 would otherwise be passed as a negative argument.
 *
 *  The mapping applied is whatever tolower() does in the current C locale.
 *
 *  @param term  The term to lowercase; modified in place.
 */
static inline void
lowercase_term(std::string &term)
{
    for (std::string::iterator i = term.begin(); i != term.end(); ++i) {
        *i = static_cast<char>(tolower(static_cast<unsigned char>(*i)));
    }
}
51 class AccentNormalisingItor {
52 private:
53 string::const_iterator itor;
54 char queued;
56 public:
57 AccentNormalisingItor()
58 : itor(), queued(0) {}
59 AccentNormalisingItor(string::const_iterator itor_)
60 : itor(itor_), queued(0) {}
61 void operator=(string::const_iterator itor_)
63 itor = itor_;
64 queued = 0;
66 bool operator==(const AccentNormalisingItor &o) const {
67 return queued == o.queued && itor == o.itor;
69 bool operator!=(const AccentNormalisingItor &o) const {
70 return !(*this == o);
72 char operator*() const {
73 if (queued) return queued;
74 unsigned char ch = (unsigned char)*itor;
75 if (ch >= 160
76 #if CHAR_BIT > 8
77 && ch < 256
78 #endif
79 ) return TRANSLIT1[ch - 160];
80 return (char)ch;
82 AccentNormalisingItor & operator++() {
83 this->operator++(0);
84 return *this;
86 void operator++(int) {
87 if (queued) {
88 queued = 0;
89 } else {
90 unsigned char ch = (unsigned char)*itor;
91 if (ch >= 160
92 #if CHAR_BIT > 8
93 && ch < 256
94 #endif
95 ) {
96 ch = TRANSLIT2[(unsigned char)ch - 160];
97 if (ch != ' ') {
98 queued = ch;
99 return;
103 ++itor;
105 string::const_iterator raw() const { return itor; }
106 /// Allow use as an STL iterator
107 //@{
108 typedef std::input_iterator_tag iterator_category;
109 typedef char value_type;
110 typedef string::size_type difference_type;
111 typedef const char * pointer;
112 typedef const char & reference;
113 //@}
116 Xapian::termpos
117 index_text(const std::string &s, Xapian::Document &doc, Xapian::Stem &stemmer,
118 Xapian::termcount wdfinc, const std::string &prefix,
119 Xapian::termpos pos = static_cast<Xapian::termpos>(-1)
120 // Not in GCC 2.95.2 numeric_limits<Xapian::termpos>::max()
123 inline Xapian::termpos
124 index_text(const std::string &s, Xapian::Document &doc, Xapian::Stem &stemmer,
125 Xapian::termpos pos)
127 return index_text(s, doc, stemmer, 1, std::string(), pos);
130 #endif