1 /* indextext.h: split text into terms
3 * ----START-LICENCE----
4 * Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
22 * -----END-LICENCE-----
33 #include "symboltab.h"
37 // Put a limit on the size of terms to help prevent the index being bloated
38 // by useless junk terms
39 static const unsigned int MAX_PROB_TERM_LENGTH
= 64;
42 lowercase_term(string
&term
)
44 string::iterator i
= term
.begin();
45 while (i
!= term
.end()) {
51 class AccentNormalisingItor
{
53 string::const_iterator itor
;
57 AccentNormalisingItor()
58 : itor(), queued(0) {}
59 AccentNormalisingItor(string::const_iterator itor_
)
60 : itor(itor_
), queued(0) {}
61 void operator=(string::const_iterator itor_
)
66 bool operator==(const AccentNormalisingItor
&o
) const {
67 return queued
== o
.queued
&& itor
== o
.itor
;
69 bool operator!=(const AccentNormalisingItor
&o
) const {
72 char operator*() const {
73 if (queued
) return queued
;
74 unsigned char ch
= (unsigned char)*itor
;
79 ) return TRANSLIT1
[ch
- 160];
82 AccentNormalisingItor
& operator++() {
86 void operator++(int) {
90 unsigned char ch
= (unsigned char)*itor
;
96 ch
= TRANSLIT2
[(unsigned char)ch
- 160];
105 string::const_iterator
raw() const { return itor
; }
106 /// Allow use as an STL iterator
108 typedef std::input_iterator_tag iterator_category
;
109 typedef char value_type
;
110 typedef string::size_type difference_type
;
111 typedef const char * pointer
;
112 typedef const char & reference
;
117 index_text(const std::string
&s
, Xapian::Document
&doc
, Xapian::Stem
&stemmer
,
118 Xapian::termcount wdfinc
, const std::string
&prefix
,
119 Xapian::termpos pos
= static_cast<Xapian::termpos
>(-1)
120 // Not in GCC 2.95.2 numeric_limits<Xapian::termpos>::max()
123 inline Xapian::termpos
124 index_text(const std::string
&s
, Xapian::Document
&doc
, Xapian::Stem
&stemmer
,
127 return index_text(s
, doc
, stemmer
, 1, std::string(), pos
);