Fix assertion for value range on empty slot
[xapian.git] / xapian-core / queryparser / cjk-tokenizer.h
blob4571874b12cc6b872799a30ec09d3512aaefbcb8
1 /** @file cjk-tokenizer.h
2 * @brief Tokenise CJK text as n-grams
3 */
4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
5 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
6 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
7 * Copyright (c) 2011 Olly Betts
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to
11 * deal in the Software without restriction, including without limitation the
12 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
13 * sell copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
28 #ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H
29 #define XAPIAN_INCLUDED_CJK_TOKENIZER_H
31 #include "xapian/unicode.h"
33 #include <string>
35 namespace CJK {
37 /** Should we use the CJK n-gram code?
39 * The first time this is called it reads the environment variable
40 * XAPIAN_CJK_NGRAM and returns true if it is set to a non-empty value.
41 * The result is cached, so subsequent calls return the same value.
43 bool is_cjk_enabled();
/** Classify a single Unicode codepoint.
 *
 *  NOTE(review): presumably returns true when @a codepoint belongs to a CJK
 *  script; the exact ranges tested are in the implementation file, which is
 *  not visible from this header - confirm there.
 */
45 bool codepoint_is_cjk(unsigned codepoint);
/** Obtain a string of text from a UTF-8 iterator.
 *
 *  @a it is taken by non-const reference, so the caller's iterator can be
 *  modified by this call.
 *
 *  NOTE(review): presumably the returned string is the run of CJK characters
 *  starting at @a it, and @a it is left just past that run - confirm against
 *  the implementation.
 */
47 std::string get_cjk(Xapian::Utf8Iterator &it);
51 class CJKTokenIterator {
52 Xapian::Utf8Iterator it;
54 mutable Xapian::Utf8Iterator p;
56 mutable unsigned len;
58 mutable std::string current_token;
60 public:
61 explicit CJKTokenIterator(const std::string & s)
62 : it(s) { }
64 explicit CJKTokenIterator(const Xapian::Utf8Iterator & it_)
65 : it(it_) { }
67 CJKTokenIterator()
68 : it() { }
70 const std::string & operator*() const;
72 CJKTokenIterator & operator++();
74 /// Get the length of the current token in Unicode characters.
75 unsigned get_length() const { return len; }
77 friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &);
80 inline bool
81 operator==(const CJKTokenIterator & a, const CJKTokenIterator & b)
83 // We only really care about comparisons where one or other is an end
84 // iterator.
85 return a.it == b.it;
88 inline bool
89 operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b)
91 return !(a == b);
94 #endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H