Check signed right shift behaviour at compile time
[xapian.git] / xapian-core / include / xapian / unicode.h
blobbaec5b429152dbbced93f4f6fed99bea034b4289
1 /** @file unicode.h
2 * @brief Unicode and UTF-8 related classes and functions.
3 */
4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_UNICODE_H
22 #define XAPIAN_INCLUDED_UNICODE_H
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error "Never use <xapian/unicode.h> directly; include <xapian.h> instead."
26 #endif
28 #include <xapian/attributes.h>
29 #include <xapian/visibility.h>
31 #include <string>
33 namespace Xapian {
35 /** An iterator which returns Unicode character values from a UTF-8 encoded
36 * string.
38 class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
39 const unsigned char *p;
40 const unsigned char *end;
41 mutable unsigned seqlen;
43 bool XAPIAN_NOTHROW(calculate_sequence_length() const);
45 unsigned get_char() const;
47 Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
48 : p(p_), end(end_), seqlen(seqlen_) { }
50 public:
51 /** Return the raw const char * pointer for the current position. */
52 const char * raw() const {
53 return reinterpret_cast<const char *>(p ? p : end);
56 /** Return the number of bytes left in the iterator's buffer. */
57 size_t left() const { return p ? end - p : 0; }
59 /** Assign a new string to the iterator.
61 * The iterator will forget the string it was iterating through, and
62 * return characters from the start of the new string when next called.
63 * The string is not copied into the iterator, so it must remain valid
64 * while the iteration is in progress.
66 * @param p_ A pointer to the start of the string to read.
68 * @param len The length of the string to read.
70 void assign(const char *p_, size_t len) {
71 if (len) {
72 p = reinterpret_cast<const unsigned char*>(p_);
73 end = p + len;
74 seqlen = 0;
75 } else {
76 p = NULL;
80 /** Assign a new string to the iterator.
82 * The iterator will forget the string it was iterating through, and
83 * return characters from the start of the new string when next called.
84 * The string is not copied into the iterator, so it must remain valid
85 * while the iteration is in progress.
87 * @param s The string to read. Must not be modified while the iteration
88 * is in progress.
90 void assign(const std::string &s) { assign(s.data(), s.size()); }
92 /** Create an iterator given a pointer to a null terminated string.
94 * The iterator will return characters from the start of the string when
95 * next called. The string is not copied into the iterator, so it must
96 * remain valid while the iteration is in progress.
98 * @param p_ A pointer to the start of the null terminated string to read.
100 explicit Utf8Iterator(const char *p_);
102 /** Create an iterator given a pointer and a length.
104 * The iterator will return characters from the start of the string when
105 * next called. The string is not copied into the iterator, so it must
106 * remain valid while the iteration is in progress.
108 * @param p_ A pointer to the start of the string to read.
110 * @param len The length of the string to read.
112 Utf8Iterator(const char *p_, size_t len) { assign(p_, len); }
114 /** Create an iterator given a string.
116 * The iterator will return characters from the start of the string when
117 * next called. The string is not copied into the iterator, so it must
118 * remain valid while the iteration is in progress.
120 * @param s The string to read. Must not be modified while the iteration
121 * is in progress.
123 Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); }
125 /** Create an iterator which is at the end of its iteration.
127 * This can be compared to another iterator to check if the other iterator
128 * has reached its end.
130 XAPIAN_NOTHROW(Utf8Iterator())
131 : p(NULL), end(0), seqlen(0) { }
133 /** Get the current Unicode character value pointed to by the iterator.
135 * If an invalid UTF-8 sequence is encountered, then the byte values
136 * comprising it are returned until valid UTF-8 or the end of the input is
137 * reached.
139 * Returns unsigned(-1) if the iterator has reached the end of its buffer.
141 unsigned XAPIAN_NOTHROW(operator*() const) XAPIAN_PURE_FUNCTION;
143 /** @private @internal Get the current Unicode character
144 * value pointed to by the iterator.
146 * If an invalid UTF-8 sequence is encountered, then the byte values
147 * comprising it are returned with the top bit set (so the caller can
148 * differentiate these from the same values arising from valid UTF-8)
149 * until valid UTF-8 or the end of the input is reached.
151 * Returns unsigned(-1) if the iterator has reached the end of its buffer.
153 unsigned XAPIAN_NOTHROW(strict_deref() const) XAPIAN_PURE_FUNCTION;
155 /** Move forward to the next Unicode character.
157 * @return An iterator pointing to the position before the move.
159 Utf8Iterator operator++(int) {
160 // If we've not calculated seqlen yet, do so.
161 if (seqlen == 0) calculate_sequence_length();
162 const unsigned char *old_p = p;
163 unsigned old_seqlen = seqlen;
164 p += seqlen;
165 if (p == end) p = NULL;
166 seqlen = 0;
167 return Utf8Iterator(old_p, end, old_seqlen);
170 /** Move forward to the next Unicode character.
172 * @return A reference to this object.
174 Utf8Iterator & operator++() {
175 if (seqlen == 0) calculate_sequence_length();
176 p += seqlen;
177 if (p == end) p = NULL;
178 seqlen = 0;
179 return *this;
182 /** Test two Utf8Iterators for equality.
184 * @param other The Utf8Iterator to compare this one with.
185 * @return true iff the iterators point to the same position.
187 bool XAPIAN_NOTHROW(operator==(const Utf8Iterator &other) const) {
188 return p == other.p;
191 /** Test two Utf8Iterators for inequality.
193 * @param other The Utf8Iterator to compare this one with.
194 * @return true iff the iterators do not point to the same position.
196 bool XAPIAN_NOTHROW(operator!=(const Utf8Iterator &other) const) {
197 return p != other.p;
200 /// We implement the semantics of an STL input_iterator.
201 //@{
202 typedef std::input_iterator_tag iterator_category;
203 typedef unsigned value_type;
204 typedef size_t difference_type;
205 typedef const unsigned * pointer;
206 typedef const unsigned & reference;
207 //@}
210 /// Functions associated with handling Unicode characters.
211 namespace Unicode {
/** Each Unicode character is in exactly one of these categories.
 *
 *  The values run consecutively from 0 (UNASSIGNED) to 29 (OTHER_SYMBOL), so
 *  each one can be used as a bit position in an unsigned mask.
 */
typedef enum {
    UNASSIGNED                = 0,
    UPPERCASE_LETTER          = 1,
    LOWERCASE_LETTER          = 2,
    TITLECASE_LETTER          = 3,
    MODIFIER_LETTER           = 4,
    OTHER_LETTER              = 5,
    NON_SPACING_MARK          = 6,
    ENCLOSING_MARK            = 7,
    COMBINING_SPACING_MARK    = 8,
    DECIMAL_DIGIT_NUMBER      = 9,
    LETTER_NUMBER             = 10,
    OTHER_NUMBER              = 11,
    SPACE_SEPARATOR           = 12,
    LINE_SEPARATOR            = 13,
    PARAGRAPH_SEPARATOR       = 14,
    CONTROL                   = 15,
    FORMAT                    = 16,
    PRIVATE_USE               = 17,
    SURROGATE                 = 18,
    CONNECTOR_PUNCTUATION     = 19,
    DASH_PUNCTUATION          = 20,
    OPEN_PUNCTUATION          = 21,
    CLOSE_PUNCTUATION         = 22,
    INITIAL_QUOTE_PUNCTUATION = 23,
    FINAL_QUOTE_PUNCTUATION   = 24,
    OTHER_PUNCTUATION         = 25,
    MATH_SYMBOL               = 26,
    CURRENCY_SYMBOL           = 27,
    MODIFIER_SYMBOL           = 28,
    OTHER_SYMBOL              = 29
} category;
247 namespace Internal {
248 /** @private @internal Extract the information about a character from the
249 * Unicode character tables.
251 * Characters outside of the Unicode range (i.e. ch >= 0x110000) are
252 * treated as UNASSIGNED with no case variants.
254 XAPIAN_VISIBILITY_DEFAULT
255 int XAPIAN_NOTHROW(get_character_info(unsigned ch)) XAPIAN_CONST_FUNCTION;
/** @private @internal Extract how to convert the case of a Unicode
 *  character from its info.
 *
 *  The case type is the 3-bit field held in bits 5 to 7 of @a info, so the
 *  result is always in the range 0 to 7.
 */
inline int get_case_type(int info) {
    // Isolate bits 5-7 first so the shift only ever sees a non-negative value.
    const int case_bits = info & 0xe0;
    return case_bits >> 5;
}
262 /** @private @internal Extract the category of a Unicode character from its
263 * info.
265 inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
/** @private @internal Extract the delta to use for case conversion of a
 *  character from its info.
 *
 *  The delta is the signed value stored above the low 8 bits of @a info,
 *  i.e. @a info shifted right by 8 with the sign preserved.
 */
inline int get_delta(int info) {
    /* It's implementation defined if sign extension happens when right
     * shifting a signed int, although in practice sign extension is what
     * most compilers implement.
     *
     * Some compilers are smart enough to spot common idioms for sign
     * extension, but not all (e.g. GCC < 7 doesn't spot the portable form
     * used as the fallback below), so we test what the implementation defined
     * behaviour is with a constant condition which should get optimised away.
     */
    return ((-1 >> 1) == -1)
        // Right shift sign-extends, so a plain shift gives the answer.
        ? (info >> 8)
        // Right shift shifts in zeros, so for negative values apply a
        // bitwise NOT before and after the shift to get the sign-extended
        // result.
        : ((info >= 0) ? (info >> 8) : ~(~info >> 8));
}
291 /** Convert a single non-ASCII Unicode character to UTF-8.
293 * This is intended mainly as a helper method for to_utf8().
295 * @param ch The character (which must be >= 128) to write to @a buf.
296 * @param buf The buffer to write the character to - it must have
297 * space for (at least) 4 bytes.
299 * @return The length of the resultant UTF-8 character in bytes.
301 XAPIAN_VISIBILITY_DEFAULT
302 unsigned nonascii_to_utf8(unsigned ch, char * buf);
304 /** Convert a single Unicode character to UTF-8.
306 * @param ch The character to write to @a buf.
307 * @param buf The buffer to write the character to - it must have
308 * space for (at least) 4 bytes.
310 * @return The length of the resultant UTF-8 character in bytes.
312 inline unsigned to_utf8(unsigned ch, char *buf) {
313 if (ch < 128) {
314 *buf = static_cast<unsigned char>(ch);
315 return 1;
317 return Xapian::Unicode::nonascii_to_utf8(ch, buf);
320 /** Append the UTF-8 representation of a single Unicode character to a
321 * std::string.
323 inline void append_utf8(std::string &s, unsigned ch) {
324 char buf[4];
325 s.append(buf, to_utf8(ch, buf));
328 /// Return the category which a given Unicode character falls into.
329 inline category get_category(unsigned ch) {
330 return Internal::get_category(Internal::get_character_info(ch));
333 /// Test if a given Unicode character is "word character".
334 inline bool is_wordchar(unsigned ch) {
335 const unsigned int WORDCHAR_MASK =
336 (1 << Xapian::Unicode::UPPERCASE_LETTER) |
337 (1 << Xapian::Unicode::LOWERCASE_LETTER) |
338 (1 << Xapian::Unicode::TITLECASE_LETTER) |
339 (1 << Xapian::Unicode::MODIFIER_LETTER) |
340 (1 << Xapian::Unicode::OTHER_LETTER) |
341 (1 << Xapian::Unicode::NON_SPACING_MARK) |
342 (1 << Xapian::Unicode::ENCLOSING_MARK) |
343 (1 << Xapian::Unicode::COMBINING_SPACING_MARK) |
344 (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
345 (1 << Xapian::Unicode::LETTER_NUMBER) |
346 (1 << Xapian::Unicode::OTHER_NUMBER) |
347 (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
348 return ((WORDCHAR_MASK >> get_category(ch)) & 1);
351 /// Test if a given Unicode character is a whitespace character.
352 inline bool is_whitespace(unsigned ch) {
353 const unsigned int WHITESPACE_MASK =
354 (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
355 (1 << Xapian::Unicode::SPACE_SEPARATOR) |
356 (1 << Xapian::Unicode::LINE_SEPARATOR) |
357 (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
358 return ((WHITESPACE_MASK >> get_category(ch)) & 1);
361 /// Test if a given Unicode character is a currency symbol.
362 inline bool is_currency(unsigned ch) {
363 return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
366 /// Convert a Unicode character to lowercase.
367 inline unsigned tolower(unsigned ch) {
368 int info = Xapian::Unicode::Internal::get_character_info(ch);
369 if (!(Internal::get_case_type(info) & 2))
370 return ch;
371 return ch + Internal::get_delta(info);
374 /// Convert a Unicode character to uppercase.
375 inline unsigned toupper(unsigned ch) {
376 int info = Xapian::Unicode::Internal::get_character_info(ch);
377 if (!(Internal::get_case_type(info) & 4))
378 return ch;
379 return ch - Internal::get_delta(info);
382 /// Convert a UTF-8 std::string to lowercase.
383 inline std::string
384 tolower(const std::string &term)
386 std::string result;
387 result.reserve(term.size());
388 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
389 append_utf8(result, tolower(*i));
391 return result;
394 /// Convert a UTF-8 std::string to uppercase.
395 inline std::string
396 toupper(const std::string &term)
398 std::string result;
399 result.reserve(term.size());
400 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
401 append_utf8(result, toupper(*i));
403 return result;
410 #endif // XAPIAN_INCLUDED_UNICODE_H