xapian-core/include/xapian/unicode.h

   1 /** @file unicode.h
   2  * @brief Unicode and UTF-8 related classes and functions.
   3  */
   4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #ifndef XAPIAN_INCLUDED_UNICODE_H
  22 #define XAPIAN_INCLUDED_UNICODE_H
  23
  24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
  25 # error "Never use <xapian/unicode.h> directly; include <xapian.h> instead."
  26 #endif
  27
  28 #include <xapian/attributes.h>
  29 #include <xapian/visibility.h>
  30
  31 #include <string>
  32
  33 namespace Xapian {
  34
  35 /** An iterator which returns Unicode character values from a UTF-8 encoded
  36  *  string.
  37  */
  38 class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
  39     const unsigned char *p;
  40     const unsigned char *end;
  41     mutable unsigned seqlen;
  42
  43     bool XAPIAN_NOTHROW(calculate_sequence_length() const);
  44
  45     unsigned get_char() const;
  46
  47     Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
  48         : p(p_), end(end_), seqlen(seqlen_) { }
  49
  50   public:
  51     /** Return the raw const char * pointer for the current position. */
  52     const char * raw() const {
  53         return reinterpret_cast<const char *>(p ? p : end);
  54     }
  55
  56     /** Return the number of bytes left in the iterator's buffer. */
  57     size_t left() const { return p ? end - p : 0; }
  58
  59     /** Assign a new string to the iterator.
  60      *
  61      *  The iterator will forget the string it was iterating through, and
  62      *  return characters from the start of the new string when next called.
  63      *  The string is not copied into the iterator, so it must remain valid
  64      *  while the iteration is in progress.
  65      *
  66      *  @param p_ A pointer to the start of the string to read.
  67      *
  68      *  @param len The length of the string to read.
  69      */
  70     void assign(const char *p_, size_t len) {
  71         if (len) {
  72             p = reinterpret_cast<const unsigned char*>(p_);
  73             end = p + len;
  74             seqlen = 0;
  75         } else {
  76             p = NULL;
  77         }
  78     }
  79
  80     /** Assign a new string to the iterator.
  81      *
  82      *  The iterator will forget the string it was iterating through, and
  83      *  return characters from the start of the new string when next called.
  84      *  The string is not copied into the iterator, so it must remain valid
  85      *  while the iteration is in progress.
  86      *
  87      *  @param s The string to read.  Must not be modified while the iteration
  88      *           is in progress.
  89      */
  90     void assign(const std::string &s) { assign(s.data(), s.size()); }
  91
  92     /** Create an iterator given a pointer to a null terminated string.
  93      *
  94      *  The iterator will return characters from the start of the string when
  95      *  next called.  The string is not copied into the iterator, so it must
  96      *  remain valid while the iteration is in progress.
  97      *
  98      *  @param p_ A pointer to the start of the null terminated string to read.
  99      */
 100     explicit Utf8Iterator(const char *p_);
 101
 102     /** Create an iterator given a pointer and a length.
 103      *
 104      *  The iterator will return characters from the start of the string when
 105      *  next called.  The string is not copied into the iterator, so it must
 106      *  remain valid while the iteration is in progress.
 107      *
 108      *  @param p_ A pointer to the start of the string to read.
 109      *
 110      *  @param len The length of the string to read.
 111      */
 112     Utf8Iterator(const char *p_, size_t len) { assign(p_, len); }
 113
 114     /** Create an iterator given a string.
 115      *
 116      *  The iterator will return characters from the start of the string when
 117      *  next called.  The string is not copied into the iterator, so it must
 118      *  remain valid while the iteration is in progress.
 119      *
 120      *  @param s The string to read.  Must not be modified while the iteration
 121      *           is in progress.
 122      */
 123     Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); }
 124
 125     /** Create an iterator which is at the end of its iteration.
 126      *
 127      *  This can be compared to another iterator to check if the other iterator
 128      *  has reached its end.
 129      */
 130     XAPIAN_NOTHROW(Utf8Iterator())
 131         : p(NULL), end(0), seqlen(0) { }
 132
 133     /** Get the current Unicode character value pointed to by the iterator.
 134      *
 135      *  If an invalid UTF-8 sequence is encountered, then the byte values
 136      *  comprising it are returned until valid UTF-8 or the end of the input is
 137      *  reached.
 138      *
 139      *  Returns unsigned(-1) if the iterator has reached the end of its buffer.
 140      */
 141     unsigned XAPIAN_NOTHROW(operator*() const) XAPIAN_PURE_FUNCTION;
 142
 143     /** @private @internal Get the current Unicode character
 144      *  value pointed to by the iterator.
 145      *
 146      *  If an invalid UTF-8 sequence is encountered, then the byte values
 147      *  comprising it are returned with the top bit set (so the caller can
 148      *  differentiate these from the same values arising from valid UTF-8)
 149      *  until valid UTF-8 or the end of the input is reached.
 150      *
 151      *  Returns unsigned(-1) if the iterator has reached the end of its buffer.
 152      */
 153     unsigned XAPIAN_NOTHROW(strict_deref() const) XAPIAN_PURE_FUNCTION;
 154
 155     /** Move forward to the next Unicode character.
 156      *
 157      *  @return An iterator pointing to the position before the move.
 158      */
 159     Utf8Iterator operator++(int) {
 160         // If we've not calculated seqlen yet, do so.
 161         if (seqlen == 0) calculate_sequence_length();
 162         const unsigned char *old_p = p;
 163         unsigned old_seqlen = seqlen;
 164         p += seqlen;
 165         if (p == end) p = NULL;
 166         seqlen = 0;
 167         return Utf8Iterator(old_p, end, old_seqlen);
 168     }
 169
 170     /** Move forward to the next Unicode character.
 171      *
 172      *  @return A reference to this object.
 173      */
 174     Utf8Iterator & operator++() {
 175         if (seqlen == 0) calculate_sequence_length();
 176         p += seqlen;
 177         if (p == end) p = NULL;
 178         seqlen = 0;
 179         return *this;
 180     }
 181
 182     /** Test two Utf8Iterators for equality.
 183      *
 184      *  @param other    The Utf8Iterator to compare this one with.
 185      *  @return true iff the iterators point to the same position.
 186      */
 187     bool XAPIAN_NOTHROW(operator==(const Utf8Iterator &other) const) {
 188         return p == other.p;
 189     }
 190
 191     /** Test two Utf8Iterators for inequality.
 192      *
 193      *  @param other    The Utf8Iterator to compare this one with.
 194      *  @return true iff the iterators do not point to the same position.
 195      */
 196     bool XAPIAN_NOTHROW(operator!=(const Utf8Iterator &other) const) {
 197         return p != other.p;
 198     }
 199
 200     /// We implement the semantics of an STL input_iterator.
 201     //@{
 202     typedef std::input_iterator_tag iterator_category;
 203     typedef unsigned value_type;
 204     typedef size_t difference_type;
 205     typedef const unsigned * pointer;
 206     typedef const unsigned & reference;
 207     //@}
 208 };
 209
 210 /// Functions associated with handling Unicode characters.
 211 namespace Unicode {
 212
 213 /** Each Unicode character is in exactly one of these categories. */
 214 typedef enum {
 215     UNASSIGNED,
 216     UPPERCASE_LETTER,
 217     LOWERCASE_LETTER,
 218     TITLECASE_LETTER,
 219     MODIFIER_LETTER,
 220     OTHER_LETTER,
 221     NON_SPACING_MARK,
 222     ENCLOSING_MARK,
 223     COMBINING_SPACING_MARK,
 224     DECIMAL_DIGIT_NUMBER,
 225     LETTER_NUMBER,
 226     OTHER_NUMBER,
 227     SPACE_SEPARATOR,
 228     LINE_SEPARATOR,
 229     PARAGRAPH_SEPARATOR,
 230     CONTROL,
 231     FORMAT,
 232     PRIVATE_USE,
 233     SURROGATE,
 234     CONNECTOR_PUNCTUATION,
 235     DASH_PUNCTUATION,
 236     OPEN_PUNCTUATION,
 237     CLOSE_PUNCTUATION,
 238     INITIAL_QUOTE_PUNCTUATION,
 239     FINAL_QUOTE_PUNCTUATION,
 240     OTHER_PUNCTUATION,
 241     MATH_SYMBOL,
 242     CURRENCY_SYMBOL,
 243     MODIFIER_SYMBOL,
 244     OTHER_SYMBOL
 245 } category;
 246
 247 namespace Internal {
    /** @private @internal Extract the information about a character from the
     *  Unicode character tables.
     *
     *  Characters outside of the Unicode range (i.e. ch >= 0x110000) are
     *  treated as UNASSIGNED with no case variants.
     *
     *  The returned value packs several fields, which the helpers in this
     *  namespace decode:
     *   - bits 0-4: the category (see get_category())
     *   - bits 5-7: the case type (see get_case_type())
     *   - bits 8 and up: the signed case conversion delta (see get_delta())
     *
     *  @param ch   The Unicode character value to look up.
     *
     *  @return     The packed info for @a ch.
     */
    XAPIAN_VISIBILITY_DEFAULT
    int XAPIAN_NOTHROW(get_character_info(unsigned ch)) XAPIAN_CONST_FUNCTION;
 256
 257     /** @private @internal Extract how to convert the case of a Unicode
 258      *  character from its info.
 259      */
 260     inline int get_case_type(int info) { return ((info & 0xe0) >> 5); }
 261
 262     /** @private @internal Extract the category of a Unicode character from its
 263      *  info.
 264      */
 265     inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
 266
 267     /** @private @internal Extract the delta to use for case conversion of a
 268      *  character from its info.
 269      */
 270     inline int get_delta(int info) {
 271         /* It's implementation defined if sign extension happens when right
 272          * shifting a signed int, although in practice sign extension is what
 273          * most compilers implement.
 274          *
 275          * Some compilers are smart enough to spot common idioms for sign
 276          * extension, but not all (e.g. GCC < 7 doesn't spot the one used in
 277          * the else below), so check what the implementation defined behaviour
 278          * is with a constant conditional which should get optimised away.
 279          */
 280         if ((-1 >> 1) == -1) {
 281             // Right shift sign-extends.
 282             return info >> 8;
 283         } else {
 284             // Right shift shifts in zeros, not before and after the shift for
 285             // negative values.
 286             return (info >= 0) ? (info >> 8) : (~(~info >> 8));
 287         }
 288     }
 289 }
 290
/** Convert a single non-ASCII Unicode character to UTF-8.
 *
 *  This is intended mainly as a helper method for to_utf8(), which handles
 *  characters below 128 itself and passes everything else to this function.
 *
 *  @param ch   The character (which must be >= 128) to write to @a buf.
 *  @param buf  The buffer to write the character to - it must have
 *              space for (at least) 4 bytes.
 *
 *  @return     The length of the resultant UTF-8 character in bytes.
 */
XAPIAN_VISIBILITY_DEFAULT
unsigned nonascii_to_utf8(unsigned ch, char * buf);
 303
 304 /** Convert a single Unicode character to UTF-8.
 305  *
 306  *  @param ch   The character to write to @a buf.
 307  *  @param buf  The buffer to write the character to - it must have
 308  *              space for (at least) 4 bytes.
 309  *
 310  *  @return     The length of the resultant UTF-8 character in bytes.
 311  */
 312 inline unsigned to_utf8(unsigned ch, char *buf) {
 313     if (ch < 128) {
 314         *buf = static_cast<unsigned char>(ch);
 315         return 1;
 316     }
 317     return Xapian::Unicode::nonascii_to_utf8(ch, buf);
 318 }
 319
 320 /** Append the UTF-8 representation of a single Unicode character to a
 321  *  std::string.
 322  */
 323 inline void append_utf8(std::string &s, unsigned ch) {
 324     char buf[4];
 325     s.append(buf, to_utf8(ch, buf));
 326 }
 327
 328 /// Return the category which a given Unicode character falls into.
 329 inline category get_category(unsigned ch) {
 330     return Internal::get_category(Internal::get_character_info(ch));
 331 }
 332
 333 /// Test if a given Unicode character is "word character".
 334 inline bool is_wordchar(unsigned ch) {
 335     const unsigned int WORDCHAR_MASK =
 336             (1 << Xapian::Unicode::UPPERCASE_LETTER) |
 337             (1 << Xapian::Unicode::LOWERCASE_LETTER) |
 338             (1 << Xapian::Unicode::TITLECASE_LETTER) |
 339             (1 << Xapian::Unicode::MODIFIER_LETTER) |
 340             (1 << Xapian::Unicode::OTHER_LETTER) |
 341             (1 << Xapian::Unicode::NON_SPACING_MARK) |
 342             (1 << Xapian::Unicode::ENCLOSING_MARK) |
 343             (1 << Xapian::Unicode::COMBINING_SPACING_MARK) |
 344             (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
 345             (1 << Xapian::Unicode::LETTER_NUMBER) |
 346             (1 << Xapian::Unicode::OTHER_NUMBER) |
 347             (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
 348     return ((WORDCHAR_MASK >> get_category(ch)) & 1);
 349 }
 350
 351 /// Test if a given Unicode character is a whitespace character.
 352 inline bool is_whitespace(unsigned ch) {
 353     const unsigned int WHITESPACE_MASK =
 354             (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
 355             (1 << Xapian::Unicode::SPACE_SEPARATOR) |
 356             (1 << Xapian::Unicode::LINE_SEPARATOR) |
 357             (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
 358     return ((WHITESPACE_MASK >> get_category(ch)) & 1);
 359 }
 360
 361 /// Test if a given Unicode character is a currency symbol.
 362 inline bool is_currency(unsigned ch) {
 363     return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
 364 }
 365
 366 /// Convert a Unicode character to lowercase.
 367 inline unsigned tolower(unsigned ch) {
 368     int info = Xapian::Unicode::Internal::get_character_info(ch);
 369     if (!(Internal::get_case_type(info) & 2))
 370         return ch;
 371     return ch + Internal::get_delta(info);
 372 }
 373
 374 /// Convert a Unicode character to uppercase.
 375 inline unsigned toupper(unsigned ch) {
 376     int info = Xapian::Unicode::Internal::get_character_info(ch);
 377     if (!(Internal::get_case_type(info) & 4))
 378         return ch;
 379     return ch - Internal::get_delta(info);
 380 }
 381
 382 /// Convert a UTF-8 std::string to lowercase.
 383 inline std::string
 384 tolower(const std::string &term)
 385 {
 386     std::string result;
 387     result.reserve(term.size());
 388     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
 389         append_utf8(result, tolower(*i));
 390     }
 391     return result;
 392 }
 393
 394 /// Convert a UTF-8 std::string to uppercase.
 395 inline std::string
 396 toupper(const std::string &term)
 397 {
 398     std::string result;
 399     result.reserve(term.size());
 400     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
 401         append_utf8(result, toupper(*i));
 402     }
 403     return result;
 404 }
 405
 406 }
 407
 408 }
 409
 410 #endif // XAPIAN_INCLUDED_UNICODE_H