dep/include/utf8cpp/utf8/core.h

   1 // Copyright 2006 Nemanja Trifunovic
   2
   3 /*
   4 Permission is hereby granted, free of charge, to any person or organization
   5 obtaining a copy of the software and accompanying documentation covered by
   6 this license (the "Software") to use, reproduce, display, distribute,
   7 execute, and transmit the Software, and to prepare derivative works of the
   8 Software, and to permit third-parties to whom the Software is furnished to
   9 do so, all subject to the following:
  10
  11 The copyright notices in the Software and this entire statement, including
  12 the above license grant, this restriction and the following disclaimer,
  13 must be included in all copies of the Software, in whole or in part, and
  14 all derivative works of the Software, unless such copies or derivative
  15 works are solely in the form of machine-executable object code generated by
  16 a source language processor.
  17
  18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
  21 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
  22 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
  23 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 DEALINGS IN THE SOFTWARE.
  25 */
  26
  27
  28 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  29 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  30
  31 #include <iterator>
  32
  33 namespace utf8
  34 {
  35     // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
  36     // You may need to change them to match your system.
  37     // These typedefs have the same names as ones from cstdint, or boost/cstdint
  38     typedef unsigned char   uint8_t;
  39     typedef unsigned short  uint16_t;
  40     typedef unsigned int    uint32_t;
  41
  42 // Helper code - not intended to be directly called by the library users. May be changed at any time
  43 namespace internal
  44 {
  45     // Unicode constants
  46     // Leading (high) surrogates: 0xd800 - 0xdbff
  47     // Trailing (low) surrogates: 0xdc00 - 0xdfff
  48     const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
  49     const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
  50     const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
  51     const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
  52     const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
  53     const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
  54
  55     // Maximum valid value for a Unicode code point
  56     const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
  57
  58     template<typename octet_type>
  59     inline uint8_t mask8(octet_type oc)
  60     {
  61         return static_cast<uint8_t>(0xff & oc);
  62     }
  63     template<typename u16_type>
  64     inline uint16_t mask16(u16_type oc)
  65     {
  66         return static_cast<uint16_t>(0xffff & oc);
  67     }
  68     template<typename octet_type>
  69     inline bool is_trail(octet_type oc)
  70     {
  71         return ((mask8(oc) >> 6) == 0x2);
  72     }
  73
  74     template <typename u16>
  75     inline bool is_lead_surrogate(u16 cp)
  76     {
  77         return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
  78     }
  79
  80     template <typename u16>
  81     inline bool is_trail_surrogate(u16 cp)
  82     {
  83         return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
  84     }
  85
  86     template <typename u16>
  87     inline bool is_surrogate(u16 cp)
  88     {
  89         return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
  90     }
  91
  92     template <typename u32>
  93     inline bool is_code_point_valid(u32 cp)
  94     {
  95         return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
  96     }
  97
  98     template <typename octet_iterator>
  99     inline typename std::iterator_traits<octet_iterator>::difference_type
 100     sequence_length(octet_iterator lead_it)
 101     {
 102         uint8_t lead = mask8(*lead_it);
 103         if (lead < 0x80)
 104             return 1;
 105         else if ((lead >> 5) == 0x6)
 106             return 2;
 107         else if ((lead >> 4) == 0xe)
 108             return 3;
 109         else if ((lead >> 3) == 0x1e)
 110             return 4;
 111         else
 112             return 0;
 113     }
 114
 115     inline bool is_overlong_sequence(uint32_t cp, int length)
 116     {
 117         if (cp < 0x80) {
 118             if (length != 1)
 119                 return true;
 120         }
 121         else if (cp < 0x800) {
 122             if (length != 2)
 123                 return true;
 124         }
 125         else if (cp < 0x10000) {
 126             if (length != 3)
 127                 return true;
 128         }
 129
 130         return false;
 131     }
 132
 133     enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
 134
 135     /// get_sequence_x functions decode utf-8 sequences of the length x
 136
 137     template <typename octet_iterator>
 138     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
 139     {
 140         if (it != end) {
 141             if (code_point)
 142                 *code_point = mask8(*it);
 143             return UTF8_OK;
 144         }
 145         return NOT_ENOUGH_ROOM;
 146     }
 147
 148     template <typename octet_iterator>
 149     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
 150     {
 151         utf_error ret_code = NOT_ENOUGH_ROOM;
 152
 153         if (it != end) {
 154             uint32_t cp = mask8(*it);
 155             if (++it != end) {
 156                 if (is_trail(*it)) {
 157                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
 158
 159                     if (code_point)
 160                         *code_point = cp;
 161                     ret_code = UTF8_OK;
 162                 }
 163                 else
 164                     ret_code = INCOMPLETE_SEQUENCE;
 165             }
 166             else
 167                 ret_code = NOT_ENOUGH_ROOM;
 168         }
 169
 170         return ret_code;
 171     }
 172
 173     template <typename octet_iterator>
 174     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
 175     {
 176         utf_error ret_code = NOT_ENOUGH_ROOM;
 177
 178         if (it != end) {
 179             uint32_t cp = mask8(*it);
 180             if (++it != end) {
 181                 if (is_trail(*it)) {
 182                     cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
 183                     if (++it != end) {
 184                         if (is_trail(*it)) {
 185                             cp += (*it) & 0x3f;
 186
 187                             if (code_point)
 188                                 *code_point = cp;
 189                             ret_code = UTF8_OK;
 190                         }
 191                         else
 192                             ret_code = INCOMPLETE_SEQUENCE;
 193                     }
 194                     else
 195                         ret_code = NOT_ENOUGH_ROOM;
 196                 }
 197                 else
 198                     ret_code = INCOMPLETE_SEQUENCE;
 199             }
 200             else
 201                 ret_code = NOT_ENOUGH_ROOM;
 202         }
 203
 204         return ret_code;
 205     }
 206
 207     template <typename octet_iterator>
 208     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
 209     {
 210         utf_error ret_code = NOT_ENOUGH_ROOM;
 211
 212         if (it != end) {
 213             uint32_t cp = mask8(*it);
 214             if (++it != end) {
 215                 if (is_trail(*it)) {
 216                     cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
 217                     if (++it != end) {
 218                         if (is_trail(*it)) {
 219                             cp += (mask8(*it) << 6) & 0xfff;
 220                             if (++it != end) {
 221                                 if (is_trail(*it)) {
 222                                     cp += (*it) & 0x3f;
 223
 224                                     if (code_point)
 225                                         *code_point = cp;
 226                                     ret_code = UTF8_OK;
 227                                 }
 228                                 else
 229                                     ret_code = INCOMPLETE_SEQUENCE;
 230                             }
 231                             else
 232                                 ret_code = NOT_ENOUGH_ROOM;
 233                         }
 234                         else
 235                             ret_code = INCOMPLETE_SEQUENCE;
 236                     }
 237                     else
 238                         ret_code = NOT_ENOUGH_ROOM;
 239                 }
 240                 else
 241                     ret_code = INCOMPLETE_SEQUENCE;
 242             }
 243             else
 244                 ret_code = NOT_ENOUGH_ROOM;
 245         }
 246
 247         return ret_code;
 248     }
 249
 250     template <typename octet_iterator>
 251     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
 252     {
 253         // Save the original value of it so we can go back in case of failure
 254         // Of course, it does not make much sense with i.e. stream iterators
 255         octet_iterator original_it = it;
 256
 257         uint32_t cp = 0;
 258         // Determine the sequence length based on the lead octet
 259         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
 260         octet_difference_type length = sequence_length(it);
 261         if (length == 0)
 262             return INVALID_LEAD;
 263
 264         // Now that we have a valid sequence length, get trail octets and calculate the code point
 265         utf_error err = UTF8_OK;
 266         switch (length) {
 267             case 1:
 268                 err = get_sequence_1(it, end, &cp);
 269                 break;
 270             case 2:
 271                 err = get_sequence_2(it, end, &cp);
 272             break;
 273             case 3:
 274                 err = get_sequence_3(it, end, &cp);
 275             break;
 276             case 4:
 277                 err = get_sequence_4(it, end, &cp);
 278             break;
 279         }
 280
 281         if (err == UTF8_OK) {
 282             // Decoding succeeded. Now, security checks...
 283             if (is_code_point_valid(cp)) {
 284                 if (!is_overlong_sequence(cp, length)){
 285                     // Passed! Return here.
 286                     if (code_point)
 287                         *code_point = cp;
 288                     ++it;
 289                     return UTF8_OK;
 290                 }
 291                 else
 292                     err = OVERLONG_SEQUENCE;
 293             }
 294             else
 295                 err = INVALID_CODE_POINT;
 296         }
 297
 298         // Failure branch - restore the original value of the iterator
 299         it = original_it;
 300         return err;
 301     }
 302
 303     template <typename octet_iterator>
 304     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
 305         return validate_next(it, end, 0);
 306     }
 307
 308 } // namespace internal
 309
 310     /// The library API - functions intended to be called by the users
 311
 312     // Byte order mark
 313     const uint8_t bom[] = {0xef, 0xbb, 0xbf};
 314
 315     template <typename octet_iterator>
 316     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
 317     {
 318         octet_iterator result = start;
 319         while (result != end) {
 320             internal::utf_error err_code = internal::validate_next(result, end);
 321             if (err_code != internal::UTF8_OK)
 322                 return result;
 323         }
 324         return result;
 325     }
 326
 327     template <typename octet_iterator>
 328     inline bool is_valid(octet_iterator start, octet_iterator end)
 329     {
 330         return (find_invalid(start, end) == end);
 331     }
 332
 333     template <typename octet_iterator>
 334     inline bool is_bom (octet_iterator it)
 335     {
 336         return (
 337             (internal::mask8(*it++)) == bom[0] &&
 338             (internal::mask8(*it++)) == bom[1] &&
 339             (internal::mask8(*it))   == bom[2]
 340            );
 341     }
 342 } // namespace utf8
 343
 344 #endif // header guard
 345
 346