util/unicode.c

   1 /*
   2  * Dealing with Unicode
   3  *
   4  * Copyright (C) 2013 Red Hat, Inc.
   5  *
   6  * Authors:
   7  *  Markus Armbruster <armbru@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2 or
  10  * later.  See the COPYING file in the top-level directory.
  11  */
  12
  13 #include "qemu/osdep.h"
  14 #include "qemu/unicode.h"
  15
  16 static bool is_valid_codepoint(int codepoint)
  17 {
  18     if (codepoint > 0x10FFFFu) {
  19         return false;            /* beyond Unicode range */
  20     }
  21     if ((codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
  22         || (codepoint & 0xFFFE) == 0xFFFE) {
  23         return false;            /* noncharacter */
  24     }
  25     if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
  26         return false;            /* surrogate code point */
  27     }
  28     return true;
  29 }
  30
  31 /**
  32  * mod_utf8_codepoint:
  33  * @s: string encoded in modified UTF-8
  34  * @n: maximum number of bytes to read from @s, if less than 6
  35  * @end: set to end of sequence on return
  36  *
  37  * Convert the modified UTF-8 sequence at the start of @s.  Modified
  38  * UTF-8 is exactly like UTF-8, except U+0000 is encoded as
  39  * "\xC0\x80".
  40  *
  41  * If @n is zero or @s points to a zero byte, the sequence is invalid,
  42  * and @end is set to @s.
  43  *
  44  * If @s points to an impossible byte (0xFE or 0xFF) or a continuation
  45  * byte, the sequence is invalid, and @end is set to @s + 1
  46  *
  47  * Else, the first byte determines how many continuation bytes are
  48  * expected.  If there are fewer, the sequence is invalid, and @end is
  49  * set to @s + 1 + actual number of continuation bytes.  Else, the
  50  * sequence is well-formed, and @end is set to @s + 1 + expected
  51  * number of continuation bytes.
  52  *
  53  * A well-formed sequence is valid unless it encodes a codepoint
  54  * outside the Unicode range U+0000..U+10FFFF, one of Unicode's 66
  55  * noncharacters, a surrogate codepoint, or is overlong.  Except the
  56  * overlong sequence "\xC0\x80" is valid.
  57  *
  58  * Conversion succeeds if and only if the sequence is valid.
  59  *
  60  * Returns: the Unicode codepoint on success, -1 on failure.
  61  */
  62 int mod_utf8_codepoint(const char *s, size_t n, char **end)
  63 {
  64     static int min_cp[5] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
  65     const unsigned char *p;
  66     unsigned byte, mask, len, i;
  67     int cp;
  68
  69     if (n == 0 || *s == 0) {
  70         /* empty sequence */
  71         *end = (char *)s;
  72         return -1;
  73     }
  74
  75     p = (const unsigned char *)s;
  76     byte = *p++;
  77     if (byte < 0x80) {
  78         cp = byte;              /* one byte sequence */
  79     } else if (byte >= 0xFE) {
  80         cp = -1;                /* impossible bytes 0xFE, 0xFF */
  81     } else if ((byte & 0x40) == 0) {
  82         cp = -1;                /* unexpected continuation byte */
  83     } else {
  84         /* multi-byte sequence */
  85         len = 0;
  86         for (mask = 0x80; byte & mask; mask >>= 1) {
  87             len++;
  88         }
  89         assert(len > 1 && len < 7);
  90         cp = byte & (mask - 1);
  91         for (i = 1; i < len; i++) {
  92             byte = i < n ? *p : 0;
  93             if ((byte & 0xC0) != 0x80) {
  94                 cp = -1;        /* continuation byte missing */
  95                 goto out;
  96             }
  97             p++;
  98             cp <<= 6;
  99             cp |= byte & 0x3F;
 100         }
 101         if (!is_valid_codepoint(cp)) {
 102             cp = -1;
 103         } else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) {
 104             cp = -1;            /* overlong, not \xC0\x80 */
 105         }
 106     }
 107
 108 out:
 109     *end = (char *)p;
 110     return cp;
 111 }
 112
 113 /**
 114  * mod_utf8_encode:
 115  * @buf: Destination buffer
 116  * @bufsz: size of @buf, at least 5.
 117  * @codepoint: Unicode codepoint to encode
 118  *
 119  * Convert Unicode codepoint @codepoint to modified UTF-8.
 120  *
 121  * Returns: the length of the UTF-8 sequence on success, -1 when
 122  * @codepoint is invalid.
 123  */
 124 ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
 125 {
 126     assert(bufsz >= 5);
 127
 128     if (!is_valid_codepoint(codepoint)) {
 129         return -1;
 130     }
 131
 132     if (codepoint > 0 && codepoint <= 0x7F) {
 133         buf[0] = codepoint & 0x7F;
 134         buf[1] = 0;
 135         return 1;
 136     }
 137     if (codepoint <= 0x7FF) {
 138         buf[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
 139         buf[1] = 0x80 | (codepoint & 0x3F);
 140         buf[2] = 0;
 141         return 2;
 142     }
 143     if (codepoint <= 0xFFFF) {
 144         buf[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
 145         buf[1] = 0x80 | ((codepoint >> 6) & 0x3F);
 146         buf[2] = 0x80 | (codepoint & 0x3F);
 147         buf[3] = 0;
 148         return 3;
 149     }
 150     buf[0] = 0xF0 | ((codepoint >> 18) & 0x07);
 151     buf[1] = 0x80 | ((codepoint >> 12) & 0x3F);
 152     buf[2] = 0x80 | ((codepoint >> 6) & 0x3F);
 153     buf[3] = 0x80 | (codepoint & 0x3F);
 154     buf[4] = 0;
 155     return 4;
 156 }