src/utf8.c

   1 ////////////////////////////////////////////////////////////////////////////////
   2 // UTF-8
   3
   4 static const unsigned char utf8Length[256] = {
   5   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x00-0x0f
   6   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x10-0x1f
   7   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x20-0x2f
   8   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x30-0x3f
   9   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x40-0x4f
  10   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x50-0x5f
  11   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x60-0x6f
  12   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x70-0x7f
  13   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x80-0x8f
  14   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x90-0x9f
  15   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xa0-0xaf
  16   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xb0-0xbf
  17   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xc0-0xcf  c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127
  18   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xd0-0xdf
  19   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, //0xe0-0xef
  20   4,4,4,4,4,8,8,8,8,8,8,8,8,8,8,8  //0xf0-0xff
  21 };
  22
  23
  24 // decode one utf-8 char from *buf to *u, return char length; return '?' on error
  25 static int k8t_UTF8Decode (uint32_t *u, const void *buf) {
  26   const unsigned char *data = (const unsigned char *)buf;
  27   unsigned char len = utf8Length[*data];
  28   uint32_t uc;
  29   //
  30   switch (len) {
  31     case 0: // ascii
  32       *u = *data;
  33       return 1;
  34     case 8: case 9: // invalid
  35       *u = '?';
  36       return 1;
  37   }
  38   // utf-8
  39   uc = (*data++)&(0x7c>>len);
  40   while (--len) {
  41     if (utf8Length[*data] != 9) { uc = 0xffff; break; }
  42     uc = (uc<<6)|((*data++)&0x3f);
  43   }
  44   if (uc > 0x10ffff) uc &= 0x1fffff;
  45   if ((uc >= 0xd800 && uc <= 0xdfff) || // utf16/utf32 surrogates
  46       (uc >= 0xfdd0 && uc <= 0xfdef) || // just for fun
  47       (uc >= 0xfffe && uc <= 0xffff)) uc = '?'; // bad unicode
  48   *u = uc;
  49   return data-((const unsigned char *)buf);
  50 }
  51
  52
  53 // encode one utf-8 char from u to *buf, return char length
  54 static int k8t_UTF8Encode (void *buf, uint32_t uc) {
  55   uint8_t *sp = (uint8_t *)buf;
  56   int n;
  57   //
  58   if (uc < 0x80) { *sp = uc; return 1; } /* 0xxxxxxx */
  59   uc &= 0x1fffff;
  60   if (uc < 0x800) {
  61     /* 110xxxxx */
  62     *sp++ = (uc>>6)|0xc0;
  63     n = 1;
  64   } else if (uc < 0x10000) {
  65     /* 1110xxxx */
  66     *sp++ = (uc>>12)|0xe0;
  67     n = 2;
  68   } else if (uc <= 0x10FFFF) {
  69     /* 11110xxx */
  70     *sp++ = (uc>>18)|0xf0;
  71     n = 3;
  72   } else {
  73     /* U+FFFD */
  74     memcpy(sp, "\xEF\xBF\xBD", 3);
  75     return 3;
  76   }
  77   for (int f = n; f > 0; --f) *sp++ = ((uc>>(6*(f-1)))&0x3f)|0x80; /* 10xxxxxx */
  78   return n+1;
  79 }
  80
  81
  82 /* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode
  83    UTF-8 otherwise return 0 */
  84 static int k8t_UTF8IsFull (const void *buf, int buflen) {
  85   if (buflen > 0) {
  86     const unsigned char *data = (const unsigned char *)buf;
  87     unsigned char len = utf8Length[*data++];
  88     int res;
  89     //
  90     switch (len) {
  91       case 0: case 8: case 9: return 1;
  92     }
  93     if ((res = (buflen >= len))) buflen = len;
  94     for (int f = buflen-1; f > 0; --f) if (((*data++)&0xc0) != 0x80) return 1;
  95     return res;
  96   }
  97   return 0;
  98 }
  99
 100
 101 static int k8t_UTF8Size (const void *buf) {
 102   const unsigned char *data = (const unsigned char *)buf;
 103   unsigned char len = utf8Length[*data];
 104   //
 105   switch (len) {
 106     case 0: return 1;
 107     case 8: case 9: return 0;
 108   }
 109   return len;
 110 }
 111
 112
 113 static int k8t_UTF8strlen (const char *s) {
 114   int len = 0;
 115   //
 116   while (*s) {
 117     if (((unsigned char)(s[0])&0xc0) == 0xc0 || ((unsigned char)(s[0])&0x80) == 0) ++len;
 118     ++s;
 119   }
 120   return len;
 121 }
 122
 123
 124 static void k8t_UTF8ChopLast (char *s) {
 125   int lastpos = 0;
 126   //
 127   for (char *t = s; *t; ++t) {
 128     if (((unsigned char)(t[0])&0xc0) == 0xc0 || ((unsigned char)(t[0])&0x80) == 0) lastpos = (int)(t-s);
 129   }
 130   s[lastpos] = 0;
 131 }