contrib/nvi2/common/encoding.c

   1 /*-
   2  * Copyright (c) 2011, 2012
   3  *      Zhihao Yuan.  All rights reserved.
   4  *
   5  * See the LICENSE file for redistribution information.
   6  */
   7
   8 #include <sys/types.h>
   9
  10 int looks_utf8(const char *, size_t);
  11 int looks_utf16(const char *, size_t);
  12 int decode_utf8(const char *);
  13 int decode_utf16(const char *, int);
  14
  15 #define F 0   /* character never appears in text */
  16 #define T 1   /* character appears in plain ASCII text */
  17 #define I 2   /* character appears in ISO-8859 text */
  18 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
  19
  20 static char text_chars[256] = {
  21         /*                  BEL BS HT LF    FF CR    */
  22         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
  23         /*                              ESC          */
  24         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
  25         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
  26         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
  27         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
  28         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
  29         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
  30         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
  31         /*            NEL                            */
  32         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
  33         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
  34         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
  35         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
  36         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
  37         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
  38         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
  39         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
  40 };
  41
  42 /*
  43  * looks_utf8 --
  44  *  Decide whether some text looks like UTF-8. Returns:
  45  *
  46  *     -1: invalid UTF-8
  47  *      0: uses odd control characters, so doesn't look like text
  48  *      1: 7-bit text
  49  *      2: definitely UTF-8 text (valid high-bit set bytes)
  50  *
  51  *  Based on RFC 3629. UTF-8 with BOM is not accepted.
  52  *
  53  * PUBLIC: int looks_utf8(const char *, size_t);
  54  */
  55 int
  56 looks_utf8(const char *ibuf, size_t nbytes)
  57 {
  58         const u_char *buf = (u_char *)ibuf;
  59         size_t i;
  60         int n;
  61         int gotone = 0, ctrl = 0;
  62
  63         for (i = 0; i < nbytes; i++) {
  64                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
  65                         /*
  66                          * Even if the whole file is valid UTF-8 sequences,
  67                          * still reject it if it uses weird control characters.
  68                          */
  69
  70                         if (text_chars[buf[i]] != T)
  71                                 ctrl = 1;
  72                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
  73                         return -1;
  74                 } else {                           /* 11xxxxxx begins UTF-8 */
  75                         int following;
  76
  77                         if ((buf[i] & 0x20) == 0)       /* 110xxxxx */
  78                                 if (buf[i] > 0xC1)      /* C0, C1 */
  79                                         following = 1;
  80                                 else return -1;
  81                         else if ((buf[i] & 0x10) == 0)  /* 1110xxxx */
  82                                 following = 2;
  83                         else if ((buf[i] & 0x08) == 0)  /* 11110xxx */
  84                                 if (buf[i] < 0xF5)
  85                                         following = 3;
  86                                 else return -1;         /* F5, F6, F7 */
  87                         else
  88                                 return -1;              /* F8~FF */
  89
  90                         for (n = 0; n < following; n++) {
  91                                 i++;
  92                                 if (i >= nbytes)
  93                                         goto done;
  94
  95                                 if ((buf[i] & 0xc0) != 0x80)    /* 10xxxxxx */
  96                                         return -1;
  97                         }
  98
  99                         gotone = 1;
 100                 }
 101         }
 102 done:
 103         return ctrl ? 0 : (gotone ? 2 : 1);
 104 }
 105
 106 /*
 107  * looks_utf16 --
 108  *  Decide whether some text looks like UTF-16. Returns:
 109  *
 110  *      0: invalid UTF-16
 111  *      1: Little-endian UTF-16
 112  *      2: Big-endian UTF-16
 113  *
 114  * PUBLIC: int looks_utf16(const char *, size_t);
 115  */
 116 int
 117 looks_utf16(const char *ibuf, size_t nbytes)
 118 {
 119         const u_char *buf = (u_char *)ibuf;
 120         int bigend;
 121         size_t i;
 122         unsigned int c;
 123         int bom;
 124         int following = 0;
 125
 126         if (nbytes < 2)
 127                 return 0;
 128
 129         bom = buf[0] << 8 ^ buf[1];
 130         if (bom == 0xFFFE)
 131                 bigend = 0;
 132         else if (bom == 0xFEFF)
 133                 bigend = 1;
 134         else
 135                 return 0;
 136
 137         for (i = 2; i + 1 < nbytes; i += 2) {
 138                 if (bigend)
 139                         c = buf[i] << 8 ^ buf[i + 1];
 140                 else
 141                         c = buf[i] ^ buf[i + 1] << 8;
 142
 143                 if (!following)
 144                         if (c < 0xD800 || c > 0xDFFF)
 145                                 if (c < 128 && text_chars[c] != T)
 146                                         return 0;
 147                                 else
 148                                         following = 0;
 149                         else if (c > 0xDBFF)
 150                                 return 0;
 151                         else {
 152                                 following = 1;
 153                                 continue;
 154                         }
 155                 else if (c < 0xDC00 || c > 0xDFFF)
 156                         return 0;
 157         }
 158
 159         return 1 + bigend;
 160 }
 161
 162 #undef F
 163 #undef T
 164 #undef I
 165 #undef X
 166
 167 /*
 168  * decode_utf8 --
 169  *  Decode a UTF-8 character from byte string to Unicode.
 170  *  Returns -1 if the first byte is a not UTF-8 leader.
 171  *
 172  *  Based on RFC 3629, but without error detection.
 173  *
 174  * PUBLIC: int decode_utf8(const char *);
 175  */
 176 int
 177 decode_utf8(const char *ibuf)
 178 {
 179         const u_char *buf = (u_char *)ibuf;
 180         int u = -1;
 181
 182         if ((buf[0] & 0x80) == 0)
 183                 u = buf[0];
 184         else if ((buf[0] & 0x40) == 0);
 185         else {
 186                 if ((buf[0] & 0x20) == 0)
 187                         u = (buf[0] ^ 0xC0) <<  6 ^ (buf[1] ^ 0x80);
 188                 else if ((buf[0] & 0x10) == 0)
 189                         u = (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) <<  6
 190                           ^ (buf[2] ^ 0x80);
 191                 else if (((buf[0] & 0x08) == 0))
 192                         u = (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
 193                           ^ (buf[2] ^ 0x80) <<  6 ^ (buf[3] ^ 0x80);
 194         }
 195
 196         return u;
 197 }
 198
 199 /*
 200  * decode_utf16 --
 201  *  Decode a UTF-16 character from byte string to Unicode.
 202  *  Returns -1 if the first unsigned integer is invalid.
 203  *
 204  *  No error detection on supplementary bytes.
 205  *
 206  * PUBLIC: int decode_utf16(const char *, int);
 207  */
 208 int
 209 decode_utf16(const char* ibuf, int bigend)
 210 {
 211         const u_char *buf = (u_char *)ibuf;
 212         int u = -1;
 213         unsigned int w1, w2;
 214
 215         if (bigend)
 216                 w1 = buf[0] << 8 ^ buf[1];
 217         else
 218                 w1 = buf[0] ^ buf[1] << 8;
 219
 220         if (w1 < 0xD800 || w1 > 0xDFFF)
 221                 u = w1;
 222         else if (w1 > 0xDBFF);
 223         else {
 224                 if (bigend)
 225                         w2 = buf[2] << 8 ^ buf[3];
 226                 else
 227                         w2 = buf[2] ^ buf[3] << 8;
 228                 u = ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
 229         }
 230
 231         return u;
 232 }