uchar.c

   1 /*
   2  * Copyright 2004-2005 Timo Hirvonen
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public License as
   6  * published by the Free Software Foundation; either version 2 of the
   7  * License, or (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  17  * 02111-1307, USA.
  18  */
  19
  20 #include "uchar.h"
  21 #include "compiler.h"
  22
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <wctype.h>
  26 #include <ctype.h>
  27
  28 const char hex_tab[16] = "0123456789abcdef";
  29
  30 /*
  31  * Byte Sequence                                             Min       Min        Max
  32  * ----------------------------------------------------------------------------------
  33  * 0xxxxxxx                                              0000000   0x00000   0x00007f
  34  * 110xxxxx 10xxxxxx                                000 10000000   0x00080   0x0007ff
  35  * 1110xxxx 10xxxxxx 10xxxxxx                  00001000 00000000   0x00800   0x00ffff
  36  * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   00001 00000000 00000000   0x10000   0x10ffff (not 0x1fffff)
  37  *
  38  * max: 100   001111   111111   111111  (0x10ffff)
  39  */
  40
  41 /* Length of UTF-8 byte sequence.
  42  * Table index is the first byte of UTF-8 sequence.
  43  */
  44 static const signed char len_tab[256] = {
  45         /*   0-127  0xxxxxxx */
  46         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  47         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  48         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  49         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  50         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  51         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54
  55         /* 128-191  10xxxxxx (invalid first byte) */
  56         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  57         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  58         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  59         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  60
  61         /* 192-223  110xxxxx */
  62         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  63         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  64
  65         /* 224-239  1110xxxx */
  66         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  67
  68         /* 240-244  11110xxx (000 - 100) */
  69         4, 4, 4, 4, 4,
  70
  71         /* 11110xxx (101 - 111) (always invalid) */
  72         -1, -1, -1,
  73
  74         /* 11111xxx (always invalid) */
  75         -1, -1, -1, -1, -1, -1, -1, -1
  76 };
  77
  78 /* index is length of the UTF-8 sequence - 1 */
  79 static int min_val[4] = { 0x000000, 0x000080, 0x000800, 0x010000 };
  80 static int max_val[4] = { 0x00007f, 0x0007ff, 0x00ffff, 0x10ffff };
  81
  82 /* get value bits from the first UTF-8 sequence byte */
  83 static unsigned int first_byte_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
  84
  85 int u_is_valid(const char *str)
  86 {
  87         const unsigned char *s = (const unsigned char *)str;
  88         int i = 0;
  89
  90         while (s[i]) {
  91                 unsigned char ch = s[i++];
  92                 int len = len_tab[ch];
  93
  94                 if (len <= 0)
  95                         return 0;
  96
  97                 if (len > 1) {
  98                         /* len - 1 10xxxxxx bytes */
  99                         uchar u;
 100                         int c;
 101
 102                         len--;
 103                         u = ch & first_byte_mask[len];
 104                         c = len;
 105                         do {
 106                                 ch = s[i++];
 107                                 if (len_tab[ch] != 0)
 108                                         return 0;
 109                                 u = (u << 6) | (ch & 0x3f);
 110                         } while (--c);
 111
 112                         if (u < min_val[len] || u > max_val[len])
 113                                 return 0;
 114                 }
 115         }
 116         return 1;
 117 }
 118
 119 int u_strlen(const char *str)
 120 {
 121         const unsigned char *s = (const unsigned char *)str;
 122         int len = 0;
 123
 124         while (*s) {
 125                 int l = len_tab[*s];
 126
 127                 if (unlikely(l > 1)) {
 128                         /* next l - 1 bytes must be 0x10xxxxxx */
 129                         int c = 1;
 130                         do {
 131                                 if (len_tab[s[c]] != 0) {
 132                                         /* invalid sequence */
 133                                         goto single_char;
 134                                 }
 135                                 c++;
 136                         } while (c < l);
 137
 138                         /* valid sequence */
 139                         s += l;
 140                         len++;
 141                         continue;
 142                 }
 143 single_char:
 144                 /* l is -1, 0 or 1
 145                  * invalid chars counted as single characters */
 146                 s++;
 147                 len++;
 148         }
 149         return len;
 150 }
 151
 152 int u_char_width(uchar u)
 153 {
 154         if (unlikely(u < 0x20))
 155                 goto control;
 156
 157         if (u < 0x1100U)
 158                 goto narrow;
 159
 160         /* Hangul Jamo init. consonants */
 161         if (u <= 0x115fU)
 162                 goto wide;
 163
 164         /* angle brackets */
 165         if (u == 0x2329U || u == 0x232aU)
 166                 goto wide;
 167
 168         if (u < 0x2e80U)
 169                 goto narrow;
 170         /* CJK ... Yi */
 171         if (u < 0x302aU)
 172                 goto wide;
 173         if (u <= 0x302fU)
 174                 goto narrow;
 175         if (u == 0x303fU)
 176                 goto narrow;
 177         if (u == 0x3099U)
 178                 goto narrow;
 179         if (u == 0x309aU)
 180                 goto narrow;
 181         /* CJK ... Yi */
 182         if (u <= 0xa4cfU)
 183                 goto wide;
 184
 185         /* Hangul Syllables */
 186         if (u >= 0xac00U && u <= 0xd7a3U)
 187                 goto wide;
 188
 189         /* CJK Compatibility Ideographs */
 190         if (u >= 0xf900U && u <= 0xfaffU)
 191                 goto wide;
 192
 193         /* CJK Compatibility Forms */
 194         if (u >= 0xfe30U && u <= 0xfe6fU)
 195                 goto wide;
 196
 197         /* Fullwidth Forms */
 198         if (u >= 0xff00U && u <= 0xff60U)
 199                 goto wide;
 200
 201         /* Fullwidth Forms */
 202         if (u >= 0xffe0U && u <= 0xffe6U)
 203                 goto wide;
 204
 205         /* CJK extra stuff */
 206         if (u >= 0x20000U && u <= 0x2fffdU)
 207                 goto wide;
 208
 209         /* ? */
 210         if (u >= 0x30000U && u <= 0x3fffdU)
 211                 goto wide;
 212
 213         /* invalid bytes in unicode stream are rendered "<xx>" */
 214         if (u & U_INVALID_MASK)
 215                 goto invalid;
 216 narrow:
 217         return 1;
 218 wide:
 219         return 2;
 220 control:
 221         /* special case */
 222         if (u == 0)
 223                 return 1;
 224
 225         /* print control chars as <xx> */
 226 invalid:
 227         return 4;
 228 }
 229
 230 int u_str_width(const char *str)
 231 {
 232         int idx = 0, w = 0;
 233
 234         while (str[idx]) {
 235                 uchar u;
 236
 237                 u_get_char(str, &idx, &u);
 238                 w += u_char_width(u);
 239         }
 240         return w;
 241 }
 242
 243 int u_str_nwidth(const char *str, int len)
 244 {
 245         int idx = 0;
 246         int w = 0;
 247         uchar u;
 248
 249         while (len > 0) {
 250                 u_get_char(str, &idx, &u);
 251                 if (u == 0)
 252                         break;
 253                 w += u_char_width(u);
 254                 len--;
 255         }
 256         return w;
 257 }
 258
 259 void u_prev_char_pos(const char *str, int *idx)
 260 {
 261         const unsigned char *s = (const unsigned char *)str;
 262         int c, len, i = *idx;
 263         uchar ch;
 264
 265         ch = s[--i];
 266         len = len_tab[ch];
 267         if (len != 0) {
 268                 /* start of byte sequence or invelid uchar */
 269                 goto one;
 270         }
 271
 272         c = 1;
 273         while (1) {
 274                 if (i == 0) {
 275                         /* first byte of the sequence is missing */
 276                         break;
 277                 }
 278
 279                 ch = s[--i];
 280                 len = len_tab[ch];
 281                 c++;
 282
 283                 if (len == 0) {
 284                         if (c < 4)
 285                                 continue;
 286
 287                         /* too long sequence */
 288                         break;
 289                 }
 290                 if (len != c) {
 291                         /* incorrect length */
 292                         break;
 293                 }
 294
 295                 /* ok */
 296                 *idx = i;
 297                 return;
 298         }
 299 one:
 300         *idx = *idx - 1;
 301         return;
 302 }
 303
 304 void u_get_char(const char *str, int *idx, uchar *uch)
 305 {
 306         const unsigned char *s = (const unsigned char *)str;
 307         int len, i = *idx;
 308         uchar ch, u;
 309
 310         ch = s[i++];
 311         len = len_tab[ch];
 312         if (unlikely(len < 1))
 313                 goto invalid;
 314
 315         len--;
 316         u = ch & first_byte_mask[len];
 317         while (len > 0) {
 318                 ch = s[i++];
 319                 if (unlikely(len_tab[ch] != 0))
 320                         goto invalid;
 321                 u = (u << 6) | (ch & 0x3f);
 322                 len--;
 323         }
 324         *idx = i;
 325         *uch = u;
 326         return;
 327 invalid:
 328         i = *idx;
 329         u = s[i++];
 330         *uch = u | U_INVALID_MASK;
 331         *idx = i;
 332 }
 333
 334 void u_set_char_raw(char *str, int *idx, uchar uch)
 335 {
 336         int i = *idx;
 337
 338         if (uch <= 0x0000007fU) {
 339                 str[i++] = uch;
 340                 *idx = i;
 341         } else if (uch <= 0x000007ffU) {
 342                 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
 343                 str[i + 0] = uch | 0x000000c0U;
 344                 i += 2;
 345                 *idx = i;
 346         } else if (uch <= 0x0000ffffU) {
 347                 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
 348                 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
 349                 str[i + 0] = uch | 0x000000e0U;
 350                 i += 3;
 351                 *idx = i;
 352         } else if (uch <= 0x0010ffffU) {
 353                 str[i + 3] = (uch & 63) | 0x80; uch >>= 6;
 354                 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
 355                 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
 356                 str[i + 0] = uch | 0x000000f0U;
 357                 i += 4;
 358                 *idx = i;
 359         } else {
 360                 /* must be an invalid uchar */
 361                 str[i++] = uch & 0xff;
 362                 *idx = i;
 363         }
 364 }
 365
 366 /*
 367  * Printing functions, these lose information
 368  */
 369
 370 void u_set_char(char *str, int *idx, uchar uch)
 371 {
 372         int i = *idx;
 373
 374         if (unlikely(uch <= 0x0000001fU))
 375                 goto invalid;
 376
 377         if (uch <= 0x0000007fU) {
 378                 str[i++] = uch;
 379                 *idx = i;
 380                 return;
 381         } else if (uch <= 0x000007ffU) {
 382                 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
 383                 str[i + 0] = uch | 0x000000c0U;
 384                 i += 2;
 385                 *idx = i;
 386                 return;
 387         } else if (uch <= 0x0000ffffU) {
 388                 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
 389                 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
 390                 str[i + 0] = uch | 0x000000e0U;
 391                 i += 3;
 392                 *idx = i;
 393                 return;
 394         } else if (uch <= 0x0010ffffU) {
 395                 str[i + 3] = (uch & 63) | 0x80; uch >>= 6;
 396                 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
 397                 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
 398                 str[i + 0] = uch | 0x000000f0U;
 399                 i += 4;
 400                 *idx = i;
 401                 return;
 402         }
 403 invalid:
 404         /* control character or invalid unicode */
 405         if (uch == 0) {
 406                 /* handle this special case here to make the common case fast */
 407                 str[i++] = 0;
 408                 *idx = i;
 409         } else {
 410                 str[i++] = '<';
 411                 str[i++] = hex_tab[(uch >> 4) & 0xf];
 412                 str[i++] = hex_tab[uch & 0xf];
 413                 str[i++] = '>';
 414                 *idx = i;
 415         }
 416 }
 417
 418 int u_copy_chars(char *dst, const char *src, int *width)
 419 {
 420         int w = *width;
 421         int si = 0, di = 0;
 422         int cw;
 423         uchar u;
 424
 425         while (w > 0) {
 426                 u_get_char(src, &si, &u);
 427                 if (u == 0)
 428                         break;
 429
 430                 cw = u_char_width(u);
 431                 w -= cw;
 432
 433                 if (unlikely(w < 0)) {
 434                         if (cw == 2)
 435                                 dst[di++] = ' ';
 436                         if (cw == 4) {
 437                                 dst[di++] = '<';
 438                                 if (w >= -2)
 439                                         dst[di++] = hex_tab[(u >> 4) & 0xf];
 440                                 if (w >= -1)
 441                                         dst[di++] = hex_tab[u & 0xf];
 442                         }
 443                         w = 0;
 444                         break;
 445                 }
 446                 u_set_char(dst, &di, u);
 447         }
 448         *width -= w;
 449         return di;
 450 }
 451
 452 int u_skip_chars(const char *str, int *width)
 453 {
 454         int w = *width;
 455         int idx = 0;
 456
 457         while (w > 0) {
 458                 uchar u;
 459
 460                 u_get_char(str, &idx, &u);
 461                 w -= u_char_width(u);
 462         }
 463         /* add 1..3 if skipped 'too much' (the last char was double width or invalid (<xx>)) */
 464         *width -= w;
 465         return idx;
 466 }
 467
 468 /*
 469  * Comparison functions
 470  */
 471
 472 static inline int chcasecmp(int a, int b)
 473 {
 474         return towupper(a) - towupper(b);
 475 }
 476
 477 int u_strcasecmp(const char *a, const char *b)
 478 {
 479         int ai = 0;
 480         int bi = 0;
 481         int res;
 482
 483         do {
 484                 uchar au, bu;
 485
 486                 u_get_char(a, &ai, &au);
 487                 u_get_char(b, &bi, &bu);
 488                 res = chcasecmp(au, bu);
 489                 if (res)
 490                         break;
 491                 if (au == 0) {
 492                         /* bu is 0 too */
 493                         break;
 494                 }
 495         } while (1);
 496         return res;
 497 }
 498
 499 int u_strncasecmp(const char *a, const char *b, int len)
 500 {
 501         int ai = 0;
 502         int bi = 0;
 503
 504         while (len > 0) {
 505                 uchar au, bu;
 506                 int res;
 507
 508                 u_get_char(a, &ai, &au);
 509                 u_get_char(b, &bi, &bu);
 510                 res = chcasecmp(au, bu);
 511                 if (res)
 512                         return res;
 513                 if (au == 0) {
 514                         /* bu is 0 too */
 515                         return 0;
 516                 }
 517                 len--;
 518         }
 519         return 0;
 520 }
 521
 522 char *u_strcasestr(const char *haystack, const char *needle)
 523 {
 524         /* strlen is faster and works here */
 525         int haystack_len = strlen(haystack);
 526         int needle_len = u_strlen(needle);
 527
 528         do {
 529                 uchar u;
 530                 int idx;
 531
 532                 if (haystack_len < needle_len)
 533                         return NULL;
 534                 if (u_strncasecmp(needle, haystack, needle_len) == 0)
 535                         return (char *)haystack;
 536
 537                 /* skip one char */
 538                 idx = 0;
 539                 u_get_char(haystack, &idx, &u);
 540                 haystack += idx;
 541                 haystack_len -= idx;
 542         } while (1);
 543 }