uc.c

   1 #include <ctype.h>
   2 #include <stdio.h>
   3 #include <stdlib.h>
   4 #include <string.h>
   5 #include "vi.h"
   6
/* number of elements in the array a; a must be a real array, not a pointer */
#define LEN(a)          (sizeof(a) / sizeof((a)[0]))
   8
   9 /* return the length of a utf-8 character */
  10 int uc_len(char *s)
  11 {
  12         int c = (unsigned char) s[0];
  13         if (c > 0 && c <= 0x7f)
  14                 return 1;
  15         if (c >= 0xfc)
  16                 return 6;
  17         if (c >= 0xf8)
  18                 return 5;
  19         if (c >= 0xf0)
  20                 return 4;
  21         if (c >= 0xe0)
  22                 return 3;
  23         if (c >= 0xc0)
  24                 return 2;
  25         return c != 0;
  26 }
  27
  28 /* the number of utf-8 characters in s */
  29 int uc_slen(char *s)
  30 {
  31         char *e = s + strlen(s);
  32         int i;
  33         for (i = 0; s < e; i++)
  34                 s += uc_len(s);
  35         return i;
  36 }
  37
  38 /* the unicode codepoint of the given utf-8 character */
  39 int uc_code(char *s)
  40 {
  41         int result;
  42         int l = uc_len(s);
  43         if (l <= 1)
  44                 return (unsigned char) *s;
  45         result = (0x3f >> --l) & (unsigned char) *s++;
  46         while (l--)
  47                 result = (result << 6) | ((unsigned char) *s++ & 0x3f);
  48         return result;
  49 }
  50
  51 /* find the beginning of the character at s[i] */
  52 char *uc_beg(char *beg, char *s)
  53 {
  54         while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
  55                 s--;
  56         return s;
  57 }
  58
  59 /* find the end of the character at s[i] */
  60 char *uc_end(char *s)
  61 {
  62         if (!*s || !((unsigned char) *s & 0x80))
  63                 return s;
  64         if (((unsigned char) *s & 0xc0) == 0xc0)
  65                 s++;
  66         while (((unsigned char) *s & 0xc0) == 0x80)
  67                 s++;
  68         return s - 1;
  69 }
  70
  71 /* return a pointer to the character following s */
  72 char *uc_next(char *s)
  73 {
  74         s = uc_end(s);
  75         return *s ? s + 1 : s;
  76 }
  77
  78 /* return a pointer to the character preceding s */
  79 char *uc_prev(char *beg, char *s)
  80 {
  81         return s == beg ? beg : uc_beg(beg, s - 1);
  82 }
  83
  84 int uc_wid(char *s)
  85 {
  86         return 1;
  87 }
  88
  89 char *uc_lastline(char *s)
  90 {
  91         char *r = strrchr(s, '\n');
  92         return r ? r + 1 : s;
  93 }
  94
  95 /* allocate and return an array for the characters in s */
  96 char **uc_chop(char *s, int *n)
  97 {
  98         char **chrs;
  99         int i;
 100         *n = uc_slen(s);
 101         chrs = malloc((*n + 1) * sizeof(chrs[0]));
 102         for (i = 0; i < *n + 1; i++) {
 103                 chrs[i] = s;
 104                 s = uc_next(s);
 105         }
 106         return chrs;
 107 }
 108
 109 char *uc_chr(char *s, int off)
 110 {
 111         int i = 0;
 112         while (s && *s) {
 113                 if (i++ == off)
 114                         return s;
 115                 s = uc_next(s);
 116         }
 117         return s && (off < 0 || i == off) ? s : "";
 118 }
 119
 120 /* the number of characters between s and s + off*/
 121 int uc_off(char *s, int off)
 122 {
 123         char *e = s + off;
 124         int i;
 125         for (i = 0; s < e && *s; i++)
 126                 s = uc_next(s);
 127         return i;
 128 }
 129
 130 char *uc_sub(char *s, int beg, int end)
 131 {
 132         char *sbeg = uc_chr(s, beg);
 133         char *send = uc_chr(s, end);
 134         int len = sbeg && send && sbeg <= send ? send - sbeg : 0;
 135         char *r = malloc(len + 1);
 136         memcpy(r, sbeg, len);
 137         r[len] = '\0';
 138         return r;
 139 }
 140
 141 char *uc_dup(char *s)
 142 {
 143         char *r = malloc(strlen(s) + 1);
 144         return r ? strcpy(r, s) : NULL;
 145 }
 146
 147 int uc_isspace(char *s)
 148 {
 149         int c = s ? (unsigned char) *s : 0;
 150         return c <= 0x7f && isspace(c);
 151 }
 152
 153 int uc_isprint(char *s)
 154 {
 155         int c = s ? (unsigned char) *s : 0;
 156         return c > 0x7f || isprint(c);
 157 }
 158
 159 int uc_isalpha(char *s)
 160 {
 161         int c = s ? (unsigned char) *s : 0;
 162         return c > 0x7f || isalpha(c);
 163 }
 164
 165 int uc_isdigit(char *s)
 166 {
 167         int c = s ? (unsigned char) *s : 0;
 168         return c <= 0x7f && isdigit(c);
 169 }
 170
 171 int uc_kind(char *c)
 172 {
 173         if (uc_isspace(c))
 174                 return 0;
 175         if (uc_isalpha(c) || uc_isdigit(c) || c[0] == '_')
 176                 return 1;
 177         return 2;
 178 }
 179
 180 #define UC_R2L(ch)      (((ch) & 0xff00) == 0x0600 || \
 181                         ((ch) & 0xfffc) == 0x200c || \
 182                         ((ch) & 0xff00) == 0xfb00 || \
 183                         ((ch) & 0xff00) == 0xfc00 || \
 184                         ((ch) & 0xff00) == 0xfe00)
 185
/*
 * sorted list of characters that can be shaped
 *
 * Entries are ordered by their c field, as find_achar() looks them up
 * with binary search; new entries must keep that order.  Fields left
 * out of an initializer are zero; uc_cshape() falls back to the
 * unshaped character when the selected form is zero, and can_join()
 * uses the i/m and m/f fields to decide if two characters join.
 */
static struct achar {
        unsigned c;             /* unicode code point of the character */
        unsigned s;             /* single form */
        unsigned i;             /* initial form */
        unsigned m;             /* medial form */
        unsigned f;             /* final form */
} achars[] = {
        {0x0621, 0xfe80},                               /* hamza */
        {0x0622, 0xfe81, 0, 0, 0xfe82},                 /* alef madda */
        {0x0623, 0xfe83, 0, 0, 0xfe84},                 /* alef hamza above */
        {0x0624, 0xfe85, 0, 0, 0xfe86},                 /* waw hamza */
        {0x0625, 0xfe87, 0, 0, 0xfe88},                 /* alef hamza below */
        {0x0626, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},       /* yeh hamza */
        {0x0627, 0xfe8d, 0, 0, 0xfe8e},                 /* alef */
        {0x0628, 0xfe8f, 0xfe91, 0xfe92, 0xfe90},       /* beh */
        {0x0629, 0xfe93, 0, 0, 0xfe94},                 /* teh marbuta */
        {0x062a, 0xfe95, 0xfe97, 0xfe98, 0xfe96},       /* teh */
        {0x062b, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},       /* theh */
        {0x062c, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},       /* jeem */
        {0x062d, 0xfea1, 0xfea3, 0xfea4, 0xfea2},       /* hah */
        {0x062e, 0xfea5, 0xfea7, 0xfea8, 0xfea6},       /* khah */
        {0x062f, 0xfea9, 0, 0, 0xfeaa},                 /* dal */
        {0x0630, 0xfeab, 0, 0, 0xfeac},                 /* thal */
        {0x0631, 0xfead, 0, 0, 0xfeae},                 /* reh */
        {0x0632, 0xfeaf, 0, 0, 0xfeb0},                 /* zain */
        {0x0633, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},       /* seen */
        {0x0634, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},       /* sheen */
        {0x0635, 0xfeb9, 0xfebb, 0xfebc, 0xfeba},       /* sad */
        {0x0636, 0xfebd, 0xfebf, 0xfec0, 0xfebe},       /* dad */
        {0x0637, 0xfec1, 0xfec3, 0xfec4, 0xfec2},       /* tah */
        {0x0638, 0xfec5, 0xfec7, 0xfec8, 0xfec6},       /* zah */
        {0x0639, 0xfec9, 0xfecb, 0xfecc, 0xfeca},       /* ain */
        {0x063a, 0xfecd, 0xfecf, 0xfed0, 0xfece},       /* ghain */
        {0x0640, 0x640, 0x640, 0x640},                  /* tatweel */
        {0x0641, 0xfed1, 0xfed3, 0xfed4, 0xfed2},       /* feh */
        {0x0642, 0xfed5, 0xfed7, 0xfed8, 0xfed6},       /* qaf */
        {0x0643, 0xfed9, 0xfedb, 0xfedc, 0xfeda},       /* kaf */
        {0x0644, 0xfedd, 0xfedf, 0xfee0, 0xfede},       /* lam */
        {0x0645, 0xfee1, 0xfee3, 0xfee4, 0xfee2},       /* meem */
        {0x0646, 0xfee5, 0xfee7, 0xfee8, 0xfee6},       /* noon */
        {0x0647, 0xfee9, 0xfeeb, 0xfeec, 0xfeea},       /* heh */
        {0x0648, 0xfeed, 0, 0, 0xfeee},                 /* waw */
        {0x0649, 0xfeef, 0, 0, 0xfef0},                 /* alef maksura */
        {0x064a, 0xfef1, 0xfef3, 0xfef4, 0xfef2},       /* yeh */
        {0x067e, 0xfb56, 0xfb58, 0xfb59, 0xfb57},       /* peh */
        {0x0686, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},       /* tcheh */
        {0x0698, 0xfb8a, 0, 0, 0xfb8b},                 /* jeh */
        {0x06a9, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f},       /* fkaf */
        {0x06af, 0xfb92, 0xfb94, 0xfb95, 0xfb93},       /* gaf */
        {0x06cc, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},       /* fyeh */
        {0x200c},                                       /* ZWNJ */
        {0x200d, 0, 0x200d, 0x200d},                    /* ZWJ */
};
 240
 241 static struct achar *find_achar(int c)
 242 {
 243         int h, m, l;
 244         h = LEN(achars);
 245         l = 0;
 246         /* using binary search to find c */
 247         while (l < h) {
 248                 m = (h + l) >> 1;
 249                 if (achars[m].c == c)
 250                         return &achars[m];
 251                 if (c < achars[m].c)
 252                         h = m;
 253                 else
 254                         l = m + 1;
 255         }
 256         return NULL;
 257 }
 258
 259 static int can_join(int c1, int c2)
 260 {
 261         struct achar *a1 = find_achar(c1);
 262         struct achar *a2 = find_achar(c2);
 263         return a1 && a2 && (a1->i || a1->m) && (a2->f || a2->m);
 264 }
 265
 266 static int uc_cshape(int cur, int prev, int next)
 267 {
 268         int c = cur;
 269         int join_prev, join_next;
 270         struct achar *ac = find_achar(c);
 271         if (!ac)                /* ignore non-Arabic characters */
 272                 return c;
 273         join_prev = can_join(prev, c);
 274         join_next = can_join(c, next);
 275         if (join_prev && join_next)
 276                 c = ac->m;
 277         if (join_prev && !join_next)
 278                 c = ac->f;
 279         if (!join_prev && join_next)
 280                 c = ac->i;
 281         if (!join_prev && !join_next)
 282                 c = ac->c;      /* some fonts do not have a glyph for ac->s */
 283         return c ? c : cur;
 284 }
 285
 286 /*
 287  * return nonzero for Arabic combining characters
 288  *
 289  * The standard Arabic diacritics:
 290  * + 0x064b: fathatan
 291  * + 0x064c: dammatan
 292  * + 0x064d: kasratan
 293  * + 0x064e: fatha
 294  * + 0x064f: damma
 295  * + 0x0650: kasra
 296  * + 0x0651: shadda
 297  * + 0x0652: sukun
 298  * + 0x0653: madda above
 299  * + 0x0654: hamza above
 300  * + 0x0655: hamza below
 301  * + 0x0670: superscript alef
 302  */
 303 static int uc_comb(int c)
 304 {
 305         return (c >= 0x064b && c <= 0x0655) ||          /* the standard diacritics */
 306                 (c >= 0xfc5e && c <= 0xfc63) ||         /* shadda ligatures */
 307                 c == 0x0670;                            /* superscript alef */
 308 }
 309
 310 static void uc_cput(char *d, int c)
 311 {
 312         int l = 0;
 313         if (c > 0xffff) {
 314                 *d++ = 0xf0 | (c >> 18);
 315                 l = 3;
 316         } else if (c > 0x7ff) {
 317                 *d++ = 0xe0 | (c >> 12);
 318                 l = 2;
 319         } else if (c > 0x7f) {
 320                 *d++ = 0xc0 | (c >> 6);
 321                 l = 1;
 322         } else {
 323                 *d++ = c;
 324         }
 325         while (l--)
 326                 *d++ = 0x80 | ((c >> (l * 6)) & 0x3f);
 327         *d = '\0';
 328 }
 329
 330 /* shape the given arabic character; returns a static buffer */
 331 char *uc_shape(char *beg, char *s)
 332 {
 333         static char out[16];
 334         char *r;
 335         int prev = 0;
 336         int next = 0;
 337         int curr = uc_code(s);
 338         if (!curr || !UC_R2L(curr))
 339                 return NULL;
 340         r = s;
 341         while (r > beg) {
 342                 r = uc_beg(beg, r - 1);
 343                 if (!uc_comb(uc_code(r))) {
 344                         prev = uc_code(r);
 345                         break;
 346                 }
 347         }
 348         r = s;
 349         while (*r) {
 350                 r = uc_next(r);
 351                 if (!uc_comb(uc_code(r))) {
 352                         next = uc_code(r);
 353                         break;
 354                 }
 355         }
 356         uc_cput(out, uc_cshape(curr, prev, next));
 357         return out;
 358 }