ex: basic global command
[neatvi.git] / uc.c
blob774dfcef75267a96cab2cd93b644a6602d672bfe
1 #include <ctype.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "vi.h"
/* number of elements in a true array; not valid for pointers */
#define LEN(a)		(sizeof(a) / sizeof(*(a)))
/*
 * Return the length in bytes of the utf-8 character starting at s.
 *
 * Only the first byte is examined.  The result is 0 for the
 * terminating NUL, 1 for ASCII and for bytes that cannot start a
 * character (stray continuation bytes, 0xfe and 0xff), and 2..6 for
 * the lead byte of a multi-byte sequence.
 */
int uc_len(char *s)
{
	int c = (unsigned char) s[0];
	if (~c & 0x80)		/* 0xxxxxxx: NUL or ASCII */
		return c > 0;
	if (~c & 0x40)		/* 10xxxxxx: continuation byte, not a lead */
		return 1;
	if (~c & 0x20)		/* 110xxxxx */
		return 2;
	if (~c & 0x10)		/* 1110xxxx */
		return 3;
	if (~c & 0x08)		/* 11110xxx */
		return 4;
	if (~c & 0x04)		/* 111110xx */
		return 5;
	if (~c & 0x02)		/* 1111110x */
		return 6;
	return 1;		/* 0xfe and 0xff never appear in utf-8 */
}
/* count the utf-8 characters in the NUL-terminated string s */
int uc_slen(char *s)
{
	int cnt = 0;
	while (*s) {
		/* uc_end() yields the last byte of the current character */
		s = uc_end(s) + 1;
		cnt++;
	}
	return cnt;
}
/*
 * Return the unicode codepoint of the utf-8 character starting at s.
 *
 * The sequence length is taken from the lead byte (2..6 bytes are
 * accepted).  ASCII bytes, NUL and bytes that cannot start a sequence
 * (continuation bytes, 0xfe, 0xff) are returned unchanged.  If the
 * sequence is cut short, i.e. one of the expected continuation bytes
 * is missing, the lead byte is returned unchanged as well; decoding
 * therefore never reads past the terminating NUL.
 */
int uc_code(char *s)
{
	int c = (unsigned char) s[0];
	int n, i, b, code;
	if ((c & 0xc0) != 0xc0)		/* ASCII, NUL or continuation byte */
		return c;
	/* the number of leading one bits is the length of the sequence */
	for (n = 2; n < 7 && (c & (0x80 >> n)); n++)
		;
	if (n > 6)			/* 0xfe or 0xff */
		return c;
	code = c & (0x7f >> n);		/* payload bits of the lead byte */
	for (i = 1; i < n; i++) {
		b = (unsigned char) s[i];
		if ((b & 0xc0) != 0x80)	/* truncated sequence */
			return c;
		code = (code << 6) | (b & 0x3f);
	}
	return code;
}
/*
 * Step back from s over utf-8 continuation bytes to the first byte of
 * the character that contains s; the search never goes before beg.
 */
char *uc_beg(char *beg, char *s)
{
	for (; s > beg; s--)
		if (((unsigned char) *s & 0xc0) != 0x80)
			break;
	return s;
}
/*
 * Return a pointer to the last byte of the utf-8 character starting
 * at s.  For ASCII bytes and for the terminating NUL that is s itself.
 */
char *uc_end(char *s)
{
	unsigned char first = (unsigned char) *s;
	if (first < 0x80)		/* NUL or ASCII */
		return s;
	if (first >= 0xc0)		/* lead byte: step over it */
		s++;
	/* consume the continuation bytes that follow */
	while (((unsigned char) *s & 0xc0) == 0x80)
		s++;
	return s - 1;
}
/*
 * Return a pointer to the character after the one at s; at the end of
 * the string the pointer to the terminating NUL is returned.
 */
char *uc_next(char *s)
{
	char *last = uc_end(s);
	if (*last == '\0')
		return last;
	return last + 1;
}
/*
 * Return a pointer to the character before s, where beg is the start
 * of the string; beg itself is returned when s is already at beg.
 */
char *uc_prev(char *beg, char *s)
{
	if (s == beg)
		return beg;
	return uc_beg(beg, s - 1);
}
/* display width of the character at s; currently always one column */
int uc_wid(char *s)
{
	(void) s;	/* the character is not inspected */
	return 1;
}
/*
 * Return the beginning of the last line of s: the text following its
 * final newline, or s itself if it contains no newline.
 */
char *uc_lastline(char *s)
{
	char *nl = strrchr(s, '\n');
	if (nl == NULL)
		return s;
	return nl + 1;
}
/*
 * Allocate and return an array of pointers to the characters of s.
 *
 * *n is set to the number of characters in s.  The array has *n + 1
 * entries: entry i points at the i-th character inside s, and the
 * last entry points at the terminating NUL.  The array itself is
 * obtained from malloc() and should be released with free(); the
 * entries point into s and are not copies.
 *
 * If the allocation fails, *n is set to 0 and NULL is returned.
 */
char **uc_chop(char *s, int *n)
{
	char **chrs;
	int i;
	*n = uc_slen(s);
	chrs = malloc((*n + 1) * sizeof(chrs[0]));
	if (!chrs) {
		*n = 0;
		return NULL;
	}
	for (i = 0; i < *n + 1; i++) {
		chrs[i] = s;
		s = uc_next(s);
	}
	return chrs;
}
/*
 * Return a pointer to the character of s at offset off (counted in
 * characters).  The end of s is returned if off equals the number of
 * characters in s or if off is negative.  An empty string literal is
 * returned if s is NULL or off lies beyond the end of s.
 */
char *uc_chr(char *s, int off)
{
	int idx = 0;
	if (s) {
		for (; *s; idx++, s = uc_next(s))
			if (idx == off)
				return s;
		/* s is at the terminating NUL; idx is the character count */
		if (off < 0 || idx == off)
			return s;
	}
	return "";
}
/*
 * Convert a byte offset into a character offset: count the characters
 * of s that begin before s + off, stopping early at the end of s.
 */
int uc_off(char *s, int off)
{
	char *lim = s + off;
	int cnt = 0;
	while (s < lim && *s) {
		s = uc_next(s);
		cnt++;
	}
	return cnt;
}
/*
 * Return a newly allocated copy of the characters of s in the range
 * [beg, end), with both offsets counted in characters.
 *
 * An offset that is negative or larger than the number of characters
 * in s refers to the end of s.  The result is an empty string if the
 * range is empty or if s is NULL.  The copy is obtained from malloc()
 * and should be released with free(); NULL is returned if the
 * allocation fails.
 */
char *uc_sub(char *s, int beg, int end)
{
	char *sbeg, *send, *r;
	int n, len;
	if (!s)
		s = "";
	/*
	 * Clamp the offsets to the length of s: for larger offsets
	 * uc_chr() returns a pointer that is not inside s, and such a
	 * pointer cannot be compared with or subtracted from sbeg.
	 */
	n = uc_slen(s);
	sbeg = uc_chr(s, beg > n ? n : beg);
	send = uc_chr(s, end > n ? n : end);
	len = sbeg < send ? send - sbeg : 0;
	r = malloc(len + 1);
	if (!r)
		return NULL;
	memcpy(r, sbeg, len);
	r[len] = '\0';
	return r;
}
/*
 * Return a malloc()ed copy of the NUL-terminated string s, or NULL if
 * the allocation fails.
 */
char *uc_dup(char *s)
{
	size_t sz = strlen(s) + 1;	/* include the terminating NUL */
	char *copy = malloc(sz);
	if (copy == NULL)
		return NULL;
	memcpy(copy, s, sz);
	return copy;
}
/*
 * Return 1 if the character at s is an ASCII whitespace character.
 * A NULL s is treated like a NUL byte; non-ASCII bytes are not space.
 */
int uc_isspace(char *s)
{
	int c = 0;
	if (s != NULL)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 0;
	return isspace(c) != 0;
}
/*
 * Return 1 if the character at s is printable: every non-ASCII byte
 * counts as printable, ASCII is checked with isprint().  A NULL s is
 * treated like a NUL byte.
 */
int uc_isprint(char *s)
{
	int c = 0;
	if (s != NULL)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 1;
	return isprint(c) != 0;
}
/*
 * Return 1 if the character at s is alphabetic: every non-ASCII byte
 * counts as alphabetic, ASCII is checked with isalpha().  A NULL s is
 * treated like a NUL byte.
 */
int uc_isalpha(char *s)
{
	int c = 0;
	if (s != NULL)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 1;
	return isalpha(c) != 0;
}
/*
 * Return 1 if the character at s is an ASCII decimal digit.  A NULL s
 * is treated like a NUL byte; non-ASCII bytes are never digits.
 */
int uc_isdigit(char *s)
{
	int c = 0;
	if (s != NULL)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 0;
	return isdigit(c) != 0;
}
/*
 * Classify the character at c:
 *   0: whitespace
 *   1: word character (alphabetic, digit or underscore)
 *   2: anything else
 *
 * A NULL c is classified like a NUL byte (kind 2), in line with the
 * uc_is*() helpers, which accept NULL as well.
 */
int uc_kind(char *c)
{
	if (uc_isspace(c))
		return 0;
	if (uc_isalpha(c) || uc_isdigit(c) || (c && c[0] == '_'))
		return 1;
	return 2;
}
/*
 * Nonzero if the codepoint ch lies in one of the ranges treated as
 * right-to-left text:
 *   U+0600..U+06FF
 *   U+200C..U+200F (includes ZWNJ and ZWJ)
 *   U+FB00..U+FBFF, U+FC00..U+FCFF, U+FE00..U+FEFF
 *
 * All bits above the range are compared, so codepoints beyond U+FFFF
 * (e.g. U+20627) do not match.  Note that ch is evaluated more than
 * once; do not pass an expression with side effects.
 */
#define UC_R2L(ch)	((((ch) & ~0xff) == 0x0600) || \
			 (((ch) & ~0x03) == 0x200c) || \
			 (((ch) & ~0xff) == 0xfb00) || \
			 (((ch) & ~0xff) == 0xfc00) || \
			 (((ch) & ~0xff) == 0xfe00))
/*
 * Characters that can be shaped, with their contextual forms.
 *
 * The table is sorted by the c field because find_achar() looks
 * entries up with a binary search.  A form of 0 means the character
 * has no such form.
 */
static struct achar {
	unsigned c;		/* unicode codepoint of the character */
	unsigned s;		/* single (isolated) form */
	unsigned i;		/* initial form */
	unsigned m;		/* medial form */
	unsigned f;		/* final form */
} achars[] = {
	/* c       s       i       m       f */
	{0x0621, 0xfe80, 0,      0,      0     },	/* hamza */
	{0x0622, 0xfe81, 0,      0,      0xfe82},	/* alef madda */
	{0x0623, 0xfe83, 0,      0,      0xfe84},	/* alef hamza above */
	{0x0624, 0xfe85, 0,      0,      0xfe86},	/* waw hamza */
	{0x0625, 0xfe87, 0,      0,      0xfe88},	/* alef hamza below */
	{0x0626, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},	/* yeh hamza */
	{0x0627, 0xfe8d, 0,      0,      0xfe8e},	/* alef */
	{0x0628, 0xfe8f, 0xfe91, 0xfe92, 0xfe90},	/* beh */
	{0x0629, 0xfe93, 0,      0,      0xfe94},	/* teh marbuta */
	{0x062a, 0xfe95, 0xfe97, 0xfe98, 0xfe96},	/* teh */
	{0x062b, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},	/* theh */
	{0x062c, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},	/* jeem */
	{0x062d, 0xfea1, 0xfea3, 0xfea4, 0xfea2},	/* hah */
	{0x062e, 0xfea5, 0xfea7, 0xfea8, 0xfea6},	/* khah */
	{0x062f, 0xfea9, 0,      0,      0xfeaa},	/* dal */
	{0x0630, 0xfeab, 0,      0,      0xfeac},	/* thal */
	{0x0631, 0xfead, 0,      0,      0xfeae},	/* reh */
	{0x0632, 0xfeaf, 0,      0,      0xfeb0},	/* zain */
	{0x0633, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},	/* seen */
	{0x0634, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},	/* sheen */
	{0x0635, 0xfeb9, 0xfebb, 0xfebc, 0xfeba},	/* sad */
	{0x0636, 0xfebd, 0xfebf, 0xfec0, 0xfebe},	/* dad */
	{0x0637, 0xfec1, 0xfec3, 0xfec4, 0xfec2},	/* tah */
	{0x0638, 0xfec5, 0xfec7, 0xfec8, 0xfec6},	/* zah */
	{0x0639, 0xfec9, 0xfecb, 0xfecc, 0xfeca},	/* ain */
	{0x063a, 0xfecd, 0xfecf, 0xfed0, 0xfece},	/* ghain */
	{0x0640, 0x0640, 0x0640, 0x0640, 0     },	/* tatweel */
	{0x0641, 0xfed1, 0xfed3, 0xfed4, 0xfed2},	/* feh */
	{0x0642, 0xfed5, 0xfed7, 0xfed8, 0xfed6},	/* qaf */
	{0x0643, 0xfed9, 0xfedb, 0xfedc, 0xfeda},	/* kaf */
	{0x0644, 0xfedd, 0xfedf, 0xfee0, 0xfede},	/* lam */
	{0x0645, 0xfee1, 0xfee3, 0xfee4, 0xfee2},	/* meem */
	{0x0646, 0xfee5, 0xfee7, 0xfee8, 0xfee6},	/* noon */
	{0x0647, 0xfee9, 0xfeeb, 0xfeec, 0xfeea},	/* heh */
	{0x0648, 0xfeed, 0,      0,      0xfeee},	/* waw */
	{0x0649, 0xfeef, 0,      0,      0xfef0},	/* alef maksura */
	{0x064a, 0xfef1, 0xfef3, 0xfef4, 0xfef2},	/* yeh */
	{0x067e, 0xfb56, 0xfb58, 0xfb59, 0xfb57},	/* peh */
	{0x0686, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},	/* tcheh */
	{0x0698, 0xfb8a, 0,      0,      0xfb8b},	/* jeh */
	{0x06a9, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f},	/* fkaf */
	{0x06af, 0xfb92, 0xfb94, 0xfb95, 0xfb93},	/* gaf */
	{0x06cc, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},	/* fyeh */
	{0x200c, 0,      0,      0,      0     },	/* ZWNJ */
	{0x200d, 0,      0x200d, 0x200d, 0     },	/* ZWJ */
};
245 static struct achar *find_achar(int c)
247 int h, m, l;
248 h = LEN(achars);
249 l = 0;
250 /* using binary search to find c */
251 while (l < h) {
252 m = (h + l) >> 1;
253 if (achars[m].c == c)
254 return &achars[m];
255 if (c < achars[m].c)
256 h = m;
257 else
258 l = m + 1;
260 return NULL;
263 static int can_join(int c1, int c2)
265 struct achar *a1 = find_achar(c1);
266 struct achar *a2 = find_achar(c2);
267 return a1 && a2 && (a1->i || a1->m) && (a2->f || a2->m);
270 static int uc_cshape(int cur, int prev, int next)
272 int c = cur;
273 int join_prev, join_next;
274 struct achar *ac = find_achar(c);
275 if (!ac) /* ignore non-Arabic characters */
276 return c;
277 join_prev = can_join(prev, c);
278 join_next = can_join(c, next);
279 if (join_prev && join_next)
280 c = ac->m;
281 if (join_prev && !join_next)
282 c = ac->f;
283 if (!join_prev && join_next)
284 c = ac->i;
285 if (!join_prev && !join_next)
286 c = ac->c; /* some fonts do not have a glyph for ac->s */
287 return c ? c : cur;
/*
 * Return 1 for Arabic combining characters and 0 otherwise.
 *
 * The standard Arabic diacritics:
 * + 0x064b: fathatan
 * + 0x064c: dammatan
 * + 0x064d: kasratan
 * + 0x064e: fatha
 * + 0x064f: damma
 * + 0x0650: kasra
 * + 0x0651: shadda
 * + 0x0652: sukun
 * + 0x0653: madda above
 * + 0x0654: hamza above
 * + 0x0655: hamza below
 * + 0x0670: superscript alef
 *
 * The shadda ligatures 0xfc5e..0xfc63 are accepted as well.
 */
static int uc_comb(int c)
{
	if (c == 0x0670)			/* superscript alef */
		return 1;
	if (c >= 0x064b && c <= 0x0655)		/* the standard diacritics */
		return 1;
	return c >= 0xfc5e && c <= 0xfc63;	/* shadda ligatures */
}
/*
 * Write the utf-8 encoding of codepoint c to d, followed by a NUL.
 * Sequences of one to four bytes are produced, so d needs room for
 * up to five bytes.
 */
static void uc_cput(char *d, int c)
{
	int rest;	/* number of continuation bytes */
	int lead;	/* marker bits of the first byte */
	int shift;
	if (c > 0xffff) {
		rest = 3;
		lead = 0xf0;
	} else if (c > 0x7ff) {
		rest = 2;
		lead = 0xe0;
	} else if (c > 0x7f) {
		rest = 1;
		lead = 0xc0;
	} else {
		rest = 0;
		lead = 0;
	}
	*d++ = lead | (c >> (rest * 6));
	/* remaining bits, six per continuation byte, most significant first */
	for (shift = (rest - 1) * 6; shift >= 0; shift -= 6)
		*d++ = 0x80 | ((c >> shift) & 0x3f);
	*d = '\0';
}
/*
 * Shape the Arabic character at s according to its neighbours; beg is
 * the start of the string containing s.
 *
 * Returns NULL if s is at a NUL or its codepoint is not matched by
 * UC_R2L().  Otherwise the utf-8 encoding of the shaped character is
 * returned in a static buffer that the next call overwrites.
 */
char *uc_shape(char *beg, char *s)
{
	static char out[16];
	char *p;
	int code;
	int prev = 0;
	int next = 0;
	int curr = uc_code(s);
	if (!curr || !UC_R2L(curr))
		return NULL;
	/* the nearest preceding character that is not a combining mark */
	for (p = s; p > beg; ) {
		p = uc_beg(beg, p - 1);
		code = uc_code(p);
		if (!uc_comb(code)) {
			prev = code;
			break;
		}
	}
	/* the nearest following character that is not a combining mark */
	for (p = s; *p; ) {
		p = uc_next(p);
		code = uc_code(p);
		if (!uc_comb(code)) {
			next = code;
			break;
		}
	}
	uc_cput(out, uc_cshape(curr, prev, next));
	return out;
}