/* element count of array a; a must be an actual array, not a pointer */
#define LEN(a)		(sizeof(a) / sizeof(*(a)))
9 /* return the length of a utf-8 character */
12 int c
= (unsigned char) s
[0];
13 if (c
> 0 && c
<= 0x7f)
28 /* the number of utf-8 characters in s */
31 char *e
= s
+ strlen(s
);
33 for (i
= 0; s
< e
; i
++)
38 /* the unicode codepoint of the given utf-8 character */
44 return (unsigned char) *s
;
45 result
= (0x3f >> --l
) & (unsigned char) *s
++;
47 result
= (result
<< 6) | ((unsigned char) *s
++ & 0x3f);
/* find the beginning of the character at s, without moving before beg */
52 char *uc_beg(char *beg
, char *s
)
54 while (s
> beg
&& (((unsigned char) *s
) & 0xc0) == 0x80)
/* find the end of the character at s */
62 if (!*s
|| !((unsigned char) *s
& 0x80))
64 if (((unsigned char) *s
& 0xc0) == 0xc0)
66 while (((unsigned char) *s
& 0xc0) == 0x80)
71 /* return a pointer to the character following s */
72 char *uc_next(char *s
)
75 return *s
? s
+ 1 : s
;
/*
 * return a pointer to the character preceding s
 *
 * beg is the start of the buffer; when s is already at beg there is
 * nothing before it and beg itself is returned.
 */
char *uc_prev(char *beg, char *s)
{
	if (s == beg)
		return beg;
	/* step back one byte and let uc_beg() locate the character start */
	return uc_beg(beg, s - 1);
}
89 char *uc_lastline(char *s
)
91 char *r
= strrchr(s
, '\n');
95 /* allocate and return an array for the characters in s */
96 char **uc_chop(char *s
, int *n
)
101 chrs
= malloc((*n
+ 1) * sizeof(chrs
[0]));
102 for (i
= 0; i
< *n
+ 1; i
++) {
109 char *uc_chr(char *s
, int off
)
117 return s
&& (off
< 0 || i
== off
) ? s
: "";
/* the number of characters between s and s + off */
121 int uc_off(char *s
, int off
)
125 for (i
= 0; s
< e
&& *s
; i
++)
130 char *uc_sub(char *s
, int beg
, int end
)
132 char *sbeg
= uc_chr(s
, beg
);
133 char *send
= uc_chr(s
, end
);
134 int len
= sbeg
&& send
&& sbeg
<= send
? send
- sbeg
: 0;
135 char *r
= malloc(len
+ 1);
136 memcpy(r
, sbeg
, len
);
/*
 * return a newly allocated copy of the NUL-terminated string s
 *
 * The caller owns the result and releases it with free().  Returns NULL
 * if s is NULL or if the allocation fails.
 */
char *uc_dup(char *s)
{
	size_t n;
	char *r;
	/* strlen(NULL) is undefined; the uc_is*() helpers accept NULL too */
	if (!s)
		return NULL;
	n = strlen(s) + 1;	/* include the terminating NUL */
	r = malloc(n);
	return r ? memcpy(r, s, n) : NULL;
}
/*
 * return 1 if the character at s is an ASCII whitespace character
 *
 * A NULL s is treated like an empty string; bytes above 0x7f are never
 * whitespace here.
 */
int uc_isspace(char *s)
{
	int c = 0;
	if (s)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 0;
	return isspace(c) != 0;
}
/*
 * return 1 if the character at s is printable
 *
 * Every byte above 0x7f is accepted as printable; ASCII bytes are
 * checked with isprint().  A NULL s is treated like an empty string.
 */
int uc_isprint(char *s)
{
	int c = 0;
	if (s)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 1;
	return isprint(c) != 0;
}
/*
 * return 1 if the character at s is alphabetic
 *
 * Every byte above 0x7f is accepted as alphabetic; ASCII bytes are
 * checked with isalpha().  A NULL s is treated like an empty string.
 */
int uc_isalpha(char *s)
{
	int c = 0;
	if (s)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 1;
	return isalpha(c) != 0;
}
/*
 * return 1 if the character at s is an ASCII decimal digit
 *
 * A NULL s is treated like an empty string; bytes above 0x7f are never
 * digits here.
 */
int uc_isdigit(char *s)
{
	int c = 0;
	if (s)
		c = (unsigned char) *s;
	if (c > 0x7f)
		return 0;
	return isdigit(c) != 0;
}
175 if (uc_isalpha(c
) || uc_isdigit(c
) || c
[0] == '_')
/*
 * nonzero if codepoint ch falls in one of these ranges:
 *   0x0600-0x06ff	the Arabic block
 *   0x200c-0x200f	ZWNJ, ZWJ and the two codepoints after them
 *   0xfb00-0xfbff, 0xfc00-0xfcff, 0xfe00-0xfeff
 *			ranges holding the shaped forms and ligatures
 *
 * Bits above the low 16 are ignored by the masks, and ch may be
 * evaluated several times, so it must not have side effects.
 */
#define UC_R2L(ch)	( \
	((ch) & 0xff00) == 0x0600 || \
	((ch) & 0xfffc) == 0x200c || \
	((ch) & 0xff00) == 0xfb00 || \
	((ch) & 0xff00) == 0xfc00 || \
	((ch) & 0xff00) == 0xfe00)
186 /* sorted list of characters that can be shaped */
187 static struct achar
{
188 unsigned c
; /* unicode codepoint */
189 unsigned s
; /* single form */
190 unsigned i
; /* initial form */
191 unsigned m
; /* medial form */
192 unsigned f
; /* final form */
194 {0x0621, 0xfe80}, /* hamza */
195 {0x0622, 0xfe81, 0, 0, 0xfe82}, /* alef madda */
196 {0x0623, 0xfe83, 0, 0, 0xfe84}, /* alef hamza above */
197 {0x0624, 0xfe85, 0, 0, 0xfe86}, /* waw hamza */
198 {0x0625, 0xfe87, 0, 0, 0xfe88}, /* alef hamza below */
199 {0x0626, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a}, /* yeh hamza */
200 {0x0627, 0xfe8d, 0, 0, 0xfe8e}, /* alef */
201 {0x0628, 0xfe8f, 0xfe91, 0xfe92, 0xfe90}, /* beh */
202 {0x0629, 0xfe93, 0, 0, 0xfe94}, /* teh marbuta */
203 {0x062a, 0xfe95, 0xfe97, 0xfe98, 0xfe96}, /* teh */
204 {0x062b, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a}, /* theh */
205 {0x062c, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e}, /* jeem */
206 {0x062d, 0xfea1, 0xfea3, 0xfea4, 0xfea2}, /* hah */
207 {0x062e, 0xfea5, 0xfea7, 0xfea8, 0xfea6}, /* khah */
208 {0x062f, 0xfea9, 0, 0, 0xfeaa}, /* dal */
209 {0x0630, 0xfeab, 0, 0, 0xfeac}, /* thal */
210 {0x0631, 0xfead, 0, 0, 0xfeae}, /* reh */
211 {0x0632, 0xfeaf, 0, 0, 0xfeb0}, /* zain */
212 {0x0633, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2}, /* seen */
213 {0x0634, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6}, /* sheen */
214 {0x0635, 0xfeb9, 0xfebb, 0xfebc, 0xfeba}, /* sad */
215 {0x0636, 0xfebd, 0xfebf, 0xfec0, 0xfebe}, /* dad */
216 {0x0637, 0xfec1, 0xfec3, 0xfec4, 0xfec2}, /* tah */
217 {0x0638, 0xfec5, 0xfec7, 0xfec8, 0xfec6}, /* zah */
218 {0x0639, 0xfec9, 0xfecb, 0xfecc, 0xfeca}, /* ain */
219 {0x063a, 0xfecd, 0xfecf, 0xfed0, 0xfece}, /* ghain */
220 {0x0640, 0x640, 0x640, 0x640}, /* tatweel */
221 {0x0641, 0xfed1, 0xfed3, 0xfed4, 0xfed2}, /* feh */
222 {0x0642, 0xfed5, 0xfed7, 0xfed8, 0xfed6}, /* qaf */
223 {0x0643, 0xfed9, 0xfedb, 0xfedc, 0xfeda}, /* kaf */
224 {0x0644, 0xfedd, 0xfedf, 0xfee0, 0xfede}, /* lam */
225 {0x0645, 0xfee1, 0xfee3, 0xfee4, 0xfee2}, /* meem */
226 {0x0646, 0xfee5, 0xfee7, 0xfee8, 0xfee6}, /* noon */
227 {0x0647, 0xfee9, 0xfeeb, 0xfeec, 0xfeea}, /* heh */
228 {0x0648, 0xfeed, 0, 0, 0xfeee}, /* waw */
229 {0x0649, 0xfeef, 0, 0, 0xfef0}, /* alef maksura */
230 {0x064a, 0xfef1, 0xfef3, 0xfef4, 0xfef2}, /* yeh */
231 {0x067e, 0xfb56, 0xfb58, 0xfb59, 0xfb57}, /* peh */
232 {0x0686, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b}, /* tcheh */
233 {0x0698, 0xfb8a, 0, 0, 0xfb8b}, /* jeh */
234 {0x06a9, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f}, /* fkaf */
235 {0x06af, 0xfb92, 0xfb94, 0xfb95, 0xfb93}, /* gaf */
236 {0x06cc, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd}, /* fyeh */
238 {0x200d, 0, 0x200d, 0x200d}, /* ZWJ */
241 static struct achar
*find_achar(int c
)
246 /* using binary search to find c */
249 if (achars
[m
].c
== c
)
259 static int can_join(int c1
, int c2
)
261 struct achar
*a1
= find_achar(c1
);
262 struct achar
*a2
= find_achar(c2
);
263 return a1
&& a2
&& (a1
->i
|| a1
->m
) && (a2
->f
|| a2
->m
);
266 static int uc_cshape(int cur
, int prev
, int next
)
269 int join_prev
, join_next
;
270 struct achar
*ac
= find_achar(c
);
271 if (!ac
) /* ignore non-Arabic characters */
273 join_prev
= can_join(prev
, c
);
274 join_next
= can_join(c
, next
);
275 if (join_prev
&& join_next
)
277 if (join_prev
&& !join_next
)
279 if (!join_prev
&& join_next
)
281 if (!join_prev
&& !join_next
)
282 c
= ac
->c
; /* some fonts do not have a glyph for ac->s */
287 * return nonzero for Arabic combining characters
289 * The standard Arabic diacritics:
298 * + 0x0653: madda above
299 * + 0x0654: hamza above
300 * + 0x0655: hamza below
301 * + 0x0670: superscript alef
static int uc_comb(int c)
{
	/* each test below is one class of Arabic combining characters */
	if (c == 0x0670)			/* superscript alef */
		return 1;
	if (c >= 0x064b && c <= 0x0655)		/* the standard diacritics */
		return 1;
	return c >= 0xfc5e && c <= 0xfc63;	/* shadda ligatures */
}
310 static void uc_cput(char *d
, int c
)
314 *d
++ = 0xf0 | (c
>> 18);
316 } else if (c
> 0x7ff) {
317 *d
++ = 0xe0 | (c
>> 12);
319 } else if (c
> 0x7f) {
320 *d
++ = 0xc0 | (c
>> 6);
326 *d
++ = 0x80 | ((c
>> (l
* 6)) & 0x3f);
/* shape the given Arabic character; returns a static buffer */
331 char *uc_shape(char *beg
, char *s
)
337 int curr
= uc_code(s
);
338 if (!curr
|| !UC_R2L(curr
))
342 r
= uc_beg(beg
, r
- 1);
343 if (!uc_comb(uc_code(r
))) {
351 if (!uc_comb(uc_code(r
))) {
356 uc_cput(out
, uc_cshape(curr
, prev
, next
));