flac: Saner EOF handling
[cmus.git] / uchar.c
blob5a99c57145872d5877cb6561c76d5a289e83771f
1 /*
2 * Copyright 2004-2005 Timo Hirvonen
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 * 02111-1307, USA.
20 #include "uchar.h"
21 #include "compiler.h"
23 #include <stdlib.h>
24 #include <string.h>
25 #include <wctype.h>
26 #include <ctype.h>
/*
 * Lower-case hexadecimal digits indexed by nibble value (0..15).
 * The array holds exactly 16 characters and is NOT NUL-terminated.
 */
const char hex_tab[16] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
/*
 * Byte Sequence                                     Min (binary)   Min (hex)  Max (hex)
 * -------------------------------------------------------------------------------------
 * 0xxxxxxx                                               0000000   0x00000    0x00007f
 * 110xxxxx 10xxxxxx                                 000 10000000   0x00080    0x0007ff
 * 1110xxxx 10xxxxxx 10xxxxxx                   00001000 00000000   0x00800    0x00ffff
 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx    00001 00000000 00000000   0x10000    0x10ffff (not 0x1fffff)
 *
 * max: 100 001111 111111 111111 (0x10ffff)
 */
/*
 * Length of a UTF-8 byte sequence, indexed by the first byte of the sequence.
 *
 *    1..4  total number of bytes in the sequence
 *    0     continuation byte (10xxxxxx), not valid as a first byte
 *   -1     byte that can never appear in UTF-8
 */
#define LEN_ROW16(v) v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
static const signed char len_tab[256] = {
	/* 0-127 0xxxxxxx */
	LEN_ROW16(1), LEN_ROW16(1), LEN_ROW16(1), LEN_ROW16(1),
	LEN_ROW16(1), LEN_ROW16(1), LEN_ROW16(1), LEN_ROW16(1),

	/* 128-191 10xxxxxx (invalid first byte) */
	LEN_ROW16(0), LEN_ROW16(0), LEN_ROW16(0), LEN_ROW16(0),

	/* 192-223 110xxxxx */
	LEN_ROW16(2), LEN_ROW16(2),

	/* 224-239 1110xxxx */
	LEN_ROW16(3),

	/* 240-244 11110xxx (000 - 100) */
	4, 4, 4, 4, 4,

	/* 245-247 11110xxx (101 - 111) (always invalid) */
	-1, -1, -1,

	/* 248-255 11111xxx (always invalid) */
	-1, -1, -1, -1, -1, -1, -1, -1
};
#undef LEN_ROW16
/*
 * Smallest and largest code point that may be encoded with a sequence of
 * a given length.  Index is (length of the UTF-8 sequence - 1).
 * The tables are read-only, hence const.
 */
static const int min_val[4] = { 0x000000, 0x000080, 0x000800, 0x010000 };
static const int max_val[4] = { 0x00007f, 0x0007ff, 0x00ffff, 0x10ffff };

/*
 * Mask that extracts the value bits from the first byte of a sequence.
 * Index is (length of the UTF-8 sequence - 1).
 */
static const unsigned int first_byte_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
85 int u_is_valid(const char *str)
87 const unsigned char *s = (const unsigned char *)str;
88 int i = 0;
90 while (s[i]) {
91 unsigned char ch = s[i++];
92 int len = len_tab[ch];
94 if (len <= 0)
95 return 0;
97 if (len > 1) {
98 /* len - 1 10xxxxxx bytes */
99 uchar u;
100 int c;
102 len--;
103 u = ch & first_byte_mask[len];
104 c = len;
105 do {
106 ch = s[i++];
107 if (len_tab[ch] != 0)
108 return 0;
109 u = (u << 6) | (ch & 0x3f);
110 } while (--c);
112 if (u < min_val[len] || u > max_val[len])
113 return 0;
116 return 1;
119 int u_strlen(const char *str)
121 const unsigned char *s = (const unsigned char *)str;
122 int len = 0;
124 while (*s) {
125 int l = len_tab[*s];
127 if (unlikely(l > 1)) {
128 /* next l - 1 bytes must be 0x10xxxxxx */
129 int c = 1;
130 do {
131 if (len_tab[s[c]] != 0) {
132 /* invalid sequence */
133 goto single_char;
135 c++;
136 } while (c < l);
138 /* valid sequence */
139 s += l;
140 len++;
141 continue;
143 single_char:
144 /* l is -1, 0 or 1
145 * invalid chars counted as single characters */
146 s++;
147 len++;
149 return len;
152 int u_char_width(uchar u)
154 if (unlikely(u < 0x20))
155 goto control;
157 if (u < 0x1100U)
158 goto narrow;
160 /* Hangul Jamo init. consonants */
161 if (u <= 0x115fU)
162 goto wide;
164 /* angle brackets */
165 if (u == 0x2329U || u == 0x232aU)
166 goto wide;
168 if (u < 0x2e80U)
169 goto narrow;
170 /* CJK ... Yi */
171 if (u < 0x302aU)
172 goto wide;
173 if (u <= 0x302fU)
174 goto narrow;
175 if (u == 0x303fU)
176 goto narrow;
177 if (u == 0x3099U)
178 goto narrow;
179 if (u == 0x309aU)
180 goto narrow;
181 /* CJK ... Yi */
182 if (u <= 0xa4cfU)
183 goto wide;
185 /* Hangul Syllables */
186 if (u >= 0xac00U && u <= 0xd7a3U)
187 goto wide;
189 /* CJK Compatibility Ideographs */
190 if (u >= 0xf900U && u <= 0xfaffU)
191 goto wide;
193 /* CJK Compatibility Forms */
194 if (u >= 0xfe30U && u <= 0xfe6fU)
195 goto wide;
197 /* Fullwidth Forms */
198 if (u >= 0xff00U && u <= 0xff60U)
199 goto wide;
201 /* Fullwidth Forms */
202 if (u >= 0xffe0U && u <= 0xffe6U)
203 goto wide;
205 /* CJK extra stuff */
206 if (u >= 0x20000U && u <= 0x2fffdU)
207 goto wide;
209 /* ? */
210 if (u >= 0x30000U && u <= 0x3fffdU)
211 goto wide;
213 /* invalid bytes in unicode stream are rendered "<xx>" */
214 if (u & U_INVALID_MASK)
215 goto invalid;
216 narrow:
217 return 1;
218 wide:
219 return 2;
220 control:
221 /* special case */
222 if (u == 0)
223 return 1;
225 /* print control chars as <xx> */
226 invalid:
227 return 4;
230 int u_str_width(const char *str)
232 int idx = 0, w = 0;
234 while (str[idx]) {
235 uchar u;
237 u_get_char(str, &idx, &u);
238 w += u_char_width(u);
240 return w;
243 int u_str_nwidth(const char *str, int len)
245 int idx = 0;
246 int w = 0;
247 uchar u;
249 while (len > 0) {
250 u_get_char(str, &idx, &u);
251 if (u == 0)
252 break;
253 w += u_char_width(u);
254 len--;
256 return w;
259 void u_prev_char_pos(const char *str, int *idx)
261 const unsigned char *s = (const unsigned char *)str;
262 int c, len, i = *idx;
263 uchar ch;
265 ch = s[--i];
266 len = len_tab[ch];
267 if (len != 0) {
268 /* start of byte sequence or invelid uchar */
269 goto one;
272 c = 1;
273 while (1) {
274 if (i == 0) {
275 /* first byte of the sequence is missing */
276 break;
279 ch = s[--i];
280 len = len_tab[ch];
281 c++;
283 if (len == 0) {
284 if (c < 4)
285 continue;
287 /* too long sequence */
288 break;
290 if (len != c) {
291 /* incorrect length */
292 break;
295 /* ok */
296 *idx = i;
297 return;
299 one:
300 *idx = *idx - 1;
301 return;
304 void u_get_char(const char *str, int *idx, uchar *uch)
306 const unsigned char *s = (const unsigned char *)str;
307 int len, i = *idx;
308 uchar ch, u;
310 ch = s[i++];
311 len = len_tab[ch];
312 if (unlikely(len < 1))
313 goto invalid;
315 len--;
316 u = ch & first_byte_mask[len];
317 while (len > 0) {
318 ch = s[i++];
319 if (unlikely(len_tab[ch] != 0))
320 goto invalid;
321 u = (u << 6) | (ch & 0x3f);
322 len--;
324 *idx = i;
325 *uch = u;
326 return;
327 invalid:
328 i = *idx;
329 u = s[i++];
330 *uch = u | U_INVALID_MASK;
331 *idx = i;
334 void u_set_char_raw(char *str, int *idx, uchar uch)
336 int i = *idx;
338 if (uch <= 0x0000007fU) {
339 str[i++] = uch;
340 *idx = i;
341 } else if (uch <= 0x000007ffU) {
342 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
343 str[i + 0] = uch | 0x000000c0U;
344 i += 2;
345 *idx = i;
346 } else if (uch <= 0x0000ffffU) {
347 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
348 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
349 str[i + 0] = uch | 0x000000e0U;
350 i += 3;
351 *idx = i;
352 } else if (uch <= 0x0010ffffU) {
353 str[i + 3] = (uch & 63) | 0x80; uch >>= 6;
354 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
355 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
356 str[i + 0] = uch | 0x000000f0U;
357 i += 4;
358 *idx = i;
359 } else {
360 /* must be an invalid uchar */
361 str[i++] = uch & 0xff;
362 *idx = i;
/*
 * Printing functions, these lose information
 */
370 void u_set_char(char *str, int *idx, uchar uch)
372 int i = *idx;
374 if (unlikely(uch <= 0x0000001fU))
375 goto invalid;
377 if (uch <= 0x0000007fU) {
378 str[i++] = uch;
379 *idx = i;
380 return;
381 } else if (uch <= 0x000007ffU) {
382 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
383 str[i + 0] = uch | 0x000000c0U;
384 i += 2;
385 *idx = i;
386 return;
387 } else if (uch <= 0x0000ffffU) {
388 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
389 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
390 str[i + 0] = uch | 0x000000e0U;
391 i += 3;
392 *idx = i;
393 return;
394 } else if (uch <= 0x0010ffffU) {
395 str[i + 3] = (uch & 63) | 0x80; uch >>= 6;
396 str[i + 2] = (uch & 63) | 0x80; uch >>= 6;
397 str[i + 1] = (uch & 63) | 0x80; uch >>= 6;
398 str[i + 0] = uch | 0x000000f0U;
399 i += 4;
400 *idx = i;
401 return;
403 invalid:
404 /* control character or invalid unicode */
405 if (uch == 0) {
406 /* handle this special case here to make the common case fast */
407 str[i++] = 0;
408 *idx = i;
409 } else {
410 str[i++] = '<';
411 str[i++] = hex_tab[(uch >> 4) & 0xf];
412 str[i++] = hex_tab[uch & 0xf];
413 str[i++] = '>';
414 *idx = i;
418 int u_copy_chars(char *dst, const char *src, int *width)
420 int w = *width;
421 int si = 0, di = 0;
422 int cw;
423 uchar u;
425 while (w > 0) {
426 u_get_char(src, &si, &u);
427 if (u == 0)
428 break;
430 cw = u_char_width(u);
431 w -= cw;
433 if (unlikely(w < 0)) {
434 if (cw == 2)
435 dst[di++] = ' ';
436 if (cw == 4) {
437 dst[di++] = '<';
438 if (w >= -2)
439 dst[di++] = hex_tab[(u >> 4) & 0xf];
440 if (w >= -1)
441 dst[di++] = hex_tab[u & 0xf];
443 w = 0;
444 break;
446 u_set_char(dst, &di, u);
448 *width -= w;
449 return di;
452 int u_skip_chars(const char *str, int *width)
454 int w = *width;
455 int idx = 0;
457 while (w > 0) {
458 uchar u;
460 u_get_char(str, &idx, &u);
461 w -= u_char_width(u);
463 /* add 1..3 if skipped 'too much' (the last char was double width or invalid (<xx>)) */
464 *width -= w;
465 return idx;
/*
 * Comparison functions
 */
/*
 * Case-insensitive comparison of two characters.
 * Returns the difference of their towupper() mappings: zero if they are
 * equal ignoring case, non-zero otherwise.
 */
static inline int chcasecmp(int a, int b)
{
	const wint_t upper_a = towupper(a);
	const wint_t upper_b = towupper(b);

	return upper_a - upper_b;
}
477 int u_strcasecmp(const char *a, const char *b)
479 int ai = 0;
480 int bi = 0;
481 int res;
483 do {
484 uchar au, bu;
486 u_get_char(a, &ai, &au);
487 u_get_char(b, &bi, &bu);
488 res = chcasecmp(au, bu);
489 if (res)
490 break;
491 if (au == 0) {
492 /* bu is 0 too */
493 break;
495 } while (1);
496 return res;
499 int u_strncasecmp(const char *a, const char *b, int len)
501 int ai = 0;
502 int bi = 0;
504 while (len > 0) {
505 uchar au, bu;
506 int res;
508 u_get_char(a, &ai, &au);
509 u_get_char(b, &bi, &bu);
510 res = chcasecmp(au, bu);
511 if (res)
512 return res;
513 if (au == 0) {
514 /* bu is 0 too */
515 return 0;
517 len--;
519 return 0;
522 char *u_strcasestr(const char *haystack, const char *needle)
524 /* strlen is faster and works here */
525 int haystack_len = strlen(haystack);
526 int needle_len = u_strlen(needle);
528 do {
529 uchar u;
530 int idx;
532 if (haystack_len < needle_len)
533 return NULL;
534 if (u_strncasecmp(needle, haystack, needle_len) == 0)
535 return (char *)haystack;
537 /* skip one char */
538 idx = 0;
539 u_get_char(haystack, &idx, &u);
540 haystack += idx;
541 haystack_len -= idx;
542 } while (1);