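/*
 * Provenance (from the repository web view this file was taken from):
 *   repository: midnight-commander.git
 *   path:       slang/slutf8.c
 *   blob:       d9323393f0f2c1b7e89e69d728a15d30e8febf25
 *   commit:     "Add $ and ` for escaping and reorder it according to the
 *               ascii values"
 */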
#include "slinclud.h"
#include <string.h>

#include "slang.h"
#include "_slang.h"
/* Maps the first byte of a UTF-8 sequence to the total number of bytes the
 * sequence claims to occupy.
 *
 *   0      : NUL and the continuation bytes 0x80-0xBF (never a valid lead)
 *   1      : ASCII, plus 0xFE and 0xFF which are consumed as single bytes
 *   2 .. 6 : multi-byte lead bytes 0xC0-0xFD
 *
 * 0xC0 and 0xC1 are listed with length 2 even though they can only start
 * overlong encodings; is_invalid_or_overlong_utf8 rejects them afterwards.
 * Callers treat any length <= 1 as "advance by one byte".
 */
static const unsigned char Len_Map[256] =
{
  0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 31 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 63 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 95 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 127 */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 159 */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 191 */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* - 223 */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1   /* - 255 */
};
/*
 * Also note that the code positions U+D800 to U+DFFF (UTF-16 surrogates)
 * as well as U+FFFE and U+FFFF must not occur in normal UTF-8 or UCS-4
 * data.  UTF-8 decoders should treat them like malformed or overlong
 * sequences for safety reasons.
 *
 * The argument is fully parenthesized so that expressions such as
 * `cond ? a : b' are classified correctly.  It is still evaluated more
 * than once, so it must not have side effects.
 */
#define IS_ILLEGAL_UNICODE(w) \
   ((((w) >= 0xD800) && ((w) <= 0xDFFF)) || ((w) == 0xFFFE) || ((w) == 0xFFFF))
28 _INLINE_
29 static int is_invalid_or_overlong_utf8 (SLuchar_Type *u, unsigned int len)
31 unsigned int i;
32 unsigned char ch, ch1;
34 /* Check for invalid sequences */
35 for (i = 1; i < len; i++)
37 if ((u[i] & 0xC0) != 0x80)
38 return 1;
41 /* Illegal (overlong) sequences */
42 /* 1100000x (10xxxxxx) */
43 /* 11100000 100xxxxx (10xxxxxx) */
44 /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
45 /* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
46 /* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */
47 ch = *u;
48 if ((ch == 0xC0) || (ch == 0xC1))
49 return 1;
51 ch1 = u[1];
52 if (((ch1 & ch) == 0x80)
53 && ((ch == 0xE0)
54 || (ch == 0xF0)
55 || (ch == 0xF8)
56 || (ch == 0xFC)))
57 return 1;
59 if (len == 3)
61 /* D800 is encoded as 0xED 0xA0 0x80 and DFFF as 0xED 0xBF 0xBF */
62 if ((ch == 0xED)
63 && ((ch1 >= 0xA0) && (ch1 <= 0xBF))
64 && (u[2] >= 0x80) && (u[2] <= 0xBF))
65 return 1;
66 /* Now FFFE and FFFF */
67 if ((ch == 0xEF)
68 && (ch1 == 0xBF)
69 && ((u[2] == 0xBE) || (u[2] == 0xBF)))
70 return 1;
72 return 0;
75 /* This function assumes that the necessary checks have been made to ensure
76 * a valid UTF-8 encoded character is present.
78 _INLINE_
79 static SLwchar_Type fast_utf8_decode (SLuchar_Type *u, unsigned int len)
81 static unsigned char masks[7] =
83 0, 0, 0x1F, 0xF, 0x7, 0x3, 0x1
85 SLuchar_Type *umax;
86 SLwchar_Type w;
88 w = (*u & masks[len]);
89 umax = u + len;
90 u++;
91 while (u < umax)
93 w = (w << 6)| (u[0] & 0x3F);
94 u++;
96 return w;
99 unsigned char *SLutf8_skip_char (unsigned char *s, unsigned char *smax)
101 unsigned int len;
103 if (s >= smax)
104 return s;
106 len = Len_Map[*s];
107 if (len <= 1)
108 return s+1;
110 if (s + len > smax)
111 return s+1;
113 if (is_invalid_or_overlong_utf8 (s, len))
114 return s + 1;
116 return s + len;
119 SLuchar_Type *SLutf8_skip_chars (SLuchar_Type *s, SLuchar_Type *smax,
120 unsigned int num, unsigned int *dnum,
121 int ignore_combining)
123 unsigned int n;
125 n = 0;
126 while ((n < num) && (s < smax))
128 unsigned int len = Len_Map[*s];
130 if (len <= 1)
132 n++;
133 s++;
134 continue;
137 if (s + len > smax)
139 s++;
140 n++;
141 continue;
144 if (is_invalid_or_overlong_utf8 (s, len))
146 s++;
147 n++;
148 continue;
151 if (ignore_combining)
153 SLwchar_Type w = fast_utf8_decode (s, len);
154 if (0 != SLwchar_wcwidth (w))
155 n++;
156 s += len;
157 continue;
160 n++;
161 s += len;
164 if (ignore_combining)
166 while (s < smax)
168 SLwchar_Type w;
169 unsigned int nconsumed;
170 if (NULL == SLutf8_decode (s, smax, &w, &nconsumed))
171 break;
173 if (0 != SLwchar_wcwidth (w))
174 break;
176 s += nconsumed;
180 if (dnum != NULL)
181 *dnum = n;
182 return s;
186 SLuchar_Type *SLutf8_bskip_chars (SLuchar_Type *smin, SLuchar_Type *s,
187 unsigned int num, unsigned int *dnum,
188 int ignore_combining)
190 unsigned int n;
191 SLuchar_Type *smax = s;
193 n = 0;
194 while ((n < num) && (s > smin))
196 unsigned char ch;
197 unsigned int dn;
199 s--;
200 ch = *s;
201 if (ch < 0x80)
203 n++;
204 smax = s;
205 continue;
208 dn = 0;
209 while ((s != smin)
210 && (Len_Map[ch] == 0)
211 && (dn < SLUTF8_MAX_MBLEN))
213 s--;
214 ch = *s;
215 dn++;
218 if (ch <= 0xBF)
220 /* Invalid sequence */
221 n++;
222 smax--;
223 s = smax;
224 continue;
227 if (ch > 0xBF)
229 SLwchar_Type w;
230 SLuchar_Type *s1;
232 if ((NULL == (s1 = SLutf8_decode (s, smax, &w, NULL)))
233 || (s1 != smax))
235 /* This means we backed up over an invalid sequence */
236 dn = (unsigned int) (smax - s);
237 n++;
238 smax--;
239 s = smax;
240 continue;
243 if ((ignore_combining == 0)
244 || (0 != SLwchar_wcwidth (w)))
245 n++;
247 smax = s;
251 if (dnum != NULL)
252 *dnum = n;
253 return s;
256 SLuchar_Type *SLutf8_bskip_char (SLuchar_Type *smin, SLuchar_Type *s)
258 if (s > smin)
260 unsigned int dn;
262 s--;
263 if (*s >= 0x80)
264 s = SLutf8_bskip_chars (smin, s+1, 1, &dn, 0);
266 return s;
270 /* This function counts the number of wide characters in a UTF-8 encoded
271 * string. Each byte in an invalid sequence is counted as a single character.
272 * If the string contains illegal values, the bytes making up the character is
273 * counted as 1 character.
275 unsigned int SLutf8_strlen (SLuchar_Type *s, int ignore_combining)
277 unsigned int count, len;
279 if (s == NULL)
280 return 0;
282 len = strlen ((char *)s);
283 (void) SLutf8_skip_chars (s, s + len, len, &count, ignore_combining);
284 return count;
289 * This function returns NULL if the input does not correspond to a valid
290 * UTF-8 sequence, otherwise, it returns the position of the next character
291 * in the sequence.
293 unsigned char *SLutf8_decode (unsigned char *u, unsigned char *umax,
294 SLwchar_Type *wp, unsigned int *nconsumedp)
296 unsigned int len;
297 unsigned char ch;
298 SLwchar_Type w;
300 if (u >= umax)
302 *wp = 0;
303 if (nconsumedp != NULL)
304 *nconsumedp = 0;
305 return NULL;
308 *wp = ch = *u;
309 if (ch < 0x80)
311 if (nconsumedp != NULL) *nconsumedp = 1;
312 return u+1;
315 len = Len_Map[ch];
316 if (len < 2)
318 /* should not happen--- code here for completeness */
319 if (nconsumedp != NULL) *nconsumedp = 1;
320 return NULL;
322 if (u + len > umax)
324 if (nconsumedp != NULL) *nconsumedp = 1; /* (unsigned int) (umax - u); */
325 return NULL;
328 if (is_invalid_or_overlong_utf8 (u, len))
330 if (nconsumedp != NULL)
331 *nconsumedp = 1;
333 return NULL;
336 if (nconsumedp != NULL)
337 *nconsumedp = len;
339 *wp = w = fast_utf8_decode (u, len);
341 if (IS_ILLEGAL_UNICODE(w))
342 return NULL;
344 return u + len;
348 /* Encode the wide character returning a pointer to the end of the
349 * utf8 of the encoded multi-byte character. This function will also encode
350 * illegal unicode values. It returns NULL if buflen is too small.
351 * Otherwise, it returns a pointer at the end of the last encoded byte.
352 * It does not null terminate the encoded string.
354 SLuchar_Type *SLutf8_encode (SLwchar_Type w, SLuchar_Type *u, unsigned int ulen)
356 SLuchar_Type *umax = u + ulen;
358 /* U-00000000 - U-0000007F: 0xxxxxxx */
359 if (w <= 0x7F)
361 if (u >= umax)
362 return NULL;
364 *u++ = (unsigned char) w;
365 return u;
368 /* U-00000080 - U-000007FF: 110xxxxx 10xxxxxx */
369 if (w <= 0x7FF)
371 if ((u + 1) >= umax)
372 return NULL;
374 *u++ = (w >> 6) | 0xC0;
375 *u++ = (w & 0x3F) | 0x80;
376 return u;
379 /* First bad character starts at 0xD800 */
381 /* Allow illegal values to be encoded */
384 *if (IS_ILLEGAL_UNICODE(w))
385 * return NULL;
388 /* U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
389 if (w <= 0xFFFF)
391 if (u+2 >= umax)
392 return NULL;
393 *u++ = (w >> 12 ) | 0xE0;
394 goto finish_2;
397 /* U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
398 if (w <= 0x1FFFFF)
400 if (u+3 >= umax)
401 return NULL;
402 *u++ = (w >> 18) | 0xF0;
403 goto finish_3;
406 /* U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
407 if (w <= 0x3FFFFFF)
409 if (u+4 >= umax)
410 return NULL;
411 *u++ = (w >> 24) | 0xF8;
412 goto finish_4;
415 /* U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
416 if (w <= 0x7FFFFFFF)
418 if (u+5 >= umax)
419 return NULL;
420 *u++ = (w >> 30) | 0xFC;
421 goto finish_5;
424 /* unreached?? */
425 return NULL;
427 finish_5: *u++ = ((w >> 24) & 0x3F)|0x80;
428 finish_4: *u++ = ((w >> 18) & 0x3F)|0x80;
429 finish_3: *u++ = ((w >> 12) & 0x3F)|0x80;
430 finish_2: *u++ = ((w >> 6) & 0x3F)|0x80;
431 *u++ = (w & 0x3F)|0x80;
433 return u;
436 /* Like SLutf8_encode, but null terminates the result.
437 * At least SLUTF8_MAX_MBLEN+1 bytes assumed.
439 SLuchar_Type *SLutf8_encode_null_terminate (SLwchar_Type w, SLuchar_Type *u)
441 SLuchar_Type *p;
443 p = SLutf8_encode (w, u, SLUTF8_MAX_MBLEN);
444 if (p != NULL)
445 *p = 0;
446 return p;
#if 0
/* NOTE: everything up to the matching #endif is disabled and not compiled. */

/* Decode the UTF-8 data in [u, umax) into single bytes stored at b, null
 * terminating the result and setting *np to the number of bytes stored.
 * Characters above 0xFF are truncated to their low 8 bits.  Returns 0 upon
 * success, or -1 if a sequence fails to decode.
 */
int SLutf8_decode_bytes (SLuchar_Type *u, SLuchar_Type *umax,
                         unsigned char *b, unsigned int *np)
{
   unsigned char *bmax;

   bmax = b;
   while (u < umax)
     {
        SLwchar_Type w;

        if (0 == (*u & 0x80))
          {
             *bmax++ = *u++;
             continue;
          }

        if (NULL == (u = SLutf8_decode (u, umax, &w, NULL)))
          return -1;                   /* FIXME: HANDLE ERROR */

        if (w > 0xFF)
          {
#if 0
             sprintf (bmax, "<U+%04X>", w);
             bmax += strlen (bmax);
             continue;
#endif
             /* FIXME: HANDLE ERROR */
             w = w & 0xFF;
          }

        *bmax++ = w;
     }
   *np = bmax - b;
   *bmax = 0;
   return 0;
}

/* UTF-8 Encode the bytes between b and bmax storing the results in the
 * buffer defined by u and umax, returning the position following the
 * last encoded character.  Upon return, *np is set to the number of bytes
 * successfully encoded.  The output is null terminated only if room
 * remains for the terminator.
 */
SLuchar_Type *SLutf8_encode_bytes (unsigned char *b, unsigned char *bmax,
                                   SLuchar_Type *u, unsigned int ulen,
                                   unsigned int *np)
{
   unsigned char *bstart = b;
   SLuchar_Type *umax = u + ulen;

   while (b < bmax)
     {
        SLuchar_Type *u1;

        if (0 == (*b & 0x80))
          {
             if (u >= umax)
               break;

             *u++ = *b++;
             continue;
          }

        if (NULL == (u1 = SLutf8_encode (*b, u, umax - u)))
          break;
        u = u1;
        b++;
     }

   *np = b - bstart;
   if (u < umax)
     *u = 0;

   return u;
}
#endif
526 static SLuchar_Type *xform_utf8 (SLuchar_Type *u, SLuchar_Type *umax,
527 SLwchar_Type (*fun)(SLwchar_Type))
529 SLuchar_Type *buf, *p;
530 unsigned int malloced_len, len;
532 if (umax < u)
533 return NULL;
535 len = 0;
536 p = buf = NULL;
537 malloced_len = 0;
539 while (1)
541 SLwchar_Type w;
542 SLuchar_Type *u1;
543 unsigned int nconsumed;
545 if (malloced_len <= len + SLUTF8_MAX_MBLEN)
547 SLuchar_Type *newbuf;
548 malloced_len += 1 + (umax - u) + SLUTF8_MAX_MBLEN;
550 newbuf = (SLuchar_Type *)SLrealloc ((char *)buf, malloced_len);
551 if (newbuf == NULL)
553 SLfree ((char *)buf);
554 return NULL;
556 buf = newbuf;
557 p = buf + len;
560 if (u >= umax)
562 *p = 0;
563 p = (SLuchar_Type *) SLang_create_nslstring ((char *)buf, len);
564 SLfree ((char *)buf);
565 return p;
568 if (NULL == (u1 = SLutf8_decode (u, umax, &w, &nconsumed)))
570 /* Invalid sequence */
571 memcpy ((char *) p, u, nconsumed);
572 p += nconsumed;
573 len += nconsumed;
574 u1 = u + nconsumed;
576 else
578 SLuchar_Type *p1;
580 p1 = SLutf8_encode ((*fun)(w), p, malloced_len);
581 if (p1 == NULL)
583 SLfree ((char *)buf);
584 SLang_verror (SL_INTERNAL_ERROR, "SLutf8_encode returned NULL");
585 return NULL;
587 len += p1 - p;
588 p = p1;
591 u = u1;
595 /* Returned an uppercased version of an UTF-8 encoded string. Illegal or
596 * invalid sequences will be returned as-is. This function returns
597 * an SLstring.
599 SLuchar_Type *SLutf8_strup (SLuchar_Type *u, SLuchar_Type *umax)
601 return xform_utf8 (u, umax, SLwchar_toupper);
604 /* Returned an lowercased version of an UTF-8 encoded string. Illegal or
605 * invalid sequences will be returned as-is. This function returns
606 * an SLstring.
608 SLuchar_Type *SLutf8_strlo (SLuchar_Type *u, SLuchar_Type *umax)
610 return xform_utf8 (u, umax, SLwchar_tolower);
613 int SLutf8_compare (SLuchar_Type *a, SLuchar_Type *amax,
614 SLuchar_Type *b, SLuchar_Type *bmax,
615 unsigned int nchars,
616 int cs)
618 while (nchars && (a < amax) && (b < bmax))
620 SLwchar_Type cha, chb;
621 unsigned int na, nb;
622 int aok, bok;
624 if (*a < 0x80)
626 cha = (SLwchar_Type) *a++;
627 aok = 1;
629 else
631 aok = (NULL != SLutf8_decode (a, amax, &cha, &na));
632 a += na;
635 if (*b < 0x80)
637 chb = (SLwchar_Type) *b++;
638 bok = 1;
640 else
642 bok = (NULL != SLutf8_decode (b, bmax, &chb, &nb));
643 b += nb;
646 nchars--;
648 if (aok && bok)
650 if (cs == 0)
652 cha = SLwchar_toupper (cha);
653 chb = SLwchar_toupper (chb);
656 else if (aok)
657 return 1;
658 else if (bok)
659 return -1;
661 if (cha == chb)
662 continue;
664 if (cha > chb)
665 return 1;
667 return -1;
670 if (nchars == 0)
671 return 0;
673 if ((a >= amax) && (b >= bmax))
674 return 0;
676 if (b >= bmax)
677 return 1;
679 return -1;
683 /* Returns an SLstring */
684 SLstr_Type *SLutf8_subst_wchar (SLuchar_Type *u, SLuchar_Type *umax,
685 SLwchar_Type wch, unsigned int pos,
686 int ignore_combining)
688 SLuchar_Type *a, *a1, *b;
689 unsigned int dpos;
690 SLuchar_Type buf[SLUTF8_MAX_MBLEN+1];
691 SLstr_Type *c;
692 unsigned int n1, n2, n3, len;
694 a = SLutf8_skip_chars (u, umax, pos, &dpos, ignore_combining);
696 if ((dpos != pos) || (a == umax))
698 SLang_verror (SL_INDEX_ERROR, "Specified character position is invalid for string");
699 return NULL;
702 a1 = SLutf8_skip_chars (a, umax, 1, NULL, ignore_combining);
704 b = SLutf8_encode (wch, buf, SLUTF8_MAX_MBLEN);
705 if (b == NULL)
707 SLang_verror (SL_UNICODE_ERROR, "Unable to encode wchar 0x%lX", (unsigned long)wch);
708 return NULL;
711 n1 = (a-u);
712 n2 = (b-buf);
713 n3 = (umax-a1);
714 len = n1 + n2 + n3;
715 c = _pSLallocate_slstring (len);
716 if (c == NULL)
717 return NULL;
719 memcpy (c, (char *)u, n1);
720 memcpy (c+n1, (char *)buf, n2);
721 memcpy (c+n1+n2, (char *)a1, n3);
722 c[len] = 0;
724 /* No need to worry about this failing-- it frees its argument */
725 return _pSLcreate_via_alloced_slstring (c, len);
729 /* utf8 buffer assumed to be at least SLUTF8_MAX_MBLEN+1 bytes. Result will be
730 * null terminated. Returns position of NEXT character.
731 * Analogous to: *p++
733 SLuchar_Type *SLutf8_extract_utf8_char (SLuchar_Type *u,
734 SLuchar_Type *umax,
735 SLuchar_Type *utf8)
737 SLuchar_Type *u1;
739 u1 = SLutf8_skip_char (u, umax);
740 memcpy ((char *)utf8, u, u1-u);
741 utf8[u1-u] = 0;
743 return u1;
748 /* These routines depend upon the value of the _pSLinterp_UTF8_Mode variable.
749 * They also generate slang errors upon error.
751 SLuchar_Type *_pSLinterp_decode_wchar (SLuchar_Type *u,
752 SLuchar_Type *umax,
753 SLwchar_Type *chp)
755 if (_pSLinterp_UTF8_Mode == 0)
757 if (u < umax)
758 *chp = (SLwchar_Type) *u++;
759 return u;
762 if (NULL == (u = SLutf8_decode (u, umax, chp, NULL)))
763 SLang_verror (SL_INVALID_UTF8, "Invalid UTF-8 encoded string");
765 return u;
768 /* At least SLUTF8_MAX_MBLEN+1 bytes assumed-- null terminates result.
769 * Upon success, it returns a pointer to the _end_ of the encoded character
771 SLuchar_Type *_pSLinterp_encode_wchar (SLwchar_Type wch, SLuchar_Type *u, unsigned int *encoded_len)
773 SLuchar_Type *u1;
775 if (_pSLinterp_UTF8_Mode == 0)
777 *encoded_len = 1;
778 *u++ = (SLuchar_Type) wch;
779 *u++ = 0;
780 return u;
783 if (NULL == (u1 = SLutf8_encode_null_terminate (wch, u)))
785 SLang_verror (SL_UNICODE_ERROR, "Unable to encode character 0x%lX", (unsigned long)wch);
786 return NULL;
789 *encoded_len = (unsigned int) (u1 - u);
790 return u1;
#ifdef REGRESSION
/* Regression driver, compiled only when REGRESSION is defined.  Each string
 * of long_tests is decoded character by character and the decoded values
 * are printed to stdout.  A decoding failure is reported on stderr and
 * ends the processing of that string.
 *
 * The test strings are written with explicit byte escapes so that they do
 * not depend on the encoding in which this file happens to be stored.
 */
int main (int argc, char **argv)
{
   unsigned char *s, *smax;
   char **t;
   /* Boundary values that are expected to decode.
    * NOTE(review): the second entry was not legible in the copy this was
    * restored from; U+E000 is assumed -- confirm against the upstream file.
    */
   char *ok_tests [] =
     {
        "\xED\x9F\xBF",                /* U+D7FF */
        "\xEE\x80\x80",                /* U+E000 (assumed, see above) */
        "\xEF\xBF\xBD",                /* U+FFFD */
        "\xF4\x8F\xBF\xBF",            /* U+10FFFF */
        NULL
     };
   /* Overlong encodings of '/' (0x2F) using 2 to 6 bytes */
   char *long_tests [] =
     {
        "\xC0\xAF",
        "\xE0\x80\xAF",
        "\xF0\x80\x80\xAF",
        "\xF8\x80\x80\x80\xAF",
        "\xFC\x80\x80\x80\x80\xAF",
        NULL
     };

   (void) argc;
   (void) argv;
   (void) ok_tests;                    /* currently not exercised */

   t = long_tests;
   while ((s = (unsigned char *) *t++) != NULL)
     {
        smax = s + strlen ((char *)s);

        while (s < smax)
          {
             SLwchar_Type w;

             if (NULL == (s = SLutf8_decode (s, smax, &w, NULL)))
               {
                  fprintf (stderr, "SLutf8_decode failed\n");
                  break;
               }
             if (w == 0)
               break;
             fprintf (stdout, " 0x%lX", (unsigned long) w);
          }

        fprintf (stdout, "\n");
     }
   return 0;
}
#endif