2 * gutf8.c: UTF-8 conversion
5 * Atsushi Enomoto <atsushi@ximian.com>
7 * (C) 2006 Novell, Inc.
8 * Copyright 2012 Xamarin Inc
15 * Index into the table below with the first byte of a UTF-8 sequence to get
16 * the total number of bytes in that sequence (the first byte included).
18 * Note that *legal* UTF-8 values can't have 5 or 6 bytes. The table is left
19 * as-is for anyone who may want to do such conversion, which was allowed in
/* Byte length of a UTF-8 sequence, indexed by the sequence's first byte:
 * 0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
 * 0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 1. */
22 const guchar g_utf8_jump_table
[256] = {
23 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
24 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
25 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
26 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
27 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
28 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
29 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
30 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
/* Case-converts a UTF-8 string by way of UCS-4: @str is decoded with
 * g_utf8_to_ucs4_fast(), every element is mapped through
 * g_unichar_toupper() when @upper is true or g_unichar_tolower() otherwise,
 * and the result is re-encoded with g_ucs4_to_utf8(). */
34 utf8_case_conv (const gchar
*str
, gssize len
, gboolean upper
)
/* Decode to UCS-4; ulen is filled in by the call and is then used as the
 * element count of ustr below. */
40 ustr
= g_utf8_to_ucs4_fast (str
, (glong
) len
, &ulen
);
/* Convert each element in place. */
41 for (i
= 0; i
< ulen
; i
++)
42 ustr
[i
] = upper
? g_unichar_toupper (ustr
[i
]) : g_unichar_tolower (ustr
[i
]);
/* Re-encode as UTF-8; the three optional trailing arguments are passed as
 * NULL. */
43 utf8
= g_ucs4_to_utf8 (ustr
, ulen
, NULL
, NULL
, NULL
);
/* Upper-case variant: forwards @str and @len to utf8_case_conv() with the
 * upper flag set to TRUE and returns its result. */
50 g_utf8_strup (const gchar
*str
, gssize len
)
52 return utf8_case_conv (str
, len
, TRUE
);
/* Lower-case variant: forwards @str and @len to utf8_case_conv() with the
 * upper flag set to FALSE and returns its result. */
56 g_utf8_strdown (const gchar
*str
, gssize len
)
58 return utf8_case_conv (str
, len
, FALSE
);
/* Checks one UTF-8 sequence of @len bytes starting at @inptr. The trailing
 * bytes are examined from the end of the sequence backwards: ptr starts one
 * past the last byte and is pre-decremented before each read. */
62 utf8_validate (const unsigned char *inptr
, size_t len
)
64 const unsigned char *ptr
= inptr
+ len
;
67 /* Everything falls through when TRUE... */
/* Last byte of the sequence: test whether it lies outside the continuation
 * range 0x80..0xBF. */
72 if ((c
= (*--ptr
)) < 0x80 || c
> 0xBF)
/* Last byte is BE or BF, the byte before it is BF, and the byte before that
 * is 8F, 9F, AF or BF.
 * NOTE(review): this byte pattern matches the encodings of the U+nFFFE /
 * U+nFFFF noncharacters in the supplementary planes - confirm intent. */
75 if ((c
== 0xBF || c
== 0xBE) && ptr
[-1] == 0xBF) {
76 if (ptr
[-2] == 0x8F || ptr
[-2] == 0x9F ||
77 ptr
[-2] == 0xAF || ptr
[-2] == 0xBF)
/* Next byte towards the front: same 0x80..0xBF range test. */
81 if ((c
= (*--ptr
)) < 0x80 || c
> 0xBF)
/* And the byte before that: same 0x80..0xBF range test. */
84 if ((c
= (*--ptr
)) < 0x80 || c
> 0xBF)
87 /* no fall-through in this inner switch */
/* In the cases below c holds the most recently read byte and ptr[1] the byte
 * after it.
 * NOTE(review): the case labels look like lead-byte values, so the switch is
 * presumably on *inptr and c is the second byte - confirm.
 * Under that reading: E0 with second byte < A0 and F0 with second byte < 90
 * are overlong forms, ED with second byte > 9F encodes a surrogate, F4 with
 * second byte > 8F exceeds U+10FFFF, EF B7 90..AF is U+FDD0..U+FDEF and
 * EF BF BE/BF is U+FFFE/U+FFFF. */
89 case 0xE0: if (c
< 0xA0) return FALSE
; break;
90 case 0xED: if (c
> 0x9F) return FALSE
; break;
91 case 0xEF: if (c
== 0xB7 && (ptr
[1] > 0x8F && ptr
[1] < 0xB0)) return FALSE
;
92 if (c
== 0xBF && (ptr
[1] == 0xBE || ptr
[1] == 0xBF)) return FALSE
;
94 case 0xF0: if (c
< 0x90) return FALSE
; break;
95 case 0xF4: if (c
> 0x8F) return FALSE
; break;
96 default: if (c
< 0x80) return FALSE
; break;
/* First byte: values 0x80..0xC1 are rejected (continuation bytes, plus the
 * C0/C1 lead bytes that can only start overlong two-byte forms). */
98 case 1: if (*inptr
>= 0x80 && *inptr
< 0xC2) return FALSE
;
109 * @str: a utf-8 encoded string
110 * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
111 * @end: output parameter to mark the end of the valid input
113 * Checks @str for being valid UTF-8. @str is assumed to be
114 * null-terminated. This function is not super-strict, as it will
115 * allow longer UTF-8 sequences than necessary. Note that Java is
116 * capable of producing these sequences if provoked. Also note, this
117 * routine checks for the 4-byte maximum size, but does not check for
118 * 0x10ffff maximum value.
120 * Return value: %TRUE if @str is valid or %FALSE otherwise.
123 g_utf8_validate (const gchar
*str
, gssize max_len
, const gchar
**end
)
/* Work on unsigned bytes so they can index g_utf8_jump_table directly. */
125 guchar
*inptr
= (guchar
*) str
;
126 gboolean valid
= TRUE
;
/* NUL-terminated scan: look up the sequence length for the current lead byte
 * and hand exactly that many bytes to utf8_validate(). */
134 while (*inptr
!= 0) {
135 length
= g_utf8_jump_table
[*inptr
];
136 if (!utf8_validate (inptr
, length
)) {
/* Length-bounded scan: n is compared against max_len on every pass. */
144 while (n
< max_len
) {
146 /* Note: return FALSE if we encounter nul-byte
147 * before max_len is reached. */
152 length
= g_utf8_jump_table
[*inptr
];
/* Clamp to the bytes remaining within max_len, so utf8_validate() is never
 * given more than max_len - n bytes. */
153 min
= MIN (length
, max_len
- n
);
155 if (!utf8_validate (inptr
, min
)) {
/* Store the current scan position through @end. */
171 *end
= (gchar
*) inptr
;
/* Decodes the character at the start of @str, calling utf8_validate() on the
 * sequence before its continuation bytes are folded into the result. */
177 g_utf8_get_char_validated (const gchar
*str
, gssize max_len
)
179 unsigned char *inptr
= (unsigned char *) str
;
187 /* simple ascii case */
/* Classify the lead byte u by range, using the thresholds 0xc2, 0xe0, 0xf0,
 * 0xf8, 0xfc and 0xfe.
 * NOTE(review): each branch presumably sets the sequence length n and masks
 * the lead byte's payload bits - confirm. */
189 } else if (u
< 0xc2) {
191 } else if (u
< 0xe0) {
194 } else if (u
< 0xf0) {
197 } else if (u
< 0xf8) {
200 } else if (u
< 0xfc) {
203 } else if (u
< 0xfe) {
/* Validation limited to at most max_len bytes. */
211 if (!utf8_validate (inptr
, MIN (max_len
, n
)))
/* Validation of the full n-byte sequence. */
217 if (!utf8_validate (inptr
, n
))
/* Fold in the n - 1 continuation bytes: shift the accumulator left by six
 * bits and OR in the next byte XORed with 0x80, which clears the marker bit
 * of a 10xxxxxx byte. */
221 for (i
= 1; i
< n
; i
++)
222 u
= (u
<< 6) | (*++inptr
^ 0x80);
/* Walks @str one UTF-8 sequence at a time, using g_utf8_jump_table to find
 * the byte length of each sequence from its lead byte. */
228 g_utf8_strlen (const gchar
*str
, gssize max_len
)
230 const guchar
*inptr
= (const guchar
*) str
;
231 glong clen
= 0, len
= 0, n
;
/* Advance by the sequence length recorded for the current lead byte. */
238 inptr
+= g_utf8_jump_table
[*inptr
];
/* Bounded walk: continue while len is below max_len and the current byte is
 * not NUL. */
242 while (len
< max_len
&& *inptr
) {
243 n
= g_utf8_jump_table
[*inptr
];
/* Test whether the next n-byte sequence would push clen past max_len.
 * NOTE(review): clen presumably counts bytes consumed and len characters
 * counted - confirm. */
244 if ((clen
+ n
) > max_len
)
/* Decodes the character at the start of @src. No call to utf8_validate()
 * appears in this function. */
257 g_utf8_get_char (const gchar
*src
)
259 unsigned char *inptr
= (unsigned char *) src
;
264 /* simple ascii case */
/* Classify the lead byte u by range, using the thresholds 0xe0, 0xf0, 0xf8
 * and 0xfc.
 * NOTE(review): each branch presumably sets the sequence length n and masks
 * the lead byte's payload bits - confirm. */
266 } else if (u
< 0xe0) {
269 } else if (u
< 0xf0) {
272 } else if (u
< 0xf8) {
275 } else if (u
< 0xfc) {
/* Fold in the n - 1 continuation bytes: shift the accumulator left by six
 * bits and OR in the next byte XORed with 0x80. */
283 for (i
= 1; i
< n
; i
++)
284 u
= (u
<< 6) | (*++inptr
^ 0x80);
/* Tests the byte at @p to decide whether it begins a character, i.e. whether
 * it is anything other than a UTF-8 continuation byte. */
290 g_utf8_find_prev_char (const gchar
*str
, const gchar
*p
)
/* Continuation bytes have the bit pattern 10xxxxxx, so they satisfy
 * (b & 0xc0) == 0x80. Masking with 0xc0 can only yield 0x00, 0x40, 0x80 or
 * 0xc0, so the former comparison against 0xb0 was true for every byte. */
294 if ((*p
& 0xc0) != 0x80)
/* Starting from @str, loops for as long as the byte at p is a UTF-8
 * continuation byte, so the loop ends with p on the first byte of a
 * character. */
301 g_utf8_prev_char (const gchar
*str
)
303 const gchar
*p
= str
;
/* Continuation bytes have the bit pattern 10xxxxxx, so they satisfy
 * (b & 0xc0) == 0x80. Masking with 0xc0 can never yield 0xb0, so the former
 * comparison was always false and the loop ran exactly once, leaving p in
 * the middle of any multi-byte character. */
306 } while ((*p
& 0xc0) == 0x80);
/* Moves from @str by @offset characters: forwards with g_utf8_next_char()
 * when @offset is positive, and by jumping back and re-counting when it is
 * negative. */
312 g_utf8_offset_to_pointer (const gchar
*str
, glong offset
)
314 const gchar
*p
= str
;
/* Positive offset: step forward one character per pass. */
318 p
= g_utf8_next_char (p
);
320 } while (offset
> 0);
322 else if (offset
< 0) {
323 const gchar
*jump
= str
;
325 // since the minimum size of a character is 1
326 // we know we can step back at least offset bytes
327 jump
= jump
+ offset
;
329 // if we land in the middle of a character
330 // walk to the beginning
/* Continuation bytes have the bit pattern 10xxxxxx, so they satisfy
 * (b & 0xc0) == 0x80. Masking with 0xc0 can never yield 0xb0, so the former
 * comparison was always false and this loop never moved jump back to the
 * start of the character. */
331 while ((*jump
& 0xc0) == 0x80)
334 // count how many characters we've actually walked
338 p
= g_utf8_next_char (p
);
342 } while (offset
< 0);
/* Measures the distance between @str and @pos in characters by stepping
 * inptr forward with g_utf8_next_char() until it reaches inend. */
349 g_utf8_pointer_to_offset (const gchar
*str
, const gchar
*pos
)
351 const gchar
*inptr
, *inend
;
/* Advance one whole character per pass. */
368 inptr
= g_utf8_next_char (inptr
);
370 } while (inptr
< inend
);
/* NOTE(review): sign presumably makes the result negative when @pos lies
 * before @str - confirm. */
372 return offset
* sign
;