2 * gunicode.c: Some Unicode routines
5 * Miguel de Icaza (miguel@novell.com)
7 * (C) 2006 Novell, Inc.
9 * utf8 validation code came from:
10 * libxml2-2.6.26 licensed under the MIT X11 license
12 * Authors credit in libxml's string.c:
13 * William Brack <wbrack@mmm.com.hk>
16 * Permission is hereby granted, free of charge, to any person obtaining
17 * a copy of this software and associated documentation files (the
18 * "Software"), to deal in the Software without restriction, including
19 * without limitation the rights to use, copy, modify, merge, publish,
20 * distribute, sublicense, and/or sell copies of the Software, and to
21 * permit persons to whom the Software is furnished to do so, subject to
22 * the following conditions:
24 * The above copyright notice and this permission notice shall be
25 * included in all copies or substantial portions of the Software.
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
31 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
32 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
33 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
39 #include <unicode-data.h>
42 #if defined(_MSC_VER) || defined(G_OS_WIN32)
50 # ifdef HAVE_LANGINFO_H
51 # include <langinfo.h>
56 # ifdef HAVE_LOCALCHARSET_H
57 # include <localcharset.h>
/* Character-set name cached by g_get_charset (); NULL until it has been computed. */
61 static char *my_charset
;
/* Set in g_get_charset () from strcmp (my_charset, "UTF-8") == 0. */
62 static gboolean is_utf8
;
65 * Character set conversion
68 * Index into the table below with the first byte of a UTF-8 sequence to
69 * get the number of trailing bytes that are supposed to follow it.
70 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
71 * left as-is for anyone who may want to do such conversion, which was
72 * allowed in earlier algorithms.
74 const gchar g_trailingBytesForUTF8
[256] = {
75 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
76 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
77 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
78 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
79 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
80 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
81 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
82 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
86 * Magic values subtracted from a buffer value during UTF8 conversion.
87 * This table contains as many values as there might be trailing bytes
88 * in a UTF-8 sequence.
90 static const gulong offsetsFromUTF8
[6] = { 0x00000000UL
, 0x00003080UL
, 0x000E2080UL
,
91 0x03C82080UL
, 0xFA082080UL
, 0x82082080UL
};
94 g_unichar_type (gunichar c
)
98 guint16 cp
= (guint16
) c
;
99 for (i
= 0; i
< unicode_category_ranges_count
; i
++) {
100 if (cp
< unicode_category_ranges
[i
].start
)
102 if (unicode_category_ranges
[i
].end
<= cp
)
104 return unicode_category
[i
] [cp
- unicode_category_ranges
[i
].start
];
108 // 3400-4DB5: OtherLetter
109 // 4E00-9FC3: OtherLetter
110 // AC00-D7A3: OtherLetter
111 // D800-DFFF: OtherSurrogate
112 // E000-F8FF: OtherPrivateUse
113 // 20000-2A6D6 OtherLetter
114 // F0000-FFFFD OtherPrivateUse
115 // 100000-10FFFD OtherPrivateUse
117 if (0x3400 <= cp
&& cp
< 0x4DB5)
118 return G_UNICODE_OTHER_LETTER
;
119 if (0x4E00 <= cp
&& cp
< 0x9FC3)
120 return G_UNICODE_OTHER_LETTER
;
121 if (0xAC00<= cp
&& cp
< 0xD7A3)
122 return G_UNICODE_OTHER_LETTER
;
123 if (0xD800 <= cp
&& cp
< 0xDFFF)
124 return G_UNICODE_SURROGATE
;
125 if (0xE000 <= cp
&& cp
< 0xF8FF)
126 return G_UNICODE_PRIVATE_USE
;
127 /* since the argument is UTF-16, we cannot check beyond FFFF */
129 /* It should match any of above */
134 g_unichar_break_type (gunichar c
)
137 return G_UNICODE_BREAK_UNKNOWN
;
/*
 * g_unichar_case: simple case mapping of c.  `upper` selects the upper-case
 * mapping tables; otherwise the lower-case tables are used.
 *
 * The loop walks simple_case_map_ranges comparing cp with each entry's
 * start/end, reads the mapped value v from the table belonging to that range
 * (a guint16 "lowarea" table, or a "higharea" table indexed by
 * i2 = i - <lowarea table count>), and returns v when it is non-zero and c
 * otherwise.
 *
 * NOTE(review): the statements guarded by the two range tests and the
 * condition that chooses between the lowarea and higharea lookups are not
 * visible here -- confirm them before relying on this description.
 */
141 g_unichar_case (gunichar c
, gboolean upper
)
144 guint32 cp
= (guint32
) c
, v
;
146 for (i
= 0; i
< simple_case_map_ranges_count
; i
++) {
147 if (cp
< simple_case_map_ranges
[i
].start
)
149 if (simple_case_map_ranges
[i
].end
<= cp
)
152 const guint16
*tab
= upper
? simple_upper_case_mapping_lowarea
[i
] : simple_lower_case_mapping_lowarea
[i
];
153 v
= tab
[cp
- simple_case_map_ranges
[i
].start
];
156 i2
= (gint8
)(i
- (upper
? simple_upper_case_mapping_lowarea_table_count
: simple_lower_case_mapping_lowarea_table_count
));
157 tab
= upper
? simple_upper_case_mapping_higharea
[i2
] : simple_lower_case_mapping_higharea
[i2
];
158 v
= tab
[cp
- simple_case_map_ranges
[i
].start
];
160 return v
!= 0 ? (gunichar
) v
: c
;
166 g_unichar_toupper (gunichar c
)
168 return g_unichar_case (c
, TRUE
);
172 g_unichar_tolower (gunichar c
)
174 return g_unichar_case (c
, FALSE
);
178 g_unichar_totitle (gunichar c
)
184 for (i
= 0; i
< simple_titlecase_mapping_count
; i
++) {
185 if (simple_titlecase_mapping
[i
].codepoint
== cp
)
186 return simple_titlecase_mapping
[i
].title
;
187 if (simple_titlecase_mapping
[i
].codepoint
> cp
)
188 /* it is ordered, hence no more match */
191 return g_unichar_toupper (c
);
195 g_unichar_isxdigit (gunichar c
)
197 return (g_unichar_xdigit_value (c
) != -1);
202 g_unichar_xdigit_value (gunichar c
)
204 if (c
>= 0x30 && c
<= 0x39) /*0-9*/
206 if (c
>= 0x41 && c
<= 0x46) /*A-F*/
208 if (c
>= 0x61 && c
<= 0x66) /*a-f*/
214 g_unichar_isspace (gunichar c
)
216 GUnicodeType type
= g_unichar_type (c
);
217 if (type
== G_UNICODE_LINE_SEPARATOR
||
218 type
== G_UNICODE_PARAGRAPH_SEPARATOR
||
219 type
== G_UNICODE_SPACE_SEPARATOR
)
/*
 * g_convert: converts str from from_codeset to to_codeset with iconv.
 *
 * The input length is strlen (str) when len == -1 and len otherwise.  The
 * output buffer starts at str_len + 1 + 8 bytes; one branch of the iconv
 * failure handling grows it by 8 + left bytes with g_realloc and re-derives
 * `output` from the bytes already produced.  errno values EILSEQ and EINVAL
 * are reported through *error as G_CONVERT_ERROR_ILLEGAL_SEQUENCE and
 * G_CONVERT_ERROR_PARTIAL_INPUT.  When non-NULL, *bytes_read receives
 * strptr - str and *bytes_written receives output - buffer.  The converter is
 * released with iconv_close.
 *
 * NOTE(review): the condition guarding the buffer-growth branch (presumably
 * errno == E2BIG), any guards around the *error stores, the loop header and
 * the return statements are not visible here -- confirm.
 * NOTE(review): `res` is declared int but compared with (size_t) -1; iconv
 * returns size_t.
 */
226 g_convert (const gchar
*str
, gssize len
,
227 const gchar
*to_codeset
, const gchar
*from_codeset
,
228 gsize
*bytes_read
, gsize
*bytes_written
, GError
**error
)
234 char *buffer
, *output
;
235 const char *strptr
= (const char *) str
;
236 size_t str_len
= len
== -1 ? strlen (str
) : len
;
238 size_t left
, out_left
;
240 convertor
= iconv_open (to_codeset
, from_codeset
);
241 if (convertor
== (iconv_t
) -1){
249 buffer_size
= str_len
+ 1 + 8;
250 buffer
= g_malloc (buffer_size
);
255 int res
= iconv (convertor
, (char **) &strptr
, &left
, &output
, &out_left
);
256 if (res
== (size_t) -1){
259 size_t extra_space
= 8 + left
;
260 size_t output_used
= output
- buffer
;
262 buffer_size
+= extra_space
;
264 n
= g_realloc (buffer
, buffer_size
);
268 *error
= g_error_new (NULL
, G_CONVERT_ERROR_FAILED
, "No memory left");
274 out_left
+= extra_space
;
275 output
= buffer
+ output_used
;
276 } else if (errno
== EILSEQ
){
278 *error
= g_error_new (NULL
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
, "Invalid multi-byte sequence on input");
282 } else if (errno
== EINVAL
){
284 *error
= g_error_new (NULL
, G_CONVERT_ERROR_PARTIAL_INPUT
, "Partial character sequence");
291 if (bytes_read
!= NULL
)
292 *bytes_read
= strptr
- str
;
293 if (bytes_written
!= NULL
)
294 *bytes_written
= output
- buffer
;
298 iconv_close (convertor
);
304 * This is broken: it assumes a UTF-8 system, but it will do for eglib's first user
307 g_filename_from_utf8 (const gchar
*utf8string
, gssize len
, gsize
*bytes_read
, gsize
*bytes_written
, GError
**error
)
312 len
= strlen (utf8string
);
314 res
= g_malloc (len
+ 1);
315 g_strlcpy (res
, utf8string
, len
+ 1);
/*
 * g_get_charset: computes the character-set name once, caches it in
 * my_charset, and stores the cached pointer in *charset.
 *
 * The visible candidates for the name are "CP<n>" formatted from GetACP ()
 * into a static buffer, locale_charset () when HAVE_LOCALCHARSET_H is set,
 * nl_langinfo (CODESET) when HAVE_LANGINFO_H is defined, and the literal
 * "UTF-8" otherwise.  is_utf8 records whether the name equals "UTF-8".
 *
 * NOTE(review): callers in this file pass NULL for charset, so a NULL guard
 * presumably precedes the final store; that guard, the preprocessor
 * conditionals around the GetACP () branch and the return statements are not
 * visible here -- confirm.
 */
320 g_get_charset (G_CONST_RETURN
char **charset
)
322 if (my_charset
== NULL
) {
324 static char buf
[14];
325 sprintf (buf
, "CP%u", GetACP ());
329 /* These shouldn't be heap allocated */
330 #if HAVE_LOCALCHARSET_H
331 my_charset
= locale_charset ();
332 #elif defined(HAVE_LANGINFO_H)
333 my_charset
= nl_langinfo (CODESET
);
335 my_charset
= "UTF-8";
337 is_utf8
= strcmp (my_charset
, "UTF-8") == 0;
342 *charset
= my_charset
;
348 g_locale_to_utf8 (const gchar
*opsysstring
, gssize len
, gsize
*bytes_read
, gsize
*bytes_written
, GError
**error
)
350 g_get_charset (NULL
);
352 return g_convert (opsysstring
, len
, "UTF-8", my_charset
, bytes_read
, bytes_written
, error
);
356 g_locale_from_utf8 (const gchar
*utf8string
, gssize len
, gsize
*bytes_read
, gsize
*bytes_written
, GError
**error
)
358 g_get_charset (NULL
);
360 return g_convert (utf8string
, len
, my_charset
, "UTF-8", bytes_read
, bytes_written
, error
);
364 * @utf: Pointer to putative UTF-8 encoded string.
366 * Checks @utf for being valid UTF-8. @utf is assumed to be
367 * null-terminated. This function is not super-strict, as it will
368 * allow longer UTF-8 sequences than necessary. Note that Java is
369 * capable of producing these sequences if provoked. Also note, this
370 * routine checks for the 4-byte maximum size, but does not check for
371 * 0x10ffff maximum value.
373 * Return value: true if @utf is valid.
/*
 * g_utf8_validate: walks str one UTF-8 sequence at a time while the current
 * byte is non-zero and byteCount <= max_len.
 *
 * The expected sequence length is g_trailingBytesForUTF8 [lead byte] + 1.
 * srcPtr starts one past the sequence and the continuation bytes are checked
 * from the last one backwards through the fall-through switch: each must lie
 * in 0x80..0xBF, the second byte has additional limits that depend on the
 * lead byte (0xE0, 0xED, 0xEF, 0xF0, 0xF4), and a lead byte in 0x80..0xC1 is
 * rejected.  A lead byte above 0xF4 is tested separately.
 *
 * retVal carries the verdict; lastRet is consulted together with it in the
 * "set the end to the invalid byte" step, presumably so that only the first
 * failure is recorded.
 *
 * NOTE(review): several statements (the max_len handling before the loop, the
 * switch headers, the updates of byteCount and ptr, the stores through end and
 * the return) are not visible here -- confirm them against the complete file
 * before changing this function.
 */
376 g_utf8_validate (const gchar
*str
, gssize max_len
, const gchar
**end
)
378 gssize byteCount
= 0;
379 gboolean retVal
= TRUE
;
380 gboolean lastRet
= TRUE
;
381 guchar
* ptr
= (guchar
*) str
;
387 else if (max_len
< 0)
389 while (*ptr
!= 0 && byteCount
<= max_len
) {
390 length
= g_trailingBytesForUTF8
[*ptr
] + 1;
391 srcPtr
= (guchar
*) ptr
+ length
;
393 default: retVal
= FALSE
;
394 /* Everything else falls through when "TRUE"... */
395 case 4: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
396 if ((a
== (guchar
) 0xBF || a
== (guchar
) 0xBE) && *(srcPtr
-1) == (guchar
) 0xBF) {
397 if (*(srcPtr
-2) == (guchar
) 0x8F || *(srcPtr
-2) == (guchar
) 0x9F ||
398 *(srcPtr
-2) == (guchar
) 0xAF || *(srcPtr
-2) == (guchar
) 0xBF)
401 case 3: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
402 case 2: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
405 /* no fall-through in this inner switch */
406 case 0xE0: if (a
< (guchar
) 0xA0) retVal
= FALSE
; break;
407 case 0xED: if (a
> (guchar
) 0x9F) retVal
= FALSE
; break;
408 case 0xEF: if (a
== (guchar
)0xB7 && (*(srcPtr
+1) > (guchar
) 0x8F && *(srcPtr
+1) < 0xB0)) retVal
= FALSE
;
409 if (a
== (guchar
)0xBF && (*(srcPtr
+1) == (guchar
) 0xBE || *(srcPtr
+1) == 0xBF)) retVal
= FALSE
; break;
410 case 0xF0: if (a
< (guchar
) 0x90) retVal
= FALSE
; break;
411 case 0xF4: if (a
> (guchar
) 0x8F) retVal
= FALSE
; break;
412 default: if (a
< (guchar
) 0x80) retVal
= FALSE
;
415 case 1: if (*ptr
>= (guchar
) 0x80 && *ptr
< (guchar
) 0xC2) retVal
= FALSE
;
417 if (*ptr
> (guchar
) 0xF4)
419 //If the string is invalid, set the end to the invalid byte.
420 if (!retVal
&& lastRet
) {
429 if (retVal
&& end
!= NULL
)
435 * @src: Pointer to UTF-8 encoded character.
437 * Return value: the Unicode code point decoded from the UTF-8 sequence at @src
440 g_utf8_get_char (const gchar
*src
)
443 guchar
* ptr
= (guchar
*) src
;
444 gushort extraBytesToRead
= g_trailingBytesForUTF8
[*ptr
];
446 switch (extraBytesToRead
) {
447 case 5: ch
+= *ptr
++; ch
<<= 6; // remember, illegal UTF-8
448 case 4: ch
+= *ptr
++; ch
<<= 6; // remember, illegal UTF-8
449 case 3: ch
+= *ptr
++; ch
<<= 6;
450 case 2: ch
+= *ptr
++; ch
<<= 6;
451 case 1: ch
+= *ptr
++; ch
<<= 6;
454 ch
-= offsetsFromUTF8
[extraBytesToRead
];
458 g_utf8_strlen (const gchar
*str
, gssize max
)
460 gssize byteCount
= 0;
461 guchar
* ptr
= (guchar
*) str
;
467 while (*ptr
!= 0 && byteCount
<= max
) {
468 gssize cLen
= g_trailingBytesForUTF8
[*ptr
] + 1;
469 if (max
> 0 && (byteCount
+ cLen
) > max
)