10 #include <sys/types.h>
12 // Returns the amount of 16-bit elements in UTF-16LE needed
13 // (without the terminating null) to store given UTF-8 string
14 static int utf8_to_utf16_size(const char *s
)
20 while ((byte
= *((const unsigned char *)s
++))) {
48 // Converts one UTF-8 sequence to cpu-endian Unicode value
49 static int utf8_to_unicode(uint32_t *wc
, const char *s
)
51 unsigned int byte
= *((const unsigned char *)s
);
57 } else if (byte
< 0x80) {
58 *wc
= (uint32_t) byte
;
61 } else if (byte
< 0xc2) {
63 } else if (byte
< 0xE0) {
64 if ((s
[1] & 0xC0) == 0x80) {
65 *wc
= ((uint32_t)(byte
& 0x1F) << 6)
66 | ((uint32_t)(s
[1] & 0x3F));
71 } else if (byte
< 0xF0) {
72 if (((s
[1] & 0xC0) == 0x80) && ((s
[2] & 0xC0) == 0x80)) {
73 *wc
= ((uint32_t)(byte
& 0x0F) << 12)
74 | ((uint32_t)(s
[1] & 0x3F) << 6)
75 | ((uint32_t)(s
[2] & 0x3F));
76 /* Check valid ranges */
77 if (((*wc
>= 0x800) && (*wc
<= 0xD7FF))
78 || ((*wc
>= 0xe000) && (*wc
<= 0xFFFF)))
83 } else if (byte
< 0xF5) {
84 if (((s
[1] & 0xC0) == 0x80) && ((s
[2] & 0xC0) == 0x80)
85 && ((s
[3] & 0xC0) == 0x80)) {
86 *wc
= ((uint32_t)(byte
& 0x07) << 18)
87 | ((uint32_t)(s
[1] & 0x3F) << 12)
88 | ((uint32_t)(s
[2] & 0x3F) << 6)
89 | ((uint32_t)(s
[3] & 0x3F));
90 /* Check valid ranges */
91 if ((*wc
<= 0x10ffff) && (*wc
>= 0x10000))
102 // Converts a UTF-8 string to a UTF-16LE string (little endian version)
103 // Returns length of output buffer in utf16 characters
104 int utf8_to_utf16_string(const char *ins
, uint16_t **outs
)
110 int shorts
, ret
= -1;
112 shorts
= utf8_to_utf16_size(ins
);
118 *outs
= malloc((shorts
+ 1) * sizeof(uint16_t));
126 int m
= utf8_to_unicode(&wc
, t
);
129 /* do not leave space allocated if failed */
132 *outs
= (uint16_t *)NULL
;
136 *outpos
++ = (uint16_t)0;
140 *outpos
++ = (uint16_t)wc
;
143 *outpos
++ = (uint16_t)((wc
>> 10) + 0xd800);
144 *outpos
++ = (uint16_t)((wc
& 0x3ff) + 0xdc00);
149 ret
= --outpos
- *outs
;
156 static inline char *js_utf16_char(const uint16_t u
, char *b
)
158 unsigned char c
= (unsigned char)u
;
161 if ((c
== '"') || (c
== '\'') || (c
== '\\') || (!isprint(c
)))
162 b
+= sprintf(b
, "\\x%02x", c
);
167 b
+= sprintf(b
, "\\u%04x", u
);
172 static inline char *html_utf16_char(const uint16_t u
, char *b
)
174 unsigned char c
= (unsigned char)u
;
176 if ((u
> 0xFF) || (c
== '&') || (c
== '<') || (c
== '>') || (c
== '"') || (c
== '\'') || (!isprint(c
)))
177 b
+= sprintf(b
, "&#%d;", u
);
184 typedef char *(*utf16_conv
)(const uint16_t u
, char *b
);
186 static char *utf8_to_string(const char *ins
, int csize
, utf16_conv conv
)
190 char *outpos
, *outs
= NULL
;
193 shorts
= utf8_to_utf16_size(ins
);
197 outpos
= outs
= malloc((shorts
+ 1) * csize
+ 1);
202 int m
= utf8_to_unicode(&wc
, t
);
205 /* do not leave space allocated if failed */
213 outpos
= conv((uint16_t)wc
, outpos
);
217 outpos
= conv((uint16_t)((wc
>> 10) + 0xd800), outpos
);
218 outpos
= conv((uint16_t)((wc
& 0x3ff) + 0xdc00), outpos
);
227 char *utf8_to_js_string(const char *ins
)
229 return utf8_to_string(ins
, 6, js_utf16_char
);
232 char *utf8_to_html_string(const char *ins
)
234 return utf8_to_string(ins
, 8, html_utf16_char
);