/*
 * libid3tag - ID3 tag manipulation library
 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
 */
35 * NAME: utf8->length()
36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string
38 id3_length_t
id3_utf8_length(id3_utf8_t
const *utf8
)
40 id3_length_t length
= 0;
43 if ((utf8
[0] & 0x80) == 0x00)
45 else if ((utf8
[0] & 0xe0) == 0xc0 &&
46 (utf8
[1] & 0xc0) == 0x80) {
47 if (((utf8
[0] & 0x1fL
) << 6) >= 0x00000080L
) {
52 else if ((utf8
[0] & 0xf0) == 0xe0 &&
53 (utf8
[1] & 0xc0) == 0x80 &&
54 (utf8
[2] & 0xc0) == 0x80) {
55 if ((((utf8
[0] & 0x0fL
) << 12) |
56 ((utf8
[1] & 0x3fL
) << 6)) >= 0x00000800L
) {
61 else if ((utf8
[0] & 0xf8) == 0xf0 &&
62 (utf8
[1] & 0xc0) == 0x80 &&
63 (utf8
[2] & 0xc0) == 0x80 &&
64 (utf8
[3] & 0xc0) == 0x80) {
65 if ((((utf8
[0] & 0x07L
) << 18) |
66 ((utf8
[1] & 0x3fL
) << 12)) >= 0x00010000L
) {
71 else if ((utf8
[0] & 0xfc) == 0xf8 &&
72 (utf8
[1] & 0xc0) == 0x80 &&
73 (utf8
[2] & 0xc0) == 0x80 &&
74 (utf8
[3] & 0xc0) == 0x80 &&
75 (utf8
[4] & 0xc0) == 0x80) {
76 if ((((utf8
[0] & 0x03L
) << 24) |
77 ((utf8
[0] & 0x3fL
) << 18)) >= 0x00200000L
) {
82 else if ((utf8
[0] & 0xfe) == 0xfc &&
83 (utf8
[1] & 0xc0) == 0x80 &&
84 (utf8
[2] & 0xc0) == 0x80 &&
85 (utf8
[3] & 0xc0) == 0x80 &&
86 (utf8
[4] & 0xc0) == 0x80 &&
87 (utf8
[5] & 0xc0) == 0x80) {
88 if ((((utf8
[0] & 0x01L
) << 30) |
89 ((utf8
[0] & 0x3fL
) << 24)) >= 0x04000000L
) {
103 * DESCRIPTION: return the encoding size of a utf8 string
105 id3_length_t
id3_utf8_size(id3_utf8_t
const *utf8
)
107 id3_utf8_t
const *ptr
= utf8
;
112 return ptr
- utf8
+ 1;
116 * NAME: utf8->ucs4duplicate()
117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4
119 id3_ucs4_t
*id3_utf8_ucs4duplicate(id3_utf8_t
const *utf8
)
123 ucs4
= malloc((id3_utf8_length(utf8
) + 1) * sizeof(*ucs4
));
125 id3_utf8_decode(utf8
, ucs4
);
127 return release(ucs4
);
131 * NAME: utf8->decodechar()
132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char
134 id3_length_t
id3_utf8_decodechar(id3_utf8_t
const *utf8
, id3_ucs4_t
*ucs4
)
136 id3_utf8_t
const *start
= utf8
;
139 if ((utf8
[0] & 0x80) == 0x00) {
141 return utf8
- start
+ 1;
143 else if ((utf8
[0] & 0xe0) == 0xc0 &&
144 (utf8
[1] & 0xc0) == 0x80) {
146 ((utf8
[0] & 0x1fL
) << 6) |
147 ((utf8
[1] & 0x3fL
) << 0);
148 if (*ucs4
>= 0x00000080L
)
149 return utf8
- start
+ 2;
151 else if ((utf8
[0] & 0xf0) == 0xe0 &&
152 (utf8
[1] & 0xc0) == 0x80 &&
153 (utf8
[2] & 0xc0) == 0x80) {
155 ((utf8
[0] & 0x0fL
) << 12) |
156 ((utf8
[1] & 0x3fL
) << 6) |
157 ((utf8
[2] & 0x3fL
) << 0);
158 if (*ucs4
>= 0x00000800L
)
159 return utf8
- start
+ 3;
161 else if ((utf8
[0] & 0xf8) == 0xf0 &&
162 (utf8
[1] & 0xc0) == 0x80 &&
163 (utf8
[2] & 0xc0) == 0x80 &&
164 (utf8
[3] & 0xc0) == 0x80) {
166 ((utf8
[0] & 0x07L
) << 18) |
167 ((utf8
[1] & 0x3fL
) << 12) |
168 ((utf8
[2] & 0x3fL
) << 6) |
169 ((utf8
[3] & 0x3fL
) << 0);
170 if (*ucs4
>= 0x00010000L
)
171 return utf8
- start
+ 4;
173 else if ((utf8
[0] & 0xfc) == 0xf8 &&
174 (utf8
[1] & 0xc0) == 0x80 &&
175 (utf8
[2] & 0xc0) == 0x80 &&
176 (utf8
[3] & 0xc0) == 0x80 &&
177 (utf8
[4] & 0xc0) == 0x80) {
179 ((utf8
[0] & 0x03L
) << 24) |
180 ((utf8
[1] & 0x3fL
) << 18) |
181 ((utf8
[2] & 0x3fL
) << 12) |
182 ((utf8
[3] & 0x3fL
) << 6) |
183 ((utf8
[4] & 0x3fL
) << 0);
184 if (*ucs4
>= 0x00200000L
)
185 return utf8
- start
+ 5;
187 else if ((utf8
[0] & 0xfe) == 0xfc &&
188 (utf8
[1] & 0xc0) == 0x80 &&
189 (utf8
[2] & 0xc0) == 0x80 &&
190 (utf8
[3] & 0xc0) == 0x80 &&
191 (utf8
[4] & 0xc0) == 0x80 &&
192 (utf8
[5] & 0xc0) == 0x80) {
194 ((utf8
[0] & 0x01L
) << 30) |
195 ((utf8
[1] & 0x3fL
) << 24) |
196 ((utf8
[2] & 0x3fL
) << 18) |
197 ((utf8
[3] & 0x3fL
) << 12) |
198 ((utf8
[4] & 0x3fL
) << 6) |
199 ((utf8
[5] & 0x3fL
) << 0);
200 if (*ucs4
>= 0x04000000L
)
201 return utf8
- start
+ 6;
209 * NAME: utf8->encodechar()
210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars
212 id3_length_t
id3_utf8_encodechar(id3_utf8_t
*utf8
, id3_ucs4_t ucs4
)
214 if (ucs4
<= 0x0000007fL
) {
219 else if (ucs4
<= 0x000007ffL
) {
220 utf8
[0] = 0xc0 | ((ucs4
>> 6) & 0x1f);
221 utf8
[1] = 0x80 | ((ucs4
>> 0) & 0x3f);
225 else if (ucs4
<= 0x0000ffffL
) {
226 utf8
[0] = 0xe0 | ((ucs4
>> 12) & 0x0f);
227 utf8
[1] = 0x80 | ((ucs4
>> 6) & 0x3f);
228 utf8
[2] = 0x80 | ((ucs4
>> 0) & 0x3f);
232 else if (ucs4
<= 0x001fffffL
) {
233 utf8
[0] = 0xf0 | ((ucs4
>> 18) & 0x07);
234 utf8
[1] = 0x80 | ((ucs4
>> 12) & 0x3f);
235 utf8
[2] = 0x80 | ((ucs4
>> 6) & 0x3f);
236 utf8
[3] = 0x80 | ((ucs4
>> 0) & 0x3f);
240 else if (ucs4
<= 0x03ffffffL
) {
241 utf8
[0] = 0xf8 | ((ucs4
>> 24) & 0x03);
242 utf8
[1] = 0x80 | ((ucs4
>> 18) & 0x3f);
243 utf8
[2] = 0x80 | ((ucs4
>> 12) & 0x3f);
244 utf8
[3] = 0x80 | ((ucs4
>> 6) & 0x3f);
245 utf8
[4] = 0x80 | ((ucs4
>> 0) & 0x3f);
249 else if (ucs4
<= 0x7fffffffL
) {
250 utf8
[0] = 0xfc | ((ucs4
>> 30) & 0x01);
251 utf8
[1] = 0x80 | ((ucs4
>> 24) & 0x3f);
252 utf8
[2] = 0x80 | ((ucs4
>> 18) & 0x3f);
253 utf8
[3] = 0x80 | ((ucs4
>> 12) & 0x3f);
254 utf8
[4] = 0x80 | ((ucs4
>> 6) & 0x3f);
255 utf8
[5] = 0x80 | ((ucs4
>> 0) & 0x3f);
262 return id3_utf8_encodechar(utf8
, ID3_UCS4_REPLACEMENTCHAR
);
266 * NAME: utf8->decode()
267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string
269 void id3_utf8_decode(id3_utf8_t
const *utf8
, id3_ucs4_t
*ucs4
)
272 utf8
+= id3_utf8_decodechar(utf8
, ucs4
);
277 * NAME: utf8->encode()
278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string
280 void id3_utf8_encode(id3_utf8_t
*utf8
, id3_ucs4_t
const *ucs4
)
283 utf8
+= id3_utf8_encodechar(utf8
, *ucs4
);
289 * DESCRIPTION: serialize a single utf8 character
291 id3_length_t
id3_utf8_put(id3_byte_t
**ptr
, id3_utf8_t utf8
)
301 * DESCRIPTION: deserialize a single utf8 character
303 id3_utf8_t
id3_utf8_get(id3_byte_t
const **ptr
)
309 * NAME: utf8->serialize()
310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding
312 id3_length_t
id3_utf8_serialize(id3_byte_t
**ptr
, id3_ucs4_t
const *ucs4
,
315 id3_length_t size
= 0;
316 id3_utf8_t utf8
[6], *out
;
319 switch (id3_utf8_encodechar(out
= utf8
, *ucs4
++)) {
320 case 6: size
+= id3_utf8_put(ptr
, *out
++);
321 case 5: size
+= id3_utf8_put(ptr
, *out
++);
322 case 4: size
+= id3_utf8_put(ptr
, *out
++);
323 case 3: size
+= id3_utf8_put(ptr
, *out
++);
324 case 2: size
+= id3_utf8_put(ptr
, *out
++);
325 case 1: size
+= id3_utf8_put(ptr
, *out
++);
331 size
+= id3_utf8_put(ptr
, 0);
337 * NAME: utf8->deserialize()
338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding
340 id3_ucs4_t
*id3_utf8_deserialize(id3_byte_t
const **ptr
, id3_length_t length
)
342 id3_byte_t
const *end
;
343 id3_utf8_t
*utf8ptr
, *utf8
;
348 utf8
= malloc((length
+ 1) * sizeof(*utf8
));
353 while (end
- *ptr
> 0 && (*utf8ptr
= id3_utf8_get(ptr
)))
358 ucs4
= malloc((id3_utf8_length(utf8
) + 1) * sizeof(*ucs4
));
360 id3_utf8_decode(utf8
, ucs4
);