2 Copyright (C) 2001-2008, The Perl Foundation.
7 src/encodings/utf8.c - UTF-8 encoding
11 UTF-8 (L<http://www.utf-8.com/>).
21 #include "parrot/parrot.h"
22 #include "../unicode.h"
25 /* HEADERIZER HFILE: src/encodings/utf8.h */
27 /* HEADERIZER BEGIN: static */
28 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
30 static void become_encoding(PARROT_INTERP
, SHIM(STRING
*src
))
31 __attribute__nonnull__(1);
34 static UINTVAL
bytes(SHIM_INTERP
, ARGIN(STRING
*src
))
35 __attribute__nonnull__(2);
37 static UINTVAL
codepoints(PARROT_INTERP
, ARGMOD(STRING
*src
))
38 __attribute__nonnull__(1)
39 __attribute__nonnull__(2)
42 static UINTVAL
get_byte(SHIM_INTERP
,
43 ARGIN(const STRING
*src
),
45 __attribute__nonnull__(2);
47 PARROT_CANNOT_RETURN_NULL
48 static STRING
* get_bytes(PARROT_INTERP
,
52 __attribute__nonnull__(1)
53 __attribute__nonnull__(2)
56 PARROT_CANNOT_RETURN_NULL
57 static STRING
* get_bytes_inplace(PARROT_INTERP
,
61 SHIM(STRING
*return_string
))
62 __attribute__nonnull__(1);
64 static UINTVAL
get_codepoint(PARROT_INTERP
,
65 ARGIN(const STRING
*src
),
67 __attribute__nonnull__(1)
68 __attribute__nonnull__(2);
70 PARROT_CANNOT_RETURN_NULL
71 static STRING
* get_codepoints(PARROT_INTERP
,
75 __attribute__nonnull__(1)
76 __attribute__nonnull__(2);
78 PARROT_CANNOT_RETURN_NULL
79 static STRING
* get_codepoints_inplace(PARROT_INTERP
,
83 ARGMOD(STRING
*return_string
))
84 __attribute__nonnull__(1)
85 __attribute__nonnull__(2)
86 __attribute__nonnull__(5)
88 FUNC_MODIFIES(*return_string
);
90 static void iter_init(SHIM_INTERP
,
91 ARGIN(const STRING
*src
),
92 ARGOUT(String_iter
*iter
))
93 __attribute__nonnull__(2)
94 __attribute__nonnull__(3)
97 static void set_byte(PARROT_INTERP
,
98 ARGIN(const STRING
*src
),
101 __attribute__nonnull__(1)
102 __attribute__nonnull__(2);
104 static void set_bytes(PARROT_INTERP
,
108 SHIM(STRING
*new_bytes
))
109 __attribute__nonnull__(1);
111 static void set_codepoint(PARROT_INTERP
,
115 __attribute__nonnull__(1)
116 __attribute__nonnull__(2);
118 static void set_codepoints(PARROT_INTERP
,
122 SHIM(STRING
*new_codepoints
))
123 __attribute__nonnull__(1);
125 PARROT_CAN_RETURN_NULL
126 static STRING
* to_encoding(PARROT_INTERP
,
128 ARGMOD_NULLOK(STRING
*dest
))
129 __attribute__nonnull__(1)
130 __attribute__nonnull__(2)
133 static UINTVAL
utf8_characters(PARROT_INTERP
,
134 ARGIN(const utf8_t
*ptr
),
136 __attribute__nonnull__(1)
137 __attribute__nonnull__(2);
139 static UINTVAL
utf8_decode(PARROT_INTERP
, ARGIN(const utf8_t
*ptr
))
140 __attribute__nonnull__(1)
141 __attribute__nonnull__(2);
143 static UINTVAL
utf8_decode_and_advance(PARROT_INTERP
,
144 ARGMOD(String_iter
*i
))
145 __attribute__nonnull__(1)
146 __attribute__nonnull__(2)
149 PARROT_CANNOT_RETURN_NULL
150 static void * utf8_encode(PARROT_INTERP
, ARGIN(void *ptr
), UINTVAL c
)
151 __attribute__nonnull__(1)
152 __attribute__nonnull__(2);
154 static void utf8_encode_and_advance(PARROT_INTERP
,
155 ARGMOD(String_iter
*i
),
157 __attribute__nonnull__(1)
158 __attribute__nonnull__(2)
161 static void utf8_set_position(SHIM_INTERP
,
162 ARGMOD(String_iter
*i
),
164 __attribute__nonnull__(2)
167 PARROT_WARN_UNUSED_RESULT
168 PARROT_CANNOT_RETURN_NULL
169 static const void * utf8_skip_backward(ARGIN(const void *ptr
), UINTVAL n
)
170 __attribute__nonnull__(1);
172 PARROT_CANNOT_RETURN_NULL
173 static const void * utf8_skip_forward(ARGIN(const void *ptr
), UINTVAL n
)
174 __attribute__nonnull__(1);
176 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
177 /* HEADERIZER END: static */
179 #define UNIMPL Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED, \
182 const char Parrot_utf8skip
[256] = {
183 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
184 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
185 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
186 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
188 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
189 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
190 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
191 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */
192 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */
193 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */
194 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */
195 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* scripts */
196 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* scripts */
197 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* cjk etc. */
198 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6 /* cjk etc. */
202 typedef unsigned char utf8_t
;
207 =item C<static UINTVAL utf8_characters>
209 Returns the number of characters in the C<byte_len> bytes from C<*ptr>.
216 utf8_characters(PARROT_INTERP
, ARGIN(const utf8_t
*ptr
), UINTVAL byte_len
)
218 const utf8_t
*u8ptr
= ptr
;
219 const utf8_t
*u8end
= u8ptr
+ byte_len
;
220 UINTVAL characters
= 0;
222 while (u8ptr
< u8end
) {
223 u8ptr
+= UTF8SKIP(u8ptr
);
228 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_MALFORMED_UTF8
,
229 "Unaligned end in UTF-8 string\n");
236 =item C<static UINTVAL utf8_decode>
238 Returns the integer for the UTF-8 character found at C<*ptr>.
245 utf8_decode(PARROT_INTERP
, ARGIN(const utf8_t
*ptr
))
247 const utf8_t
*u8ptr
= ptr
;
250 if (UTF8_IS_START(c
)) {
251 UINTVAL len
= UTF8SKIP(u8ptr
);
254 c
&= UTF8_START_MASK(len
);
255 for (count
= 1; count
< len
; count
++) {
258 if (!UTF8_IS_CONTINUATION(*u8ptr
))
259 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_MALFORMED_UTF8
,
260 "Malformed UTF-8 string\n");
262 c
= UTF8_ACCUMULATE(c
, *u8ptr
);
265 if (UNICODE_IS_SURROGATE(c
))
266 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_MALFORMED_UTF8
,
267 "Surrogate in UTF-8 string\n");
269 else if (!UNICODE_IS_INVARIANT(c
)) {
270 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_MALFORMED_UTF8
,
271 "Malformed UTF-8 string\n");
279 =item C<static void * utf8_encode>
281 Returns the UTF-8 encoding of integer C<c>.
287 PARROT_CANNOT_RETURN_NULL
289 utf8_encode(PARROT_INTERP
, ARGIN(void *ptr
), UINTVAL c
)
291 const UINTVAL len
= UNISKIP(c
);
293 /* the const is good on u8ptr, but using ptr on other variables avoids the
294 * need to do a yucky cast to remove constness */
295 const utf8_t
* const u8ptr
= (utf8_t
*)ptr
;
296 utf8_t
*u8end
= (utf8_t
*)ptr
+ len
- 1;
298 if (c
> 0x10FFFF || UNICODE_IS_SURROGATE(c
)) {
299 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_CHARACTER
,
300 "Invalid character for UTF-8 encoding\n");
303 while (u8end
> u8ptr
) {
305 (utf8_t
)((c
& UTF8_CONTINUATION_MASK
) | UTF8_CONTINUATION_MARK
);
306 c
>>= UTF8_ACCUMULATION_SHIFT
;
308 *u8end
= (utf8_t
)((c
& UTF8_START_MASK(len
)) | UTF8_START_MARK(len
));
310 return (utf8_t
*)ptr
+ len
;
315 =item C<static const void * utf8_skip_forward>
317 Moves C<ptr> C<n> characters forward.
323 PARROT_CANNOT_RETURN_NULL
325 utf8_skip_forward(ARGIN(const void *ptr
), UINTVAL n
)
327 const utf8_t
*u8ptr
= (const utf8_t
*)ptr
;
330 u8ptr
+= UTF8SKIP(u8ptr
);
338 =item C<static const void * utf8_skip_backward>
340 Moves C<ptr> C<n> characters back.
346 PARROT_WARN_UNUSED_RESULT
347 PARROT_CANNOT_RETURN_NULL
349 utf8_skip_backward(ARGIN(const void *ptr
), UINTVAL n
)
351 const utf8_t
*u8ptr
= (const utf8_t
*)ptr
;
355 while (UTF8_IS_CONTINUATION(*u8ptr
))
366 =head2 Iterator Functions
376 =item C<static UINTVAL utf8_decode_and_advance>
378 The UTF-8 implementation of the string iterator's C<get_and_advance>
384 utf8_decode_and_advance(PARROT_INTERP
, ARGMOD(String_iter
*i
))
386 const utf8_t
*u8ptr
= (utf8_t
*)((char *)i
->str
->strstart
+ i
->bytepos
);
389 if (UTF8_IS_START(c
)) {
390 UINTVAL len
= UTF8SKIP(u8ptr
);
392 c
&= UTF8_START_MASK(len
);
394 for (len
--; len
; len
--) {
397 if (!UTF8_IS_CONTINUATION(*u8ptr
))
398 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_MALFORMED_UTF8
,
399 "Malformed UTF-8 string\n");
401 c
= UTF8_ACCUMULATE(c
, *u8ptr
);
404 if (UNICODE_IS_SURROGATE(c
))
405 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_MALFORMED_UTF8
,
406 "Surrogate in UTF-8 string\n");
408 else if (!UNICODE_IS_INVARIANT(c
)) {
409 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_MALFORMED_UTF8
,
410 "Malformed UTF-8 string\n");
422 =item C<static void utf8_encode_and_advance>
424 The UTF-8 implementation of the string iterator's C<set_and_advance>
/* Writes the UTF-8 encoding of c at the iterator's current byte position
 * (i->str->strstart + i->bytepos) and moves i->bytepos past the bytes that
 * utf8_encode() produced. */
432 utf8_encode_and_advance(PARROT_INTERP
, ARGMOD(String_iter
*i
), UINTVAL c
)
434 const STRING
* const s
= i
->str
;
435 unsigned char * const pos
= (unsigned char *)s
->strstart
+ i
->bytepos
;
/* utf8_encode() returns the address just past the last byte it wrote. */
436 unsigned char * const new_pos
= (unsigned char *)utf8_encode(interp
, pos
, c
);
/* Advance by the encoded length (new_pos - pos). */
438 i
->bytepos
+= (new_pos
- pos
);
439 /* XXX possible buffer overrun exception? */
/* NOTE(review): this check runs only after utf8_encode() has already
 * written into the buffer, so it detects an overrun rather than
 * preventing one. */
440 PARROT_ASSERT(i
->bytepos
<= PObj_buflen(s
));
446 =item C<static void utf8_set_position>
448 The UTF-8 implementation of the string iterator's C<set_position>
456 utf8_set_position(SHIM_INTERP
, ARGMOD(String_iter
*i
), UINTVAL pos
)
458 const utf8_t
*u8ptr
= (const utf8_t
*)i
->str
->strstart
;
460 /* start from last known charpos, if we can */
461 if (i
->charpos
<= pos
) {
462 const UINTVAL old_pos
= pos
;
465 i
->charpos
= old_pos
;
471 u8ptr
+= UTF8SKIP(u8ptr
);
473 i
->bytepos
= (const char *)u8ptr
- (const char *)i
->str
->strstart
;
479 =item C<static STRING * to_encoding>
481 Converts the string C<src> to this particular encoding. If C<dest> is
482 provided, it will contain the result. Otherwise this function operates in
489 PARROT_CAN_RETURN_NULL
491 to_encoding(PARROT_INTERP
, ARGMOD(STRING
*src
), ARGMOD_NULLOK(STRING
*dest
))
494 String_iter src_iter
;
495 UINTVAL offs
, dest_len
, dest_pos
, src_len
;
496 const int in_place
= (dest
== NULL
);
497 unsigned char *new_pos
, *pos
, *p
;
499 if (src
->encoding
== Parrot_utf8_encoding_ptr
)
500 return in_place
? src
: string_copy(interp
, src
);
501 src_len
= src
->strlen
;
509 /* init iter before possibly changing encoding */
510 ENCODING_ITER_INIT(interp
, src
, &src_iter
);
511 result
->charset
= Parrot_unicode_charset_ptr
;
512 result
->encoding
= Parrot_utf8_encoding_ptr
;
513 result
->strlen
= src_len
;
519 /* need intermediate memory */
520 p
= (unsigned char *)mem_sys_allocate(src_len
);
523 Parrot_reallocate_string(interp
, dest
, src_len
);
524 p
= (unsigned char *)dest
->strstart
;
526 if (src
->charset
== Parrot_ascii_charset_ptr
) {
527 for (dest_len
= 0; dest_len
< src_len
; ++dest_len
) {
528 p
[dest_len
] = ((unsigned char*)src
->strstart
)[dest_len
];
530 result
->bufused
= dest_len
;
535 for (offs
= 0; offs
< src_len
; ++offs
) {
536 const UINTVAL c
= src_iter
.get_and_advance(interp
, &src_iter
);
537 if (dest_len
- dest_pos
< 6) {
538 UINTVAL need
= (UINTVAL
)((src
->strlen
- offs
) * 1.5);
543 p
= (unsigned char *)mem_sys_realloc(p
, dest_len
);
545 result
->bufused
= dest_pos
;
546 Parrot_reallocate_string(interp
, dest
, dest_len
);
547 p
= (unsigned char *)dest
->strstart
;
552 new_pos
= (unsigned char *)utf8_encode(interp
, pos
, c
);
553 dest_pos
+= (new_pos
- pos
);
555 result
->bufused
= dest_pos
;
558 Parrot_reallocate_string(interp
, src
, src
->bufused
);
559 memcpy(src
->strstart
, p
, src
->bufused
);
567 =item C<static UINTVAL get_codepoint>
569 Returns the codepoint in string C<src> at position C<offset>.
/* Walks offset characters forward from src->strstart with
 * utf8_skip_forward() and returns the result of utf8_decode() on the
 * character found there.
 * NOTE(review): no range check on offset is apparent here -- confirm that
 * callers guarantee offset is inside the string. */
576 get_codepoint(PARROT_INTERP
, ARGIN(const STRING
*src
), UINTVAL offset
)
578 const utf8_t
* const start
= (const utf8_t
*)utf8_skip_forward(src
->strstart
, offset
);
579 return utf8_decode(interp
, start
);
584 =item C<static void set_codepoint>
586 Sets, in string C<src> at position C<offset>, the codepoint C<codepoint>.
/* Locates the character at index offset with utf8_skip_forward() and
 * writes the UTF-8 encoding of codepoint at that address using
 * utf8_encode().
 * NOTE(review): the new bytes are written in place over the existing ones;
 * no adjustment for a replacement whose encoded length differs from the
 * old character's is apparent here -- TODO confirm how callers handle
 * that case. */
593 set_codepoint(PARROT_INTERP
, ARGIN(STRING
*src
), UINTVAL offset
, UINTVAL codepoint
)
599 start
= utf8_skip_forward(src
->strstart
, offset
);
/* utf8_skip_forward() hands back a const pointer; drop the const so the
 * location can be written to. */
600 p
= PARROT_const_cast(void *, start
);
601 utf8_encode(interp
, p
, codepoint
);
606 =item C<static UINTVAL get_byte>
608 Returns the byte in string C<src> at position C<offset>.
615 get_byte(SHIM_INTERP
, ARGIN(const STRING
*src
), UINTVAL offset
)
617 unsigned char *contents
= (unsigned char *)src
->strstart
;
618 if (offset
>= src
->bufused
) {
619 /* Parrot_ex_throw_from_c_args(interp, NULL, 0,
620 "get_byte past the end of the buffer (%i of %i)",
621 offset, src->bufused); */
624 return contents
[offset
];
629 =item C<static void set_byte>
631 Sets, in string C<src> at position C<offset>, the byte C<byte>.
/* Stores the low byte of byte at byte index offset of src's buffer.
 * Throws via Parrot_ex_throw_from_c_args() when offset >= src->bufused.
 * NOTE(review): src is declared const STRING *, yet the buffer it points
 * to is modified through src->strstart. */
638 set_byte(PARROT_INTERP
, ARGIN(const STRING
*src
),
639 UINTVAL offset
, UINTVAL byte
)
641 unsigned char *contents
;
643 if (offset
>= src
->bufused
)
/* NOTE(review): the exception type passed here is a bare 0 rather than a
 * named EXCEPTION_* constant as used elsewhere in this file. */
644 Parrot_ex_throw_from_c_args(interp
, NULL
, 0,
645 "set_byte past the end of the buffer");
647 contents
= (unsigned char *)src
->strstart
;
/* The UINTVAL is narrowed to unsigned char by the cast. */
648 contents
[offset
] = (unsigned char)byte
;
653 =item C<static STRING * get_codepoints>
655 Returns the codepoints in string C<src> at position C<offset> and length
662 PARROT_CANNOT_RETURN_NULL
/* Returns a string obtained from Parrot_make_COW_reference(interp, src)
 * whose strstart/bufused are narrowed to the bytes holding count
 * codepoints starting at codepoint index offset.  The byte range is found
 * by positioning a string iterator at offset and then at offset + count.
 * NOTE(review): no range check on offset + count is apparent here --
 * TODO confirm callers validate it. */
664 get_codepoints(PARROT_INTERP
, ARGIN(STRING
*src
), UINTVAL offset
, UINTVAL count
)
667 STRING
* const return_string
= Parrot_make_COW_reference(interp
, src
);
671 iter_init(interp
, src
, &iter
);
/* Byte position of the first requested codepoint. */
674 iter
.set_position(interp
, &iter
, offset
);
676 start
= iter
.bytepos
;
677 return_string
->strstart
= (char *)return_string
->strstart
+ start
;
/* Byte position just past the last requested codepoint. */
680 iter
.set_position(interp
, &iter
, offset
+ count
);
682 return_string
->bufused
= iter
.bytepos
- start
;
683 return_string
->strlen
= count
;
684 return_string
->hashval
= 0;
686 return return_string
;
691 =item C<static STRING * get_bytes>
693 Returns the bytes in string C<src> at position C<offset> and length C<count>.
699 PARROT_CANNOT_RETURN_NULL
/* Returns a string obtained from Parrot_make_COW_reference(interp, src)
 * whose strstart is advanced by offset bytes and whose bufused is count.
 * The encoding and charset are copied from src.
 * NOTE(review): no range check on offset/count against src->bufused is
 * apparent here -- TODO confirm callers validate them. */
701 get_bytes(PARROT_INTERP
, ARGMOD(STRING
*src
), UINTVAL offset
, UINTVAL count
)
703 STRING
* const return_string
= Parrot_make_COW_reference(interp
, src
);
705 return_string
->encoding
= src
->encoding
; /* XXX */
706 return_string
->charset
= src
->charset
;
708 return_string
->strstart
= (char *)return_string
->strstart
+ offset
;
709 return_string
->bufused
= count
;
/* NOTE(review): strlen is set to the byte count while the encoding stays
 * src->encoding; for a slice containing multi-byte sequences this would
 * not equal the number of codepoints.  Presumably this is what the XXX
 * above flags -- confirm. */
711 return_string
->strlen
= count
;
712 return_string
->hashval
= 0;
714 return return_string
;
719 =item C<static STRING * get_codepoints_inplace>
721 Gets from string C<src> at position C<offset> C<count> codepoints and returns
722 them in C<return_string>.
728 PARROT_CANNOT_RETURN_NULL
/* Same slicing as get_codepoints(), but the result is placed in the
 * caller-supplied return_string (set up through
 * Parrot_reuse_COW_reference) instead of a newly made string.  Returns
 * return_string.
 * NOTE(review): no range check on offset + count is apparent here --
 * TODO confirm callers validate it. */
730 get_codepoints_inplace(PARROT_INTERP
, ARGMOD(STRING
*src
),
731 UINTVAL offset
, UINTVAL count
, ARGMOD(STRING
*return_string
))
736 Parrot_reuse_COW_reference(interp
, src
, return_string
);
737 iter_init(interp
, src
, &iter
);
/* Byte position of the first requested codepoint. */
738 iter
.set_position(interp
, &iter
, offset
);
740 start
= iter
.bytepos
;
742 return_string
->strstart
= (char *)return_string
->strstart
+ start
;
/* Byte position just past the last requested codepoint. */
743 iter
.set_position(interp
, &iter
, offset
+ count
);
745 return_string
->bufused
= iter
.bytepos
- start
;
746 return_string
->strlen
= count
;
747 return_string
->hashval
= 0;
749 return return_string
;
754 =item C<static STRING * get_bytes_inplace>
756 Gets from string C<src> at position C<offset> C<count> bytes and returns them
763 PARROT_CANNOT_RETURN_NULL
765 get_bytes_inplace(PARROT_INTERP
, SHIM(STRING
*src
),
766 UINTVAL offset
, UINTVAL count
, SHIM(STRING
*return_string
))
773 =item C<static void set_codepoints>
775 Replaces in string C<src> at position C<offset> for C<count> codepoints with
776 the contents of string C<new_codepoints>.
783 set_codepoints(PARROT_INTERP
, SHIM(STRING
*src
),
784 UINTVAL offset
, UINTVAL count
, SHIM(STRING
*new_codepoints
))
791 =item C<static void set_bytes>
793 Replaces in string C<src> at position C<offset> for C<count> bytes with the
794 contents of string C<new_bytes>.
801 set_bytes(PARROT_INTERP
, SHIM(STRING
*src
),
802 UINTVAL offset
, UINTVAL count
, SHIM(STRING
*new_bytes
))
809 =item C<static void become_encoding>
811 Unconditionally makes the string be in this encoding, if that's valid
818 become_encoding(PARROT_INTERP
, SHIM(STRING
*src
))
826 =item C<static UINTVAL codepoints>
828 Returns the number of codepoints in string C<src>.
835 codepoints(PARROT_INTERP
, ARGMOD(STRING
*src
))
839 * this is used to initially calculate src->strlen,
840 * therefore we must scan the whole string
842 iter_init(interp
, src
, &iter
);
843 while (iter
.bytepos
< src
->bufused
)
844 iter
.get_and_advance(interp
, &iter
);
850 =item C<static UINTVAL bytes>
852 Returns the number of bytes in string C<src>.
860 bytes(SHIM_INTERP
, ARGIN(STRING
*src
))
867 =item C<static void iter_init>
869 Initializes for string C<src> the string iterator C<iter>.
876 iter_init(SHIM_INTERP
, ARGIN(const STRING
*src
), ARGOUT(String_iter
*iter
))
881 iter
->get_and_advance
= utf8_decode_and_advance
;
882 iter
->set_and_advance
= utf8_encode_and_advance
;
883 iter
->set_position
= utf8_set_position
;
888 =item C<ENCODING * Parrot_encoding_utf8_init>
890 Initializes the UTF-8 encoding.
896 PARROT_CANNOT_RETURN_NULL
898 Parrot_encoding_utf8_init(PARROT_INTERP
)
900 ENCODING
* const return_encoding
= Parrot_new_encoding(interp
);
902 static const ENCODING base_encoding
= {
904 4, /* Max bytes per codepoint 0 .. 0x10ffff */
911 get_codepoints_inplace
,
921 STRUCT_COPY_FROM_STRUCT(return_encoding
, base_encoding
);
922 Parrot_register_encoding(interp
, "utf8", return_encoding
);
923 return return_encoding
;
932 F<src/encodings/fixed_8.c>,
934 F<include/parrot/string.h>,
944 * c-file-style: "parrot"
946 * vim: expandtab shiftwidth=4: