Starting release 0.7.0
[parrot.git] / src / encodings / utf8.c
blob60960fb827ac549136e8b86cd132a43bf03df29a
1 /*
2 Copyright (C) 2001-2008, The Perl Foundation.
3 $Id$
5 =head1 NAME
7 src/encodings/utf8.c - UTF-8 encoding
9 =head1 DESCRIPTION
11 UTF-8 (L<http://www.utf-8.com/>).
13 =head2 Functions
15 =over 4
17 =cut
21 #include "parrot/parrot.h"
22 #include "../unicode.h"
23 #include "utf8.h"
25 /* HEADERIZER HFILE: src/encodings/utf8.h */
27 /* HEADERIZER BEGIN: static */
28 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
30 static void become_encoding(PARROT_INTERP, SHIM(STRING *src))
31 __attribute__nonnull__(1);
33 PARROT_PURE_FUNCTION
34 static UINTVAL bytes(SHIM_INTERP, ARGIN(STRING *src))
35 __attribute__nonnull__(2);
37 static UINTVAL codepoints(PARROT_INTERP, ARGMOD(STRING *src))
38 __attribute__nonnull__(1)
39 __attribute__nonnull__(2)
40 FUNC_MODIFIES(*src);
42 static UINTVAL get_byte(SHIM_INTERP,
43 ARGIN(const STRING *src),
44 UINTVAL offset)
45 __attribute__nonnull__(2);
47 PARROT_CANNOT_RETURN_NULL
48 static STRING * get_bytes(PARROT_INTERP,
49 ARGMOD(STRING *src),
50 UINTVAL offset,
51 UINTVAL count)
52 __attribute__nonnull__(1)
53 __attribute__nonnull__(2)
54 FUNC_MODIFIES(*src);
56 PARROT_CANNOT_RETURN_NULL
57 static STRING * get_bytes_inplace(PARROT_INTERP,
58 SHIM(STRING *src),
59 UINTVAL offset,
60 UINTVAL count,
61 SHIM(STRING *return_string))
62 __attribute__nonnull__(1);
64 static UINTVAL get_codepoint(PARROT_INTERP,
65 ARGIN(const STRING *src),
66 UINTVAL offset)
67 __attribute__nonnull__(1)
68 __attribute__nonnull__(2);
70 PARROT_CANNOT_RETURN_NULL
71 static STRING * get_codepoints(PARROT_INTERP,
72 ARGIN(STRING *src),
73 UINTVAL offset,
74 UINTVAL count)
75 __attribute__nonnull__(1)
76 __attribute__nonnull__(2);
78 PARROT_CANNOT_RETURN_NULL
79 static STRING * get_codepoints_inplace(PARROT_INTERP,
80 ARGMOD(STRING *src),
81 UINTVAL offset,
82 UINTVAL count,
83 ARGMOD(STRING *return_string))
84 __attribute__nonnull__(1)
85 __attribute__nonnull__(2)
86 __attribute__nonnull__(5)
87 FUNC_MODIFIES(*src)
88 FUNC_MODIFIES(*return_string);
90 static void iter_init(SHIM_INTERP,
91 ARGIN(const STRING *src),
92 ARGOUT(String_iter *iter))
93 __attribute__nonnull__(2)
94 __attribute__nonnull__(3)
95 FUNC_MODIFIES(*iter);
97 static void set_byte(PARROT_INTERP,
98 ARGIN(const STRING *src),
99 UINTVAL offset,
100 UINTVAL byte)
101 __attribute__nonnull__(1)
102 __attribute__nonnull__(2);
104 static void set_bytes(PARROT_INTERP,
105 SHIM(STRING *src),
106 UINTVAL offset,
107 UINTVAL count,
108 SHIM(STRING *new_bytes))
109 __attribute__nonnull__(1);
111 static void set_codepoint(PARROT_INTERP,
112 ARGIN(STRING *src),
113 UINTVAL offset,
114 UINTVAL codepoint)
115 __attribute__nonnull__(1)
116 __attribute__nonnull__(2);
118 static void set_codepoints(PARROT_INTERP,
119 SHIM(STRING *src),
120 UINTVAL offset,
121 UINTVAL count,
122 SHIM(STRING *new_codepoints))
123 __attribute__nonnull__(1);
125 PARROT_CAN_RETURN_NULL
126 static STRING * to_encoding(PARROT_INTERP,
127 ARGMOD(STRING *src),
128 ARGMOD_NULLOK(STRING *dest))
129 __attribute__nonnull__(1)
130 __attribute__nonnull__(2)
131 FUNC_MODIFIES(*src);
133 static UINTVAL utf8_characters(PARROT_INTERP,
134 ARGIN(const utf8_t *ptr),
135 UINTVAL byte_len)
136 __attribute__nonnull__(1)
137 __attribute__nonnull__(2);
139 static UINTVAL utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr))
140 __attribute__nonnull__(1)
141 __attribute__nonnull__(2);
143 static UINTVAL utf8_decode_and_advance(PARROT_INTERP,
144 ARGMOD(String_iter *i))
145 __attribute__nonnull__(1)
146 __attribute__nonnull__(2)
147 FUNC_MODIFIES(*i);
149 PARROT_CANNOT_RETURN_NULL
150 static void * utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
151 __attribute__nonnull__(1)
152 __attribute__nonnull__(2);
154 static void utf8_encode_and_advance(PARROT_INTERP,
155 ARGMOD(String_iter *i),
156 UINTVAL c)
157 __attribute__nonnull__(1)
158 __attribute__nonnull__(2)
159 FUNC_MODIFIES(*i);
161 static void utf8_set_position(SHIM_INTERP,
162 ARGMOD(String_iter *i),
163 UINTVAL pos)
164 __attribute__nonnull__(2)
165 FUNC_MODIFIES(*i);
167 PARROT_WARN_UNUSED_RESULT
168 PARROT_CANNOT_RETURN_NULL
169 static const void * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
170 __attribute__nonnull__(1);
172 PARROT_CANNOT_RETURN_NULL
173 static const void * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
174 __attribute__nonnull__(1);
176 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
177 /* HEADERIZER END: static */
/* Body shared by the entry points this encoding does not implement: throws
 * EXCEPTION_UNIMPLEMENTED.  Relies on a variable named interp being in scope
 * at the point of use. */
#define UNIMPL \
    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED, "unimpl utf8")
/* Number of bytes to step over, indexed by the value of the byte a sequence
 * starts with.  One row per 16 byte values. */
const char Parrot_utf8skip[256] = {
    /* 0x00 - 0x0F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 - 0x9F  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0 - 0xAF  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 - 0xBF  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 - 0xCF  scripts  */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF  scripts  */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF  cjk etc. */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF  cjk etc. */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6
};
201 #if 0
202 typedef unsigned char utf8_t;
203 #endif
207 =item C<static UINTVAL utf8_characters>
209 Returns the number of characters in the C<byte_len> bytes from C<*ptr>.
211 =cut
215 static UINTVAL
216 utf8_characters(PARROT_INTERP, ARGIN(const utf8_t *ptr), UINTVAL byte_len)
218 const utf8_t *u8ptr = ptr;
219 const utf8_t *u8end = u8ptr + byte_len;
220 UINTVAL characters = 0;
222 while (u8ptr < u8end) {
223 u8ptr += UTF8SKIP(u8ptr);
224 characters++;
227 if (u8ptr > u8end)
228 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
229 "Unaligned end in UTF-8 string\n");
231 return characters;
236 =item C<static UINTVAL utf8_decode>
238 Returns the integer for the UTF-8 character found at C<*ptr>.
240 =cut
244 static UINTVAL
245 utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr))
247 const utf8_t *u8ptr = ptr;
248 UINTVAL c = *u8ptr;
250 if (UTF8_IS_START(c)) {
251 UINTVAL len = UTF8SKIP(u8ptr);
252 UINTVAL count;
254 c &= UTF8_START_MASK(len);
255 for (count = 1; count < len; count++) {
256 u8ptr++;
258 if (!UTF8_IS_CONTINUATION(*u8ptr))
259 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
260 "Malformed UTF-8 string\n");
262 c = UTF8_ACCUMULATE(c, *u8ptr);
265 if (UNICODE_IS_SURROGATE(c))
266 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
267 "Surrogate in UTF-8 string\n");
269 else if (!UNICODE_IS_INVARIANT(c)) {
270 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
271 "Malformed UTF-8 string\n");
274 return c;
279 =item C<static void * utf8_encode>
281 Returns the UTF-8 encoding of integer C<c>.
283 =cut
287 PARROT_CANNOT_RETURN_NULL
288 static void *
289 utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
291 const UINTVAL len = UNISKIP(c);
293 /* the const is good on u8ptr, but using ptr on other variables avoids the
294 * need to do a yucky cast to remove constness */
295 const utf8_t * const u8ptr = (utf8_t *)ptr;
296 utf8_t *u8end = (utf8_t *)ptr + len - 1;
298 if (c > 0x10FFFF || UNICODE_IS_SURROGATE(c)) {
299 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
300 "Invalid character for UTF-8 encoding\n");
303 while (u8end > u8ptr) {
304 *u8end-- =
305 (utf8_t)((c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK);
306 c >>= UTF8_ACCUMULATION_SHIFT;
308 *u8end = (utf8_t)((c & UTF8_START_MASK(len)) | UTF8_START_MARK(len));
310 return (utf8_t *)ptr + len;
315 =item C<static const void * utf8_skip_forward>
317 Moves C<ptr> C<n> characters forward.
319 =cut
323 PARROT_CANNOT_RETURN_NULL
324 static const void *
325 utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
327 const utf8_t *u8ptr = (const utf8_t *)ptr;
329 while (n-- > 0) {
330 u8ptr += UTF8SKIP(u8ptr);
333 return u8ptr;
338 =item C<static const void * utf8_skip_backward>
340 Moves C<ptr> C<n> characters back.
342 =cut
346 PARROT_WARN_UNUSED_RESULT
347 PARROT_CANNOT_RETURN_NULL
348 static const void *
349 utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
351 const utf8_t *u8ptr = (const utf8_t *)ptr;
353 while (n-- > 0) {
354 u8ptr--;
355 while (UTF8_IS_CONTINUATION(*u8ptr))
356 u8ptr--;
359 return u8ptr;
364 =back
366 =head2 Iterator Functions
368 =over 4
370 =cut
376 =item C<static UINTVAL utf8_decode_and_advance>
378 The UTF-8 implementation of the string iterator's C<get_and_advance>
379 function.
383 static UINTVAL
384 utf8_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
386 const utf8_t *u8ptr = (utf8_t *)((char *)i->str->strstart + i->bytepos);
387 UINTVAL c = *u8ptr;
389 if (UTF8_IS_START(c)) {
390 UINTVAL len = UTF8SKIP(u8ptr);
392 c &= UTF8_START_MASK(len);
393 i->bytepos += len;
394 for (len--; len; len--) {
395 u8ptr++;
397 if (!UTF8_IS_CONTINUATION(*u8ptr))
398 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
399 "Malformed UTF-8 string\n");
401 c = UTF8_ACCUMULATE(c, *u8ptr);
404 if (UNICODE_IS_SURROGATE(c))
405 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
406 "Surrogate in UTF-8 string\n");
408 else if (!UNICODE_IS_INVARIANT(c)) {
409 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
410 "Malformed UTF-8 string\n");
412 else {
413 i->bytepos++;
416 i->charpos++;
417 return c;
422 =item C<static void utf8_encode_and_advance>
424 The UTF-8 implementation of the string iterator's C<set_and_advance>
425 function.
427 =cut
431 static void
432 utf8_encode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL c)
434 const STRING * const s = i->str;
435 unsigned char * const pos = (unsigned char *)s->strstart + i->bytepos;
436 unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c);
438 i->bytepos += (new_pos - pos);
439 /* XXX possible buffer overrun exception? */
440 PARROT_ASSERT(i->bytepos <= PObj_buflen(s));
441 i->charpos++;
446 =item C<static void utf8_set_position>
448 The UTF-8 implementation of the string iterator's C<set_position>
449 function.
451 =cut
455 static void
456 utf8_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL pos)
458 const utf8_t *u8ptr = (const utf8_t *)i->str->strstart;
460 /* start from last known charpos, if we can */
461 if (i->charpos <= pos) {
462 const UINTVAL old_pos = pos;
463 pos -= i->charpos;
464 u8ptr += i->bytepos;
465 i->charpos = old_pos;
467 else
468 i->charpos = pos;
470 while (pos-- > 0)
471 u8ptr += UTF8SKIP(u8ptr);
473 i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
479 =item C<static STRING * to_encoding>
481 Converts the string C<src> to this particular encoding. If C<dest> is
482 provided, it will contain the result. Otherwise this function operates in
483 place.
485 =cut
489 PARROT_CAN_RETURN_NULL
490 static STRING *
491 to_encoding(PARROT_INTERP, ARGMOD(STRING *src), ARGMOD_NULLOK(STRING *dest))
493 STRING *result;
494 String_iter src_iter;
495 UINTVAL offs, dest_len, dest_pos, src_len;
496 const int in_place = (dest == NULL);
497 unsigned char *new_pos, *pos, *p;
499 if (src->encoding == Parrot_utf8_encoding_ptr)
500 return in_place ? src : string_copy(interp, src);
501 src_len = src->strlen;
502 if (in_place) {
503 result = src;
505 else {
506 result = dest;
509 /* init iter before possilby changing encoding */
510 ENCODING_ITER_INIT(interp, src, &src_iter);
511 result->charset = Parrot_unicode_charset_ptr;
512 result->encoding = Parrot_utf8_encoding_ptr;
513 result->strlen = src_len;
515 if (!src->strlen)
516 return dest;
518 if (in_place) {
519 /* need intermediate memory */
520 p = (unsigned char *)mem_sys_allocate(src_len);
522 else {
523 Parrot_reallocate_string(interp, dest, src_len);
524 p = (unsigned char *)dest->strstart;
526 if (src->charset == Parrot_ascii_charset_ptr) {
527 for (dest_len = 0; dest_len < src_len; ++dest_len) {
528 p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
530 result->bufused = dest_len;
532 else {
533 dest_len = src_len;
534 dest_pos = 0;
535 for (offs = 0; offs < src_len; ++offs) {
536 const UINTVAL c = src_iter.get_and_advance(interp, &src_iter);
537 if (dest_len - dest_pos < 6) {
538 UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
539 if (need < 16)
540 need = 16;
541 dest_len += need;
542 if (in_place)
543 p = (unsigned char *)mem_sys_realloc(p, dest_len);
544 else {
545 result->bufused = dest_pos;
546 Parrot_reallocate_string(interp, dest, dest_len);
547 p = (unsigned char *)dest->strstart;
551 pos = p + dest_pos;
552 new_pos = (unsigned char *)utf8_encode(interp, pos, c);
553 dest_pos += (new_pos - pos);
555 result->bufused = dest_pos;
557 if (in_place) {
558 Parrot_reallocate_string(interp, src, src->bufused);
559 memcpy(src->strstart, p, src->bufused);
560 mem_sys_free(p);
562 return result;
567 =item C<static UINTVAL get_codepoint>
569 Returns the codepoint in string C<src> at position C<offset>.
571 =cut
575 static UINTVAL
576 get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
578 const utf8_t * const start = (const utf8_t *)utf8_skip_forward(src->strstart, offset);
579 return utf8_decode(interp, start);
584 =item C<static void set_codepoint>
586 Sets, in string C<src> at position C<offset>, the codepoint C<codepoint>.
588 =cut
592 static void
593 set_codepoint(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL codepoint)
595 const void *start;
596 void *p;
597 DECL_CONST_CAST;
599 start = utf8_skip_forward(src->strstart, offset);
600 p = PARROT_const_cast(void *, start);
601 utf8_encode(interp, p, codepoint);
606 =item C<static UINTVAL get_byte>
608 Returns the byte in string C<src> at position C<offset>.
610 =cut
614 static UINTVAL
615 get_byte(SHIM_INTERP, ARGIN(const STRING *src), UINTVAL offset)
617 unsigned char *contents = (unsigned char *)src->strstart;
618 if (offset >= src->bufused) {
619 /* Parrot_ex_throw_from_c_args(interp, NULL, 0,
620 "get_byte past the end of the buffer (%i of %i)",
621 offset, src->bufused); */
622 return 0;
624 return contents[offset];
629 =item C<static void set_byte>
631 Sets, in string C<src> at position C<offset>, the byte C<byte>.
633 =cut
637 static void
638 set_byte(PARROT_INTERP, ARGIN(const STRING *src),
639 UINTVAL offset, UINTVAL byte)
641 unsigned char *contents;
643 if (offset >= src->bufused)
644 Parrot_ex_throw_from_c_args(interp, NULL, 0,
645 "set_byte past the end of the buffer");
647 contents = (unsigned char *)src->strstart;
648 contents[offset] = (unsigned char)byte;
653 =item C<static STRING * get_codepoints>
655 Returns the codepoints in string C<src> at position C<offset> and length
656 C<count>.
658 =cut
662 PARROT_CANNOT_RETURN_NULL
663 static STRING *
664 get_codepoints(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL count)
667 STRING * const return_string = Parrot_make_COW_reference(interp, src);
668 String_iter iter;
669 UINTVAL start;
671 iter_init(interp, src, &iter);
673 if (offset)
674 iter.set_position(interp, &iter, offset);
676 start = iter.bytepos;
677 return_string->strstart = (char *)return_string->strstart + start;
679 if (count)
680 iter.set_position(interp, &iter, offset + count);
682 return_string->bufused = iter.bytepos - start;
683 return_string->strlen = count;
684 return_string->hashval = 0;
686 return return_string;
691 =item C<static STRING * get_bytes>
693 Returns the bytes in string C<src> at position C<offset> and length C<count>.
695 =cut
699 PARROT_CANNOT_RETURN_NULL
700 static STRING *
701 get_bytes(PARROT_INTERP, ARGMOD(STRING *src), UINTVAL offset, UINTVAL count)
703 STRING * const return_string = Parrot_make_COW_reference(interp, src);
705 return_string->encoding = src->encoding; /* XXX */
706 return_string->charset = src->charset;
708 return_string->strstart = (char *)return_string->strstart + offset ;
709 return_string->bufused = count;
711 return_string->strlen = count;
712 return_string->hashval = 0;
714 return return_string;
719 =item C<static STRING * get_codepoints_inplace>
721 Gets from string C<src> at position C<offset> C<count> codepoints and returns
722 them in C<return_string>.
724 =cut
728 PARROT_CANNOT_RETURN_NULL
729 static STRING *
730 get_codepoints_inplace(PARROT_INTERP, ARGMOD(STRING *src),
731 UINTVAL offset, UINTVAL count, ARGMOD(STRING *return_string))
733 String_iter iter;
734 UINTVAL start;
736 Parrot_reuse_COW_reference(interp, src, return_string);
737 iter_init(interp, src, &iter);
738 iter.set_position(interp, &iter, offset);
740 start = iter.bytepos;
742 return_string->strstart = (char *)return_string->strstart + start;
743 iter.set_position(interp, &iter, offset + count);
745 return_string->bufused = iter.bytepos - start;
746 return_string->strlen = count;
747 return_string->hashval = 0;
749 return return_string;
754 =item C<static STRING * get_bytes_inplace>
756 Gets from string C<src> at position C<offset> C<count> bytes and returns them
757 in C<return_string>.
759 =cut
763 PARROT_CANNOT_RETURN_NULL
764 static STRING *
765 get_bytes_inplace(PARROT_INTERP, SHIM(STRING *src),
766 UINTVAL offset, UINTVAL count, SHIM(STRING *return_string))
768 UNIMPL;
773 =item C<static void set_codepoints>
775 Replaces in string C<src> at position C<offset> for C<count> codepoints with
776 the contents of string C<new_codepoints>.
778 =cut
782 static void
783 set_codepoints(PARROT_INTERP, SHIM(STRING *src),
784 UINTVAL offset, UINTVAL count, SHIM(STRING *new_codepoints))
786 UNIMPL;
791 =item C<static void set_bytes>
793 Replaces in string C<src> at position C<offset> for C<count> bytes with the
794 contents of string C<new_bytes>.
796 =cut
800 static void
801 set_bytes(PARROT_INTERP, SHIM(STRING *src),
802 UINTVAL offset, UINTVAL count, SHIM(STRING *new_bytes))
804 UNIMPL;
809 =item C<static void become_encoding>
811 Unconditionally makes the string be in this encoding, if that's valid
813 =cut
817 static void
818 become_encoding(PARROT_INTERP, SHIM(STRING *src))
820 UNIMPL;
826 =item C<static UINTVAL codepoints>
828 Returns the number of codepoints in string C<src>.
830 =cut
834 static UINTVAL
835 codepoints(PARROT_INTERP, ARGMOD(STRING *src))
837 String_iter iter;
839 * this is used to initially calculate src->strlen,
840 * therefore we must scan the whole string
842 iter_init(interp, src, &iter);
843 while (iter.bytepos < src->bufused)
844 iter.get_and_advance(interp, &iter);
845 return iter.charpos;
850 =item C<static UINTVAL bytes>
852 Returns the number of bytes in string C<src>.
854 =cut
858 PARROT_PURE_FUNCTION
859 static UINTVAL
860 bytes(SHIM_INTERP, ARGIN(STRING *src))
862 return src->bufused;
867 =item C<static void iter_init>
869 Initializes for string C<src> the string iterator C<iter>.
871 =cut
875 static void
876 iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
878 iter->str = src;
879 iter->bytepos = 0;
880 iter->charpos = 0;
881 iter->get_and_advance = utf8_decode_and_advance;
882 iter->set_and_advance = utf8_encode_and_advance;
883 iter->set_position = utf8_set_position;
888 =item C<ENCODING * Parrot_encoding_utf8_init>
890 Initializes the UTF-8 encoding.
892 =cut
896 PARROT_CANNOT_RETURN_NULL
897 ENCODING *
898 Parrot_encoding_utf8_init(PARROT_INTERP)
900 ENCODING * const return_encoding = Parrot_new_encoding(interp);
902 static const ENCODING base_encoding = {
903 "utf8",
904 4, /* Max bytes per codepoint 0 .. 0x10ffff */
905 to_encoding,
906 get_codepoint,
907 set_codepoint,
908 get_byte,
909 set_byte,
910 get_codepoints,
911 get_codepoints_inplace,
912 get_bytes,
913 get_bytes_inplace,
914 set_codepoints,
915 set_bytes,
916 become_encoding,
917 codepoints,
918 bytes,
919 iter_init
921 STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
922 Parrot_register_encoding(interp, "utf8", return_encoding);
923 return return_encoding;
928 =back
930 =head1 SEE ALSO
932 F<src/encodings/fixed_8.c>,
933 F<src/string.c>,
934 F<include/parrot/string.h>,
935 F<docs/string.pod>.
937 =cut
943 * Local variables:
944 * c-file-style: "parrot"
945 * End:
946 * vim: expandtab shiftwidth=4: