src/encodings/utf8.c

   1 /*
   2 Copyright (C) 2001-2008, The Perl Foundation.
   3 $Id$
   4
   5 =head1 NAME
   6
   7 src/encodings/utf8.c - UTF-8 encoding
   8
   9 =head1 DESCRIPTION
  10
  11 UTF-8 (L<http://www.utf-8.com/>).
  12
  13 =head2 Functions
  14
  15 =over 4
  16
  17 =cut
  18
  19 */
  20
  21 #include "parrot/parrot.h"
  22 #include "../unicode.h"
  23 #include "utf8.h"
  24
  25 /* HEADERIZER HFILE: src/encodings/utf8.h */
  26
  27 /* HEADERIZER BEGIN: static */
  28 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
  29
  30 static void become_encoding(PARROT_INTERP, SHIM(STRING *src))
  31         __attribute__nonnull__(1);
  32
  33 PARROT_PURE_FUNCTION
  34 static UINTVAL bytes(SHIM_INTERP, ARGIN(STRING *src))
  35         __attribute__nonnull__(2);
  36
  37 static UINTVAL codepoints(PARROT_INTERP, ARGMOD(STRING *src))
  38         __attribute__nonnull__(1)
  39         __attribute__nonnull__(2)
  40         FUNC_MODIFIES(*src);
  41
  42 static UINTVAL get_byte(SHIM_INTERP,
  43     ARGIN(const STRING *src),
  44     UINTVAL offset)
  45         __attribute__nonnull__(2);
  46
  47 PARROT_CANNOT_RETURN_NULL
  48 static STRING * get_bytes(PARROT_INTERP,
  49     ARGMOD(STRING *src),
  50     UINTVAL offset,
  51     UINTVAL count)
  52         __attribute__nonnull__(1)
  53         __attribute__nonnull__(2)
  54         FUNC_MODIFIES(*src);
  55
  56 PARROT_CANNOT_RETURN_NULL
  57 static STRING * get_bytes_inplace(PARROT_INTERP,
  58     SHIM(STRING *src),
  59     UINTVAL offset,
  60     UINTVAL count,
  61     SHIM(STRING *return_string))
  62         __attribute__nonnull__(1);
  63
  64 static UINTVAL get_codepoint(PARROT_INTERP,
  65     ARGIN(const STRING *src),
  66     UINTVAL offset)
  67         __attribute__nonnull__(1)
  68         __attribute__nonnull__(2);
  69
  70 PARROT_CANNOT_RETURN_NULL
  71 static STRING * get_codepoints(PARROT_INTERP,
  72     ARGIN(STRING *src),
  73     UINTVAL offset,
  74     UINTVAL count)
  75         __attribute__nonnull__(1)
  76         __attribute__nonnull__(2);
  77
  78 PARROT_CANNOT_RETURN_NULL
  79 static STRING * get_codepoints_inplace(PARROT_INTERP,
  80     ARGMOD(STRING *src),
  81     UINTVAL offset,
  82     UINTVAL count,
  83     ARGMOD(STRING *return_string))
  84         __attribute__nonnull__(1)
  85         __attribute__nonnull__(2)
  86         __attribute__nonnull__(5)
  87         FUNC_MODIFIES(*src)
  88         FUNC_MODIFIES(*return_string);
  89
  90 static void iter_init(SHIM_INTERP,
  91     ARGIN(const STRING *src),
  92     ARGOUT(String_iter *iter))
  93         __attribute__nonnull__(2)
  94         __attribute__nonnull__(3)
  95         FUNC_MODIFIES(*iter);
  96
  97 static void set_byte(PARROT_INTERP,
  98     ARGIN(const STRING *src),
  99     UINTVAL offset,
 100     UINTVAL byte)
 101         __attribute__nonnull__(1)
 102         __attribute__nonnull__(2);
 103
 104 static void set_bytes(PARROT_INTERP,
 105     SHIM(STRING *src),
 106     UINTVAL offset,
 107     UINTVAL count,
 108     SHIM(STRING *new_bytes))
 109         __attribute__nonnull__(1);
 110
 111 static void set_codepoint(PARROT_INTERP,
 112     ARGIN(STRING *src),
 113     UINTVAL offset,
 114     UINTVAL codepoint)
 115         __attribute__nonnull__(1)
 116         __attribute__nonnull__(2);
 117
 118 static void set_codepoints(PARROT_INTERP,
 119     SHIM(STRING *src),
 120     UINTVAL offset,
 121     UINTVAL count,
 122     SHIM(STRING *new_codepoints))
 123         __attribute__nonnull__(1);
 124
 125 PARROT_CAN_RETURN_NULL
 126 static STRING * to_encoding(PARROT_INTERP,
 127     ARGMOD(STRING *src),
 128     ARGMOD_NULLOK(STRING *dest))
 129         __attribute__nonnull__(1)
 130         __attribute__nonnull__(2)
 131         FUNC_MODIFIES(*src);
 132
 133 static UINTVAL utf8_characters(PARROT_INTERP,
 134     ARGIN(const utf8_t *ptr),
 135     UINTVAL byte_len)
 136         __attribute__nonnull__(1)
 137         __attribute__nonnull__(2);
 138
 139 static UINTVAL utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr))
 140         __attribute__nonnull__(1)
 141         __attribute__nonnull__(2);
 142
 143 static UINTVAL utf8_decode_and_advance(PARROT_INTERP,
 144     ARGMOD(String_iter *i))
 145         __attribute__nonnull__(1)
 146         __attribute__nonnull__(2)
 147         FUNC_MODIFIES(*i);
 148
 149 PARROT_CANNOT_RETURN_NULL
 150 static void * utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
 151         __attribute__nonnull__(1)
 152         __attribute__nonnull__(2);
 153
 154 static void utf8_encode_and_advance(PARROT_INTERP,
 155     ARGMOD(String_iter *i),
 156     UINTVAL c)
 157         __attribute__nonnull__(1)
 158         __attribute__nonnull__(2)
 159         FUNC_MODIFIES(*i);
 160
 161 static void utf8_set_position(SHIM_INTERP,
 162     ARGMOD(String_iter *i),
 163     UINTVAL pos)
 164         __attribute__nonnull__(2)
 165         FUNC_MODIFIES(*i);
 166
 167 PARROT_WARN_UNUSED_RESULT
 168 PARROT_CANNOT_RETURN_NULL
 169 static const void * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
 170         __attribute__nonnull__(1);
 171
 172 PARROT_CANNOT_RETURN_NULL
 173 static const void * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
 174         __attribute__nonnull__(1);
 175
 176 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 177 /* HEADERIZER END: static */
 178
 179 #define UNIMPL Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED, \
 180     "unimpl utf8")
 181
 182 const char Parrot_utf8skip[256] = {
 183     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 184     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 185     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 186     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 187     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 188     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 189     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 190     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* ascii */
 191     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* bogus */
 192     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* bogus */
 193     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* bogus */
 194     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* bogus */
 195     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* scripts */
 196     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* scripts */
 197     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* cjk etc. */
 198     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6      /* cjk etc. */
 199 };
 200
 201 #if 0
 202 typedef unsigned char utf8_t;
 203 #endif
 204
 205 /*
 206
 207 =item C<static UINTVAL utf8_characters>
 208
 209 Returns the number of characters in the C<byte_len> bytes from C<*ptr>.
 210
 211 =cut
 212
 213 */
 214
 215 static UINTVAL
 216 utf8_characters(PARROT_INTERP, ARGIN(const utf8_t *ptr), UINTVAL byte_len)
 217 {
 218     const utf8_t *u8ptr = ptr;
 219     const utf8_t *u8end = u8ptr + byte_len;
 220     UINTVAL characters = 0;
 221
 222     while (u8ptr < u8end) {
 223         u8ptr += UTF8SKIP(u8ptr);
 224         characters++;
 225     }
 226
 227     if (u8ptr > u8end)
 228         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
 229             "Unaligned end in UTF-8 string\n");
 230
 231     return characters;
 232 }
 233
 234 /*
 235
 236 =item C<static UINTVAL utf8_decode>
 237
 238 Returns the integer for the UTF-8 character found at C<*ptr>.
 239
 240 =cut
 241
 242 */
 243
 244 static UINTVAL
 245 utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr))
 246 {
 247     const utf8_t *u8ptr = ptr;
 248     UINTVAL c = *u8ptr;
 249
 250     if (UTF8_IS_START(c)) {
 251         UINTVAL len = UTF8SKIP(u8ptr);
 252         UINTVAL count;
 253
 254         c &= UTF8_START_MASK(len);
 255         for (count = 1; count < len; count++) {
 256             u8ptr++;
 257
 258             if (!UTF8_IS_CONTINUATION(*u8ptr))
 259                 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
 260                     "Malformed UTF-8 string\n");
 261
 262             c = UTF8_ACCUMULATE(c, *u8ptr);
 263         }
 264
 265         if (UNICODE_IS_SURROGATE(c))
 266             Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
 267                 "Surrogate in UTF-8 string\n");
 268     }
 269     else if (!UNICODE_IS_INVARIANT(c)) {
 270         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
 271             "Malformed UTF-8 string\n");
 272     }
 273
 274     return c;
 275 }
 276
 277 /*
 278
 279 =item C<static void * utf8_encode>
 280
 281 Returns the UTF-8 encoding of integer C<c>.
 282
 283 =cut
 284
 285 */
 286
 287 PARROT_CANNOT_RETURN_NULL
 288 static void *
 289 utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
 290 {
 291     const UINTVAL        len   = UNISKIP(c);
 292
 293     /* the const is good on u8ptr, but using ptr on other variables avoids the
 294      * need to do a yucky cast to remove constness */
 295     const utf8_t * const u8ptr = (utf8_t *)ptr;
 296     utf8_t              *u8end = (utf8_t *)ptr + len - 1;
 297
 298     if (c > 0x10FFFF || UNICODE_IS_SURROGATE(c)) {
 299         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
 300                            "Invalid character for UTF-8 encoding\n");
 301     }
 302
 303     while (u8end > u8ptr) {
 304         *u8end-- =
 305             (utf8_t)((c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK);
 306         c >>= UTF8_ACCUMULATION_SHIFT;
 307     }
 308     *u8end = (utf8_t)((c & UTF8_START_MASK(len)) | UTF8_START_MARK(len));
 309
 310     return (utf8_t *)ptr + len;
 311 }
 312
 313 /*
 314
 315 =item C<static const void * utf8_skip_forward>
 316
 317 Moves C<ptr> C<n> characters forward.
 318
 319 =cut
 320
 321 */
 322
 323 PARROT_CANNOT_RETURN_NULL
 324 static const void *
 325 utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
 326 {
 327     const utf8_t *u8ptr = (const utf8_t *)ptr;
 328
 329     while (n-- > 0) {
 330         u8ptr += UTF8SKIP(u8ptr);
 331     }
 332
 333     return u8ptr;
 334 }
 335
 336 /*
 337
 338 =item C<static const void * utf8_skip_backward>
 339
 340 Moves C<ptr> C<n> characters back.
 341
 342 =cut
 343
 344 */
 345
 346 PARROT_WARN_UNUSED_RESULT
 347 PARROT_CANNOT_RETURN_NULL
 348 static const void *
 349 utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
 350 {
 351     const utf8_t *u8ptr = (const utf8_t *)ptr;
 352
 353     while (n-- > 0) {
 354         u8ptr--;
 355         while (UTF8_IS_CONTINUATION(*u8ptr))
 356             u8ptr--;
 357     }
 358
 359     return u8ptr;
 360 }
 361
 362 /*
 363
 364 =back
 365
 366 =head2 Iterator Functions
 367
 368 =over 4
 369
 370 =cut
 371
 372 */
 373
/*

=item C<static UINTVAL utf8_decode_and_advance>

The UTF-8 implementation of the string iterator's C<get_and_advance>
function.

=cut

*/
 382
 383 static UINTVAL
 384 utf8_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
 385 {
 386     const utf8_t *u8ptr = (utf8_t *)((char *)i->str->strstart + i->bytepos);
 387     UINTVAL c = *u8ptr;
 388
 389     if (UTF8_IS_START(c)) {
 390         UINTVAL len = UTF8SKIP(u8ptr);
 391
 392         c &= UTF8_START_MASK(len);
 393         i->bytepos += len;
 394         for (len--; len; len--) {
 395             u8ptr++;
 396
 397             if (!UTF8_IS_CONTINUATION(*u8ptr))
 398                 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
 399                     "Malformed UTF-8 string\n");
 400
 401             c = UTF8_ACCUMULATE(c, *u8ptr);
 402         }
 403
 404         if (UNICODE_IS_SURROGATE(c))
 405             Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
 406                 "Surrogate in UTF-8 string\n");
 407     }
 408     else if (!UNICODE_IS_INVARIANT(c)) {
 409         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
 410             "Malformed UTF-8 string\n");
 411     }
 412     else {
 413         i->bytepos++;
 414     }
 415
 416     i->charpos++;
 417     return c;
 418 }
 419
 420 /*
 421
 422 =item C<static void utf8_encode_and_advance>
 423
 424 The UTF-8 implementation of the string iterator's C<set_and_advance>
 425 function.
 426
 427 =cut
 428
 429 */
 430
 431 static void
 432 utf8_encode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL c)
 433 {
 434     const STRING * const s = i->str;
 435     unsigned char * const pos = (unsigned char *)s->strstart + i->bytepos;
 436     unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c);
 437
 438     i->bytepos += (new_pos - pos);
 439     /* XXX possible buffer overrun exception? */
 440     PARROT_ASSERT(i->bytepos <= PObj_buflen(s));
 441     i->charpos++;
 442 }
 443
 444 /*
 445
 446 =item C<static void utf8_set_position>
 447
 448 The UTF-8 implementation of the string iterator's C<set_position>
 449 function.
 450
 451 =cut
 452
 453 */
 454
 455 static void
 456 utf8_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL pos)
 457 {
 458     const utf8_t *u8ptr = (const utf8_t *)i->str->strstart;
 459
 460     /* start from last known charpos, if we can */
 461     if (i->charpos <= pos) {
 462         const UINTVAL old_pos = pos;
 463         pos       -= i->charpos;
 464         u8ptr     += i->bytepos;
 465         i->charpos = old_pos;
 466     }
 467     else
 468         i->charpos = pos;
 469
 470     while (pos-- > 0)
 471         u8ptr += UTF8SKIP(u8ptr);
 472
 473     i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
 474 }
 475
 476
 477 /*
 478
 479 =item C<static STRING * to_encoding>
 480
 481 Converts the string C<src> to this particular encoding.  If C<dest> is
 482 provided, it will contain the result.  Otherwise this function operates in
 483 place.
 484
 485 =cut
 486
 487 */
 488
 489 PARROT_CAN_RETURN_NULL
 490 static STRING *
 491 to_encoding(PARROT_INTERP, ARGMOD(STRING *src), ARGMOD_NULLOK(STRING *dest))
 492 {
 493     STRING *result;
 494     String_iter src_iter;
 495     UINTVAL offs, dest_len, dest_pos, src_len;
 496     const int in_place = (dest == NULL);
 497     unsigned char *new_pos, *pos, *p;
 498
 499     if (src->encoding == Parrot_utf8_encoding_ptr)
 500         return in_place ? src : string_copy(interp, src);
 501     src_len = src->strlen;
 502     if (in_place) {
 503         result = src;
 504     }
 505     else {
 506         result = dest;
 507     }
 508
 509     /* init iter before possilby changing encoding */
 510     ENCODING_ITER_INIT(interp, src, &src_iter);
 511     result->charset  = Parrot_unicode_charset_ptr;
 512     result->encoding = Parrot_utf8_encoding_ptr;
 513     result->strlen   = src_len;
 514
 515     if (!src->strlen)
 516         return dest;
 517
 518     if (in_place) {
 519         /* need intermediate memory */
 520         p = (unsigned char *)mem_sys_allocate(src_len);
 521     }
 522     else {
 523         Parrot_reallocate_string(interp, dest, src_len);
 524         p = (unsigned char *)dest->strstart;
 525     }
 526     if (src->charset == Parrot_ascii_charset_ptr) {
 527         for (dest_len = 0; dest_len < src_len; ++dest_len) {
 528             p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
 529         }
 530         result->bufused = dest_len;
 531     }
 532     else {
 533         dest_len = src_len;
 534         dest_pos = 0;
 535         for (offs = 0; offs < src_len; ++offs) {
 536             const UINTVAL c = src_iter.get_and_advance(interp, &src_iter);
 537             if (dest_len - dest_pos < 6) {
 538                 UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
 539                 if (need < 16)
 540                     need = 16;
 541                 dest_len += need;
 542                 if (in_place)
 543                     p = (unsigned char *)mem_sys_realloc(p, dest_len);
 544                 else {
 545                     result->bufused = dest_pos;
 546                     Parrot_reallocate_string(interp, dest, dest_len);
 547                     p = (unsigned char *)dest->strstart;
 548                 }
 549             }
 550
 551             pos = p + dest_pos;
 552             new_pos = (unsigned char *)utf8_encode(interp, pos, c);
 553             dest_pos += (new_pos - pos);
 554         }
 555         result->bufused = dest_pos;
 556     }
 557     if (in_place) {
 558         Parrot_reallocate_string(interp, src, src->bufused);
 559         memcpy(src->strstart, p, src->bufused);
 560         mem_sys_free(p);
 561     }
 562     return result;
 563 }
 564
 565 /*
 566
 567 =item C<static UINTVAL get_codepoint>
 568
 569 Returns the codepoint in string C<src> at position C<offset>.
 570
 571 =cut
 572
 573 */
 574
 575 static UINTVAL
 576 get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
 577 {
 578     const utf8_t * const start = (const utf8_t *)utf8_skip_forward(src->strstart, offset);
 579     return utf8_decode(interp, start);
 580 }
 581
 582 /*
 583
 584 =item C<static void set_codepoint>
 585
 586 Sets, in string C<src> at position C<offset>, the codepoint C<codepoint>.
 587
 588 =cut
 589
 590 */
 591
 592 static void
 593 set_codepoint(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL codepoint)
 594 {
 595     const void *start;
 596     void *p;
 597     DECL_CONST_CAST;
 598
 599     start = utf8_skip_forward(src->strstart, offset);
 600     p = PARROT_const_cast(void *, start);
 601     utf8_encode(interp, p, codepoint);
 602 }
 603
 604 /*
 605
 606 =item C<static UINTVAL get_byte>
 607
 608 Returns the byte in string C<src> at position C<offset>.
 609
 610 =cut
 611
 612 */
 613
 614 static UINTVAL
 615 get_byte(SHIM_INTERP, ARGIN(const STRING *src), UINTVAL offset)
 616 {
 617     unsigned char *contents = (unsigned char *)src->strstart;
 618     if (offset >= src->bufused) {
 619 /*        Parrot_ex_throw_from_c_args(interp, NULL, 0,
 620                 "get_byte past the end of the buffer (%i of %i)",
 621                 offset, src->bufused); */
 622         return 0;
 623     }
 624     return contents[offset];
 625 }
 626
 627 /*
 628
 629 =item C<static void set_byte>
 630
 631 Sets, in string C<src> at position C<offset>, the byte C<byte>.
 632
 633 =cut
 634
 635 */
 636
 637 static void
 638 set_byte(PARROT_INTERP, ARGIN(const STRING *src),
 639         UINTVAL offset, UINTVAL byte)
 640 {
 641     unsigned char *contents;
 642
 643     if (offset >= src->bufused)
 644         Parrot_ex_throw_from_c_args(interp, NULL, 0,
 645             "set_byte past the end of the buffer");
 646
 647     contents = (unsigned char *)src->strstart;
 648     contents[offset] = (unsigned char)byte;
 649 }
 650
 651 /*
 652
 653 =item C<static STRING * get_codepoints>
 654
 655 Returns the codepoints in string C<src> at position C<offset> and length
 656 C<count>.
 657
 658 =cut
 659
 660 */
 661
 662 PARROT_CANNOT_RETURN_NULL
 663 static STRING *
 664 get_codepoints(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL count)
 665 {
 666
 667     STRING * const return_string = Parrot_make_COW_reference(interp, src);
 668     String_iter    iter;
 669     UINTVAL        start;
 670
 671     iter_init(interp, src, &iter);
 672
 673     if (offset)
 674         iter.set_position(interp, &iter, offset);
 675
 676     start                   = iter.bytepos;
 677     return_string->strstart = (char *)return_string->strstart + start;
 678
 679     if (count)
 680         iter.set_position(interp, &iter, offset + count);
 681
 682     return_string->bufused  = iter.bytepos - start;
 683     return_string->strlen   = count;
 684     return_string->hashval  = 0;
 685
 686     return return_string;
 687 }
 688
 689 /*
 690
 691 =item C<static STRING * get_bytes>
 692
 693 Returns the bytes in string C<src> at position C<offset> and length C<count>.
 694
 695 =cut
 696
 697 */
 698
 699 PARROT_CANNOT_RETURN_NULL
 700 static STRING *
 701 get_bytes(PARROT_INTERP, ARGMOD(STRING *src), UINTVAL offset, UINTVAL count)
 702 {
 703     STRING * const return_string = Parrot_make_COW_reference(interp, src);
 704
 705     return_string->encoding = src->encoding;    /* XXX */
 706     return_string->charset = src->charset;
 707
 708     return_string->strstart = (char *)return_string->strstart + offset ;
 709     return_string->bufused = count;
 710
 711     return_string->strlen = count;
 712     return_string->hashval = 0;
 713
 714     return return_string;
 715 }
 716
 717 /*
 718
 719 =item C<static STRING * get_codepoints_inplace>
 720
 721 Gets from string C<src> at position C<offset> C<count> codepoints and returns
 722 them in C<return_string>.
 723
 724 =cut
 725
 726 */
 727
 728 PARROT_CANNOT_RETURN_NULL
 729 static STRING *
 730 get_codepoints_inplace(PARROT_INTERP, ARGMOD(STRING *src),
 731         UINTVAL offset, UINTVAL count, ARGMOD(STRING *return_string))
 732 {
 733     String_iter iter;
 734     UINTVAL start;
 735
 736     Parrot_reuse_COW_reference(interp, src, return_string);
 737     iter_init(interp, src, &iter);
 738     iter.set_position(interp, &iter, offset);
 739
 740     start = iter.bytepos;
 741
 742     return_string->strstart = (char *)return_string->strstart + start;
 743     iter.set_position(interp, &iter, offset + count);
 744
 745     return_string->bufused = iter.bytepos - start;
 746     return_string->strlen  = count;
 747     return_string->hashval = 0;
 748
 749     return return_string;
 750 }
 751
 752 /*
 753
 754 =item C<static STRING * get_bytes_inplace>
 755
 756 Gets from string C<src> at position C<offset> C<count> bytes and returns them
 757 in C<return_string>.
 758
 759 =cut
 760
 761 */
 762
 763 PARROT_CANNOT_RETURN_NULL
 764 static STRING *
 765 get_bytes_inplace(PARROT_INTERP, SHIM(STRING *src),
 766         UINTVAL offset, UINTVAL count, SHIM(STRING *return_string))
 767 {
 768     UNIMPL;
 769 }
 770
 771 /*
 772
 773 =item C<static void set_codepoints>
 774
 775 Replaces in string C<src> at position C<offset> for C<count> codepoints with
 776 the contents of string C<new_codepoints>.
 777
 778 =cut
 779
 780 */
 781
 782 static void
 783 set_codepoints(PARROT_INTERP, SHIM(STRING *src),
 784         UINTVAL offset, UINTVAL count, SHIM(STRING *new_codepoints))
 785 {
 786     UNIMPL;
 787 }
 788
 789 /*
 790
 791 =item C<static void set_bytes>
 792
 793 Replaces in string C<src> at position C<offset> for C<count> bytes with the
 794 contents of string C<new_bytes>.
 795
 796 =cut
 797
 798 */
 799
 800 static void
 801 set_bytes(PARROT_INTERP, SHIM(STRING *src),
 802         UINTVAL offset, UINTVAL count, SHIM(STRING *new_bytes))
 803 {
 804     UNIMPL;
 805 }
 806
 807 /*
 808
 809 =item C<static void become_encoding>
 810
 811 Unconditionally makes the string be in this encoding, if that's valid
 812
 813 =cut
 814
 815 */
 816
 817 static void
 818 become_encoding(PARROT_INTERP, SHIM(STRING *src))
 819 {
 820     UNIMPL;
 821 }
 822
 823
 824 /*
 825
 826 =item C<static UINTVAL codepoints>
 827
 828 Returns the number of codepoints in string C<src>.
 829
 830 =cut
 831
 832 */
 833
 834 static UINTVAL
 835 codepoints(PARROT_INTERP, ARGMOD(STRING *src))
 836 {
 837     String_iter iter;
 838     /*
 839      * this is used to initially calculate src->strlen,
 840      * therefore we must scan the whole string
 841      */
 842     iter_init(interp, src, &iter);
 843     while (iter.bytepos < src->bufused)
 844         iter.get_and_advance(interp, &iter);
 845     return iter.charpos;
 846 }
 847
 848 /*
 849
 850 =item C<static UINTVAL bytes>
 851
 852 Returns the number of bytes in string C<src>.
 853
 854 =cut
 855
 856 */
 857
 858 PARROT_PURE_FUNCTION
 859 static UINTVAL
 860 bytes(SHIM_INTERP, ARGIN(STRING *src))
 861 {
 862     return src->bufused;
 863 }
 864
 865 /*
 866
 867 =item C<static void iter_init>
 868
 869 Initializes for string C<src> the string iterator C<iter>.
 870
 871 =cut
 872
 873 */
 874
 875 static void
 876 iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
 877 {
 878     iter->str             = src;
 879     iter->bytepos         = 0;
 880     iter->charpos         = 0;
 881     iter->get_and_advance = utf8_decode_and_advance;
 882     iter->set_and_advance = utf8_encode_and_advance;
 883     iter->set_position    = utf8_set_position;
 884 }
 885
 886 /*
 887
 888 =item C<ENCODING * Parrot_encoding_utf8_init>
 889
 890 Initializes the UTF-8 encoding.
 891
 892 =cut
 893
 894 */
 895
 896 PARROT_CANNOT_RETURN_NULL
 897 ENCODING *
 898 Parrot_encoding_utf8_init(PARROT_INTERP)
 899 {
 900     ENCODING * const return_encoding = Parrot_new_encoding(interp);
 901
 902     static const ENCODING base_encoding = {
 903         "utf8",
 904         4, /* Max bytes per codepoint 0 .. 0x10ffff */
 905         to_encoding,
 906         get_codepoint,
 907         set_codepoint,
 908         get_byte,
 909         set_byte,
 910         get_codepoints,
 911         get_codepoints_inplace,
 912         get_bytes,
 913         get_bytes_inplace,
 914         set_codepoints,
 915         set_bytes,
 916         become_encoding,
 917         codepoints,
 918         bytes,
 919         iter_init
 920     };
 921     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
 922     Parrot_register_encoding(interp, "utf8", return_encoding);
 923     return return_encoding;
 924 }
 925
 926 /*
 927
 928 =back
 929
 930 =head1 SEE ALSO
 931
 932 F<src/encodings/fixed_8.c>,
 933 F<src/string.c>,
 934 F<include/parrot/string.h>,
 935 F<docs/string.pod>.
 936
 937 =cut
 938
 939 */
 940
 941
 942 /*
 943  * Local variables:
 944  *   c-file-style: "parrot"
 945  * End:
 946  * vim: expandtab shiftwidth=4:
 947  */