tagged release 0.6.4
[parrot.git] / src / encodings / ucs2.c
blobdf2e764914141da8aa6102d4802455b49ca1b68f
1 /*
2 Copyright (C) 2001-2008, The Perl Foundation.
3 $Id$
5 =head1 NAME
7 src/encodings/ucs2.c - UCS-2 encoding
9 =head1 DESCRIPTION
11 UCS-2 encoding with the help of the ICU library.
13 =head2 Functions
15 =over 4
17 =cut
21 #include "parrot/parrot.h"
22 #include "../unicode.h"
24 /* HEADERIZER HFILE: src/encodings/ucs2.h */
26 /* HEADERIZER BEGIN: static */
27 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
29 static void become_encoding(PARROT_INTERP, SHIM(STRING *src))
30 __attribute__nonnull__(1);
32 PARROT_WARN_UNUSED_RESULT
33 static UINTVAL bytes(PARROT_INTERP, ARGIN(STRING *src))
34 __attribute__nonnull__(1)
35 __attribute__nonnull__(2);
37 PARROT_WARN_UNUSED_RESULT
38 static UINTVAL codepoints(PARROT_INTERP, ARGIN(STRING *src))
39 __attribute__nonnull__(1)
40 __attribute__nonnull__(2);
42 static UINTVAL get_byte(PARROT_INTERP,
43 SHIM(const STRING *src),
44 UINTVAL offset)
45 __attribute__nonnull__(1);
47 PARROT_WARN_UNUSED_RESULT
48 PARROT_CANNOT_RETURN_NULL
49 static STRING * get_bytes(PARROT_INTERP,
50 SHIM(STRING *src),
51 UINTVAL offset,
52 UINTVAL count)
53 __attribute__nonnull__(1);
55 PARROT_WARN_UNUSED_RESULT
56 PARROT_CANNOT_RETURN_NULL
57 static STRING * get_bytes_inplace(PARROT_INTERP,
58 SHIM(STRING *src),
59 UINTVAL offset,
60 UINTVAL count,
61 SHIM(STRING *return_string))
62 __attribute__nonnull__(1);
64 static UINTVAL get_codepoint(PARROT_INTERP,
65 ARGIN(const STRING *src),
66 UINTVAL offset)
67 __attribute__nonnull__(1)
68 __attribute__nonnull__(2);
70 PARROT_WARN_UNUSED_RESULT
71 PARROT_CANNOT_RETURN_NULL
72 static STRING * get_codepoints(PARROT_INTERP,
73 ARGIN(STRING *src),
74 UINTVAL offset,
75 UINTVAL count)
76 __attribute__nonnull__(1)
77 __attribute__nonnull__(2);
79 PARROT_WARN_UNUSED_RESULT
80 PARROT_CANNOT_RETURN_NULL
81 static STRING * get_codepoints_inplace(PARROT_INTERP,
82 SHIM(STRING *src),
83 UINTVAL offset,
84 UINTVAL count,
85 SHIM(STRING *dest_string))
86 __attribute__nonnull__(1);
88 static void iter_init(PARROT_INTERP,
89 ARGIN(const STRING *src),
90 ARGOUT(String_iter *iter))
91 __attribute__nonnull__(1)
92 __attribute__nonnull__(2)
93 __attribute__nonnull__(3)
94 FUNC_MODIFIES(*iter);
96 static void set_byte(PARROT_INTERP,
97 SHIM(const STRING *src),
98 UINTVAL offset,
99 UINTVAL byte)
100 __attribute__nonnull__(1);
102 static void set_bytes(PARROT_INTERP,
103 SHIM(STRING *src),
104 UINTVAL offset,
105 UINTVAL count,
106 SHIM(STRING *new_bytes))
107 __attribute__nonnull__(1);
109 static void set_codepoint(PARROT_INTERP,
110 ARGIN(STRING *src),
111 UINTVAL offset,
112 UINTVAL codepoint)
113 __attribute__nonnull__(1)
114 __attribute__nonnull__(2);
116 static void set_codepoints(PARROT_INTERP,
117 SHIM(STRING *src),
118 UINTVAL offset,
119 UINTVAL count,
120 SHIM(STRING *new_codepoints))
121 __attribute__nonnull__(1);
123 PARROT_WARN_UNUSED_RESULT
124 PARROT_CANNOT_RETURN_NULL
125 static STRING * to_encoding(PARROT_INTERP,
126 ARGIN(STRING *src),
127 ARGMOD(STRING *dest))
128 __attribute__nonnull__(1)
129 __attribute__nonnull__(2)
130 __attribute__nonnull__(3)
131 FUNC_MODIFIES(*dest);
133 static UINTVAL ucs2_decode_and_advance(PARROT_INTERP,
134 ARGMOD(String_iter *i))
135 __attribute__nonnull__(1)
136 __attribute__nonnull__(2)
137 FUNC_MODIFIES(*i);
139 static void ucs2_encode_and_advance(PARROT_INTERP,
140 ARGMOD(String_iter *i),
141 UINTVAL c)
142 __attribute__nonnull__(1)
143 __attribute__nonnull__(2)
144 FUNC_MODIFIES(*i);
146 static void ucs2_set_position(SHIM_INTERP,
147 ARGMOD(String_iter *i),
148 UINTVAL n)
149 __attribute__nonnull__(2)
150 FUNC_MODIFIES(*i);
152 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
153 /* HEADERIZER END: static */
155 #include "ucs2.h"
157 #if PARROT_HAS_ICU
158 # include <unicode/ustring.h>
159 #endif
161 #define UNIMPL real_exception(interp, NULL, UNIMPLEMENTED, "unimpl ucs2")
167 =item C<static STRING * to_encoding>
169 Converts the string C<src> to this particular encoding. If C<dest> is
170 provided, it will contain the result. Otherwise this function operates in
171 place.
173 =cut
177 PARROT_WARN_UNUSED_RESULT
178 PARROT_CANNOT_RETURN_NULL
179 static STRING *
180 to_encoding(PARROT_INTERP, ARGIN(STRING *src), ARGMOD(STRING *dest))
182 STRING * const result =
183 Parrot_utf16_encoding_ptr->to_encoding(interp, src, dest);
185 * conversion to utf16 downgrads to ucs-2 if possible - check result
187 if (result->encoding == Parrot_utf16_encoding_ptr) {
188 real_exception(interp, NULL, E_UnicodeError,
189 "can't convert string with surrogates to ucs2");
191 return result;
196 =item C<static UINTVAL get_codepoint>
198 Returns the codepoint in string C<src> at position C<offset>.
200 =cut
204 static UINTVAL
205 get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
207 #if PARROT_HAS_ICU
208 UChar * const s = (UChar*) src->strstart;
209 return s[offset];
210 #else
211 real_exception(interp, NULL, E_LibraryNotLoadedError,
212 "no ICU lib loaded");
213 #endif
218 =item C<static void set_codepoint>
220 Sets, in string C<src> at position C<offset>, the codepoint C<codepoint>.
222 =cut
226 static void
227 set_codepoint(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL codepoint)
229 #if PARROT_HAS_ICU
230 UChar * const s = (UChar*) src->strstart;
231 s[offset] = codepoint;
232 #else
233 UNUSED(src);
234 real_exception(interp, NULL, E_LibraryNotLoadedError,
235 "no ICU lib loaded");
236 #endif
241 =item C<static UINTVAL get_byte>
243 Returns the byte in string C<src> at position C<offset>.
245 =cut
249 static UINTVAL
250 get_byte(PARROT_INTERP, SHIM(const STRING *src), UINTVAL offset)
252 UNIMPL;
257 =item C<static void set_byte>
259 Sets, in string C<src> at position C<offset>, the byte C<byte>.
261 =cut
265 static void
266 set_byte(PARROT_INTERP, SHIM(const STRING *src), UINTVAL offset, UINTVAL byte)
268 UNIMPL;
273 =item C<static STRING * get_codepoints>
275 Returns the codepoints in string C<src> at position C<offset> and length
276 C<count>.
278 =cut
282 PARROT_WARN_UNUSED_RESULT
283 PARROT_CANNOT_RETURN_NULL
284 static STRING *
285 get_codepoints(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL count)
287 STRING * const return_string = Parrot_make_COW_reference(interp, src);
288 #if PARROT_HAS_ICU
289 return_string->strstart = (char*)src->strstart + offset * sizeof (UChar);
290 return_string->bufused = count * sizeof (UChar);
291 #else
293 String_iter iter;
294 UINTVAL start;
296 iter_init(interp, src, &iter);
297 iter.set_position(interp, &iter, offset);
298 start = iter.bytepos;
299 return_string->strstart = (char *)return_string->strstart + start;
300 iter.set_position(interp, &iter, offset + count);
301 return_string->bufused = iter.bytepos - start;
303 #endif
304 return_string->strlen = count;
305 return_string->hashval = 0;
306 return return_string;
311 =item C<static STRING * get_bytes>
313 Returns the bytes in string C<src> at position C<offset> and length C<count>.
315 =cut
319 PARROT_WARN_UNUSED_RESULT
320 PARROT_CANNOT_RETURN_NULL
321 static STRING *
322 get_bytes(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count)
324 UNIMPL;
330 =item C<static STRING * get_codepoints_inplace>
332 Gets from string C<src> at position C<offset> C<count> codepoints and returns
333 them in C<return_string>.
335 =cut
339 PARROT_WARN_UNUSED_RESULT
340 PARROT_CANNOT_RETURN_NULL
341 static STRING *
342 get_codepoints_inplace(PARROT_INTERP, SHIM(STRING *src),
343 UINTVAL offset, UINTVAL count, SHIM(STRING *dest_string))
346 UNIMPL;
351 =item C<static STRING * get_bytes_inplace>
353 Gets from string C<src> at position C<offset> C<count> bytes and returns them
354 in C<return_string>.
356 =cut
360 PARROT_WARN_UNUSED_RESULT
361 PARROT_CANNOT_RETURN_NULL
362 static STRING *
363 get_bytes_inplace(PARROT_INTERP, SHIM(STRING *src),
364 UINTVAL offset, UINTVAL count, SHIM(STRING *return_string))
366 UNIMPL;
371 =item C<static void set_codepoints>
373 Replaces in string C<src> at position C<offset> for C<count> codepoints with
374 the contents of string C<new_codepoints>.
376 =cut
380 static void
381 set_codepoints(PARROT_INTERP, SHIM(STRING *src),
382 UINTVAL offset, UINTVAL count, SHIM(STRING *new_codepoints))
384 UNIMPL;
389 =item C<static void set_bytes>
391 Replaces in string C<src> at position C<offset> for C<count> bytes with the
392 contents of string C<new_bytes>.
394 =cut
398 static void
399 set_bytes(PARROT_INTERP, SHIM(STRING *src),
400 UINTVAL offset, UINTVAL count, SHIM(STRING *new_bytes))
402 UNIMPL;
407 =item C<static void become_encoding>
409 Unconditionally makes the string be in this encoding, if that's valid
411 =cut
415 static void
416 become_encoding(PARROT_INTERP, SHIM(STRING *src))
418 UNIMPL;
424 =item C<static UINTVAL codepoints>
426 Returns the number of codepoints in string C<src>.
428 =cut
432 PARROT_WARN_UNUSED_RESULT
433 static UINTVAL
434 codepoints(PARROT_INTERP, ARGIN(STRING *src))
436 #if PARROT_HAS_ICU
437 return src->bufused / sizeof (UChar);
438 #else
439 real_exception(interp, NULL, E_LibraryNotLoadedError,
440 "no ICU lib loaded");
441 #endif
446 =item C<static UINTVAL bytes>
448 Returns the number of bytes in string C<src>.
450 =cut
454 PARROT_WARN_UNUSED_RESULT
455 static UINTVAL
456 bytes(PARROT_INTERP, ARGIN(STRING *src))
458 return src->bufused;
463 =item C<static UINTVAL ucs2_decode_and_advance>
465 Moves the string iterator C<i> to the next UCS-2 codepoint.
467 =cut
471 #if PARROT_HAS_ICU
472 static UINTVAL
473 ucs2_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
475 UChar * const s = (UChar*) i->str->strstart;
476 size_t pos = i->bytepos / sizeof (UChar);
478 /* TODO either make sure that we don't go past end or use SAFE
479 * iter versions
481 const UChar c = s[pos++];
482 i->charpos++;
483 i->bytepos = pos * sizeof (UChar);
484 return c;
489 =item C<static void ucs2_encode_and_advance>
491 With the string iterator C<i>, appends the codepoint C<c> and advances to the
492 next position in the string.
494 =cut
498 static void
499 ucs2_encode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL c)
501 UChar * const s = (UChar*) i->str->strstart;
502 UINTVAL pos = i->bytepos / sizeof (UChar);
503 s[pos++] = (UChar)c;
504 i->charpos++;
505 i->bytepos = pos * sizeof (UChar);
510 =item C<static void ucs2_set_position>
512 Moves the string iterator C<i> to the position C<n> in the string.
514 =cut
518 static void
519 ucs2_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL n)
521 i->charpos = n;
522 i->bytepos = n * sizeof (UChar);
525 #endif
529 =item C<static void iter_init>
531 Initializes for string C<src> the string iterator C<iter>.
533 =cut
537 static void
538 iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
540 #if PARROT_HAS_ICU
541 iter->str = src;
542 iter->bytepos = 0;
543 iter->charpos = 0;
544 iter->get_and_advance = ucs2_decode_and_advance;
545 iter->set_and_advance = ucs2_encode_and_advance;
546 iter->set_position = ucs2_set_position;
547 #else
548 real_exception(interp, NULL, E_LibraryNotLoadedError,
549 "no ICU lib loaded");
550 #endif
555 =item C<ENCODING * Parrot_encoding_ucs2_init>
557 Initializes the UCS-2 encoding.
559 =cut
563 PARROT_CANNOT_RETURN_NULL
564 ENCODING *
565 Parrot_encoding_ucs2_init(PARROT_INTERP)
567 ENCODING * const return_encoding = Parrot_new_encoding(interp);
569 static const ENCODING base_encoding = {
570 "ucs2",
571 2, /* Max bytes per codepoint 0 .. 0x10ffff */
572 to_encoding,
573 get_codepoint,
574 set_codepoint,
575 get_byte,
576 set_byte,
577 get_codepoints,
578 get_codepoints_inplace,
579 get_bytes,
580 get_bytes_inplace,
581 set_codepoints,
582 set_bytes,
583 become_encoding,
584 codepoints,
585 bytes,
586 iter_init
588 STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
589 Parrot_register_encoding(interp, "ucs2", return_encoding);
590 return return_encoding;
595 =back
597 =head1 SEE ALSO
599 F<src/encodings/fixed_8.c>,
600 F<src/encodings/utf8.c>,
601 F<src/string.c>,
602 F<include/parrot/string.h>,
603 F<docs/string.pod>.
605 =cut
611 * Local variables:
612 * c-file-style: "parrot"
613 * End:
614 * vim: expandtab shiftwidth=4: