2 Copyright (C) 2001-2008, The Perl Foundation.
7 src/encodings/ucs2.c - UCS-2 encoding
11 UCS-2 encoding with the help of the ICU library.
21 #include "parrot/parrot.h"
22 #include "../unicode.h"
24 /* HEADERIZER HFILE: src/encodings/ucs2.h */
26 /* HEADERIZER BEGIN: static */
27 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
29 static void become_encoding(PARROT_INTERP
, SHIM(STRING
*src
))
30 __attribute__nonnull__(1);
32 PARROT_WARN_UNUSED_RESULT
33 static UINTVAL
bytes(PARROT_INTERP
, ARGIN(STRING
*src
))
34 __attribute__nonnull__(1)
35 __attribute__nonnull__(2);
37 PARROT_WARN_UNUSED_RESULT
38 static UINTVAL
codepoints(PARROT_INTERP
, ARGIN(STRING
*src
))
39 __attribute__nonnull__(1)
40 __attribute__nonnull__(2);
42 static UINTVAL
get_byte(PARROT_INTERP
,
43 SHIM(const STRING
*src
),
45 __attribute__nonnull__(1);
47 PARROT_WARN_UNUSED_RESULT
48 PARROT_CANNOT_RETURN_NULL
49 static STRING
* get_bytes(PARROT_INTERP
,
53 __attribute__nonnull__(1);
55 PARROT_WARN_UNUSED_RESULT
56 PARROT_CANNOT_RETURN_NULL
57 static STRING
* get_bytes_inplace(PARROT_INTERP
,
61 SHIM(STRING
*return_string
))
62 __attribute__nonnull__(1);
64 static UINTVAL
get_codepoint(PARROT_INTERP
,
65 ARGIN(const STRING
*src
),
67 __attribute__nonnull__(1)
68 __attribute__nonnull__(2);
70 PARROT_WARN_UNUSED_RESULT
71 PARROT_CANNOT_RETURN_NULL
72 static STRING
* get_codepoints(PARROT_INTERP
,
76 __attribute__nonnull__(1)
77 __attribute__nonnull__(2);
79 PARROT_WARN_UNUSED_RESULT
80 PARROT_CANNOT_RETURN_NULL
81 static STRING
* get_codepoints_inplace(PARROT_INTERP
,
85 SHIM(STRING
*dest_string
))
86 __attribute__nonnull__(1);
88 static void iter_init(PARROT_INTERP
,
89 ARGIN(const STRING
*src
),
90 ARGOUT(String_iter
*iter
))
91 __attribute__nonnull__(1)
92 __attribute__nonnull__(2)
93 __attribute__nonnull__(3)
96 static void set_byte(PARROT_INTERP
,
97 SHIM(const STRING
*src
),
100 __attribute__nonnull__(1);
102 static void set_bytes(PARROT_INTERP
,
106 SHIM(STRING
*new_bytes
))
107 __attribute__nonnull__(1);
109 static void set_codepoint(PARROT_INTERP
,
113 __attribute__nonnull__(1)
114 __attribute__nonnull__(2);
116 static void set_codepoints(PARROT_INTERP
,
120 SHIM(STRING
*new_codepoints
))
121 __attribute__nonnull__(1);
123 PARROT_WARN_UNUSED_RESULT
124 PARROT_CANNOT_RETURN_NULL
125 static STRING
* to_encoding(PARROT_INTERP
,
127 ARGMOD(STRING
*dest
))
128 __attribute__nonnull__(1)
129 __attribute__nonnull__(2)
130 __attribute__nonnull__(3)
131 FUNC_MODIFIES(*dest
);
133 static UINTVAL
ucs2_decode_and_advance(PARROT_INTERP
,
134 ARGMOD(String_iter
*i
))
135 __attribute__nonnull__(1)
136 __attribute__nonnull__(2)
139 static void ucs2_encode_and_advance(PARROT_INTERP
,
140 ARGMOD(String_iter
*i
),
142 __attribute__nonnull__(1)
143 __attribute__nonnull__(2)
146 static void ucs2_set_position(SHIM_INTERP
,
147 ARGMOD(String_iter
*i
),
149 __attribute__nonnull__(2)
152 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
153 /* HEADERIZER END: static */
158 # include <unicode/ustring.h>
161 #define UNIMPL real_exception(interp, NULL, UNIMPLEMENTED, "unimpl ucs2")
167 =item C<static STRING * to_encoding>
Converts the string C<src> to this particular encoding. If C<dest> is
provided, it will contain the result. Otherwise this function operates in place.
177 PARROT_WARN_UNUSED_RESULT
178 PARROT_CANNOT_RETURN_NULL
180 to_encoding(PARROT_INTERP
, ARGIN(STRING
*src
), ARGMOD(STRING
*dest
))
182 STRING
* const result
=
183 Parrot_utf16_encoding_ptr
->to_encoding(interp
, src
, dest
);
185 * conversion to utf16 downgrads to ucs-2 if possible - check result
187 if (result
->encoding
== Parrot_utf16_encoding_ptr
) {
188 real_exception(interp
, NULL
, E_UnicodeError
,
189 "can't convert string with surrogates to ucs2");
196 =item C<static UINTVAL get_codepoint>
198 Returns the codepoint in string C<src> at position C<offset>.
205 get_codepoint(PARROT_INTERP
, ARGIN(const STRING
*src
), UINTVAL offset
)
208 UChar
* const s
= (UChar
*) src
->strstart
;
211 real_exception(interp
, NULL
, E_LibraryNotLoadedError
,
212 "no ICU lib loaded");
218 =item C<static void set_codepoint>
220 Sets, in string C<src> at position C<offset>, the codepoint C<codepoint>.
227 set_codepoint(PARROT_INTERP
, ARGIN(STRING
*src
), UINTVAL offset
, UINTVAL codepoint
)
230 UChar
* const s
= (UChar
*) src
->strstart
;
231 s
[offset
] = codepoint
;
234 real_exception(interp
, NULL
, E_LibraryNotLoadedError
,
235 "no ICU lib loaded");
241 =item C<static UINTVAL get_byte>
243 Returns the byte in string C<src> at position C<offset>.
250 get_byte(PARROT_INTERP
, SHIM(const STRING
*src
), UINTVAL offset
)
257 =item C<static void set_byte>
259 Sets, in string C<src> at position C<offset>, the byte C<byte>.
266 set_byte(PARROT_INTERP
, SHIM(const STRING
*src
), UINTVAL offset
, UINTVAL byte
)
273 =item C<static STRING * get_codepoints>
Returns the codepoints in string C<src> at position C<offset> and length C<count>.
282 PARROT_WARN_UNUSED_RESULT
283 PARROT_CANNOT_RETURN_NULL
285 get_codepoints(PARROT_INTERP
, ARGIN(STRING
*src
), UINTVAL offset
, UINTVAL count
)
287 STRING
* const return_string
= Parrot_make_COW_reference(interp
, src
);
289 return_string
->strstart
= (char*)src
->strstart
+ offset
* sizeof (UChar
);
290 return_string
->bufused
= count
* sizeof (UChar
);
296 iter_init(interp
, src
, &iter
);
297 iter
.set_position(interp
, &iter
, offset
);
298 start
= iter
.bytepos
;
299 return_string
->strstart
= (char *)return_string
->strstart
+ start
;
300 iter
.set_position(interp
, &iter
, offset
+ count
);
301 return_string
->bufused
= iter
.bytepos
- start
;
304 return_string
->strlen
= count
;
305 return_string
->hashval
= 0;
306 return return_string
;
311 =item C<static STRING * get_bytes>
313 Returns the bytes in string C<src> at position C<offset> and length C<count>.
319 PARROT_WARN_UNUSED_RESULT
320 PARROT_CANNOT_RETURN_NULL
322 get_bytes(PARROT_INTERP
, SHIM(STRING
*src
), UINTVAL offset
, UINTVAL count
)
330 =item C<static STRING * get_codepoints_inplace>
332 Gets from string C<src> at position C<offset> C<count> codepoints and returns
333 them in C<return_string>.
339 PARROT_WARN_UNUSED_RESULT
340 PARROT_CANNOT_RETURN_NULL
342 get_codepoints_inplace(PARROT_INTERP
, SHIM(STRING
*src
),
343 UINTVAL offset
, UINTVAL count
, SHIM(STRING
*dest_string
))
351 =item C<static STRING * get_bytes_inplace>
Gets from string C<src> at position C<offset> C<count> bytes and returns them in C<return_string>.
360 PARROT_WARN_UNUSED_RESULT
361 PARROT_CANNOT_RETURN_NULL
363 get_bytes_inplace(PARROT_INTERP
, SHIM(STRING
*src
),
364 UINTVAL offset
, UINTVAL count
, SHIM(STRING
*return_string
))
371 =item C<static void set_codepoints>
373 Replaces in string C<src> at position C<offset> for C<count> codepoints with
374 the contents of string C<new_codepoints>.
381 set_codepoints(PARROT_INTERP
, SHIM(STRING
*src
),
382 UINTVAL offset
, UINTVAL count
, SHIM(STRING
*new_codepoints
))
389 =item C<static void set_bytes>
391 Replaces in string C<src> at position C<offset> for C<count> bytes with the
392 contents of string C<new_bytes>.
399 set_bytes(PARROT_INTERP
, SHIM(STRING
*src
),
400 UINTVAL offset
, UINTVAL count
, SHIM(STRING
*new_bytes
))
407 =item C<static void become_encoding>
409 Unconditionally makes the string be in this encoding, if that's valid
416 become_encoding(PARROT_INTERP
, SHIM(STRING
*src
))
424 =item C<static UINTVAL codepoints>
426 Returns the number of codepoints in string C<src>.
432 PARROT_WARN_UNUSED_RESULT
434 codepoints(PARROT_INTERP
, ARGIN(STRING
*src
))
437 return src
->bufused
/ sizeof (UChar
);
439 real_exception(interp
, NULL
, E_LibraryNotLoadedError
,
440 "no ICU lib loaded");
446 =item C<static UINTVAL bytes>
448 Returns the number of bytes in string C<src>.
454 PARROT_WARN_UNUSED_RESULT
456 bytes(PARROT_INTERP
, ARGIN(STRING
*src
))
463 =item C<static UINTVAL ucs2_decode_and_advance>
465 Moves the string iterator C<i> to the next UCS-2 codepoint.
473 ucs2_decode_and_advance(PARROT_INTERP
, ARGMOD(String_iter
*i
))
475 UChar
* const s
= (UChar
*) i
->str
->strstart
;
476 size_t pos
= i
->bytepos
/ sizeof (UChar
);
478 /* TODO either make sure that we don't go past end or use SAFE
481 const UChar c
= s
[pos
++];
483 i
->bytepos
= pos
* sizeof (UChar
);
489 =item C<static void ucs2_encode_and_advance>
491 With the string iterator C<i>, appends the codepoint C<c> and advances to the
492 next position in the string.
499 ucs2_encode_and_advance(PARROT_INTERP
, ARGMOD(String_iter
*i
), UINTVAL c
)
501 UChar
* const s
= (UChar
*) i
->str
->strstart
;
502 UINTVAL pos
= i
->bytepos
/ sizeof (UChar
);
505 i
->bytepos
= pos
* sizeof (UChar
);
510 =item C<static void ucs2_set_position>
512 Moves the string iterator C<i> to the position C<n> in the string.
519 ucs2_set_position(SHIM_INTERP
, ARGMOD(String_iter
*i
), UINTVAL n
)
522 i
->bytepos
= n
* sizeof (UChar
);
529 =item C<static void iter_init>
531 Initializes for string C<src> the string iterator C<iter>.
538 iter_init(PARROT_INTERP
, ARGIN(const STRING
*src
), ARGOUT(String_iter
*iter
))
544 iter
->get_and_advance
= ucs2_decode_and_advance
;
545 iter
->set_and_advance
= ucs2_encode_and_advance
;
546 iter
->set_position
= ucs2_set_position
;
548 real_exception(interp
, NULL
, E_LibraryNotLoadedError
,
549 "no ICU lib loaded");
555 =item C<ENCODING * Parrot_encoding_ucs2_init>
557 Initializes the UCS-2 encoding.
563 PARROT_CANNOT_RETURN_NULL
565 Parrot_encoding_ucs2_init(PARROT_INTERP
)
567 ENCODING
* const return_encoding
= Parrot_new_encoding(interp
);
569 static const ENCODING base_encoding
= {
571 2, /* Max bytes per codepoint 0 .. 0x10ffff */
578 get_codepoints_inplace
,
588 STRUCT_COPY_FROM_STRUCT(return_encoding
, base_encoding
);
589 Parrot_register_encoding(interp
, "ucs2", return_encoding
);
590 return return_encoding
;
599 F<src/encodings/fixed_8.c>,
600 F<src/encodings/utf8.c>,
602 F<include/parrot/string.h>,
612 * c-file-style: "parrot"
614 * vim: expandtab shiftwidth=4: