2 Copyright (C) 2006-2007, The Perl Foundation.
7 src/string_primitives.c - String Primitives
11 This file collects together all the functions that call into the ICU API.
22 /* HEADERIZER HFILE: include/parrot/string_primitives.h */
24 #include "parrot/parrot.h"
26 # include <unicode/ucnv.h>
27 # include <unicode/utypes.h>
28 # include <unicode/uchar.h>
29 # include <unicode/ustring.h>
36 =item C<void string_set_data_directory>
38 Set the directory where ICU finds its data files (encodings, locales, etc.).
47 string_set_data_directory(PARROT_INTERP
, ARGIN(const char *dir
))
50 u_setDataDirectory(dir
);
52 /* Since u_setDataDirectory doesn't have a result code, we'll spot
53 check that everything is okay by making sure that '9' had decimal
54 value 9. Using 57 rather than '9' so that the encoding of this
55 source code file isn't an issue.... (Don't want to get bitten by
58 if (!u_isdigit(57) || (u_charDigitValue(57) != 9)) {
59 real_exception(interp
, NULL
, ICU_ERROR
,
60 "string_set_data_directory: ICU data files not found"
61 "(apparently) for directory [%s]", dir
);
66 real_exception(interp
, NULL
, ICU_ERROR
,
67 "string_set_data_directory: parrot compiled without ICU support");
73 =item C<Parrot_UInt4 string_unescape_one>
75 Unescape a single character. We assume that we're at the start of a
76 sequence, right after the \.
/*
 * string_unescape_one: decodes one escape sequence from `string`, reading
 * bytes at *offset through CHARSET_GET_BYTE.
 *
 * NOTE(review): this view of the function is incomplete.  The declaration
 * of workchar, the dispatch on the escape letter, the places where *offset
 * is advanced outside the for-loop header, and any scaling of workchar
 * between digits are not visible here.  Comments below describe only what
 * the visible statements do; confirm the rest against the full source.
 */
84 string_unescape_one(PARROT_INTERP
, ARGMOD(UINTVAL
*offset
),
85 ARGMOD(STRING
*string
))
88 UINTVAL charcount
= 0;
/* len is the value string_length() reports for the input; it bounds the
   braced-form loop further down. */
89 const UINTVAL len
= string_length(interp
, string
);
90 /* Well, not right now */
/* codepoint starts as the byte at the current offset. */
91 UINTVAL codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
/* Hexadecimal form (presumably the x escape -- TODO confirm): the byte is
   mapped from 0-9, a-f or A-F to its numeric value, or an opening brace
   starts the braced form. */
95 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
96 if (codepoint
>= '0' && codepoint
<= '9') {
97 workchar
= codepoint
- '0';
99 else if (codepoint
>= 'a' && codepoint
<= 'f') {
100 workchar
= codepoint
- 'a' + 10;
102 else if (codepoint
>= 'A' && codepoint
<= 'F') {
103 workchar
= codepoint
- 'A' + 10;
105 else if (codepoint
== '{') {
/* Braced form: at most 8 iterations, and never past len; both the counter
   and *offset advance each iteration. */
109 for (i
= 0; i
< 8 && *offset
< len
; ++i
, ++*offset
) {
110 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
/* A closing brace ends the braced form. */
111 if (codepoint
== '}') {
/* Otherwise the byte must be a hex digit, whose value is added into
   workchar. */
116 if (codepoint
>= '0' && codepoint
<= '9') {
117 workchar
+= codepoint
- '0';
119 else if (codepoint
>= 'a' && codepoint
<= 'f') {
120 workchar
+= codepoint
- 'a' + 10;
122 else if (codepoint
>= 'A' && codepoint
<= 'F') {
123 workchar
+= codepoint
- 'A' + 10;
126 real_exception(interp
, NULL
, UNIMPLEMENTED
,
127 "Illegal escape sequence inside {}");
131 real_exception(interp
, NULL
, UNIMPLEMENTED
,
132 "Illegal escape sequence no '}'");
135 real_exception(interp
, NULL
, UNIMPLEMENTED
, "Illegal escape sequence in");
/* A further hex digit is read and its value added into workchar
   (presumably the second digit of the unbraced form -- TODO confirm). */
140 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
141 if (codepoint
>= '0' && codepoint
<= '9') {
142 workchar
+= codepoint
- '0';
144 else if (codepoint
>= 'a' && codepoint
<= 'f') {
145 workchar
+= codepoint
- 'a' + 10;
147 else if (codepoint
>= 'A' && codepoint
<= 'F') {
148 workchar
+= codepoint
- 'A' + 10;
/* Upper-case letters A..Z map to 1..26 (presumably the control-character
   escape -- TODO confirm); anything else raises an exception. */
160 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
161 if (codepoint
>= 'A' && codepoint
<= 'Z') {
162 workchar
= codepoint
- 'A' + 1;
165 real_exception(interp
, NULL
, UNIMPLEMENTED
, "Illegal escape sequence");
/* Four-digit form (the messages call it the uxxx escape): each iteration
   reads one byte that must be a hex digit. */
171 for (charcount
= 0; charcount
< 4; charcount
++) {
174 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
175 if (codepoint
>= '0' && codepoint
<= '9') {
176 workchar
+= codepoint
- '0';
178 else if (codepoint
>= 'a' && codepoint
<= 'f') {
179 workchar
+= codepoint
- 'a' + 10;
181 else if (codepoint
>= 'A' && codepoint
<= 'F') {
182 workchar
+= codepoint
- 'A' + 10;
185 real_exception(interp
, NULL
, UNIMPLEMENTED
,
186 "Illegal escape sequence in uxxx escape");
190 real_exception(interp
, NULL
, UNIMPLEMENTED
,
191 "Illegal escape sequence in uxxx escape - too short");
/* Eight-digit form (the first message calls it the Uxxx escape): same
   digit handling as the four-digit loop. */
198 for (charcount
= 0; charcount
< 8; charcount
++) {
201 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
202 if (codepoint
>= '0' && codepoint
<= '9') {
203 workchar
+= codepoint
- '0';
205 else if (codepoint
>= 'a' && codepoint
<= 'f') {
206 workchar
+= codepoint
- 'a' + 10;
208 else if (codepoint
>= 'A' && codepoint
<= 'F') {
209 workchar
+= codepoint
- 'A' + 10;
212 real_exception(interp
, NULL
, UNIMPLEMENTED
,
213 "Illegal escape sequence in Uxxx escape");
/* NOTE(review): this message says uxxx although it sits in the
   eight-digit loop; it looks copied from the four-digit case -- confirm
   before changing the text. */
217 real_exception(interp
, NULL
, UNIMPLEMENTED
,
218 "Illegal escape sequence in uxxx escape - too short");
/* Octal form: the first digit seeds workchar, and up to two further bytes
   in the range 0..7 are added in. */
231 workchar
= codepoint
- '0';
234 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
235 if (codepoint
>= '0' && codepoint
<= '7') {
236 workchar
+= codepoint
- '0';
248 codepoint
= CHARSET_GET_BYTE(interp
, string
, *offset
);
249 if (codepoint
>= '0' && codepoint
<= '7') {
250 workchar
+= codepoint
- '0';
/* Fallback: a byte with no special meaning is returned as-is. */
282 return codepoint
; /* any not special return the char */
290 =head2 Character Property Functions
294 =item C<INTVAL Parrot_char_digit_value>
296 Returns the decimal digit value of the specified character if it is a decimal
297 digit character. If not, then -1 is returned.
299 Note that as currently written, C<Parrot_char_digit_value()> can
300 correctly return the decimal digit value of characters for which
301 C<Parrot_char_is_digit()> returns false.
308 PARROT_CONST_FUNCTION
310 Parrot_char_digit_value(SHIM_INTERP
, UINTVAL character
)
313 return u_charDigitValue(character
);
315 if ((character
>= 0x30) || (character
<= 0x39))
316 return character
- 0x30;
323 =item C<char * str_dup>
325 Duplicate a C string. Just like strdup(), except it dies if it runs out of memory.
/*
 * str_dup: copies the NUL-terminated C string `old` into a freshly
 * allocated buffer obtained from mem_sys_allocate().
 *
 * NOTE(review): the return type line, the braces and the return statement
 * are not visible in this view; presumably the copy is returned and the
 * caller releases it -- confirm against the full source.
 */
334 PARROT_CANNOT_RETURN_NULL
336 str_dup(ARGIN(const char *old
))
/* bytes counts the terminating NUL as well, so the memcpy below copies it
   too. */
338 const size_t bytes
= strlen(old
) + 1;
339 char * const copy
= (char *)mem_sys_allocate(bytes
);
340 memcpy(copy
, old
, bytes
);
/* NOTE(review): this call uses interp and line, neither of which is
   declared in the visible part of this function; presumably it sits under
   a debugging guard that is not visible here -- confirm. */
342 debug(interp
, 1, "line %d str_dup %s [%x]\n", line
, old
, copy
);
355 =item F<include/parrot/string_primitives.h>
357 =item F<include/parrot/string.h>
359 =item F<src/string.c>
370 * c-file-style: "parrot"
372 * vim: expandtab shiftwidth=4: