tagged release 0.6.4
[parrot.git] / src / string_primitives.c
blob55ee17c0f224ee71f6e96e01c9b5deb26d7d6157
1 /*
2 Copyright (C) 2006-2007, The Perl Foundation.
3 $Id$
5 =head1 NAME
7 src/string_primitives.c - String Primitives
9 =head1 DESCRIPTION
11 This file collects together all the functions that call into the ICU
12 API.
14 =head2 Functions
16 =over 4
18 =cut
22 /* HEADERIZER HFILE: include/parrot/string_primitives.h */
24 #include "parrot/parrot.h"
25 #if PARROT_HAS_ICU
26 # include <unicode/ucnv.h>
27 # include <unicode/utypes.h>
28 # include <unicode/uchar.h>
29 # include <unicode/ustring.h>
30 #else
31 # include <ctype.h>
32 #endif
36 =item C<void string_set_data_directory>
38 Set the directory where ICU finds its data files (encodings, locales,
39 etc.).
41 =cut
45 PARROT_API
46 void
47 string_set_data_directory(PARROT_INTERP, ARGIN(const char *dir))
49 #if PARROT_HAS_ICU
50 u_setDataDirectory(dir);
52 /* Since u_setDataDirectory doesn't have a result code, we'll spot
53 check that everything is okay by making sure that '9' had decimal
54 value 9. Using 57 rather than '9' so that the encoding of this
55 source code file isn't an issue.... (Don't want to get bitten by
56 EBCDIC.) */
58 if (!u_isdigit(57) || (u_charDigitValue(57) != 9)) {
59 real_exception(interp, NULL, ICU_ERROR,
60 "string_set_data_directory: ICU data files not found"
61 "(apparently) for directory [%s]", dir);
63 #else
64 UNUSED(dir);
66 real_exception(interp, NULL, ICU_ERROR,
67 "string_set_data_directory: parrot compiled without ICU support");
68 #endif
73 =item C<Parrot_UInt4 string_unescape_one>
75 Unescape a single character. We assume that we're at the start of a
76 sequence, right after the \.
78 =cut
82 PARROT_API
83 Parrot_UInt4
84 string_unescape_one(PARROT_INTERP, ARGMOD(UINTVAL *offset),
85 ARGMOD(STRING *string))
87 UINTVAL workchar = 0;
88 UINTVAL charcount = 0;
89 const UINTVAL len = string_length(interp, string);
90 /* Well, not right now */
91 UINTVAL codepoint = CHARSET_GET_BYTE(interp, string, *offset);
92 ++*offset;
93 switch (codepoint) {
94 case 'x':
95 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
96 if (codepoint >= '0' && codepoint <= '9') {
97 workchar = codepoint - '0';
99 else if (codepoint >= 'a' && codepoint <= 'f') {
100 workchar = codepoint - 'a' + 10;
102 else if (codepoint >= 'A' && codepoint <= 'F') {
103 workchar = codepoint - 'A' + 10;
105 else if (codepoint == '{') {
106 int i;
107 ++*offset;
108 workchar = 0;
109 for (i = 0; i < 8 && *offset < len; ++i, ++*offset) {
110 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
111 if (codepoint == '}') {
112 ++*offset;
113 return workchar;
115 workchar *= 16;
116 if (codepoint >= '0' && codepoint <= '9') {
117 workchar += codepoint - '0';
119 else if (codepoint >= 'a' && codepoint <= 'f') {
120 workchar += codepoint - 'a' + 10;
122 else if (codepoint >= 'A' && codepoint <= 'F') {
123 workchar += codepoint - 'A' + 10;
125 else {
126 real_exception(interp, NULL, UNIMPLEMENTED,
127 "Illegal escape sequence inside {}");
130 if (*offset == len)
131 real_exception(interp, NULL, UNIMPLEMENTED,
132 "Illegal escape sequence no '}'");
134 else {
135 real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence in");
137 ++*offset;
138 if (*offset < len) {
139 workchar *= 16;
140 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
141 if (codepoint >= '0' && codepoint <= '9') {
142 workchar += codepoint - '0';
144 else if (codepoint >= 'a' && codepoint <= 'f') {
145 workchar += codepoint - 'a' + 10;
147 else if (codepoint >= 'A' && codepoint <= 'F') {
148 workchar += codepoint - 'A' + 10;
150 else {
151 return workchar;
154 else {
155 return workchar;
157 ++*offset;
158 return workchar;
159 case 'c':
160 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
161 if (codepoint >= 'A' && codepoint <= 'Z') {
162 workchar = codepoint - 'A' + 1;
164 else {
165 real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence");
167 ++*offset;
168 return workchar;
169 case 'u':
170 workchar = 0;
171 for (charcount = 0; charcount < 4; charcount++) {
172 if (*offset < len) {
173 workchar *= 16;
174 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
175 if (codepoint >= '0' && codepoint <= '9') {
176 workchar += codepoint - '0';
178 else if (codepoint >= 'a' && codepoint <= 'f') {
179 workchar += codepoint - 'a' + 10;
181 else if (codepoint >= 'A' && codepoint <= 'F') {
182 workchar += codepoint - 'A' + 10;
184 else {
185 real_exception(interp, NULL, UNIMPLEMENTED,
186 "Illegal escape sequence in uxxx escape");
189 else {
190 real_exception(interp, NULL, UNIMPLEMENTED,
191 "Illegal escape sequence in uxxx escape - too short");
193 ++*offset;
195 return workchar;
196 case 'U':
197 workchar = 0;
198 for (charcount = 0; charcount < 8; charcount++) {
199 if (*offset < len) {
200 workchar *= 16;
201 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
202 if (codepoint >= '0' && codepoint <= '9') {
203 workchar += codepoint - '0';
205 else if (codepoint >= 'a' && codepoint <= 'f') {
206 workchar += codepoint - 'a' + 10;
208 else if (codepoint >= 'A' && codepoint <= 'F') {
209 workchar += codepoint - 'A' + 10;
211 else {
212 real_exception(interp, NULL, UNIMPLEMENTED,
213 "Illegal escape sequence in Uxxx escape");
216 else {
217 real_exception(interp, NULL, UNIMPLEMENTED,
218 "Illegal escape sequence in uxxx escape - too short");
220 ++*offset;
222 return workchar;
223 case '0':
224 case '1':
225 case '2':
226 case '3':
227 case '4':
228 case '5':
229 case '6':
230 case '7':
231 workchar = codepoint - '0';
232 if (*offset < len) {
233 workchar *= 8;
234 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
235 if (codepoint >= '0' && codepoint <= '7') {
236 workchar += codepoint - '0';
238 else {
239 return workchar;
242 else {
243 return workchar;
245 ++*offset;
246 if (*offset < len) {
247 workchar *= 8;
248 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
249 if (codepoint >= '0' && codepoint <= '7') {
250 workchar += codepoint - '0';
252 else {
253 return workchar;
256 else {
257 return workchar;
259 ++*offset;
260 return workchar;
261 case 'a':
262 return 7; /* bell */
263 case 'b':
264 return 8; /* bs */
265 case 't':
266 return 9;
267 case 'n':
268 return 10;
269 case 'v':
270 return 11;
271 case 'f':
272 return 12;
273 case 'r':
274 return 13;
275 case 'e':
276 return 27;
277 case 92: /* \ */
278 return 92;
279 case '"':
280 return '"';
281 default:
282 return codepoint; /* any not special return the char */
288 =back
290 =head2 Character Property Functions
292 =over 4
294 =item C<INTVAL Parrot_char_digit_value>
296 Returns the decimal digit value of the specified character if it is a decimal
297 digit character. If not, then -1 is returned.
299 Note that as currently written, C<Parrot_char_digit_value()> can
300 correctly return the decimal digit value of characters for which
301 C<Parrot_char_is_digit()> returns false.
303 =cut
307 PARROT_API
308 PARROT_CONST_FUNCTION
309 INTVAL
310 Parrot_char_digit_value(SHIM_INTERP, UINTVAL character)
312 #if PARROT_HAS_ICU
313 return u_charDigitValue(character);
314 #else
315 if ((character >= 0x30) || (character <= 0x39))
316 return character - 0x30;
317 return -1;
318 #endif
323 =item C<char * str_dup>
325 Duplicate a C string. Just like strdup(), except it dies if it runs
326 out of memory.
328 =cut
332 PARROT_API
333 PARROT_MALLOC
334 PARROT_CANNOT_RETURN_NULL
335 char *
336 str_dup(ARGIN(const char *old))
338 const size_t bytes = strlen(old) + 1;
339 char * const copy = (char *)mem_sys_allocate(bytes);
340 memcpy(copy, old, bytes);
341 #ifdef MEMDEBUG
342 debug(interp, 1, "line %d str_dup %s [%x]\n", line, old, copy);
343 #endif
344 return copy;
349 =back
351 =head1 SEE ALSO
353 =over 4
355 =item F<include/parrot/string_primitives.h>
357 =item F<include/parrot/string.h>
359 =item F<src/string.c>
361 =back
363 =cut
369 * Local variables:
370 * c-file-style: "parrot"
371 * End:
372 * vim: expandtab shiftwidth=4: