[parrot_debugger] Improve error checking of eval, add tests and untodo-ify tests...
[parrot.git] / src / string / api.c
blob425bb466aaead089fb88c55e63439a98a52fcc2d
1 /*
2 Copyright (C) 2001-2009, Parrot Foundation.
3 $Id$
5 =head1 NAME
7 src/string/api.c - Parrot Strings
9 =head1 DESCRIPTION
11 This file implements the non-ICU parts of the Parrot string subsystem.
13 Note that C<bufstart> and C<buflen> are used by the memory subsystem. The
14 string functions may only use C<buflen> to determine if there is some space
15 left beyond C<bufused>. This is the I<only> valid usage of these two data
16 members, beside setting C<bufstart>/C<buflen> for external strings.
18 =head2 Functions
20 =over 4
22 =cut
26 #include "parrot/parrot.h"
27 #include "parrot/compiler.h"
28 #include "parrot/string_funcs.h"
29 #include "private_cstring.h"
30 #include "api.str"
32 #define nonnull_encoding_name(s) (s) ? (s)->encoding->name : "null string"
33 #define saneify_string(s) \
34 PARROT_ASSERT((s)->encoding); \
35 PARROT_ASSERT((s)->charset); \
36 PARROT_ASSERT(!PObj_on_free_list_TEST(s))
38 /* HEADERIZER HFILE: include/parrot/string_funcs.h */
40 /* HEADERIZER BEGIN: static */
41 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
43 static void make_writable(PARROT_INTERP,
44 ARGMOD(STRING **s),
45 const size_t len,
46 parrot_string_representation_t representation)
47 __attribute__nonnull__(1)
48 __attribute__nonnull__(2)
49 FUNC_MODIFIES(*s);
51 #define ASSERT_ARGS_make_writable __attribute__unused__ int _ASSERT_ARGS_CHECK = \
52 PARROT_ASSERT_ARG(interp) \
53 || PARROT_ASSERT_ARG(s)
54 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
55 /* HEADERIZER END: static */
60 =item C<void Parrot_str_write_COW(PARROT_INTERP, STRING *s)>
62 If the specified Parrot string is copy-on-write then the memory is
63 copied over and the copy-on-write flag is cleared.
65 =cut
69 PARROT_EXPORT
70 void
71 Parrot_str_write_COW(PARROT_INTERP, ARGMOD(STRING *s))
73 ASSERT_ARGS(Parrot_str_write_COW)
75 /* COW_FLAG | constant_FLAG | external_FLAG) */
76 if (PObj_is_cowed_TESTALL(s)) {
77 STRING for_alloc;
79 /* Create new pool data for this header to use,
80 * independent of the original COW data */
81 PObj_constant_CLEAR(s);
83 /* constant may have been marked */
84 PObj_live_CLEAR(s);
87 * allocate a dummy strings memory
88 * buflen might be bigger and used, so pass this length
89 * also be sure not to allocate from the constant pool
91 PObj_flags_CLEARALL(&for_alloc);
92 Parrot_gc_allocate_string_storage(interp, &for_alloc, PObj_buflen(s));
94 /* now copy memory over */
95 mem_sys_memcopy(for_alloc.strstart, s->strstart, s->bufused);
97 /* and finally use that string memory */
99 PObj_bufstart(s) = PObj_bufstart(&for_alloc);
100 s->strstart = for_alloc.strstart;
101 PObj_buflen(s) = PObj_buflen(&for_alloc);
103 /* COW_FLAG | external_FLAG */
104 PObj_is_external_CLEARALL(s);
107 s->hashval = 0;
112 =item C<STRING * Parrot_str_new_COW(PARROT_INTERP, STRING *s)>
114 Creates a copy-on-write string, cloning a string header without
115 allocating a new buffer.
117 =cut
121 PARROT_EXPORT
122 PARROT_CANNOT_RETURN_NULL
123 PARROT_WARN_UNUSED_RESULT
124 STRING *
125 Parrot_str_new_COW(PARROT_INTERP, ARGMOD(STRING *s))
127 ASSERT_ARGS(Parrot_str_new_COW)
128 STRING *d;
130 if (PObj_constant_TEST(s)) {
131 d = Parrot_gc_new_string_header(interp,
132 PObj_get_FLAGS(s) & ~PObj_constant_FLAG);
133 PObj_COW_SET(s);
134 STRUCT_COPY(d, s);
135 /* we can't move the memory, because constants aren't
136 * scanned in compact_pool, therefore the other end
137 * would point to garbage.
139 PObj_constant_CLEAR(d);
140 PObj_external_SET(d);
142 else {
143 d = Parrot_gc_new_string_header(interp, PObj_get_FLAGS(s));
144 PObj_COW_SET(s);
145 STRUCT_COPY(d, s);
146 PObj_sysmem_CLEAR(d);
147 #if 0
148 /* XXX FIXME hack to avoid cross-interpreter issue until it
149 * is fixed correctly. */
150 if (n_interpreters > 1 && PObj_is_movable_TESTALL(s) &&
151 !Parrot_gc_ptr_in_memory_pool(interp, PObj_bufstart(s))) {
152 Parrot_str_write_COW(interp, d);
153 Parrot_io_eprintf(interp, "cross-interpreter copy of "
154 "relocatable string '%Ss' into tid %d\n",
156 interp->thread_data->tid);
158 #endif
160 return d;
165 =item C<STRING * Parrot_str_reuse_COW(PARROT_INTERP, STRING *s, STRING *d)>
167 Creates a copy-on-write string by cloning a string header without
168 allocating a new buffer. Doesn't allocate a new string header, instead
169 using the one passed in and returns it.
171 =cut
175 PARROT_EXPORT
176 PARROT_CANNOT_RETURN_NULL
177 STRING *
178 Parrot_str_reuse_COW(SHIM_INTERP, ARGMOD(STRING *s), ARGOUT(STRING *d))
180 ASSERT_ARGS(Parrot_str_reuse_COW)
182 if (PObj_constant_TEST(s)) {
183 PObj_COW_SET(s);
184 STRUCT_COPY(d, s);
185 PObj_constant_CLEAR(d);
186 PObj_external_SET(d);
188 else {
189 PObj_COW_SET(s);
190 STRUCT_COPY(d, s);
191 PObj_sysmem_CLEAR(d);
193 return d;
198 =item C<STRING * Parrot_str_set(PARROT_INTERP, STRING *dest, STRING *src)>
200 Makes the contents of first Parrot string a copy of the contents of
201 second.
203 =cut
207 PARROT_EXPORT
208 PARROT_CANNOT_RETURN_NULL
209 STRING *
210 Parrot_str_set(PARROT_INTERP, ARGIN_NULLOK(STRING *dest), ARGMOD(STRING *src))
212 ASSERT_ARGS(Parrot_str_set)
213 if (dest == src)
214 return dest;
215 if (dest) { /* && dest != src */
216 /* they are different, dest is not an external string */
217 #ifdef GC_IS_MALLOC
218 if (!PObj_is_cowed_TESTALL(dest) && PObj_bufstart(dest)) {
219 mem_sys_free(PObj_bufallocstart(dest));
221 #endif
222 dest = Parrot_str_reuse_COW(interp, src, dest);
224 else
225 dest = Parrot_str_new_COW(interp, src);
226 return dest;
232 =back
234 =head2 Basic String Functions
236 Creation, enlargement, etc.
238 =over 4
240 =item C<void Parrot_str_init(PARROT_INTERP)>
242 Initializes the Parrot string subsystem.
244 =cut
248 PARROT_EXPORT
249 void
250 Parrot_str_init(PARROT_INTERP)
252 ASSERT_ARGS(Parrot_str_init)
253 Hash *const_cstring_hash;
254 size_t i;
255 const size_t n_parrot_cstrings =
256 sizeof (parrot_cstrings) / sizeof (parrot_cstrings[0]);
258 if (interp->parent_interpreter) {
259 interp->hash_seed = interp->parent_interpreter->hash_seed;
261 else {
262 /* TT #64 - use an entropy source once available */
263 Parrot_srand(Parrot_intval_time());
264 interp->hash_seed = Parrot_uint_rand(0);
267 /* initialize the constant string table */
268 if (interp->parent_interpreter) {
269 interp->const_cstring_table =
270 interp->parent_interpreter->const_cstring_table;
271 interp->const_cstring_hash =
272 interp->parent_interpreter->const_cstring_hash;
273 return;
276 /* Set up the cstring cache, then load the basic encodings and charsets */
277 const_cstring_hash = parrot_new_cstring_hash(interp);
278 interp->const_cstring_hash = const_cstring_hash;
279 Parrot_charsets_encodings_init(interp);
281 interp->const_cstring_table =
282 mem_allocate_n_zeroed_typed(n_parrot_cstrings, STRING *);
284 for (i = 0; i < n_parrot_cstrings; ++i) {
285 DECL_CONST_CAST;
286 STRING * const s =
287 Parrot_str_new_init(interp,
288 parrot_cstrings[i].string,
289 parrot_cstrings[i].len,
290 PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
291 PObj_external_FLAG|PObj_constant_FLAG);
292 parrot_hash_put(interp, const_cstring_hash,
293 PARROT_const_cast(char *, parrot_cstrings[i].string), (void *)s);
294 interp->const_cstring_table[i] = s;
300 =item C<void Parrot_str_finish(PARROT_INTERP)>
302 De-Initializes the Parrot string subsystem.
304 =cut
308 PARROT_EXPORT
309 void
310 Parrot_str_finish(PARROT_INTERP)
312 ASSERT_ARGS(Parrot_str_finish)
313 /* all are shared between interpreters */
314 if (!interp->parent_interpreter) {
315 mem_sys_free(interp->const_cstring_table);
316 interp->const_cstring_table = NULL;
317 Parrot_charsets_encodings_deinit(interp);
318 parrot_hash_destroy(interp, interp->const_cstring_hash);
324 =item C<UINTVAL string_capacity(PARROT_INTERP, const STRING *s)>
326 Returns the capacity of the specified Parrot string in bytes, that
327 is how many bytes can be appended onto strstart.
329 =cut
333 PARROT_EXPORT
334 PARROT_WARN_UNUSED_RESULT
335 PARROT_PURE_FUNCTION
336 UINTVAL
337 string_capacity(SHIM_INTERP, ARGIN(const STRING *s))
339 ASSERT_ARGS(string_capacity)
341 return ((ptrcast_t)PObj_bufstart(s) + PObj_buflen(s) -
342 (ptrcast_t)s->strstart);
347 =item C<STRING * Parrot_str_new_noinit(PARROT_INTERP,
348 parrot_string_representation_t representation, UINTVAL capacity)>
350 Creates and returns an empty Parrot string.
352 =cut
356 PARROT_EXPORT
357 PARROT_CANNOT_RETURN_NULL
358 STRING *
359 Parrot_str_new_noinit(PARROT_INTERP,
360 parrot_string_representation_t representation, UINTVAL capacity)
362 ASSERT_ARGS(Parrot_str_new_noinit)
363 STRING * const s = Parrot_gc_new_string_header(interp, 0);
365 /* TODO adapt string creation functions */
366 if (representation != enum_stringrep_one)
367 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
368 "Unsupported representation");
370 s->charset = PARROT_DEFAULT_CHARSET;
371 s->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, s);
373 Parrot_gc_allocate_string_storage(interp, s,
374 (size_t)string_max_bytes(interp, s, capacity));
376 return s;
381 =item C<const CHARSET * string_rep_compatible(PARROT_INTERP, const STRING *a,
382 const STRING *b, const ENCODING **e)>
384 Find the "lowest" possible charset and encoding for the given string. E.g.
386 ascii <op> utf8 => utf8
387 => ascii, B<if> C<STRING *b> has ascii chars only.
389 Returs NULL, if no compatible string representation can be found.
391 =cut
395 PARROT_EXPORT
396 PARROT_WARN_UNUSED_RESULT
397 PARROT_CAN_RETURN_NULL
398 const CHARSET *
399 string_rep_compatible(SHIM_INTERP,
400 ARGIN(const STRING *a), ARGIN(const STRING *b), ARGOUT(const ENCODING **e))
402 ASSERT_ARGS(string_rep_compatible)
403 if (a->encoding == b->encoding && a->charset == b->charset) {
404 *e = a->encoding;
405 return a->charset;
408 /* a table could possibly simplify the logic */
409 if (a->encoding == Parrot_utf8_encoding_ptr &&
410 b->charset == Parrot_ascii_charset_ptr) {
411 if (a->strlen == a->bufused) {
412 *e = Parrot_fixed_8_encoding_ptr;
413 return b->charset;
415 *e = a->encoding;
416 return a->charset;
418 if (b->encoding == Parrot_utf8_encoding_ptr &&
419 a->charset == Parrot_ascii_charset_ptr) {
420 if (b->strlen == b->bufused) {
421 *e = Parrot_fixed_8_encoding_ptr;
422 return a->charset;
424 *e = b->encoding;
425 return b->charset;
427 if (a->encoding != b->encoding)
428 return NULL;
429 if (a->encoding != Parrot_fixed_8_encoding_ptr)
430 return NULL;
431 *e = Parrot_fixed_8_encoding_ptr;
432 if (a->charset == b->charset)
433 return a->charset;
434 if (b->charset == Parrot_ascii_charset_ptr)
435 return a->charset;
436 if (a->charset == Parrot_ascii_charset_ptr)
437 return b->charset;
438 if (a->charset == Parrot_binary_charset_ptr)
439 return a->charset;
440 if (b->charset == Parrot_binary_charset_ptr)
441 return b->charset;
442 return NULL;
447 =item C<STRING * Parrot_str_concat(PARROT_INTERP, STRING *a, STRING *b, UINTVAL
448 Uflags)>
450 Concatenates two Parrot strings. If necessary, converts the second
451 string's encoding and/or type to match those of the first string. If
452 either string is C<NULL>, then a copy of the non-C<NULL> string is
453 returned. If both strings are C<NULL>, then a new zero-length string is
454 created and returned.
456 =cut
460 PARROT_EXPORT
461 PARROT_CANNOT_RETURN_NULL
462 STRING *
463 Parrot_str_concat(PARROT_INTERP, ARGIN_NULLOK(STRING *a),
464 ARGIN_NULLOK(STRING *b), UINTVAL Uflags)
466 ASSERT_ARGS(Parrot_str_concat)
467 if (a != NULL && a->strlen != 0) {
468 if (b != NULL && b->strlen != 0) {
469 const ENCODING *enc;
470 const CHARSET *cs = string_rep_compatible(interp, a, b, &enc);
471 STRING *result;
473 if (!cs) {
474 cs = a->charset;
475 enc = a->encoding;
477 result = Parrot_str_new_init(interp, NULL, a->bufused + b->bufused,
478 enc, cs, 0);
480 result = Parrot_str_append(interp, result, a);
481 result = Parrot_str_append(interp, result, b);
483 return result;
486 return Parrot_str_copy(interp, a);
489 return b
490 ? Parrot_str_copy(interp, b)
491 : string_make(interp, NULL, 0, NULL, Uflags);
497 =item C<STRING * Parrot_str_append(PARROT_INTERP, STRING *a, STRING *b)>
499 Take in two Parrot strings and append the second to the first. NOTE THAT
500 RETURN VALUE MAY NOT BE THE FIRST STRING, if the first string is COW'd or
501 read-only. So make sure to _use_ the return value.
503 =cut
507 PARROT_EXPORT
508 PARROT_WARN_UNUSED_RESULT
509 PARROT_CAN_RETURN_NULL
510 STRING *
511 Parrot_str_append(PARROT_INTERP, ARGMOD_NULLOK(STRING *a), ARGIN_NULLOK(STRING *b))
513 ASSERT_ARGS(Parrot_str_append)
514 UINTVAL a_capacity;
515 UINTVAL total_length;
516 const CHARSET *cs;
517 const ENCODING *enc;
519 /* XXX should this be a CHARSET method? */
521 /* If B isn't real, we just bail */
522 const UINTVAL b_len = b ? Parrot_str_byte_length(interp, b) : 0;
523 if (!b_len)
524 return a;
526 /* Is A real? */
527 if (a == NULL || PObj_bufstart(a) == NULL)
528 return Parrot_str_copy(interp, b);
530 saneify_string(a);
531 saneify_string(b);
533 /* If the destination's constant, or external then just fall back to
534 Parrot_str_concat */
535 if (PObj_is_cowed_TESTALL(a))
536 return Parrot_str_concat(interp, a, b, 0);
538 cs = string_rep_compatible(interp, a, b, &enc);
539 if (cs) {
540 a->charset = cs;
541 a->encoding = enc;
543 else {
544 /* upgrade strings for concatenation */
545 enc = (a->encoding == Parrot_utf16_encoding_ptr ||
546 b->encoding == Parrot_utf16_encoding_ptr ||
547 a->encoding == Parrot_ucs2_encoding_ptr ||
548 b->encoding == Parrot_ucs2_encoding_ptr)
549 ? Parrot_utf16_encoding_ptr
550 : Parrot_utf8_encoding_ptr;
552 Parrot_unicode_charset_ptr->to_charset(interp, a, NULL);
553 b = Parrot_unicode_charset_ptr->to_charset(interp, b,
554 Parrot_gc_new_string_header(interp, 0));
556 if (a->encoding != enc)
557 enc->to_encoding(interp, a, NULL);
558 if (b->encoding != enc)
559 enc->to_encoding(interp, b, NULL);
562 /* calc usable and total bytes */
563 a_capacity = string_capacity(interp, a);
564 total_length = a->bufused + b->bufused;
566 /* make sure A's big enough for both */
567 if (total_length > a_capacity)
568 Parrot_gc_reallocate_string_storage(interp, a, total_length << 1);
570 /* A is now ready to receive the contents of B */
572 /* Tack B on the end of A */
573 mem_sys_memcopy((void *)((ptrcast_t)a->strstart + a->bufused),
574 b->strstart, b->bufused);
576 a->bufused += b->bufused;
577 a->strlen += b_len;
578 a->hashval = 0;
580 return a;
585 =item C<STRING * Parrot_str_new(PARROT_INTERP, const char * const buffer, const
586 UINTVAL len)>
588 Make a Parrot string from a specified C string.
590 =cut
594 PARROT_EXPORT
595 PARROT_WARN_UNUSED_RESULT
596 PARROT_MALLOC
597 PARROT_CANNOT_RETURN_NULL
598 STRING *
599 Parrot_str_new(PARROT_INTERP, ARGIN_NULLOK(const char * const buffer), const UINTVAL len)
601 ASSERT_ARGS(Parrot_str_new)
602 return Parrot_str_new_init(interp, buffer, len ? len :
603 buffer ? strlen(buffer) : 0,
604 PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
605 0); /* Force an 8-bit encoding at some
606 point? */
611 =item C<const char* string_primary_encoding_for_representation(PARROT_INTERP,
612 parrot_string_representation_t representation)>
614 Returns the primary encoding for the specified representation.
616 This is needed for packfile unpacking, unless we just always use UTF-8 or BOCU.
618 =cut
622 PARROT_EXPORT
623 PARROT_CANNOT_RETURN_NULL
624 PARROT_OBSERVER
625 const char*
626 string_primary_encoding_for_representation(PARROT_INTERP,
627 parrot_string_representation_t representation)
629 ASSERT_ARGS(string_primary_encoding_for_representation)
630 if (representation == enum_stringrep_one)
631 return "ascii";
633 Parrot_ex_throw_from_c_args(interp, NULL,
634 EXCEPTION_INVALID_STRING_REPRESENTATION,
635 "string_primary_encoding_for_representation: "
636 "invalid string representation");
641 =item C<STRING * Parrot_str_new_constant(PARROT_INTERP, const char *buffer)>
643 Creates and returns a constant Parrot string.
645 =cut
649 PARROT_EXPORT
650 PARROT_WARN_UNUSED_RESULT
651 PARROT_CANNOT_RETURN_NULL
652 STRING *
653 Parrot_str_new_constant(PARROT_INTERP, ARGIN(const char *buffer))
655 ASSERT_ARGS(Parrot_str_new_constant)
656 DECL_CONST_CAST;
657 STRING *s;
658 Hash * const cstring_cache = (Hash *)interp->const_cstring_hash;
660 s = (STRING *)parrot_hash_get(interp, cstring_cache, buffer);
662 if (s)
663 return s;
665 s = Parrot_str_new_init(interp, buffer, strlen(buffer),
666 PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
667 PObj_external_FLAG|PObj_constant_FLAG);
669 parrot_hash_put(interp, cstring_cache,
670 PARROT_const_cast(char *, buffer), (void *)s);
672 return s;
678 =item C<STRING * string_make(PARROT_INTERP, const char *buffer, UINTVAL len,
679 const char *charset_name, UINTVAL flags)>
681 Creates and returns a new Parrot string using C<len> bytes of string data read
682 from C<buffer>.
684 The value of C<charset_name> specifies the string's representation.
685 The currently recognised values are:
687 'iso-8859-1'
688 'ascii'
689 'binary'
690 'unicode'
692 The encoding is implicitly guessed; C<unicode> implies the C<utf-8> encoding,
693 and the other three assume C<fixed-8> encoding.
695 If C<charset> is unspecified, the default charset 'ascii' will be used.
697 The value of C<flags> is optionally one or more C<PObj_*> flags C<OR>-ed
698 together.
700 =cut
704 PARROT_EXPORT
705 PARROT_WARN_UNUSED_RESULT
706 PARROT_CANNOT_RETURN_NULL
707 STRING *
708 string_make(PARROT_INTERP, ARGIN_NULLOK(const char *buffer),
709 UINTVAL len, ARGIN_NULLOK(const char *charset_name), UINTVAL flags)
711 ASSERT_ARGS(string_make)
712 const CHARSET *charset;
714 if (!charset_name)
715 charset_name = "ascii";
717 charset = Parrot_find_charset(interp, charset_name);
719 if (!charset)
720 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
721 "Can't make '%s' charset strings", charset_name);
723 return Parrot_str_new_init(interp, buffer, len,
724 charset->preferred_encoding, charset, flags);
731 =item C<STRING * string_make_from_charset(PARROT_INTERP, const char *buffer,
732 UINTVAL len, INTVAL charset_nr, UINTVAL flags)>
734 Creates and returns a new Parrot string using C<len> bytes of string data read
735 from C<buffer>.
737 The value of C<charset_name> specifies the string's representation. It must be
738 a valid charset identifier.
740 'iso-8859-1'
741 'ascii'
742 'binary'
743 'unicode'
745 The encoding is implicitly guessed; C<unicode> implies the C<utf-8> encoding,
746 and the other three assume C<fixed-8> encoding.
748 The value of C<flags> is optionally one or more C<PObj_*> flags C<OR>-ed
749 together.
751 =cut
755 PARROT_EXPORT
756 PARROT_WARN_UNUSED_RESULT
757 PARROT_CANNOT_RETURN_NULL
758 STRING *
759 string_make_from_charset(PARROT_INTERP, ARGIN_NULLOK(const char *buffer),
760 UINTVAL len, INTVAL charset_nr, UINTVAL flags)
762 ASSERT_ARGS(string_make_from_charset)
763 const CHARSET *charset = Parrot_get_charset(interp, charset_nr);
765 if (!charset)
766 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
767 "Invalid charset number '%d' specified", charset_nr);
769 return Parrot_str_new_init(interp, buffer, len,
770 charset->preferred_encoding, charset, flags);
776 =item C<STRING * Parrot_str_new_init(PARROT_INTERP, const char *buffer, UINTVAL
777 len, const ENCODING *encoding, const CHARSET *charset, UINTVAL flags)>
779 Given a buffer, its length, an encoding, a character set, and STRING flags,
780 creates and returns a new string. Don't call this directly.
782 =cut
786 PARROT_EXPORT
787 PARROT_WARN_UNUSED_RESULT
788 PARROT_CANNOT_RETURN_NULL
789 STRING *
790 Parrot_str_new_init(PARROT_INTERP, ARGIN_NULLOK(const char *buffer), UINTVAL len,
791 ARGIN(const ENCODING *encoding), ARGIN(const CHARSET *charset), UINTVAL flags)
793 ASSERT_ARGS(Parrot_str_new_init)
794 DECL_CONST_CAST;
795 STRING * const s = Parrot_gc_new_string_header(interp, flags);
796 s->encoding = encoding;
797 s->charset = charset;
799 if (flags & PObj_external_FLAG) {
801 * fast path for external (constant) strings - don't allocate
802 * and copy data
804 /* The following cast discards the 'const'. That raises
805 a warning with gcc, but is ok since the caller indicated
806 it was safe by setting PObj_external_FLAG.
807 (The cast is necessary to pacify TenDRA's tcc.)
809 PObj_bufstart(s) = s->strstart = PARROT_const_cast(char *, buffer);
810 PObj_buflen(s) = s->bufused = len;
812 if (encoding == Parrot_fixed_8_encoding_ptr)
813 s->strlen = len;
814 else
815 Parrot_str_length(interp, s);
817 return s;
820 Parrot_gc_allocate_string_storage(interp, s, len);
822 if (buffer) {
823 mem_sys_memcopy(s->strstart, buffer, len);
824 s->bufused = len;
825 if (encoding == Parrot_fixed_8_encoding_ptr)
826 s->strlen = len;
827 else
828 Parrot_str_length(interp, s);
830 else {
831 s->strlen = s->bufused = 0;
834 return s;
840 =item C<STRING * Parrot_str_resize(PARROT_INTERP, STRING *s, UINTVAL addlen)>
842 Grows the Parrot string's buffer by the specified number of characters.
844 =cut
848 PARROT_EXPORT
849 PARROT_CANNOT_RETURN_NULL
850 STRING *
851 Parrot_str_resize(PARROT_INTERP, ARGMOD(STRING *s), UINTVAL addlen)
853 ASSERT_ARGS(Parrot_str_resize)
854 Parrot_str_write_COW(interp, s);
856 /* Don't check buflen, if we are here, we already checked. */
857 Parrot_gc_reallocate_string_storage(interp,
858 s, PObj_buflen(s) + string_max_bytes(interp, s, addlen));
859 return s;
865 =back
867 =head2 Ordinary user-visible string operations
869 =over 4
871 =item C<UINTVAL Parrot_str_byte_length(PARROT_INTERP, const STRING *s)>
873 Returns the number of characters in the specified Parrot string.
875 =cut
879 PARROT_EXPORT
880 PARROT_PURE_FUNCTION
881 UINTVAL
882 Parrot_str_byte_length(SHIM_INTERP, ARGIN(const STRING *s))
884 ASSERT_ARGS(Parrot_str_byte_length)
886 return s->strlen;
892 =item C<INTVAL Parrot_str_indexed(PARROT_INTERP, const STRING *s, UINTVAL idx)>
894 Returns the character (or glyph, depending upon the string's encoding). This
895 abstracts the process of finding the Nth character in a (possibly Unicode or
896 JIS-encoded) string, the idea being that once the encoding functions are
897 fleshed out, this function can do the right thing.
899 Note that this is not range-checked.
901 =cut
905 PARROT_EXPORT
906 PARROT_WARN_UNUSED_RESULT
907 INTVAL
908 Parrot_str_indexed(PARROT_INTERP, ARGIN(const STRING *s), UINTVAL idx)
910 ASSERT_ARGS(Parrot_str_indexed)
911 saneify_string(s);
912 return (INTVAL)CHARSET_GET_CODEPOINT(interp, s, idx);
918 =item C<INTVAL Parrot_str_find_index(PARROT_INTERP, const STRING *s, const
919 STRING *s2, INTVAL start)>
921 Returns the character position of the second Parrot string in the first at or
922 after C<start>. The return value is a (0 based) offset in characters, not
923 bytes. If second string is not found in the first string, returns -1.
925 =cut
929 PARROT_EXPORT
930 PARROT_WARN_UNUSED_RESULT
931 INTVAL
932 Parrot_str_find_index(PARROT_INTERP, ARGIN(const STRING *s),
933 ARGIN(const STRING *s2), INTVAL start)
935 ASSERT_ARGS(Parrot_str_find_index)
936 STRING *src, *search;
937 UINTVAL len;
938 DECL_CONST_CAST;
940 if (start < 0)
941 return -1;
943 len = Parrot_str_byte_length(interp, s);
945 if (!len)
946 return -1;
948 if (start >= (INTVAL)len)
949 return -1;
951 if (!Parrot_str_byte_length(interp, s2))
952 return -1;
954 src = PARROT_const_cast(STRING *, s);
955 search = PARROT_const_cast(STRING *, s2);
957 return CHARSET_INDEX(interp, src, search, (UINTVAL)start);
963 =item C<INTVAL string_ord(PARROT_INTERP, const STRING *s, INTVAL idx)>
965 Returns the codepoint at a given index into a string. Negative indexes are
966 treated as counting from the end of the string.
968 =cut
972 PARROT_EXPORT
973 PARROT_WARN_UNUSED_RESULT
974 INTVAL
975 string_ord(PARROT_INTERP, ARGIN_NULLOK(const STRING *s), INTVAL idx)
977 ASSERT_ARGS(string_ord)
978 const UINTVAL len = s ? Parrot_str_byte_length(interp, s) : 0;
979 UINTVAL true_index = (UINTVAL)idx;
981 if (len == 0)
982 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ORD_OUT_OF_STRING,
983 "Cannot get character of empty string");
985 if (idx < 0) {
986 if ((INTVAL)(idx + len) < 0)
987 Parrot_ex_throw_from_c_args(interp, NULL,
988 EXCEPTION_ORD_OUT_OF_STRING,
989 "Cannot get character before beginning of string");
991 true_index = (UINTVAL)(len + idx);
994 if (true_index > (len - 1))
995 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ORD_OUT_OF_STRING,
996 "Cannot get character past end of string");
998 return Parrot_str_indexed(interp, s, true_index);
1004 =item C<STRING * string_chr(PARROT_INTERP, UINTVAL character)>
1006 Returns a single-character Parrot string.
1008 TODO - Allow this to take an array of characters?
1010 =cut
1014 PARROT_EXPORT
1015 PARROT_CANNOT_RETURN_NULL
1016 PARROT_WARN_UNUSED_RESULT
1017 STRING *
1018 string_chr(PARROT_INTERP, UINTVAL character)
1020 ASSERT_ARGS(string_chr)
1021 if (character > 0xff)
1022 return Parrot_unicode_charset_ptr->string_from_codepoint(interp,
1023 character);
1025 else if (character > 0x7f)
1026 return Parrot_iso_8859_1_charset_ptr->string_from_codepoint(interp,
1027 character);
1029 else
1030 return Parrot_ascii_charset_ptr->string_from_codepoint(interp,
1031 character);
1037 =item C<STRING * Parrot_str_copy(PARROT_INTERP, STRING *s)>
1039 Creates and returns a copy of the specified Parrot string.
1041 =cut
1045 PARROT_EXPORT
1046 PARROT_CANNOT_RETURN_NULL
1047 PARROT_WARN_UNUSED_RESULT
1048 STRING *
1049 Parrot_str_copy(PARROT_INTERP, ARGMOD(STRING *s))
1051 ASSERT_ARGS(Parrot_str_copy)
1052 return Parrot_str_new_COW(interp, s);
1058 =back
1060 =head2 Vtable Dispatch Functions
1062 =over 4
1064 =item C<INTVAL Parrot_str_length(PARROT_INTERP, STRING *s)>
1066 Calculates and returns the number of characters in the specified Parrot string.
1068 =cut
1072 PARROT_EXPORT
1073 PARROT_IGNORABLE_RESULT
1074 INTVAL
1075 Parrot_str_length(PARROT_INTERP, ARGMOD(STRING *s))
1077 ASSERT_ARGS(Parrot_str_length)
1079 s->strlen = CHARSET_CODEPOINTS(interp, s);
1080 return s->strlen;
1086 =item C<INTVAL string_max_bytes(PARROT_INTERP, const STRING *s, UINTVAL nchars)>
1088 Returns the number of bytes required to safely contain the specified number
1089 of characters in the specified Parrot string's representation.
1091 =cut
1095 PARROT_EXPORT
1096 PARROT_WARN_UNUSED_RESULT
1097 INTVAL
1098 string_max_bytes(SHIM_INTERP, ARGIN(const STRING *s), UINTVAL nchars)
1100 ASSERT_ARGS(string_max_bytes)
1101 PARROT_ASSERT(s->encoding);
1102 return ENCODING_MAX_BYTES_PER_CODEPOINT(interp, s) * nchars;
1107 =item C<STRING * Parrot_str_repeat(PARROT_INTERP, const STRING *s, UINTVAL num)>
1109 Repeats the specified Parrot string I<num> times and returns the result.
1111 =cut
1115 PARROT_EXPORT
1116 PARROT_CANNOT_RETURN_NULL
1117 STRING *
1118 Parrot_str_repeat(PARROT_INTERP, ARGIN(const STRING *s), UINTVAL num)
1120 ASSERT_ARGS(Parrot_str_repeat)
1121 STRING * const dest = Parrot_str_new_init(interp, NULL,
1122 s->bufused * num,
1123 s->encoding, s->charset, 0);
1124 if (num > 0) {
1125 /* copy s into dest num times */
1126 UINTVAL length = s->bufused;
1127 UINTVAL i;
1128 char * destpos = dest->strstart;
1129 const char * const srcpos = s->strstart;
1130 for (i = 0; i < num; i++) {
1131 mem_sys_memcopy(destpos, srcpos, length);
1132 destpos += length;
1135 dest->bufused = s->bufused * num;
1136 dest->strlen = s->strlen * num;
1139 return dest;
1144 =item C<STRING * Parrot_str_substr(PARROT_INTERP, STRING *src, INTVAL offset,
1145 INTVAL length, STRING **d, int replace_dest)>
1147 Copies the substring of length C<length> from C<offset> from the specified
1148 Parrot string and stores it in C<**d>, allocating memory if necessary. The
1149 substring is also returned.
1151 =cut
1155 PARROT_EXPORT
1156 PARROT_CANNOT_RETURN_NULL
1157 PARROT_WARN_UNUSED_RESULT
1158 STRING *
1159 Parrot_str_substr(PARROT_INTERP,
1160 ARGIN_NULLOK(STRING *src), INTVAL offset, INTVAL length,
1161 ARGOUT_NULLOK(STRING **d), int replace_dest)
1163 ASSERT_ARGS(Parrot_str_substr)
1164 if (src == NULL)
1165 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_SUBSTR_OUT_OF_STRING,
1166 "Cannot substr on a null string");
1167 else {
1168 STRING *dest;
1169 UINTVAL true_length;
1170 UINTVAL true_offset = (UINTVAL)offset;
1172 saneify_string(src);
1174 /* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
1175 if (offset == (INTVAL)Parrot_str_byte_length(interp, src) || length < 1)
1176 return Parrot_str_new_noinit(interp, enum_stringrep_one, 0);
1178 if (offset < 0)
1179 true_offset = (UINTVAL)(src->strlen + offset);
1181 /* 0 based... */
1182 if (src->strlen == 0 || true_offset > src->strlen - 1)
1183 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_SUBSTR_OUT_OF_STRING,
1184 "Cannot take substr outside string");
1186 true_length = (UINTVAL)length;
1187 if (true_length > (src->strlen - true_offset))
1188 true_length = (UINTVAL)(src->strlen - true_offset);
1190 /* do in-place i.e. reuse existing header if one */
1191 if (replace_dest && d && *d) {
1192 PARROT_ASSERT(src->encoding == Parrot_fixed_8_encoding_ptr);
1193 dest = *d;
1195 dest->encoding = src->encoding;
1196 dest->charset = src->charset;
1198 dest->strstart = (char *)src->strstart + true_offset;
1199 dest->bufused = true_length;
1201 dest->strlen = true_length;
1202 dest->hashval = 0;
1204 else
1205 dest = CHARSET_GET_CODEPOINTS(interp, src, true_offset,
1206 true_length);
1208 if (d)
1209 *d = dest;
1211 return dest;
1218 =item C<STRING * Parrot_str_replace(PARROT_INTERP, STRING *src, INTVAL offset,
1219 INTVAL length, STRING *rep, STRING **d)>
1221 Replaces a sequence of C<length> characters from C<offset> in the first
1222 Parrot string with the second Parrot string, returning what was
1223 replaced.
1225 This follows the Perl semantics for:
1227 substr EXPR, OFFSET, LENGTH, REPLACEMENT
1229 Replacing a sequence of characters with a longer string grows the
1230 string; a shorter string shrinks it.
1232 Replacing 2 past the end of the string is undefined. However replacing 1
1233 past the end of the string concatenates the two strings.
1235 A negative offset is allowed to replace from the end.
1237 =cut
1241 PARROT_EXPORT
1242 PARROT_CAN_RETURN_NULL
1243 STRING *
1244 Parrot_str_replace(PARROT_INTERP, ARGIN(STRING *src),
1245 INTVAL offset, INTVAL length, ARGIN(STRING *rep), ARGOUT_NULLOK(STRING **d))
1247 ASSERT_ARGS(Parrot_str_replace)
1248 UINTVAL start_byte, end_byte;
1249 INTVAL diff;
1250 String_iter iter;
1252 const CHARSET *cs;
1253 const ENCODING *enc;
1254 STRING *dest = NULL;
1255 UINTVAL true_offset = (UINTVAL)offset;
1256 UINTVAL true_length = (UINTVAL)length;
1258 /* special case */
1259 if (d == NULL
1260 && src->encoding == Parrot_fixed_8_encoding_ptr
1261 && rep->encoding == Parrot_fixed_8_encoding_ptr
1262 && offset >= 0
1263 && true_offset < src->strlen
1264 && length == 1
1265 && rep->strlen == 1) {
1266 if (PObj_is_cowed_TESTALL(src))
1267 Parrot_str_write_COW(interp, src);
1269 ((char *)src->strstart)[offset] = ((char *)rep->strstart)[0];
1271 return NULL;
1274 /* abs(-offset) may not be > strlen-1 */
1275 if (offset < 0)
1276 true_offset = (UINTVAL)(src->strlen + offset);
1278 /* Can replace 1 past end of string which is technically outside the string
1279 * but is same as a concat().
1280 * Only give exception if caller trys to replace end of string + 2
1282 if (true_offset > src->strlen)
1283 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_SUBSTR_OUT_OF_STRING,
1284 "Can only replace inside string or index after end of string");
1286 if (true_length > (src->strlen - true_offset))
1287 true_length = (UINTVAL)(src->strlen - true_offset);
1289 /* Save the substring that is replaced for the return value */
1290 if (d) {
1291 dest = CHARSET_GET_CODEPOINTS(interp, src, true_offset, true_length);
1292 *d = dest;
1295 /* may have different reps..... */
1296 cs = string_rep_compatible(interp, src, rep, &enc);
1298 if (!cs) {
1299 Parrot_utf16_encoding_ptr->to_encoding(interp, src, NULL);
1300 rep = Parrot_utf16_encoding_ptr->to_encoding(interp, rep,
1301 Parrot_gc_new_string_header(interp, 0));
1303 else {
1304 src->charset = cs;
1305 src->encoding = enc;
1308 /* get byte position of the part that will be replaced */
1309 ENCODING_ITER_INIT(interp, src, &iter);
1311 iter.set_position(interp, &iter, true_offset);
1312 start_byte = iter.bytepos;
1314 iter.set_position(interp, &iter, true_offset + true_length);
1315 end_byte = iter.bytepos;
1317 /* not possible.... */
1318 if (end_byte < start_byte)
1319 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_SUBSTR_OUT_OF_STRING,
1320 "replace: subend somehow is less than substart");
1322 /* Now do the replacement */
1325 * If the replacement string fits inside the original substring
1326 * don't create a new string, just pack it.
1328 diff = (end_byte - start_byte) - rep->bufused;
1330 if (diff >= 0
1331 || ((INTVAL)src->bufused - (INTVAL)PObj_buflen(src)) <= diff) {
1332 Parrot_str_write_COW(interp, src);
1334 if (diff != 0) {
1335 mem_sys_memmove((char *)src->strstart + start_byte + rep->bufused,
1336 (char *)src->strstart + end_byte,
1337 src->bufused - end_byte);
1338 src->bufused -= diff;
1341 mem_sys_memcopy((char *)src->strstart + start_byte,
1342 rep->strstart, rep->bufused);
1344 if (diff)
1345 (void)Parrot_str_length(interp, src);
1348 /* Replacement is larger than avail buffer, grow the string */
1349 else {
1350 /* diff is negative here, make it positive */
1351 diff = -(diff);
1352 Parrot_str_resize(interp, src, (UINTVAL)diff);
1354 /* Move the end of old string that isn't replaced to new offset first */
1355 mem_sys_memmove((char *)src->strstart + end_byte + diff,
1356 (char *)src->strstart + end_byte,
1357 src->bufused - end_byte);
1359 /* Copy the replacement in */
1360 mem_sys_memcopy((char *)src->strstart + start_byte, rep->strstart,
1361 rep->bufused);
1362 src->bufused += diff;
1363 (void)Parrot_str_length(interp, src);
1366 /* src is modified, now return the original substring */
1367 return dest;
1373 =item C<STRING * Parrot_str_chopn(PARROT_INTERP, STRING *s, INTVAL n)>
1375 Removes the last C<n> characters of the specified Parrot string. If C<n> is
1376 negative, cuts the string after C<+n> characters. The returned string is a copy
1377 of the one passed in.
1379 =cut
1383 PARROT_EXPORT
1384 PARROT_CANNOT_RETURN_NULL
1385 STRING *
1386 Parrot_str_chopn(PARROT_INTERP, ARGMOD(STRING *s), INTVAL n)
1388 ASSERT_ARGS(Parrot_str_chopn)
1389 STRING * const chopped = Parrot_str_copy(interp, s);
1390 Parrot_str_chopn_inplace(interp, chopped, n);
1391 return chopped;
1397 =item C<void Parrot_str_chopn_inplace(PARROT_INTERP, STRING *s, INTVAL n)>
1399 Removes the last C<n> characters of the specified Parrot string. If C<n> is
1400 negative, cuts the string after C<+n> characters. The string passed in is
1401 modified and returned.
1403 =cut
1407 PARROT_EXPORT
1408 void
1409 Parrot_str_chopn_inplace(PARROT_INTERP, ARGMOD(STRING *s), INTVAL n)
1411 ASSERT_ARGS(Parrot_str_chopn_inplace)
1412 UINTVAL new_length, uchar_size;
1414 if (n < 0) {
1415 new_length = -n;
1416 if (new_length > s->strlen)
1417 return;
1419 else {
1420 if (s->strlen > (UINTVAL)n)
1421 new_length = s->strlen - n;
1422 else
1423 new_length = 0;
1426 s->hashval = 0;
1428 if (!new_length || !s->strlen) {
1429 s->bufused = s->strlen = 0;
1430 return;
1433 uchar_size = s->bufused / s->strlen;
1434 s->strlen = new_length;
1436 if (s->encoding == Parrot_fixed_8_encoding_ptr) {
1437 s->bufused = new_length;
1439 else if (s->encoding == Parrot_ucs2_encoding_ptr) {
1440 s->bufused = new_length * uchar_size;
1442 else {
1443 String_iter iter;
1445 ENCODING_ITER_INIT(interp, s, &iter);
1446 iter.set_position(interp, &iter, new_length);
1447 s->bufused = iter.bytepos;
1450 return;
1456 =item C<INTVAL Parrot_str_compare(PARROT_INTERP, const STRING *s1, const STRING
1457 *s2)>
1459 Compares two strings to each other. If s1 is less than s2, returns -1. If the
1460 strings are equal, returns 0. If s1 is greater than s2, returns 2. This
1461 comparison uses the character set collation order of the strings for
1462 comparison.
1464 =cut
1468 PARROT_EXPORT
1469 PARROT_WARN_UNUSED_RESULT
1470 INTVAL
1471 Parrot_str_compare(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1), ARGIN_NULLOK(const STRING *s2))
1473 ASSERT_ARGS(Parrot_str_compare)
1474 if (!s2)
1475 return s1 && (s1->strlen != 0);
1477 if (!s1)
1478 return -(s2->strlen != 0);
1480 saneify_string(s1);
1481 saneify_string(s2);
1483 return CHARSET_COMPARE(interp, s1, s2);
1489 =item C<INTVAL Parrot_str_not_equal(PARROT_INTERP, const STRING *s1, const
1490 STRING *s2)>
1492 Compares two Parrot strings, performing type and encoding conversions if
1493 necessary. Returns 1 if the strings are not equal, and 0 otherwise.
1495 =cut
1499 PARROT_EXPORT
1500 PARROT_WARN_UNUSED_RESULT
1501 INTVAL
1502 Parrot_str_not_equal(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1), ARGIN_NULLOK(const STRING *s2))
1504 ASSERT_ARGS(Parrot_str_not_equal)
1505 return !Parrot_str_equal(interp, s1, s2);
1510 =item C<INTVAL Parrot_str_equal(PARROT_INTERP, const STRING *s1, const STRING
1511 *s2)>
1513 Compares two Parrot strings, performing type and encoding conversions if
1514 necessary.
1516 Returns 1 if the strings are equal, and 0 otherwise.
1518 =cut
1522 PARROT_EXPORT
1523 PARROT_WARN_UNUSED_RESULT
1524 INTVAL
1525 Parrot_str_equal(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1), ARGIN_NULLOK(const STRING *s2))
1527 ASSERT_ARGS(Parrot_str_equal)
1528 if ((s1 == s2) || (!s1 && !s2)) {
1529 return 1;
1531 else if (!s2) {
1532 return s1->strlen == 0;
1534 else if (!s1) {
1535 return s2->strlen == 0;
1537 else if (s1->strlen != s2->strlen) {
1538 return 0; /* we don't care which is bigger */
1540 else if (s1->hashval != s2->hashval && s1->hashval && s2->hashval) {
1541 return 0;
1543 else if (!s1->strlen) { /* s2->strlen is the same here */
1544 return 1;
1546 /* COWed strings */
1547 else if (s1->strstart == s2->strstart && s1->bufused == s2->bufused) {
1548 return 1;
1552 * now,
1553 * both strings are non-null
1554 * both strings have same length
1556 return !CHARSET_COMPARE(interp, s1, s2);
1562 =item C<static void make_writable(PARROT_INTERP, STRING **s, const size_t len,
1563 parrot_string_representation_t representation)>
1565 Makes the specified Parrot string writable with minimum length C<len>. The
1566 C<representation> argument is required in case a new Parrot string has to be
1567 created.
1569 =cut
1573 static void
1574 make_writable(PARROT_INTERP, ARGMOD(STRING **s),
1575 const size_t len, parrot_string_representation_t representation)
1577 ASSERT_ARGS(make_writable)
1578 if (!*s)
1579 *s = Parrot_str_new_noinit(interp, representation, len);
1580 else if ((*s)->strlen < len)
1581 Parrot_str_resize(interp, *s, (UINTVAL)(len - (*s)->strlen));
1582 else if (PObj_is_cowed_TESTALL(*s))
1583 Parrot_str_write_COW(interp, *s);
1589 =item C<STRING * Parrot_str_bitwise_and(PARROT_INTERP, const STRING *s1, const
1590 STRING *s2, STRING **dest)>
1592 Performs a bitwise C<AND> on two Parrot string, performing type and encoding
1593 conversions if necessary. If the second string is not C<NULL> then it is
1594 reused. Otherwise a new Parrot string is created.
1596 =cut
1600 PARROT_EXPORT
1601 PARROT_CANNOT_RETURN_NULL
1602 STRING *
1603 Parrot_str_bitwise_and(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
1604 ARGIN_NULLOK(const STRING *s2), ARGOUT_NULLOK(STRING **dest))
1606 ASSERT_ARGS(Parrot_str_bitwise_and)
1607 STRING *res;
1608 size_t minlen;
1610 /* we could also trans_charset to iso-8859-1 */
1611 if (s1 && s1->encoding != Parrot_fixed_8_encoding_ptr)
1612 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
1613 "string bitwise_and (%s/%s) unsupported",
1614 s1->encoding->name, nonnull_encoding_name(s2));
1616 if (s2 && s2->encoding != Parrot_fixed_8_encoding_ptr)
1617 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
1618 "string bitwise_and (%s/%s) unsupported",
1619 nonnull_encoding_name(s1), s2->encoding->name);
1621 /* think about case of dest string is one of the operands */
1622 if (s1 && s2)
1623 minlen = s1->strlen > s2->strlen ? s2->strlen : s1->strlen;
1624 else
1625 minlen = 0;
1627 if (dest && *dest) {
1628 res = *dest;
1629 res->encoding = Parrot_fixed_8_encoding_ptr;
1630 res->charset = Parrot_binary_charset_ptr;
1632 else
1633 res = Parrot_str_new_init(interp, NULL, minlen,
1634 Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
1636 if (!s1 || !s2) {
1637 res->bufused = 0;
1638 res->strlen = 0;
1640 return res;
1643 #if ! DISABLE_GC_DEBUG
1644 /* trigger GC for debug */
1645 if (interp && GC_DEBUG(interp))
1646 Parrot_gc_mark_and_sweep(interp, GC_trace_stack_FLAG);
1647 #endif
1649 make_writable(interp, &res, minlen, enum_stringrep_one);
1651 { /* bitwise AND the strings */
1652 const Parrot_UInt1 *curr1 = (Parrot_UInt1 *)s1->strstart;
1653 const Parrot_UInt1 *curr2 = (Parrot_UInt1 *)s2->strstart;
1654 Parrot_UInt1 *dp = (Parrot_UInt1 *)res->strstart;
1655 size_t len = minlen;
1657 while (len--)
1658 *dp++ = *curr1++ & *curr2++;
1661 res->bufused = res->strlen = minlen;
1663 if (dest)
1664 *dest = res;
1666 return res;
1669 #define BITWISE_XOR_STRINGS(type1, type2, restype, s1, s2, res, maxlen) \
1670 do { \
1671 const type1 *curr1 = NULL; \
1672 const type2 *curr2 = NULL; \
1673 size_t length1 = 0; \
1674 size_t length2 = 0; \
1675 restype *dp; \
1676 size_t _index; \
1678 if (s1) { \
1679 curr1 = (type1 *)(s1)->strstart; \
1680 length1 = (s1)->strlen; \
1682 if (s2) { \
1683 curr2 = (type2 *)(s2)->strstart; \
1684 length2 = (s2)->strlen; \
1687 dp = (restype *)(res)->strstart; \
1688 _index = 0; \
1690 for (; _index < (maxlen) ; ++curr1, ++curr2, ++dp, ++_index) { \
1691 if (_index < length1) { \
1692 if (_index < length2) \
1693 *dp = *curr1 ^ *curr2; \
1694 else \
1695 *dp = *curr1; \
1697 else if (_index < length2) { \
1698 *dp = *curr2; \
1701 } while (0)
1704 #define BITWISE_OR_STRINGS(type1, type2, restype, s1, s2, res, maxlen) \
1705 do { \
1706 const type1 *curr1 = NULL; \
1707 const type2 *curr2 = NULL; \
1708 size_t length1 = 0; \
1709 size_t length2 = 0; \
1710 restype *dp; \
1711 size_t _index; \
1713 if (s1) { \
1714 curr1 = (type1 *)(s1)->strstart; \
1715 length1 = (s1)->strlen; \
1717 if (s2) { \
1718 curr2 = (type2 *)(s2)->strstart; \
1719 length2 = (s2)->strlen; \
1722 dp = (restype *)(res)->strstart; \
1723 _index = 0; \
1725 for (; _index < (maxlen) ; ++curr1, ++curr2, ++dp, ++_index) { \
1726 if (_index < length1) { \
1727 if (_index < length2) \
1728 *dp = *curr1 | *curr2; \
1729 else \
1730 *dp = *curr1; \
1732 else if (_index < length2) { \
1733 *dp = *curr2; \
1736 } while (0)
1741 =item C<STRING * Parrot_str_bitwise_or(PARROT_INTERP, const STRING *s1, const
1742 STRING *s2, STRING **dest)>
1744 Performs a bitwise C<OR> on two Parrot strings, performing type and encoding
1745 conversions if necessary. If the third string is not C<NULL>, then it is
1746 reused. Otherwise a new Parrot string is created.
1748 =cut
1752 PARROT_EXPORT
1753 PARROT_CANNOT_RETURN_NULL
1754 STRING *
1755 Parrot_str_bitwise_or(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
1756 ARGIN_NULLOK(const STRING *s2), ARGOUT_NULLOK(STRING **dest))
1758 ASSERT_ARGS(Parrot_str_bitwise_or)
1759 STRING *res;
1760 size_t maxlen = 0;
1762 if (s1) {
1763 if (s1->encoding != Parrot_fixed_8_encoding_ptr)
1764 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
1765 "string bitwise_or (%s/%s) unsupported",
1766 s1->encoding->name, nonnull_encoding_name(s2));
1768 maxlen = s1->bufused;
1771 if (s2) {
1772 if (s2->encoding != Parrot_fixed_8_encoding_ptr)
1773 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
1774 "string bitwise_or (%s/%s) unsupported",
1775 nonnull_encoding_name(s1), s2->encoding->name);
1777 if (s2->bufused > maxlen)
1778 maxlen = s2->bufused;
1781 if (dest && *dest) {
1782 res = *dest;
1783 res->encoding = Parrot_fixed_8_encoding_ptr;
1784 res->charset = Parrot_binary_charset_ptr;
1786 else
1787 res = Parrot_str_new_init(interp, NULL, maxlen,
1788 Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
1790 if (!maxlen) {
1791 res->bufused = 0;
1792 res->strlen = 0;
1793 return res;
1796 #if ! DISABLE_GC_DEBUG
1797 /* trigger GC for debug */
1798 if (interp && GC_DEBUG(interp))
1799 Parrot_gc_mark_and_sweep(interp, GC_trace_stack_FLAG);
1800 #endif
1802 make_writable(interp, &res, maxlen, enum_stringrep_one);
1804 BITWISE_OR_STRINGS(Parrot_UInt1, Parrot_UInt1, Parrot_UInt1,
1805 s1, s2, res, maxlen);
1806 res->bufused = res->strlen = maxlen;
1808 if (dest)
1809 *dest = res;
1811 return res;
1817 =item C<STRING * Parrot_str_bitwise_xor(PARROT_INTERP, const STRING *s1, const
1818 STRING *s2, STRING **dest)>
1820 Performs a bitwise C<XOR> on two Parrot strings, performing type and encoding
1821 conversions if necessary. If the second string is not C<NULL>, then it is
1822 reused. Otherwise a new Parrot string is created.
1824 =cut
1828 PARROT_EXPORT
1829 PARROT_CANNOT_RETURN_NULL
1830 STRING *
1831 Parrot_str_bitwise_xor(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1),
1832 ARGIN_NULLOK(const STRING *s2), ARGOUT_NULLOK(STRING **dest))
1834 ASSERT_ARGS(Parrot_str_bitwise_xor)
1835 STRING *res;
1836 size_t maxlen = 0;
1838 if (s1) {
1839 if (s1->encoding != Parrot_fixed_8_encoding_ptr)
1840 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
1841 "string bitwise_xor (%s/%s) unsupported",
1842 s1->encoding->name, nonnull_encoding_name(s2));
1844 maxlen = s1->bufused;
1847 if (s2) {
1848 if (s2->encoding != Parrot_fixed_8_encoding_ptr)
1849 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
1850 "string bitwise_xor (%s/%s) unsupported",
1851 nonnull_encoding_name(s1), s2->encoding->name);
1853 if (s2->bufused > maxlen)
1854 maxlen = s2->bufused;
1857 if (dest && *dest) {
1858 res = *dest;
1859 res->encoding = Parrot_fixed_8_encoding_ptr;
1860 res->charset = Parrot_binary_charset_ptr;
1862 else
1863 res = Parrot_str_new_init(interp, NULL, maxlen,
1864 Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
1866 if (!maxlen) {
1867 res->bufused = 0;
1868 res->strlen = 0;
1869 return res;
1872 #if ! DISABLE_GC_DEBUG
1873 /* trigger GC for debug */
1874 if (interp && GC_DEBUG(interp))
1875 Parrot_gc_mark_and_sweep(interp, GC_trace_stack_FLAG);
1876 #endif
1878 make_writable(interp, &res, maxlen, enum_stringrep_one);
1880 BITWISE_XOR_STRINGS(Parrot_UInt1, Parrot_UInt1, Parrot_UInt1,
1881 s1, s2, res, maxlen);
1882 res->bufused = res->strlen = maxlen;
1884 if (dest)
1885 *dest = res;
1887 return res;
1891 #define BITWISE_NOT_STRING(type, s, res) \
1892 do { \
1893 if ((s) && (res)) { \
1894 const type *curr = (type *)(s)->strstart; \
1895 size_t length = (s)->strlen; \
1896 Parrot_UInt1 *dp = (Parrot_UInt1 *)(res)->strstart; \
1898 for (; length ; --length, ++dp, ++curr) \
1899 *dp = 0xFF & ~ *curr; \
1901 } while (0)
1905 =item C<STRING * Parrot_str_bitwise_not(PARROT_INTERP, const STRING *s, STRING
1906 **dest)>
1908 Performs a bitwise C<NOT> on a Parrot string. If the second string is
1909 not C<NULL> then it is reused, otherwise a new Parrot string is created.
1911 =cut
1915 PARROT_EXPORT
1916 PARROT_CANNOT_RETURN_NULL
1917 STRING *
1918 Parrot_str_bitwise_not(PARROT_INTERP, ARGIN_NULLOK(const STRING *s),
1919 ARGOUT_NULLOK(STRING **dest))
1921 ASSERT_ARGS(Parrot_str_bitwise_not)
1922 STRING *res;
1923 size_t len;
1925 if (s) {
1926 if (s->encoding != Parrot_fixed_8_encoding_ptr)
1927 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
1928 "string bitwise_not (%s/%s) unsupported",
1929 s->encoding->name, s->encoding->name);
1931 len = s->bufused;
1933 else
1934 len = 0;
1936 if (dest && *dest) {
1937 res = *dest;
1938 res->encoding = Parrot_fixed_8_encoding_ptr;
1939 res->charset = Parrot_binary_charset_ptr;
1941 else
1942 res = Parrot_str_new_init(interp, NULL, len,
1943 Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
1945 if (!len) {
1946 res->bufused = 0;
1947 res->strlen = 0;
1948 return res;
1951 #if ! DISABLE_GC_DEBUG
1952 /* trigger GC for debug */
1953 if (interp && GC_DEBUG(interp))
1954 Parrot_gc_mark_and_sweep(interp, GC_trace_stack_FLAG);
1955 #endif
1957 make_writable(interp, &res, len, enum_stringrep_one);
1959 res->strlen = res->bufused = len;
1961 BITWISE_NOT_STRING(Parrot_UInt1, s, res);
1962 if (dest)
1963 *dest = res;
1965 return res;
1971 =item C<INTVAL Parrot_str_boolean(PARROT_INTERP, const STRING *s)>
1973 Returns whether the specified Parrot string is true. A string is true if it is
1974 equal to anything other than C<0>, C<""> or C<"0">.
1976 =cut
1980 PARROT_EXPORT
1981 PARROT_WARN_UNUSED_RESULT
1982 INTVAL
1983 Parrot_str_boolean(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
1985 ASSERT_ARGS(Parrot_str_boolean)
1986 const INTVAL len = s ? Parrot_str_byte_length(interp, s) : 0;
1988 if (len == 0)
1989 return 0;
1991 if (len == 1) {
1992 const UINTVAL c = Parrot_str_indexed(interp, s, 0);
1994 /* relying on character literals being interpreted as ASCII--may
1995 not be correct on EBCDIC systems. use numeric value instead? */
1996 if (c == '0')
1997 /* later, accept other chars with digit value 0? or, no */
1998 return 0;
2001 /* it must be true */
2002 return 1;
2008 =item C<STRING * Parrot_str_format_data(PARROT_INTERP, const char *format, ...)>
2010 Writes and returns a Parrot string.
2012 =cut
2016 PARROT_EXPORT
2017 PARROT_CANNOT_RETURN_NULL
2018 STRING *
2019 Parrot_str_format_data(PARROT_INTERP, ARGIN(const char *format), ...)
2021 ASSERT_ARGS(Parrot_str_format_data)
2022 STRING *output;
2023 va_list args;
2025 va_start(args, format);
2026 output = Parrot_vsprintf_c(interp, format, args);
2027 va_end(args);
2029 return output;
2033 State of FSM during number value parsing.
2035 Integer uses only parse_start, parse_before_dot and parse_end.
2038 typedef enum number_parse_state {
2039 parse_start,
2040 parse_before_dot,
2041 parse_after_dot,
2042 parse_after_e,
2043 parse_after_e_sign,
2044 parse_end
2045 } number_parse_state;
2050 =item C<INTVAL Parrot_str_to_int(PARROT_INTERP, const STRING *s)>
2052 Converts a numeric Parrot string to an integer value.
2054 A number is such that:
2056 sign = '+' | '-'
2057 digit = "Any code point considered a digit by the chartype"
2058 indicator = 'e' | 'E'
2059 digits = digit [digit]...
2060 decimal-part = digits '.' [digits] | ['.'] digits
2061 exponent-part = indicator [sign] digits
2062 numeric-string = [sign] decimal-part [exponent-part]
2064 The integer value is the appropriate integer representation of such a number,
2065 rounding towards zero.
2067 =cut
2071 PARROT_EXPORT
2072 PARROT_WARN_UNUSED_RESULT
2073 INTVAL
2074 Parrot_str_to_int(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
2076 ASSERT_ARGS(Parrot_str_to_int)
2077 if (s == NULL)
2078 return 0;
2080 const INTVAL max_safe = PARROT_INTVAL_MAX / 10;
2081 const INTVAL last_dig = PARROT_INTVAL_MAX % 10;
2082 int sign = 1;
2083 INTVAL i = 0;
2084 String_iter iter;
2085 UINTVAL offs;
2086 number_parse_state state = parse_start;
2088 ENCODING_ITER_INIT(interp, s, &iter);
2090 for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
2091 const UINTVAL c = iter.get_and_advance(interp, &iter);
2092 /* Check for overflow */
2093 if (c > 255)
2094 break;
2096 switch (state) {
2097 case parse_start:
2098 if (isdigit((unsigned char)c)) {
2099 const INTVAL nextval = c - '0';
2100 if (i < max_safe || (i == max_safe && nextval <= last_dig))
2101 i = i * 10 + nextval;
2102 else
2103 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ERR_OVERFLOW,
2104 "Integer value of String '%S' too big", s);
2105 state = parse_before_dot;
2107 else if (c == '-') {
2108 sign = -1;
2109 state = parse_before_dot;
2111 else if (c == '+')
2112 state = parse_before_dot;
2113 else if (isspace((unsigned char)c))
2114 ; /* Do nothing */
2115 else
2116 state = parse_end;
2118 break;
2120 case parse_before_dot:
2121 if (isdigit((unsigned char)c)) {
2122 const INTVAL nextval = c - '0';
2123 if (i < max_safe || (i == max_safe && nextval <= last_dig))
2124 i = i * 10 + nextval;
2125 else
2126 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ERR_OVERFLOW,
2127 "Integer value of String '%S' too big", s);
2129 else
2130 state = parse_end;
2131 break;
2133 default:
2134 /* Pacify compiler */
2135 break;
2139 i *= sign;
2141 return i;
2147 =item C<FLOATVAL Parrot_str_to_num(PARROT_INTERP, const STRING *s)>
2149 Converts a numeric Parrot STRING to a floating point number.
2151 =cut
2155 PARROT_EXPORT
2156 PARROT_WARN_UNUSED_RESULT
2157 FLOATVAL
2158 Parrot_str_to_num(PARROT_INTERP, ARGIN(const STRING *s))
2160 ASSERT_ARGS(Parrot_str_to_num)
2161 FLOATVAL f = 0.0;
2162 FLOATVAL mantissa = 0.0;
2163 FLOATVAL sign = 1.0; /* -1 for '-' */
2164 FLOATVAL divider = 0.1;
2165 INTVAL e = 0;
2166 INTVAL e_sign = 1; /* -1 for '-' */
2167 /* How many digits it's safe to parse */
2168 const INTVAL max_safe = PARROT_INTVAL_MAX / 10;
2169 INTVAL m = 0; /* Integer mantissa */
2170 int m_is_safe = 1; /* We can use integer mantissa */
2171 INTVAL d = 0; /* Integer descriminator */
2172 int d_is_safe = 1; /* We can use integer mantissa */
2173 int d_length = 0;
2174 int check_nan = 0; /* Check for NaN and Inf after main loop */
2175 String_iter iter;
2176 UINTVAL offs;
2177 number_parse_state state = parse_start;
2179 if (!s)
2180 return 0.0;
2182 ENCODING_ITER_INIT(interp, s, &iter);
2184 /* Handcrafter FSM to read float value */
2185 for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
2186 const UINTVAL c = iter.get_and_advance(interp, &iter);
2187 /* Check for overflow */
2188 if (c > 255)
2189 break;
2191 switch (state) {
2192 case parse_start:
2193 if (isdigit((unsigned char)c)) {
2194 f = c - '0';
2195 m = c - '0';
2196 state = parse_before_dot;
2198 else if (c == '-') {
2199 sign = -1.0;
2200 state = parse_before_dot;
2202 else if (c == '+')
2203 state = parse_before_dot;
2204 else if (c == '.')
2205 state = parse_after_dot;
2206 else if (isspace((unsigned char)c))
2207 ; /* Do nothing */
2208 else {
2209 check_nan = 1;
2210 state = parse_end;
2212 break;
2214 case parse_before_dot:
2215 if (isdigit((unsigned char)c)) {
2216 f = f*10.0 + (c-'0');
2217 m = m*10 + (c-'0');
2218 /* Integer overflow for mantissa */
2219 if (m >= max_safe)
2220 m_is_safe = 0;
2222 else if (c == '.') {
2223 state = parse_after_dot;
2225 * Throw gathered result. Recalulate from integer mantissa
2226 * to preserve precision.
2228 if (m_is_safe)
2229 f = m;
2230 mantissa = f;
2232 else if (c == 'e' || c == 'E') {
2233 state = parse_after_e;
2234 /* See comment above */
2235 if (m_is_safe)
2236 f = m;
2237 mantissa = f;
2239 else {
2240 check_nan = 1;
2241 state = parse_end;
2243 break;
2245 case parse_after_dot:
2246 if (isdigit((unsigned char)c)) {
2247 f += (c-'0') * divider;
2248 divider /= 10.0;
2249 d = d*10 + (c-'0');
2250 if (d >= max_safe)
2251 d_is_safe = 0;
2252 d_length++;
2254 else if (c == 'e' || c == 'E')
2255 state = parse_after_e;
2256 else
2257 state = parse_end;
2258 break;
2260 case parse_after_e:
2261 if (isdigit((unsigned char)c)) {
2262 e = e*10 + (c-'0');
2263 state = parse_after_e_sign;
2265 else if (c == '-') {
2266 e_sign = -1;
2267 state = parse_after_e_sign;
2269 else if (c == '+')
2270 state = parse_after_e_sign;
2271 else
2272 state = parse_end;
2273 break;
2275 case parse_after_e_sign:
2276 if (isdigit((unsigned char)c))
2277 e = e*10 + (c-'0');
2278 else
2279 state = parse_end;
2280 break;
2282 case parse_end:
2283 default:
2284 /* Pacify compiler */
2285 break;
2289 /* Support for non-canonical NaN and Inf */
2290 /* charpos <=2 because for "-i" iter will be advanced to next char already */
2291 if (check_nan && (iter.charpos <= 2)) {
2292 STRING *t = Parrot_str_upcase(interp, s);
2293 if (Parrot_str_equal(interp, t, CONST_STRING(interp, "NAN")))
2294 return PARROT_FLOATVAL_NAN_QUIET;
2295 else if (Parrot_str_equal(interp, t, CONST_STRING(interp, "INF"))
2296 || Parrot_str_equal(interp, t, CONST_STRING(interp, "INFINITY")))
2297 return PARROT_FLOATVAL_INF_POSITIVE;
2298 else if (Parrot_str_equal(interp, t, CONST_STRING(interp, "-INF"))
2299 || Parrot_str_equal(interp, t, CONST_STRING(interp, "-INFINITY")))
2300 return PARROT_FLOATVAL_INF_NEGATIVE;
2301 else
2302 return 0.0;
2305 /* local macro to call proper pow version depending on FLOATVAL */
2306 #if NUMVAL_SIZE == DOUBLE_SIZE
2307 # define POW pow
2308 #else
2309 # define POW powl
2310 #endif
2312 if (d && d_is_safe) {
2313 f = mantissa + (1.0 * d / POW(10.0, d_length));
2316 if (sign < 0)
2317 f = -f;
2319 if (e) {
2320 if (e_sign == 1)
2321 f *= POW(10.0, e);
2322 else
2323 f /= POW(10.0, e);
2326 #undef POW
2328 return f;
2334 =item C<STRING * Parrot_str_from_int(PARROT_INTERP, INTVAL i)>
2336 Returns a Parrot string representation of the specified integer value.
2338 =cut
2342 PARROT_EXPORT
2343 PARROT_WARN_UNUSED_RESULT
2344 PARROT_CANNOT_RETURN_NULL
2345 STRING *
2346 Parrot_str_from_int(PARROT_INTERP, INTVAL i)
2348 ASSERT_ARGS(Parrot_str_from_int)
2349 char buf[128];
2350 return Parrot_str_from_int_base(interp, buf, (HUGEINTVAL)i, 10);
2356 =item C<STRING * Parrot_str_from_num(PARROT_INTERP, FLOATVAL f)>
2358 Returns a Parrot string representation of the specified floating-point value.
2360 =cut
2364 PARROT_EXPORT
2365 PARROT_WARN_UNUSED_RESULT
2366 PARROT_CANNOT_RETURN_NULL
2367 STRING *
2368 Parrot_str_from_num(PARROT_INTERP, FLOATVAL f)
2370 ASSERT_ARGS(Parrot_str_from_num)
2371 /* Too damn hard--hand it off to Parrot_sprintf, which'll probably
2372 use the system sprintf anyway, but has gigantic buffers that are
2373 awfully hard to overflow. */
2374 return Parrot_sprintf_c(interp, FLOATVAL_FMT, f);
2380 =item C<char * Parrot_str_to_cstring(PARROT_INTERP, const STRING *s)>
2382 Returns a C string for the specified Parrot string. Use
2383 C<Parrot_str_free_cstring()> to free the string. Failure to do this will result in
2384 a memory leak.
2386 =cut
2390 PARROT_EXPORT
2391 PARROT_MALLOC
2392 PARROT_CANNOT_RETURN_NULL
2393 char *
2394 Parrot_str_to_cstring(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
2396 ASSERT_ARGS(Parrot_str_to_cstring)
2397 if (! s) {
2398 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNEXPECTED_NULL,
2399 "Can't convert NULL string");
2401 else
2402 return string_to_cstring_nullable(interp, s);
2408 =item C<char * string_to_cstring_nullable(PARROT_INTERP, const STRING *s)>
2410 Returns a C string for the specified Parrot string. Use
2411 C<Parrot_str_free_cstring()> to free the string. Failure to do this will result in
2412 a memory leak.
2414 =cut
2418 PARROT_EXPORT
2419 PARROT_MALLOC
2420 PARROT_CAN_RETURN_NULL
2421 char *
2422 string_to_cstring_nullable(SHIM_INTERP, ARGIN_NULLOK(const STRING *s))
2424 ASSERT_ARGS(string_to_cstring_nullable)
2425 if (!s)
2426 return NULL;
2427 else {
2428 char * const p = (char *)mem_sys_allocate(s->bufused + 1);
2429 memcpy(p, s->strstart, s->bufused);
2430 p[s->bufused] = '\0';
2431 return p;
2438 =item C<void Parrot_str_free_cstring(char *p)>
2440 Free a string created by C<Parrot_str_to_cstring()>.
2442 TODO - Hopefully this can go away at some point, as it's got all
2443 sorts of leak potential otherwise.
2445 =cut
2449 PARROT_EXPORT
2450 void
2451 Parrot_str_free_cstring(ARGIN_NULLOK(char *p))
2453 ASSERT_ARGS(Parrot_str_free_cstring)
2454 mem_sys_free((void *)p);
2460 =item C<void Parrot_str_pin(PARROT_INTERP, STRING *s)>
2462 Replaces the specified Parrot string's managed buffer memory by system memory.
2464 =cut
2468 PARROT_EXPORT
2469 void
2470 Parrot_str_pin(PARROT_INTERP, ARGMOD(STRING *s))
2472 ASSERT_ARGS(Parrot_str_pin)
2473 char *memory;
2474 size_t size;
2476 /* XXX -lt: COW strings have the external_FLAG set, so this will
2477 * not work for these
2478 * so probably only sysmem should be tested
2480 Parrot_str_write_COW(interp, s);
2482 size = PObj_buflen(s);
2483 memory = (char *)mem_sys_allocate(size);
2485 mem_sys_memcopy(memory, PObj_bufstart(s), size);
2486 PObj_bufstart(s) = memory;
2487 s->strstart = memory;
2489 /* Mark the memory as both from the system and immobile */
2490 PObj_sysmem_SET(s);
2496 =item C<void Parrot_str_unpin(PARROT_INTERP, STRING *s)>
2498 Undoes a C<Parrot_str_pin()> so that the string once again uses managed memory.
2500 =cut
2504 PARROT_EXPORT
2505 void
2506 Parrot_str_unpin(PARROT_INTERP, ARGMOD(STRING *s))
2508 ASSERT_ARGS(Parrot_str_unpin)
2509 void *memory;
2510 size_t size;
2512 /* If this string is not marked using system memory,
2513 * we just don't do this
2515 if (!PObj_sysmem_TEST(s))
2516 return;
2518 Parrot_str_write_COW(interp, s);
2519 size = PObj_buflen(s);
2521 /* We need a handle on the fixed memory so we can get rid of it later */
2522 memory = PObj_bufstart(s);
2524 /* Reallocate it the same size
2525 * NOTE can't use Parrot_gc_reallocate_string_storage because of the LEA
2526 * allocator, where this is a noop for the same size
2528 * We have to block GC here, as we have a pointer to bufstart
2530 Parrot_block_GC_sweep(interp);
2531 Parrot_gc_allocate_string_storage(interp, s, size);
2532 Parrot_unblock_GC_sweep(interp);
2533 mem_sys_memcopy(PObj_bufstart(s), memory, size);
2535 /* Mark the memory as neither immobile nor system allocated */
2536 PObj_sysmem_CLEAR(s);
2538 /* Free up the memory */
2539 mem_sys_free(memory);
2545 =item C<size_t Parrot_str_to_hashval(PARROT_INTERP, STRING *s)>
2547 Returns the hash value for the specified Parrot string, caching it in
2548 C<< s->hashval >>.
2550 =cut
2554 PARROT_EXPORT
2555 PARROT_WARN_UNUSED_RESULT
2556 size_t
2557 Parrot_str_to_hashval(PARROT_INTERP, ARGMOD_NULLOK(STRING *s))
2559 ASSERT_ARGS(Parrot_str_to_hashval)
2560 String_iter iter;
2561 UINTVAL offs;
2562 size_t hashval = interp->hash_seed;
2564 if (!s)
2565 return hashval;
2567 /* ZZZZZ workaround for something not setting up encodings right */
2568 saneify_string(s);
2570 ENCODING_ITER_INIT(interp, s, &iter);
2572 for (offs = 0; offs < s->strlen; ++offs) {
2573 const UINTVAL c = iter.get_and_advance(interp, &iter);
2574 hashval += hashval << 5;
2575 hashval += c;
2578 s->hashval = hashval;
2580 return hashval;
2586 =item C<STRING * Parrot_str_escape(PARROT_INTERP, const STRING *src)>
2588 Escapes all non-ASCII chars to backslash sequences. Control chars that
2589 C<Parrot_str_unescape> can handle are escaped as I<\x>, as well as a double
2590 quote character. Other control chars and codepoints < 0x100 are escaped as
2591 I<\xhh>, codepoints up to 0xffff, as I<\uhhhh>, and codepoints greater than
2592 this as I<\x{hh...hh}>.
2594 =cut
2598 PARROT_EXPORT
2599 PARROT_CAN_RETURN_NULL
2600 STRING *
2601 Parrot_str_escape(PARROT_INTERP, ARGIN_NULLOK(const STRING *src))
2603 ASSERT_ARGS(Parrot_str_escape)
2604 return Parrot_str_escape_truncate(interp, src, (UINTVAL) ~0);
2610 =item C<STRING * Parrot_str_escape_truncate(PARROT_INTERP, const STRING *src,
2611 UINTVAL limit)>
2613 Escapes all non-ASCII characters in the given string with backslashed versions,
2614 but limits the length of the output (used for trace output of strings).
2616 =cut
2620 PARROT_EXPORT
2621 PARROT_CAN_RETURN_NULL
2622 STRING *
2623 Parrot_str_escape_truncate(PARROT_INTERP,
2624 ARGIN_NULLOK(const STRING *src), UINTVAL limit)
2626 ASSERT_ARGS(Parrot_str_escape_truncate)
2627 STRING *result, *hex;
2628 UINTVAL i, len, charlen;
2629 String_iter iter;
2630 unsigned char *dp;
2632 if (!src)
2633 return NULL;
2635 len = src->strlen;
2637 if (len > limit)
2638 len = limit;
2640 /* expect around 2x the chars */
2641 charlen = 2 * len;
2643 if (charlen < 16)
2644 charlen = 16;
2646 /* create ascii result */
2647 result = Parrot_str_new_init(interp, NULL, charlen,
2648 Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0);
2650 /* more work TODO */
2651 ENCODING_ITER_INIT(interp, src, &iter);
2652 dp = (unsigned char *)result->strstart;
2654 for (i = 0; len > 0; --len) {
2655 UINTVAL c = iter.get_and_advance(interp, &iter);
2656 if (c < 0x7f) {
2657 /* process ASCII chars */
2658 if (i >= charlen - 2) {
2659 /* resize - still len codepoints to go */
2660 charlen += len * 2 + 16;
2661 Parrot_gc_reallocate_string_storage(interp, result, charlen);
2662 /* start can change */
2663 dp = (unsigned char *)result->strstart;
2665 switch (c) {
2666 case '\\':
2667 dp[i++] = '\\';
2668 break;
2669 case '\a':
2670 dp[i++] = '\\';
2671 c = 'a';
2672 break;
2673 case '\b':
2674 dp[i++] = '\\';
2675 c = 'b';
2676 break;
2677 case '\n':
2678 dp[i++] = '\\';
2679 c = 'n';
2680 break;
2681 case '\r':
2682 dp[i++] = '\\';
2683 c = 'r';
2684 break;
2685 case '\t':
2686 dp[i++] = '\\';
2687 c = 't';
2688 break;
2689 case '\f':
2690 dp[i++] = '\\';
2691 c = 'f';
2692 break;
2693 case '"':
2694 dp[i++] = '\\';
2695 c = '"';
2696 break;
2697 case 27:
2698 dp[i++] = '\\';
2699 c = 'e';
2700 break;
2701 default:
2702 break;
2704 if (c >= 0x20) {
2705 dp[i++] = (unsigned char)c;
2706 result->bufused = result->strlen = i;
2707 continue;
2711 /* escape by appending either \uhhhh or \x{hh...} */
2712 result->bufused = result->strlen = i;
2714 if (c < 0x0100 || c >= 0x10000)
2715 hex = Parrot_sprintf_c(interp, "\\x{%x}", c);
2716 else
2717 hex = Parrot_sprintf_c(interp, "\\u%04x", c);
2719 result = Parrot_str_append(interp, result, hex);
2721 /* adjust our insert idx */
2722 i += hex->strlen;
2724 /* and usable len */
2725 charlen = PObj_buflen(result);
2726 dp = (unsigned char *)result->strstart;
2728 PARROT_ASSERT(i <= charlen);
2731 result->bufused = result->strlen = i;
2732 return result;
2738 =item C<STRING * Parrot_str_unescape(PARROT_INTERP, const char *cstring, char
2739 delimiter, const char *enc_char)>
2741 Unescapes the specified C string. These sequences are covered:
2743 \xhh 1..2 hex digits
2744 \ooo 1..3 oct digits
2745 \cX control char X
2746 \x{h..h} 1..8 hex digits
2747 \uhhhh 4 hex digits
2748 \Uhhhhhhhh 8 hex digits
2749 \a, \b, \t, \n, \v, \f, \r, \e
2751 =cut
2755 PARROT_EXPORT
2756 PARROT_CANNOT_RETURN_NULL
2757 STRING *
2758 Parrot_str_unescape(PARROT_INTERP,
2759 ARGIN(const char *cstring), char delimiter, ARGIN_NULLOK(const char *enc_char))
2761 ASSERT_ARGS(Parrot_str_unescape)
2762 size_t clength = strlen(cstring);
2763 Parrot_UInt4 r;
2764 String_iter iter;
2765 STRING *result;
2766 const ENCODING *encoding;
2767 const CHARSET *charset;
2768 char *p;
2769 UINTVAL offs, d;
2771 /* we are constructing const table strings here */
2772 const UINTVAL flags = PObj_constant_FLAG;
2774 if (delimiter && clength)
2775 --clength;
2777 /* default is ascii */
2778 if (!enc_char)
2779 enc_char = "ascii";
2781 /* check for encoding: */
2782 p = strchr(enc_char, ':');
2784 if (p) {
2785 *p = '\0';
2786 encoding = Parrot_find_encoding(interp, enc_char);
2787 if (!encoding)
2788 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
2789 "Can't make '%s' encoding strings", enc_char);
2791 charset = Parrot_find_charset(interp, p + 1);
2792 if (!charset)
2793 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
2794 "Can't make '%s' charset strings", p + 1);
2796 result = Parrot_str_new_init(interp, cstring, clength,
2797 encoding, charset, flags);
2798 encoding = Parrot_fixed_8_encoding_ptr;
2800 else {
2801 result = string_make(interp, cstring, clength, enc_char, flags);
2802 encoding = result->encoding;
2805 encoding->iter_init(interp, result, &iter);
2807 for (offs = d = 0; offs < clength; ++offs) {
2808 r = (Parrot_UInt4)((unsigned char *)result->strstart)[offs];
2810 /* There cannot be any NULs within this string. */
2811 PARROT_ASSERT(r != '\0');
2813 if (r == '\\') {
2814 ++offs;
2815 r = string_unescape_one(interp, &offs, result);
2816 --offs;
2819 if (d == offs) {
2820 /* we did it in place - no action */
2821 ++d;
2822 iter.bytepos++;
2823 iter.charpos++;
2824 continue;
2827 PARROT_ASSERT(d < offs);
2828 iter.set_and_advance(interp, &iter, r);
2829 ++d;
2832 result->strlen = d;
2833 result->bufused = iter.bytepos;
2835 /* this also validates the string */
2836 if (encoding != result->encoding)
2837 Parrot_str_length(interp, result);
2839 if (!CHARSET_VALIDATE(interp, result, 0))
2840 Parrot_ex_throw_from_c_args(interp, NULL,
2841 EXCEPTION_INVALID_STRING_REPRESENTATION, "Malformed string");
2843 return result;
2849 =item C<STRING * Parrot_str_upcase(PARROT_INTERP, const STRING *s)>
2851 Returns a copy of the specified Parrot string converted to upper case.
2852 Non-caseable characters are left unchanged.
2854 =cut
2858 PARROT_EXPORT
2859 PARROT_CANNOT_RETURN_NULL
2860 PARROT_MALLOC
2861 STRING *
2862 Parrot_str_upcase(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
2864 ASSERT_ARGS(Parrot_str_upcase)
2865 if (STRING_IS_NULL(s)) {
2866 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNEXPECTED_NULL,
2867 "Can't upcase NULL string");
2869 else {
2870 DECL_CONST_CAST;
2871 STRING * const dest = Parrot_str_copy(interp, PARROT_const_cast(STRING *, s));
2872 Parrot_str_upcase_inplace(interp, dest);
2873 return dest;
2880 =item C<void Parrot_str_upcase_inplace(PARROT_INTERP, STRING *s)>
2882 Converts the specified Parrot string to upper case.
2884 =cut
2888 PARROT_EXPORT
2889 void
2890 Parrot_str_upcase_inplace(PARROT_INTERP, ARGMOD_NULLOK(STRING *s))
2892 ASSERT_ARGS(Parrot_str_upcase_inplace)
2893 if (STRING_IS_NULL(s)) {
2894 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNEXPECTED_NULL,
2895 "Can't upcase NULL string");
2897 else {
2898 Parrot_str_write_COW(interp, s);
2899 CHARSET_UPCASE(interp, s);
2906 =item C<STRING * Parrot_str_downcase(PARROT_INTERP, const STRING *s)>
2908 Returns a copy of the specified Parrot string converted to lower case.
2909 Non-caseable characters are left unchanged.
2911 =cut
2915 PARROT_EXPORT
2916 PARROT_CANNOT_RETURN_NULL
2917 PARROT_MALLOC
2918 STRING *
2919 Parrot_str_downcase(PARROT_INTERP, ARGIN(const STRING *s))
2921 ASSERT_ARGS(Parrot_str_downcase)
2922 DECL_CONST_CAST;
2923 STRING * const dest = Parrot_str_copy(interp, PARROT_const_cast(STRING *, s));
2924 Parrot_str_downcase_inplace(interp, dest);
2925 return dest;
2931 =item C<void Parrot_str_downcase_inplace(PARROT_INTERP, STRING *s)>
2933 Converts the specified Parrot string to lower case.
2935 =cut
2939 PARROT_EXPORT
2940 void
2941 Parrot_str_downcase_inplace(PARROT_INTERP, ARGMOD(STRING *s))
2943 ASSERT_ARGS(Parrot_str_downcase_inplace)
2945 * TODO get rid of all the inplace variants. We have for utf8:
2946 * * 1 Parrot_str_copy from the non-incase variant
2947 * * conversion to utf16, with doubling the buffer
2948 * * possibly one more reallocation in downcase
2950 Parrot_str_write_COW(interp, s);
2951 CHARSET_DOWNCASE(interp, s);
2957 =item C<STRING * Parrot_str_titlecase(PARROT_INTERP, const STRING *s)>
2959 Returns a copy of the specified Parrot string converted to title case.
2960 Non-caseable characters are left unchanged.
2962 =cut
2966 PARROT_EXPORT
2967 PARROT_CANNOT_RETURN_NULL
2968 PARROT_MALLOC
2969 STRING *
2970 Parrot_str_titlecase(PARROT_INTERP, ARGIN(const STRING *s))
2972 ASSERT_ARGS(Parrot_str_titlecase)
2973 DECL_CONST_CAST;
2974 STRING * const dest = Parrot_str_copy(interp, PARROT_const_cast(STRING *, s));
2975 Parrot_str_titlecase_inplace(interp, dest);
2976 return dest;
2982 =item C<void Parrot_str_titlecase_inplace(PARROT_INTERP, STRING *s)>
2984 Converts the specified Parrot string to title case.
2986 =cut
2990 PARROT_EXPORT
2991 void
2992 Parrot_str_titlecase_inplace(PARROT_INTERP, ARGMOD(STRING *s))
2994 ASSERT_ARGS(Parrot_str_titlecase_inplace)
2995 Parrot_str_write_COW(interp, s);
2996 CHARSET_TITLECASE(interp, s);
3002 =item C<STRING * string_increment(PARROT_INTERP, const STRING *s)>
3004 Increments the string in the Perl 5 fashion, where incrementing aa gives you bb
3005 and so on. Currently single char only.
3007 =cut
3011 PARROT_EXPORT
3012 PARROT_WARN_UNUSED_RESULT
3013 PARROT_CANNOT_RETURN_NULL
3014 STRING *
3015 string_increment(PARROT_INTERP, ARGIN(const STRING *s))
3017 ASSERT_ARGS(string_increment)
3018 UINTVAL o;
3020 if (Parrot_str_byte_length(interp, s) != 1)
3021 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
3022 "increment only for length = 1 done");
3024 o = (UINTVAL)string_ord(interp, s, 0);
3026 if ((o >= 'A' && o < 'Z') || (o >= 'a' && o < 'z')) {
3027 ++o;
3028 /* TODO increment in place */
3029 return string_chr(interp, o);
3032 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
3033 "increment out of range - unimplemented");
3039 =item C<const char * Parrot_string_cstring(PARROT_INTERP, const STRING *str)>
3041 Returns a C string from a Parrot string. Both sides are treated
3042 as constants -- i.e. do not resize the result.
3044 =cut
3048 PARROT_EXPORT
3049 PARROT_PURE_FUNCTION
3050 PARROT_CANNOT_RETURN_NULL
3051 const char *
3052 Parrot_string_cstring(SHIM_INTERP, ARGIN(const STRING *str))
3054 ASSERT_ARGS(Parrot_string_cstring)
3055 /* TODO handle NUL and friends */
3056 return str->strstart;
3062 =item C<INTVAL Parrot_str_is_cclass(PARROT_INTERP, INTVAL flags, const STRING
3063 *s, UINTVAL offset)>
3065 Returns 1 if the codepoint of string C<s> at given offset is in the given
3066 character class C<flags>. See also F<include/parrot/cclass.h> for possible
3067 character classes. Returns 0 otherwise, or if the string is empty or NULL.
3069 =cut
3073 PARROT_EXPORT
3074 PARROT_WARN_UNUSED_RESULT
3075 INTVAL
3076 Parrot_str_is_cclass(PARROT_INTERP, INTVAL flags,
3077 ARGIN(const STRING *s), UINTVAL offset)
3079 ASSERT_ARGS(Parrot_str_is_cclass)
3080 if (!Parrot_str_byte_length(interp, s))
3081 return 0;
3083 return CHARSET_IS_CCLASS(interp, flags, s, offset);
3089 =item C<INTVAL Parrot_str_find_cclass(PARROT_INTERP, INTVAL flags, STRING *s,
3090 UINTVAL offset, UINTVAL count)>
3092 Finds the first occurrence of the given character class in C<flags> in the
3093 string, and returns its glyph-wise index.
3095 =cut
3099 PARROT_EXPORT
3100 PARROT_WARN_UNUSED_RESULT
3101 INTVAL
3102 Parrot_str_find_cclass(PARROT_INTERP, INTVAL flags, ARGIN_NULLOK(STRING *s),
3103 UINTVAL offset, UINTVAL count)
3105 ASSERT_ARGS(Parrot_str_find_cclass)
3106 if (!s)
3107 return -1;
3109 return CHARSET_FIND_CCLASS(interp, flags, s, offset, count);
3114 =item C<INTVAL Parrot_str_find_not_cclass(PARROT_INTERP, INTVAL flags, STRING
3115 *s, UINTVAL offset, UINTVAL count)>
3117 Finds the first occurrence of the a character I<not> in the given character
3118 class in C<flags> in the string starting from C<offset> and looking at C<count>
3119 positions, and returns its glyph-wise index. Returns C<offset + count>, if not
3120 found.
3122 =cut
3126 PARROT_EXPORT
3127 PARROT_WARN_UNUSED_RESULT
3128 INTVAL
3129 Parrot_str_find_not_cclass(PARROT_INTERP, INTVAL flags,
3130 ARGIN_NULLOK(STRING *s), UINTVAL offset, UINTVAL count)
3132 ASSERT_ARGS(Parrot_str_find_not_cclass)
3133 if (!s)
3134 return -1;
3136 return CHARSET_FIND_NOT_CCLASS(interp, flags, s, offset, count);
3142 =item C<STRING* Parrot_str_change_charset(PARROT_INTERP, STRING *src, INTVAL
3143 charset_nr, STRING *dest)>
3145 If C<dest> == NULL, converts C<src> to the given charset or encoding inplace.
3146 Otherwise returns a copy of C<src> with the charset/encoding in C<dest>.
3148 =cut
3152 PARROT_EXPORT
3153 PARROT_WARN_UNUSED_RESULT
3154 PARROT_CAN_RETURN_NULL
3155 STRING*
3156 Parrot_str_change_charset(PARROT_INTERP, ARGMOD_NULLOK(STRING *src),
3157 INTVAL charset_nr, ARGOUT_NULLOK(STRING *dest))
3159 ASSERT_ARGS(Parrot_str_change_charset)
3160 const CHARSET *new_charset;
3162 if (!src)
3163 return NULL;
3165 new_charset = Parrot_get_charset(interp, charset_nr);
3167 if (!new_charset)
3168 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
3169 "charset #%d not found", (int) charset_nr);
3172 * dest is an empty string header or NULL, if an inplace
3173 * operation is desired
3175 if (dest) {
3176 if (new_charset == src->charset) {
3177 dest = Parrot_str_reuse_COW(interp, src, dest);
3178 dest->charset = new_charset;
3179 /* keep encoding */
3180 return dest;
3183 dest->charset = new_charset;
3185 /* get prefered encoding for charset */
3186 dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest);
3188 else {
3189 if (new_charset == src->charset)
3190 return src;
3192 Parrot_str_write_COW(interp, src);
3195 return new_charset->to_charset(interp, src, dest);
3201 =item C<STRING* Parrot_str_change_encoding(PARROT_INTERP, STRING *src, INTVAL
3202 encoding_nr, STRING *dest)>
3204 If C<dest> == NULL, converts C<src> to the given charset or encoding in place.
3205 Otherwise returns a copy of C<src> with the charset/encoding in C<dest>
3207 =cut
3211 PARROT_EXPORT
3212 PARROT_WARN_UNUSED_RESULT
3213 PARROT_CAN_RETURN_NULL
3214 STRING*
3215 Parrot_str_change_encoding(PARROT_INTERP, ARGIN_NULLOK(STRING *src),
3216 INTVAL encoding_nr, ARGOUT_NULLOK(STRING *dest))
3218 ASSERT_ARGS(Parrot_str_change_encoding)
3219 const ENCODING *new_encoding;
3221 if (!src)
3222 return NULL;
3224 new_encoding = Parrot_get_encoding(interp, encoding_nr);
3226 if (!new_encoding)
3227 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
3228 "encoding #%d not found", (int) encoding_nr);
3231 * dest is an empty string header or NULL, if an inplace
3232 * operation is desired
3234 if (dest) {
3235 dest->encoding = new_encoding;
3236 if (new_encoding == src->encoding) {
3237 dest = Parrot_str_reuse_COW(interp, src, dest);
3238 return dest;
3241 else {
3242 if (new_encoding == src->encoding)
3243 return src;
3245 Parrot_str_write_COW(interp, src);
3248 return new_encoding->to_encoding(interp, src, dest);
3254 =item C<STRING * Parrot_str_compose(PARROT_INTERP, STRING *src)>
3256 Normalizes the string.
3258 =cut
3262 PARROT_EXPORT
3263 PARROT_WARN_UNUSED_RESULT
3264 PARROT_CAN_RETURN_NULL
3265 STRING *
3266 Parrot_str_compose(PARROT_INTERP, ARGIN_NULLOK(STRING *src))
3268 ASSERT_ARGS(Parrot_str_compose)
3269 if (!src)
3270 return NULL;
3272 if (!src->strlen)
3273 return Parrot_str_new_noinit(interp, enum_stringrep_one, 0);
3275 return CHARSET_COMPOSE(interp, src);
3281 =item C<STRING* Parrot_str_join(PARROT_INTERP, STRING *j, PMC *ar)>
3283 Joins the elements of the array C<ar> as strings with the string C<j> between
3284 them, returning the result.
3286 =cut
3290 PARROT_EXPORT
3291 PARROT_WARN_UNUSED_RESULT
3292 PARROT_CANNOT_RETURN_NULL
3293 STRING*
3294 Parrot_str_join(PARROT_INTERP, ARGIN_NULLOK(STRING *j), ARGIN(PMC *ar))
3296 ASSERT_ARGS(Parrot_str_join)
3297 STRING *res;
3298 STRING *s;
3299 const int ar_len = VTABLE_elements(interp, ar);
3300 int i;
3302 if (ar_len == 0)
3303 return Parrot_str_new_noinit(interp, enum_stringrep_one, 0);
3305 s = VTABLE_get_string_keyed_int(interp, ar, 0);
3306 res = s ? Parrot_str_copy(interp, s) : NULL;
3308 for (i = 1; i < ar_len; ++i) {
3309 STRING * const next = VTABLE_get_string_keyed_int(interp, ar, i);
3311 res = Parrot_str_append(interp, res, j);
3312 res = Parrot_str_append(interp, res, next);
3315 return res;
3321 =item C<PMC* Parrot_str_split(PARROT_INTERP, STRING *delim, STRING *str)>
3323 Splits the string C<str> at the delimiter C<delim>, returning a
3324 C<ResizableStringArray>, or his mapped type in the current HLL,
3325 of results. Returns PMCNULL if the string or the delimiter is NULL.
3327 =cut
3331 PARROT_EXPORT
3332 PARROT_WARN_UNUSED_RESULT
3333 PARROT_CANNOT_RETURN_NULL
3334 PMC*
3335 Parrot_str_split(PARROT_INTERP,
3336 ARGIN_NULLOK(STRING *delim), ARGIN_NULLOK(STRING *str))
3338 ASSERT_ARGS(Parrot_str_split)
3339 PMC *res;
3340 INTVAL slen, dlen, ps, pe;
3342 if (STRING_IS_NULL(delim) || STRING_IS_NULL(str))
3343 return PMCNULL;
3345 res = pmc_new(interp, Parrot_get_ctx_HLL_type(interp, enum_class_ResizableStringArray));
3346 slen = Parrot_str_byte_length(interp, str);
3348 if (!slen)
3349 return res;
3351 dlen = Parrot_str_byte_length(interp, delim);
3353 if (dlen == 0) {
3354 int i;
3355 VTABLE_set_integer_native(interp, res, slen);
3357 for (i = 0; i < slen; ++i) {
3358 STRING * const p = Parrot_str_substr(interp, str, i, 1, NULL, 0);
3359 VTABLE_set_string_keyed_int(interp, res, i, p);
3362 return res;
3365 pe = Parrot_str_find_index(interp, str, delim, 0);
3367 if (pe < 0) {
3368 VTABLE_push_string(interp, res, str);
3369 return res;
3372 ps = 0;
3374 while (ps <= slen) {
3375 const int pl = pe - ps;
3376 STRING * const tstr = Parrot_str_substr(interp, str, ps, pl, NULL, 0);
3378 VTABLE_push_string(interp, res, tstr);
3379 ps = pe + Parrot_str_byte_length(interp, delim);
3381 if (ps > slen)
3382 break;
3384 pe = Parrot_str_find_index(interp, str, delim, ps);
3386 if (pe < 0)
3387 pe = slen;
3390 return res;
3396 =item C<STRING* Parrot_str_from_uint(PARROT_INTERP, char *tc, UHUGEINTVAL num,
3397 unsigned int base, int minus)>
3399 Returns C<num> converted to a Parrot C<STRING>.
3401 Note that C<base> must be defined (a default of 10 is not assumed). The caller
3402 has to verify that C<< base >= 2 && base <= 36 >> The buffer C<tc> must be at
3403 least C<sizeof (UHUGEINTVAL)*8 + 1> chars big.
3405 If C<minus> is true, then C<-> is prepended to the string representation.
3407 =cut
3411 PARROT_WARN_UNUSED_RESULT
3412 PARROT_CANNOT_RETURN_NULL
3413 STRING*
3414 Parrot_str_from_uint(PARROT_INTERP, ARGOUT(char *tc), UHUGEINTVAL num,
3415 unsigned int base, int minus)
3417 ASSERT_ARGS(Parrot_str_from_uint)
3418 /* the buffer must be at least as long as this */
3419 char *p = tc + sizeof (UHUGEINTVAL)*8 + 1;
3420 const char * const tail = p;
3422 PARROT_ASSERT(base >= 2 && base <= 36);
3424 do {
3425 const char cur = (char)(num % base);
3426 if (cur < 10) {
3427 *--p = (char)('0' + cur);
3429 else {
3430 *--p = (char)('a' + cur - 10);
3432 } while (num /= base);
3434 if (minus)
3435 *--p = '-';
3437 return string_make(interp, p, (UINTVAL)(tail - p), "ascii", 0);
3443 =item C<STRING * Parrot_str_from_int_base(PARROT_INTERP, char *tc, HUGEINTVAL
3444 num, unsigned int base)>
3446 Returns C<num> converted to a Parrot C<STRING>.
3448 Note that C<base> must be defined (a default of 10 is not assumed).
3450 If C<< num < 0 >>, then C<-> is prepended to the string representation.
3452 =cut
3456 PARROT_WARN_UNUSED_RESULT
3457 PARROT_CANNOT_RETURN_NULL
3458 STRING *
3459 Parrot_str_from_int_base(PARROT_INTERP, ARGOUT(char *tc), HUGEINTVAL num, unsigned int base)
3461 ASSERT_ARGS(Parrot_str_from_int_base)
3462 const int is_neg = (num < 0);
3464 if (is_neg)
3465 num = -num;
3467 return Parrot_str_from_uint(interp, tc, (UHUGEINTVAL)num, base, is_neg);
3472 =back
3474 =head1 SEE ALSO
3476 =over
3478 =item F<src/string/primitives.c>
3480 =item F<include/parrot/string.h>
3482 =item F<include/parrot/string_funcs.h>
3484 =item F<docs/strings.pod>
3486 =back
3488 =cut
3494 * Local variables:
3495 * c-file-style: "parrot"
3496 * End:
3497 * vim: expandtab shiftwidth=4: