2 Copyright (C) 2001-2009, Parrot Foundation.
7 src/string/api.c - Parrot Strings
11 This file implements the non-ICU parts of the Parrot string subsystem.
13 Note that C<bufstart> and C<buflen> are used by the memory subsystem. The
14 string functions may only use C<buflen> to determine if there is some space
15 left beyond C<bufused>. This is the I<only> valid usage of these two data
16 members, beside setting C<bufstart>/C<buflen> for external strings.
26 #include "parrot/parrot.h"
27 #include "parrot/compiler.h"
28 #include "parrot/string_funcs.h"
29 #include "private_cstring.h"
32 #define nonnull_encoding_name(s) (s) ? (s)->encoding->name : "null string"
33 #define saneify_string(s) \
34 PARROT_ASSERT((s)->encoding); \
35 PARROT_ASSERT((s)->charset); \
36 PARROT_ASSERT(!PObj_on_free_list_TEST(s))
38 /* HEADERIZER HFILE: include/parrot/string_funcs.h */
40 /* HEADERIZER BEGIN: static */
41 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
43 static void make_writable(PARROT_INTERP
,
46 parrot_string_representation_t representation
)
47 __attribute__nonnull__(1)
48 __attribute__nonnull__(2)
51 #define ASSERT_ARGS_make_writable __attribute__unused__ int _ASSERT_ARGS_CHECK = \
52 PARROT_ASSERT_ARG(interp) \
53 || PARROT_ASSERT_ARG(s)
54 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
55 /* HEADERIZER END: static */
60 =item C<void Parrot_str_write_COW(PARROT_INTERP, STRING *s)>
62 If the specified Parrot string is copy-on-write then the memory is
63 copied over and the copy-on-write flag is cleared.
71 Parrot_str_write_COW(PARROT_INTERP
, ARGMOD(STRING
*s
))
73 ASSERT_ARGS(Parrot_str_write_COW
)
75 /* COW_FLAG | constant_FLAG | external_FLAG) */
76 if (PObj_is_cowed_TESTALL(s
)) {
79 /* Create new pool data for this header to use,
80 * independent of the original COW data */
81 PObj_constant_CLEAR(s
);
83 /* constant may have been marked */
87 * allocate a dummy strings memory
88 * buflen might be bigger and used, so pass this length
89 * also be sure not to allocate from the constant pool
91 PObj_flags_CLEARALL(&for_alloc
);
92 Parrot_gc_allocate_string_storage(interp
, &for_alloc
, PObj_buflen(s
));
94 /* now copy memory over */
95 mem_sys_memcopy(for_alloc
.strstart
, s
->strstart
, s
->bufused
);
97 /* and finally use that string memory */
99 PObj_bufstart(s
) = PObj_bufstart(&for_alloc
);
100 s
->strstart
= for_alloc
.strstart
;
101 PObj_buflen(s
) = PObj_buflen(&for_alloc
);
103 /* COW_FLAG | external_FLAG */
104 PObj_is_external_CLEARALL(s
);
112 =item C<STRING * Parrot_str_new_COW(PARROT_INTERP, STRING *s)>
114 Creates a copy-on-write string, cloning a string header without
115 allocating a new buffer.
122 PARROT_CANNOT_RETURN_NULL
123 PARROT_WARN_UNUSED_RESULT
125 Parrot_str_new_COW(PARROT_INTERP
, ARGMOD(STRING
*s
))
127 ASSERT_ARGS(Parrot_str_new_COW
)
130 if (PObj_constant_TEST(s
)) {
131 d
= Parrot_gc_new_string_header(interp
,
132 PObj_get_FLAGS(s
) & ~PObj_constant_FLAG
);
135 /* we can't move the memory, because constants aren't
136 * scanned in compact_pool, therefore the other end
137 * would point to garbage.
139 PObj_constant_CLEAR(d
);
140 PObj_external_SET(d
);
143 d
= Parrot_gc_new_string_header(interp
, PObj_get_FLAGS(s
));
146 PObj_sysmem_CLEAR(d
);
148 /* XXX FIXME hack to avoid cross-interpreter issue until it
149 * is fixed correctly. */
150 if (n_interpreters
> 1 && PObj_is_movable_TESTALL(s
) &&
151 !Parrot_gc_ptr_in_memory_pool(interp
, PObj_bufstart(s
))) {
152 Parrot_str_write_COW(interp
, d
);
153 Parrot_io_eprintf(interp
, "cross-interpreter copy of "
154 "relocatable string '%Ss' into tid %d\n",
156 interp
->thread_data
->tid
);
165 =item C<STRING * Parrot_str_reuse_COW(PARROT_INTERP, STRING *s, STRING *d)>
167 Creates a copy-on-write string by cloning a string header without
168 allocating a new buffer. Doesn't allocate a new string header, instead
169 using the one passed in and returns it.
176 PARROT_CANNOT_RETURN_NULL
178 Parrot_str_reuse_COW(SHIM_INTERP
, ARGMOD(STRING
*s
), ARGOUT(STRING
*d
))
180 ASSERT_ARGS(Parrot_str_reuse_COW
)
182 if (PObj_constant_TEST(s
)) {
185 PObj_constant_CLEAR(d
);
186 PObj_external_SET(d
);
191 PObj_sysmem_CLEAR(d
);
198 =item C<STRING * Parrot_str_set(PARROT_INTERP, STRING *dest, STRING *src)>
200 Makes the contents of first Parrot string a copy of the contents of
208 PARROT_CANNOT_RETURN_NULL
210 Parrot_str_set(PARROT_INTERP
, ARGIN_NULLOK(STRING
*dest
), ARGMOD(STRING
*src
))
212 ASSERT_ARGS(Parrot_str_set
)
215 if (dest
) { /* && dest != src */
216 /* they are different, dest is not an external string */
218 if (!PObj_is_cowed_TESTALL(dest
) && PObj_bufstart(dest
)) {
219 mem_sys_free(PObj_bufallocstart(dest
));
222 dest
= Parrot_str_reuse_COW(interp
, src
, dest
);
225 dest
= Parrot_str_new_COW(interp
, src
);
234 =head2 Basic String Functions
236 Creation, enlargement, etc.
240 =item C<void Parrot_str_init(PARROT_INTERP)>
242 Initializes the Parrot string subsystem.
250 Parrot_str_init(PARROT_INTERP
)
252 ASSERT_ARGS(Parrot_str_init
)
253 Hash
*const_cstring_hash
;
255 const size_t n_parrot_cstrings
=
256 sizeof (parrot_cstrings
) / sizeof (parrot_cstrings
[0]);
258 if (interp
->parent_interpreter
) {
259 interp
->hash_seed
= interp
->parent_interpreter
->hash_seed
;
262 /* TT #64 - use an entropy source once available */
263 Parrot_srand(Parrot_intval_time());
264 interp
->hash_seed
= Parrot_uint_rand(0);
267 /* initialize the constant string table */
268 if (interp
->parent_interpreter
) {
269 interp
->const_cstring_table
=
270 interp
->parent_interpreter
->const_cstring_table
;
271 interp
->const_cstring_hash
=
272 interp
->parent_interpreter
->const_cstring_hash
;
276 /* Set up the cstring cache, then load the basic encodings and charsets */
277 const_cstring_hash
= parrot_new_cstring_hash(interp
);
278 interp
->const_cstring_hash
= const_cstring_hash
;
279 Parrot_charsets_encodings_init(interp
);
281 interp
->const_cstring_table
=
282 mem_allocate_n_zeroed_typed(n_parrot_cstrings
, STRING
*);
284 for (i
= 0; i
< n_parrot_cstrings
; ++i
) {
287 Parrot_str_new_init(interp
,
288 parrot_cstrings
[i
].string
,
289 parrot_cstrings
[i
].len
,
290 PARROT_DEFAULT_ENCODING
, PARROT_DEFAULT_CHARSET
,
291 PObj_external_FLAG
|PObj_constant_FLAG
);
292 parrot_hash_put(interp
, const_cstring_hash
,
293 PARROT_const_cast(char *, parrot_cstrings
[i
].string
), (void *)s
);
294 interp
->const_cstring_table
[i
] = s
;
300 =item C<void Parrot_str_finish(PARROT_INTERP)>
302 De-Initializes the Parrot string subsystem.
310 Parrot_str_finish(PARROT_INTERP
)
312 ASSERT_ARGS(Parrot_str_finish
)
313 /* all are shared between interpreters */
314 if (!interp
->parent_interpreter
) {
315 mem_sys_free(interp
->const_cstring_table
);
316 interp
->const_cstring_table
= NULL
;
317 Parrot_charsets_encodings_deinit(interp
);
318 parrot_hash_destroy(interp
, interp
->const_cstring_hash
);
324 =item C<UINTVAL string_capacity(PARROT_INTERP, const STRING *s)>
326 Returns the capacity of the specified Parrot string in bytes, that
327 is how many bytes can be appended onto strstart.
334 PARROT_WARN_UNUSED_RESULT
337 string_capacity(SHIM_INTERP
, ARGIN(const STRING
*s
))
339 ASSERT_ARGS(string_capacity
)
341 return ((ptrcast_t
)PObj_bufstart(s
) + PObj_buflen(s
) -
342 (ptrcast_t
)s
->strstart
);
347 =item C<STRING * Parrot_str_new_noinit(PARROT_INTERP,
348 parrot_string_representation_t representation, UINTVAL capacity)>
350 Creates and returns an empty Parrot string.
357 PARROT_CANNOT_RETURN_NULL
359 Parrot_str_new_noinit(PARROT_INTERP
,
360 parrot_string_representation_t representation
, UINTVAL capacity
)
362 ASSERT_ARGS(Parrot_str_new_noinit
)
363 STRING
* const s
= Parrot_gc_new_string_header(interp
, 0);
365 /* TODO adapt string creation functions */
366 if (representation
!= enum_stringrep_one
)
367 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_CHARTYPE
,
368 "Unsupported representation");
370 s
->charset
= PARROT_DEFAULT_CHARSET
;
371 s
->encoding
= CHARSET_GET_PREFERRED_ENCODING(interp
, s
);
373 Parrot_gc_allocate_string_storage(interp
, s
,
374 (size_t)string_max_bytes(interp
, s
, capacity
));
381 =item C<const CHARSET * string_rep_compatible(PARROT_INTERP, const STRING *a,
382 const STRING *b, const ENCODING **e)>
384 Find the "lowest" possible charset and encoding for the given string. E.g.
386 ascii <op> utf8 => utf8
387 => ascii, B<if> C<STRING *b> has ascii chars only.
389 Returs NULL, if no compatible string representation can be found.
396 PARROT_WARN_UNUSED_RESULT
397 PARROT_CAN_RETURN_NULL
399 string_rep_compatible(SHIM_INTERP
,
400 ARGIN(const STRING
*a
), ARGIN(const STRING
*b
), ARGOUT(const ENCODING
**e
))
402 ASSERT_ARGS(string_rep_compatible
)
403 if (a
->encoding
== b
->encoding
&& a
->charset
== b
->charset
) {
408 /* a table could possibly simplify the logic */
409 if (a
->encoding
== Parrot_utf8_encoding_ptr
&&
410 b
->charset
== Parrot_ascii_charset_ptr
) {
411 if (a
->strlen
== a
->bufused
) {
412 *e
= Parrot_fixed_8_encoding_ptr
;
418 if (b
->encoding
== Parrot_utf8_encoding_ptr
&&
419 a
->charset
== Parrot_ascii_charset_ptr
) {
420 if (b
->strlen
== b
->bufused
) {
421 *e
= Parrot_fixed_8_encoding_ptr
;
427 if (a
->encoding
!= b
->encoding
)
429 if (a
->encoding
!= Parrot_fixed_8_encoding_ptr
)
431 *e
= Parrot_fixed_8_encoding_ptr
;
432 if (a
->charset
== b
->charset
)
434 if (b
->charset
== Parrot_ascii_charset_ptr
)
436 if (a
->charset
== Parrot_ascii_charset_ptr
)
438 if (a
->charset
== Parrot_binary_charset_ptr
)
440 if (b
->charset
== Parrot_binary_charset_ptr
)
447 =item C<STRING * Parrot_str_concat(PARROT_INTERP, STRING *a, STRING *b, UINTVAL
450 Concatenates two Parrot strings. If necessary, converts the second
451 string's encoding and/or type to match those of the first string. If
452 either string is C<NULL>, then a copy of the non-C<NULL> string is
453 returned. If both strings are C<NULL>, then a new zero-length string is
454 created and returned.
461 PARROT_CANNOT_RETURN_NULL
463 Parrot_str_concat(PARROT_INTERP
, ARGIN_NULLOK(STRING
*a
),
464 ARGIN_NULLOK(STRING
*b
), UINTVAL Uflags
)
466 ASSERT_ARGS(Parrot_str_concat
)
467 if (a
!= NULL
&& a
->strlen
!= 0) {
468 if (b
!= NULL
&& b
->strlen
!= 0) {
470 const CHARSET
*cs
= string_rep_compatible(interp
, a
, b
, &enc
);
477 result
= Parrot_str_new_init(interp
, NULL
, a
->bufused
+ b
->bufused
,
480 result
= Parrot_str_append(interp
, result
, a
);
481 result
= Parrot_str_append(interp
, result
, b
);
486 return Parrot_str_copy(interp
, a
);
490 ? Parrot_str_copy(interp
, b
)
491 : string_make(interp
, NULL
, 0, NULL
, Uflags
);
497 =item C<STRING * Parrot_str_append(PARROT_INTERP, STRING *a, STRING *b)>
499 Take in two Parrot strings and append the second to the first. NOTE THAT
500 RETURN VALUE MAY NOT BE THE FIRST STRING, if the first string is COW'd or
501 read-only. So make sure to _use_ the return value.
508 PARROT_WARN_UNUSED_RESULT
509 PARROT_CAN_RETURN_NULL
511 Parrot_str_append(PARROT_INTERP
, ARGMOD_NULLOK(STRING
*a
), ARGIN_NULLOK(STRING
*b
))
513 ASSERT_ARGS(Parrot_str_append
)
515 UINTVAL total_length
;
519 /* XXX should this be a CHARSET method? */
521 /* If B isn't real, we just bail */
522 const UINTVAL b_len
= b
? Parrot_str_byte_length(interp
, b
) : 0;
527 if (a
== NULL
|| PObj_bufstart(a
) == NULL
)
528 return Parrot_str_copy(interp
, b
);
533 /* If the destination's constant, or external then just fall back to
535 if (PObj_is_cowed_TESTALL(a
))
536 return Parrot_str_concat(interp
, a
, b
, 0);
538 cs
= string_rep_compatible(interp
, a
, b
, &enc
);
544 /* upgrade strings for concatenation */
545 enc
= (a
->encoding
== Parrot_utf16_encoding_ptr
||
546 b
->encoding
== Parrot_utf16_encoding_ptr
||
547 a
->encoding
== Parrot_ucs2_encoding_ptr
||
548 b
->encoding
== Parrot_ucs2_encoding_ptr
)
549 ? Parrot_utf16_encoding_ptr
550 : Parrot_utf8_encoding_ptr
;
552 Parrot_unicode_charset_ptr
->to_charset(interp
, a
, NULL
);
553 b
= Parrot_unicode_charset_ptr
->to_charset(interp
, b
,
554 Parrot_gc_new_string_header(interp
, 0));
556 if (a
->encoding
!= enc
)
557 enc
->to_encoding(interp
, a
, NULL
);
558 if (b
->encoding
!= enc
)
559 enc
->to_encoding(interp
, b
, NULL
);
562 /* calc usable and total bytes */
563 a_capacity
= string_capacity(interp
, a
);
564 total_length
= a
->bufused
+ b
->bufused
;
566 /* make sure A's big enough for both */
567 if (total_length
> a_capacity
)
568 Parrot_gc_reallocate_string_storage(interp
, a
, total_length
<< 1);
570 /* A is now ready to receive the contents of B */
572 /* Tack B on the end of A */
573 mem_sys_memcopy((void *)((ptrcast_t
)a
->strstart
+ a
->bufused
),
574 b
->strstart
, b
->bufused
);
576 a
->bufused
+= b
->bufused
;
585 =item C<STRING * Parrot_str_new(PARROT_INTERP, const char * const buffer, const
588 Make a Parrot string from a specified C string.
595 PARROT_WARN_UNUSED_RESULT
597 PARROT_CANNOT_RETURN_NULL
599 Parrot_str_new(PARROT_INTERP
, ARGIN_NULLOK(const char * const buffer
), const UINTVAL len
)
601 ASSERT_ARGS(Parrot_str_new
)
602 return Parrot_str_new_init(interp
, buffer
, len
? len
:
603 buffer
? strlen(buffer
) : 0,
604 PARROT_DEFAULT_ENCODING
, PARROT_DEFAULT_CHARSET
,
605 0); /* Force an 8-bit encoding at some
611 =item C<const char* string_primary_encoding_for_representation(PARROT_INTERP,
612 parrot_string_representation_t representation)>
614 Returns the primary encoding for the specified representation.
616 This is needed for packfile unpacking, unless we just always use UTF-8 or BOCU.
623 PARROT_CANNOT_RETURN_NULL
626 string_primary_encoding_for_representation(PARROT_INTERP
,
627 parrot_string_representation_t representation
)
629 ASSERT_ARGS(string_primary_encoding_for_representation
)
630 if (representation
== enum_stringrep_one
)
633 Parrot_ex_throw_from_c_args(interp
, NULL
,
634 EXCEPTION_INVALID_STRING_REPRESENTATION
,
635 "string_primary_encoding_for_representation: "
636 "invalid string representation");
641 =item C<STRING * Parrot_str_new_constant(PARROT_INTERP, const char *buffer)>
643 Creates and returns a constant Parrot string.
650 PARROT_WARN_UNUSED_RESULT
651 PARROT_CANNOT_RETURN_NULL
653 Parrot_str_new_constant(PARROT_INTERP
, ARGIN(const char *buffer
))
655 ASSERT_ARGS(Parrot_str_new_constant
)
658 Hash
* const cstring_cache
= (Hash
*)interp
->const_cstring_hash
;
660 s
= (STRING
*)parrot_hash_get(interp
, cstring_cache
, buffer
);
665 s
= Parrot_str_new_init(interp
, buffer
, strlen(buffer
),
666 PARROT_DEFAULT_ENCODING
, PARROT_DEFAULT_CHARSET
,
667 PObj_external_FLAG
|PObj_constant_FLAG
);
669 parrot_hash_put(interp
, cstring_cache
,
670 PARROT_const_cast(char *, buffer
), (void *)s
);
678 =item C<STRING * string_make(PARROT_INTERP, const char *buffer, UINTVAL len,
679 const char *charset_name, UINTVAL flags)>
681 Creates and returns a new Parrot string using C<len> bytes of string data read
684 The value of C<charset_name> specifies the string's representation.
685 The currently recognised values are:
692 The encoding is implicitly guessed; C<unicode> implies the C<utf-8> encoding,
693 and the other three assume C<fixed-8> encoding.
695 If C<charset> is unspecified, the default charset 'ascii' will be used.
697 The value of C<flags> is optionally one or more C<PObj_*> flags C<OR>-ed
705 PARROT_WARN_UNUSED_RESULT
706 PARROT_CANNOT_RETURN_NULL
708 string_make(PARROT_INTERP
, ARGIN_NULLOK(const char *buffer
),
709 UINTVAL len
, ARGIN_NULLOK(const char *charset_name
), UINTVAL flags
)
711 ASSERT_ARGS(string_make
)
712 const CHARSET
*charset
;
715 charset_name
= "ascii";
717 charset
= Parrot_find_charset(interp
, charset_name
);
720 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNIMPLEMENTED
,
721 "Can't make '%s' charset strings", charset_name
);
723 return Parrot_str_new_init(interp
, buffer
, len
,
724 charset
->preferred_encoding
, charset
, flags
);
731 =item C<STRING * string_make_from_charset(PARROT_INTERP, const char *buffer,
732 UINTVAL len, INTVAL charset_nr, UINTVAL flags)>
734 Creates and returns a new Parrot string using C<len> bytes of string data read
737 The value of C<charset_name> specifies the string's representation. It must be
738 a valid charset identifier.
745 The encoding is implicitly guessed; C<unicode> implies the C<utf-8> encoding,
746 and the other three assume C<fixed-8> encoding.
748 The value of C<flags> is optionally one or more C<PObj_*> flags C<OR>-ed
756 PARROT_WARN_UNUSED_RESULT
757 PARROT_CANNOT_RETURN_NULL
759 string_make_from_charset(PARROT_INTERP
, ARGIN_NULLOK(const char *buffer
),
760 UINTVAL len
, INTVAL charset_nr
, UINTVAL flags
)
762 ASSERT_ARGS(string_make_from_charset
)
763 const CHARSET
*charset
= Parrot_get_charset(interp
, charset_nr
);
766 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNIMPLEMENTED
,
767 "Invalid charset number '%d' specified", charset_nr
);
769 return Parrot_str_new_init(interp
, buffer
, len
,
770 charset
->preferred_encoding
, charset
, flags
);
776 =item C<STRING * Parrot_str_new_init(PARROT_INTERP, const char *buffer, UINTVAL
777 len, const ENCODING *encoding, const CHARSET *charset, UINTVAL flags)>
779 Given a buffer, its length, an encoding, a character set, and STRING flags,
780 creates and returns a new string. Don't call this directly.
787 PARROT_WARN_UNUSED_RESULT
788 PARROT_CANNOT_RETURN_NULL
790 Parrot_str_new_init(PARROT_INTERP
, ARGIN_NULLOK(const char *buffer
), UINTVAL len
,
791 ARGIN(const ENCODING
*encoding
), ARGIN(const CHARSET
*charset
), UINTVAL flags
)
793 ASSERT_ARGS(Parrot_str_new_init
)
795 STRING
* const s
= Parrot_gc_new_string_header(interp
, flags
);
796 s
->encoding
= encoding
;
797 s
->charset
= charset
;
799 if (flags
& PObj_external_FLAG
) {
801 * fast path for external (constant) strings - don't allocate
804 /* The following cast discards the 'const'. That raises
805 a warning with gcc, but is ok since the caller indicated
806 it was safe by setting PObj_external_FLAG.
807 (The cast is necessary to pacify TenDRA's tcc.)
809 PObj_bufstart(s
) = s
->strstart
= PARROT_const_cast(char *, buffer
);
810 PObj_buflen(s
) = s
->bufused
= len
;
812 if (encoding
== Parrot_fixed_8_encoding_ptr
)
815 Parrot_str_length(interp
, s
);
820 Parrot_gc_allocate_string_storage(interp
, s
, len
);
823 mem_sys_memcopy(s
->strstart
, buffer
, len
);
825 if (encoding
== Parrot_fixed_8_encoding_ptr
)
828 Parrot_str_length(interp
, s
);
831 s
->strlen
= s
->bufused
= 0;
840 =item C<STRING * Parrot_str_resize(PARROT_INTERP, STRING *s, UINTVAL addlen)>
842 Grows the Parrot string's buffer by the specified number of characters.
849 PARROT_CANNOT_RETURN_NULL
851 Parrot_str_resize(PARROT_INTERP
, ARGMOD(STRING
*s
), UINTVAL addlen
)
853 ASSERT_ARGS(Parrot_str_resize
)
854 Parrot_str_write_COW(interp
, s
);
856 /* Don't check buflen, if we are here, we already checked. */
857 Parrot_gc_reallocate_string_storage(interp
,
858 s
, PObj_buflen(s
) + string_max_bytes(interp
, s
, addlen
));
867 =head2 Ordinary user-visible string operations
871 =item C<UINTVAL Parrot_str_byte_length(PARROT_INTERP, const STRING *s)>
873 Returns the number of characters in the specified Parrot string.
882 Parrot_str_byte_length(SHIM_INTERP
, ARGIN(const STRING
*s
))
884 ASSERT_ARGS(Parrot_str_byte_length
)
892 =item C<INTVAL Parrot_str_indexed(PARROT_INTERP, const STRING *s, UINTVAL idx)>
894 Returns the character (or glyph, depending upon the string's encoding). This
895 abstracts the process of finding the Nth character in a (possibly Unicode or
896 JIS-encoded) string, the idea being that once the encoding functions are
897 fleshed out, this function can do the right thing.
899 Note that this is not range-checked.
906 PARROT_WARN_UNUSED_RESULT
908 Parrot_str_indexed(PARROT_INTERP
, ARGIN(const STRING
*s
), UINTVAL idx
)
910 ASSERT_ARGS(Parrot_str_indexed
)
912 return (INTVAL
)CHARSET_GET_CODEPOINT(interp
, s
, idx
);
918 =item C<INTVAL Parrot_str_find_index(PARROT_INTERP, const STRING *s, const
919 STRING *s2, INTVAL start)>
921 Returns the character position of the second Parrot string in the first at or
922 after C<start>. The return value is a (0 based) offset in characters, not
923 bytes. If second string is not found in the first string, returns -1.
930 PARROT_WARN_UNUSED_RESULT
932 Parrot_str_find_index(PARROT_INTERP
, ARGIN(const STRING
*s
),
933 ARGIN(const STRING
*s2
), INTVAL start
)
935 ASSERT_ARGS(Parrot_str_find_index
)
936 STRING
*src
, *search
;
943 len
= Parrot_str_byte_length(interp
, s
);
948 if (start
>= (INTVAL
)len
)
951 if (!Parrot_str_byte_length(interp
, s2
))
954 src
= PARROT_const_cast(STRING
*, s
);
955 search
= PARROT_const_cast(STRING
*, s2
);
957 return CHARSET_INDEX(interp
, src
, search
, (UINTVAL
)start
);
963 =item C<INTVAL string_ord(PARROT_INTERP, const STRING *s, INTVAL idx)>
965 Returns the codepoint at a given index into a string. Negative indexes are
966 treated as counting from the end of the string.
973 PARROT_WARN_UNUSED_RESULT
975 string_ord(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s
), INTVAL idx
)
977 ASSERT_ARGS(string_ord
)
978 const UINTVAL len
= s
? Parrot_str_byte_length(interp
, s
) : 0;
979 UINTVAL true_index
= (UINTVAL
)idx
;
982 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_ORD_OUT_OF_STRING
,
983 "Cannot get character of empty string");
986 if ((INTVAL
)(idx
+ len
) < 0)
987 Parrot_ex_throw_from_c_args(interp
, NULL
,
988 EXCEPTION_ORD_OUT_OF_STRING
,
989 "Cannot get character before beginning of string");
991 true_index
= (UINTVAL
)(len
+ idx
);
994 if (true_index
> (len
- 1))
995 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_ORD_OUT_OF_STRING
,
996 "Cannot get character past end of string");
998 return Parrot_str_indexed(interp
, s
, true_index
);
1004 =item C<STRING * string_chr(PARROT_INTERP, UINTVAL character)>
1006 Returns a single-character Parrot string.
1008 TODO - Allow this to take an array of characters?
1015 PARROT_CANNOT_RETURN_NULL
1016 PARROT_WARN_UNUSED_RESULT
1018 string_chr(PARROT_INTERP
, UINTVAL character
)
1020 ASSERT_ARGS(string_chr
)
1021 if (character
> 0xff)
1022 return Parrot_unicode_charset_ptr
->string_from_codepoint(interp
,
1025 else if (character
> 0x7f)
1026 return Parrot_iso_8859_1_charset_ptr
->string_from_codepoint(interp
,
1030 return Parrot_ascii_charset_ptr
->string_from_codepoint(interp
,
1037 =item C<STRING * Parrot_str_copy(PARROT_INTERP, STRING *s)>
1039 Creates and returns a copy of the specified Parrot string.
1046 PARROT_CANNOT_RETURN_NULL
1047 PARROT_WARN_UNUSED_RESULT
1049 Parrot_str_copy(PARROT_INTERP
, ARGMOD(STRING
*s
))
1051 ASSERT_ARGS(Parrot_str_copy
)
1052 return Parrot_str_new_COW(interp
, s
);
1060 =head2 Vtable Dispatch Functions
1064 =item C<INTVAL Parrot_str_length(PARROT_INTERP, STRING *s)>
1066 Calculates and returns the number of characters in the specified Parrot string.
1073 PARROT_IGNORABLE_RESULT
1075 Parrot_str_length(PARROT_INTERP
, ARGMOD(STRING
*s
))
1077 ASSERT_ARGS(Parrot_str_length
)
1079 s
->strlen
= CHARSET_CODEPOINTS(interp
, s
);
1086 =item C<INTVAL string_max_bytes(PARROT_INTERP, const STRING *s, UINTVAL nchars)>
1088 Returns the number of bytes required to safely contain the specified number
1089 of characters in the specified Parrot string's representation.
1096 PARROT_WARN_UNUSED_RESULT
1098 string_max_bytes(SHIM_INTERP
, ARGIN(const STRING
*s
), UINTVAL nchars
)
1100 ASSERT_ARGS(string_max_bytes
)
1101 PARROT_ASSERT(s
->encoding
);
1102 return ENCODING_MAX_BYTES_PER_CODEPOINT(interp
, s
) * nchars
;
1107 =item C<STRING * Parrot_str_repeat(PARROT_INTERP, const STRING *s, UINTVAL num)>
1109 Repeats the specified Parrot string I<num> times and returns the result.
1116 PARROT_CANNOT_RETURN_NULL
1118 Parrot_str_repeat(PARROT_INTERP
, ARGIN(const STRING
*s
), UINTVAL num
)
1120 ASSERT_ARGS(Parrot_str_repeat
)
1121 STRING
* const dest
= Parrot_str_new_init(interp
, NULL
,
1123 s
->encoding
, s
->charset
, 0);
1125 /* copy s into dest num times */
1126 UINTVAL length
= s
->bufused
;
1128 char * destpos
= dest
->strstart
;
1129 const char * const srcpos
= s
->strstart
;
1130 for (i
= 0; i
< num
; i
++) {
1131 mem_sys_memcopy(destpos
, srcpos
, length
);
1135 dest
->bufused
= s
->bufused
* num
;
1136 dest
->strlen
= s
->strlen
* num
;
1144 =item C<STRING * Parrot_str_substr(PARROT_INTERP, STRING *src, INTVAL offset,
1145 INTVAL length, STRING **d, int replace_dest)>
1147 Copies the substring of length C<length> from C<offset> from the specified
1148 Parrot string and stores it in C<**d>, allocating memory if necessary. The
1149 substring is also returned.
1156 PARROT_CANNOT_RETURN_NULL
1157 PARROT_WARN_UNUSED_RESULT
1159 Parrot_str_substr(PARROT_INTERP
,
1160 ARGIN_NULLOK(STRING
*src
), INTVAL offset
, INTVAL length
,
1161 ARGOUT_NULLOK(STRING
**d
), int replace_dest
)
1163 ASSERT_ARGS(Parrot_str_substr
)
1165 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_SUBSTR_OUT_OF_STRING
,
1166 "Cannot substr on a null string");
1169 UINTVAL true_length
;
1170 UINTVAL true_offset
= (UINTVAL
)offset
;
1172 saneify_string(src
);
1174 /* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
1175 if (offset
== (INTVAL
)Parrot_str_byte_length(interp
, src
) || length
< 1)
1176 return Parrot_str_new_noinit(interp
, enum_stringrep_one
, 0);
1179 true_offset
= (UINTVAL
)(src
->strlen
+ offset
);
1182 if (src
->strlen
== 0 || true_offset
> src
->strlen
- 1)
1183 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_SUBSTR_OUT_OF_STRING
,
1184 "Cannot take substr outside string");
1186 true_length
= (UINTVAL
)length
;
1187 if (true_length
> (src
->strlen
- true_offset
))
1188 true_length
= (UINTVAL
)(src
->strlen
- true_offset
);
1190 /* do in-place i.e. reuse existing header if one */
1191 if (replace_dest
&& d
&& *d
) {
1192 PARROT_ASSERT(src
->encoding
== Parrot_fixed_8_encoding_ptr
);
1195 dest
->encoding
= src
->encoding
;
1196 dest
->charset
= src
->charset
;
1198 dest
->strstart
= (char *)src
->strstart
+ true_offset
;
1199 dest
->bufused
= true_length
;
1201 dest
->strlen
= true_length
;
1205 dest
= CHARSET_GET_CODEPOINTS(interp
, src
, true_offset
,
1218 =item C<STRING * Parrot_str_replace(PARROT_INTERP, STRING *src, INTVAL offset,
1219 INTVAL length, STRING *rep, STRING **d)>
1221 Replaces a sequence of C<length> characters from C<offset> in the first
1222 Parrot string with the second Parrot string, returning what was
1225 This follows the Perl semantics for:
1227 substr EXPR, OFFSET, LENGTH, REPLACEMENT
1229 Replacing a sequence of characters with a longer string grows the
1230 string; a shorter string shrinks it.
1232 Replacing 2 past the end of the string is undefined. However replacing 1
1233 past the end of the string concatenates the two strings.
1235 A negative offset is allowed to replace from the end.
1242 PARROT_CAN_RETURN_NULL
1244 Parrot_str_replace(PARROT_INTERP
, ARGIN(STRING
*src
),
1245 INTVAL offset
, INTVAL length
, ARGIN(STRING
*rep
), ARGOUT_NULLOK(STRING
**d
))
1247 ASSERT_ARGS(Parrot_str_replace
)
1248 UINTVAL start_byte
, end_byte
;
1253 const ENCODING
*enc
;
1254 STRING
*dest
= NULL
;
1255 UINTVAL true_offset
= (UINTVAL
)offset
;
1256 UINTVAL true_length
= (UINTVAL
)length
;
1260 && src
->encoding
== Parrot_fixed_8_encoding_ptr
1261 && rep
->encoding
== Parrot_fixed_8_encoding_ptr
1263 && true_offset
< src
->strlen
1265 && rep
->strlen
== 1) {
1266 if (PObj_is_cowed_TESTALL(src
))
1267 Parrot_str_write_COW(interp
, src
);
1269 ((char *)src
->strstart
)[offset
] = ((char *)rep
->strstart
)[0];
1274 /* abs(-offset) may not be > strlen-1 */
1276 true_offset
= (UINTVAL
)(src
->strlen
+ offset
);
1278 /* Can replace 1 past end of string which is technically outside the string
1279 * but is same as a concat().
1280 * Only give exception if caller trys to replace end of string + 2
1282 if (true_offset
> src
->strlen
)
1283 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_SUBSTR_OUT_OF_STRING
,
1284 "Can only replace inside string or index after end of string");
1286 if (true_length
> (src
->strlen
- true_offset
))
1287 true_length
= (UINTVAL
)(src
->strlen
- true_offset
);
1289 /* Save the substring that is replaced for the return value */
1291 dest
= CHARSET_GET_CODEPOINTS(interp
, src
, true_offset
, true_length
);
1295 /* may have different reps..... */
1296 cs
= string_rep_compatible(interp
, src
, rep
, &enc
);
1299 Parrot_utf16_encoding_ptr
->to_encoding(interp
, src
, NULL
);
1300 rep
= Parrot_utf16_encoding_ptr
->to_encoding(interp
, rep
,
1301 Parrot_gc_new_string_header(interp
, 0));
1305 src
->encoding
= enc
;
1308 /* get byte position of the part that will be replaced */
1309 ENCODING_ITER_INIT(interp
, src
, &iter
);
1311 iter
.set_position(interp
, &iter
, true_offset
);
1312 start_byte
= iter
.bytepos
;
1314 iter
.set_position(interp
, &iter
, true_offset
+ true_length
);
1315 end_byte
= iter
.bytepos
;
1317 /* not possible.... */
1318 if (end_byte
< start_byte
)
1319 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_SUBSTR_OUT_OF_STRING
,
1320 "replace: subend somehow is less than substart");
1322 /* Now do the replacement */
1325 * If the replacement string fits inside the original substring
1326 * don't create a new string, just pack it.
1328 diff
= (end_byte
- start_byte
) - rep
->bufused
;
1331 || ((INTVAL
)src
->bufused
- (INTVAL
)PObj_buflen(src
)) <= diff
) {
1332 Parrot_str_write_COW(interp
, src
);
1335 mem_sys_memmove((char *)src
->strstart
+ start_byte
+ rep
->bufused
,
1336 (char *)src
->strstart
+ end_byte
,
1337 src
->bufused
- end_byte
);
1338 src
->bufused
-= diff
;
1341 mem_sys_memcopy((char *)src
->strstart
+ start_byte
,
1342 rep
->strstart
, rep
->bufused
);
1345 (void)Parrot_str_length(interp
, src
);
1348 /* Replacement is larger than avail buffer, grow the string */
1350 /* diff is negative here, make it positive */
1352 Parrot_str_resize(interp
, src
, (UINTVAL
)diff
);
1354 /* Move the end of old string that isn't replaced to new offset first */
1355 mem_sys_memmove((char *)src
->strstart
+ end_byte
+ diff
,
1356 (char *)src
->strstart
+ end_byte
,
1357 src
->bufused
- end_byte
);
1359 /* Copy the replacement in */
1360 mem_sys_memcopy((char *)src
->strstart
+ start_byte
, rep
->strstart
,
1362 src
->bufused
+= diff
;
1363 (void)Parrot_str_length(interp
, src
);
1366 /* src is modified, now return the original substring */
1373 =item C<STRING * Parrot_str_chopn(PARROT_INTERP, STRING *s, INTVAL n)>
1375 Removes the last C<n> characters of the specified Parrot string. If C<n> is
1376 negative, cuts the string after C<+n> characters. The returned string is a copy
1377 of the one passed in.
1384 PARROT_CANNOT_RETURN_NULL
1386 Parrot_str_chopn(PARROT_INTERP
, ARGMOD(STRING
*s
), INTVAL n
)
1388 ASSERT_ARGS(Parrot_str_chopn
)
1389 STRING
* const chopped
= Parrot_str_copy(interp
, s
);
1390 Parrot_str_chopn_inplace(interp
, chopped
, n
);
1397 =item C<void Parrot_str_chopn_inplace(PARROT_INTERP, STRING *s, INTVAL n)>
1399 Removes the last C<n> characters of the specified Parrot string. If C<n> is
1400 negative, cuts the string after C<+n> characters. The string passed in is
1401 modified and returned.
1409 Parrot_str_chopn_inplace(PARROT_INTERP
, ARGMOD(STRING
*s
), INTVAL n
)
1411 ASSERT_ARGS(Parrot_str_chopn_inplace
)
1412 UINTVAL new_length
, uchar_size
;
1416 if (new_length
> s
->strlen
)
1420 if (s
->strlen
> (UINTVAL
)n
)
1421 new_length
= s
->strlen
- n
;
1428 if (!new_length
|| !s
->strlen
) {
1429 s
->bufused
= s
->strlen
= 0;
1433 uchar_size
= s
->bufused
/ s
->strlen
;
1434 s
->strlen
= new_length
;
1436 if (s
->encoding
== Parrot_fixed_8_encoding_ptr
) {
1437 s
->bufused
= new_length
;
1439 else if (s
->encoding
== Parrot_ucs2_encoding_ptr
) {
1440 s
->bufused
= new_length
* uchar_size
;
1445 ENCODING_ITER_INIT(interp
, s
, &iter
);
1446 iter
.set_position(interp
, &iter
, new_length
);
1447 s
->bufused
= iter
.bytepos
;
1456 =item C<INTVAL Parrot_str_compare(PARROT_INTERP, const STRING *s1, const STRING
1459 Compares two strings to each other. If s1 is less than s2, returns -1. If the
1460 strings are equal, returns 0. If s1 is greater than s2, returns 2. This
1461 comparison uses the character set collation order of the strings for
1469 PARROT_WARN_UNUSED_RESULT
1471 Parrot_str_compare(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s1
), ARGIN_NULLOK(const STRING
*s2
))
1473 ASSERT_ARGS(Parrot_str_compare
)
1475 return s1
&& (s1
->strlen
!= 0);
1478 return -(s2
->strlen
!= 0);
1483 return CHARSET_COMPARE(interp
, s1
, s2
);
1489 =item C<INTVAL Parrot_str_not_equal(PARROT_INTERP, const STRING *s1, const
1492 Compares two Parrot strings, performing type and encoding conversions if
1493 necessary. Returns 1 if the strings are not equal, and 0 otherwise.
1500 PARROT_WARN_UNUSED_RESULT
1502 Parrot_str_not_equal(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s1
), ARGIN_NULLOK(const STRING
*s2
))
1504 ASSERT_ARGS(Parrot_str_not_equal
)
1505 return !Parrot_str_equal(interp
, s1
, s2
);
1510 =item C<INTVAL Parrot_str_equal(PARROT_INTERP, const STRING *s1, const STRING
1513 Compares two Parrot strings, performing type and encoding conversions if
1516 Returns 1 if the strings are equal, and 0 otherwise.
1523 PARROT_WARN_UNUSED_RESULT
1525 Parrot_str_equal(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s1
), ARGIN_NULLOK(const STRING
*s2
))
1527 ASSERT_ARGS(Parrot_str_equal
)
1528 if ((s1
== s2
) || (!s1
&& !s2
)) {
1532 return s1
->strlen
== 0;
1535 return s2
->strlen
== 0;
1537 else if (s1
->strlen
!= s2
->strlen
) {
1538 return 0; /* we don't care which is bigger */
1540 else if (s1
->hashval
!= s2
->hashval
&& s1
->hashval
&& s2
->hashval
) {
1543 else if (!s1
->strlen
) { /* s2->strlen is the same here */
1547 else if (s1
->strstart
== s2
->strstart
&& s1
->bufused
== s2
->bufused
) {
1553 * both strings are non-null
1554 * both strings have same length
1556 return !CHARSET_COMPARE(interp
, s1
, s2
);
1562 =item C<static void make_writable(PARROT_INTERP, STRING **s, const size_t len,
1563 parrot_string_representation_t representation)>
1565 Makes the specified Parrot string writable with minimum length C<len>. The
1566 C<representation> argument is required in case a new Parrot string has to be
1574 make_writable(PARROT_INTERP
, ARGMOD(STRING
**s
),
1575 const size_t len
, parrot_string_representation_t representation
)
1577 ASSERT_ARGS(make_writable
)
1579 *s
= Parrot_str_new_noinit(interp
, representation
, len
);
1580 else if ((*s
)->strlen
< len
)
1581 Parrot_str_resize(interp
, *s
, (UINTVAL
)(len
- (*s
)->strlen
));
1582 else if (PObj_is_cowed_TESTALL(*s
))
1583 Parrot_str_write_COW(interp
, *s
);
1589 =item C<STRING * Parrot_str_bitwise_and(PARROT_INTERP, const STRING *s1, const
1590 STRING *s2, STRING **dest)>
1592 Performs a bitwise C<AND> on two Parrot string, performing type and encoding
1593 conversions if necessary. If the second string is not C<NULL> then it is
1594 reused. Otherwise a new Parrot string is created.
1601 PARROT_CANNOT_RETURN_NULL
1603 Parrot_str_bitwise_and(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s1
),
1604 ARGIN_NULLOK(const STRING
*s2
), ARGOUT_NULLOK(STRING
**dest
))
1606 ASSERT_ARGS(Parrot_str_bitwise_and
)
1610 /* we could also trans_charset to iso-8859-1 */
1611 if (s1
&& s1
->encoding
!= Parrot_fixed_8_encoding_ptr
)
1612 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_ENCODING
,
1613 "string bitwise_and (%s/%s) unsupported",
1614 s1
->encoding
->name
, nonnull_encoding_name(s2
));
1616 if (s2
&& s2
->encoding
!= Parrot_fixed_8_encoding_ptr
)
1617 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_ENCODING
,
1618 "string bitwise_and (%s/%s) unsupported",
1619 nonnull_encoding_name(s1
), s2
->encoding
->name
);
1621 /* think about case of dest string is one of the operands */
1623 minlen
= s1
->strlen
> s2
->strlen
? s2
->strlen
: s1
->strlen
;
1627 if (dest
&& *dest
) {
1629 res
->encoding
= Parrot_fixed_8_encoding_ptr
;
1630 res
->charset
= Parrot_binary_charset_ptr
;
1633 res
= Parrot_str_new_init(interp
, NULL
, minlen
,
1634 Parrot_fixed_8_encoding_ptr
, Parrot_binary_charset_ptr
, 0);
1643 #if ! DISABLE_GC_DEBUG
1644 /* trigger GC for debug */
1645 if (interp
&& GC_DEBUG(interp
))
1646 Parrot_gc_mark_and_sweep(interp
, GC_trace_stack_FLAG
);
1649 make_writable(interp
, &res
, minlen
, enum_stringrep_one
);
1651 { /* bitwise AND the strings */
1652 const Parrot_UInt1
*curr1
= (Parrot_UInt1
*)s1
->strstart
;
1653 const Parrot_UInt1
*curr2
= (Parrot_UInt1
*)s2
->strstart
;
1654 Parrot_UInt1
*dp
= (Parrot_UInt1
*)res
->strstart
;
1655 size_t len
= minlen
;
1658 *dp
++ = *curr1
++ & *curr2
++;
1661 res
->bufused
= res
->strlen
= minlen
;
1669 #define BITWISE_XOR_STRINGS(type1, type2, restype, s1, s2, res, maxlen) \
1671 const type1 *curr1 = NULL; \
1672 const type2 *curr2 = NULL; \
1673 size_t length1 = 0; \
1674 size_t length2 = 0; \
1679 curr1 = (type1 *)(s1)->strstart; \
1680 length1 = (s1)->strlen; \
1683 curr2 = (type2 *)(s2)->strstart; \
1684 length2 = (s2)->strlen; \
1687 dp = (restype *)(res)->strstart; \
1690 for (; _index < (maxlen) ; ++curr1, ++curr2, ++dp, ++_index) { \
1691 if (_index < length1) { \
1692 if (_index < length2) \
1693 *dp = *curr1 ^ *curr2; \
1697 else if (_index < length2) { \
1704 #define BITWISE_OR_STRINGS(type1, type2, restype, s1, s2, res, maxlen) \
1706 const type1 *curr1 = NULL; \
1707 const type2 *curr2 = NULL; \
1708 size_t length1 = 0; \
1709 size_t length2 = 0; \
1714 curr1 = (type1 *)(s1)->strstart; \
1715 length1 = (s1)->strlen; \
1718 curr2 = (type2 *)(s2)->strstart; \
1719 length2 = (s2)->strlen; \
1722 dp = (restype *)(res)->strstart; \
1725 for (; _index < (maxlen) ; ++curr1, ++curr2, ++dp, ++_index) { \
1726 if (_index < length1) { \
1727 if (_index < length2) \
1728 *dp = *curr1 | *curr2; \
1732 else if (_index < length2) { \
1741 =item C<STRING * Parrot_str_bitwise_or(PARROT_INTERP, const STRING *s1, const
1742 STRING *s2, STRING **dest)>
1744 Performs a bitwise C<OR> on two Parrot strings, performing type and encoding
1745 conversions if necessary. If the third string is not C<NULL>, then it is
1746 reused. Otherwise a new Parrot string is created.
1753 PARROT_CANNOT_RETURN_NULL
1755 Parrot_str_bitwise_or(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s1
),
1756 ARGIN_NULLOK(const STRING
*s2
), ARGOUT_NULLOK(STRING
**dest
))
1758 ASSERT_ARGS(Parrot_str_bitwise_or
)
1763 if (s1
->encoding
!= Parrot_fixed_8_encoding_ptr
)
1764 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_ENCODING
,
1765 "string bitwise_or (%s/%s) unsupported",
1766 s1
->encoding
->name
, nonnull_encoding_name(s2
));
1768 maxlen
= s1
->bufused
;
1772 if (s2
->encoding
!= Parrot_fixed_8_encoding_ptr
)
1773 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_ENCODING
,
1774 "string bitwise_or (%s/%s) unsupported",
1775 nonnull_encoding_name(s1
), s2
->encoding
->name
);
1777 if (s2
->bufused
> maxlen
)
1778 maxlen
= s2
->bufused
;
1781 if (dest
&& *dest
) {
1783 res
->encoding
= Parrot_fixed_8_encoding_ptr
;
1784 res
->charset
= Parrot_binary_charset_ptr
;
1787 res
= Parrot_str_new_init(interp
, NULL
, maxlen
,
1788 Parrot_fixed_8_encoding_ptr
, Parrot_binary_charset_ptr
, 0);
1796 #if ! DISABLE_GC_DEBUG
1797 /* trigger GC for debug */
1798 if (interp
&& GC_DEBUG(interp
))
1799 Parrot_gc_mark_and_sweep(interp
, GC_trace_stack_FLAG
);
1802 make_writable(interp
, &res
, maxlen
, enum_stringrep_one
);
1804 BITWISE_OR_STRINGS(Parrot_UInt1
, Parrot_UInt1
, Parrot_UInt1
,
1805 s1
, s2
, res
, maxlen
);
1806 res
->bufused
= res
->strlen
= maxlen
;
1817 =item C<STRING * Parrot_str_bitwise_xor(PARROT_INTERP, const STRING *s1, const
1818 STRING *s2, STRING **dest)>
1820 Performs a bitwise C<XOR> on two Parrot strings, performing type and encoding
1821 conversions if necessary. If the second string is not C<NULL>, then it is
1822 reused. Otherwise a new Parrot string is created.
1829 PARROT_CANNOT_RETURN_NULL
1831 Parrot_str_bitwise_xor(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s1
),
1832 ARGIN_NULLOK(const STRING
*s2
), ARGOUT_NULLOK(STRING
**dest
))
1834 ASSERT_ARGS(Parrot_str_bitwise_xor
)
1839 if (s1
->encoding
!= Parrot_fixed_8_encoding_ptr
)
1840 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_ENCODING
,
1841 "string bitwise_xor (%s/%s) unsupported",
1842 s1
->encoding
->name
, nonnull_encoding_name(s2
));
1844 maxlen
= s1
->bufused
;
1848 if (s2
->encoding
!= Parrot_fixed_8_encoding_ptr
)
1849 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_ENCODING
,
1850 "string bitwise_xor (%s/%s) unsupported",
1851 nonnull_encoding_name(s1
), s2
->encoding
->name
);
1853 if (s2
->bufused
> maxlen
)
1854 maxlen
= s2
->bufused
;
1857 if (dest
&& *dest
) {
1859 res
->encoding
= Parrot_fixed_8_encoding_ptr
;
1860 res
->charset
= Parrot_binary_charset_ptr
;
1863 res
= Parrot_str_new_init(interp
, NULL
, maxlen
,
1864 Parrot_fixed_8_encoding_ptr
, Parrot_binary_charset_ptr
, 0);
1872 #if ! DISABLE_GC_DEBUG
1873 /* trigger GC for debug */
1874 if (interp
&& GC_DEBUG(interp
))
1875 Parrot_gc_mark_and_sweep(interp
, GC_trace_stack_FLAG
);
1878 make_writable(interp
, &res
, maxlen
, enum_stringrep_one
);
1880 BITWISE_XOR_STRINGS(Parrot_UInt1
, Parrot_UInt1
, Parrot_UInt1
,
1881 s1
, s2
, res
, maxlen
);
1882 res
->bufused
= res
->strlen
= maxlen
;
1891 #define BITWISE_NOT_STRING(type, s, res) \
1893 if ((s) && (res)) { \
1894 const type *curr = (type *)(s)->strstart; \
1895 size_t length = (s)->strlen; \
1896 Parrot_UInt1 *dp = (Parrot_UInt1 *)(res)->strstart; \
1898 for (; length ; --length, ++dp, ++curr) \
1899 *dp = 0xFF & ~ *curr; \
1905 =item C<STRING * Parrot_str_bitwise_not(PARROT_INTERP, const STRING *s, STRING
1908 Performs a bitwise C<NOT> on a Parrot string. If the second string is
1909 not C<NULL> then it is reused, otherwise a new Parrot string is created.
1916 PARROT_CANNOT_RETURN_NULL
1918 Parrot_str_bitwise_not(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s
),
1919 ARGOUT_NULLOK(STRING
**dest
))
1921 ASSERT_ARGS(Parrot_str_bitwise_not
)
1926 if (s
->encoding
!= Parrot_fixed_8_encoding_ptr
)
1927 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_ENCODING
,
1928 "string bitwise_not (%s/%s) unsupported",
1929 s
->encoding
->name
, s
->encoding
->name
);
1936 if (dest
&& *dest
) {
1938 res
->encoding
= Parrot_fixed_8_encoding_ptr
;
1939 res
->charset
= Parrot_binary_charset_ptr
;
1942 res
= Parrot_str_new_init(interp
, NULL
, len
,
1943 Parrot_fixed_8_encoding_ptr
, Parrot_binary_charset_ptr
, 0);
1951 #if ! DISABLE_GC_DEBUG
1952 /* trigger GC for debug */
1953 if (interp
&& GC_DEBUG(interp
))
1954 Parrot_gc_mark_and_sweep(interp
, GC_trace_stack_FLAG
);
1957 make_writable(interp
, &res
, len
, enum_stringrep_one
);
1959 res
->strlen
= res
->bufused
= len
;
1961 BITWISE_NOT_STRING(Parrot_UInt1
, s
, res
);
1971 =item C<INTVAL Parrot_str_boolean(PARROT_INTERP, const STRING *s)>
1973 Returns whether the specified Parrot string is true. A string is true if it is
1974 equal to anything other than C<0>, C<""> or C<"0">.
1981 PARROT_WARN_UNUSED_RESULT
1983 Parrot_str_boolean(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s
))
1985 ASSERT_ARGS(Parrot_str_boolean
)
1986 const INTVAL len
= s
? Parrot_str_byte_length(interp
, s
) : 0;
1992 const UINTVAL c
= Parrot_str_indexed(interp
, s
, 0);
1994 /* relying on character literals being interpreted as ASCII--may
1995 not be correct on EBCDIC systems. use numeric value instead? */
1997 /* later, accept other chars with digit value 0? or, no */
2001 /* it must be true */
2008 =item C<STRING * Parrot_str_format_data(PARROT_INTERP, const char *format, ...)>
2010 Writes and returns a Parrot string.
2017 PARROT_CANNOT_RETURN_NULL
2019 Parrot_str_format_data(PARROT_INTERP
, ARGIN(const char *format
), ...)
2021 ASSERT_ARGS(Parrot_str_format_data
)
2025 va_start(args
, format
);
2026 output
= Parrot_vsprintf_c(interp
, format
, args
);
2033 State of FSM during number value parsing.
2035 Integer uses only parse_start, parse_before_dot and parse_end.
2038 typedef enum number_parse_state
{
2045 } number_parse_state
;
2050 =item C<INTVAL Parrot_str_to_int(PARROT_INTERP, const STRING *s)>
2052 Converts a numeric Parrot string to an integer value.
2054 A number is such that:
2057 digit = "Any code point considered a digit by the chartype"
2058 indicator = 'e' | 'E'
2059 digits = digit [digit]...
2060 decimal-part = digits '.' [digits] | ['.'] digits
2061 exponent-part = indicator [sign] digits
2062 numeric-string = [sign] decimal-part [exponent-part]
2064 The integer value is the appropriate integer representation of such a number,
2065 rounding towards zero.
2072 PARROT_WARN_UNUSED_RESULT
2074 Parrot_str_to_int(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s
))
2076 ASSERT_ARGS(Parrot_str_to_int
)
2080 const INTVAL max_safe
= PARROT_INTVAL_MAX
/ 10;
2081 const INTVAL last_dig
= PARROT_INTVAL_MAX
% 10;
2086 number_parse_state state
= parse_start
;
2088 ENCODING_ITER_INIT(interp
, s
, &iter
);
2090 for (offs
= 0; (state
!= parse_end
) && (offs
< s
->strlen
); ++offs
) {
2091 const UINTVAL c
= iter
.get_and_advance(interp
, &iter
);
2092 /* Check for overflow */
2098 if (isdigit((unsigned char)c
)) {
2099 const INTVAL nextval
= c
- '0';
2100 if (i
< max_safe
|| (i
== max_safe
&& nextval
<= last_dig
))
2101 i
= i
* 10 + nextval
;
2103 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_ERR_OVERFLOW
,
2104 "Integer value of String '%S' too big", s
);
2105 state
= parse_before_dot
;
2107 else if (c
== '-') {
2109 state
= parse_before_dot
;
2112 state
= parse_before_dot
;
2113 else if (isspace((unsigned char)c
))
2120 case parse_before_dot
:
2121 if (isdigit((unsigned char)c
)) {
2122 const INTVAL nextval
= c
- '0';
2123 if (i
< max_safe
|| (i
== max_safe
&& nextval
<= last_dig
))
2124 i
= i
* 10 + nextval
;
2126 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_ERR_OVERFLOW
,
2127 "Integer value of String '%S' too big", s
);
2134 /* Pacify compiler */
2147 =item C<FLOATVAL Parrot_str_to_num(PARROT_INTERP, const STRING *s)>
2149 Converts a numeric Parrot STRING to a floating point number.
2156 PARROT_WARN_UNUSED_RESULT
2158 Parrot_str_to_num(PARROT_INTERP
, ARGIN(const STRING
*s
))
2160 ASSERT_ARGS(Parrot_str_to_num
)
2162 FLOATVAL mantissa
= 0.0;
2163 FLOATVAL sign
= 1.0; /* -1 for '-' */
2164 FLOATVAL divider
= 0.1;
2166 INTVAL e_sign
= 1; /* -1 for '-' */
2167 /* How many digits it's safe to parse */
2168 const INTVAL max_safe
= PARROT_INTVAL_MAX
/ 10;
2169 INTVAL m
= 0; /* Integer mantissa */
2170 int m_is_safe
= 1; /* We can use integer mantissa */
2171 INTVAL d
= 0; /* Integer descriminator */
2172 int d_is_safe
= 1; /* We can use integer mantissa */
2174 int check_nan
= 0; /* Check for NaN and Inf after main loop */
2177 number_parse_state state
= parse_start
;
2182 ENCODING_ITER_INIT(interp
, s
, &iter
);
2184 /* Handcrafter FSM to read float value */
2185 for (offs
= 0; (state
!= parse_end
) && (offs
< s
->strlen
); ++offs
) {
2186 const UINTVAL c
= iter
.get_and_advance(interp
, &iter
);
2187 /* Check for overflow */
2193 if (isdigit((unsigned char)c
)) {
2196 state
= parse_before_dot
;
2198 else if (c
== '-') {
2200 state
= parse_before_dot
;
2203 state
= parse_before_dot
;
2205 state
= parse_after_dot
;
2206 else if (isspace((unsigned char)c
))
2214 case parse_before_dot
:
2215 if (isdigit((unsigned char)c
)) {
2216 f
= f
*10.0 + (c
-'0');
2218 /* Integer overflow for mantissa */
2222 else if (c
== '.') {
2223 state
= parse_after_dot
;
2225 * Throw gathered result. Recalulate from integer mantissa
2226 * to preserve precision.
2232 else if (c
== 'e' || c
== 'E') {
2233 state
= parse_after_e
;
2234 /* See comment above */
2245 case parse_after_dot
:
2246 if (isdigit((unsigned char)c
)) {
2247 f
+= (c
-'0') * divider
;
2254 else if (c
== 'e' || c
== 'E')
2255 state
= parse_after_e
;
2261 if (isdigit((unsigned char)c
)) {
2263 state
= parse_after_e_sign
;
2265 else if (c
== '-') {
2267 state
= parse_after_e_sign
;
2270 state
= parse_after_e_sign
;
2275 case parse_after_e_sign
:
2276 if (isdigit((unsigned char)c
))
2284 /* Pacify compiler */
2289 /* Support for non-canonical NaN and Inf */
2290 /* charpos <=2 because for "-i" iter will be advanced to next char already */
2291 if (check_nan
&& (iter
.charpos
<= 2)) {
2292 STRING
*t
= Parrot_str_upcase(interp
, s
);
2293 if (Parrot_str_equal(interp
, t
, CONST_STRING(interp
, "NAN")))
2294 return PARROT_FLOATVAL_NAN_QUIET
;
2295 else if (Parrot_str_equal(interp
, t
, CONST_STRING(interp
, "INF"))
2296 || Parrot_str_equal(interp
, t
, CONST_STRING(interp
, "INFINITY")))
2297 return PARROT_FLOATVAL_INF_POSITIVE
;
2298 else if (Parrot_str_equal(interp
, t
, CONST_STRING(interp
, "-INF"))
2299 || Parrot_str_equal(interp
, t
, CONST_STRING(interp
, "-INFINITY")))
2300 return PARROT_FLOATVAL_INF_NEGATIVE
;
2305 /* local macro to call proper pow version depending on FLOATVAL */
2306 #if NUMVAL_SIZE == DOUBLE_SIZE
2312 if (d
&& d_is_safe
) {
2313 f
= mantissa
+ (1.0 * d
/ POW(10.0, d_length
));
2334 =item C<STRING * Parrot_str_from_int(PARROT_INTERP, INTVAL i)>
2336 Returns a Parrot string representation of the specified integer value.
2343 PARROT_WARN_UNUSED_RESULT
2344 PARROT_CANNOT_RETURN_NULL
2346 Parrot_str_from_int(PARROT_INTERP
, INTVAL i
)
2348 ASSERT_ARGS(Parrot_str_from_int
)
2350 return Parrot_str_from_int_base(interp
, buf
, (HUGEINTVAL
)i
, 10);
2356 =item C<STRING * Parrot_str_from_num(PARROT_INTERP, FLOATVAL f)>
2358 Returns a Parrot string representation of the specified floating-point value.
2365 PARROT_WARN_UNUSED_RESULT
2366 PARROT_CANNOT_RETURN_NULL
2368 Parrot_str_from_num(PARROT_INTERP
, FLOATVAL f
)
2370 ASSERT_ARGS(Parrot_str_from_num
)
2371 /* Too damn hard--hand it off to Parrot_sprintf, which'll probably
2372 use the system sprintf anyway, but has gigantic buffers that are
2373 awfully hard to overflow. */
2374 return Parrot_sprintf_c(interp
, FLOATVAL_FMT
, f
);
2380 =item C<char * Parrot_str_to_cstring(PARROT_INTERP, const STRING *s)>
2382 Returns a C string for the specified Parrot string. Use
2383 C<Parrot_str_free_cstring()> to free the string. Failure to do this will result in
2392 PARROT_CANNOT_RETURN_NULL
2394 Parrot_str_to_cstring(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s
))
2396 ASSERT_ARGS(Parrot_str_to_cstring
)
2398 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNEXPECTED_NULL
,
2399 "Can't convert NULL string");
2402 return string_to_cstring_nullable(interp
, s
);
2408 =item C<char * string_to_cstring_nullable(PARROT_INTERP, const STRING *s)>
2410 Returns a C string for the specified Parrot string. Use
2411 C<Parrot_str_free_cstring()> to free the string. Failure to do this will result in
2420 PARROT_CAN_RETURN_NULL
2422 string_to_cstring_nullable(SHIM_INTERP
, ARGIN_NULLOK(const STRING
*s
))
2424 ASSERT_ARGS(string_to_cstring_nullable
)
2428 char * const p
= (char *)mem_sys_allocate(s
->bufused
+ 1);
2429 memcpy(p
, s
->strstart
, s
->bufused
);
2430 p
[s
->bufused
] = '\0';
2438 =item C<void Parrot_str_free_cstring(char *p)>
2440 Free a string created by C<Parrot_str_to_cstring()>.
2442 TODO - Hopefully this can go away at some point, as it's got all
2443 sorts of leak potential otherwise.
2451 Parrot_str_free_cstring(ARGIN_NULLOK(char *p
))
2453 ASSERT_ARGS(Parrot_str_free_cstring
)
2454 mem_sys_free((void *)p
);
2460 =item C<void Parrot_str_pin(PARROT_INTERP, STRING *s)>
2462 Replaces the specified Parrot string's managed buffer memory by system memory.
2470 Parrot_str_pin(PARROT_INTERP
, ARGMOD(STRING
*s
))
2472 ASSERT_ARGS(Parrot_str_pin
)
2476 /* XXX -lt: COW strings have the external_FLAG set, so this will
2477 * not work for these
2478 * so probably only sysmem should be tested
2480 Parrot_str_write_COW(interp
, s
);
2482 size
= PObj_buflen(s
);
2483 memory
= (char *)mem_sys_allocate(size
);
2485 mem_sys_memcopy(memory
, PObj_bufstart(s
), size
);
2486 PObj_bufstart(s
) = memory
;
2487 s
->strstart
= memory
;
2489 /* Mark the memory as both from the system and immobile */
2496 =item C<void Parrot_str_unpin(PARROT_INTERP, STRING *s)>
2498 Undoes a C<Parrot_str_pin()> so that the string once again uses managed memory.
2506 Parrot_str_unpin(PARROT_INTERP
, ARGMOD(STRING
*s
))
2508 ASSERT_ARGS(Parrot_str_unpin
)
2512 /* If this string is not marked using system memory,
2513 * we just don't do this
2515 if (!PObj_sysmem_TEST(s
))
2518 Parrot_str_write_COW(interp
, s
);
2519 size
= PObj_buflen(s
);
2521 /* We need a handle on the fixed memory so we can get rid of it later */
2522 memory
= PObj_bufstart(s
);
2524 /* Reallocate it the same size
2525 * NOTE can't use Parrot_gc_reallocate_string_storage because of the LEA
2526 * allocator, where this is a noop for the same size
2528 * We have to block GC here, as we have a pointer to bufstart
2530 Parrot_block_GC_sweep(interp
);
2531 Parrot_gc_allocate_string_storage(interp
, s
, size
);
2532 Parrot_unblock_GC_sweep(interp
);
2533 mem_sys_memcopy(PObj_bufstart(s
), memory
, size
);
2535 /* Mark the memory as neither immobile nor system allocated */
2536 PObj_sysmem_CLEAR(s
);
2538 /* Free up the memory */
2539 mem_sys_free(memory
);
2545 =item C<size_t Parrot_str_to_hashval(PARROT_INTERP, STRING *s)>
2547 Returns the hash value for the specified Parrot string, caching it in
2555 PARROT_WARN_UNUSED_RESULT
2557 Parrot_str_to_hashval(PARROT_INTERP
, ARGMOD_NULLOK(STRING
*s
))
2559 ASSERT_ARGS(Parrot_str_to_hashval
)
2562 size_t hashval
= interp
->hash_seed
;
2567 /* ZZZZZ workaround for something not setting up encodings right */
2570 ENCODING_ITER_INIT(interp
, s
, &iter
);
2572 for (offs
= 0; offs
< s
->strlen
; ++offs
) {
2573 const UINTVAL c
= iter
.get_and_advance(interp
, &iter
);
2574 hashval
+= hashval
<< 5;
2578 s
->hashval
= hashval
;
2586 =item C<STRING * Parrot_str_escape(PARROT_INTERP, const STRING *src)>
2588 Escapes all non-ASCII chars to backslash sequences. Control chars that
2589 C<Parrot_str_unescape> can handle are escaped as I<\x>, as well as a double
2590 quote character. Other control chars and codepoints < 0x100 are escaped as
2591 I<\xhh>, codepoints up to 0xffff, as I<\uhhhh>, and codepoints greater than
2592 this as I<\x{hh...hh}>.
2599 PARROT_CAN_RETURN_NULL
2601 Parrot_str_escape(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*src
))
2603 ASSERT_ARGS(Parrot_str_escape
)
2604 return Parrot_str_escape_truncate(interp
, src
, (UINTVAL
) ~0);
2610 =item C<STRING * Parrot_str_escape_truncate(PARROT_INTERP, const STRING *src,
2613 Escapes all non-ASCII characters in the given string with backslashed versions,
2614 but limits the length of the output (used for trace output of strings).
2621 PARROT_CAN_RETURN_NULL
2623 Parrot_str_escape_truncate(PARROT_INTERP
,
2624 ARGIN_NULLOK(const STRING
*src
), UINTVAL limit
)
2626 ASSERT_ARGS(Parrot_str_escape_truncate
)
2627 STRING
*result
, *hex
;
2628 UINTVAL i
, len
, charlen
;
2640 /* expect around 2x the chars */
2646 /* create ascii result */
2647 result
= Parrot_str_new_init(interp
, NULL
, charlen
,
2648 Parrot_fixed_8_encoding_ptr
, Parrot_ascii_charset_ptr
, 0);
2650 /* more work TODO */
2651 ENCODING_ITER_INIT(interp
, src
, &iter
);
2652 dp
= (unsigned char *)result
->strstart
;
2654 for (i
= 0; len
> 0; --len
) {
2655 UINTVAL c
= iter
.get_and_advance(interp
, &iter
);
2657 /* process ASCII chars */
2658 if (i
>= charlen
- 2) {
2659 /* resize - still len codepoints to go */
2660 charlen
+= len
* 2 + 16;
2661 Parrot_gc_reallocate_string_storage(interp
, result
, charlen
);
2662 /* start can change */
2663 dp
= (unsigned char *)result
->strstart
;
2705 dp
[i
++] = (unsigned char)c
;
2706 result
->bufused
= result
->strlen
= i
;
2711 /* escape by appending either \uhhhh or \x{hh...} */
2712 result
->bufused
= result
->strlen
= i
;
2714 if (c
< 0x0100 || c
>= 0x10000)
2715 hex
= Parrot_sprintf_c(interp
, "\\x{%x}", c
);
2717 hex
= Parrot_sprintf_c(interp
, "\\u%04x", c
);
2719 result
= Parrot_str_append(interp
, result
, hex
);
2721 /* adjust our insert idx */
2724 /* and usable len */
2725 charlen
= PObj_buflen(result
);
2726 dp
= (unsigned char *)result
->strstart
;
2728 PARROT_ASSERT(i
<= charlen
);
2731 result
->bufused
= result
->strlen
= i
;
2738 =item C<STRING * Parrot_str_unescape(PARROT_INTERP, const char *cstring, char
2739 delimiter, const char *enc_char)>
2741 Unescapes the specified C string. These sequences are covered:
2743 \xhh 1..2 hex digits
2744 \ooo 1..3 oct digits
2746 \x{h..h} 1..8 hex digits
2748 \Uhhhhhhhh 8 hex digits
2749 \a, \b, \t, \n, \v, \f, \r, \e
2756 PARROT_CANNOT_RETURN_NULL
2758 Parrot_str_unescape(PARROT_INTERP
,
2759 ARGIN(const char *cstring
), char delimiter
, ARGIN_NULLOK(const char *enc_char
))
2761 ASSERT_ARGS(Parrot_str_unescape
)
2762 size_t clength
= strlen(cstring
);
2766 const ENCODING
*encoding
;
2767 const CHARSET
*charset
;
2771 /* we are constructing const table strings here */
2772 const UINTVAL flags
= PObj_constant_FLAG
;
2774 if (delimiter
&& clength
)
2777 /* default is ascii */
2781 /* check for encoding: */
2782 p
= strchr(enc_char
, ':');
2786 encoding
= Parrot_find_encoding(interp
, enc_char
);
2788 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNIMPLEMENTED
,
2789 "Can't make '%s' encoding strings", enc_char
);
2791 charset
= Parrot_find_charset(interp
, p
+ 1);
2793 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNIMPLEMENTED
,
2794 "Can't make '%s' charset strings", p
+ 1);
2796 result
= Parrot_str_new_init(interp
, cstring
, clength
,
2797 encoding
, charset
, flags
);
2798 encoding
= Parrot_fixed_8_encoding_ptr
;
2801 result
= string_make(interp
, cstring
, clength
, enc_char
, flags
);
2802 encoding
= result
->encoding
;
2805 encoding
->iter_init(interp
, result
, &iter
);
2807 for (offs
= d
= 0; offs
< clength
; ++offs
) {
2808 r
= (Parrot_UInt4
)((unsigned char *)result
->strstart
)[offs
];
2810 /* There cannot be any NULs within this string. */
2811 PARROT_ASSERT(r
!= '\0');
2815 r
= string_unescape_one(interp
, &offs
, result
);
2820 /* we did it in place - no action */
2827 PARROT_ASSERT(d
< offs
);
2828 iter
.set_and_advance(interp
, &iter
, r
);
2833 result
->bufused
= iter
.bytepos
;
2835 /* this also validates the string */
2836 if (encoding
!= result
->encoding
)
2837 Parrot_str_length(interp
, result
);
2839 if (!CHARSET_VALIDATE(interp
, result
, 0))
2840 Parrot_ex_throw_from_c_args(interp
, NULL
,
2841 EXCEPTION_INVALID_STRING_REPRESENTATION
, "Malformed string");
2849 =item C<STRING * Parrot_str_upcase(PARROT_INTERP, const STRING *s)>
2851 Returns a copy of the specified Parrot string converted to upper case.
2852 Non-caseable characters are left unchanged.
2859 PARROT_CANNOT_RETURN_NULL
2862 Parrot_str_upcase(PARROT_INTERP
, ARGIN_NULLOK(const STRING
*s
))
2864 ASSERT_ARGS(Parrot_str_upcase
)
2865 if (STRING_IS_NULL(s
)) {
2866 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNEXPECTED_NULL
,
2867 "Can't upcase NULL string");
2871 STRING
* const dest
= Parrot_str_copy(interp
, PARROT_const_cast(STRING
*, s
));
2872 Parrot_str_upcase_inplace(interp
, dest
);
2880 =item C<void Parrot_str_upcase_inplace(PARROT_INTERP, STRING *s)>
2882 Converts the specified Parrot string to upper case.
2890 Parrot_str_upcase_inplace(PARROT_INTERP
, ARGMOD_NULLOK(STRING
*s
))
2892 ASSERT_ARGS(Parrot_str_upcase_inplace
)
2893 if (STRING_IS_NULL(s
)) {
2894 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNEXPECTED_NULL
,
2895 "Can't upcase NULL string");
2898 Parrot_str_write_COW(interp
, s
);
2899 CHARSET_UPCASE(interp
, s
);
2906 =item C<STRING * Parrot_str_downcase(PARROT_INTERP, const STRING *s)>
2908 Returns a copy of the specified Parrot string converted to lower case.
2909 Non-caseable characters are left unchanged.
2916 PARROT_CANNOT_RETURN_NULL
2919 Parrot_str_downcase(PARROT_INTERP
, ARGIN(const STRING
*s
))
2921 ASSERT_ARGS(Parrot_str_downcase
)
2923 STRING
* const dest
= Parrot_str_copy(interp
, PARROT_const_cast(STRING
*, s
));
2924 Parrot_str_downcase_inplace(interp
, dest
);
2931 =item C<void Parrot_str_downcase_inplace(PARROT_INTERP, STRING *s)>
2933 Converts the specified Parrot string to lower case.
2941 Parrot_str_downcase_inplace(PARROT_INTERP
, ARGMOD(STRING
*s
))
2943 ASSERT_ARGS(Parrot_str_downcase_inplace
)
2945 * TODO get rid of all the inplace variants. We have for utf8:
2946 * * 1 Parrot_str_copy from the non-incase variant
2947 * * conversion to utf16, with doubling the buffer
2948 * * possibly one more reallocation in downcase
2950 Parrot_str_write_COW(interp
, s
);
2951 CHARSET_DOWNCASE(interp
, s
);
2957 =item C<STRING * Parrot_str_titlecase(PARROT_INTERP, const STRING *s)>
2959 Returns a copy of the specified Parrot string converted to title case.
2960 Non-caseable characters are left unchanged.
2967 PARROT_CANNOT_RETURN_NULL
2970 Parrot_str_titlecase(PARROT_INTERP
, ARGIN(const STRING
*s
))
2972 ASSERT_ARGS(Parrot_str_titlecase
)
2974 STRING
* const dest
= Parrot_str_copy(interp
, PARROT_const_cast(STRING
*, s
));
2975 Parrot_str_titlecase_inplace(interp
, dest
);
2982 =item C<void Parrot_str_titlecase_inplace(PARROT_INTERP, STRING *s)>
2984 Converts the specified Parrot string to title case.
2992 Parrot_str_titlecase_inplace(PARROT_INTERP
, ARGMOD(STRING
*s
))
2994 ASSERT_ARGS(Parrot_str_titlecase_inplace
)
2995 Parrot_str_write_COW(interp
, s
);
2996 CHARSET_TITLECASE(interp
, s
);
3002 =item C<STRING * string_increment(PARROT_INTERP, const STRING *s)>
3004 Increments the string in the Perl 5 fashion, where incrementing aa gives you bb
3005 and so on. Currently single char only.
3012 PARROT_WARN_UNUSED_RESULT
3013 PARROT_CANNOT_RETURN_NULL
3015 string_increment(PARROT_INTERP
, ARGIN(const STRING
*s
))
3017 ASSERT_ARGS(string_increment
)
3020 if (Parrot_str_byte_length(interp
, s
) != 1)
3021 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNIMPLEMENTED
,
3022 "increment only for length = 1 done");
3024 o
= (UINTVAL
)string_ord(interp
, s
, 0);
3026 if ((o
>= 'A' && o
< 'Z') || (o
>= 'a' && o
< 'z')) {
3028 /* TODO increment in place */
3029 return string_chr(interp
, o
);
3032 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_UNIMPLEMENTED
,
3033 "increment out of range - unimplemented");
3039 =item C<const char * Parrot_string_cstring(PARROT_INTERP, const STRING *str)>
3041 Returns a C string from a Parrot string. Both sides are treated
3042 as constants -- i.e. do not resize the result.
3049 PARROT_PURE_FUNCTION
3050 PARROT_CANNOT_RETURN_NULL
3052 Parrot_string_cstring(SHIM_INTERP
, ARGIN(const STRING
*str
))
3054 ASSERT_ARGS(Parrot_string_cstring
)
3055 /* TODO handle NUL and friends */
3056 return str
->strstart
;
3062 =item C<INTVAL Parrot_str_is_cclass(PARROT_INTERP, INTVAL flags, const STRING
3063 *s, UINTVAL offset)>
3065 Returns 1 if the codepoint of string C<s> at given offset is in the given
3066 character class C<flags>. See also F<include/parrot/cclass.h> for possible
3067 character classes. Returns 0 otherwise, or if the string is empty or NULL.
3074 PARROT_WARN_UNUSED_RESULT
3076 Parrot_str_is_cclass(PARROT_INTERP
, INTVAL flags
,
3077 ARGIN(const STRING
*s
), UINTVAL offset
)
3079 ASSERT_ARGS(Parrot_str_is_cclass
)
3080 if (!Parrot_str_byte_length(interp
, s
))
3083 return CHARSET_IS_CCLASS(interp
, flags
, s
, offset
);
3089 =item C<INTVAL Parrot_str_find_cclass(PARROT_INTERP, INTVAL flags, STRING *s,
3090 UINTVAL offset, UINTVAL count)>
3092 Finds the first occurrence of the given character class in C<flags> in the
3093 string, and returns its glyph-wise index.
3100 PARROT_WARN_UNUSED_RESULT
3102 Parrot_str_find_cclass(PARROT_INTERP
, INTVAL flags
, ARGIN_NULLOK(STRING
*s
),
3103 UINTVAL offset
, UINTVAL count
)
3105 ASSERT_ARGS(Parrot_str_find_cclass
)
3109 return CHARSET_FIND_CCLASS(interp
, flags
, s
, offset
, count
);
3114 =item C<INTVAL Parrot_str_find_not_cclass(PARROT_INTERP, INTVAL flags, STRING
3115 *s, UINTVAL offset, UINTVAL count)>
3117 Finds the first occurrence of the a character I<not> in the given character
3118 class in C<flags> in the string starting from C<offset> and looking at C<count>
3119 positions, and returns its glyph-wise index. Returns C<offset + count>, if not
3127 PARROT_WARN_UNUSED_RESULT
3129 Parrot_str_find_not_cclass(PARROT_INTERP
, INTVAL flags
,
3130 ARGIN_NULLOK(STRING
*s
), UINTVAL offset
, UINTVAL count
)
3132 ASSERT_ARGS(Parrot_str_find_not_cclass
)
3136 return CHARSET_FIND_NOT_CCLASS(interp
, flags
, s
, offset
, count
);
3142 =item C<STRING* Parrot_str_change_charset(PARROT_INTERP, STRING *src, INTVAL
3143 charset_nr, STRING *dest)>
3145 If C<dest> == NULL, converts C<src> to the given charset or encoding inplace.
3146 Otherwise returns a copy of C<src> with the charset/encoding in C<dest>.
3153 PARROT_WARN_UNUSED_RESULT
3154 PARROT_CAN_RETURN_NULL
3156 Parrot_str_change_charset(PARROT_INTERP
, ARGMOD_NULLOK(STRING
*src
),
3157 INTVAL charset_nr
, ARGOUT_NULLOK(STRING
*dest
))
3159 ASSERT_ARGS(Parrot_str_change_charset
)
3160 const CHARSET
*new_charset
;
3165 new_charset
= Parrot_get_charset(interp
, charset_nr
);
3168 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_CHARTYPE
,
3169 "charset #%d not found", (int) charset_nr
);
3172 * dest is an empty string header or NULL, if an inplace
3173 * operation is desired
3176 if (new_charset
== src
->charset
) {
3177 dest
= Parrot_str_reuse_COW(interp
, src
, dest
);
3178 dest
->charset
= new_charset
;
3183 dest
->charset
= new_charset
;
3185 /* get prefered encoding for charset */
3186 dest
->encoding
= CHARSET_GET_PREFERRED_ENCODING(interp
, dest
);
3189 if (new_charset
== src
->charset
)
3192 Parrot_str_write_COW(interp
, src
);
3195 return new_charset
->to_charset(interp
, src
, dest
);
3201 =item C<STRING* Parrot_str_change_encoding(PARROT_INTERP, STRING *src, INTVAL
3202 encoding_nr, STRING *dest)>
3204 If C<dest> == NULL, converts C<src> to the given charset or encoding in place.
3205 Otherwise returns a copy of C<src> with the charset/encoding in C<dest>
3212 PARROT_WARN_UNUSED_RESULT
3213 PARROT_CAN_RETURN_NULL
3215 Parrot_str_change_encoding(PARROT_INTERP
, ARGIN_NULLOK(STRING
*src
),
3216 INTVAL encoding_nr
, ARGOUT_NULLOK(STRING
*dest
))
3218 ASSERT_ARGS(Parrot_str_change_encoding
)
3219 const ENCODING
*new_encoding
;
3224 new_encoding
= Parrot_get_encoding(interp
, encoding_nr
);
3227 Parrot_ex_throw_from_c_args(interp
, NULL
, EXCEPTION_INVALID_CHARTYPE
,
3228 "encoding #%d not found", (int) encoding_nr
);
3231 * dest is an empty string header or NULL, if an inplace
3232 * operation is desired
3235 dest
->encoding
= new_encoding
;
3236 if (new_encoding
== src
->encoding
) {
3237 dest
= Parrot_str_reuse_COW(interp
, src
, dest
);
3242 if (new_encoding
== src
->encoding
)
3245 Parrot_str_write_COW(interp
, src
);
3248 return new_encoding
->to_encoding(interp
, src
, dest
);
3254 =item C<STRING * Parrot_str_compose(PARROT_INTERP, STRING *src)>
3256 Normalizes the string.
3263 PARROT_WARN_UNUSED_RESULT
3264 PARROT_CAN_RETURN_NULL
3266 Parrot_str_compose(PARROT_INTERP
, ARGIN_NULLOK(STRING
*src
))
3268 ASSERT_ARGS(Parrot_str_compose
)
3273 return Parrot_str_new_noinit(interp
, enum_stringrep_one
, 0);
3275 return CHARSET_COMPOSE(interp
, src
);
3281 =item C<STRING* Parrot_str_join(PARROT_INTERP, STRING *j, PMC *ar)>
3283 Joins the elements of the array C<ar> as strings with the string C<j> between
3284 them, returning the result.
3291 PARROT_WARN_UNUSED_RESULT
3292 PARROT_CANNOT_RETURN_NULL
3294 Parrot_str_join(PARROT_INTERP
, ARGIN_NULLOK(STRING
*j
), ARGIN(PMC
*ar
))
3296 ASSERT_ARGS(Parrot_str_join
)
3299 const int ar_len
= VTABLE_elements(interp
, ar
);
3303 return Parrot_str_new_noinit(interp
, enum_stringrep_one
, 0);
3305 s
= VTABLE_get_string_keyed_int(interp
, ar
, 0);
3306 res
= s
? Parrot_str_copy(interp
, s
) : NULL
;
3308 for (i
= 1; i
< ar_len
; ++i
) {
3309 STRING
* const next
= VTABLE_get_string_keyed_int(interp
, ar
, i
);
3311 res
= Parrot_str_append(interp
, res
, j
);
3312 res
= Parrot_str_append(interp
, res
, next
);
3321 =item C<PMC* Parrot_str_split(PARROT_INTERP, STRING *delim, STRING *str)>
3323 Splits the string C<str> at the delimiter C<delim>, returning a
3324 C<ResizableStringArray>, or his mapped type in the current HLL,
3325 of results. Returns PMCNULL if the string or the delimiter is NULL.
3332 PARROT_WARN_UNUSED_RESULT
3333 PARROT_CANNOT_RETURN_NULL
3335 Parrot_str_split(PARROT_INTERP
,
3336 ARGIN_NULLOK(STRING
*delim
), ARGIN_NULLOK(STRING
*str
))
3338 ASSERT_ARGS(Parrot_str_split
)
3340 INTVAL slen
, dlen
, ps
, pe
;
3342 if (STRING_IS_NULL(delim
) || STRING_IS_NULL(str
))
3345 res
= pmc_new(interp
, Parrot_get_ctx_HLL_type(interp
, enum_class_ResizableStringArray
));
3346 slen
= Parrot_str_byte_length(interp
, str
);
3351 dlen
= Parrot_str_byte_length(interp
, delim
);
3355 VTABLE_set_integer_native(interp
, res
, slen
);
3357 for (i
= 0; i
< slen
; ++i
) {
3358 STRING
* const p
= Parrot_str_substr(interp
, str
, i
, 1, NULL
, 0);
3359 VTABLE_set_string_keyed_int(interp
, res
, i
, p
);
3365 pe
= Parrot_str_find_index(interp
, str
, delim
, 0);
3368 VTABLE_push_string(interp
, res
, str
);
3374 while (ps
<= slen
) {
3375 const int pl
= pe
- ps
;
3376 STRING
* const tstr
= Parrot_str_substr(interp
, str
, ps
, pl
, NULL
, 0);
3378 VTABLE_push_string(interp
, res
, tstr
);
3379 ps
= pe
+ Parrot_str_byte_length(interp
, delim
);
3384 pe
= Parrot_str_find_index(interp
, str
, delim
, ps
);
3396 =item C<STRING* Parrot_str_from_uint(PARROT_INTERP, char *tc, UHUGEINTVAL num,
3397 unsigned int base, int minus)>
3399 Returns C<num> converted to a Parrot C<STRING>.
3401 Note that C<base> must be defined (a default of 10 is not assumed). The caller
3402 has to verify that C<< base >= 2 && base <= 36 >> The buffer C<tc> must be at
3403 least C<sizeof (UHUGEINTVAL)*8 + 1> chars big.
3405 If C<minus> is true, then C<-> is prepended to the string representation.
3411 PARROT_WARN_UNUSED_RESULT
3412 PARROT_CANNOT_RETURN_NULL
3414 Parrot_str_from_uint(PARROT_INTERP
, ARGOUT(char *tc
), UHUGEINTVAL num
,
3415 unsigned int base
, int minus
)
3417 ASSERT_ARGS(Parrot_str_from_uint
)
3418 /* the buffer must be at least as long as this */
3419 char *p
= tc
+ sizeof (UHUGEINTVAL
)*8 + 1;
3420 const char * const tail
= p
;
3422 PARROT_ASSERT(base
>= 2 && base
<= 36);
3425 const char cur
= (char)(num
% base
);
3427 *--p
= (char)('0' + cur
);
3430 *--p
= (char)('a' + cur
- 10);
3432 } while (num
/= base
);
3437 return string_make(interp
, p
, (UINTVAL
)(tail
- p
), "ascii", 0);
3443 =item C<STRING * Parrot_str_from_int_base(PARROT_INTERP, char *tc, HUGEINTVAL
3444 num, unsigned int base)>
3446 Returns C<num> converted to a Parrot C<STRING>.
3448 Note that C<base> must be defined (a default of 10 is not assumed).
3450 If C<< num < 0 >>, then C<-> is prepended to the string representation.
3456 PARROT_WARN_UNUSED_RESULT
3457 PARROT_CANNOT_RETURN_NULL
3459 Parrot_str_from_int_base(PARROT_INTERP
, ARGOUT(char *tc
), HUGEINTVAL num
, unsigned int base
)
3461 ASSERT_ARGS(Parrot_str_from_int_base
)
3462 const int is_neg
= (num
< 0);
3467 return Parrot_str_from_uint(interp
, tc
, (UHUGEINTVAL
)num
, base
, is_neg
);
3478 =item F<src/string/primitives.c>
3480 =item F<include/parrot/string.h>
3482 =item F<include/parrot/string_funcs.h>
3484 =item F<docs/strings.pod>
3495 * c-file-style: "parrot"
3497 * vim: expandtab shiftwidth=4: