1 /**********************************************************************
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
24 #include "debug_counter.h"
29 #include "internal/array.h"
30 #include "internal/compar.h"
31 #include "internal/compilers.h"
32 #include "internal/encoding.h"
33 #include "internal/error.h"
34 #include "internal/gc.h"
35 #include "internal/numeric.h"
36 #include "internal/object.h"
37 #include "internal/proc.h"
38 #include "internal/re.h"
39 #include "internal/sanitizers.h"
40 #include "internal/string.h"
41 #include "internal/transcode.h"
43 #include "ruby/encoding.h"
45 #include "ruby/util.h"
46 #include "ruby_assert.h"
49 #if defined HAVE_CRYPT_R
50 # if defined HAVE_CRYPT_H
53 #elif !defined HAVE_CRYPT
54 # include "missing/crypt.h"
55 # define HAVE_CRYPT_R 1
58 #define BEG(no) (regs->beg[(no)])
59 #define END(no) (regs->end[(no)])
62 #undef rb_usascii_str_new
63 #undef rb_utf8_str_new
65 #undef rb_str_new_cstr
66 #undef rb_usascii_str_new_cstr
67 #undef rb_utf8_str_new_cstr
68 #undef rb_enc_str_new_cstr
69 #undef rb_external_str_new_cstr
70 #undef rb_locale_str_new_cstr
71 #undef rb_str_dup_frozen
72 #undef rb_str_buf_new_cstr
74 #undef rb_str_buf_cat2
76 #undef rb_str_cat_cstr
77 #undef rb_fstring_cstr
85 * 2: STR_SHARED (== ELTS_SHARED)
86 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
87 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
88 * other strings that rely on this string's buffer)
89 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
90 * early, specific to rb_str_tmp_frozen_{acquire,release})
91 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
92 * such as read(2). Any modification and realloc is prohibited)
94 * 8-9: ENC_CODERANGE (2 bits)
95 * 10-16: ENCODING (7 bits == 128)
97 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
98 * used for a string object based on C string literal)
99 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
100 * object header is temporarily allocated on C stack)
103 #define RUBY_MAX_CHAR_LEN 16
104 #define STR_SHARED_ROOT FL_USER5
105 #define STR_BORROWED FL_USER6
106 #define STR_TMPLOCK FL_USER7
107 #define STR_NOFREE FL_USER18
108 #define STR_FAKESTR FL_USER19
110 #define STR_SET_NOEMBED(str) do {\
111 FL_SET((str), STR_NOEMBED);\
113 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
116 STR_SET_EMBED_LEN((str), 0);\
119 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
121 # define STR_SET_EMBED_LEN(str, n) do { \
122 assert(str_embed_capa(str) > (n));\
123 RSTRING(str)->as.embed.len = (n);\
126 # define STR_SET_EMBED_LEN(str, n) do { \
128 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
129 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
133 #define STR_SET_LEN(str, n) do { \
134 if (STR_EMBED_P(str)) {\
135 STR_SET_EMBED_LEN((str), (n));\
138 RSTRING(str)->as.heap.len = (n);\
142 #define STR_DEC_LEN(str) do {\
143 if (STR_EMBED_P(str)) {\
144 long n = RSTRING_LEN(str);\
146 STR_SET_EMBED_LEN((str), n);\
149 RSTRING(str)->as.heap.len--;\
153 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/*
 * TERM_FILL(ptr, termlen): zero the string terminator that starts at ptr.
 * The first byte is always cleared with a plain store; memset() clears the
 * full termlen bytes only when the terminator is wider than one byte.
 * Each argument is evaluated exactly once because it is captured in a
 * local before use.
 */
154 #define TERM_FILL(ptr, termlen) do {\
155 char *const term_fill_ptr = (ptr);\
156 const int term_fill_len = (termlen);\
157 *term_fill_ptr = '\0';\
158 if (UNLIKELY(term_fill_len > 1))\
159 memset(term_fill_ptr, 0, term_fill_len);\
162 #define RESIZE_CAPA(str,capacity) do {\
163 const int termlen = TERM_LEN(str);\
164 RESIZE_CAPA_TERM(str,capacity,termlen);\
166 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
167 if (STR_EMBED_P(str)) {\
168 if (str_embed_capa(str) < capacity + termlen) {\
169 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
170 const long tlen = RSTRING_LEN(str);\
171 memcpy(tmp, RSTRING_PTR(str), tlen);\
172 RSTRING(str)->as.heap.ptr = tmp;\
173 RSTRING(str)->as.heap.len = tlen;\
174 STR_SET_NOEMBED(str);\
175 RSTRING(str)->as.heap.aux.capa = (capacity);\
179 assert(!FL_TEST((str), STR_SHARED)); \
180 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
181 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
182 RSTRING(str)->as.heap.aux.capa = (capacity);\
/*
 * STR_SET_SHARED(str, shared_str): make str a sharer of shared_str's buffer.
 * Nothing is done when str carries STR_FAKESTR.  Otherwise:
 *   - the asserts check that str's pointer lies inside
 *     [RSTRING_PTR(shared_str), RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)];
 *   - the reference is stored into as.heap.aux.shared through RB_OBJ_WRITE;
 *   - str gets STR_SHARED and shared_str gets STR_SHARED_ROOT;
 *   - a shared_str whose class is 0 is additionally flagged STR_BORROWED.
 * Note that both arguments are evaluated several times.
 */
186 #define STR_SET_SHARED(str, shared_str) do { \
187 if (!FL_TEST(str, STR_FAKESTR)) { \
188 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
189 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
190 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
191 FL_SET((str), STR_SHARED); \
192 FL_SET((shared_str), STR_SHARED_ROOT); \
193 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
194 FL_SET_RAW((shared_str), STR_BORROWED); \
198 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
199 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
200 /* TODO: include the terminator size in capa. */
202 #define STR_ENC_GET(str) get_encoding(str)
204 #if !defined SHARABLE_MIDDLE_SUBSTRING
205 # define SHARABLE_MIDDLE_SUBSTRING 0
207 #if !SHARABLE_MIDDLE_SUBSTRING
208 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
210 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
/*
 * Number of bytes available for embedded string data in str.
 * Two alternative results are visible: the GC slot size of str minus the
 * offset of as.embed.ary inside struct RString, or the fixed value
 * RSTRING_EMBED_LEN_MAX + 1.
 * NOTE(review): the condition that selects between the two returns is not
 * visible here -- presumably a build-time switch; confirm.
 */
215 str_embed_capa(VALUE str
)
218 return rb_gc_obj_slot_size(str
) - offsetof(struct RString
, as
.embed
.ary
);
220 return RSTRING_EMBED_LEN_MAX
+ 1;
/*
 * Size in bytes of an embedded string object able to hold capa bytes of
 * data: the offset of as.embed.ary within struct RString plus capa.
 */
225 str_embed_size(long capa
)
227 return offsetof(struct RString
, as
.embed
.ary
) + capa
;
/*
 * Tells whether a string of len bytes plus a termlen-byte terminator can be
 * stored embedded.
 * Two alternative tests are visible: asking the GC whether an object of
 * str_embed_size(len + termlen) bytes is allocatable, or comparing len
 * against RSTRING_EMBED_LEN_MAX + 1 - termlen.
 * NOTE(review): the condition that selects between the two returns is not
 * visible here -- confirm.
 */
231 STR_EMBEDDABLE_P(long len
, long termlen
)
234 return rb_gc_size_allocatable_p(str_embed_size(len
+ termlen
));
236 return len
<= RSTRING_EMBED_LEN_MAX
+ 1 - termlen
;
240 static VALUE
str_replace_shared_without_enc(VALUE str2
, VALUE str
);
241 static VALUE
str_new_frozen(VALUE klass
, VALUE orig
);
242 static VALUE
str_new_frozen_buffer(VALUE klass
, VALUE orig
, int copy_encoding
);
243 static VALUE
str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
);
244 static VALUE
str_new(VALUE klass
, const char *ptr
, long len
);
245 static void str_make_independent_expand(VALUE str
, long len
, long expand
, const int termlen
);
246 static inline void str_modifiable(VALUE str
);
247 static VALUE
rb_str_downcase(int argc
, VALUE
*argv
, VALUE str
);
/*
 * Convenience wrapper around str_make_independent_expand(): passes the
 * string's current length, an expansion of 0 and the terminator length
 * of its encoding (TERM_LEN).
 */
250 str_make_independent(VALUE str
)
252 long len
= RSTRING_LEN(str
);
253 int termlen
= TERM_LEN(str
);
254 str_make_independent_expand((str
), len
, 0L, termlen
);
257 static inline int str_dependent_p(VALUE str
);
/*
 * Calls str_make_independent() on str, but only when str_dependent_p()
 * reports that the string is dependent; otherwise does nothing.
 */
260 rb_str_make_independent(VALUE str
)
262 if (str_dependent_p(str
)) {
263 str_make_independent(str
);
268 rb_debug_rstring_null_ptr(const char *func
)
270 fprintf(stderr
, "%s is returning NULL!! "
271 "SIGSEGV is highly expected to follow immediately. "
272 "If you could reproduce, attach your debugger here, "
273 "and look at the passed string.",
277 /* symbols for [up|down|swap]case/capitalize options */
278 static VALUE sym_ascii
, sym_turkic
, sym_lithuanian
, sym_fold
;
/*
 * Resolve the encoding to use for str given its encoding index encidx.
 *
 * For ENCINDEX_UTF_16 and ENCINDEX_UTF_32 the leading bytes of the string
 * are inspected for a byte order mark:
 *   UTF-16: FE FF -> UTF-16BE, FF FE -> UTF-16LE
 *   UTF-32: 00 00 FE FF -> UTF-32BE, FF FE 00 00 -> UTF-32LE
 * When the string is long enough but no mark matches, the ASCII-8BIT
 * encoding is returned.  Strings shorter than the mark (2 or 4 bytes)
 * `break` out; every other index ends at rb_enc_from_index(encidx).
 * NOTE(review): the enclosing switch statement is not visible here;
 * presumably it switches on encidx -- confirm.
 */
281 get_actual_encoding(const int encidx
, VALUE str
)
283 const unsigned char *q
;
286 case ENCINDEX_UTF_16
:
287 if (RSTRING_LEN(str
) < 2) break;
288 q
= (const unsigned char *)RSTRING_PTR(str
);
/* big-endian byte order mark */
289 if (q
[0] == 0xFE && q
[1] == 0xFF) {
290 return rb_enc_get_from_index(ENCINDEX_UTF_16BE
);
/* little-endian byte order mark */
292 if (q
[0] == 0xFF && q
[1] == 0xFE) {
293 return rb_enc_get_from_index(ENCINDEX_UTF_16LE
);
295 return rb_ascii8bit_encoding();
296 case ENCINDEX_UTF_32
:
297 if (RSTRING_LEN(str
) < 4) break;
298 q
= (const unsigned char *)RSTRING_PTR(str
);
/* big-endian byte order mark */
299 if (q
[0] == 0 && q
[1] == 0 && q
[2] == 0xFE && q
[3] == 0xFF) {
300 return rb_enc_get_from_index(ENCINDEX_UTF_32BE
);
/* little-endian byte order mark */
302 if (q
[3] == 0 && q
[2] == 0 && q
[1] == 0xFE && q
[0] == 0xFF) {
303 return rb_enc_get_from_index(ENCINDEX_UTF_32LE
);
305 return rb_ascii8bit_encoding();
307 return rb_enc_from_index(encidx
);
/*
 * Encoding of str as resolved by get_actual_encoding(), using the index
 * stored in the string (ENCODING_GET).
 */
311 get_encoding(VALUE str
)
313 return get_actual_encoding(ENCODING_GET(str
), str
);
/*
 * Raises ArgumentError ("invalid byte sequence in <encoding name>") when
 * is_broken_string() reports str as broken; otherwise does nothing.
 */
317 mustnot_broken(VALUE str
)
319 if (is_broken_string(str
)) {
320 rb_raise(rb_eArgError
, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str
)));
/*
 * Raises ArgumentError ("wide char encoding: <encoding name>") when the
 * minimum character length of str's encoding is greater than one byte.
 */
325 mustnot_wchar(VALUE str
)
327 rb_encoding
*enc
= STR_ENC_GET(str
);
328 if (rb_enc_mbminlen(enc
) > 1) {
329 rb_raise(rb_eArgError
, "wide char encoding: %s", rb_enc_name(enc
));
333 static int fstring_cmp(VALUE a
, VALUE b
);
335 static VALUE
register_fstring(VALUE str
, bool copy
);
337 const struct st_hash_type rb_fstring_hash_type
= {
342 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
344 struct fstr_update_arg
{
350 fstr_update_callback(st_data_t
*key
, st_data_t
*value
, st_data_t data
, int existing
)
353 struct fstr_update_arg
*arg
= (struct fstr_update_arg
*)data
;
354 VALUE str
= (VALUE
)*key
;
357 /* because of lazy sweep, str may be unmarked already and swept
360 if (rb_objspace_garbage_object_p(str
)) {
369 if (FL_TEST_RAW(str
, STR_FAKESTR
)) {
371 VALUE new_str
= str_new(rb_cString
, RSTRING(str
)->as
.heap
.ptr
, RSTRING(str
)->as
.heap
.len
);
372 rb_enc_copy(new_str
, str
);
376 str
= str_new_static(rb_cString
, RSTRING(str
)->as
.heap
.ptr
,
377 RSTRING(str
)->as
.heap
.len
,
383 if (!OBJ_FROZEN(str
))
384 str
= str_new_frozen(rb_cString
, str
);
385 if (STR_SHARED_P(str
)) { /* str should not be shared */
386 /* shared substring */
387 str_make_independent(str
);
388 assert(OBJ_FROZEN(str
));
390 if (!BARE_STRING_P(str
)) {
391 str
= str_new_frozen(rb_cString
, str
);
394 RBASIC(str
)->flags
|= RSTRING_FSTR
;
396 *key
= *value
= arg
->fstr
= str
;
403 rb_fstring(VALUE str
)
408 Check_Type(str
, T_STRING
);
410 if (FL_TEST(str
, RSTRING_FSTR
))
413 bare
= BARE_STRING_P(str
);
415 if (STR_EMBED_P(str
)) {
419 if (FL_TEST_RAW(str
, STR_NOEMBED
|STR_SHARED_ROOT
|STR_SHARED
) == (STR_NOEMBED
|STR_SHARED_ROOT
)) {
420 assert(OBJ_FROZEN(str
));
425 if (!OBJ_FROZEN(str
))
426 rb_str_resize(str
, RSTRING_LEN(str
));
428 fstr
= register_fstring(str
, FALSE
);
431 str_replace_shared_without_enc(str
, fstr
);
439 register_fstring(VALUE str
, bool copy
)
441 struct fstr_update_arg args
;
446 st_table
*frozen_strings
= rb_vm_fstring_table();
449 st_update(frozen_strings
, (st_data_t
)str
, fstr_update_callback
, (st_data_t
)&args
);
450 } while (args
.fstr
== Qundef
);
454 assert(OBJ_FROZEN(args
.fstr
));
455 assert(!FL_TEST_RAW(args
.fstr
, STR_FAKESTR
));
456 assert(!FL_TEST_RAW(args
.fstr
, FL_EXIVAR
));
457 assert(RBASIC_CLASS(args
.fstr
) == rb_cString
);
/*
 * Initialise the caller-provided struct RString `fake_str` so that it
 * refers to the bytes at `name` (length `len`) without copying them, and
 * return it as a VALUE.
 *
 * The header is flagged T_STRING | RSTRING_NOEMBED | STR_NOFREE |
 * STR_FAKESTR, tagged with encoding index `encidx`, given class
 * rb_cString, and its heap ptr/len/capa are filled from name/len/len.
 * NOTE(review): the condition guarding the `len == 0` assertion is not
 * visible here -- presumably it applies only when name is NULL; confirm.
 */
462 setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, int encidx
)
464 fake_str
->basic
.flags
= T_STRING
|RSTRING_NOEMBED
|STR_NOFREE
|STR_FAKESTR
;
465 /* SHARED to be allocated by the callback */
468 RUBY_ASSERT_ALWAYS(len
== 0);
472 ENCODING_SET_INLINED((VALUE
)fake_str
, encidx
);
474 RBASIC_SET_CLASS_RAW((VALUE
)fake_str
, rb_cString
);
475 fake_str
->as
.heap
.len
= len
;
/* the const qualifier is cast away; the buffer is only borrowed */
476 fake_str
->as
.heap
.ptr
= (char *)name
;
477 fake_str
->as
.heap
.aux
.capa
= len
;
478 return (VALUE
)fake_str
;
482 * Set up a fake string which refers to a static string literal.
/*
 * Same as setup_fake_str(), but takes the encoding as an rb_encoding
 * pointer and converts it to an index with rb_enc_to_index().
 */
485 rb_setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, rb_encoding
*enc
)
487 return setup_fake_str(fake_str
, name
, len
, rb_enc_to_index(enc
));
491 * The rb_fstring_new and rb_fstring_cstr family creates or looks up a frozen
492 * shared string which refers to a static string literal. `ptr` must
493 * point to a constant string.
/*
 * Register the len bytes at ptr through register_fstring() and return the
 * result.  The lookup key is a fake string header built on the C stack with
 * setup_fake_str() and tagged ENCINDEX_US_ASCII; `copy` is passed as FALSE.
 */
495 MJIT_FUNC_EXPORTED VALUE
496 rb_fstring_new(const char *ptr
, long len
)
498 struct RString fake_str
;
499 return register_fstring(setup_fake_str(&fake_str
, ptr
, len
, ENCINDEX_US_ASCII
), FALSE
);
/*
 * Like rb_fstring_new(), but the stack-allocated fake string is tagged with
 * the caller-supplied encoding `enc` (via rb_setup_fake_str()).
 */
503 rb_fstring_enc_new(const char *ptr
, long len
, rb_encoding
*enc
)
505 struct RString fake_str
;
506 return register_fstring(rb_setup_fake_str(&fake_str
, ptr
, len
, enc
), FALSE
);
/*
 * rb_fstring_new() for a NUL-terminated C string; the length is taken with
 * strlen(), so ptr must not be NULL.
 */
510 rb_fstring_cstr(const char *ptr
)
512 return rb_fstring_new(ptr
, strlen(ptr
));
516 fstring_set_class_i(st_data_t key
, st_data_t val
, st_data_t arg
)
518 RBASIC_SET_CLASS((VALUE
)key
, (VALUE
)arg
);
/*
 * Comparison function with "0 means equal" semantics: the result is
 * non-zero when a and b differ in byte length, in encoding index
 * (ENCODING_GET) or in content (memcmp over alen bytes), and zero only
 * when all three match.
 */
523 fstring_cmp(VALUE a
, VALUE b
)
526 const char *aptr
, *bptr
;
527 RSTRING_GETMEM(a
, aptr
, alen
);
528 RSTRING_GETMEM(b
, bptr
, blen
);
/* memcmp is reached only when the lengths are already known to be equal */
529 return (alen
!= blen
||
530 ENCODING_GET(a
) != ENCODING_GET(b
) ||
531 memcmp(aptr
, bptr
, alen
) != 0);
535 single_byte_optimizable(VALUE str
)
539 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
540 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
)
543 enc
= STR_ENC_GET(str
);
544 if (rb_enc_mbmaxlen(enc
) == 1)
547 /* Conservative. Possibly single byte.
548 * "\xa1" in Shift_JIS for example. */
554 static inline const char *
555 search_nonascii(const char *p
, const char *e
)
557 const uintptr_t *s
, *t
;
559 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
560 # if SIZEOF_UINTPTR_T == 8
561 # define NONASCII_MASK UINT64_C(0x8080808080808080)
562 # elif SIZEOF_UINTPTR_T == 4
563 # define NONASCII_MASK UINT32_C(0x80808080)
565 # error "don't know what to do."
568 # if SIZEOF_UINTPTR_T == 8
569 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
570 # elif SIZEOF_UINTPTR_T == 4
571 # define NONASCII_MASK 0x80808080UL /* or...? */
573 # error "don't know what to do."
577 if (UNALIGNED_WORD_ACCESS
|| e
- p
>= SIZEOF_VOIDP
) {
578 #if !UNALIGNED_WORD_ACCESS
579 if ((uintptr_t)p
% SIZEOF_VOIDP
) {
580 int l
= SIZEOF_VOIDP
- (uintptr_t)p
% SIZEOF_VOIDP
;
583 default: UNREACHABLE
;
585 case 7: if (p
[-7]&0x80) return p
-7;
586 case 6: if (p
[-6]&0x80) return p
-6;
587 case 5: if (p
[-5]&0x80) return p
-5;
588 case 4: if (p
[-4]&0x80) return p
-4;
590 case 3: if (p
[-3]&0x80) return p
-3;
591 case 2: if (p
[-2]&0x80) return p
-2;
592 case 1: if (p
[-1]&0x80) return p
-1;
597 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
598 #define aligned_ptr(value) \
599 __builtin_assume_aligned((value), sizeof(uintptr_t))
601 #define aligned_ptr(value) (uintptr_t *)(value)
604 t
= (uintptr_t *)(e
- (SIZEOF_VOIDP
-1));
607 if (*s
& NONASCII_MASK
) {
608 #ifdef WORDS_BIGENDIAN
609 return (const char *)s
+ (nlz_intptr(*s
&NONASCII_MASK
)>>3);
611 return (const char *)s
+ (ntz_intptr(*s
&NONASCII_MASK
)>>3);
619 default: UNREACHABLE
;
621 case 7: if (e
[-7]&0x80) return e
-7;
622 case 6: if (e
[-6]&0x80) return e
-6;
623 case 5: if (e
[-5]&0x80) return e
-5;
624 case 4: if (e
[-4]&0x80) return e
-4;
626 case 3: if (e
[-3]&0x80) return e
-3;
627 case 2: if (e
[-2]&0x80) return e
-2;
628 case 1: if (e
[-1]&0x80) return e
-1;
/*
 * Classify the len bytes starting at p, interpreted in encoding enc, as
 * ENC_CODERANGE_7BIT, ENC_CODERANGE_VALID or ENC_CODERANGE_BROKEN.
 *
 *  - ASCII-8BIT: 7BIT when search_nonascii() finds nothing, VALID otherwise.
 *  - ASCII-compatible encodings: runs of ASCII are skipped with
 *    search_nonascii(); each non-ASCII character is checked with
 *    rb_enc_precise_mbclen() and a failure yields BROKEN.
 *  - Other encodings: every character is checked with
 *    rb_enc_precise_mbclen().
 * NOTE(review): the loop headers around the mbclen checks are not visible
 * here -- confirm the exact iteration conditions.
 */
634 coderange_scan(const char *p
, long len
, rb_encoding
*enc
)
636 const char *e
= p
+ len
;
638 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
639 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
640 p
= search_nonascii(p
, e
);
641 return p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
644 if (rb_enc_asciicompat(enc
)) {
645 p
= search_nonascii(p
, e
);
646 if (!p
) return ENC_CODERANGE_7BIT
;
648 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
649 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
650 p
+= MBCLEN_CHARFOUND_LEN(ret
);
/* skip the following run of ASCII bytes */
652 p
= search_nonascii(p
, e
);
658 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
659 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
660 p
+= MBCLEN_CHARFOUND_LEN(ret
);
663 return ENC_CODERANGE_VALID
;
667 rb_str_coderange_scan_restartable(const char *s
, const char *e
, rb_encoding
*enc
, int *cr
)
671 if (*cr
== ENC_CODERANGE_BROKEN
)
674 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
675 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
676 if (*cr
== ENC_CODERANGE_VALID
) return e
- s
;
677 p
= search_nonascii(p
, e
);
678 *cr
= p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
681 else if (rb_enc_asciicompat(enc
)) {
682 p
= search_nonascii(p
, e
);
684 if (*cr
!= ENC_CODERANGE_VALID
) *cr
= ENC_CODERANGE_7BIT
;
688 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
689 if (!MBCLEN_CHARFOUND_P(ret
)) {
690 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
693 p
+= MBCLEN_CHARFOUND_LEN(ret
);
695 p
= search_nonascii(p
, e
);
701 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
702 if (!MBCLEN_CHARFOUND_P(ret
)) {
703 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
706 p
+= MBCLEN_CHARFOUND_LEN(ret
);
709 *cr
= ENC_CODERANGE_VALID
;
/*
 * Give str1 the same encoding index as str2.  Only the index is copied
 * here; the code range is not touched by this call itself.
 */
714 str_enc_copy(VALUE str1
, VALUE str2
)
716 rb_enc_set_index(str1
, ENCODING_GET(str2
));
720 rb_enc_cr_str_copy_for_substr(VALUE dest
, VALUE src
)
722 /* this function is designed for copying encoding and coderange
723 * from src to new string "dest" which is made from the part of src.
725 str_enc_copy(dest
, src
);
726 if (RSTRING_LEN(dest
) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src
)))
728 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
730 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
733 switch (ENC_CODERANGE(src
)) {
734 case ENC_CODERANGE_7BIT
:
735 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
737 case ENC_CODERANGE_VALID
:
738 if (!rb_enc_asciicompat(STR_ENC_GET(src
)) ||
739 search_nonascii(RSTRING_PTR(dest
), RSTRING_END(dest
)))
740 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
742 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
/*
 * Copy both the encoding index and the cached code range from src to dest
 * verbatim, without rescanning dest's contents.
 */
750 rb_enc_cr_str_exact_copy(VALUE dest
, VALUE src
)
752 str_enc_copy(dest
, src
);
753 ENC_CODERANGE_SET(dest
, ENC_CODERANGE(src
));
/*
 * Scan str's bytes and return their code range under encoding enc (whose
 * index is encidx).
 *
 * Special case: when enc is a dummy encoding with a minimum character
 * length above 1 and get_actual_encoding() resolves it to an encoding whose
 * minimum character length is 1, the string is reported as
 * ENC_CODERANGE_BROKEN.  Note that `enc` is reassigned inside the condition,
 * so the final coderange_scan() uses the resolved encoding whenever the
 * first two tests passed.
 */
757 enc_coderange_scan(VALUE str
, rb_encoding
*enc
, int encidx
)
759 if (rb_enc_mbminlen(enc
) > 1 && rb_enc_dummy_p(enc
) &&
760 rb_enc_mbminlen(enc
= get_actual_encoding(encidx
, str
)) == 1) {
761 return ENC_CODERANGE_BROKEN
;
764 return coderange_scan(RSTRING_PTR(str
), RSTRING_LEN(str
), enc
);
/*
 * Scan str as if it were encoded in enc and return the resulting code
 * range.  Thin wrapper over enc_coderange_scan() that derives the index
 * from enc; nothing is stored back into str by this function itself.
 */
769 rb_enc_str_coderange_scan(VALUE str
, rb_encoding
*enc
)
771 return enc_coderange_scan(str
, enc
, rb_enc_to_index(enc
));
/*
 * Code range of str, computed lazily: when the cached value is
 * ENC_CODERANGE_UNKNOWN the string is scanned with enc_coderange_scan()
 * under its own encoding and the result is cached with ENC_CODERANGE_SET.
 */
775 rb_enc_str_coderange(VALUE str
)
777 int cr
= ENC_CODERANGE(str
);
779 if (cr
== ENC_CODERANGE_UNKNOWN
) {
780 int encidx
= ENCODING_GET(str
);
781 rb_encoding
*enc
= rb_enc_from_index(encidx
);
782 cr
= enc_coderange_scan(str
, enc
, encidx
);
783 ENC_CODERANGE_SET(str
, cr
);
789 rb_enc_str_asciionly_p(VALUE str
)
791 rb_encoding
*enc
= STR_ENC_GET(str
);
793 if (!rb_enc_asciicompat(enc
))
795 else if (rb_enc_str_coderange(str
) == ENC_CODERANGE_7BIT
)
/*
 * Guard against concurrent modification: raises RuntimeError
 * ("string modified") when s no longer has the buffer pointer p or the
 * length len that the caller captured earlier.
 */
801 str_mod_check(VALUE s
, const char *p
, long len
)
803 if (RSTRING_PTR(s
) != p
|| RSTRING_LEN(s
) != len
){
804 rb_raise(rb_eRuntimeError
, "string modified");
/*
 * Capacity of str in bytes, excluding a terminator of termlen bytes.
 *
 *  - embedded string: the embedded capacity minus termlen (two alternative
 *    computations are visible, str_embed_capa(str) - termlen and
 *    RSTRING_EMBED_LEN_MAX + 1 - termlen);
 *  - STR_SHARED or STR_NOFREE string: the current heap length, i.e. no
 *    spare room is reported;
 *  - any other heap string: as.heap.aux.capa.
 * NOTE(review): the condition selecting between the two embedded-case
 * returns is not visible here -- confirm.
 */
809 str_capacity(VALUE str
, const int termlen
)
811 if (STR_EMBED_P(str
)) {
813 return str_embed_capa(str
) - termlen
;
815 return (RSTRING_EMBED_LEN_MAX
+ 1 - termlen
);
818 else if (FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
819 return RSTRING(str
)->as
.heap
.len
;
822 return RSTRING(str
)->as
.heap
.aux
.capa
;
/*
 * Capacity of str as computed by str_capacity(), using the terminator
 * length of the string's own encoding (TERM_LEN).
 */
827 rb_str_capacity(VALUE str
)
829 return str_capacity(str
, TERM_LEN(str
));
833 must_not_null(const char *ptr
)
836 rb_raise(rb_eArgError
, "NULL pointer given");
841 str_alloc(VALUE klass
, size_t size
)
844 RVARGC_NEWOBJ_OF(str
, struct RString
, klass
,
845 T_STRING
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), size
);
/*
 * Allocate a string object of class klass sized to embed capa bytes
 * (str_embed_size(capa)) and return it via str_alloc().
 * The asserts check that the GC can allocate that size and that it does
 * not exceed sizeof(struct RString).
 * NOTE(review): the second assert may be conditionally compiled; the
 * surrounding preprocessor lines are not visible here -- confirm.
 */
850 str_alloc_embed(VALUE klass
, size_t capa
)
852 size_t size
= str_embed_size(capa
);
853 assert(rb_gc_size_allocatable_p(size
));
855 assert(size
<= sizeof(struct RString
));
857 return str_alloc(klass
, size
);
/*
 * Allocate a string object of class klass with the plain
 * sizeof(struct RString) size, for strings that keep their bytes in a
 * separate heap buffer.
 */
861 str_alloc_heap(VALUE klass
)
863 return str_alloc(klass
, sizeof(struct RString
));
/*
 * Allocate an empty embedded string of class klass.  Fires the STRING
 * creation DTrace hook with length 0, allocates with capacity 0 and then
 * zero-fills the whole embedded area reported by str_embed_capa().
 */
867 empty_str_alloc(VALUE klass
)
869 RUBY_DTRACE_CREATE_HOOK(STRING
, 0);
870 VALUE str
= str_alloc_embed(klass
, 0);
871 memset(RSTRING(str
)->as
.embed
.ary
, 0, str_embed_capa(str
));
/*
 * Core constructor: create a string of class klass holding a copy of the
 * len bytes at ptr, followed by a terminator of termlen zero bytes.
 *
 * Raises ArgumentError ("negative string size (or size too big)") for an
 * invalid length.  When len + termlen fits (STR_EMBEDDABLE_P) the string is
 * allocated embedded; otherwise a heap string is allocated with capa = len
 * and a buffer of len + termlen bytes obtained through the
 * overflow-checked rb_xmalloc_mul_add_mul().
 * NOTE(review): the conditions guarding the raise, the 7BIT code-range
 * assignment and the memcpy are not visible here -- presumably `len < 0`,
 * `len == 0` and `ptr != NULL` respectively; confirm.
 */
876 str_new0(VALUE klass
, const char *ptr
, long len
, int termlen
)
881 rb_raise(rb_eArgError
, "negative string size (or size too big)");
884 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
886 if (STR_EMBEDDABLE_P(len
, termlen
)) {
887 str
= str_alloc_embed(klass
, len
+ termlen
);
889 ENC_CODERANGE_SET(str
, ENC_CODERANGE_7BIT
);
893 str
= str_alloc_heap(klass
);
894 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
895 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
896 * integer overflow. If we can STATIC_ASSERT that, the following
897 * mul_add_mul can be reverted to a simple ALLOC_N. */
898 RSTRING(str
)->as
.heap
.ptr
=
899 rb_xmalloc_mul_add_mul(sizeof(char), len
, sizeof(char), termlen
);
900 STR_SET_NOEMBED(str
);
903 memcpy(RSTRING_PTR(str
), ptr
, len
);
905 STR_SET_LEN(str
, len
);
/* terminate right after the copied bytes */
906 TERM_FILL(RSTRING_PTR(str
) + len
, termlen
);
/*
 * str_new0() with a one-byte terminator.
 */
911 str_new(VALUE klass
, const char *ptr
, long len
)
913 return str_new0(klass
, ptr
, len
, 1);
/*
 * Create a new rb_cString instance from the len bytes at ptr (see
 * str_new()).  No encoding is assigned by this function itself.
 */
917 rb_str_new(const char *ptr
, long len
)
919 return str_new(rb_cString
, ptr
, len
);
/*
 * rb_str_new() followed by tagging the result as US-ASCII with a code
 * range of ENC_CODERANGE_7BIT.  The bytes are not scanned, so the caller
 * is trusted to pass 7-bit data.
 */
923 rb_usascii_str_new(const char *ptr
, long len
)
925 VALUE str
= rb_str_new(ptr
, len
);
926 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
/*
 * Create a new rb_cString instance from the len bytes at ptr and associate
 * it with the UTF-8 encoding index.
 */
931 rb_utf8_str_new(const char *ptr
, long len
)
933 VALUE str
= str_new(rb_cString
, ptr
, len
);
934 rb_enc_associate_index(str
, rb_utf8_encindex());
/*
 * Create a new rb_cString instance from the len bytes at ptr in encoding
 * enc.  A NULL enc falls back to rb_str_new().  Otherwise the terminator
 * width is the encoding's minimum character length (rb_enc_mbminlen) and
 * the string is associated with enc.
 */
939 rb_enc_str_new(const char *ptr
, long len
, rb_encoding
*enc
)
943 if (!enc
) return rb_str_new(ptr
, len
);
945 str
= str_new0(rb_cString
, ptr
, len
, rb_enc_mbminlen(enc
));
946 rb_enc_associate(str
, enc
);
951 rb_str_new_cstr(const char *ptr
)
954 /* rb_str_new_cstr() can take pointer from non-malloc-generated
955 * memory regions, and that cannot be detected by the MSAN. Just
956 * trust the programmer that the argument passed here is a sane C
958 __msan_unpoison_string(ptr
);
959 return rb_str_new(ptr
, strlen(ptr
));
963 rb_usascii_str_new_cstr(const char *ptr
)
965 VALUE str
= rb_str_new_cstr(ptr
);
966 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
971 rb_utf8_str_new_cstr(const char *ptr
)
973 VALUE str
= rb_str_new_cstr(ptr
);
974 rb_enc_associate_index(str
, rb_utf8_encindex());
/*
 * rb_enc_str_new() for a NUL-terminated C string.  Because the length is
 * taken with strlen(), encodings whose minimum character length is not 1
 * are rejected with ArgumentError ("wchar encoding given").
 */
979 rb_enc_str_new_cstr(const char *ptr
, rb_encoding
*enc
)
982 if (rb_enc_mbminlen(enc
) != 1) {
983 rb_raise(rb_eArgError
, "wchar encoding given");
985 return rb_enc_str_new(ptr
, strlen(ptr
), enc
);
/*
 * Create a string of class klass that refers to the caller's buffer
 * (ptr, len) instead of copying it, and associate it with encoding index
 * encindex.
 *
 * Two construction paths are visible: one builds an ordinary string with
 * str_new0() using the encoding's minimum character length as terminator
 * width; the other allocates a heap string whose ptr/len/capa point at the
 * caller's buffer and flags it STR_NOFREE so the buffer is never freed
 * with the string.  An invalid length raises ArgumentError.
 * NOTE(review): the conditions guarding the raise and selecting between
 * the two paths are not visible here -- presumably `len < 0` and
 * `ptr == NULL`; confirm.
 */
989 str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
)
994 rb_raise(rb_eArgError
, "negative string size (or size too big)");
998 rb_encoding
*enc
= rb_enc_get_from_index(encindex
);
999 str
= str_new0(klass
, ptr
, len
, rb_enc_mbminlen(enc
));
1002 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
1003 str
= str_alloc_heap(klass
);
1004 RSTRING(str
)->as
.heap
.len
= len
;
/* the const qualifier is cast away; the buffer is borrowed, not owned */
1005 RSTRING(str
)->as
.heap
.ptr
= (char *)ptr
;
1006 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
1007 STR_SET_NOEMBED(str
);
1008 RBASIC(str
)->flags
|= STR_NOFREE
;
1010 rb_enc_associate_index(str
, encindex
);
/*
 * str_new_static() for class rb_cString with encoding index 0.
 * NOTE(review): index 0 is presumably ASCII-8BIT -- confirm.
 */
1015 rb_str_new_static(const char *ptr
, long len
)
1017 return str_new_static(rb_cString
, ptr
, len
, 0);
/*
 * str_new_static() for class rb_cString with ENCINDEX_US_ASCII.
 */
1021 rb_usascii_str_new_static(const char *ptr
, long len
)
1023 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_US_ASCII
);
/*
 * str_new_static() for class rb_cString with ENCINDEX_UTF_8.
 */
1027 rb_utf8_str_new_static(const char *ptr
, long len
)
1029 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_UTF_8
);
/*
 * str_new_static() for class rb_cString with the index of the
 * caller-supplied encoding enc.
 */
1033 rb_enc_str_new_static(const char *ptr
, long len
, rb_encoding
*enc
)
1035 return str_new_static(rb_cString
, ptr
, len
, rb_enc_to_index(enc
));
1038 static VALUE
str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1039 rb_encoding
*from
, rb_encoding
*to
,
1040 int ecflags
, VALUE ecopts
);
/*
 * Tells whether str consists only of 7-bit characters when interpreted in
 * encoding enc.  If enc is the string's own encoding the (possibly cached)
 * is_ascii_string() answer is used; otherwise the bytes are scanned under
 * enc and compared against ENC_CODERANGE_7BIT without caching.
 */
1043 is_enc_ascii_string(VALUE str
, rb_encoding
*enc
)
1045 int encidx
= rb_enc_to_index(enc
);
1046 if (rb_enc_get_index(str
) == encidx
)
1047 return is_ascii_string(str
);
1048 return enc_coderange_scan(str
, enc
, encidx
) == ENC_CODERANGE_7BIT
;
1052 rb_str_conv_enc_opts(VALUE str
, rb_encoding
*from
, rb_encoding
*to
, int ecflags
, VALUE ecopts
)
1058 if (!to
) return str
;
1059 if (!from
) from
= rb_enc_get(str
);
1060 if (from
== to
) return str
;
1061 if ((rb_enc_asciicompat(to
) && is_enc_ascii_string(str
, from
)) ||
1062 to
== rb_ascii8bit_encoding()) {
1063 if (STR_ENC_GET(str
) != to
) {
1064 str
= rb_str_dup(str
);
1065 rb_enc_associate(str
, to
);
1070 RSTRING_GETMEM(str
, ptr
, len
);
1071 newstr
= str_cat_conv_enc_opts(rb_str_buf_new(len
), 0, ptr
, len
,
1072 from
, to
, ecflags
, ecopts
);
1073 if (NIL_P(newstr
)) {
1074 /* some error, return original */
1081 rb_str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1082 rb_encoding
*from
, int ecflags
, VALUE ecopts
)
1086 olen
= RSTRING_LEN(newstr
);
1087 if (ofs
< -olen
|| olen
< ofs
)
1088 rb_raise(rb_eIndexError
, "index %ld out of string", ofs
);
1089 if (ofs
< 0) ofs
+= olen
;
1091 STR_SET_LEN(newstr
, ofs
);
1092 return rb_str_cat(newstr
, ptr
, len
);
1095 rb_str_modify(newstr
);
1096 return str_cat_conv_enc_opts(newstr
, ofs
, ptr
, len
, from
,
1102 rb_str_initialize(VALUE str
, const char *ptr
, long len
, rb_encoding
*enc
)
1104 STR_SET_LEN(str
, 0);
1105 rb_enc_associate(str
, enc
);
1106 rb_str_cat(str
, ptr
, len
);
1111 str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1112 rb_encoding
*from
, rb_encoding
*to
,
1113 int ecflags
, VALUE ecopts
)
1116 rb_econv_result_t ret
;
1118 VALUE econv_wrapper
;
1119 const unsigned char *start
, *sp
;
1120 unsigned char *dest
, *dp
;
1121 size_t converted_output
= (size_t)ofs
;
1123 olen
= rb_str_capacity(newstr
);
1125 econv_wrapper
= rb_obj_alloc(rb_cEncodingConverter
);
1126 RBASIC_CLEAR_CLASS(econv_wrapper
);
1127 ec
= rb_econv_open_opts(from
->name
, to
->name
, ecflags
, ecopts
);
1128 if (!ec
) return Qnil
;
1129 DATA_PTR(econv_wrapper
) = ec
;
1131 sp
= (unsigned char*)ptr
;
1133 while ((dest
= (unsigned char*)RSTRING_PTR(newstr
)),
1134 (dp
= dest
+ converted_output
),
1135 (ret
= rb_econv_convert(ec
, &sp
, start
+ len
, &dp
, dest
+ olen
, 0)),
1136 ret
== econv_destination_buffer_full
) {
1137 /* destination buffer short */
1138 size_t converted_input
= sp
- start
;
1139 size_t rest
= len
- converted_input
;
1140 converted_output
= dp
- dest
;
1141 rb_str_set_len(newstr
, converted_output
);
1142 if (converted_input
&& converted_output
&&
1143 rest
< (LONG_MAX
/ converted_output
)) {
1144 rest
= (rest
* converted_output
) / converted_input
;
1149 olen
+= rest
< 2 ? 2 : rest
;
1150 rb_str_resize(newstr
, olen
);
1152 DATA_PTR(econv_wrapper
) = 0;
1155 case econv_finished
:
1156 len
= dp
- (unsigned char*)RSTRING_PTR(newstr
);
1157 rb_str_set_len(newstr
, len
);
1158 rb_enc_associate(newstr
, to
);
/*
 * rb_str_conv_enc_opts() with no converter flags (0) and no options (Qnil).
 */
1167 rb_str_conv_enc(VALUE str
, rb_encoding
*from
, rb_encoding
*to
)
1169 return rb_str_conv_enc_opts(str
, from
, to
, 0, Qnil
);
1173 rb_external_str_new_with_enc(const char *ptr
, long len
, rb_encoding
*eenc
)
1177 const int eidx
= rb_enc_to_index(eenc
);
1180 return rb_enc_str_new(ptr
, len
, eenc
);
1183 /* ASCII-8BIT case, no conversion */
1184 if ((eidx
== rb_ascii8bit_encindex()) ||
1185 (eidx
== rb_usascii_encindex() && search_nonascii(ptr
, ptr
+ len
))) {
1186 return rb_str_new(ptr
, len
);
1188 /* no default_internal or same encoding, no conversion */
1189 ienc
= rb_default_internal_encoding();
1190 if (!ienc
|| eenc
== ienc
) {
1191 return rb_enc_str_new(ptr
, len
, eenc
);
1193 /* ASCII compatible, and ASCII only string, no conversion in
1194 * default_internal */
1195 if ((eidx
== rb_ascii8bit_encindex()) ||
1196 (eidx
== rb_usascii_encindex()) ||
1197 (rb_enc_asciicompat(eenc
) && !search_nonascii(ptr
, ptr
+ len
))) {
1198 return rb_enc_str_new(ptr
, len
, ienc
);
1200 /* convert from the given encoding to default_internal */
1201 str
= rb_enc_str_new(NULL
, 0, ienc
);
1202 /* when the conversion failed for some reason, just ignore the
1203 * default_internal and result in the given encoding as-is. */
1204 if (NIL_P(rb_str_cat_conv_enc_opts(str
, 0, ptr
, len
, eenc
, 0, Qnil
))) {
1205 rb_str_initialize(str
, ptr
, len
, eenc
);
/*
 * Tag an externally obtained string with encoding eenc and convert it to
 * the default internal encoding via rb_str_conv_enc().
 *
 * Special case: when eenc is US-ASCII but the string's code range is not
 * ENC_CODERANGE_7BIT, the string is associated with ASCII-8BIT instead.
 * NOTE(review): the statement that follows the ASCII-8BIT association is
 * not visible here -- presumably the function returns str at that point;
 * confirm.
 */
1211 rb_external_str_with_enc(VALUE str
, rb_encoding
*eenc
)
1213 int eidx
= rb_enc_to_index(eenc
);
1214 if (eidx
== rb_usascii_encindex() &&
1215 rb_enc_str_coderange(str
) != ENC_CODERANGE_7BIT
) {
1216 rb_enc_associate_index(str
, rb_ascii8bit_encindex());
1219 rb_enc_associate_index(str
, eidx
);
1220 return rb_str_conv_enc(str
, eenc
, rb_default_internal_encoding());
/*
 * rb_external_str_new_with_enc() using the default external encoding.
 */
1224 rb_external_str_new(const char *ptr
, long len
)
1226 return rb_external_str_new_with_enc(ptr
, len
, rb_default_external_encoding());
/*
 * rb_external_str_new_with_enc() for a NUL-terminated C string (length via
 * strlen()) using the default external encoding.
 */
1230 rb_external_str_new_cstr(const char *ptr
)
1232 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_default_external_encoding());
/*
 * rb_external_str_new_with_enc() using the locale encoding.
 */
1236 rb_locale_str_new(const char *ptr
, long len
)
1238 return rb_external_str_new_with_enc(ptr
, len
, rb_locale_encoding());
/*
 * rb_external_str_new_with_enc() for a NUL-terminated C string (length via
 * strlen()) using the locale encoding.
 */
1242 rb_locale_str_new_cstr(const char *ptr
)
1244 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_locale_encoding());
/*
 * rb_external_str_new_with_enc() using the filesystem encoding.
 */
1248 rb_filesystem_str_new(const char *ptr
, long len
)
1250 return rb_external_str_new_with_enc(ptr
, len
, rb_filesystem_encoding());
/*
 * rb_external_str_new_with_enc() for a NUL-terminated C string (length via
 * strlen()) using the filesystem encoding.
 */
1254 rb_filesystem_str_new_cstr(const char *ptr
)
1256 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_filesystem_encoding());
/*
 * rb_str_export_to_enc() targeting the default external encoding.
 */
1260 rb_str_export(VALUE str
)
1262 return rb_str_export_to_enc(str
, rb_default_external_encoding());
/*
 * rb_str_export_to_enc() targeting the locale encoding.
 */
1266 rb_str_export_locale(VALUE str
)
1268 return rb_str_export_to_enc(str
, rb_locale_encoding());
/*
 * Convert str from its own encoding (STR_ENC_GET) to enc with
 * rb_str_conv_enc() and return the result.
 */
1272 rb_str_export_to_enc(VALUE str
, rb_encoding
*enc
)
1274 return rb_str_conv_enc(str
, STR_ENC_GET(str
), enc
);
1278 str_replace_shared_without_enc(VALUE str2
, VALUE str
)
1280 const int termlen
= TERM_LEN(str
);
1284 RSTRING_GETMEM(str
, ptr
, len
);
1285 if (str_embed_capa(str2
) >= len
+ termlen
) {
1286 char *ptr2
= RSTRING(str2
)->as
.embed
.ary
;
1287 STR_SET_EMBED(str2
);
1288 memcpy(ptr2
, RSTRING_PTR(str
), len
);
1289 STR_SET_EMBED_LEN(str2
, len
);
1290 TERM_FILL(ptr2
+len
, termlen
);
1294 if (STR_SHARED_P(str
)) {
1295 root
= RSTRING(str
)->as
.heap
.aux
.shared
;
1296 RSTRING_GETMEM(str
, ptr
, len
);
1299 root
= rb_str_new_frozen(str
);
1300 RSTRING_GETMEM(root
, ptr
, len
);
1302 assert(OBJ_FROZEN(root
));
1303 if (!STR_EMBED_P(str2
) && !FL_TEST_RAW(str2
, STR_SHARED
|STR_NOFREE
)) {
1304 if (FL_TEST_RAW(str2
, STR_SHARED_ROOT
)) {
1305 rb_fatal("about to free a possible shared root");
1307 char *ptr2
= STR_HEAP_PTR(str2
);
1309 ruby_sized_xfree(ptr2
, STR_HEAP_SIZE(str2
));
1312 FL_SET(str2
, STR_NOEMBED
);
1313 RSTRING(str2
)->as
.heap
.len
= len
;
1314 RSTRING(str2
)->as
.heap
.ptr
= ptr
;
1315 STR_SET_SHARED(str2
, root
);
/*
 * Make str2 take over str's contents through
 * str_replace_shared_without_enc(), then copy str's encoding index and
 * code range onto str2 exactly.
 */
1321 str_replace_shared(VALUE str2
, VALUE str
)
1323 str_replace_shared_without_enc(str2
, str
);
1324 rb_enc_cr_str_exact_copy(str2
, str
);
/*
 * Allocate a fresh heap string of class klass and fill it from str with
 * str_replace_shared().
 */
1329 str_new_shared(VALUE klass
, VALUE str
)
1331 return str_replace_shared(str_alloc_heap(klass
), str
);
/*
 * str_new_shared() using str's own class (rb_obj_class).
 */
1335 rb_str_new_shared(VALUE str
)
1337 return str_new_shared(rb_obj_class(str
), str
);
/*
 * Return a frozen string with orig's contents.  An already frozen orig is
 * returned as is; otherwise a new frozen string of orig's class is built
 * with str_new_frozen().
 */
1341 rb_str_new_frozen(VALUE orig
)
1343 if (OBJ_FROZEN(orig
)) return orig
;
1344 return str_new_frozen(rb_obj_class(orig
), orig
);
/*
 * Like rb_str_new_frozen(), but the result is always of class rb_cString:
 * orig is returned unchanged only when it is frozen and its class is
 * exactly rb_cString; otherwise a new frozen rb_cString is built.
 */
1348 rb_str_new_frozen_String(VALUE orig
)
1350 if (OBJ_FROZEN(orig
) && rb_obj_class(orig
) == rb_cString
) return orig
;
1351 return str_new_frozen(rb_cString
, orig
);
/*
 * Obtain a temporary frozen view of orig.  A frozen orig is returned as is;
 * otherwise str_new_frozen_buffer() is called with class 0 (a hidden
 * object) and copy_encoding FALSE.
 */
1355 rb_str_tmp_frozen_acquire(VALUE orig
)
1357 if (OBJ_FROZEN_RAW(orig
)) return orig
;
1358 return str_new_frozen_buffer(0, orig
, FALSE
);
/* Releases a tmp string previously obtained for orig.
 *
 * Visible logic:
 *  - a first test checks whether tmp has a class (RBASIC_CLASS(tmp) != 0);
 *    NOTE(review): the body of that `if` is not visible in this listing;
 *  - an embedded tmp is only asserted to be frozen;
 *  - otherwise, when orig is STR_SHARED and is neither STR_TMPLOCK nor
 *    frozen, and its shared root is tmp with STR_BORROWED unset, orig is
 *    unshared: it takes tmp's capa and STR_NOFREE flag, and tmp is reset
 *    to an embedded string of length 0. */
1362 rb_str_tmp_frozen_release(VALUE orig
, VALUE tmp
)
1364 if (RBASIC_CLASS(tmp
) != 0)
1367 if (STR_EMBED_P(tmp
)) {
1368 assert(OBJ_FROZEN_RAW(tmp
));
1370 else if (FL_TEST_RAW(orig
, STR_SHARED
) &&
1371 !FL_TEST_RAW(orig
, STR_TMPLOCK
|RUBY_FL_FREEZE
)) {
1372 VALUE shared
= RSTRING(orig
)->as
.heap
.aux
.shared
;
1374 if (shared
== tmp
&& !FL_TEST_RAW(tmp
, STR_BORROWED
)) {
1375 assert(RSTRING(orig
)->as
.heap
.ptr
== RSTRING(tmp
)->as
.heap
.ptr
);
1376 assert(RSTRING(orig
)->as
.heap
.len
== RSTRING(tmp
)->as
.heap
.len
);
1378 /* Unshare orig since the root (tmp) only has this one child. */
1379 FL_UNSET_RAW(orig
, STR_SHARED
);
1380 RSTRING(orig
)->as
.heap
.aux
.capa
= RSTRING(tmp
)->as
.heap
.aux
.capa
;
1381 RBASIC(orig
)->flags
|= RBASIC(tmp
)->flags
& STR_NOFREE
;
1382 assert(OBJ_FROZEN_RAW(tmp
));
1384 /* Make tmp embedded and empty so it is safe for sweeping. */
1386 STR_SET_EMBED_LEN(tmp
, 0);
/* Convenience wrapper: str_new_frozen_buffer(klass, orig, TRUE), i.e. with
 * copy_encoding enabled. */
1392 str_new_frozen(VALUE klass
, VALUE orig
)
1394 return str_new_frozen_buffer(klass
, orig
, TRUE
);
/* Turns orig (asserted to be neither embedded nor already shared) into a
 * sharing child of a newly allocated heap string `str` of class klass:
 *  - str receives orig's length, pointer, capa and STR_NOFREE flag;
 *  - STR_NOFREE is cleared on orig;
 *  - orig is marked as shared with str as its root (STR_SET_SHARED);
 *  - STR_BORROWED is cleared on str.
 * NOTE(review): the return statement is not visible in this listing. */
1398 heap_str_make_shared(VALUE klass
, VALUE orig
)
1400 assert(!STR_EMBED_P(orig
));
1401 assert(!STR_SHARED_P(orig
));
1403 VALUE str
= str_alloc_heap(klass
);
1404 STR_SET_NOEMBED(str
);
1405 RSTRING(str
)->as
.heap
.len
= RSTRING_LEN(orig
);
1406 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(orig
);
1407 RSTRING(str
)->as
.heap
.aux
.capa
= RSTRING(orig
)->as
.heap
.aux
.capa
;
1408 RBASIC(str
)->flags
|= RBASIC(orig
)->flags
& STR_NOFREE
;
1409 RBASIC(orig
)->flags
&= ~STR_NOFREE
;
1410 STR_SET_SHARED(orig
, str
);
1412 FL_UNSET_RAW(str
, STR_BORROWED
);
/* Builds the string used as the frozen counterpart of orig, with class klass.
 *
 * Visible branches:
 *  - orig is embedded, or its length is embeddable with a 1-byte terminator:
 *    the bytes are copied into a new string via str_new();
 *  - orig is STR_SHARED: when orig covers only part of its shared root
 *    (ofs > 0 or rest > 0), or the class or encoding differ, a new string
 *    sharing the root is created and its ptr/len are narrowed to orig's
 *    window; line 1449 marks a class-less root as STR_BORROWED;
 *  - orig is embeddable with its own TERM_LEN: copied into a new embedded
 *    string and terminated with TERM_FILL;
 *  - otherwise heap_str_make_shared() makes orig share a new root.
 * When copy_encoding is nonzero, rb_enc_cr_str_exact_copy(str, orig) is
 * applied to the result.
 * NOTE(review): `else` lines, the declaration of `str` and the final return
 * are not visible in this listing, so the exact nesting is inferred. */
1417 str_new_frozen_buffer(VALUE klass
, VALUE orig
, int copy_encoding
)
1421 long len
= RSTRING_LEN(orig
);
1423 if (STR_EMBED_P(orig
) || STR_EMBEDDABLE_P(len
, 1)) {
1424 str
= str_new(klass
, RSTRING_PTR(orig
), len
);
1425 assert(STR_EMBED_P(str
));
1428 if (FL_TEST_RAW(orig
, STR_SHARED
)) {
1429 VALUE shared
= RSTRING(orig
)->as
.heap
.aux
.shared
;
1430 long ofs
= RSTRING(orig
)->as
.heap
.ptr
- RSTRING_PTR(shared
);
1431 long rest
= RSTRING_LEN(shared
) - ofs
- RSTRING(orig
)->as
.heap
.len
;
1434 assert(ofs
+ rest
<= RSTRING_LEN(shared
));
1436 assert(!STR_EMBED_P(shared
));
1438 assert(OBJ_FROZEN(shared
));
1440 if ((ofs
> 0) || (rest
> 0) ||
1441 (klass
!= RBASIC(shared
)->klass
) ||
1442 ENCODING_GET(shared
) != ENCODING_GET(orig
)) {
1443 str
= str_new_shared(klass
, shared
);
1444 assert(!STR_EMBED_P(str
));
1445 RSTRING(str
)->as
.heap
.ptr
+= ofs
;
1446 RSTRING(str
)->as
.heap
.len
-= ofs
+ rest
;
1449 if (RBASIC_CLASS(shared
) == 0)
1450 FL_SET_RAW(shared
, STR_BORROWED
);
1454 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig
), TERM_LEN(orig
))) {
1455 str
= str_alloc_embed(klass
, RSTRING_LEN(orig
) + TERM_LEN(orig
));
1457 memcpy(RSTRING_PTR(str
), RSTRING_PTR(orig
), RSTRING_LEN(orig
));
1458 STR_SET_EMBED_LEN(str
, RSTRING_LEN(orig
));
1459 TERM_FILL(RSTRING_END(str
), TERM_LEN(orig
));
1462 str
= heap_str_make_shared(klass
, orig
);
1466 if (copy_encoding
) rb_enc_cr_str_exact_copy(str
, orig
);
/* Creates a string from ptr/len through str_new0(), using obj's class
 * (rb_obj_class) and obj's terminator length (TERM_LEN(obj)). */
1472 rb_str_new_with_class(VALUE obj
, const char *ptr
, long len
)
1474 return str_new0(rb_obj_class(obj
), ptr
, len
, TERM_LEN(obj
));
/* Creates an empty string with rb_str_new(0, 0) and applies
 * rb_enc_copy(v, str) to it so it takes str's encoding.
 * NOTE(review): the return statement is not visible in this listing. */
1478 str_new_empty_String(VALUE str
)
1480 VALUE v
= rb_str_new(0, 0);
1481 rb_enc_copy(v
, str
);
/* Minimum buffer capacity applied by rb_str_buf_new() and rb_str_init().
 * The static assertion below requires it to exceed RSTRING_EMBED_LEN_MAX.
 * NOTE(review): the assertion may sit inside a preprocessor conditional
 * that is not visible in this listing. */
1485 #define STR_BUF_MIN_SIZE 63
1487 STATIC_ASSERT(STR_BUF_MIN_SIZE
, STR_BUF_MIN_SIZE
> RSTRING_EMBED_LEN_MAX
);
/* Creates an rb_cString with room for capa bytes.
 *
 * When STR_EMBEDDABLE_P(capa, 1) holds, an embedded string of capa + 1 bytes
 * is returned directly. Otherwise a heap string is allocated, capa is raised
 * to STR_BUF_MIN_SIZE if smaller, and a buffer of capa + 1 chars is
 * allocated whose first byte is set to NUL.
 * NOTE(review): the clamp may be guarded by a preprocessor conditional, and
 * the final return is not visible in this listing. */
1491 rb_str_buf_new(long capa
)
1493 if (STR_EMBEDDABLE_P(capa
, 1)) {
1494 return str_alloc_embed(rb_cString
, capa
+ 1);
1497 VALUE str
= str_alloc_heap(rb_cString
);
1500 if (capa
< STR_BUF_MIN_SIZE
) {
1501 capa
= STR_BUF_MIN_SIZE
;
1504 FL_SET(str
, STR_NOEMBED
);
1505 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
1506 RSTRING(str
)->as
.heap
.ptr
= ALLOC_N(char, (size_t)capa
+ 1);
1507 RSTRING(str
)->as
.heap
.ptr
[0] = '\0';
/* Creates a buffer string sized by strlen(ptr) with rb_str_buf_new() and
 * appends ptr's bytes to it with rb_str_buf_cat().
 * NOTE(review): the declaration of `str` and the return are not visible. */
1513 rb_str_buf_new_cstr(const char *ptr
)
1516 long len
= strlen(ptr
);
1518 str
= rb_str_buf_new(len
);
1519 rb_str_buf_cat(str
, ptr
, len
);
/* Returns str_new(0, 0, len): class argument 0, no source pointer, and
 * length len. */
1525 rb_str_tmp_new(long len
)
1527 return str_new(0, 0, len
);
/* Releases the resources held by str.
 *
 *  - An RSTRING_FSTR string is first removed from the VM fstring table.
 *  - Embedded strings own no separate buffer; only a debug counter is bumped.
 *  - STR_SHARED / STR_NOFREE strings do not own their buffer, so nothing is
 *    freed; each case is counted under its own debug counter
 *    (obj_str_shared and obj_str_nofree respectively -- previously both
 *    cases were counted as obj_str_shared, so nofree strings were
 *    misreported).
 *  - Any other heap string frees its buffer with ruby_sized_xfree().
 * NOTE(review): lines without indexed identifiers (e.g. the final `else`)
 * are not visible in this listing. */
1531 rb_str_free(VALUE str
)
1533 if (FL_TEST(str
, RSTRING_FSTR
)) {
1534 st_data_t fstr
= (st_data_t
)str
;
1538 st_delete(rb_vm_fstring_table(), &fstr
, NULL
);
1539 RB_DEBUG_COUNTER_INC(obj_str_fstr
);
1544 if (STR_EMBED_P(str
)) {
1545 RB_DEBUG_COUNTER_INC(obj_str_embed
);
1547 else if (FL_TEST(str
, STR_SHARED
| STR_NOFREE
)) {
1548 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared
, FL_TEST(str
, STR_SHARED
));
1549 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_nofree
, FL_TEST(str
, STR_NOFREE
));
1552 RB_DEBUG_COUNTER_INC(obj_str_ptr
);
1553 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
/* Reports the extra memory owned by str: STR_HEAP_SIZE(str) when, of the
 * flags STR_NOEMBED, STR_SHARED and STR_NOFREE, exactly STR_NOEMBED is set
 * (a heap string that owns its buffer).
 * NOTE(review): the fall-through return is not visible in this listing. */
1557 RUBY_FUNC_EXPORTED
size_t
1558 rb_str_memsize(VALUE str
)
1560 if (FL_TEST(str
, STR_NOEMBED
|STR_SHARED
|STR_NOFREE
) == STR_NOEMBED
) {
1561 return STR_HEAP_SIZE(str
);
/* Converts str to a T_STRING through rb_convert_type_with_id(), using the
 * method id idTo_str and the type name "String" for error reporting. */
1569 rb_str_to_str(VALUE str
)
1571 return rb_convert_type_with_id(str
, T_STRING
, "String", idTo_str
);
/* Forward declarations of file-local helpers used before their definitions. */
1574 static inline void str_discard(VALUE str
);
1575 static void str_shared_replace(VALUE str
, VALUE str2
);
/* Calls str_shared_replace(str, str2) unless both arguments are the same
 * object, in which case nothing is done. */
1578 rb_str_shared_replace(VALUE str
, VALUE str2
)
1580 if (str
!= str2
) str_shared_replace(str
, str2
);
/* Moves str2's contents into str (str != str2 is asserted), carrying over
 * str2's encoding and code range.
 *
 * Visible logic:
 *  - if str's embed capacity can hold str2's bytes plus terminator, they are
 *    copied into str's embedded storage;
 *  - otherwise, if str2 is embedded, its bytes are first copied into a newly
 *    allocated heap buffer and str2 is switched to the heap layout; then str
 *    takes str2's ptr/len, and either str2's shared root (when str2 is
 *    STR_SHARED) or str2's capa;
 *  - str2 is finally reset to an empty embedded string.
 * NOTE(review): some lines (declarations, `else`, a probable str_discard()
 * call, returns) are not visible in this listing. */
1584 str_shared_replace(VALUE str
, VALUE str2
)
1590 RUBY_ASSERT(str2
!= str
);
1591 enc
= STR_ENC_GET(str2
);
1592 cr
= ENC_CODERANGE(str2
);
1594 termlen
= rb_enc_mbminlen(enc
);
1596 if (str_embed_capa(str
) >= RSTRING_LEN(str2
) + termlen
) {
1598 memcpy(RSTRING_PTR(str
), RSTRING_PTR(str2
), (size_t)RSTRING_LEN(str2
) + termlen
);
1599 STR_SET_EMBED_LEN(str
, RSTRING_LEN(str2
));
1600 rb_enc_associate(str
, enc
);
1601 ENC_CODERANGE_SET(str
, cr
);
1605 if (STR_EMBED_P(str2
)) {
1606 assert(!FL_TEST(str2
, STR_SHARED
));
1607 long len
= RSTRING(str2
)->as
.embed
.len
;
1608 assert(len
+ termlen
<= str_embed_capa(str2
));
1610 char *new_ptr
= ALLOC_N(char, len
+ termlen
);
1611 memcpy(new_ptr
, RSTRING(str2
)->as
.embed
.ary
, len
+ termlen
);
1612 RSTRING(str2
)->as
.heap
.ptr
= new_ptr
;
1613 RSTRING(str2
)->as
.heap
.len
= len
;
1614 RSTRING(str2
)->as
.heap
.aux
.capa
= len
;
1615 STR_SET_NOEMBED(str2
);
1619 STR_SET_NOEMBED(str
);
1620 FL_UNSET(str
, STR_SHARED
);
1621 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(str2
);
1622 RSTRING(str
)->as
.heap
.len
= RSTRING_LEN(str2
);
1624 if (FL_TEST(str2
, STR_SHARED
)) {
1625 VALUE shared
= RSTRING(str2
)->as
.heap
.aux
.shared
;
1626 STR_SET_SHARED(str
, shared
);
1629 RSTRING(str
)->as
.heap
.aux
.capa
= RSTRING(str2
)->as
.heap
.aux
.capa
;
1633 STR_SET_EMBED(str2
);
1634 RSTRING_PTR(str2
)[0] = 0;
1635 STR_SET_EMBED_LEN(str2
, 0);
1636 rb_enc_associate(str
, enc
);
1637 ENC_CODERANGE_SET(str
, cr
);
/* Returns a String for obj. A T_STRING is special-cased (NOTE(review): the
 * body of that branch is not visible here; presumably obj is returned
 * as-is). Otherwise obj's to_s (idTo_s) is called with no arguments and the
 * result is passed through rb_obj_as_string_result(). */
1642 rb_obj_as_string(VALUE obj
)
1646 if (RB_TYPE_P(obj
, T_STRING
)) {
1649 str
= rb_funcall(obj
, idTo_s
, 0);
1650 return rb_obj_as_string_result(str
, obj
);
/* Validates the result of a to_s call: when str is not a T_STRING, falls
 * back to rb_any_to_s(obj).
 * NOTE(review): the path for a valid String result is not visible here. */
1653 MJIT_FUNC_EXPORTED VALUE
1654 rb_obj_as_string_result(VALUE str
, VALUE obj
)
1656 if (!RB_TYPE_P(str
, T_STRING
))
1657 return rb_any_to_s(obj
);
/* Makes str refer to str2's contents.
 *
 * When str2 is already shared, str is pointed at the same buffer window
 * (ptr/len) and registered against str2's (frozen) shared root, and str2's
 * encoding/code-range is copied. Otherwise str_replace_shared(str, str2)
 * does the work.
 * NOTE(review): declarations, `else` and the return are not visible in this
 * listing. */
1662 str_replace(VALUE str
, VALUE str2
)
1666 len
= RSTRING_LEN(str2
);
1667 if (STR_SHARED_P(str2
)) {
1668 VALUE shared
= RSTRING(str2
)->as
.heap
.aux
.shared
;
1669 assert(OBJ_FROZEN(shared
));
1670 STR_SET_NOEMBED(str
);
1671 RSTRING(str
)->as
.heap
.len
= len
;
1672 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(str2
);
1673 STR_SET_SHARED(str
, shared
);
1674 rb_enc_cr_str_exact_copy(str
, str2
);
1677 str_replace_shared(str
, str2
);
/* Allocates a struct RString object of `size` bytes and class klass on the
 * execution context ec via RB_RVARGC_EC_NEWOBJ_OF. The flags are T_STRING,
 * plus FL_WB_PROTECTED when RGENGC_WB_PROTECTED_STRING is nonzero.
 * NOTE(review): the return statement is not visible in this listing. */
1684 ec_str_alloc(struct rb_execution_context_struct
*ec
, VALUE klass
, size_t size
)
1687 RB_RVARGC_EC_NEWOBJ_OF(ec
, str
, struct RString
, klass
,
1688 T_STRING
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), size
);
/* Allocates an embedded-layout string able to hold capa bytes:
 * size = str_embed_size(capa) is asserted to be GC-allocatable and then
 * passed to ec_str_alloc().
 * NOTE(review): the two asserts (lines 1696 and 1698) look like
 * alternatives of a preprocessor conditional not visible in this listing. */
1693 ec_str_alloc_embed(struct rb_execution_context_struct
*ec
, VALUE klass
, size_t capa
)
1695 size_t size
= str_embed_size(capa
);
1696 assert(rb_gc_size_allocatable_p(size
));
1698 assert(size
<= sizeof(struct RString
));
1700 return ec_str_alloc(ec
, klass
, size
);
/* Allocates a heap-layout string object: ec_str_alloc() with
 * sizeof(struct RString). */
1704 ec_str_alloc_heap(struct rb_execution_context_struct
*ec
, VALUE klass
)
1706 return ec_str_alloc(ec
, klass
, sizeof(struct RString
));
/* Fills the freshly allocated `dup` so that it duplicates `str`.
 *
 * Visible logic:
 *  - flag_mask selects the layout, embed-length, code-range and encoding
 *    bits (NOTE(review): further mask terms are on lines not visible here);
 *  - embedded str: the length and len + 1 bytes are copied into dup;
 *  - otherwise a root is chosen: str's shared root when str is STR_SHARED,
 *    or, when str is not frozen, a new frozen string from str_new_frozen()
 *    that then replaces str; the root is asserted to be frozen and not
 *    itself shared. An embedded root is copied byte-wise; a heap root is
 *    shared (dup gets str's ptr/len, root is stored with RB_OBJ_WRITE, and
 *    NOEMBED|SHARED are added to the flags);
 *  - when the inline encoding field holds ENCODING_INLINE_MAX, the real
 *    index is fetched and re-associated after the flags are applied;
 *  - FL_FREEZE is never copied onto dup.
 * NOTE(review): declarations, `else` lines and the return are not visible. */
1710 str_duplicate_setup(VALUE klass
, VALUE str
, VALUE dup
)
1712 const VALUE flag_mask
=
1714 RSTRING_NOEMBED
| RSTRING_EMBED_LEN_MASK
|
1716 ENC_CODERANGE_MASK
| ENCODING_MASK
|
1719 VALUE flags
= FL_TEST_RAW(str
, flag_mask
);
1721 if (STR_EMBED_P(str
)) {
1722 long len
= RSTRING_EMBED_LEN(str
);
1724 assert(str_embed_capa(dup
) >= len
+ 1);
1725 STR_SET_EMBED_LEN(dup
, len
);
1726 MEMCPY(RSTRING(dup
)->as
.embed
.ary
, RSTRING(str
)->as
.embed
.ary
, char, len
+ 1);
1730 if (FL_TEST_RAW(str
, STR_SHARED
)) {
1731 root
= RSTRING(str
)->as
.heap
.aux
.shared
;
1733 else if (UNLIKELY(!(flags
& FL_FREEZE
))) {
1734 root
= str
= str_new_frozen(klass
, str
);
1735 flags
= FL_TEST_RAW(str
, flag_mask
);
1737 assert(!STR_SHARED_P(root
));
1738 assert(RB_OBJ_FROZEN_RAW(root
));
1742 if (STR_EMBED_P(root
)) {
1743 MEMCPY(RSTRING(dup
)->as
.embed
.ary
, RSTRING(root
)->as
.embed
.ary
,
1744 char, RSTRING_EMBED_LEN_MAX
+ 1);
1748 RSTRING(dup
)->as
.heap
.len
= RSTRING_LEN(str
);
1749 RSTRING(dup
)->as
.heap
.ptr
= RSTRING_PTR(str
);
1750 RB_OBJ_WRITE(dup
, &RSTRING(dup
)->as
.heap
.aux
.shared
, root
);
1751 flags
|= RSTRING_NOEMBED
| STR_SHARED
;
1755 if ((flags
& ENCODING_MASK
) == (ENCODING_INLINE_MAX
<<ENCODING_SHIFT
)) {
1756 encidx
= rb_enc_get_index(str
);
1757 flags
&= ~ENCODING_MASK
;
1759 FL_SET_RAW(dup
, flags
& ~FL_FREEZE
);
1760 if (encidx
) rb_enc_associate_index(dup
, encidx
);
/* Duplicates str as an object of class klass, allocating on the execution
 * context ec. The heap layout is used when !USE_RVARGC or str has
 * STR_NOEMBED; otherwise an embedded object sized for
 * RSTRING_EMBED_LEN(str) + TERM_LEN(str) is allocated. The contents are
 * then filled in by str_duplicate_setup(). */
1765 ec_str_duplicate(struct rb_execution_context_struct
*ec
, VALUE klass
, VALUE str
)
1768 if (!USE_RVARGC
|| FL_TEST(str
, STR_NOEMBED
)) {
1769 dup
= ec_str_alloc_heap(ec
, klass
);
1772 dup
= ec_str_alloc_embed(ec
, klass
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
1775 return str_duplicate_setup(klass
, str
, dup
);
/* Duplicates str as an object of class klass. The heap layout is used when
 * !USE_RVARGC or str has STR_NOEMBED; otherwise an embedded object sized
 * for RSTRING_EMBED_LEN(str) + TERM_LEN(str) is allocated. The contents are
 * then filled in by str_duplicate_setup(). */
1779 str_duplicate(VALUE klass
, VALUE str
)
1782 if (!USE_RVARGC
|| FL_TEST(str
, STR_NOEMBED
)) {
1783 dup
= str_alloc_heap(klass
);
1786 dup
= str_alloc_embed(klass
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
1789 return str_duplicate_setup(klass
, str
, dup
);
/* Duplicates str, keeping its class (rb_obj_class(str)). */
1793 rb_str_dup(VALUE str
)
1795 return str_duplicate(rb_obj_class(str
), str
);
/* Fires the DTrace string-creation hook with str's length, then duplicates
 * str as a plain rb_cString. */
1799 rb_str_resurrect(VALUE str
)
1801 RUBY_DTRACE_CREATE_HOOK(STRING
, RSTRING_LEN(str
));
1802 return str_duplicate(rb_cString
, str
);
/* Variant of rb_str_resurrect() that allocates through the given execution
 * context: fires the DTrace string-creation hook, then duplicates str as a
 * plain rb_cString with ec_str_duplicate(). */
1806 rb_ec_str_resurrect(struct rb_execution_context_struct
*ec
, VALUE str
)
1808 RUBY_DTRACE_CREATE_HOOK(STRING
, RSTRING_LEN(str
));
1809 return ec_str_duplicate(ec
, rb_cString
, str
);
1814 * String.new(string = '') -> new_string
1815 * String.new(string = '', encoding: encoding) -> new_string
1816 * String.new(string = '', capacity: size) -> new_string
1818 * Returns a new \String that is a copy of +string+.
1820 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1823 * s.encoding # => #<Encoding:ASCII-8BIT>
1825 * With the single \String argument +string+, returns a copy of +string+
1826 * with the same encoding as +string+:
1827 * s = String.new("Que veut dire \u{e7}a?")
1828 * s # => "Que veut dire \u{e7}a?"
1829 * s.encoding # => #<Encoding:UTF-8>
1831 * Literal strings like <tt>""</tt> or here-documents always use
1832 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1834 * With keyword +encoding+, returns a copy of +string+
1835 * with the specified encoding:
1836 * s = String.new(encoding: 'ASCII')
1837 * s.encoding # => #<Encoding:US-ASCII>
1838 * s = String.new('foo', encoding: 'ASCII')
1839 * s.encoding # => #<Encoding:US-ASCII>
1841 * Note that these are equivalent:
1842 * s0 = String.new('foo', encoding: 'ASCII')
1843 * s1 = 'foo'.force_encoding('ASCII')
1844 * s0.encoding == s1.encoding # => true
1846 * With keyword +capacity+, returns a copy of +string+;
1847 * the given +capacity+ may set the size of the internal buffer,
1848 * which may affect performance:
1849 * String.new(capacity: 1) # => ""
1850 * String.new(capacity: 4096) # => ""
1852 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1854 * String.new('hello', encoding: 'UTF-8', capacity: 25)
/* Implementation of String#initialize: accepts an optional source string
 * plus the keywords +encoding+ and +capacity+.
 *
 * Visible logic:
 *  - keyword ids are resolved once and cached in the static keyword_ids;
 *  - a non-nil +encoding+ is converted with rb_to_encoding();
 *  - a non-nil +capacity+ is read with NUM2LONG and raised to
 *    STR_BUF_MIN_SIZE if smaller; str is then always given a heap buffer of
 *    capa + termlen bytes: by copying out of the embedded storage, by
 *    copying out of a shared/nofree buffer (clearing those flags), or by
 *    resizing the owned buffer in place. The length and terminator are set,
 *    and orig's bytes and encoding are copied in;
 *  - without +capacity+, rb_str_replace(str, orig) is used;
 *  - when an encoding was given it is associated and the code range cleared.
 * NOTE(review): many lines are not visible in this listing (declarations,
 * conditions around `n`, `else` lines, the return). Lines 1901 and 1903 look
 * like alternative branches of a preprocessor conditional. */
1859 rb_str_init(int argc
, VALUE
*argv
, VALUE str
)
1861 static ID keyword_ids
[2];
1862 VALUE orig
, opt
, venc
, vcapa
;
1864 rb_encoding
*enc
= 0;
1867 if (!keyword_ids
[0]) {
1868 keyword_ids
[0] = rb_id_encoding();
1869 CONST_ID(keyword_ids
[1], "capacity");
1872 n
= rb_scan_args(argc
, argv
, "01:", &orig
, &opt
);
1874 rb_get_kwargs(opt
, keyword_ids
, 0, 2, kwargs
);
1877 if (venc
!= Qundef
&& !NIL_P(venc
)) {
1878 enc
= rb_to_encoding(venc
);
1880 if (vcapa
!= Qundef
&& !NIL_P(vcapa
)) {
1881 long capa
= NUM2LONG(vcapa
);
1883 int termlen
= enc
? rb_enc_mbminlen(enc
) : 1;
1885 if (capa
< STR_BUF_MIN_SIZE
) {
1886 capa
= STR_BUF_MIN_SIZE
;
1890 len
= RSTRING_LEN(orig
);
1894 if (orig
== str
) n
= 0;
1896 str_modifiable(str
);
1897 if (STR_EMBED_P(str
)) { /* make noembed always */
1898 char *new_ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
1900 assert(RSTRING(str
)->as
.embed
.len
+ 1 <= str_embed_capa(str
));
1901 memcpy(new_ptr
, RSTRING(str
)->as
.embed
.ary
, RSTRING(str
)->as
.embed
.len
+ 1);
1903 memcpy(new_ptr
, RSTRING(str
)->as
.embed
.ary
, RSTRING_EMBED_LEN_MAX
+ 1);
1905 RSTRING(str
)->as
.heap
.ptr
= new_ptr
;
1907 else if (FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
1908 const size_t size
= (size_t)capa
+ termlen
;
1909 const char *const old_ptr
= RSTRING_PTR(str
);
1910 const size_t osize
= RSTRING(str
)->as
.heap
.len
+ TERM_LEN(str
);
1911 char *new_ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
1912 memcpy(new_ptr
, old_ptr
, osize
< size
? osize
: size
);
1913 FL_UNSET_RAW(str
, STR_SHARED
|STR_NOFREE
);
1914 RSTRING(str
)->as
.heap
.ptr
= new_ptr
;
1916 else if (STR_HEAP_SIZE(str
) != (size_t)capa
+ termlen
) {
1917 SIZED_REALLOC_N(RSTRING(str
)->as
.heap
.ptr
, char,
1918 (size_t)capa
+ termlen
, STR_HEAP_SIZE(str
));
1920 RSTRING(str
)->as
.heap
.len
= len
;
1921 TERM_FILL(&RSTRING(str
)->as
.heap
.ptr
[len
], termlen
);
1923 memcpy(RSTRING(str
)->as
.heap
.ptr
, RSTRING_PTR(orig
), len
);
1924 rb_enc_cr_str_exact_copy(str
, orig
);
1926 FL_SET(str
, STR_NOEMBED
);
1927 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
1930 rb_str_replace(str
, orig
);
1933 rb_enc_associate(str
, enc
);
1934 ENC_CODERANGE_CLEAR(str
);
1938 rb_str_replace(str
, orig
);
/* Only available when NONASCII_MASK is defined. */
1943 #ifdef NONASCII_MASK
/* True unless the top two bits of c are 10, i.e. c is not a UTF-8
 * continuation byte. */
1944 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1947 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1948 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1949 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1951 * if (!(byte & 0x80))
1952 * byte |= 0x40; // turn on bit6
1953 * return ((byte>>6) & 1); // bit6 represents whether this byte is leading or not.
1955 * This function calculates whether a byte is leading or not for all bytes
1956 * in the argument word by concurrently using the above logic, and then
1957 * adds up the number of leading bytes in the word.
/* Counts the UTF-8 lead bytes contained in the machine word at s.
 * After the transform on lines 1965-1966 each byte of d holds 1 in its
 * lowest bit when the corresponding input byte is a lead byte; the bits are
 * then summed (with popcount when available).
 * NOTE(review): the load of *s into d and the non-popcount summation are on
 * lines not visible in this listing. */
1959 static inline uintptr_t
1960 count_utf8_lead_bytes_with_word(const uintptr_t *s
)
1964 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1965 d
= (d
>>6) | (~d
>>7);
1966 d
&= NONASCII_MASK
>> 7;
1968 /* Gather all bytes. */
1969 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1970 /* use only if it can use POPCNT */
1971 return rb_popcount_intptr(d
);
1975 # if SIZEOF_VOIDP == 8
/* Counts the characters in the byte range [p, e) for encoding enc, using
 * the code range hint cr to pick a strategy.
 *
 * Visible strategies:
 *  - fixed-width encoding (mbmaxlen == mbminlen): byte length divided by
 *    the character width, rounded up;
 *  - valid UTF-8 (when NONASCII_MASK is defined): lead bytes are counted,
 *    byte-wise up to word alignment, word-wise in between, byte-wise for
 *    the tail;
 *  - ASCII-compatible encodings: ASCII runs are skipped with
 *    search_nonascii() and multibyte characters measured with
 *    rb_enc_fast_mbclen() when the code range is clean, rb_enc_mbclen()
 *    otherwise;
 *  - any other encoding: one rb_enc_mbclen() step per character.
 * NOTE(review): loop headers, counters and returns are partly on lines not
 * visible in this listing. */
1984 enc_strlen(const char *p
, const char *e
, rb_encoding
*enc
, int cr
)
1989 if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
1990 long diff
= (long)(e
- p
);
1991 return diff
/ rb_enc_mbminlen(enc
) + !!(diff
% rb_enc_mbminlen(enc
));
1993 #ifdef NONASCII_MASK
1994 else if (cr
== ENC_CODERANGE_VALID
&& enc
== rb_utf8_encoding()) {
1996 if ((int)sizeof(uintptr_t) * 2 < e
- p
) {
1997 const uintptr_t *s
, *t
;
1998 const uintptr_t lowbits
= sizeof(uintptr_t) - 1;
1999 s
= (const uintptr_t*)(~lowbits
& ((uintptr_t)p
+ lowbits
));
2000 t
= (const uintptr_t*)(~lowbits
& (uintptr_t)e
);
2001 while (p
< (const char *)s
) {
2002 if (is_utf8_lead_byte(*p
)) len
++;
2006 len
+= count_utf8_lead_bytes_with_word(s
);
2009 p
= (const char *)s
;
2012 if (is_utf8_lead_byte(*p
)) len
++;
2018 else if (rb_enc_asciicompat(enc
)) {
2020 if (ENC_CODERANGE_CLEAN_P(cr
)) {
2023 q
= search_nonascii(p
, e
);
2029 p
+= rb_enc_fast_mbclen(p
, e
, enc
);
2036 q
= search_nonascii(p
, e
);
2042 p
+= rb_enc_mbclen(p
, e
, enc
);
2049 for (c
=0; p
<e
; c
++) {
2050 p
+= rb_enc_mbclen(p
, e
, enc
);
/* Character count of the byte range [p, e) in encoding enc, computed by
 * enc_strlen() with no code range knowledge (ENC_CODERANGE_UNKNOWN). */
2056 rb_enc_strlen(const char *p
, const char *e
, rb_encoding
*enc
)
2058 return enc_strlen(p
, e
, enc
, ENC_CODERANGE_UNKNOWN
);
2061 /* Computes the character length and reports the code range through cr.
2062 * Note that the value passed in through cr is not used.
/* Counts the characters in [p, e) for encoding enc and reports the code
 * range found through *cr.
 *
 * Visible logic:
 *  - fixed-width encoding: byte length divided by the character width,
 *    rounded up;
 *  - ASCII-compatible encodings: ASCII runs are skipped with
 *    search_nonascii(); each other character is checked with
 *    rb_enc_precise_mbclen(), OR-ing ENC_CODERANGE_VALID into *cr on
 *    success and setting ENC_CODERANGE_BROKEN on failure;
 *  - other encodings: the same per-character check; after a broken
 *    character p advances by mbminlen when that still fits before e;
 *  - whenever *cr is still zero at an exit point it becomes
 *    ENC_CODERANGE_7BIT.
 * NOTE(review): the initialisation of *cr, loop headers and returns are on
 * lines not visible in this listing. */
2065 rb_enc_strlen_cr(const char *p
, const char *e
, rb_encoding
*enc
, int *cr
)
2072 if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2073 long diff
= (long)(e
- p
);
2074 return diff
/ rb_enc_mbminlen(enc
) + !!(diff
% rb_enc_mbminlen(enc
));
2076 else if (rb_enc_asciicompat(enc
)) {
2080 q
= search_nonascii(p
, e
);
2082 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2088 ret
= rb_enc_precise_mbclen(p
, e
, enc
);
2089 if (MBCLEN_CHARFOUND_P(ret
)) {
2090 *cr
|= ENC_CODERANGE_VALID
;
2091 p
+= MBCLEN_CHARFOUND_LEN(ret
);
2094 *cr
= ENC_CODERANGE_BROKEN
;
2099 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2103 for (c
=0; p
<e
; c
++) {
2104 ret
= rb_enc_precise_mbclen(p
, e
, enc
);
2105 if (MBCLEN_CHARFOUND_P(ret
)) {
2106 *cr
|= ENC_CODERANGE_VALID
;
2107 p
+= MBCLEN_CHARFOUND_LEN(ret
);
2110 *cr
= ENC_CODERANGE_BROKEN
;
2111 if (p
+ rb_enc_mbminlen(enc
) <= e
)
2112 p
+= rb_enc_mbminlen(enc
);
2117 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2121 /* enc must be str's enc or rb_enc_check(str, str2) */
/* Character length of str.
 *
 * Returns the byte length directly when single_byte_optimizable(str).
 * A NULL enc defaults to str's own encoding. When the code range is not yet
 * known, rb_enc_strlen_cr() computes length and code range together and a
 * nonzero code range is stored back on str; otherwise enc_strlen() is used
 * with the known code range.
 * NOTE(review): the return inside the UNKNOWN branch is not visible here. */
2123 str_strlen(VALUE str
, rb_encoding
*enc
)
2128 if (single_byte_optimizable(str
)) return RSTRING_LEN(str
);
2129 if (!enc
) enc
= STR_ENC_GET(str
);
2130 p
= RSTRING_PTR(str
);
2131 e
= RSTRING_END(str
);
2132 cr
= ENC_CODERANGE(str
);
2134 if (cr
== ENC_CODERANGE_UNKNOWN
) {
2135 long n
= rb_enc_strlen_cr(p
, e
, enc
, &cr
);
2136 if (cr
) ENC_CODERANGE_SET(str
, cr
);
2140 return enc_strlen(p
, e
, enc
, cr
);
/* Character length of str in its own encoding (str_strlen with enc NULL). */
2145 rb_str_strlen(VALUE str
)
2147 return str_strlen(str
, NULL
);
2154 * Returns the count of characters (not bytes) in +self+:
2156 * "\x80\u3042".length # => 2
2157 * "hello".length # => 5
2159 * String#size is an alias for String#length.
2161 * Related: String#bytesize.
/* String#length: the character count from str_strlen(), boxed with
 * LONG2NUM. */
2165 rb_str_length(VALUE str
)
2167 return LONG2NUM(str_strlen(str
, NULL
));
2172 * bytesize -> integer
2174 * Returns the count of bytes in +self+:
2176 * "\x80\u3042".bytesize # => 4
2177 * "hello".bytesize # => 5
2179 * Related: String#length.
/* String#bytesize: RSTRING_LEN(str) boxed with LONG2NUM. */
2183 rb_str_bytesize(VALUE str
)
2185 return LONG2NUM(RSTRING_LEN(str
));
2190 * empty? -> true or false
2192 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2194 * "hello".empty? # => false
2195 * " ".empty? # => false
2196 * "".empty? # => true
/* String#empty?: true exactly when the byte length of str is zero. */
2201 rb_str_empty(VALUE str
)
2203 return RBOOL(RSTRING_LEN(str
) == 0);
2208 * string + other_string -> new_string
2210 * Returns a new \String containing +other_string+ concatenated to +self+:
2212 * "Hello from " + self.to_s # => "Hello from main"
/* String#+: returns a new rb_cString holding str1's bytes followed by
 * str2's bytes.
 *
 * The result encoding comes from rb_enc_check_str(str1, str2) and the
 * terminator width from that encoding's mbminlen. Raises ArgumentError
 * ("string size too big") when len1 + len2 would exceed LONG_MAX. The
 * result's code range is ENC_CODERANGE_AND of both operands' code ranges.
 * NOTE(review): some declarations and the return are not visible here. */
2217 rb_str_plus(VALUE str1
, VALUE str2
)
2221 char *ptr1
, *ptr2
, *ptr3
;
2226 enc
= rb_enc_check_str(str1
, str2
);
2227 RSTRING_GETMEM(str1
, ptr1
, len1
);
2228 RSTRING_GETMEM(str2
, ptr2
, len2
);
2229 termlen
= rb_enc_mbminlen(enc
);
2230 if (len1
> LONG_MAX
- len2
) {
2231 rb_raise(rb_eArgError
, "string size too big");
2233 str3
= str_new0(rb_cString
, 0, len1
+len2
, termlen
);
2234 ptr3
= RSTRING_PTR(str3
);
2235 memcpy(ptr3
, ptr1
, len1
);
2236 memcpy(ptr3
+len1
, ptr2
, len2
);
2237 TERM_FILL(&ptr3
[len1
+len2
], termlen
);
2239 ENCODING_CODERANGE_SET(str3
, rb_enc_to_index(enc
),
2240 ENC_CODERANGE_AND(ENC_CODERANGE(str1
), ENC_CODERANGE(str2
)));
2246 /* A variant of rb_str_plus that does not raise but returns Qundef instead. */
/* Fast-path concatenation for two plain rb_cString operands (asserted).
 * Several conditions are screened before delegating to rb_str_plus(): a
 * negative encoding index for enc2 (and presumably enc1), differing
 * encoding indexes, and a combined length that would exceed LONG_MAX.
 * NOTE(review): the first condition of the chain and the bodies of all
 * branches are not visible in this listing; per the comment above this
 * function they are expected to return Qundef -- confirm. */
2247 MJIT_FUNC_EXPORTED VALUE
2248 rb_str_opt_plus(VALUE str1
, VALUE str2
)
2250 assert(RBASIC_CLASS(str1
) == rb_cString
);
2251 assert(RBASIC_CLASS(str2
) == rb_cString
);
2253 MAYBE_UNUSED(char) *ptr1
, *ptr2
;
2254 RSTRING_GETMEM(str1
, ptr1
, len1
);
2255 RSTRING_GETMEM(str2
, ptr2
, len2
);
2256 int enc1
= rb_enc_get_index(str1
);
2257 int enc2
= rb_enc_get_index(str2
);
2262 else if (enc2
< 0) {
2265 else if (enc1
!= enc2
) {
2268 else if (len1
> LONG_MAX
- len2
) {
2272 return rb_str_plus(str1
, str2
);
2279 * string * integer -> new_string
2281 * Returns a new \String containing +integer+ copies of +self+:
2283 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2284 * "Ho! " * 0 # => ""
/* String#*: returns a new rb_cString made of `times` copies of str.
 *
 * Visible logic:
 *  - times == 1: a duplicate of str as rb_cString;
 *  - times == 0: an empty embedded string carrying str's encoding;
 *  - a negative count raises ArgumentError ("negative argument")
 *    (NOTE(review): the condition line itself is not visible here);
 *  - str is a single NUL byte: a zero-filled buffer of len bytes is
 *    produced directly (embedded when it fits, heap otherwise);
 *  - general case: raises ArgumentError ("argument too big") when
 *    len * RSTRING_LEN(str) would overflow; otherwise the buffer is filled
 *    by repeatedly doubling the copied prefix and then copying the
 *    remainder.
 * NOTE(review): declarations, the update of n inside the loop, and returns
 * are on lines not visible in this listing. */
2289 rb_str_times(VALUE str
, VALUE times
)
2296 if (times
== INT2FIX(1)) {
2297 return str_duplicate(rb_cString
, str
);
2299 if (times
== INT2FIX(0)) {
2300 str2
= str_alloc_embed(rb_cString
, 0);
2301 rb_enc_copy(str2
, str
);
2304 len
= NUM2LONG(times
);
2306 rb_raise(rb_eArgError
, "negative argument");
2308 if (RSTRING_LEN(str
) == 1 && RSTRING_PTR(str
)[0] == 0) {
2309 if (STR_EMBEDDABLE_P(len
, 1)) {
2310 str2
= str_alloc_embed(rb_cString
, len
+ 1);
2311 memset(RSTRING_PTR(str2
), 0, len
+ 1);
2314 str2
= str_alloc_heap(rb_cString
);
2315 RSTRING(str2
)->as
.heap
.aux
.capa
= len
;
2316 RSTRING(str2
)->as
.heap
.ptr
= ZALLOC_N(char, (size_t)len
+ 1);
2317 STR_SET_NOEMBED(str2
);
2319 STR_SET_LEN(str2
, len
);
2320 rb_enc_copy(str2
, str
);
2323 if (len
&& LONG_MAX
/len
< RSTRING_LEN(str
)) {
2324 rb_raise(rb_eArgError
, "argument too big");
2327 len
*= RSTRING_LEN(str
);
2328 termlen
= TERM_LEN(str
);
2329 str2
= str_new0(rb_cString
, 0, len
, termlen
);
2330 ptr2
= RSTRING_PTR(str2
);
2332 n
= RSTRING_LEN(str
);
2333 memcpy(ptr2
, RSTRING_PTR(str
), n
);
2334 while (n
<= len
/2) {
2335 memcpy(ptr2
+ n
, ptr2
, n
);
2338 memcpy(ptr2
+ n
, ptr2
, len
-n
);
2340 STR_SET_LEN(str2
, len
);
2341 TERM_FILL(&ptr2
[len
], termlen
);
2342 rb_enc_cr_str_copy_for_substr(str2
, str
);
2349 * string % object -> new_string
2351 * Returns the result of formatting +object+ into the format specification +self+
2352 * (see Kernel#sprintf for formatting details):
2354 * "%05d" % 123 # => "00123"
2356 * If +self+ contains multiple substitutions, +object+ must be
2357 * an \Array or \Hash containing the values to be substituted:
2359 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2360 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2361 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
/* String#%: formats arg according to the format string str.
 * arg is first probed with rb_check_array_type(); an Array supplies the
 * whole argument list to rb_str_format(), anything else is passed as the
 * single argument.
 * NOTE(review): the condition selecting between the two returns is not
 * visible in this listing. */
2366 rb_str_format_m(VALUE str
, VALUE arg
)
2368 VALUE tmp
= rb_check_array_type(arg
);
2371 return rb_str_format(RARRAY_LENINT(tmp
), RARRAY_CONST_PTR(tmp
), str
);
2373 return rb_str_format(1, &arg
, str
);
/* Raises RuntimeError when str currently carries the STR_TMPLOCK flag. */
2377 rb_check_lockedtmp(VALUE str
)
2379 if (FL_TEST(str
, STR_TMPLOCK
)) {
2380 rb_raise(rb_eRuntimeError
, "can't modify string; temporarily locked");
/* Guards a modification of str: checks the temporary lock first
 * (rb_check_lockedtmp), then frozenness (rb_check_frozen). */
2385 str_modifiable(VALUE str
)
2387 rb_check_lockedtmp(str
);
2388 rb_check_frozen(str
);
/* Tests whether str depends on a buffer it does not own. The visible
 * condition singles out strings that are embedded, or that have neither
 * STR_SHARED nor STR_NOFREE set.
 * NOTE(review): the returned values are not visible in this listing;
 * presumably that case yields "not dependent" -- confirm. */
2392 str_dependent_p(VALUE str
)
2394 if (STR_EMBED_P(str
) || !FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
/* Checks that str may be modified (str_modifiable raises otherwise) and
 * returns the negation of str_dependent_p(str). */
2403 str_independent(VALUE str
)
2405 str_modifiable(str
);
2406 return !str_dependent_p(str
);
/* Gives str its own buffer with capacity len + expand (plus termlen bytes
 * for the terminator), keeping at most `len` bytes of content.
 *
 * Visible logic:
 *  - len is clamped to capa (relevant when expand is negative);
 *  - a heap string whose new contents fit the embed capacity is converted
 *    to the embedded layout;
 *  - otherwise a new buffer is allocated and the old bytes copied; line
 *    2432 singles out the case where the old buffer was owned by str
 *    (NOEMBED without NOFREE/SHARED). Afterwards str is marked NOEMBED with
 *    SHARED/NOFREE cleared, terminated, and given the new ptr/len/capa.
 * NOTE(review): several lines are not visible in this listing (the switch
 * to the embedded layout, freeing of old buffers, early returns). */
2410 str_make_independent_expand(VALUE str
, long len
, long expand
, const int termlen
)
2414 long capa
= len
+ expand
;
2416 if (len
> capa
) len
= capa
;
2418 if (!STR_EMBED_P(str
) && str_embed_capa(str
) >= capa
+ termlen
) {
2419 ptr
= RSTRING(str
)->as
.heap
.ptr
;
2421 memcpy(RSTRING(str
)->as
.embed
.ary
, ptr
, len
);
2422 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
2423 STR_SET_EMBED_LEN(str
, len
);
2427 ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
2428 oldptr
= RSTRING_PTR(str
);
2430 memcpy(ptr
, oldptr
, len
);
2432 if (FL_TEST_RAW(str
, STR_NOEMBED
|STR_NOFREE
|STR_SHARED
) == STR_NOEMBED
) {
2435 STR_SET_NOEMBED(str
);
2436 FL_UNSET(str
, STR_SHARED
|STR_NOFREE
);
2437 TERM_FILL(ptr
+ len
, termlen
);
2438 RSTRING(str
)->as
.heap
.ptr
= ptr
;
2439 RSTRING(str
)->as
.heap
.len
= len
;
2440 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
/* Prepares str for in-place modification: str_independent() performs the
 * lock/frozen checks, the string is made independent of any buffer it does
 * not own, and its cached code range is cleared. */
2444 rb_str_modify(VALUE str
)
2446 if (!str_independent(str
))
2447 str_make_independent(str
);
2448 ENC_CODERANGE_CLEAR(str
);
/* Prepares str for modification and reserves `expand` extra bytes.
 *
 * Raises ArgumentError for a negative expand (NOTE(review): the condition
 * line is not visible here) and when expand >= LONG_MAX - len. A dependent
 * string gets its own buffer of len + expand bytes; an independent one is
 * resized only when expand > 0. The cached code range is cleared at the
 * end. */
2452 rb_str_modify_expand(VALUE str
, long expand
)
2454 int termlen
= TERM_LEN(str
);
2455 long len
= RSTRING_LEN(str
);
2458 rb_raise(rb_eArgError
, "negative expanding string size");
2460 if (expand
>= LONG_MAX
- len
) {
2461 rb_raise(rb_eArgError
, "string size too big");
2464 if (!str_independent(str
)) {
2465 str_make_independent_expand(str
, len
, expand
, termlen
);
2467 else if (expand
> 0) {
2468 RESIZE_CAPA_TERM(str
, len
+ expand
, termlen
);
2470 ENC_CODERANGE_CLEAR(str
);
2473 /* As rb_str_modify(), but don't clear coderange */
/* Makes str independent for modification like rb_str_modify(), but keeps
 * the cached code range unless it is ENC_CODERANGE_BROKEN, in which case it
 * is cleared. */
2475 str_modify_keep_cr(VALUE str
)
2477 if (!str_independent(str
))
2478 str_make_independent(str
);
2479 if (ENC_CODERANGE(str
) == ENC_CODERANGE_BROKEN
)
2480 /* Force re-scan later */
2481 ENC_CODERANGE_CLEAR(str
);
/* Drops str's contents ahead of a replacement: after the modifiability
 * checks, a heap buffer owned by str (not embedded, not SHARED, not NOFREE)
 * is freed and the heap ptr/len fields are zeroed. */
2485 str_discard(VALUE str
)
2487 str_modifiable(str
);
2488 if (!STR_EMBED_P(str
) && !FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
2489 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
2490 RSTRING(str
)->as
.heap
.ptr
= 0;
2491 RSTRING(str
)->as
.heap
.len
= 0;
/* Raises Encoding::CompatibilityError (rb_eEncCompatError), naming the
 * encoding, when str's encoding is not ASCII-compatible. */
2496 rb_must_asciicompat(VALUE str
)
2498 rb_encoding
*enc
= rb_enc_get(str
);
2499 if (!rb_enc_asciicompat(enc
)) {
2500 rb_raise(rb_eEncCompatError
, "ASCII incompatible encoding: %s", rb_enc_name(enc
));
/* Ensures the VALUE behind ptr is a String: a non-T_STRING value is
 * converted with rb_str_to_str().
 * NOTE(review): the load of *ptr into s, the store back and the return are
 * on lines not visible in this listing. */
2505 rb_string_value(volatile VALUE
*ptr
)
2508 if (!RB_TYPE_P(s
, T_STRING
)) {
2509 s
= rb_str_to_str(s
);
/* Coerces *ptr to a String with rb_string_value() and returns a pointer to
 * its bytes (RSTRING_PTR). */
2516 rb_string_value_ptr(volatile VALUE
*ptr
)
2518 VALUE str
= rb_string_value(ptr
);
2519 return RSTRING_PTR(str
);
/* Walks the n bytes starting at s, counting n down to zero.
 * NOTE(review): the loop body and returns are not visible in this listing;
 * judging by the name and by the callers, it reports whether all n bytes
 * are zero -- confirm. */
2523 zero_filled(const char *s
, int n
)
2525 for (; n
> 0; --n
) {
/* Scans [s, s + len) character by character (stepping with rb_enc_mbclen)
 * and returns the position of the first character whose first minlen bytes
 * satisfy zero_filled(). The scan stops once fewer than minlen bytes remain.
 * NOTE(review): the not-found return is not visible in this listing. */
2532 str_null_char(const char *s
, long len
, const int minlen
, rb_encoding
*enc
)
2534 const char *e
= s
+ len
;
2536 for (; s
+ minlen
<= e
; s
+= rb_enc_mbclen(s
, e
, enc
)) {
2537 if (zero_filled(s
, minlen
)) return s
;
/* Makes sure the termlen bytes following s[len] are a terminator and
 * returns str's (possibly new) buffer pointer.
 * For a dependent string the bytes are only inspected: if they are not
 * already zero, str is made independent, which allocates a terminated
 * private buffer. An independent string's terminator is written in place
 * with TERM_FILL.
 * NOTE(review): `else` lines and an early return are not visible in this
 * listing. */
2543 str_fill_term(VALUE str
, char *s
, long len
, int termlen
)
2545 /* This function assumes that (capa + termlen) bytes of memory
2546 * is allocated, like many other functions in this file.
2548 if (str_dependent_p(str
)) {
2549 if (!zero_filled(s
+ len
, termlen
))
2550 str_make_independent_expand(str
, len
, 0L, termlen
);
2553 TERM_FILL(s
+ len
, termlen
);
2556 return RSTRING_PTR(str
);
/* Adjusts str after its terminator width changed from oldtermlen to
 * termlen.
 *
 * capa here is the total room behind the contents (capacity plus the old
 * terminator). When the room left after len is smaller than termlen, a new
 * independent buffer is made (after checking the temporary lock). A
 * dependent string is made independent only when the terminator grows.
 * Otherwise the buffer is kept: for a heap string the recorded capa is
 * rewritten as capa - termlen, and a grown terminator is filled in.
 * NOTE(review): `else` lines and returns are not visible in this listing. */
2560 rb_str_change_terminator_length(VALUE str
, const int oldtermlen
, const int termlen
)
2562 long capa
= str_capacity(str
, oldtermlen
) + oldtermlen
;
2563 long len
= RSTRING_LEN(str
);
2565 assert(capa
>= len
);
2566 if (capa
- len
< termlen
) {
2567 rb_check_lockedtmp(str
);
2568 str_make_independent_expand(str
, len
, 0L, termlen
);
2570 else if (str_dependent_p(str
)) {
2571 if (termlen
> oldtermlen
)
2572 str_make_independent_expand(str
, len
, 0L, termlen
);
2575 if (!STR_EMBED_P(str
)) {
2576 /* modify capa instead of realloc */
2577 assert(!FL_TEST((str
), STR_SHARED
));
2578 RSTRING(str
)->as
.heap
.aux
.capa
= capa
- termlen
;
2580 if (termlen
> oldtermlen
) {
2581 TERM_FILL(RSTRING_PTR(str
) + len
, termlen
);
/* Checks str for an embedded null character and returns a terminated
 * buffer pointer when none is found.
 *
 * Two paths are visible: one using str_null_char() with the encoding's
 * minimum character width (presumably taken for wide-character encodings),
 * and one using memchr() for a single zero byte, which also treats a NULL
 * buffer as a failure. On success the terminator is ensured with
 * str_fill_term().
 * NOTE(review): the handling of *w, the condition choosing between the two
 * paths and the failure returns are not visible in this listing. */
2589 str_null_check(VALUE str
, int *w
)
2591 char *s
= RSTRING_PTR(str
);
2592 long len
= RSTRING_LEN(str
);
2593 rb_encoding
*enc
= rb_enc_get(str
);
2594 const int minlen
= rb_enc_mbminlen(enc
);
2598 if (str_null_char(s
, len
, minlen
, enc
)) {
2601 return str_fill_term(str
, s
, len
, minlen
);
2604 if (!s
|| memchr(s
, 0, len
)) {
2608 s
= str_fill_term(str
, s
, len
, minlen
);
/* Returns the result of str_null_check() for str; the width indicator
 * written through &w is not used further in the visible code. */
2614 rb_str_to_cstr(VALUE str
)
2617 return str_null_check(str
, &w
);
/* Coerces *ptr to a String and returns it as a C string via
 * str_null_check(). Raises ArgumentError with "string contains null char"
 * or "string contains null byte" when the check fails.
 * NOTE(review): the conditions selecting between the two messages (based on
 * w) and the return are not visible in this listing. */
2621 rb_string_value_cstr(volatile VALUE
*ptr
)
2623 VALUE str
= rb_string_value(ptr
);
2625 char *s
= str_null_check(str
, &w
);
2628 rb_raise(rb_eArgError
, "string contains null char");
2630 rb_raise(rb_eArgError
, "string contains null byte");
/* Ensures str is terminated with newminlen zero bytes by calling
 * str_fill_term() on its current buffer and length; returns the resulting
 * buffer pointer. */
2636 rb_str_fill_terminator(VALUE str
, const int newminlen
)
2638 char *s
= RSTRING_PTR(str
);
2639 long len
= RSTRING_LEN(str
);
2640 return str_fill_term(str
, s
, len
, newminlen
);
/* Attempts the to_str conversion (idTo_str) of str to a T_STRING through
 * rb_check_convert_type_with_id().
 * NOTE(review): the return statement is not visible in this listing. */
2644 rb_check_string_type(VALUE str
)
2646 str
= rb_check_convert_type_with_id(str
, T_STRING
, "String", idTo_str
);
2652 * String.try_convert(object) -> object, new_string, or nil
2654 * If +object+ is a \String object, returns +object+.
2656 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2657 * calls <tt>object.to_str</tt> and returns the result.
2659 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2661 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
/* String.try_convert: ignores the receiver (dummy) and returns
 * rb_check_string_type(str). */
2664 rb_str_s_try_convert(VALUE dummy
, VALUE str
)
2666 return rb_check_string_type(str
);
/* Advances from p towards e by *nthp characters of encoding enc.
 *
 * Visible strategies: single-byte encodings (mbmaxlen == 1), fixed-width
 * encodings (pointer advanced by nth * mbmaxlen), ASCII-compatible
 * encodings (ASCII runs located with search_nonascii(), other characters
 * stepped with rb_enc_mbclen()), and a generic per-character loop.
 * NOTE(review): most of the bookkeeping (loading/storing *nthp, clamping to
 * e, the return) is on lines not visible in this listing. */
2670 str_nth_len(const char *p
, const char *e
, long *nthp
, rb_encoding
*enc
)
2673 if (rb_enc_mbmaxlen(enc
) == 1) {
2676 else if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2677 p
+= nth
* rb_enc_mbmaxlen(enc
);
2679 else if (rb_enc_asciicompat(enc
)) {
2680 const char *p2
, *e2
;
2683 while (p
< e
&& 0 < nth
) {
2690 p2
= search_nonascii(p
, e2
);
2699 n
= rb_enc_mbclen(p
, e
, enc
);
2710 while (p
< e
&& nth
--) {
2711 p
+= rb_enc_mbclen(p
, e
, enc
);
/* Returns the position reached by str_nth_len() after advancing nth
 * characters from p (bounded by e) in encoding enc; the leftover count is
 * discarded. */
2720 rb_enc_nth(const char *p
, const char *e
, long nth
, rb_encoding
*enc
)
2722 return str_nth_len(p
, e
, &nth
, enc
);
/* Locates the nth character in [p, e); the general path delegates to
 * str_nth_len().
 * NOTE(review): the singlebyte shortcut, the bounds checks and the return
 * are on lines not visible in this listing. */
2726 str_nth(const char *p
, const char *e
, long nth
, rb_encoding
*enc
, int singlebyte
)
2731 p
= str_nth_len(p
, e
, &nth
, enc
);
2738 /* char offset to byte offset */
/* Converts a character offset (nth) into a byte offset relative to p.
 * When str_nth() yields no position, the full byte length e - p is
 * returned.
 * NOTE(review): the return for the found case is not visible here. */
2740 str_offset(const char *p
, const char *e
, long nth
, rb_encoding
*enc
, int singlebyte
)
2742 const char *pp
= str_nth(p
, e
, nth
, enc
, singlebyte
);
2743 if (!pp
) return e
- p
;
/* Byte offset of character position pos within str, computed by
 * str_offset() over str's bytes with str's encoding and its
 * single-byte-optimizable status. */
2748 rb_str_offset(VALUE str
, long pos
)
2750 return str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
2751 STR_ENC_GET(str
), single_byte_optimizable(str
));
2754 #ifdef NONASCII_MASK
/* UTF-8 specific search for the *nthp-th character in [p, e).
 * When both the byte range and the remaining count exceed two machine
 * words, lead bytes are counted byte-wise up to word alignment and then a
 * word at a time while at least SIZEOF_VOIDP characters remain; the final
 * stretch is walked byte-wise, stopping at the lead byte where the count
 * reaches zero.
 * NOTE(review): loop headers, pointer increments, the write-back to *nthp
 * and the return are on lines not visible in this listing. */
2756 str_utf8_nth(const char *p
, const char *e
, long *nthp
)
2759 if ((int)SIZEOF_VOIDP
* 2 < e
- p
&& (int)SIZEOF_VOIDP
* 2 < nth
) {
2760 const uintptr_t *s
, *t
;
2761 const uintptr_t lowbits
= SIZEOF_VOIDP
- 1;
2762 s
= (const uintptr_t*)(~lowbits
& ((uintptr_t)p
+ lowbits
));
2763 t
= (const uintptr_t*)(~lowbits
& (uintptr_t)e
);
2764 while (p
< (const char *)s
) {
2765 if (is_utf8_lead_byte(*p
)) nth
--;
2769 nth
-= count_utf8_lead_bytes_with_word(s
);
2771 } while (s
< t
&& (int)SIZEOF_VOIDP
<= nth
);
2775 if (is_utf8_lead_byte(*p
)) {
2776 if (nth
== 0) break;
/* UTF-8 specific character-offset to byte-offset conversion, built on
 * str_utf8_nth().
 * NOTE(review): the return expression is not visible in this listing. */
2786 str_utf8_offset(const char *p
, const char *e
, long nth
)
2788 const char *pp
= str_utf8_nth(p
, e
, &nth
);
2793 /* byte offset to char offset */
/* Converts the byte offset pos within str into a character offset by
 * counting the characters in the first pos bytes with enc_strlen().
 * Single-byte-optimizable strings and negative pos take a shortcut
 * (NOTE(review): its return is not visible here; presumably pos itself). */
2795 rb_str_sublen(VALUE str
, long pos
)
2797 if (single_byte_optimizable(str
) || pos
< 0)
2800 char *p
= RSTRING_PTR(str
);
2801 return enc_strlen(p
, p
+ pos
, STR_ENC_GET(str
), ENC_CODERANGE(str
));
/* Returns the byte subsequence of str starting at byte beg with len bytes.
 *
 * When the result is too long to embed and SHARABLE_SUBSTRING_P holds, the
 * result shares the buffer of a frozen rb_cString version of str: its ptr
 * is advanced by beg and its len reduced to len when it was larger.
 * Otherwise the bytes are copied with rb_str_new(). Encoding and code range
 * are derived from str via rb_enc_cr_str_copy_for_substr().
 * NOTE(review): declarations, `else` and the return are not visible in
 * this listing. */
2806 rb_str_subseq(VALUE str
, long beg
, long len
)
2810 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) &&
2811 SHARABLE_SUBSTRING_P(beg
, len
, RSTRING_LEN(str
))) {
2813 str2
= rb_str_new_shared(rb_str_new_frozen_String(str
));
2814 RSTRING(str2
)->as
.heap
.ptr
+= beg
;
2815 olen
= RSTRING(str2
)->as
.heap
.len
;
2816 if (olen
> len
) RSTRING(str2
)->as
.heap
.len
= len
;
2819 str2
= rb_str_new(RSTRING_PTR(str
)+beg
, len
);
2823 rb_enc_cr_str_copy_for_substr(str2
, str
);
/* Resolves a character-based (beg, *lenp) substring request on str into a
 * byte position and byte length. Several visible paths return 0 when the
 * request is out of range (negative length, beg beyond the string, ...).
 *
 * Visible strategies:
 *  - single-byte-optimizable strings: plain byte arithmetic with clamping;
 *  - negative beg: when the distance from the end is small compared to the
 *    string (line 2855), characters are walked backwards with
 *    rb_enc_prev_char(); otherwise the character length is computed and
 *    beg is normalised;
 *  - beg larger than the byte length: rejected early;
 *  - valid UTF-8 (NONASCII_MASK builds): str_utf8_nth()/str_utf8_offset();
 *  - fixed-width encodings: multiplication by the character size;
 *  - otherwise: str_nth_len() and str_offset().
 * NOTE(review): a large part of this function (declarations, `else` lines,
 * assignments to len/p, the `end` label, the write-back to *lenp and the
 * final return) is on lines not visible in this listing. */
2829 rb_str_subpos(VALUE str
, long beg
, long *lenp
)
2833 long blen
= RSTRING_LEN(str
);
2834 rb_encoding
*enc
= STR_ENC_GET(str
);
2835 char *p
, *s
= RSTRING_PTR(str
), *e
= s
+ blen
;
2837 if (len
< 0) return 0;
2841 if (single_byte_optimizable(str
)) {
2842 if (beg
> blen
) return 0;
2845 if (beg
< 0) return 0;
2847 if (len
> blen
- beg
)
2849 if (len
< 0) return 0;
2854 if (len
> -beg
) len
= -beg
;
2855 if (-beg
* rb_enc_mbmaxlen(enc
) < RSTRING_LEN(str
) / 8) {
2857 while (beg
-- > len
&& (e
= rb_enc_prev_char(s
, e
, e
, enc
)) != 0);
2860 while (len
-- > 0 && (p
= rb_enc_prev_char(s
, p
, e
, enc
)) != 0);
2866 slen
= str_strlen(str
, enc
);
2868 if (beg
< 0) return 0;
2870 if (len
== 0) goto end
;
2873 else if (beg
> 0 && beg
> RSTRING_LEN(str
)) {
2877 if (beg
> str_strlen(str
, enc
)) return 0; /* str's enc */
2880 #ifdef NONASCII_MASK
2881 else if (ENC_CODERANGE(str
) == ENC_CODERANGE_VALID
&&
2882 enc
== rb_utf8_encoding()) {
2883 p
= str_utf8_nth(s
, e
, &beg
);
2884 if (beg
> 0) return 0;
2885 len
= str_utf8_offset(p
, e
, len
);
2888 else if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2889 int char_sz
= rb_enc_mbmaxlen(enc
);
2891 p
= s
+ beg
* char_sz
;
2895 else if (len
* char_sz
> e
- p
)
2900 else if ((p
= str_nth_len(s
, e
, &beg
, enc
)) == e
) {
2901 if (beg
> 0) return 0;
2905 len
= str_offset(p
, e
, len
, enc
, 0);
/* Forward declaration: str_substr() is used by rb_str_substr() before its
 * definition. */
2913 static VALUE
str_substr(VALUE str
, long beg
, long len
, int empty
);
/* Character-based substring of str: str_substr() with the `empty` flag set
 * to TRUE. */
2916 rb_str_substr(VALUE str
, long beg
, long len
)
2918 return str_substr(str
, beg
, len
, TRUE
);
/* Character-based substring of str.
 *
 * rb_str_subpos() resolves (beg, len) to a byte pointer and byte length;
 * Qnil is returned when it yields no position. When the result cannot be
 * embedded and SHARABLE_SUBSTRING_P holds, the result shares the buffer of
 * a frozen version of str (ptr advanced by the byte offset, len set, code
 * range cleared). Otherwise the bytes are copied with rb_str_new(); in that
 * path a zero-length result yields Qnil when `empty` is false. Encoding and
 * code range are derived from str.
 * NOTE(review): `else` and the final return are not visible here. */
2922 str_substr(VALUE str
, long beg
, long len
, int empty
)
2925 char *p
= rb_str_subpos(str
, beg
, &len
);
2927 if (!p
) return Qnil
;
2928 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) &&
2929 SHARABLE_SUBSTRING_P(p
, len
, RSTRING_END(str
))) {
2930 long ofs
= p
- RSTRING_PTR(str
);
2931 str2
= rb_str_new_frozen(str
);
2932 str2
= str_new_shared(rb_cString
, str2
);
2933 RSTRING(str2
)->as
.heap
.ptr
+= ofs
;
2934 RSTRING(str2
)->as
.heap
.len
= len
;
2935 ENC_CODERANGE_CLEAR(str2
);
2938 if (!len
&& !empty
) return Qnil
;
2939 str2
= rb_str_new(p
, len
);
2942 rb_enc_cr_str_copy_for_substr(str2
, str
);
2948 rb_str_freeze(VALUE str
)
2950 if (OBJ_FROZEN(str
)) return str
;
2951 rb_str_resize(str
, RSTRING_LEN(str
));
2952 return rb_obj_freeze(str
);
2958 * +string -> new_string or self
2960 * Returns +self+ if +self+ is not frozen.
2962 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2965 str_uplus(VALUE str
)
2967 if (OBJ_FROZEN(str
)) {
2968 return rb_str_dup(str
);
2977 * -string -> frozen_string
2979 * Returns a frozen, possibly pre-existing copy of the string.
2981 * The returned \String will be deduplicated as long as it does not have
2982 * any instance variables set on it.
2985 str_uminus(VALUE str
)
2987 if (!BARE_STRING_P(str
) && !rb_obj_frozen_p(str
)) {
2988 str
= rb_str_dup(str
);
2990 return rb_fstring(str
);
2993 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str
), rb_str_new_frozen
, (str
))
2994 #define rb_str_dup_frozen rb_str_new_frozen
2997 rb_str_locktmp(VALUE str
)
2999 if (FL_TEST(str
, STR_TMPLOCK
)) {
3000 rb_raise(rb_eRuntimeError
, "temporal locking already locked string");
3002 FL_SET(str
, STR_TMPLOCK
);
3007 rb_str_unlocktmp(VALUE str
)
3009 if (!FL_TEST(str
, STR_TMPLOCK
)) {
3010 rb_raise(rb_eRuntimeError
, "temporal unlocking already unlocked string");
3012 FL_UNSET(str
, STR_TMPLOCK
);
3016 RUBY_FUNC_EXPORTED VALUE
3017 rb_str_locktmp_ensure(VALUE str
, VALUE (*func
)(VALUE
), VALUE arg
)
3019 rb_str_locktmp(str
);
3020 return rb_ensure(func
, arg
, rb_str_unlocktmp
, str
);
3024 rb_str_set_len(VALUE str
, long len
)
3027 const int termlen
= TERM_LEN(str
);
3029 str_modifiable(str
);
3030 if (STR_SHARED_P(str
)) {
3031 rb_raise(rb_eRuntimeError
, "can't set length of shared string");
3033 if (len
> (capa
= (long)str_capacity(str
, termlen
)) || len
< 0) {
3034 rb_bug("probable buffer overflow: %ld for %ld", len
, capa
);
3036 STR_SET_LEN(str
, len
);
3037 TERM_FILL(&RSTRING_PTR(str
)[len
], termlen
);
3041 rb_str_resize(VALUE str
, long len
)
3047 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3050 independent
= str_independent(str
);
3051 ENC_CODERANGE_CLEAR(str
);
3052 slen
= RSTRING_LEN(str
);
3056 const int termlen
= TERM_LEN(str
);
3057 if (STR_EMBED_P(str
)) {
3058 if (len
== slen
) return str
;
3059 if (str_embed_capa(str
) >= len
+ termlen
) {
3060 STR_SET_EMBED_LEN(str
, len
);
3061 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
3064 str_make_independent_expand(str
, slen
, len
- slen
, termlen
);
3066 else if (str_embed_capa(str
) >= len
+ termlen
) {
3067 char *ptr
= STR_HEAP_PTR(str
);
3069 if (slen
> len
) slen
= len
;
3070 if (slen
> 0) MEMCPY(RSTRING(str
)->as
.embed
.ary
, ptr
, char, slen
);
3071 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
3072 STR_SET_EMBED_LEN(str
, len
);
3073 if (independent
) ruby_xfree(ptr
);
3076 else if (!independent
) {
3077 if (len
== slen
) return str
;
3078 str_make_independent_expand(str
, slen
, len
- slen
, termlen
);
3080 else if ((capa
= RSTRING(str
)->as
.heap
.aux
.capa
) < len
||
3081 (capa
- len
) > (len
< 1024 ? len
: 1024)) {
3082 SIZED_REALLOC_N(RSTRING(str
)->as
.heap
.ptr
, char,
3083 (size_t)len
+ termlen
, STR_HEAP_SIZE(str
));
3084 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
3086 else if (len
== slen
) return str
;
3087 RSTRING(str
)->as
.heap
.len
= len
;
3088 TERM_FILL(RSTRING(str
)->as
.heap
.ptr
+ len
, termlen
); /* sentinel */
3094 str_buf_cat(VALUE str
, const char *ptr
, long len
)
3096 long capa
, total
, olen
, off
= -1;
3098 const int termlen
= TERM_LEN(str
);
3100 assert(termlen
< RSTRING_EMBED_LEN_MAX
+ 1); /* < (LONG_MAX/2) */
3103 RSTRING_GETMEM(str
, sptr
, olen
);
3104 if (ptr
>= sptr
&& ptr
<= sptr
+ olen
) {
3108 if (len
== 0) return 0;
3109 if (STR_EMBED_P(str
)) {
3110 capa
= str_embed_capa(str
) - termlen
;
3111 sptr
= RSTRING(str
)->as
.embed
.ary
;
3112 olen
= RSTRING_EMBED_LEN(str
);
3115 capa
= RSTRING(str
)->as
.heap
.aux
.capa
;
3116 sptr
= RSTRING(str
)->as
.heap
.ptr
;
3117 olen
= RSTRING(str
)->as
.heap
.len
;
3119 if (olen
> LONG_MAX
- len
) {
3120 rb_raise(rb_eArgError
, "string sizes too big");
3124 if (total
>= LONG_MAX
/ 2) {
3127 while (total
> capa
) {
3128 capa
= 2 * capa
+ termlen
; /* == 2*(capa+termlen)-termlen */
3130 RESIZE_CAPA_TERM(str
, capa
, termlen
);
3131 sptr
= RSTRING_PTR(str
);
3136 memcpy(sptr
+ olen
, ptr
, len
);
3137 STR_SET_LEN(str
, total
);
3138 TERM_FILL(sptr
+ total
, termlen
); /* sentinel */
3143 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3146 rb_str_cat(VALUE str
, const char *ptr
, long len
)
3148 if (len
== 0) return str
;
3150 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3152 return str_buf_cat(str
, ptr
, len
);
3156 rb_str_cat_cstr(VALUE str
, const char *ptr
)
3159 return rb_str_buf_cat(str
, ptr
, strlen(ptr
));
3162 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str
, const char *ptr
, long len
), rb_str_cat
, (str
, ptr
, len
))
3163 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str
, const char *ptr
), rb_str_cat_cstr
, (str
, ptr
))
3164 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str
, const char *ptr
), rb_str_cat_cstr
, (str
, ptr
))
3167 rb_enc_cr_str_buf_cat(VALUE str
, const char *ptr
, long len
,
3168 int ptr_encindex
, int ptr_cr
, int *ptr_cr_ret
)
3170 int str_encindex
= ENCODING_GET(str
);
3173 rb_encoding
*str_enc
, *ptr_enc
;
3175 str_cr
= RSTRING_LEN(str
) ? ENC_CODERANGE(str
) : ENC_CODERANGE_7BIT
;
3177 if (str_encindex
== ptr_encindex
) {
3178 if (str_cr
!= ENC_CODERANGE_UNKNOWN
&& ptr_cr
== ENC_CODERANGE_UNKNOWN
) {
3179 ptr_cr
= coderange_scan(ptr
, len
, rb_enc_from_index(ptr_encindex
));
3183 str_enc
= rb_enc_from_index(str_encindex
);
3184 ptr_enc
= rb_enc_from_index(ptr_encindex
);
3185 if (!rb_enc_asciicompat(str_enc
) || !rb_enc_asciicompat(ptr_enc
)) {
3188 if (RSTRING_LEN(str
) == 0) {
3189 rb_str_buf_cat(str
, ptr
, len
);
3190 ENCODING_CODERANGE_SET(str
, ptr_encindex
, ptr_cr
);
3195 if (ptr_cr
== ENC_CODERANGE_UNKNOWN
) {
3196 ptr_cr
= coderange_scan(ptr
, len
, ptr_enc
);
3198 if (str_cr
== ENC_CODERANGE_UNKNOWN
) {
3199 if (ENCODING_IS_ASCII8BIT(str
) || ptr_cr
!= ENC_CODERANGE_7BIT
) {
3200 str_cr
= rb_enc_str_coderange(str
);
3205 *ptr_cr_ret
= ptr_cr
;
3207 if (str_encindex
!= ptr_encindex
&&
3208 str_cr
!= ENC_CODERANGE_7BIT
&&
3209 ptr_cr
!= ENC_CODERANGE_7BIT
) {
3210 str_enc
= rb_enc_from_index(str_encindex
);
3211 ptr_enc
= rb_enc_from_index(ptr_encindex
);
3215 if (str_cr
== ENC_CODERANGE_UNKNOWN
) {
3216 res_encindex
= str_encindex
;
3217 res_cr
= ENC_CODERANGE_UNKNOWN
;
3219 else if (str_cr
== ENC_CODERANGE_7BIT
) {
3220 if (ptr_cr
== ENC_CODERANGE_7BIT
) {
3221 res_encindex
= str_encindex
;
3222 res_cr
= ENC_CODERANGE_7BIT
;
3225 res_encindex
= ptr_encindex
;
3229 else if (str_cr
== ENC_CODERANGE_VALID
) {
3230 res_encindex
= str_encindex
;
3231 if (ENC_CODERANGE_CLEAN_P(ptr_cr
))
3236 else { /* str_cr == ENC_CODERANGE_BROKEN */
3237 res_encindex
= str_encindex
;
3239 if (0 < len
) res_cr
= ENC_CODERANGE_UNKNOWN
;
3243 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3245 str_buf_cat(str
, ptr
, len
);
3246 ENCODING_CODERANGE_SET(str
, res_encindex
, res_cr
);
3250 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
3251 rb_enc_name(str_enc
), rb_enc_name(ptr_enc
));
3252 UNREACHABLE_RETURN(Qundef
);
3256 rb_enc_str_buf_cat(VALUE str
, const char *ptr
, long len
, rb_encoding
*ptr_enc
)
3258 return rb_enc_cr_str_buf_cat(str
, ptr
, len
,
3259 rb_enc_to_index(ptr_enc
), ENC_CODERANGE_UNKNOWN
, NULL
);
3263 rb_str_buf_cat_ascii(VALUE str
, const char *ptr
)
3265 /* ptr must reference NUL terminated ASCII string. */
3266 int encindex
= ENCODING_GET(str
);
3267 rb_encoding
*enc
= rb_enc_from_index(encindex
);
3268 if (rb_enc_asciicompat(enc
)) {
3269 return rb_enc_cr_str_buf_cat(str
, ptr
, strlen(ptr
),
3270 encindex
, ENC_CODERANGE_7BIT
, 0);
3273 char *buf
= ALLOCA_N(char, rb_enc_mbmaxlen(enc
));
3275 unsigned int c
= (unsigned char)*ptr
;
3276 int len
= rb_enc_codelen(c
, enc
);
3277 rb_enc_mbcput(c
, buf
, enc
);
3278 rb_enc_cr_str_buf_cat(str
, buf
, len
,
3279 encindex
, ENC_CODERANGE_VALID
, 0);
3287 rb_str_buf_append(VALUE str
, VALUE str2
)
3291 str2_cr
= ENC_CODERANGE(str2
);
3293 rb_enc_cr_str_buf_cat(str
, RSTRING_PTR(str2
), RSTRING_LEN(str2
),
3294 ENCODING_GET(str2
), str2_cr
, &str2_cr
);
3296 ENC_CODERANGE_SET(str2
, str2_cr
);
3302 rb_str_append(VALUE str
, VALUE str2
)
3305 return rb_str_buf_append(str
, str2
);
3308 #define MIN_PRE_ALLOC_SIZE 48
3310 MJIT_FUNC_EXPORTED VALUE
3311 rb_str_concat_literals(size_t num
, const VALUE
*strary
)
3317 if (UNLIKELY(!num
)) return rb_str_new(0, 0);
3318 if (UNLIKELY(num
== 1)) return rb_str_resurrect(strary
[0]);
3320 for (i
= 0; i
< num
; ++i
) { len
+= RSTRING_LEN(strary
[i
]); }
3321 if (LIKELY(len
< MIN_PRE_ALLOC_SIZE
)) {
3322 str
= rb_str_resurrect(strary
[0]);
3326 str
= rb_str_buf_new(len
);
3327 rb_enc_copy(str
, strary
[0]);
3331 for (i
= s
; i
< num
; ++i
) {
3332 const VALUE v
= strary
[i
];
3333 int encidx
= ENCODING_GET(v
);
3335 rb_enc_cr_str_buf_cat(str
, RSTRING_PTR(v
), RSTRING_LEN(v
),
3336 encidx
, ENC_CODERANGE(v
), NULL
);
3337 if (encidx
!= ENCINDEX_US_ASCII
) {
3338 if (ENCODING_GET_INLINED(str
) == ENCINDEX_US_ASCII
)
3339 rb_enc_set_index(str
, encidx
);
3347 * concat(*objects) -> string
3349 * Concatenates each object in +objects+ to +self+ and returns +self+:
3352 * s.concat('bar', 'baz') # => "foobarbaz"
3353 * s # => "foobarbaz"
3355 * For each given object +object+ that is an \Integer,
3356 * the value is considered a codepoint and converted to a character before concatenation:
3359 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3361 * Related: String#<<, which takes a single argument.
3364 rb_str_concat_multi(int argc
, VALUE
*argv
, VALUE str
)
3366 str_modifiable(str
);
3369 return rb_str_concat(str
, argv
[0]);
3371 else if (argc
> 1) {
3373 VALUE arg_str
= rb_str_tmp_new(0);
3374 rb_enc_copy(arg_str
, str
);
3375 for (i
= 0; i
< argc
; i
++) {
3376 rb_str_concat(arg_str
, argv
[i
]);
3378 rb_str_buf_append(str
, arg_str
);
3386 * string << object -> string
3388 * Concatenates +object+ to +self+ and returns +self+:
3391 * s << 'bar' # => "foobar"
3394 * If +object+ is an \Integer,
3395 * the value is considered a codepoint and converted to a character before concatenation:
3398 * s << 33 # => "foo!"
3400 * Related: String#concat, which takes multiple arguments.
3403 rb_str_concat(VALUE str1
, VALUE str2
)
3406 rb_encoding
*enc
= STR_ENC_GET(str1
);
3409 if (RB_INTEGER_TYPE_P(str2
)) {
3410 if (rb_num_to_uint(str2
, &code
) == 0) {
3412 else if (FIXNUM_P(str2
)) {
3413 rb_raise(rb_eRangeError
, "%ld out of char range", FIX2LONG(str2
));
3416 rb_raise(rb_eRangeError
, "bignum out of char range");
3420 return rb_str_append(str1
, str2
);
3423 encidx
= rb_enc_to_index(enc
);
3424 if (encidx
== ENCINDEX_ASCII
|| encidx
== ENCINDEX_US_ASCII
) {
3425 /* US-ASCII automatically extended to ASCII-8BIT */
3427 buf
[0] = (char)code
;
3429 rb_raise(rb_eRangeError
, "%u out of char range", code
);
3431 rb_str_cat(str1
, buf
, 1);
3432 if (encidx
== ENCINDEX_US_ASCII
&& code
> 127) {
3433 rb_enc_associate_index(str1
, ENCINDEX_ASCII
);
3434 ENC_CODERANGE_SET(str1
, ENC_CODERANGE_VALID
);
3438 long pos
= RSTRING_LEN(str1
);
3439 int cr
= ENC_CODERANGE(str1
);
3443 switch (len
= rb_enc_codelen(code
, enc
)) {
3444 case ONIGERR_INVALID_CODE_POINT_VALUE
:
3445 rb_raise(rb_eRangeError
, "invalid codepoint 0x%X in %s", code
, rb_enc_name(enc
));
3447 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
:
3449 rb_raise(rb_eRangeError
, "%u out of char range", code
);
3452 buf
= ALLOCA_N(char, len
+ 1);
3453 rb_enc_mbcput(code
, buf
, enc
);
3454 if (rb_enc_precise_mbclen(buf
, buf
+ len
+ 1, enc
) != len
) {
3455 rb_raise(rb_eRangeError
, "invalid codepoint 0x%X in %s", code
, rb_enc_name(enc
));
3457 rb_str_resize(str1
, pos
+len
);
3458 memcpy(RSTRING_PTR(str1
) + pos
, buf
, len
);
3459 if (cr
== ENC_CODERANGE_7BIT
&& code
> 127)
3460 cr
= ENC_CODERANGE_VALID
;
3461 ENC_CODERANGE_SET(str1
, cr
);
3468 * prepend(*other_strings) -> string
3470 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3473 * s.prepend('bar', 'baz') # => "barbazfoo"
3474 * s # => "barbazfoo"
3476 * Related: String#concat.
3480 rb_str_prepend_multi(int argc
, VALUE
*argv
, VALUE str
)
3482 str_modifiable(str
);
3485 rb_str_update(str
, 0L, 0L, argv
[0]);
3487 else if (argc
> 1) {
3489 VALUE arg_str
= rb_str_tmp_new(0);
3490 rb_enc_copy(arg_str
, str
);
3491 for (i
= 0; i
< argc
; i
++) {
3492 rb_str_append(arg_str
, argv
[i
]);
3494 rb_str_update(str
, 0L, 0L, arg_str
);
3501 rb_str_hash(VALUE str
)
3503 int e
= ENCODING_GET(str
);
3504 if (e
&& rb_enc_str_coderange(str
) == ENC_CODERANGE_7BIT
) {
3507 return rb_memhash((const void *)RSTRING_PTR(str
), RSTRING_LEN(str
)) ^ e
;
3511 rb_str_hash_cmp(VALUE str1
, VALUE str2
)
3514 const char *ptr1
, *ptr2
;
3515 RSTRING_GETMEM(str1
, ptr1
, len1
);
3516 RSTRING_GETMEM(str2
, ptr2
, len2
);
3517 return (len1
!= len2
||
3518 !rb_str_comparable(str1
, str2
) ||
3519 memcmp(ptr1
, ptr2
, len1
) != 0);
3526 * Returns the integer hash value for +self+.
3527 * The value is based on the length, content and encoding of +self+.
3529 * Related: Object#hash.
3533 rb_str_hash_m(VALUE str
)
3535 st_index_t hval
= rb_str_hash(str
);
3536 return ST2FIX(hval
);
3539 #define lesser(a,b) (((a)>(b))?(b):(a))
3542 rb_str_comparable(VALUE str1
, VALUE str2
)
3547 if (RSTRING_LEN(str1
) == 0) return TRUE
;
3548 if (RSTRING_LEN(str2
) == 0) return TRUE
;
3549 idx1
= ENCODING_GET(str1
);
3550 idx2
= ENCODING_GET(str2
);
3551 if (idx1
== idx2
) return TRUE
;
3552 rc1
= rb_enc_str_coderange(str1
);
3553 rc2
= rb_enc_str_coderange(str2
);
3554 if (rc1
== ENC_CODERANGE_7BIT
) {
3555 if (rc2
== ENC_CODERANGE_7BIT
) return TRUE
;
3556 if (rb_enc_asciicompat(rb_enc_from_index(idx2
)))
3559 if (rc2
== ENC_CODERANGE_7BIT
) {
3560 if (rb_enc_asciicompat(rb_enc_from_index(idx1
)))
3567 rb_str_cmp(VALUE str1
, VALUE str2
)
3570 const char *ptr1
, *ptr2
;
3573 if (str1
== str2
) return 0;
3574 RSTRING_GETMEM(str1
, ptr1
, len1
);
3575 RSTRING_GETMEM(str2
, ptr2
, len2
);
3576 if (ptr1
== ptr2
|| (retval
= memcmp(ptr1
, ptr2
, lesser(len1
, len2
))) == 0) {
3578 if (!rb_str_comparable(str1
, str2
)) {
3579 if (ENCODING_GET(str1
) > ENCODING_GET(str2
))
3585 if (len1
> len2
) return 1;
3588 if (retval
> 0) return 1;
3594 * string == object -> true or false
3595 * string === object -> true or false
3597 * Returns +true+ if +object+ has the same length and content
3598 * as +self+; +false+ otherwise:
3601 * s == 'foo' # => true
3602 * s == 'food' # => false
3603 * s == 'FOO' # => false
3605 * Returns +false+ if the two strings' encodings are not compatible:
3606 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3608 * If +object+ is not an instance of \String but responds to +to_str+, then the
3609 * two strings are compared using <code>object.==</code>.
3613 rb_str_equal(VALUE str1
, VALUE str2
)
3615 if (str1
== str2
) return Qtrue
;
3616 if (!RB_TYPE_P(str2
, T_STRING
)) {
3617 if (!rb_respond_to(str2
, idTo_str
)) {
3620 return rb_equal(str2
, str1
);
3622 return rb_str_eql_internal(str1
, str2
);
3627 * eql?(object) -> true or false
3629 * Returns +true+ if +object+ has the same length and content
3630 * as +self+; +false+ otherwise:
3633 * s.eql?('foo') # => true
3634 * s.eql?('food') # => false
3635 * s.eql?('FOO') # => false
3637 * Returns +false+ if the two strings' encodings are not compatible:
3639 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3643 MJIT_FUNC_EXPORTED VALUE
3644 rb_str_eql(VALUE str1
, VALUE str2
)
3646 if (str1
== str2
) return Qtrue
;
3647 if (!RB_TYPE_P(str2
, T_STRING
)) return Qfalse
;
3648 return rb_str_eql_internal(str1
, str2
);
3653 * string <=> other_string -> -1, 0, 1, or nil
3655 * Compares +self+ and +other_string+, returning:
3657 * - -1 if +other_string+ is larger.
3658 * - 0 if the two are equal.
3659 * - 1 if +other_string+ is smaller.
3660 * - +nil+ if the two are incomparable.
3664 * 'foo' <=> 'foo' # => 0
3665 * 'foo' <=> 'food' # => -1
3666 * 'food' <=> 'foo' # => 1
3667 * 'FOO' <=> 'foo' # => -1
3668 * 'foo' <=> 'FOO' # => 1
3669 * 'foo' <=> 1 # => nil
3674 rb_str_cmp_m(VALUE str1
, VALUE str2
)
3677 VALUE s
= rb_check_string_type(str2
);
3679 return rb_invcmp(str1
, str2
);
3681 result
= rb_str_cmp(str1
, s
);
3682 return INT2FIX(result
);
3685 static VALUE
str_casecmp(VALUE str1
, VALUE str2
);
3686 static VALUE
str_casecmp_p(VALUE str1
, VALUE str2
);
3690 * casecmp(other_string) -> -1, 0, 1, or nil
3692 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3694 * - -1 if <tt>other_string.downcase</tt> is larger.
3695 * - 0 if the two are equal.
3696 * - 1 if <tt>other_string.downcase</tt> is smaller.
3697 * - +nil+ if the two are incomparable.
3701 * 'foo'.casecmp('foo') # => 0
3702 * 'foo'.casecmp('food') # => -1
3703 * 'food'.casecmp('foo') # => 1
3704 * 'FOO'.casecmp('foo') # => 0
3705 * 'foo'.casecmp('FOO') # => 0
3706 * 'foo'.casecmp(1) # => nil
3708 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3710 * Related: String#casecmp?.
3715 rb_str_casecmp(VALUE str1
, VALUE str2
)
3717 VALUE s
= rb_check_string_type(str2
);
3721 return str_casecmp(str1
, s
);
3725 str_casecmp(VALUE str1
, VALUE str2
)
3729 const char *p1
, *p1end
, *p2
, *p2end
;
3731 enc
= rb_enc_compatible(str1
, str2
);
3736 p1
= RSTRING_PTR(str1
); p1end
= RSTRING_END(str1
);
3737 p2
= RSTRING_PTR(str2
); p2end
= RSTRING_END(str2
);
3738 if (single_byte_optimizable(str1
) && single_byte_optimizable(str2
)) {
3739 while (p1
< p1end
&& p2
< p2end
) {
3741 unsigned int c1
= TOLOWER(*p1
& 0xff);
3742 unsigned int c2
= TOLOWER(*p2
& 0xff);
3744 return INT2FIX(c1
< c2
? -1 : 1);
3751 while (p1
< p1end
&& p2
< p2end
) {
3752 int l1
, c1
= rb_enc_ascget(p1
, p1end
, &l1
, enc
);
3753 int l2
, c2
= rb_enc_ascget(p2
, p2end
, &l2
, enc
);
3755 if (0 <= c1
&& 0 <= c2
) {
3759 return INT2FIX(c1
< c2
? -1 : 1);
3763 l1
= rb_enc_mbclen(p1
, p1end
, enc
);
3764 l2
= rb_enc_mbclen(p2
, p2end
, enc
);
3765 len
= l1
< l2
? l1
: l2
;
3766 r
= memcmp(p1
, p2
, len
);
3768 return INT2FIX(r
< 0 ? -1 : 1);
3770 return INT2FIX(l1
< l2
? -1 : 1);
3776 if (RSTRING_LEN(str1
) == RSTRING_LEN(str2
)) return INT2FIX(0);
3777 if (RSTRING_LEN(str1
) > RSTRING_LEN(str2
)) return INT2FIX(1);
3783 * casecmp?(other_string) -> true, false, or nil
3785 * Returns +true+ if +self+ and +other_string+ are equal after
3786 * Unicode case folding, otherwise +false+:
3788 * 'foo'.casecmp?('foo') # => true
3789 * 'foo'.casecmp?('food') # => false
3790 * 'food'.casecmp?('foo') # => false
3791 * 'FOO'.casecmp?('foo') # => true
3792 * 'foo'.casecmp?('FOO') # => true
3794 * Returns +nil+ if the two values are incomparable:
3796 * 'foo'.casecmp?(1) # => nil
3798 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3800 * Related: String#casecmp.
3805 rb_str_casecmp_p(VALUE str1
, VALUE str2
)
3807 VALUE s
= rb_check_string_type(str2
);
3811 return str_casecmp_p(str1
, s
);
3815 str_casecmp_p(VALUE str1
, VALUE str2
)
3818 VALUE folded_str1
, folded_str2
;
3819 VALUE fold_opt
= sym_fold
;
3821 enc
= rb_enc_compatible(str1
, str2
);
3826 folded_str1
= rb_str_downcase(1, &fold_opt
, str1
);
3827 folded_str2
= rb_str_downcase(1, &fold_opt
, str2
);
3829 return rb_str_eql(folded_str1
, folded_str2
);
3833 strseq_core(const char *str_ptr
, const char *str_ptr_end
, long str_len
,
3834 const char *sub_ptr
, long sub_len
, long offset
, rb_encoding
*enc
)
3836 const char *search_start
= str_ptr
;
3837 long pos
, search_len
= str_len
- offset
;
3841 pos
= rb_memsearch(sub_ptr
, sub_len
, search_start
, search_len
, enc
);
3842 if (pos
< 0) return pos
;
3843 t
= rb_enc_right_char_head(search_start
, search_start
+pos
, str_ptr_end
, enc
);
3844 if (t
== search_start
+ pos
) break;
3845 search_len
-= t
- search_start
;
3846 if (search_len
<= 0) return -1;
3847 offset
+= t
- search_start
;
3850 return pos
+ offset
;
3853 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3856 rb_strseq_index(VALUE str
, VALUE sub
, long offset
, int in_byte
)
3858 const char *str_ptr
, *str_ptr_end
, *sub_ptr
;
3859 long str_len
, sub_len
;
3862 enc
= rb_enc_check(str
, sub
);
3863 if (is_broken_string(sub
)) return -1;
3865 str_ptr
= RSTRING_PTR(str
);
3866 str_ptr_end
= RSTRING_END(str
);
3867 str_len
= RSTRING_LEN(str
);
3868 sub_ptr
= RSTRING_PTR(sub
);
3869 sub_len
= RSTRING_LEN(sub
);
3871 if (str_len
< sub_len
) return -1;
3874 long str_len_char
, sub_len_char
;
3875 int single_byte
= single_byte_optimizable(str
);
3876 str_len_char
= (in_byte
|| single_byte
) ? str_len
: str_strlen(str
, enc
);
3877 sub_len_char
= in_byte
? sub_len
: str_strlen(sub
, enc
);
3879 offset
+= str_len_char
;
3880 if (offset
< 0) return -1;
3882 if (str_len_char
- offset
< sub_len_char
) return -1;
3883 if (!in_byte
) offset
= str_offset(str_ptr
, str_ptr_end
, offset
, enc
, single_byte
);
3886 if (sub_len
== 0) return offset
;
3888 /* need to proceed one character at a time */
3889 return strseq_core(str_ptr
, str_ptr_end
, str_len
, sub_ptr
, sub_len
, offset
, enc
);
3895 * index(substring, offset = 0) -> integer or nil
3896 * index(regexp, offset = 0) -> integer or nil
3898 * Returns the \Integer index of the first occurrence of the given +substring+,
3899 * or +nil+ if none found:
3901 * 'foo'.index('f') # => 0
3902 * 'foo'.index('o') # => 1
3903 * 'foo'.index('oo') # => 1
3904 * 'foo'.index('ooo') # => nil
3906 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3907 * or +nil+ if none found:
3909 * 'foo'.index(/f/) # => 0
3910 * 'foo'.index(/o/) # => 1
3911 * 'foo'.index(/oo/) # => 1
3912 * 'foo'.index(/ooo/) # => nil
3914 * \Integer argument +offset+, if given, specifies the position in the
3915 * string to begin the search:
3917 * 'foo'.index('o', 1) # => 1
3918 * 'foo'.index('o', 2) # => 2
3919 * 'foo'.index('o', 3) # => nil
3921 * If +offset+ is negative, counts backward from the end of +self+:
3923 * 'foo'.index('o', -1) # => 2
3924 * 'foo'.index('o', -2) # => 1
3925 * 'foo'.index('o', -3) # => 1
3926 * 'foo'.index('o', -4) # => nil
3928 * Related: String#rindex.
3932 rb_str_index_m(int argc
, VALUE
*argv
, VALUE str
)
3938 if (rb_scan_args(argc
, argv
, "11", &sub
, &initpos
) == 2) {
3939 pos
= NUM2LONG(initpos
);
3945 pos
+= str_strlen(str
, NULL
);
3947 if (RB_TYPE_P(sub
, T_REGEXP
)) {
3948 rb_backref_set(Qnil
);
3954 if (RB_TYPE_P(sub
, T_REGEXP
)) {
3955 if (pos
> str_strlen(str
, NULL
))
3957 pos
= str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
3958 rb_enc_check(str
, sub
), single_byte_optimizable(str
));
3960 if (rb_reg_search(sub
, str
, pos
, 0) < 0) {
3964 VALUE match
= rb_backref_get();
3965 struct re_registers
*regs
= RMATCH_REGS(match
);
3966 pos
= rb_str_sublen(str
, BEG(0));
3967 return LONG2NUM(pos
);
3972 pos
= rb_str_index(str
, sub
, pos
);
3973 pos
= rb_str_sublen(str
, pos
);
3976 if (pos
== -1) return Qnil
;
3977 return LONG2NUM(pos
);
3982 str_rindex(VALUE str
, VALUE sub
, const char *s
, long pos
, rb_encoding
*enc
)
3984 char *hit
, *adjusted
;
3986 long slen
, searchlen
;
3989 slen
= RSTRING_LEN(sub
);
3990 if (slen
== 0) return pos
;
3991 sbeg
= RSTRING_PTR(str
);
3992 e
= RSTRING_END(str
);
3993 t
= RSTRING_PTR(sub
);
3995 searchlen
= s
- sbeg
+ 1;
3998 hit
= memrchr(sbeg
, c
, searchlen
);
4000 adjusted
= rb_enc_left_char_head(sbeg
, hit
, e
, enc
);
4001 if (hit
!= adjusted
) {
4002 searchlen
= adjusted
- sbeg
;
4005 if (memcmp(hit
, t
, slen
) == 0)
4006 return rb_str_sublen(str
, hit
- sbeg
);
4007 searchlen
= adjusted
- sbeg
;
4008 } while (searchlen
> 0);
4014 str_rindex(VALUE str
, VALUE sub
, const char *s
, long pos
, rb_encoding
*enc
)
4019 sbeg
= RSTRING_PTR(str
);
4020 e
= RSTRING_END(str
);
4021 t
= RSTRING_PTR(sub
);
4022 slen
= RSTRING_LEN(sub
);
4025 if (memcmp(s
, t
, slen
) == 0) {
4028 if (pos
== 0) break;
4030 s
= rb_enc_prev_char(sbeg
, s
, e
, enc
);
4038 rb_str_rindex(VALUE str
, VALUE sub
, long pos
)
4045 enc
= rb_enc_check(str
, sub
);
4046 if (is_broken_string(sub
)) return -1;
4047 singlebyte
= single_byte_optimizable(str
);
4048 len
= singlebyte
? RSTRING_LEN(str
) : str_strlen(str
, enc
); /* rb_enc_check */
4049 slen
= str_strlen(sub
, enc
); /* rb_enc_check */
4051 /* substring longer than string */
4052 if (len
< slen
) return -1;
4053 if (len
- pos
< slen
) pos
= len
- slen
;
4054 if (len
== 0) return pos
;
4056 sbeg
= RSTRING_PTR(str
);
4059 if (memcmp(sbeg
, RSTRING_PTR(sub
), RSTRING_LEN(sub
)) == 0)
4065 s
= str_nth(sbeg
, RSTRING_END(str
), pos
, enc
, singlebyte
);
4066 return str_rindex(str
, sub
, s
, pos
, enc
);
4071 * rindex(substring, offset = self.length) -> integer or nil
4072 * rindex(regexp, offset = self.length) -> integer or nil
4074 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4075 * or +nil+ if none found:
4077 * 'foo'.rindex('f') # => 0
4078 * 'foo'.rindex('o') # => 2
4079 * 'foo'.rindex('oo') # => 1
4080 * 'foo'.rindex('ooo') # => nil
4082 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4083 * or +nil+ if none found:
4085 * 'foo'.rindex(/f/) # => 0
4086 * 'foo'.rindex(/o/) # => 2
4087 * 'foo'.rindex(/oo/) # => 1
4088 * 'foo'.rindex(/ooo/) # => nil
4090 * The _last_ match means the match starting at the last possible position, not
4091 * the last of the longest matches.
4093 * 'foo'.rindex(/o+/) # => 2
4094 * $~ #=> #<MatchData "o">
4096 * To get the last longest match, combine with negative
4099 * 'foo'.rindex(/(?<!o)o+/) # => 1
4100 * $~ #=> #<MatchData "oo">
4102 * Or String#index with negative lookahead.
4104 * 'foo'.index(/o+(?!.*o)/) # => 1
4105 * $~ #=> #<MatchData "oo">
4107 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4108 * string to _end_ the search:
4110 * 'foo'.rindex('o', 0) # => nil
4111 * 'foo'.rindex('o', 1) # => 1
4112 * 'foo'.rindex('o', 2) # => 2
4113 * 'foo'.rindex('o', 3) # => 2
4115 * If +offset+ is a negative \Integer, the maximum starting position in the
4116 * string to _end_ the search is the sum of the string's length and +offset+:
4118 * 'foo'.rindex('o', -1) # => 2
4119 * 'foo'.rindex('o', -2) # => 1
4120 * 'foo'.rindex('o', -3) # => nil
4121 * 'foo'.rindex('o', -4) # => nil
4123 * Related: String#index.
4127 rb_str_rindex_m(int argc
, VALUE
*argv
, VALUE str
)
4131 rb_encoding
*enc
= STR_ENC_GET(str
);
4132 long pos
, len
= str_strlen(str
, enc
); /* str's enc */
4134 if (rb_scan_args(argc
, argv
, "11", &sub
, &vpos
) == 2) {
4135 pos
= NUM2LONG(vpos
);
4139 if (RB_TYPE_P(sub
, T_REGEXP
)) {
4140 rb_backref_set(Qnil
);
4145 if (pos
> len
) pos
= len
;
4151 if (RB_TYPE_P(sub
, T_REGEXP
)) {
4152 /* enc = rb_get_check(str, sub); */
4153 pos
= str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
4154 enc
, single_byte_optimizable(str
));
4156 if (rb_reg_search(sub
, str
, pos
, 1) >= 0) {
4157 VALUE match
= rb_backref_get();
4158 struct re_registers
*regs
= RMATCH_REGS(match
);
4159 pos
= rb_str_sublen(str
, BEG(0));
4160 return LONG2NUM(pos
);
4165 pos
= rb_str_rindex(str
, sub
, pos
);
4166 if (pos
>= 0) return LONG2NUM(pos
);
4173 * string =~ regexp -> integer or nil
4174 * string =~ object -> integer or nil
4176 * Returns the \Integer index of the first substring that matches
4177 * the given +regexp+, or +nil+ if no match found:
4179 * 'foo' =~ /f/ # => 0
4180 * 'foo' =~ /o/ # => 1
4181 * 'foo' =~ /x/ # => nil
4183 * Note: also updates
4184 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4186 * If the given +object+ is not a \Regexp, returns the value
4187 * returned by <tt>object =~ self</tt>.
4189 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4190 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4193 * "no. 9" =~ /(?<number>\d+)/
4194 * number # => nil (not assigned)
4195 * /(?<number>\d+)/ =~ "no. 9"
4201 rb_str_match(VALUE x
, VALUE y
)
4203 switch (OBJ_BUILTIN_TYPE(y
)) {
4205 rb_raise(rb_eTypeError
, "type mismatch: String given");
4208 return rb_reg_match(y
, x
);
4211 return rb_funcall(y
, idEqTilde
, 1, x
);
4216 static VALUE
get_pat(VALUE
);
4221 * match(pattern, offset = 0) -> matchdata or nil
4222 * match(pattern, offset = 0) {|matchdata| ... } -> object
4224 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4226 * Note: also updates
4227 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4229 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4230 * regexp = Regexp.new(pattern)
4231 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4232 * (see Regexp#match):
4233 * matchdata = regexp.match(self)
4235 * With no block given, returns the computed +matchdata+:
4237 * 'foo'.match('f') # => #<MatchData "f">
4238 * 'foo'.match('o') # => #<MatchData "o">
4239 * 'foo'.match('x') # => nil
4241 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4243 * 'foo'.match('f', 1) # => nil
4244 * 'foo'.match('o', 1) # => #<MatchData "o">
4246 * With a block given, calls the block with the computed +matchdata+
4247 * and returns the block's return value:
4249 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4250 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4251 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4256 rb_str_match_m(int argc
, VALUE
*argv
, VALUE str
)
4260 rb_check_arity(argc
, 1, 2);
4263 result
= rb_funcallv(get_pat(re
), rb_intern("match"), argc
, argv
);
4264 if (!NIL_P(result
) && rb_block_given_p()) {
4265 return rb_yield(result
);
4272 * match?(pattern, offset = 0) -> true or false
4274 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4276 * Note: does not update
4277 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4279 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4280 * regexp = Regexp.new(pattern)
4282 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4283 * +false+ otherwise:
4285 * 'foo'.match?(/o/) # => true
4286 * 'foo'.match?('o') # => true
4287 * 'foo'.match?(/x/) # => false
4289 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4290 * 'foo'.match?('f', 1) # => false
4291 * 'foo'.match?('o', 1) # => true
4296 rb_str_match_m_p(int argc
, VALUE
*argv
, VALUE str
)
4299 rb_check_arity(argc
, 1, 2);
4300 re
= get_pat(argv
[0]);
4301 return rb_reg_match_p(re
, str
, argc
> 1 ? NUM2LONG(argv
[1]) : 0);
4304 enum neighbor_char
{
/*
 * Advances the len-byte character at p to its successor in enc, in place.
 * Returns NEIGHBOR_FOUND when the updated bytes are a valid character,
 * NEIGHBOR_WRAPPED when the successor cannot be stored in the same bytes
 * (e.g. the next code point has a different encoded length), and
 * NEIGHBOR_NOT_CHAR when the bytes cannot be read or re-encoded as a
 * character.
 */
4310 static enum neighbor_char
4311 enc_succ_char(char *p
, long len
, rb_encoding
*enc
)
4316 if (rb_enc_mbminlen(enc
) > 1) {
4317 /* wchar, trivial case */
4318 int r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
), c
;
4319 if (!MBCLEN_CHARFOUND_P(r
)) {
4320 return NEIGHBOR_NOT_CHAR
;
/* the successor is the next code point */
4322 c
= rb_enc_mbc_to_codepoint(p
, p
+ len
, enc
) + 1;
4323 l
= rb_enc_code_to_mbclen(c
, enc
);
4324 if (!l
) return NEIGHBOR_NOT_CHAR
;
/* the successor must occupy exactly len bytes to be written in place */
4325 if (l
!= len
) return NEIGHBOR_WRAPPED
;
4326 rb_enc_mbcput(c
, p
, enc
);
/* re-validate the bytes that were just written */
4327 r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
);
4328 if (!MBCLEN_CHARFOUND_P(r
)) {
4329 return NEIGHBOR_NOT_CHAR
;
4331 return NEIGHBOR_FOUND
;
/* byte-wise case: walk back from the last byte over bytes already at 0xff */
4334 for (i
= len
-1; 0 <= i
&& (unsigned char)p
[i
] == 0xff; i
--)
4337 return NEIGHBOR_WRAPPED
;
4338 ++((unsigned char*)p
)[i
];
4339 l
= rb_enc_precise_mbclen(p
, p
+len
, enc
);
4340 if (MBCLEN_CHARFOUND_P(l
)) {
4341 l
= MBCLEN_CHARFOUND_LEN(l
);
4343 return NEIGHBOR_FOUND
;
/* fill the bytes after the first l bytes with 0xff */
4346 memset(p
+l
, 0xff, len
-l
);
4349 if (MBCLEN_INVALID_P(l
) && i
< len
-1) {
/* probe shorter prefixes of p with rb_enc_precise_mbclen() */
4352 for (len2
= len
-1; 0 < len2
; len2
--) {
4353 l2
= rb_enc_precise_mbclen(p
, p
+len2
, enc
);
4354 if (!MBCLEN_INVALID_P(l2
))
4357 memset(p
+len2
+1, 0xff, len
-(len2
+1));
/*
 * Moves the len-byte character at p to its predecessor in enc, in place.
 * Mirrors enc_succ_char(): the byte-wise path skips bytes equal to 0,
 * decrements a byte and pads with 0 instead of 0xff.
 * Returns NEIGHBOR_FOUND, NEIGHBOR_WRAPPED or NEIGHBOR_NOT_CHAR.
 */
4362 static enum neighbor_char
4363 enc_pred_char(char *p
, long len
, rb_encoding
*enc
)
4367 if (rb_enc_mbminlen(enc
) > 1) {
4368 /* wchar, trivial case */
4369 int r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
), c
;
4370 if (!MBCLEN_CHARFOUND_P(r
)) {
4371 return NEIGHBOR_NOT_CHAR
;
4373 c
= rb_enc_mbc_to_codepoint(p
, p
+ len
, enc
);
/* code point 0 is rejected here */
4374 if (!c
) return NEIGHBOR_NOT_CHAR
;
4376 l
= rb_enc_code_to_mbclen(c
, enc
);
4377 if (!l
) return NEIGHBOR_NOT_CHAR
;
/* the new code point must occupy exactly len bytes to be written in place */
4378 if (l
!= len
) return NEIGHBOR_WRAPPED
;
4379 rb_enc_mbcput(c
, p
, enc
);
/* re-validate the bytes that were just written */
4380 r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
);
4381 if (!MBCLEN_CHARFOUND_P(r
)) {
4382 return NEIGHBOR_NOT_CHAR
;
4384 return NEIGHBOR_FOUND
;
/* byte-wise case: walk back from the last byte over bytes already at 0 */
4387 for (i
= len
-1; 0 <= i
&& (unsigned char)p
[i
] == 0; i
--)
4390 return NEIGHBOR_WRAPPED
;
4391 --((unsigned char*)p
)[i
];
4392 l
= rb_enc_precise_mbclen(p
, p
+len
, enc
);
4393 if (MBCLEN_CHARFOUND_P(l
)) {
4394 l
= MBCLEN_CHARFOUND_LEN(l
);
4396 return NEIGHBOR_FOUND
;
/* fill the bytes after the first l bytes with 0 */
4399 memset(p
+l
, 0, len
-l
);
4402 if (MBCLEN_INVALID_P(l
) && i
< len
-1) {
/* probe shorter prefixes of p with rb_enc_precise_mbclen() */
4405 for (len2
= len
-1; 0 < len2
; len2
--) {
4406 l2
= rb_enc_precise_mbclen(p
, p
+len2
, enc
);
4407 if (!MBCLEN_INVALID_P(l2
))
4410 memset(p
+len2
+1, 0, len
-(len2
+1));
4416 Overwrites +p+ with the succeeding letter in +enc+ and returns
4417 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4418 When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into +carry+.
4419 Assumes that each range is contiguous and that mbclen
4420 never changes within a range.
4421 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
/*
 * Steps the len-byte character at p to the next character of the same
 * class in enc. The class (ONIGENC_CTYPE_DIGIT or ONIGENC_CTYPE_ALPHA) is
 * taken from the current character; a character in neither class yields
 * NEIGHBOR_NOT_CHAR. When NEIGHBOR_WRAPPED is returned, the bytes to
 * carry out have been written to +carry+.
 */
4424 static enum neighbor_char
4425 enc_succ_alnum_char(char *p
, long len
, rb_encoding
*enc
, char *carry
)
4427 enum neighbor_char ret
;
4431 char save
[ONIGENC_CODE_TO_MBC_MAXLEN
];
4433 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4435 const int max_gaps
= 1;
/* classify the current character: digit first, then alphabetic */
4437 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4438 if (rb_enc_isctype(c
, ONIGENC_CTYPE_DIGIT
, enc
))
4439 ctype
= ONIGENC_CTYPE_DIGIT
;
4440 else if (rb_enc_isctype(c
, ONIGENC_CTYPE_ALPHA
, enc
))
4441 ctype
= ONIGENC_CTYPE_ALPHA
;
4443 return NEIGHBOR_NOT_CHAR
;
/* keep the original bytes so that failed attempts can be undone */
4445 MEMCPY(save
, p
, char, len
);
/* allow up to max_gaps extra steps to get past a gap inside the class */
4446 for (try = 0; try <= max_gaps
; ++try) {
4447 ret
= enc_succ_char(p
, len
, enc
);
4448 if (ret
== NEIGHBOR_FOUND
) {
4449 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4450 if (rb_enc_isctype(c
, ctype
, enc
))
4451 return NEIGHBOR_FOUND
;
4454 MEMCPY(p
, save
, char, len
);
/* step backwards with enc_pred_char(); a step that leaves the class is undone from +save+ */
4457 MEMCPY(save
, p
, char, len
);
4458 ret
= enc_pred_char(p
, len
, enc
);
4459 if (ret
== NEIGHBOR_FOUND
) {
4460 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4461 if (!rb_enc_isctype(c
, ctype
, enc
)) {
4462 MEMCPY(p
, save
, char, len
);
4467 MEMCPY(p
, save
, char, len
);
4473 return NEIGHBOR_NOT_CHAR
;
/* non-digit class: the carry is the character now stored in p */
4476 if (ctype
!= ONIGENC_CTYPE_DIGIT
) {
4477 MEMCPY(carry
, p
, char, len
);
4478 return NEIGHBOR_WRAPPED
;
/* digit class: the carry is the successor of the character now stored in p */
4481 MEMCPY(carry
, p
, char, len
);
4482 enc_succ_char(carry
, len
, enc
);
4483 return NEIGHBOR_WRAPPED
;
/* Forward declaration; rb_str_succ() calls str_succ() on a copy of its argument. */
4487 static VALUE
str_succ(VALUE str
);
4493 * Returns the successor to +self+. The successor is calculated by
4494 * incrementing characters.
4496 * The first character to be incremented is the rightmost alphanumeric,
4497 * or, if there are no alphanumerics, the rightmost character:
4499 * 'THX1138'.succ # => "THX1139"
4500 * '<<koala>>'.succ # => "<<koalb>>"
4501 * '***'.succ # => '**+'
4503 * The successor to a digit is another digit, "carrying" to the next-left
4504 * character for a "rollover" from 9 to 0, and prepending another digit
4507 * '00'.succ # => "01"
4508 * '09'.succ # => "10"
4509 * '99'.succ # => "100"
4511 * The successor to a letter is another letter of the same case,
4512 * carrying to the next-left character for a rollover,
4513 * and prepending another same-case letter if necessary:
4515 * 'aa'.succ # => "ab"
4516 * 'az'.succ # => "ba"
4517 * 'zz'.succ # => "aaa"
4518 * 'AA'.succ # => "AB"
4519 * 'AZ'.succ # => "BA"
4520 * 'ZZ'.succ # => "AAA"
4522 * The successor to a non-alphanumeric character is the next character
4523 * in the underlying character set's collating sequence,
4524 * carrying to the next-left character for a rollover,
4525 * and prepending another character if necessary:
4528 * s # => "\x00\x00\x00"
4529 * s.succ # => "\x00\x00\x01"
4531 * s # => "\xFF\xFF\xFF"
4532 * s.succ # => "\x01\x00\x00\x00"
4534 * Carrying can occur between and among mixtures of alphanumeric characters:
4537 * s.succ # => "aaa00aa00"
4539 * s.succ # => "100aa00aa"
4541 * The successor to an empty \String is a new empty \String:
4545 * String#next is an alias for String#succ.
/*
 * Implements String#succ without touching +orig+: copies the bytes of
 * +orig+ into a new string, copies its encoding/coderange information
 * with rb_enc_cr_str_copy_for_substr(), and returns str_succ() of that
 * copy.
 */
4549 rb_str_succ(VALUE orig
)
4552 str
= rb_str_new(RSTRING_PTR(orig
), RSTRING_LEN(orig
));
4553 rb_enc_cr_str_copy_for_substr(str
, orig
);
4554 return str_succ(str
);
/*
 * NOTE(review): the function header is not visible at this point; given the
 * forward declaration of str_succ() and the call in rb_str_succ(), this
 * body presumably belongs to str_succ(VALUE str) -- confirm.
 *
 * Works on +str+ in place: characters are visited from the end of the
 * string and stepped with enc_succ_alnum_char(); when no alphanumeric is
 * found a second pass steps characters with enc_succ_char(). A remaining
 * carry is inserted at carry_pos after growing the buffer.
 */
4561 char *sbeg
, *s
, *e
, *last_alnum
= 0;
4562 int found_alnum
= 0;
4564 char carry
[ONIGENC_CODE_TO_MBC_MAXLEN
] = "\1";
4565 long carry_pos
= 0, carry_len
= 1;
4566 enum neighbor_char neighbor
= NEIGHBOR_FOUND
;
4568 slen
= RSTRING_LEN(str
);
/* an empty string is returned unchanged */
4569 if (slen
== 0) return str
;
4571 enc
= STR_ENC_GET(str
);
4572 sbeg
= RSTRING_PTR(str
);
4573 s
= e
= sbeg
+ slen
;
/* first pass: walk the characters backwards from the end */
4575 while ((s
= rb_enc_prev_char(sbeg
, s
, e
, enc
)) != 0) {
4576 if (neighbor
== NEIGHBOR_NOT_CHAR
&& last_alnum
) {
4577 if (ISALPHA(*last_alnum
) ? ISDIGIT(*s
) :
4578 ISDIGIT(*last_alnum
) ? ISALPHA(*s
) : 0) {
/* bytes that do not form a valid character are skipped */
4582 l
= rb_enc_precise_mbclen(s
, e
, enc
);
4583 if (!ONIGENC_MBCLEN_CHARFOUND_P(l
)) continue;
4584 l
= ONIGENC_MBCLEN_CHARFOUND_LEN(l
);
4585 neighbor
= enc_succ_alnum_char(s
, l
, enc
, carry
);
4587 case NEIGHBOR_NOT_CHAR
:
4589 case NEIGHBOR_FOUND
:
4591 case NEIGHBOR_WRAPPED
:
4596 carry_pos
= s
- sbeg
;
4599 if (!found_alnum
) { /* str contains no alnum */
/* second pass: step each character on a scratch copy (tmp) */
4601 while ((s
= rb_enc_prev_char(sbeg
, s
, e
, enc
)) != 0) {
4602 enum neighbor_char neighbor
;
4603 char tmp
[ONIGENC_CODE_TO_MBC_MAXLEN
];
4604 l
= rb_enc_precise_mbclen(s
, e
, enc
);
4605 if (!ONIGENC_MBCLEN_CHARFOUND_P(l
)) continue;
4606 l
= ONIGENC_MBCLEN_CHARFOUND_LEN(l
);
4607 MEMCPY(tmp
, s
, char, l
);
4608 neighbor
= enc_succ_char(tmp
, l
, enc
);
4610 case NEIGHBOR_FOUND
:
4611 MEMCPY(s
, tmp
, char, l
);
4614 case NEIGHBOR_WRAPPED
:
4615 MEMCPY(s
, tmp
, char, l
);
4617 case NEIGHBOR_NOT_CHAR
:
4620 if (rb_enc_precise_mbclen(s
, s
+l
, enc
) != l
) {
4621 /* wrapped to \0...\0. search next valid char. */
4622 enc_succ_char(s
, l
, enc
);
4624 if (!rb_enc_asciicompat(enc
)) {
4625 MEMCPY(carry
, s
, char, l
);
4628 carry_pos
= s
- sbeg
;
4630 ENC_CODERANGE_SET(str
, ENC_CODERANGE_UNKNOWN
);
/* grow the buffer by carry_len and insert +carry+ at carry_pos */
4632 RESIZE_CAPA(str
, slen
+ carry_len
);
4633 sbeg
= RSTRING_PTR(str
);
4634 s
= sbeg
+ carry_pos
;
4635 memmove(s
+ carry_len
, s
, slen
- carry_pos
);
4636 memmove(s
, carry
, carry_len
);
/* store the length, rewrite the terminator and rescan the coderange */
4638 STR_SET_LEN(str
, slen
);
4639 TERM_FILL(&sbeg
[slen
], rb_enc_mbminlen(enc
));
4640 rb_enc_str_coderange(str
);
4649 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4651 * String#next! is an alias for String#succ!.
/* Entry point for String#succ! (alias String#next!), which the preceding
 * documentation describes as the in-place form of String#succ. */
4655 rb_str_succ_bang(VALUE str
)
/* Digit check over the bytes at s (length len); returns 0 as soon as a byte
 * fails ISDIGIT. */
4663 all_digits_p(const char *s
, long len
)
4666 if (!ISDIGIT(*s
)) return 0;
/* Per-element callback that rb_str_upto() hands to rb_str_upto_each(). */
4673 str_upto_i(VALUE str
, VALUE arg
)
4681 * upto(other_string, exclusive = false) {|string| ... } -> self
4682 * upto(other_string, exclusive = false) -> new_enumerator
4684 * With a block given, calls the block with each \String value
4685 * returned by successive calls to String#succ;
4686 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4687 * the sequence terminates when value +other_string+ is reached;
4690 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4693 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4695 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4697 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4701 * a8 a9 b0 b1 b2 b3 b4 b5
4703 * If +other_string+ would not be reached, does not call the block:
4705 * '25'.upto('5') {|s| fail s }
4706 * 'aa'.upto('a') {|s| fail s }
4708 * With no block given, returns a new \Enumerator:
4710 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
/*
 * Implements String#upto: scans one required and one optional argument
 * (+end+ and +exclusive+), goes through RETURN_ENUMERATOR for the
 * block-less form, and otherwise iterates with rb_str_upto_each() using
 * str_upto_i as the callback and Qnil as its argument.
 */
4715 rb_str_upto(int argc
, VALUE
*argv
, VALUE beg
)
4717 VALUE end
, exclusive
;
4719 rb_scan_args(argc
, argv
, "11", &end
, &exclusive
);
4720 RETURN_ENUMERATOR(beg
, argc
, argv
);
4721 return rb_str_upto_each(beg
, end
, RTEST(exclusive
), str_upto_i
, Qnil
);
/*
 * Calls each(value, arg) for the strings from +beg+ towards +end+; +end+
 * itself is left out when excl is nonzero. Iteration stops as soon as
 * +each+ returns nonzero. Three strategies are visible:
 *  - single ASCII characters are iterated as bytes;
 *  - all-digit ASCII strings are iterated as integers and formatted back
 *    to the width of +beg+;
 *  - everything else is iterated by calling the "succ" method.
 * Raises (via rb_enc_check) when +beg+ and +end+ have incompatible
 * encodings.
 */
4725 rb_str_upto_each(VALUE beg
, VALUE end
, int excl
, int (*each
)(VALUE
, VALUE
), VALUE arg
)
4727 VALUE current
, after_end
;
4732 CONST_ID(succ
, "succ");
4734 enc
= rb_enc_check(beg
, end
);
4735 ascii
= (is_ascii_string(beg
) && is_ascii_string(end
));
4736 /* single character */
4737 if (RSTRING_LEN(beg
) == 1 && RSTRING_LEN(end
) == 1 && ascii
) {
4738 char c
= RSTRING_PTR(beg
)[0];
4739 char e
= RSTRING_PTR(end
)[0];
/* empty range: nothing is yielded */
4741 if (c
> e
|| (excl
&& c
== e
)) return beg
;
4743 if ((*each
)(rb_enc_str_new(&c
, 1, enc
), arg
)) break;
4744 if (!excl
&& c
== e
) break;
4746 if (excl
&& c
== e
) break;
4750 /* both edges are all digits */
4751 if (ascii
&& ISDIGIT(RSTRING_PTR(beg
)[0]) && ISDIGIT(RSTRING_PTR(end
)[0]) &&
4752 all_digits_p(RSTRING_PTR(beg
), RSTRING_LEN(beg
)) &&
4753 all_digits_p(RSTRING_PTR(end
), RSTRING_LEN(end
))) {
/* the byte length of beg is the precision used when formatting values */
4757 width
= RSTRING_LENINT(beg
);
4758 b
= rb_str_to_inum(beg
, 10, FALSE
);
4759 e
= rb_str_to_inum(end
, 10, FALSE
);
/* fast path when both bounds are Fixnums */
4760 if (FIXNUM_P(b
) && FIXNUM_P(e
)) {
4761 long bi
= FIX2LONG(b
);
4762 long ei
= FIX2LONG(e
);
4763 rb_encoding
*usascii
= rb_usascii_encoding();
4766 if (excl
&& bi
== ei
) break;
4767 if ((*each
)(rb_enc_sprintf(usascii
, "%.*ld", width
, bi
), arg
)) break;
/* generic integer path: compare with '<' or '<=' and advance with succ */
4772 ID op
= excl
? '<' : idLE
;
4773 VALUE args
[2], fmt
= rb_fstring_lit("%.*d");
4775 args
[0] = INT2FIX(width
);
4776 while (rb_funcall(b
, op
, 1, e
)) {
4778 if ((*each
)(rb_str_format(numberof(args
), args
, fmt
), arg
)) break;
4779 b
= rb_funcallv(b
, succ
, 0, 0);
/* general case: nothing to do when beg sorts after end (or equals it in exclusive mode) */
4785 n
= rb_str_cmp(beg
, end
);
4786 if (n
> 0 || (excl
&& n
== 0)) return beg
;
/* iterate on a copy of beg until the successor of end is reached */
4788 after_end
= rb_funcallv(end
, succ
, 0, 0);
4789 current
= str_duplicate(rb_cString
, beg
);
4790 while (!rb_str_equal(current
, after_end
)) {
4792 if (excl
|| !rb_str_equal(current
, end
))
4793 next
= rb_funcallv(current
, succ
, 0, 0);
4794 if ((*each
)(current
, arg
)) break;
4795 if (NIL_P(next
)) break;
4797 StringValue(current
);
4798 if (excl
&& rb_str_equal(current
, end
)) break;
4799 if (RSTRING_LEN(current
) > RSTRING_LEN(end
) || RSTRING_LEN(current
) == 0)
/*
 * Variant of rb_str_upto_each() that takes no end value. An all-digit
 * ASCII +beg+ is iterated as an integer (a long loop while the value is
 * FIXABLE, and a path that advances with the "succ" method), formatted
 * to the width of +beg+; any other string is iterated by calling "succ"
 * on a copy of +beg+. Stops when +each+ returns nonzero.
 */
4807 rb_str_upto_endless_each(VALUE beg
, int (*each
)(VALUE
, VALUE
), VALUE arg
)
4812 CONST_ID(succ
, "succ");
4813 /* both edges are all digits */
4814 if (is_ascii_string(beg
) && ISDIGIT(RSTRING_PTR(beg
)[0]) &&
4815 all_digits_p(RSTRING_PTR(beg
), RSTRING_LEN(beg
))) {
4816 VALUE b
, args
[2], fmt
= rb_fstring_lit("%.*d");
4817 int width
= RSTRING_LENINT(beg
);
4818 b
= rb_str_to_inum(beg
, 10, FALSE
);
4820 long bi
= FIX2LONG(b
);
4821 rb_encoding
*usascii
= rb_usascii_encoding();
/* iterate with a plain long while the value still fits a Fixnum */
4823 while (FIXABLE(bi
)) {
4824 if ((*each
)(rb_enc_sprintf(usascii
, "%.*ld", width
, bi
), arg
)) break;
4829 args
[0] = INT2FIX(width
);
4832 if ((*each
)(rb_str_format(numberof(args
), args
, fmt
), arg
)) break;
4833 b
= rb_funcallv(b
, succ
, 0, 0);
/* non-numeric strings: work on a copy of beg */
4837 current
= str_duplicate(rb_cString
, beg
);
4839 VALUE next
= rb_funcallv(current
, succ
, 0, 0);
4840 if ((*each
)(current
, arg
)) break;
4842 StringValue(current
);
4843 if (RSTRING_LEN(current
) == 0)
/* Callback used with rb_str_upto_each(): +arg+ is a pointer to a VALUE,
 * and 0 is returned while +str+ is not rb_equal() to the pointed-to value. */
4851 include_range_i(VALUE str
, VALUE arg
)
4853 VALUE
*argp
= (VALUE
*)arg
;
4854 if (!rb_equal(str
, *argp
)) return 0;
/*
 * Tests whether +val+ belongs to the string range from +beg+ to +end+
 * (+exclusive+ is a Ruby truth value). Returns Qfalse when +val+ is nil
 * or cannot be converted to a String. ASCII-compatible operands get
 * special handling for single-character bounds and all-digit bounds;
 * the fallback iterates with rb_str_upto_each() and include_range_i and
 * returns RBOOL(NIL_P(val)).
 * NOTE(review): that final test implies include_range_i sets the pointed-to
 * value to nil on a match -- the assignment is not visible here, confirm.
 */
4860 rb_str_include_range_p(VALUE beg
, VALUE end
, VALUE val
, VALUE exclusive
)
/* work on frozen copies of the bounds */
4862 beg
= rb_str_new_frozen(beg
);
4864 end
= rb_str_new_frozen(end
);
4865 if (NIL_P(val
)) return Qfalse
;
4866 val
= rb_check_string_type(val
);
4867 if (NIL_P(val
)) return Qfalse
;
4868 if (rb_enc_asciicompat(STR_ENC_GET(beg
)) &&
4869 rb_enc_asciicompat(STR_ENC_GET(end
)) &&
4870 rb_enc_asciicompat(STR_ENC_GET(val
))) {
4871 const char *bp
= RSTRING_PTR(beg
);
4872 const char *ep
= RSTRING_PTR(end
);
4873 const char *vp
= RSTRING_PTR(val
);
4874 if (RSTRING_LEN(beg
) == 1 && RSTRING_LEN(end
) == 1) {
4875 if (RSTRING_LEN(val
) == 0 || RSTRING_LEN(val
) > 1)
/* single ASCII characters are compared directly */
4882 if (ISASCII(b
) && ISASCII(e
) && ISASCII(v
)) {
4883 if (b
<= v
&& v
< e
) return Qtrue
;
4884 return RBOOL(!RTEST(exclusive
) && v
== e
);
4889 /* both edges are all digits */
4890 if (ISDIGIT(*bp
) && ISDIGIT(*ep
) &&
4891 all_digits_p(bp
, RSTRING_LEN(beg
)) &&
4892 all_digits_p(ep
, RSTRING_LEN(end
))) {
/* fallback: walk the range, passing the address of val to the callback */
4897 rb_str_upto_each(beg
, end
, RTEST(exclusive
), include_range_i
, (VALUE
)&val
);
4899 return RBOOL(NIL_P(val
));
/*
 * Searches +str+ with +re+ from position 0. On a match, +backref+ is
 * resolved to a group number with rb_reg_backref_number() against the
 * current backref, and the text of that group is returned through
 * rb_reg_nth_match().
 */
4903 rb_str_subpat(VALUE str
, VALUE re
, VALUE backref
)
4905 if (rb_reg_search(re
, str
, 0, 0) >= 0) {
4906 VALUE match
= rb_backref_get();
4907 int nth
= rb_reg_backref_number(match
, backref
);
4908 return rb_reg_nth_match(nth
, match
);
/*
 * Single-argument String#[]. Dispatches on +indx+:
 *  - Fixnum: used directly as the character index;
 *  - Regexp: returns the whole match via rb_str_subpat(str, indx, 0);
 *  - String: returns a copy of +indx+ when rb_str_index() finds it in +str+;
 *  - otherwise: tried as a Range with rb_range_beg_len(), then converted
 *    with NUM2LONG.
 * Integer indexes yield a one-character substring via str_substr().
 */
4914 rb_str_aref(VALUE str
, VALUE indx
)
4918 if (FIXNUM_P(indx
)) {
4919 idx
= FIX2LONG(indx
);
4921 else if (RB_TYPE_P(indx
, T_REGEXP
)) {
4922 return rb_str_subpat(str
, indx
, INT2FIX(0));
4924 else if (RB_TYPE_P(indx
, T_STRING
)) {
4925 if (rb_str_index(str
, indx
, 0) != -1)
4926 return str_duplicate(rb_cString
, indx
);
4930 /* check if indx is Range */
4931 long beg
, len
= str_strlen(str
, NULL
);
4932 switch (rb_range_beg_len(indx
, &beg
, &len
, len
, 0)) {
4938 return rb_str_substr(str
, beg
, len
);
4940 idx
= NUM2LONG(indx
);
4943 return str_substr(str
, idx
, 1, FALSE
);
4949 * string[index] -> new_string or nil
4950 * string[start, length] -> new_string or nil
4951 * string[range] -> new_string or nil
4952 * string[regexp, capture = 0] -> new_string or nil
4953 * string[substring] -> new_string or nil
4955 * Returns the substring of +self+ specified by the arguments.
4957 * When the single \Integer argument +index+ is given,
4958 * returns the 1-character substring found in +self+ at offset +index+:
4962 * Counts backward from the end of +self+ if +index+ is negative:
4964 * 'foo'[-3] # => "f"
4966 * Returns +nil+ if +index+ is out of range:
4969 * 'foo'[-4] # => nil
4971 * When the two \Integer arguments +start+ and +length+ are given,
4972 * returns the substring of the given +length+ found in +self+ at offset +start+:
4974 * 'foo'[0, 2] # => "fo"
4975 * 'foo'[0, 0] # => ""
4977 * Counts backward from the end of +self+ if +start+ is negative:
4979 * 'foo'[-2, 2] # => "oo"
4981 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4983 * 'foo'[3, 2] # => ""
4985 * Returns +nil+ if +start+ is out of range:
4987 * 'foo'[4, 2] # => nil
4988 * 'foo'[-4, 2] # => nil
4990 * Returns the trailing substring of +self+ if +length+ is large:
4992 * 'foo'[1, 50] # => "oo"
4994 * Returns +nil+ if +length+ is negative:
4996 * 'foo'[0, -1] # => nil
4998 * When the single \Range argument +range+ is given,
4999 * derives +start+ and +length+ values from the given +range+,
5000 * and returns values as above:
5002 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5003 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5005 * When the \Regexp argument +regexp+ is given,
5006 * and the +capture+ argument is <tt>0</tt>,
5007 * returns the first matching substring found in +self+,
5008 * or +nil+ if none found:
5010 * 'foo'[/o/] # => "o"
5011 * 'foo'[/x/] # => nil
5013 * s[/[aeiou](.)\1/] # => "ell"
5014 * s[/[aeiou](.)\1/, 0] # => "ell"
5016 * If argument +capture+ is given and not <tt>0</tt>,
5017 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5018 * the method call returns only the specified capture
5019 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5022 * s[/[aeiou](.)\1/, 1] # => "l"
5023 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5024 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5026 * If an invalid capture group index is given, +nil+ is returned. If an invalid
5027 * capture group name is given, +IndexError+ is raised.
5029 * When the single \String argument +substring+ is given,
5030 * returns the substring from +self+ if found, otherwise +nil+:
5032 * 'foo'['oo'] # => "oo"
5033 * 'foo'['xx'] # => nil
5035 * String#slice is an alias for String#[].
/*
 * Entry point for String#[] (and String#slice). With a Regexp first
 * argument and a second argument it returns rb_str_subpat(); with two
 * numeric arguments it returns rb_str_substr(str, beg, len); otherwise
 * the arity is checked (1..2) and rb_str_aref() handles argv[0].
 * NOTE(review): the test that selects the two-argument paths is not
 * visible here -- presumably argc == 2.
 */
5039 rb_str_aref_m(int argc
, VALUE
*argv
, VALUE str
)
5042 if (RB_TYPE_P(argv
[0], T_REGEXP
)) {
5043 return rb_str_subpat(str
, argv
[0], argv
[1]);
5046 long beg
= NUM2LONG(argv
[0]);
5047 long len
= NUM2LONG(argv
[1]);
5048 return rb_str_substr(str
, beg
, len
);
5051 rb_check_arity(argc
, 1, 2);
5052 return rb_str_aref(str
, argv
[0]);
/*
 * Drops bytes from the front of +str+; +len+ is clamped to the current
 * length. Two layouts are handled:
 *  - the remainder fits the embedded capacity: it is moved into the
 *    embedded buffer, and the old buffer is freed when the saved flags
 *    equal STR_NOEMBED;
 *  - otherwise: the buffer is turned into a shared one if it is not
 *    already, and the heap pointer is advanced by +len+.
 * The coderange is cleared afterwards. Requires a modifiable string
 * (str_modifiable).
 */
5056 rb_str_drop_bytes(VALUE str
, long len
)
5058 char *ptr
= RSTRING_PTR(str
);
5059 long olen
= RSTRING_LEN(str
), nlen
;
5061 str_modifiable(str
);
5062 if (len
> olen
) len
= olen
;
5064 if (str_embed_capa(str
) >= nlen
+ TERM_LEN(str
)) {
/* remember how the old buffer was held before switching to the embedded layout */
5066 int fl
= (int)(RBASIC(str
)->flags
& (STR_NOEMBED
|STR_SHARED
|STR_NOFREE
));
5068 STR_SET_EMBED_LEN(str
, nlen
);
5069 ptr
= RSTRING(str
)->as
.embed
.ary
;
5070 memmove(ptr
, oldptr
+ len
, nlen
);
5071 if (fl
== STR_NOEMBED
) xfree(oldptr
);
5074 if (!STR_SHARED_P(str
)) {
5075 VALUE shared
= heap_str_make_shared(rb_obj_class(str
), str
);
5076 rb_enc_cr_str_exact_copy(shared
, str
);
/* skip the dropped bytes by moving the start of the heap buffer */
5079 ptr
= RSTRING(str
)->as
.heap
.ptr
+= len
;
5080 RSTRING(str
)->as
.heap
.len
= nlen
;
5083 ENC_CODERANGE_CLEAR(str
);
/*
 * Byte-level splice: replaces +len+ bytes at byte offset +beg+ of +str+
 * with the bytes of +val+. The case beg == 0 with an empty +val+ is
 * handed to rb_str_drop_bytes(). Otherwise the buffer is resized, the
 * tail is moved, the bytes of +val+ are copied in, and length,
 * terminator and coderange are updated.
 */
5088 rb_str_splice_0(VALUE str
, long beg
, long len
, VALUE val
)
5091 long slen
, vlen
= RSTRING_LEN(val
);
5094 if (beg
== 0 && vlen
== 0) {
5095 rb_str_drop_bytes(str
, len
);
5099 str_modify_keep_cr(str
);
5100 RSTRING_GETMEM(str
, sptr
, slen
);
5103 RESIZE_CAPA(str
, slen
+ vlen
- len
);
5104 sptr
= RSTRING_PTR(str
);
/* a 7-bit receiver takes the coderange of val; otherwise it becomes unknown */
5107 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
)
5108 cr
= rb_enc_str_coderange(val
);
5110 cr
= ENC_CODERANGE_UNKNOWN
;
5113 memmove(sptr
+ beg
+ vlen
,
5115 slen
- (beg
+ len
));
5117 if (vlen
< beg
&& len
< 0) {
5118 MEMZERO(sptr
+ slen
, char, -len
);
/* copy the replacement bytes into place */
5121 memmove(sptr
+ beg
, RSTRING_PTR(val
), vlen
);
5124 STR_SET_LEN(str
, slen
);
5125 TERM_FILL(&sptr
[slen
], TERM_LEN(str
));
5126 ENC_CODERANGE_SET(str
, cr
);
/*
 * Character-level splice: replaces +len+ characters starting at
 * character index +beg+ of +str+ with +val+.
 * Raises IndexError for a negative +len+ or when +beg+ lies outside the
 * string (beyond its length, or more negative than -length). Character
 * positions are converted to byte positions with str_nth() before
 * calling rb_str_splice_0(); the resulting encoding and coderange are
 * then stored on +str+.
 */
5130 rb_str_update(VALUE str
, long beg
, long len
, VALUE val
)
5135 int singlebyte
= single_byte_optimizable(str
);
5138 if (len
< 0) rb_raise(rb_eIndexError
, "negative length %ld", len
);
5141 enc
= rb_enc_check(str
, val
);
5142 slen
= str_strlen(str
, enc
); /* rb_enc_check */
5144 if ((slen
< beg
) || ((beg
< 0) && (beg
+ slen
< 0))) {
5145 rb_raise(rb_eIndexError
, "index %ld out of string", beg
);
5151 assert(beg
<= slen
);
5152 if (len
> slen
- beg
) {
5155 str_modify_keep_cr(str
);
/* locate the byte range [p, e); a NULL from str_nth() is replaced by the end of the string */
5156 p
= str_nth(RSTRING_PTR(str
), RSTRING_END(str
), beg
, enc
, singlebyte
);
5157 if (!p
) p
= RSTRING_END(str
);
5158 e
= str_nth(p
, RSTRING_END(str
), len
, enc
, singlebyte
);
5159 if (!e
) e
= RSTRING_END(str
);
5161 beg
= p
- RSTRING_PTR(str
); /* physical position */
5162 len
= e
- p
; /* physical length */
5163 rb_str_splice_0(str
, beg
, len
, val
);
5164 rb_enc_associate(str
, enc
);
/* combine the coderanges of str and val; a broken result is not stored */
5165 cr
= ENC_CODERANGE_AND(ENC_CODERANGE(str
), ENC_CODERANGE(val
));
5166 if (cr
!= ENC_CODERANGE_BROKEN
)
5167 ENC_CODERANGE_SET(str
, cr
);
/* rb_str_splice() is a file-local alias for rb_str_update(). */
5170 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
/*
 * Replaces the text of capture group +backref+ in the first match of
 * +re+ within +str+ by +val+.
 * Raises IndexError when the regexp does not match, when the group
 * number is out of range for the match, or when the group did not take
 * part in the match. The replacement itself is a byte-level
 * rb_str_splice_0() followed by rb_enc_associate().
 */
5173 rb_str_subpat_set(VALUE str
, VALUE re
, VALUE backref
, VALUE val
)
5177 long start
, end
, len
;
5179 struct re_registers
*regs
;
5181 if (rb_reg_search(re
, str
, 0, 0) < 0) {
5182 rb_raise(rb_eIndexError
, "regexp not matched");
5184 match
= rb_backref_get();
5185 nth
= rb_reg_backref_number(match
, backref
);
5186 regs
= RMATCH_REGS(match
);
5187 if ((nth
>= regs
->num_regs
) || ((nth
< 0) && (-nth
>= regs
->num_regs
))) {
5188 rb_raise(rb_eIndexError
, "index %d out of regexp", nth
);
/* a negative group number is offset by the number of registers */
5191 nth
+= regs
->num_regs
;
5196 rb_raise(rb_eIndexError
, "regexp group %d not matched", nth
);
5201 enc
= rb_enc_check_str(str
, val
);
5202 rb_str_splice_0(str
, start
, len
, val
);
5203 rb_enc_associate(str
, enc
);
/*
 * Single-index String#[]=. Dispatches on TYPE(indx):
 *  - Regexp: replaces the whole match via rb_str_subpat_set(str, indx, 0, val);
 *  - String: finds +indx+ in +str+ (IndexError "string not matched" when
 *    absent), converts the byte offset to a character offset with
 *    rb_str_sublen() and splices over the length of +indx+;
 *  - otherwise: tried as a Range with rb_range_beg_len(), then converted
 *    with NUM2LONG and spliced over one character.
 */
5207 rb_str_aset(VALUE str
, VALUE indx
, VALUE val
)
5211 switch (TYPE(indx
)) {
5213 rb_str_subpat_set(str
, indx
, INT2FIX(0), val
);
5217 beg
= rb_str_index(str
, indx
, 0);
5219 rb_raise(rb_eIndexError
, "string not matched");
5221 beg
= rb_str_sublen(str
, beg
);
5222 rb_str_splice(str
, beg
, str_strlen(indx
, NULL
), val
);
5226 /* check if indx is Range */
5229 if (rb_range_beg_len(indx
, &beg
, &len
, str_strlen(str
, NULL
), 2)) {
5230 rb_str_splice(str
, beg
, len
, val
);
5237 idx
= NUM2LONG(indx
);
5238 rb_str_splice(str
, idx
, 1, val
);
5245 * str[integer] = new_str
5246 * str[integer, integer] = new_str
5247 * str[range] = aString
5248 * str[regexp] = new_str
5249 * str[regexp, integer] = new_str
5250 * str[regexp, name] = new_str
5251 * str[other_str] = new_str
5253 * Element Assignment---Replaces some or all of the content of
5254 * <i>str</i>. The portion of the string affected is determined using
5255 * the same criteria as String#[]. If the replacement string is not
5256 * the same length as the text it is replacing, the string will be
5257 * adjusted accordingly. If the regular expression or string used
5258 * as the index doesn't match a position in the string, IndexError is
5259 * raised. If the regular expression form is used, the optional
5260 * second Integer allows you to specify which portion of the match to
5261 * replace (effectively using the MatchData indexing rules). The forms
5262 * that take an Integer will raise an IndexError if the value is out
5263 * of range; the Range form will raise a RangeError, and the Regexp
5264 * and String will raise an IndexError on negative match.
/*
 * Entry point for String#[]=. In the three-argument form a Regexp first
 * argument goes to rb_str_subpat_set(), and any other first argument is
 * treated as a start/length pair for rb_str_splice(). Otherwise the
 * arity is checked (2..3) and rb_str_aset() handles argv[0] and argv[1].
 * NOTE(review): the test that selects the three-argument form is not
 * visible here -- presumably argc == 3.
 */
5268 rb_str_aset_m(int argc
, VALUE
*argv
, VALUE str
)
5271 if (RB_TYPE_P(argv
[0], T_REGEXP
)) {
5272 rb_str_subpat_set(str
, argv
[0], argv
[1], argv
[2]);
5275 rb_str_splice(str
, NUM2LONG(argv
[0]), NUM2LONG(argv
[1]), argv
[2]);
5279 rb_check_arity(argc
, 2, 3);
5280 return rb_str_aset(str
, argv
[0], argv
[1]);
5285 * insert(index, other_string) -> self
5287 * Inserts the given +other_string+ into +self+; returns +self+.
5289 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5291 * 'foo'.insert(1, 'bar') # => "fbaroo"
5293 * If the \Integer +index+ is negative, counts backward from the end of +self+
5294 * and inserts +other_string+ at offset <tt>index+1</tt>
5295 * (that is, _after_ <tt>self[index]</tt>):
5297 * 'foo'.insert(-2, 'bar') # => "fobaro"
/*
 * Implements String#insert: converts +idx+ with NUM2LONG and inserts
 * +str2+ by splicing it over zero characters at that position. One
 * position is handled by appending with rb_str_append() instead
 * (NOTE(review): the selecting condition is not visible here).
 */
5302 rb_str_insert(VALUE str
, VALUE idx
, VALUE str2
)
5304 long pos
= NUM2LONG(idx
);
5307 return rb_str_append(str
, str2
);
5312 rb_str_splice(str
, pos
, 0, str2
);
5319 * slice!(index) -> new_string or nil
5320 * slice!(start, length) -> new_string or nil
5321 * slice!(range) -> new_string or nil
5322 * slice!(regexp, capture = 0) -> new_string or nil
5323 * slice!(substring) -> new_string or nil
5325 * Removes the substring of +self+ specified by the arguments;
5326 * returns the removed substring.
5328 * See String#[] for details about the arguments that specify the substring.
5332 * string = "This is a string"
5333 * string.slice!(2) #=> "i"
5334 * string.slice!(3..6) #=> " is "
5335 * string.slice!(/s.*t/) #=> "sa st"
5336 * string.slice!("r") #=> "r"
5337 * string #=> "Thing"
/*
 * Implements String#slice!: determines the byte range selected by the
 * arguments, builds the result string from those bytes, and removes
 * them from +str+. Selections handled: Regexp (optional capture in
 * argv[1]), start/length pair, Fixnum index, String, and otherwise
 * Range or a value converted with NUM2LONG. Returns Qnil when the
 * selection does not exist in +str+. Accepts 1 or 2 arguments.
 */
5342 rb_str_slice_bang(int argc
, VALUE
*argv
, VALUE str
)
5344 VALUE result
= Qnil
;
5349 rb_check_arity(argc
, 1, 2);
5350 str_modify_keep_cr(str
);
5352 if (RB_TYPE_P(indx
, T_REGEXP
)) {
5353 if (rb_reg_search(indx
, str
, 0, 0) < 0) return Qnil
;
5354 VALUE match
= rb_backref_get();
5355 struct re_registers
*regs
= RMATCH_REGS(match
);
/* a negative capture number is offset by the number of registers */
5357 if (argc
> 1 && (nth
= rb_reg_backref_number(match
, argv
[1])) < 0) {
5358 if ((nth
+= regs
->num_regs
) <= 0) return Qnil
;
5360 else if (nth
>= regs
->num_regs
) return Qnil
;
5362 len
= END(nth
) - beg
;
5365 else if (argc
== 2) {
5366 beg
= NUM2LONG(indx
);
5367 len
= NUM2LONG(argv
[1]);
5370 else if (FIXNUM_P(indx
)) {
5371 beg
= FIX2LONG(indx
);
5372 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5373 if (!len
) return Qnil
;
5374 beg
= p
- RSTRING_PTR(str
);
5377 else if (RB_TYPE_P(indx
, T_STRING
)) {
5378 beg
= rb_str_index(str
, indx
, 0);
5379 if (beg
== -1) return Qnil
;
5380 len
= RSTRING_LEN(indx
);
/* for a String index the result is a copy of the index itself */
5381 result
= str_duplicate(rb_cString
, indx
);
5385 switch (rb_range_beg_len(indx
, &beg
, &len
, str_strlen(str
, NULL
), 0)) {
5389 beg
= NUM2LONG(indx
);
5390 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5391 if (!len
) return Qnil
;
5392 beg
= p
- RSTRING_PTR(str
);
/* convert the character position to a byte offset */
5400 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5401 beg
= p
- RSTRING_PTR(str
);
/* build the result from the selected bytes, keeping encoding information */
5404 result
= rb_str_new(RSTRING_PTR(str
)+beg
, len
);
5405 rb_enc_cr_str_copy_for_substr(result
, str
);
5410 rb_str_drop_bytes(str
, len
);
5413 char *sptr
= RSTRING_PTR(str
);
5414 long slen
= RSTRING_LEN(str
);
5415 if (beg
+ len
> slen
) /* pathological check */
5419 slen
- (beg
+ len
));
5421 STR_SET_LEN(str
, slen
);
5422 TERM_FILL(&sptr
[slen
], TERM_LEN(str
));
/*
 * NOTE(review): the function header is not visible at this point; this body
 * presumably belongs to get_pat(), which rb_str_match_m_p() calls --
 * confirm.
 *
 * Converts +pat+ to a Regexp: the value is checked with
 * rb_check_string_type(), a non-Regexp is rejected with Check_Type(), and
 * the final line returns the compiled form via rb_reg_regcomp().
 */
5433 switch (OBJ_BUILTIN_TYPE(pat
)) {
5441 val
= rb_check_string_type(pat
);
5443 Check_Type(pat
, T_REGEXP
);
5448 return rb_reg_regcomp(pat
);
/*
 * Normalizes a pattern argument for the substitution methods: +pat+ is
 * checked with rb_check_string_type(), and Check_Type() rejects values
 * that are not a Regexp. When +check+ is nonzero and +pat+ is a broken
 * string, the exception produced by rb_reg_check_preprocess() is raised.
 */
5452 get_pat_quoted(VALUE pat
, int check
)
5456 switch (OBJ_BUILTIN_TYPE(pat
)) {
5464 val
= rb_check_string_type(pat
);
5466 Check_Type(pat
, T_REGEXP
);
5470 if (check
&& is_broken_string(pat
)) {
5471 rb_exc_raise(rb_reg_check_preprocess(pat
));
/*
 * Searches +str+ for +pat+ starting at +pos+. A String pattern is located
 * with rb_strseq_index(); when set_backref_str is nonzero the backref is
 * then updated, either with rb_backref_set_string() on a frozen copy of
 * +str+ or with rb_backref_set(Qnil). Any other pattern is searched with
 * rb_reg_search0().
 */
5477 rb_pat_search(VALUE pat
, VALUE str
, long pos
, int set_backref_str
)
5479 if (BUILTIN_TYPE(pat
) == T_STRING
) {
5480 pos
= rb_strseq_index(str
, pat
, pos
, 1);
5481 if (set_backref_str
) {
5483 str
= rb_str_new_frozen_String(str
);
5484 rb_backref_set_string(str
, pos
, RSTRING_LEN(pat
));
5487 rb_backref_set(Qnil
);
5493 return rb_reg_search0(pat
, str
, pos
, 0, set_backref_str
);
5500 * sub!(pattern, replacement) -> self or nil
5501 * sub!(pattern) {|match| ... } -> self or nil
5503 * Returns +self+ with only the first occurrence
5504 * (not all occurrences) of the given +pattern+ replaced.
5506 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5508 * Related: String#sub, String#gsub, String#gsub!.
/*
 * Implements String#sub!: replaces the first match of the pattern
 * (argv[0]) in +str+, in place. The replacement text comes from the
 * block (rb_yield), from a Hash lookup on the matched text, or from a
 * replacement string expanded with rb_reg_regsub(). One argument is
 * enough when a block is given; otherwise two are required.
 * Raises Encoding::CompatibilityError when the replacement's encoding
 * is incompatible and the text around the match is not 7-bit.
 */
5513 rb_str_sub_bang(int argc
, VALUE
*argv
, VALUE str
)
5515 VALUE pat
, repl
, hash
= Qnil
;
5518 int min_arity
= rb_block_given_p() ? 1 : 2;
5521 rb_check_arity(argc
, min_arity
, 2);
5527 hash
= rb_check_hash_type(argv
[1]);
5533 pat
= get_pat_quoted(argv
[0], 1);
5535 str_modifiable(str
);
5536 beg
= rb_pat_search(pat
, str
, 0, 1);
5539 int cr
= ENC_CODERANGE(str
);
5541 VALUE match
, match0
= Qnil
;
5542 struct re_registers
*regs
;
5546 match
= rb_backref_get();
5547 regs
= RMATCH_REGS(match
);
/* for a String pattern the match ends RSTRING_LEN(pat) bytes after beg0 */
5548 if (RB_TYPE_P(pat
, T_STRING
)) {
5550 end0
= beg0
+ RSTRING_LEN(pat
);
5556 if (iter
) match0
= rb_reg_nth_match(0, match
);
5559 if (iter
|| !NIL_P(hash
)) {
/* remember pointer and length so that str_mod_check() can detect changes to str made while user code ran */
5560 p
= RSTRING_PTR(str
); len
= RSTRING_LEN(str
);
5563 repl
= rb_obj_as_string(rb_yield(match0
));
5566 repl
= rb_hash_aref(hash
, rb_str_subseq(str
, beg0
, end0
- beg0
));
5567 repl
= rb_obj_as_string(repl
);
5569 str_mod_check(str
, p
, len
);
5570 rb_check_frozen(str
);
5573 repl
= rb_reg_regsub(repl
, str
, regs
, RB_TYPE_P(pat
, T_STRING
) ? Qnil
: pat
);
5576 enc
= rb_enc_compatible(str
, repl
);
5578 rb_encoding
*str_enc
= STR_ENC_GET(str
);
5579 p
= RSTRING_PTR(str
); len
= RSTRING_LEN(str
);
/* the text before and after the match must be 7-bit to adopt the replacement's encoding */
5580 if (coderange_scan(p
, beg0
, str_enc
) != ENC_CODERANGE_7BIT
||
5581 coderange_scan(p
+end0
, len
-end0
, str_enc
) != ENC_CODERANGE_7BIT
) {
5582 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
5583 rb_enc_name(str_enc
),
5584 rb_enc_name(STR_ENC_GET(repl
)));
5586 enc
= STR_ENC_GET(repl
);
5589 rb_enc_associate(str
, enc
);
5590 if (ENC_CODERANGE_UNKNOWN
< cr
&& cr
< ENC_CODERANGE_BROKEN
) {
5591 int cr2
= ENC_CODERANGE(repl
);
5592 if (cr2
== ENC_CODERANGE_BROKEN
||
5593 (cr
== ENC_CODERANGE_VALID
&& cr2
== ENC_CODERANGE_7BIT
))
5594 cr
= ENC_CODERANGE_UNKNOWN
;
/* splice: resize, move the tail, then copy the replacement bytes in */
5599 rlen
= RSTRING_LEN(repl
);
5600 len
= RSTRING_LEN(str
);
5602 RESIZE_CAPA(str
, len
+ rlen
- plen
);
5604 p
= RSTRING_PTR(str
);
5606 memmove(p
+ beg0
+ rlen
, p
+ beg0
+ plen
, len
- beg0
- plen
);
5608 rp
= RSTRING_PTR(repl
);
5609 memmove(p
+ beg0
, rp
, rlen
);
5611 STR_SET_LEN(str
, len
);
5612 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
5613 ENC_CODERANGE_SET(str
, cr
);
5623 * sub(pattern, replacement) -> new_string
5624 * sub(pattern) {|match| ... } -> new_string
5626 * Returns a copy of +self+ with only the first occurrence
5627 * (not all occurrences) of the given +pattern+ replaced.
5629 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5631 * Related: String#sub!, String#gsub, String#gsub!.
/*
 * Implements String#sub: duplicates +str+ as a String and applies
 * rb_str_sub_bang() to the duplicate, leaving the receiver untouched.
 */
5636 rb_str_sub(int argc
, VALUE
*argv
, VALUE str
)
5638 str
= str_duplicate(rb_cString
, str
);
5639 rb_str_sub_bang(argc
, argv
, str
);
/*
 * Shared implementation of String#gsub and String#gsub! (+bang+ is
 * nonzero for the latter, as passed by rb_str_gsub_bang()).
 * The result is assembled in a separate buffer +dest+: for every match,
 * the text before it is copied, followed by the replacement, which comes
 * from the block, a Hash lookup, or a replacement string expanded with
 * rb_reg_regsub(). Without any match, the bang form returns Qnil and
 * the other form returns a duplicate of +str+.
 * Accepts 1 or 2 arguments (rb_error_arity otherwise).
 */
5644 str_gsub(int argc
, VALUE
*argv
, VALUE str
, int bang
)
5646 VALUE pat
, val
= Qnil
, repl
, match
, match0
= Qnil
, dest
, hash
= Qnil
;
5647 struct re_registers
*regs
;
5648 long beg
, beg0
, end0
;
5649 long offset
, blen
, slen
, len
, last
;
5650 enum {STR
, ITER
, MAP
} mode
= STR
;
/* -1: not yet known whether the replacement string refers to the match */
5652 int need_backref
= -1;
5653 rb_encoding
*str_enc
;
5657 RETURN_ENUMERATOR(str
, argc
, argv
);
5662 hash
= rb_check_hash_type(argv
[1]);
5671 rb_error_arity(argc
, 1, 2);
5674 pat
= get_pat_quoted(argv
[0], 1);
5675 beg
= rb_pat_search(pat
, str
, 0, need_backref
);
5677 if (bang
) return Qnil
; /* no match, no substitution */
5678 return str_duplicate(rb_cString
, str
);
5682 blen
= RSTRING_LEN(str
) + 30; /* len + margin */
5683 dest
= rb_str_buf_new(blen
);
/* remember pointer and length so that str_mod_check() can detect changes to str made while user code ran */
5684 sp
= RSTRING_PTR(str
);
5685 slen
= RSTRING_LEN(str
);
5687 str_enc
= STR_ENC_GET(str
);
5688 rb_enc_associate(dest
, str_enc
);
5689 ENC_CODERANGE_SET(dest
, rb_enc_asciicompat(str_enc
) ? ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
);
5692 match
= rb_backref_get();
5693 regs
= RMATCH_REGS(match
);
/* for a String pattern the match ends RSTRING_LEN(pat) bytes after beg0 */
5694 if (RB_TYPE_P(pat
, T_STRING
)) {
5696 end0
= beg0
+ RSTRING_LEN(pat
);
5702 if (mode
== ITER
) match0
= rb_reg_nth_match(0, match
);
5707 val
= rb_obj_as_string(rb_yield(match0
));
5710 val
= rb_hash_aref(hash
, rb_str_subseq(str
, beg0
, end0
- beg0
));
5711 val
= rb_obj_as_string(val
);
5713 str_mod_check(str
, sp
, slen
);
5714 if (val
== dest
) { /* paranoid check [ruby-dev:24827] */
5715 rb_raise(rb_eRuntimeError
, "block should not cheat");
5718 else if (need_backref
) {
5719 val
= rb_reg_regsub(repl
, str
, regs
, RB_TYPE_P(pat
, T_STRING
) ? Qnil
: pat
);
/* after the first expansion, record whether rb_reg_regsub() returned something other than repl */
5720 if (need_backref
< 0) {
5721 need_backref
= val
!= repl
;
5728 len
= beg0
- offset
; /* copy pre-match substr */
5730 rb_enc_str_buf_cat(dest
, cp
, len
, str_enc
);
5733 rb_str_buf_append(dest
, val
);
5739 * Always consume at least one character of the input string
5740 * in order to prevent infinite loops.
5742 if (RSTRING_LEN(str
) <= end0
) break;
5743 len
= rb_enc_fast_mbclen(RSTRING_PTR(str
)+end0
, RSTRING_END(str
), str_enc
);
5744 rb_enc_str_buf_cat(dest
, RSTRING_PTR(str
)+end0
, len
, str_enc
);
5745 offset
= end0
+ len
;
5747 cp
= RSTRING_PTR(str
) + offset
;
5748 if (offset
> RSTRING_LEN(str
)) break;
5749 beg
= rb_pat_search(pat
, str
, offset
, need_backref
);
/* copy whatever follows the last match */
5751 if (RSTRING_LEN(str
) > offset
) {
5752 rb_enc_str_buf_cat(dest
, cp
, RSTRING_LEN(str
) - offset
, str_enc
);
5754 rb_pat_search(pat
, str
, last
, 1);
5756 str_shared_replace(str
, dest
);
5768 * gsub!(pattern, replacement) -> self or nil
5769 * gsub!(pattern) {|match| ... } -> self or nil
5770 * gsub!(pattern) -> an_enumerator
5772 * Performs the specified substring replacement(s) on +self+;
5773 * returns +self+ if any replacement occurred, +nil+ otherwise.
5775 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5777 * Returns an Enumerator if no +replacement+ and no block given.
5779 * Related: String#sub, String#gsub, String#sub!.
5784 rb_str_gsub_bang(int argc
, VALUE
*argv
, VALUE str
)
5786 str_modify_keep_cr(str
);
5787 return str_gsub(argc
, argv
, str
, 1);
5793 * gsub(pattern, replacement) -> new_string
5794 * gsub(pattern) {|match| ... } -> new_string
5795 * gsub(pattern) -> enumerator
5797 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5799 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5801 * Returns an Enumerator if neither +replacement+ nor a block is given.
5803 * Related: String#sub, String#sub!, String#gsub!.
5808 rb_str_gsub(int argc
, VALUE
*argv
, VALUE str
)
5810 return str_gsub(argc
, argv
, str
, 0);
5816 * replace(other_string) -> self
5818 * Replaces the contents of +self+ with the contents of +other_string+:
5820 * s = 'foo' # => "foo"
5821 * s.replace('bar') # => "bar"
5826 rb_str_replace(VALUE str
, VALUE str2
)
5828 str_modifiable(str
);
5829 if (str
== str2
) return str
;
5833 return str_replace(str
, str2
);
5840 * Removes the contents of +self+:
5842 * s = 'foo' # => "foo"
5848 rb_str_clear(VALUE str
)
5852 STR_SET_EMBED_LEN(str
, 0);
5853 RSTRING_PTR(str
)[0] = 0;
5854 if (rb_enc_asciicompat(STR_ENC_GET(str
)))
5855 ENC_CODERANGE_SET(str
, ENC_CODERANGE_7BIT
);
5857 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
5865 * Returns a string containing the first character of +self+:
5867 * s = 'foo' # => "foo"
5873 rb_str_chr(VALUE str
)
5875 return rb_str_substr(str
, 0, 1);
5880 * getbyte(index) -> integer
5882 * Returns the byte at zero-based +index+ as an integer:
5884 * s = 'abcde' # => "abcde"
5885 * s.getbyte(0) # => 97
5886 * s.getbyte(1) # => 98
5888 * Related: String#setbyte.
5891 rb_str_getbyte(VALUE str
, VALUE index
)
5893 long pos
= NUM2LONG(index
);
5896 pos
+= RSTRING_LEN(str
);
5897 if (pos
< 0 || RSTRING_LEN(str
) <= pos
)
5900 return INT2FIX((unsigned char)RSTRING_PTR(str
)[pos
]);
5905 * setbyte(index, integer) -> integer
5907 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5909 * s = 'abcde' # => "abcde"
5910 * s.setbyte(0, 98) # => 98
5913 * Related: String#getbyte.
5916 rb_str_setbyte(VALUE str
, VALUE index
, VALUE value
)
5918 long pos
= NUM2LONG(index
);
5919 long len
= RSTRING_LEN(str
);
5920 char *ptr
, *head
, *left
= 0;
5922 int cr
= ENC_CODERANGE_UNKNOWN
, width
, nlen
;
5924 if (pos
< -len
|| len
<= pos
)
5925 rb_raise(rb_eIndexError
, "index %ld out of string", pos
);
5929 VALUE v
= rb_to_int(value
);
5930 VALUE w
= rb_int_and(v
, INT2FIX(0xff));
5931 char byte
= (char)(NUM2INT(w
) & 0xFF);
5933 if (!str_independent(str
))
5934 str_make_independent(str
);
5935 enc
= STR_ENC_GET(str
);
5936 head
= RSTRING_PTR(str
);
5938 if (!STR_EMBED_P(str
)) {
5939 cr
= ENC_CODERANGE(str
);
5941 case ENC_CODERANGE_7BIT
:
5944 if (ISASCII(byte
)) goto end
;
5945 nlen
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5946 if (!MBCLEN_CHARFOUND_P(nlen
))
5947 ENC_CODERANGE_SET(str
, ENC_CODERANGE_BROKEN
);
5949 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
5951 case ENC_CODERANGE_VALID
:
5952 left
= rb_enc_left_char_head(head
, ptr
, head
+len
, enc
);
5953 width
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5955 nlen
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5956 if (!MBCLEN_CHARFOUND_P(nlen
))
5957 ENC_CODERANGE_SET(str
, ENC_CODERANGE_BROKEN
);
5958 else if (MBCLEN_CHARFOUND_LEN(nlen
) != width
|| ISASCII(byte
))
5959 ENC_CODERANGE_CLEAR(str
);
5963 ENC_CODERANGE_CLEAR(str
);
5971 str_byte_substr(VALUE str
, long beg
, long len
, int empty
)
5973 char *p
, *s
= RSTRING_PTR(str
);
5974 long n
= RSTRING_LEN(str
);
5977 if (beg
> n
|| len
< 0) return Qnil
;
5980 if (beg
< 0) return Qnil
;
5985 if (!empty
) return Qnil
;
5992 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) && SHARABLE_SUBSTRING_P(beg
, len
, n
)) {
5993 str2
= rb_str_new_frozen(str
);
5994 str2
= str_new_shared(rb_cString
, str2
);
5995 RSTRING(str2
)->as
.heap
.ptr
+= beg
;
5996 RSTRING(str2
)->as
.heap
.len
= len
;
5999 str2
= rb_str_new(p
, len
);
6002 str_enc_copy(str2
, str
);
6004 if (RSTRING_LEN(str2
) == 0) {
6005 if (!rb_enc_asciicompat(STR_ENC_GET(str
)))
6006 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_VALID
);
6008 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_7BIT
);
6011 switch (ENC_CODERANGE(str
)) {
6012 case ENC_CODERANGE_7BIT
:
6013 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_7BIT
);
6016 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_UNKNOWN
);
6025 str_byte_aref(VALUE str
, VALUE indx
)
6028 if (FIXNUM_P(indx
)) {
6029 idx
= FIX2LONG(indx
);
6032 /* check if indx is Range */
6033 long beg
, len
= RSTRING_LEN(str
);
6035 switch (rb_range_beg_len(indx
, &beg
, &len
, len
, 0)) {
6041 return str_byte_substr(str
, beg
, len
, TRUE
);
6044 idx
= NUM2LONG(indx
);
6046 return str_byte_substr(str
, idx
, 1, FALSE
);
6051 * byteslice(index, length = 1) -> string or nil
6052 * byteslice(range) -> string or nil
6054 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6056 * With integer arguments +index+ and +length+ given,
6057 * returns the substring beginning at the given +index+
6058 * of the given +length+ (if possible),
6059 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6061 * s = '0123456789' # => "0123456789"
6062 * s.byteslice(2) # => "2"
6063 * s.byteslice(200) # => nil
6064 * s.byteslice(4, 3) # => "456"
6065 * s.byteslice(4, 30) # => "456789"
6066 * s.byteslice(4, -1) # => nil
6067 * s.byteslice(40, 2) # => nil
6069 * In either case above, counts backwards from the end of +self+
6070 * if +index+ is negative:
6072 * s = '0123456789' # => "0123456789"
6073 * s.byteslice(-4) # => "6"
6074 * s.byteslice(-4, 3) # => "678"
6076 * With Range argument +range+ given, returns
6077 * <tt>byteslice(range.begin, range.size)</tt>:
6079 * s = '0123456789' # => "0123456789"
6080 * s.byteslice(4..6) # => "456"
6081 * s.byteslice(-6..-4) # => "456"
6082 * s.byteslice(5..2) # => "" # range.size is zero.
6083 * s.byteslice(40..42) # => nil
6085 * In all cases, a returned string has the same encoding as +self+:
6087 * s.encoding # => #<Encoding:UTF-8>
6088 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6093 rb_str_byteslice(int argc
, VALUE
*argv
, VALUE str
)
6096 long beg
= NUM2LONG(argv
[0]);
6097 long end
= NUM2LONG(argv
[1]);
6098 return str_byte_substr(str
, beg
, end
, TRUE
);
6100 rb_check_arity(argc
, 1, 2);
6101 return str_byte_aref(str
, argv
[0]);
6108 * Returns a new string with the characters from +self+ in reverse order.
6110 * 'stressed'.reverse # => "desserts"
6115 rb_str_reverse(VALUE str
)
6122 if (RSTRING_LEN(str
) <= 1) return str_duplicate(rb_cString
, str
);
6123 enc
= STR_ENC_GET(str
);
6124 rev
= rb_str_new(0, RSTRING_LEN(str
));
6125 s
= RSTRING_PTR(str
); e
= RSTRING_END(str
);
6126 p
= RSTRING_END(rev
);
6127 cr
= ENC_CODERANGE(str
);
6129 if (RSTRING_LEN(str
) > 1) {
6130 if (single_byte_optimizable(str
)) {
6135 else if (cr
== ENC_CODERANGE_VALID
) {
6137 int clen
= rb_enc_fast_mbclen(s
, e
, enc
);
6145 cr
= rb_enc_asciicompat(enc
) ?
6146 ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
;
6148 int clen
= rb_enc_mbclen(s
, e
, enc
);
6150 if (clen
> 1 || (*s
& 0x80)) cr
= ENC_CODERANGE_UNKNOWN
;
6157 STR_SET_LEN(rev
, RSTRING_LEN(str
));
6158 str_enc_copy(rev
, str
);
6159 ENC_CODERANGE_SET(rev
, cr
);
6169 * Returns +self+ with its characters reversed:
6172 * s.reverse! # => "desserts"
6178 rb_str_reverse_bang(VALUE str
)
6180 if (RSTRING_LEN(str
) > 1) {
6181 if (single_byte_optimizable(str
)) {
6184 str_modify_keep_cr(str
);
6185 s
= RSTRING_PTR(str
);
6186 e
= RSTRING_END(str
) - 1;
6194 str_shared_replace(str
, rb_str_reverse(str
));
6198 str_modify_keep_cr(str
);
6206 * include? other_string -> true or false
6208 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6211 * s.include?('f') # => true
6212 * s.include?('fo') # => true
6213 * s.include?('food') # => false
6218 rb_str_include(VALUE str
, VALUE arg
)
6223 i
= rb_str_index(str
, arg
, 0);
6225 return RBOOL(i
!= -1);
6231 * to_i(base = 10) -> integer
6233 * Returns the result of interpreting leading characters in +self+
6234 * as an integer in the given +base+ (which must be in (2..36)):
6236 * '123456'.to_i # => 123456
6237 * '123def'.to_i(16) # => 1195503
6239 * Characters past a leading valid number (in the given +base+) are ignored:
6241 * '12.345'.to_i # => 12
6242 * '12345'.to_i(2) # => 1
6244 * Returns zero if there is no leading valid number:
6246 * 'abcdef'.to_i # => 0
6247 * '2'.to_i(2) # => 0
6252 rb_str_to_i(int argc
, VALUE
*argv
, VALUE str
)
6256 if (rb_check_arity(argc
, 0, 1) && (base
= NUM2INT(argv
[0])) < 0) {
6257 rb_raise(rb_eArgError
, "invalid radix %d", base
);
6259 return rb_str_to_inum(str
, base
, FALSE
);
6267 * Returns the result of interpreting leading characters in +self+ as a Float:
6269 * '3.14159'.to_f # => 3.14159
6270 * '1.234e-2'.to_f # => 0.01234
6272 * Characters past a leading valid number are ignored:
6274 * '3.14 (pi to two places)'.to_f # => 3.14
6276 * Returns zero if there is no leading valid number:
6278 * 'abcdef'.to_f # => 0.0
6283 rb_str_to_f(VALUE str
)
6285 return DBL2NUM(rb_str_to_dbl(str
, FALSE
));
6291 * to_s -> self or string
6293 * Returns +self+ if +self+ is a \String,
6294 * or +self+ converted to a \String if +self+ is a subclass of \String.
6296 * String#to_str is an alias for String#to_s.
6301 rb_str_to_s(VALUE str
)
6303 if (rb_obj_class(str
) != rb_cString
) {
6304 return str_duplicate(rb_cString
, str
);
6311 str_cat_char(VALUE str
, unsigned int c
, rb_encoding
*enc
)
6313 char s
[RUBY_MAX_CHAR_LEN
];
6314 int n
= rb_enc_codelen(c
, enc
);
6316 rb_enc_mbcput(c
, s
, enc
);
6317 rb_enc_str_buf_cat(str
, s
, n
, enc
);
6321 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6324 rb_str_buf_cat_escaped_char(VALUE result
, unsigned int c
, int unicode_p
)
6326 char buf
[CHAR_ESC_LEN
+ 1];
6333 if (c
< 0x7F && ISPRINT(c
)) {
6334 snprintf(buf
, CHAR_ESC_LEN
, "%c", c
);
6336 else if (c
< 0x10000) {
6337 snprintf(buf
, CHAR_ESC_LEN
, "\\u%04X", c
);
6340 snprintf(buf
, CHAR_ESC_LEN
, "\\u{%X}", c
);
6345 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", c
);
6348 snprintf(buf
, CHAR_ESC_LEN
, "\\x{%X}", c
);
6351 l
= (int)strlen(buf
); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6352 rb_str_buf_cat(result
, buf
, l
);
6357 ruby_escaped_char(int c
)
6360 case '\0': return "\\0";
6361 case '\n': return "\\n";
6362 case '\r': return "\\r";
6363 case '\t': return "\\t";
6364 case '\f': return "\\f";
6365 case '\013': return "\\v";
6366 case '\010': return "\\b";
6367 case '\007': return "\\a";
6368 case '\033': return "\\e";
6369 case '\x7f': return "\\c?";
6375 rb_str_escape(VALUE str
)
6377 int encidx
= ENCODING_GET(str
);
6378 rb_encoding
*enc
= rb_enc_from_index(encidx
);
6379 const char *p
= RSTRING_PTR(str
);
6380 const char *pend
= RSTRING_END(str
);
6381 const char *prev
= p
;
6382 char buf
[CHAR_ESC_LEN
+ 1];
6383 VALUE result
= rb_str_buf_new(0);
6384 int unicode_p
= rb_enc_unicode_p(enc
);
6385 int asciicompat
= rb_enc_asciicompat(enc
);
6390 int n
= rb_enc_precise_mbclen(p
, pend
, enc
);
6391 if (!MBCLEN_CHARFOUND_P(n
)) {
6392 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6393 n
= rb_enc_mbminlen(enc
);
6395 n
= (int)(pend
- p
);
6397 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", *p
& 0377);
6398 str_buf_cat(result
, buf
, strlen(buf
));
6403 n
= MBCLEN_CHARFOUND_LEN(n
);
6404 c
= rb_enc_mbc_to_codepoint(p
, pend
, enc
);
6406 cc
= ruby_escaped_char(c
);
6408 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6409 str_buf_cat(result
, cc
, strlen(cc
));
6412 else if (asciicompat
&& rb_enc_isascii(c
, enc
) && ISPRINT(c
)) {
6415 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6416 rb_str_buf_cat_escaped_char(result
, c
, unicode_p
);
6420 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6421 ENCODING_CODERANGE_SET(result
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
6430 * Returns a printable version of +self+, enclosed in double-quotes,
6431 * and with special characters escaped:
6433 * s = "foo\tbar\tbaz\n"
6434 * # => "foo\tbar\tbaz\n"
6436 * # => "\"foo\\tbar\\tbaz\\n\""
6441 rb_str_inspect(VALUE str
)
6443 int encidx
= ENCODING_GET(str
);
6444 rb_encoding
*enc
= rb_enc_from_index(encidx
), *actenc
;
6445 const char *p
, *pend
, *prev
;
6446 char buf
[CHAR_ESC_LEN
+ 1];
6447 VALUE result
= rb_str_buf_new(0);
6448 rb_encoding
*resenc
= rb_default_internal_encoding();
6449 int unicode_p
= rb_enc_unicode_p(enc
);
6450 int asciicompat
= rb_enc_asciicompat(enc
);
6452 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
6453 if (!rb_enc_asciicompat(resenc
)) resenc
= rb_usascii_encoding();
6454 rb_enc_associate(result
, resenc
);
6455 str_buf_cat2(result
, "\"");
6457 p
= RSTRING_PTR(str
); pend
= RSTRING_END(str
);
6459 actenc
= get_actual_encoding(encidx
, str
);
6460 if (actenc
!= enc
) {
6462 if (unicode_p
) unicode_p
= rb_enc_unicode_p(enc
);
6468 n
= rb_enc_precise_mbclen(p
, pend
, enc
);
6469 if (!MBCLEN_CHARFOUND_P(n
)) {
6470 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6471 n
= rb_enc_mbminlen(enc
);
6473 n
= (int)(pend
- p
);
6475 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", *p
& 0377);
6476 str_buf_cat(result
, buf
, strlen(buf
));
6481 n
= MBCLEN_CHARFOUND_LEN(n
);
6482 c
= rb_enc_mbc_to_codepoint(p
, pend
, enc
);
6484 if ((asciicompat
|| unicode_p
) &&
6485 (c
== '"'|| c
== '\\' ||
6488 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p
,pend
,enc
)) &&
6489 (cc
= rb_enc_codepoint(p
,pend
,enc
),
6490 (cc
== '$' || cc
== '@' || cc
== '{'))))) {
6491 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6492 str_buf_cat2(result
, "\\");
6493 if (asciicompat
|| enc
== resenc
) {
6499 case '\n': cc
= 'n'; break;
6500 case '\r': cc
= 'r'; break;
6501 case '\t': cc
= 't'; break;
6502 case '\f': cc
= 'f'; break;
6503 case '\013': cc
= 'v'; break;
6504 case '\010': cc
= 'b'; break;
6505 case '\007': cc
= 'a'; break;
6506 case 033: cc
= 'e'; break;
6507 default: cc
= 0; break;
6510 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6513 str_buf_cat(result
, buf
, 2);
6517 if ((enc
== resenc
&& rb_enc_isprint(c
, enc
)) ||
6518 (asciicompat
&& rb_enc_isascii(c
, enc
) && ISPRINT(c
))) {
6522 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6523 rb_str_buf_cat_escaped_char(result
, c
, unicode_p
);
6528 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6529 str_buf_cat2(result
, "\"");
6534 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6540 * Returns a printable version of +self+, enclosed in double-quotes,
6541 * with special characters escaped, and with non-printing characters
6542 * replaced by hexadecimal notation:
6544 * "hello \n ''".dump # => "\"hello \\n ''\""
6545 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6547 * Related: String#undump (inverse of String#dump).
6552 rb_str_dump(VALUE str
)
6554 int encidx
= rb_enc_get_index(str
);
6555 rb_encoding
*enc
= rb_enc_from_index(encidx
);
6557 const char *p
, *pend
;
6560 int u8
= (encidx
== rb_utf8_encindex());
6561 static const char nonascii_suffix
[] = ".dup.force_encoding(\"%s\")";
6564 if (!rb_enc_asciicompat(enc
)) {
6565 len
+= strlen(nonascii_suffix
) - rb_strlen_lit("%s");
6566 len
+= strlen(enc
->name
);
6569 p
= RSTRING_PTR(str
); pend
= p
+ RSTRING_LEN(str
);
6572 unsigned char c
= *p
++;
6575 case '"': case '\\':
6576 case '\n': case '\r':
6577 case '\t': case '\f':
6578 case '\013': case '\010': case '\007': case '\033':
6583 clen
= IS_EVSTR(p
, pend
) ? 2 : 1;
6591 if (u8
&& c
> 0x7F) { /* \u notation */
6592 int n
= rb_enc_precise_mbclen(p
-1, pend
, enc
);
6593 if (MBCLEN_CHARFOUND_P(n
)) {
6594 unsigned int cc
= rb_enc_mbc_to_codepoint(p
-1, pend
, enc
);
6596 clen
= 6; /* \uXXXX */
6597 else if (cc
<= 0xFFFFF)
6598 clen
= 9; /* \u{XXXXX} */
6600 clen
= 10; /* \u{XXXXXX} */
6601 p
+= MBCLEN_CHARFOUND_LEN(n
)-1;
6605 clen
= 4; /* \xNN */
6610 if (clen
> LONG_MAX
- len
) {
6611 rb_raise(rb_eRuntimeError
, "string size too big");
6616 result
= rb_str_new(0, len
);
6617 p
= RSTRING_PTR(str
); pend
= p
+ RSTRING_LEN(str
);
6618 q
= RSTRING_PTR(result
); qend
= q
+ len
+ 1;
6622 unsigned char c
= *p
++;
6624 if (c
== '"' || c
== '\\') {
6628 else if (c
== '#') {
6629 if (IS_EVSTR(p
, pend
)) *q
++ = '\\';
6632 else if (c
== '\n') {
6636 else if (c
== '\r') {
6640 else if (c
== '\t') {
6644 else if (c
== '\f') {
6648 else if (c
== '\013') {
6652 else if (c
== '\010') {
6656 else if (c
== '\007') {
6660 else if (c
== '\033') {
6664 else if (ISPRINT(c
)) {
6670 int n
= rb_enc_precise_mbclen(p
-1, pend
, enc
) - 1;
6671 if (MBCLEN_CHARFOUND_P(n
)) {
6672 int cc
= rb_enc_mbc_to_codepoint(p
-1, pend
, enc
);
6675 snprintf(q
, qend
-q
, "u%04X", cc
); /* \uXXXX */
6677 snprintf(q
, qend
-q
, "u{%X}", cc
); /* \u{XXXXX} or \u{XXXXXX} */
6682 snprintf(q
, qend
-q
, "x%02X", c
);
6688 if (!rb_enc_asciicompat(enc
)) {
6689 snprintf(q
, qend
-q
, nonascii_suffix
, enc
->name
);
6690 encidx
= rb_ascii8bit_encindex();
6692 /* result from dump is ASCII */
6693 rb_enc_associate_index(result
, encidx
);
6694 ENC_CODERANGE_SET(result
, ENC_CODERANGE_7BIT
);
6699 unescape_ascii(unsigned int c
)
6719 UNREACHABLE_RETURN(-1);
6723 undump_after_backslash(VALUE undumped
, const char **ss
, const char *s_end
, rb_encoding
**penc
, bool *utf8
, bool *binary
)
6725 const char *s
= *ss
;
6729 unsigned char buf
[6];
6730 static rb_encoding
*enc_utf8
= NULL
;
6736 rb_str_cat(undumped
, s
, 1); /* cat itself */
6747 *buf
= unescape_ascii(*s
);
6748 rb_str_cat(undumped
, (char *)buf
, 1);
6753 rb_raise(rb_eRuntimeError
, "hex escape and Unicode escape are mixed");
6757 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6759 if (enc_utf8
== NULL
) enc_utf8
= rb_utf8_encoding();
6760 if (*penc
!= enc_utf8
) {
6762 rb_enc_associate(undumped
, enc_utf8
);
6764 if (*s
== '{') { /* handle \u{...} form */
6768 rb_raise(rb_eRuntimeError
, "unterminated Unicode escape");
6778 c
= scan_hex(s
, s_end
-s
, &hexlen
);
6779 if (hexlen
== 0 || hexlen
> 6) {
6780 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6783 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint (too large)");
6785 if (0xd800 <= c
&& c
<= 0xdfff) {
6786 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint");
6788 codelen
= rb_enc_mbcput(c
, (char *)buf
, *penc
);
6789 rb_str_cat(undumped
, (char *)buf
, codelen
);
6793 else { /* handle \uXXXX form */
6794 c
= scan_hex(s
, 4, &hexlen
);
6796 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6798 if (0xd800 <= c
&& c
<= 0xdfff) {
6799 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint");
6801 codelen
= rb_enc_mbcput(c
, (char *)buf
, *penc
);
6802 rb_str_cat(undumped
, (char *)buf
, codelen
);
6808 rb_raise(rb_eRuntimeError
, "hex escape and Unicode escape are mixed");
6812 rb_raise(rb_eRuntimeError
, "invalid hex escape");
6814 *buf
= scan_hex(s
, 2, &hexlen
);
6816 rb_raise(rb_eRuntimeError
, "invalid hex escape");
6818 rb_str_cat(undumped
, (char *)buf
, 1);
6822 rb_str_cat(undumped
, s
-1, 2);
6829 static VALUE
rb_str_is_ascii_only_p(VALUE str
);
6835 * Returns an unescaped version of +self+:
6837 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6838 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6839 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6840 * s_undumped == s_orig # => true
6842 * Related: String#dump (inverse of String#undump).
6847 str_undump(VALUE str
)
6849 const char *s
= RSTRING_PTR(str
);
6850 const char *s_end
= RSTRING_END(str
);
6851 rb_encoding
*enc
= rb_enc_get(str
);
6852 VALUE undumped
= rb_enc_str_new(s
, 0L, enc
);
6854 bool binary
= false;
6857 rb_must_asciicompat(str
);
6858 if (rb_str_is_ascii_only_p(str
) == Qfalse
) {
6859 rb_raise(rb_eRuntimeError
, "non-ASCII character detected");
6861 if (!str_null_check(str
, &w
)) {
6862 rb_raise(rb_eRuntimeError
, "string contains null byte");
6864 if (RSTRING_LEN(str
) < 2) goto invalid_format
;
6865 if (*s
!= '"') goto invalid_format
;
6867 /* strip '"' at the start */
6872 rb_raise(rb_eRuntimeError
, "unterminated dumped string");
6879 /* ascii compatible dumped string */
6883 static const char force_encoding_suffix
[] = ".force_encoding(\""; /* "\")" */
6884 static const char dup_suffix
[] = ".dup";
6885 const char *encname
;
6889 /* check separately for strings dumped by older versions */
6890 size
= sizeof(dup_suffix
) - 1;
6891 if (s_end
- s
> size
&& memcmp(s
, dup_suffix
, size
) == 0) s
+= size
;
6893 size
= sizeof(force_encoding_suffix
) - 1;
6894 if (s_end
- s
<= size
) goto invalid_format
;
6895 if (memcmp(s
, force_encoding_suffix
, size
) != 0) goto invalid_format
;
6899 rb_raise(rb_eRuntimeError
, "dumped string contained Unicode escape but used force_encoding");
6903 s
= memchr(s
, '"', s_end
-s
);
6905 if (!s
) goto invalid_format
;
6906 if (s_end
- s
!= 2) goto invalid_format
;
6907 if (s
[0] != '"' || s
[1] != ')') goto invalid_format
;
6909 encidx
= rb_enc_find_index2(encname
, (long)size
);
6911 rb_raise(rb_eRuntimeError
, "dumped string has unknown encoding name");
6913 rb_enc_associate_index(undumped
, encidx
);
6921 rb_raise(rb_eRuntimeError
, "invalid escape");
6923 undump_after_backslash(undumped
, &s
, s_end
, &enc
, &utf8
, &binary
);
6926 rb_str_cat(undumped
, s
++, 1);
6932 rb_raise(rb_eRuntimeError
, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6936 rb_str_check_dummy_enc(rb_encoding
*enc
)
6938 if (rb_enc_dummy_p(enc
)) {
6939 rb_raise(rb_eEncCompatError
, "incompatible encoding with this operation: %s",
6944 static rb_encoding
*
6945 str_true_enc(VALUE str
)
6947 rb_encoding
*enc
= STR_ENC_GET(str
);
6948 rb_str_check_dummy_enc(enc
);
6952 static OnigCaseFoldType
6953 check_case_options(int argc
, VALUE
*argv
, OnigCaseFoldType flags
)
6958 rb_raise(rb_eArgError
, "too many options");
6959 if (argv
[0]==sym_turkic
) {
6960 flags
|= ONIGENC_CASE_FOLD_TURKISH_AZERI
;
6962 if (argv
[1]==sym_lithuanian
)
6963 flags
|= ONIGENC_CASE_FOLD_LITHUANIAN
;
6965 rb_raise(rb_eArgError
, "invalid second option");
6968 else if (argv
[0]==sym_lithuanian
) {
6969 flags
|= ONIGENC_CASE_FOLD_LITHUANIAN
;
6971 if (argv
[1]==sym_turkic
)
6972 flags
|= ONIGENC_CASE_FOLD_TURKISH_AZERI
;
6974 rb_raise(rb_eArgError
, "invalid second option");
6978 rb_raise(rb_eArgError
, "too many options");
6979 else if (argv
[0]==sym_ascii
)
6980 flags
|= ONIGENC_CASE_ASCII_ONLY
;
6981 else if (argv
[0]==sym_fold
) {
6982 if ((flags
& (ONIGENC_CASE_UPCASE
|ONIGENC_CASE_DOWNCASE
)) == ONIGENC_CASE_DOWNCASE
)
6983 flags
^= ONIGENC_CASE_FOLD
|ONIGENC_CASE_DOWNCASE
;
6985 rb_raise(rb_eArgError
, "option :fold only allowed for downcasing");
6988 rb_raise(rb_eArgError
, "invalid option");
6993 case_option_single_p(OnigCaseFoldType flags
, rb_encoding
*enc
, VALUE str
)
6995 if ((flags
& ONIGENC_CASE_ASCII_ONLY
) && (enc
==rb_utf8_encoding() || rb_enc_mbmaxlen(enc
) == 1))
6997 return !(flags
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) && ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
;
7000 /* 16 should be long enough to absorb any kind of single character length increase */
7001 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7002 #ifndef CASEMAP_DEBUG
7003 # define CASEMAP_DEBUG 0
7006 struct mapping_buffer
;
7007 typedef struct mapping_buffer
{
7010 struct mapping_buffer
*next
;
7011 OnigUChar space
[FLEX_ARY_LEN
];
7015 mapping_buffer_free(void *p
)
7017 mapping_buffer
*previous_buffer
;
7018 mapping_buffer
*current_buffer
= p
;
7019 while (current_buffer
) {
7020 previous_buffer
= current_buffer
;
7021 current_buffer
= current_buffer
->next
;
7022 ruby_sized_xfree(previous_buffer
, previous_buffer
->capa
);
7026 static const rb_data_type_t mapping_buffer_type
= {
7028 {0, mapping_buffer_free
,}
7032 rb_str_casemap(VALUE source
, OnigCaseFoldType
*flags
, rb_encoding
*enc
)
7036 const OnigUChar
*source_current
, *source_end
;
7037 int target_length
= 0;
7038 VALUE buffer_anchor
;
7039 mapping_buffer
*current_buffer
= 0;
7040 mapping_buffer
**pre_buffer
;
7041 size_t buffer_count
= 0;
7042 int buffer_length_or_invalid
;
7044 if (RSTRING_LEN(source
) == 0) return str_duplicate(rb_cString
, source
);
7046 source_current
= (OnigUChar
*)RSTRING_PTR(source
);
7047 source_end
= (OnigUChar
*)RSTRING_END(source
);
7049 buffer_anchor
= TypedData_Wrap_Struct(0, &mapping_buffer_type
, 0);
7050 pre_buffer
= (mapping_buffer
**)&DATA_PTR(buffer_anchor
);
7051 while (source_current
< source_end
) {
7052 /* increase multiplier using buffer count to converge quickly */
7053 size_t capa
= (size_t)(source_end
-source_current
)*++buffer_count
+ CASE_MAPPING_ADDITIONAL_LENGTH
;
7054 if (CASEMAP_DEBUG
) {
7055 fprintf(stderr
, "Buffer allocation, capa is %"PRIuSIZE
"\n", capa
); /* for tuning */
7057 current_buffer
= xmalloc(offsetof(mapping_buffer
, space
) + capa
);
7058 *pre_buffer
= current_buffer
;
7059 pre_buffer
= ¤t_buffer
->next
;
7060 current_buffer
->next
= NULL
;
7061 current_buffer
->capa
= capa
;
7062 buffer_length_or_invalid
= enc
->case_map(flags
,
7063 &source_current
, source_end
,
7064 current_buffer
->space
,
7065 current_buffer
->space
+current_buffer
->capa
,
7067 if (buffer_length_or_invalid
< 0) {
7068 current_buffer
= DATA_PTR(buffer_anchor
);
7069 DATA_PTR(buffer_anchor
) = 0;
7070 mapping_buffer_free(current_buffer
);
7071 rb_raise(rb_eArgError
, "input string invalid");
7073 target_length
+= current_buffer
->used
= buffer_length_or_invalid
;
7075 if (CASEMAP_DEBUG
) {
7076 fprintf(stderr
, "Buffer count is %"PRIuSIZE
"\n", buffer_count
); /* for tuning */
7079 if (buffer_count
==1) {
7080 target
= rb_str_new((const char*)current_buffer
->space
, target_length
);
7083 char *target_current
;
7085 target
= rb_str_new(0, target_length
);
7086 target_current
= RSTRING_PTR(target
);
7087 current_buffer
= DATA_PTR(buffer_anchor
);
7088 while (current_buffer
) {
7089 memcpy(target_current
, current_buffer
->space
, current_buffer
->used
);
7090 target_current
+= current_buffer
->used
;
7091 current_buffer
= current_buffer
->next
;
7094 current_buffer
= DATA_PTR(buffer_anchor
);
7095 DATA_PTR(buffer_anchor
) = 0;
7096 mapping_buffer_free(current_buffer
);
7098 /* TODO: check about string terminator character */
7099 str_enc_copy(target
, source
);
7100 /*ENC_CODERANGE_SET(mapped, cr);*/
7106 rb_str_ascii_casemap(VALUE source
, VALUE target
, OnigCaseFoldType
*flags
, rb_encoding
*enc
)
7108 const OnigUChar
*source_current
, *source_end
;
7109 OnigUChar
*target_current
, *target_end
;
7110 long old_length
= RSTRING_LEN(source
);
7111 int length_or_invalid
;
7113 if (old_length
== 0) return Qnil
;
7115 source_current
= (OnigUChar
*)RSTRING_PTR(source
);
7116 source_end
= (OnigUChar
*)RSTRING_END(source
);
7117 if (source
== target
) {
7118 target_current
= (OnigUChar
*)source_current
;
7119 target_end
= (OnigUChar
*)source_end
;
7122 target_current
= (OnigUChar
*)RSTRING_PTR(target
);
7123 target_end
= (OnigUChar
*)RSTRING_END(target
);
7126 length_or_invalid
= onigenc_ascii_only_case_map(flags
,
7127 &source_current
, source_end
,
7128 target_current
, target_end
, enc
);
7129 if (length_or_invalid
< 0)
7130 rb_raise(rb_eArgError
, "input string invalid");
7131 if (CASEMAP_DEBUG
&& length_or_invalid
!= old_length
) {
7132 fprintf(stderr
, "problem with rb_str_ascii_casemap"
7133 "; old_length=%ld, new_length=%d\n", old_length
, length_or_invalid
);
7134 rb_raise(rb_eArgError
, "internal problem with rb_str_ascii_casemap"
7135 "; old_length=%ld, new_length=%d\n", old_length
, length_or_invalid
);
7138 str_enc_copy(target
, source
);
7144 upcase_single(VALUE str
)
7146 char *s
= RSTRING_PTR(str
), *send
= RSTRING_END(str
);
7147 bool modified
= false;
7150 unsigned int c
= *(unsigned char*)s
;
7152 if ('a' <= c
&& c
<= 'z') {
7153 *s
= 'A' + (c
- 'a');
7163 * upcase!(*options) -> self or nil
7165 * Upcases the characters in +self+;
7166 * returns +self+ if any changes were made, +nil+ otherwise:
7168 * s = 'Hello World!' # => "Hello World!"
7169 * s.upcase! # => "HELLO WORLD!"
7170 * s # => "HELLO WORLD!"
7171 * s.upcase! # => nil
7173 * The casing may be affected by the given +options+;
7174 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7176 * Related: String#upcase, String#downcase, String#downcase!.
7181 rb_str_upcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7184 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
;
7186 flags
= check_case_options(argc
, argv
, flags
);
7187 str_modify_keep_cr(str
);
7188 enc
= str_true_enc(str
);
7189 if (case_option_single_p(flags
, enc
, str
)) {
7190 if (upcase_single(str
))
7191 flags
|= ONIGENC_CASE_MODIFIED
;
7193 else if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7194 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7196 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7198 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7205 * upcase(*options) -> string
7207 * Returns a string containing the upcased characters in +self+:
7209 * s = 'Hello World!' # => "Hello World!"
7210 * s.upcase # => "HELLO WORLD!"
7212 * The casing may be affected by the given +options+;
7213 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7215 * Related: String#upcase!, String#downcase, String#downcase!.
7220 rb_str_upcase(int argc
, VALUE
*argv
, VALUE str
)
7223 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
;
7226 flags
= check_case_options(argc
, argv
, flags
);
7227 enc
= str_true_enc(str
);
7228 if (case_option_single_p(flags
, enc
, str
)) {
7229 ret
= rb_str_new(RSTRING_PTR(str
), RSTRING_LEN(str
));
7230 str_enc_copy(ret
, str
);
7233 else if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7234 ret
= rb_str_new(0, RSTRING_LEN(str
));
7235 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7238 ret
= rb_str_casemap(str
, &flags
, enc
);
7245 downcase_single(VALUE str
)
7247 char *s
= RSTRING_PTR(str
), *send
= RSTRING_END(str
);
7248 bool modified
= false;
7251 unsigned int c
= *(unsigned char*)s
;
7253 if ('A' <= c
&& c
<= 'Z') {
7254 *s
= 'a' + (c
- 'A');
7265 * downcase!(*options) -> self or nil
7267 * Downcases the characters in +self+;
7268 * returns +self+ if any changes were made, +nil+ otherwise:
7270 * s = 'Hello World!' # => "Hello World!"
7271 * s.downcase! # => "hello world!"
7272 * s # => "hello world!"
7273 * s.downcase! # => nil
7275 * The casing may be affected by the given +options+;
7276 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7278 * Related: String#downcase, String#upcase, String#upcase!.
7283 rb_str_downcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7286 OnigCaseFoldType flags
= ONIGENC_CASE_DOWNCASE
;
7288 flags
= check_case_options(argc
, argv
, flags
);
7289 str_modify_keep_cr(str
);
7290 enc
= str_true_enc(str
);
7291 if (case_option_single_p(flags
, enc
, str
)) {
7292 if (downcase_single(str
))
7293 flags
|= ONIGENC_CASE_MODIFIED
;
7295 else if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7296 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7298 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7300 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7307 * downcase(*options) -> string
7309 * Returns a string containing the downcased characters in +self+:
7311 * s = 'Hello World!' # => "Hello World!"
7312 * s.downcase # => "hello world!"
7314 * The casing may be affected by the given +options+;
7315 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7317 * Related: String#downcase!, String#upcase, String#upcase!.
7322 rb_str_downcase(int argc
, VALUE
*argv
, VALUE str
)
7325 OnigCaseFoldType flags
= ONIGENC_CASE_DOWNCASE
;
7328 flags
= check_case_options(argc
, argv
, flags
);
7329 enc
= str_true_enc(str
);
7330 if (case_option_single_p(flags
, enc
, str
)) {
7331 ret
= rb_str_new(RSTRING_PTR(str
), RSTRING_LEN(str
));
7332 str_enc_copy(ret
, str
);
7333 downcase_single(ret
);
7335 else if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7336 ret
= rb_str_new(0, RSTRING_LEN(str
));
7337 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7340 ret
= rb_str_casemap(str
, &flags
, enc
);
7349 * capitalize!(*options) -> self or nil
7351 * Upcases the first character in +self+;
7352 * downcases the remaining characters;
7353 * returns +self+ if any changes were made, +nil+ otherwise:
7355 * s = 'hello World!' # => "hello World!"
7356 * s.capitalize! # => "Hello world!"
7357 * s # => "Hello world!"
7358 * s.capitalize! # => nil
7360 * The casing may be affected by the given +options+;
7361 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7363 * Related: String#capitalize.
/*
 * Destructive capitalize.
 * Uses ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE as the base flags.
 * An empty string (or one with no buffer) yields Qnil immediately; otherwise
 * str is returned only when ONIGENC_CASE_MODIFIED is set after mapping.
 */
7368 rb_str_capitalize_bang(int argc
, VALUE
*argv
, VALUE str
)
7371 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_TITLECASE
;
7373 flags
= check_case_options(argc
, argv
, flags
);
7374 str_modify_keep_cr(str
);
7375 enc
= str_true_enc(str
);
7376 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
/* ASCII-only mapping passes str as both source and target. */
7377 if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7378 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
/* General mapping: the string from rb_str_casemap() is installed into str. */
7380 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7382 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7389 * capitalize(*options) -> string
7391 * Returns a string containing the characters in +self+;
7392 * the first character is upcased;
7393 * the remaining characters are downcased:
7395 * s = 'hello World!' # => "hello World!"
7396 * s.capitalize # => "Hello world!"
7398 * The casing may be affected by the given +options+;
7399 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7401 * Related: String#capitalize!.
/*
 * Non-destructive capitalize; the result is built in ret.
 * NOTE(review): for an empty string this returns the receiver itself, not a
 * copy, whereas rb_str_swapcase returns str_duplicate(rb_cString, str) in the
 * same situation -- confirm whether that difference is intended.
 */
7406 rb_str_capitalize(int argc
, VALUE
*argv
, VALUE str
)
7409 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_TITLECASE
;
7412 flags
= check_case_options(argc
, argv
, flags
);
7413 enc
= str_true_enc(str
);
7414 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return str
;
/* ASCII-only path: a new string of the same length is passed as the casemap target. */
7415 if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7416 ret
= rb_str_new(0, RSTRING_LEN(str
));
7417 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
/* General path: rb_str_casemap() returns the mapped string. */
7420 ret
= rb_str_casemap(str
, &flags
, enc
);
7428 * swapcase!(*options) -> self or nil
7430 * Upcases each lowercase character in +self+;
7431 * downcases each uppercase character;
7432 * returns +self+ if any changes were made, +nil+ otherwise:
7434 * s = 'Hello World!' # => "Hello World!"
7435 * s.swapcase! # => "hELLO wORLD!"
7436 * s # => "hELLO wORLD!"
7437 * ''.swapcase! # => nil
7439 * The casing may be affected by the given +options+;
7440 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7442 * Related: String#swapcase.
/*
 * Destructive swapcase.
 * Base flags are ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE.  str is mapped
 * through the ASCII-only or the general casemap helper and is returned only
 * when ONIGENC_CASE_MODIFIED is set afterwards; the fall-through return is
 * not visible in this excerpt.
 */
7447 rb_str_swapcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7450 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_DOWNCASE
;
7452 flags
= check_case_options(argc
, argv
, flags
);
7453 str_modify_keep_cr(str
);
7454 enc
= str_true_enc(str
);
/* ASCII-only mapping passes str as both source and target. */
7455 if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7456 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
/* General mapping: the string from rb_str_casemap() is installed into str. */
7458 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7460 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7467 * swapcase(*options) -> string
7469 * Returns a string containing the characters in +self+, with cases reversed;
7470 * each uppercase character is downcased;
7471 * each lowercase character is upcased:
7473 * s = 'Hello World!' # => "Hello World!"
7474 * s.swapcase # => "hELLO wORLD!"
7476 * The casing may be affected by the given +options+;
7477 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7479 * Related: String#swapcase!.
/*
 * Non-destructive swapcase; the result is built in ret.
 * An empty string (or one with no buffer) is answered with
 * str_duplicate(rb_cString, str) rather than the receiver itself.
 */
7484 rb_str_swapcase(int argc
, VALUE
*argv
, VALUE str
)
7487 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_DOWNCASE
;
7490 flags
= check_case_options(argc
, argv
, flags
);
7491 enc
= str_true_enc(str
);
7492 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return str_duplicate(rb_cString
, str
);
/* ASCII-only path: a new string of the same length is passed as the casemap target. */
7493 if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7494 ret
= rb_str_new(0, RSTRING_LEN(str
));
7495 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
/* General path: rb_str_casemap() returns the mapped string. */
7498 ret
= rb_str_casemap(str
, &flags
, enc
);
7503 typedef unsigned char *USTR
;
7507 unsigned int now
, max
;
7512 trnext(struct tr
*t
, rb_encoding
*enc
)
7519 if (t
->p
== t
->pend
) return -1;
7520 if (rb_enc_ascget(t
->p
, t
->pend
, &n
, enc
) == '\\' && t
->p
+ n
< t
->pend
) {
7523 t
->now
= rb_enc_codepoint_len(t
->p
, t
->pend
, &n
, enc
);
7525 if (rb_enc_ascget(t
->p
, t
->pend
, &n
, enc
) == '-' && t
->p
+ n
< t
->pend
) {
7527 if (t
->p
< t
->pend
) {
7528 unsigned int c
= rb_enc_codepoint_len(t
->p
, t
->pend
, &n
, enc
);
7531 if (t
->now
< 0x80 && c
< 0x80) {
7532 rb_raise(rb_eArgError
,
7533 "invalid range \"%c-%c\" in string transliteration",
7537 rb_raise(rb_eArgError
, "invalid range in string transliteration");
7539 continue; /* not reached */
7548 while (ONIGENC_CODE_TO_MBCLEN(enc
, ++t
->now
) <= 0) {
7549 if (t
->now
== t
->max
) {
7554 if (t
->now
< t
->max
) {
7565 static VALUE
rb_str_delete_bang(int,VALUE
*,VALUE
);
7568 tr_trans(VALUE str
, VALUE src
, VALUE repl
, int sflag
)
7570 const unsigned int errc
= -1;
7571 unsigned int trans
[256];
7572 rb_encoding
*enc
, *e1
, *e2
;
7573 struct tr trsrc
, trrepl
;
7575 unsigned int c
, c0
, last
= 0;
7576 int modify
= 0, i
, l
;
7577 unsigned char *s
, *send
;
7579 int singlebyte
= single_byte_optimizable(str
);
7583 #define CHECK_IF_ASCII(c) \
7584 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7585 (cr = ENC_CODERANGE_VALID) : 0)
7589 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7590 if (RSTRING_LEN(repl
) == 0) {
7591 return rb_str_delete_bang(1, &src
, str
);
7594 cr
= ENC_CODERANGE(str
);
7595 e1
= rb_enc_check(str
, src
);
7596 e2
= rb_enc_check(str
, repl
);
7601 enc
= rb_enc_check(src
, repl
);
7603 trsrc
.p
= RSTRING_PTR(src
); trsrc
.pend
= trsrc
.p
+ RSTRING_LEN(src
);
7604 if (RSTRING_LEN(src
) > 1 &&
7605 rb_enc_ascget(trsrc
.p
, trsrc
.pend
, &l
, enc
) == '^' &&
7606 trsrc
.p
+ l
< trsrc
.pend
) {
7610 trrepl
.p
= RSTRING_PTR(repl
);
7611 trrepl
.pend
= trrepl
.p
+ RSTRING_LEN(repl
);
7612 trsrc
.gen
= trrepl
.gen
= 0;
7613 trsrc
.now
= trrepl
.now
= 0;
7614 trsrc
.max
= trrepl
.max
= 0;
7617 for (i
=0; i
<256; i
++) {
7620 while ((c
= trnext(&trsrc
, enc
)) != errc
) {
7625 if (!hash
) hash
= rb_hash_new();
7626 rb_hash_aset(hash
, UINT2NUM(c
), Qtrue
);
7629 while ((c
= trnext(&trrepl
, enc
)) != errc
)
7630 /* retrieve last replacer */;
7632 for (i
=0; i
<256; i
++) {
7633 if (trans
[i
] != errc
) {
7641 for (i
=0; i
<256; i
++) {
7644 while ((c
= trnext(&trsrc
, enc
)) != errc
) {
7645 r
= trnext(&trrepl
, enc
);
7646 if (r
== errc
) r
= trrepl
.now
;
7649 if (rb_enc_codelen(r
, enc
) != 1) singlebyte
= 0;
7652 if (!hash
) hash
= rb_hash_new();
7653 rb_hash_aset(hash
, UINT2NUM(c
), UINT2NUM(r
));
7658 if (cr
== ENC_CODERANGE_VALID
&& rb_enc_asciicompat(e1
))
7659 cr
= ENC_CODERANGE_7BIT
;
7660 str_modify_keep_cr(str
);
7661 s
= (unsigned char *)RSTRING_PTR(str
); send
= (unsigned char *)RSTRING_END(str
);
7662 termlen
= rb_enc_mbminlen(enc
);
7665 long offset
, max
= RSTRING_LEN(str
);
7666 unsigned int save
= -1;
7667 unsigned char *buf
= ALLOC_N(unsigned char, max
+ termlen
), *t
= buf
;
7672 c0
= c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, e1
);
7673 tlen
= enc
== e1
? clen
: rb_enc_codelen(c
, enc
);
7680 VALUE tmp
= rb_hash_lookup(hash
, UINT2NUM(c
));
7682 if (cflag
) c
= last
;
7685 else if (cflag
) c
= errc
;
7686 else c
= NUM2INT(tmp
);
7691 if (c
!= (unsigned int)-1) {
7697 tlen
= rb_enc_codelen(c
, enc
);
7703 if (enc
!= e1
) may_modify
= 1;
7705 if ((offset
= t
- buf
) + tlen
> max
) {
7706 size_t MAYBE_UNUSED(old
) = max
+ termlen
;
7707 max
= offset
+ tlen
+ (send
- s
);
7708 SIZED_REALLOC_N(buf
, unsigned char, max
+ termlen
, old
);
7711 rb_enc_mbcput(c
, t
, enc
);
7712 if (may_modify
&& memcmp(s
, t
, tlen
) != 0) {
7718 if (!STR_EMBED_P(str
)) {
7719 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
7721 TERM_FILL((char *)t
, termlen
);
7722 RSTRING(str
)->as
.heap
.ptr
= (char *)buf
;
7723 RSTRING(str
)->as
.heap
.len
= t
- buf
;
7724 STR_SET_NOEMBED(str
);
7725 RSTRING(str
)->as
.heap
.aux
.capa
= max
;
7727 else if (rb_enc_mbmaxlen(enc
) == 1 || (singlebyte
&& !hash
)) {
7729 c
= (unsigned char)*s
;
7730 if (trans
[c
] != errc
) {
7747 long offset
, max
= (long)((send
- s
) * 1.2);
7748 unsigned char *buf
= ALLOC_N(unsigned char, max
+ termlen
), *t
= buf
;
7752 c0
= c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, e1
);
7753 tlen
= enc
== e1
? clen
: rb_enc_codelen(c
, enc
);
7759 VALUE tmp
= rb_hash_lookup(hash
, UINT2NUM(c
));
7761 if (cflag
) c
= last
;
7764 else if (cflag
) c
= errc
;
7765 else c
= NUM2INT(tmp
);
7768 c
= cflag
? last
: errc
;
7771 tlen
= rb_enc_codelen(c
, enc
);
7776 if (enc
!= e1
) may_modify
= 1;
7778 if ((offset
= t
- buf
) + tlen
> max
) {
7779 size_t MAYBE_UNUSED(old
) = max
+ termlen
;
7780 max
= offset
+ tlen
+ (long)((send
- s
) * 1.2);
7781 SIZED_REALLOC_N(buf
, unsigned char, max
+ termlen
, old
);
7785 rb_enc_mbcput(c
, t
, enc
);
7786 if (may_modify
&& memcmp(s
, t
, tlen
) != 0) {
7794 if (!STR_EMBED_P(str
)) {
7795 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
7797 TERM_FILL((char *)t
, termlen
);
7798 RSTRING(str
)->as
.heap
.ptr
= (char *)buf
;
7799 RSTRING(str
)->as
.heap
.len
= t
- buf
;
7800 STR_SET_NOEMBED(str
);
7801 RSTRING(str
)->as
.heap
.aux
.capa
= max
;
7805 if (cr
!= ENC_CODERANGE_BROKEN
)
7806 ENC_CODERANGE_SET(str
, cr
);
7807 rb_enc_associate(str
, enc
);
7816 * str.tr!(from_str, to_str) -> str or nil
7818 * Translates <i>str</i> in place, using the same rules as
7819 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
/* In-place tr: forwards to tr_trans() with sflag 0 and returns its result unchanged. */
7824 rb_str_tr_bang(VALUE str
, VALUE src
, VALUE repl
)
7826 return tr_trans(str
, src
, repl
, 0);
7832 * str.tr(from_str, to_str) -> new_str
7834 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7835 * corresponding characters in +to_str+. If +to_str+ is shorter than
7836 * +from_str+, it is padded with its last character in order to maintain the
7839 * "hello".tr('el', 'ip') #=> "hippo"
7840 * "hello".tr('aeiou', '*') #=> "h*ll*"
7841 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7843 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7844 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7845 * all characters except those listed.
7847 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7848 * "hello".tr('^aeiou', '*') #=> "*e**o"
7850 * The backslash character <code>\\</code> can be used to escape
7851 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7852 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7854 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7855 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7857 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7858 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7859 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7861 * "X['\\b']".tr("X\\", "") #=> "['b']"
7862 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
/*
 * Copying tr: runs tr_trans() with sflag 0 on a duplicate made by
 * str_duplicate(), so the receiver is not the string handed to tr_trans().
 * tr_trans()'s return value is ignored; the function's own return is not
 * visible in this excerpt.
 */
7866 rb_str_tr(VALUE str
, VALUE src
, VALUE repl
)
7868 str
= str_duplicate(rb_cString
, str
);
7869 tr_trans(str
, src
, repl
, 0);
/*
 * TR_TABLE_MAX is the number of distinct unsigned char values.
 * TR_TABLE_SIZE adds one extra slot: index TR_TABLE_MAX is written with
 * cflag by tr_setup_table() and read by tr_find() as its final fallback.
 */
7873 #define TR_TABLE_MAX (UCHAR_MAX+1)
7874 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7876 tr_setup_table(VALUE str
, char stable
[TR_TABLE_SIZE
], int first
,
7877 VALUE
*tablep
, VALUE
*ctablep
, rb_encoding
*enc
)
7879 const unsigned int errc
= -1;
7880 char buf
[TR_TABLE_MAX
];
7883 VALUE table
= 0, ptable
= 0;
7884 int i
, l
, cflag
= 0;
7886 tr
.p
= RSTRING_PTR(str
); tr
.pend
= tr
.p
+ RSTRING_LEN(str
);
7887 tr
.gen
= tr
.now
= tr
.max
= 0;
7889 if (RSTRING_LEN(str
) > 1 && rb_enc_ascget(tr
.p
, tr
.pend
, &l
, enc
) == '^') {
7894 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7897 stable
[TR_TABLE_MAX
] = cflag
;
7899 else if (stable
[TR_TABLE_MAX
] && !cflag
) {
7900 stable
[TR_TABLE_MAX
] = 0;
7902 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7906 while ((c
= trnext(&tr
, enc
)) != errc
) {
7907 if (c
< TR_TABLE_MAX
) {
7908 buf
[(unsigned char)c
] = !cflag
;
7911 VALUE key
= UINT2NUM(c
);
7913 if (!table
&& (first
|| *tablep
|| stable
[TR_TABLE_MAX
])) {
7916 table
= ptable
? ptable
: rb_hash_new();
7920 table
= rb_hash_new();
7925 if (table
&& (!ptable
|| (cflag
^ !NIL_P(rb_hash_aref(ptable
, key
))))) {
7926 rb_hash_aset(table
, key
, Qtrue
);
7930 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7931 stable
[i
] = stable
[i
] && buf
[i
];
7933 if (!table
&& !cflag
) {
/*
 * Membership test for code point c against a set built by tr_setup_table().
 * Code points below TR_TABLE_MAX are answered straight from the byte table.
 * Larger ones are looked up in the del / nodel hashes with rb_hash_lookup();
 * the bodies of those two branches are not visible in this excerpt.  When
 * neither branch decides, the extra slot table[TR_TABLE_MAX] gives the
 * answer.
 */
7940 tr_find(unsigned int c
, const char table
[TR_TABLE_SIZE
], VALUE del
, VALUE nodel
)
7942 if (c
< TR_TABLE_MAX
) {
7943 return table
[c
] != 0;
7946 VALUE v
= UINT2NUM(c
);
/* Present in del and, if nodel exists, absent from nodel. */
7949 if (!NIL_P(rb_hash_lookup(del
, v
)) &&
7950 (!nodel
|| NIL_P(rb_hash_lookup(nodel
, v
)))) {
/* Present in nodel. */
7954 else if (nodel
&& !NIL_P(rb_hash_lookup(nodel
, v
))) {
7957 return table
[TR_TABLE_MAX
] ? TRUE
: FALSE
;
7963 * str.delete!([other_str]+) -> str or nil
7965 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7966 * <code>nil</code> if <i>str</i> was not modified.
7970 rb_str_delete_bang(int argc
, VALUE
*argv
, VALUE str
)
7972 char squeez
[TR_TABLE_SIZE
];
7973 rb_encoding
*enc
= 0;
7975 VALUE del
= 0, nodel
= 0;
7977 int i
, ascompat
, cr
;
7979 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7980 rb_check_arity(argc
, 1, UNLIMITED_ARGUMENTS
);
7981 for (i
=0; i
<argc
; i
++) {
7985 enc
= rb_enc_check(str
, s
);
7986 tr_setup_table(s
, squeez
, i
==0, &del
, &nodel
, enc
);
7989 str_modify_keep_cr(str
);
7990 ascompat
= rb_enc_asciicompat(enc
);
7991 s
= t
= RSTRING_PTR(str
);
7992 send
= RSTRING_END(str
);
7993 cr
= ascompat
? ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
;
7998 if (ascompat
&& (c
= *(unsigned char*)s
) < 0x80) {
8009 c
= rb_enc_codepoint_len(s
, send
, &clen
, enc
);
8011 if (tr_find(c
, squeez
, del
, nodel
)) {
8015 if (t
!= s
) rb_enc_mbcput(c
, t
, enc
);
8017 if (cr
== ENC_CODERANGE_7BIT
) cr
= ENC_CODERANGE_VALID
;
8022 TERM_FILL(t
, TERM_LEN(str
));
8023 STR_SET_LEN(str
, t
- RSTRING_PTR(str
));
8024 ENC_CODERANGE_SET(str
, cr
);
8026 if (modify
) return str
;
8033 * str.delete([other_str]+) -> new_str
8035 * Returns a copy of <i>str</i> with all characters in the intersection of its
8036 * arguments deleted. Uses the same rules for building the set of characters as
8039 * "hello".delete "l","lo" #=> "heo"
8040 * "hello".delete "lo" #=> "he"
8041 * "hello".delete "aeiou", "^e" #=> "hell"
8042 * "hello".delete "ej-m" #=> "ho"
/*
 * Copying delete: applies rb_str_delete_bang() to a duplicate made by
 * str_duplicate().  The bang method's return value is ignored; the
 * function's own return is not visible in this excerpt.
 */
8046 rb_str_delete(int argc
, VALUE
*argv
, VALUE str
)
8048 str
= str_duplicate(rb_cString
, str
);
8049 rb_str_delete_bang(argc
, argv
, str
);
8056 * str.squeeze!([other_str]*) -> str or nil
8058 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8059 * <code>nil</code> if no changes were made.
8063 rb_str_squeeze_bang(int argc
, VALUE
*argv
, VALUE str
)
8065 char squeez
[TR_TABLE_SIZE
];
8066 rb_encoding
*enc
= 0;
8067 VALUE del
= 0, nodel
= 0;
8068 unsigned char *s
, *send
, *t
;
8070 int ascompat
, singlebyte
= single_byte_optimizable(str
);
8074 enc
= STR_ENC_GET(str
);
8077 for (i
=0; i
<argc
; i
++) {
8081 enc
= rb_enc_check(str
, s
);
8082 if (singlebyte
&& !single_byte_optimizable(s
))
8084 tr_setup_table(s
, squeez
, i
==0, &del
, &nodel
, enc
);
8088 str_modify_keep_cr(str
);
8089 s
= t
= (unsigned char *)RSTRING_PTR(str
);
8090 if (!s
|| RSTRING_LEN(str
) == 0) return Qnil
;
8091 send
= (unsigned char *)RSTRING_END(str
);
8093 ascompat
= rb_enc_asciicompat(enc
);
8097 unsigned int c
= *s
++;
8098 if (c
!= save
|| (argc
> 0 && !squeez
[c
])) {
8108 if (ascompat
&& (c
= *s
) < 0x80) {
8109 if (c
!= save
|| (argc
> 0 && !squeez
[c
])) {
8115 c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, enc
);
8117 if (c
!= save
|| (argc
> 0 && !tr_find(c
, squeez
, del
, nodel
))) {
8118 if (t
!= s
) rb_enc_mbcput(c
, t
, enc
);
8127 TERM_FILL((char *)t
, TERM_LEN(str
));
8128 if ((char *)t
- RSTRING_PTR(str
) != RSTRING_LEN(str
)) {
8129 STR_SET_LEN(str
, (char *)t
- RSTRING_PTR(str
));
8133 if (modify
) return str
;
8140 * str.squeeze([other_str]*) -> new_str
8142 * Builds a set of characters from the <i>other_str</i> parameter(s)
8143 * using the procedure described for String#count. Returns a new
8144 * string where runs of the same character that occur in this set are
8145 * replaced by a single character. If no arguments are given, all
8146 * runs of identical characters are replaced by a single character.
8148 * "yellow moon".squeeze #=> "yelow mon"
8149 * "  now   is  the".squeeze(" ") #=> " now is the"
8150 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
/*
 * Copying squeeze: applies rb_str_squeeze_bang() to a duplicate made by
 * str_duplicate().  The bang method's return value is ignored; the
 * function's own return is not visible in this excerpt.
 */
8154 rb_str_squeeze(int argc
, VALUE
*argv
, VALUE str
)
8156 str
= str_duplicate(rb_cString
, str
);
8157 rb_str_squeeze_bang(argc
, argv
, str
);
8164 * str.tr_s!(from_str, to_str) -> str or nil
8166 * Performs String#tr_s processing on <i>str</i> in place,
8167 * returning <i>str</i>, or <code>nil</code> if no changes were made.
/* In-place tr_s: forwards to tr_trans() with sflag 1 and returns its result unchanged. */
8171 rb_str_tr_s_bang(VALUE str
, VALUE src
, VALUE repl
)
8173 return tr_trans(str
, src
, repl
, 1);
8179 * str.tr_s(from_str, to_str) -> new_str
8181 * Processes a copy of <i>str</i> as described under String#tr, then
8182 * removes duplicate characters in regions that were affected by the
8185 * "hello".tr_s('l', 'r') #=> "hero"
8186 * "hello".tr_s('el', '*') #=> "h*o"
8187 * "hello".tr_s('el', 'hx') #=> "hhxo"
/*
 * Copying tr_s: runs tr_trans() with sflag 1 on a duplicate made by
 * str_duplicate().  tr_trans()'s return value is ignored; the function's
 * own return is not visible in this excerpt.
 */
8191 rb_str_tr_s(VALUE str
, VALUE src
, VALUE repl
)
8193 str
= str_duplicate(rb_cString
, str
);
8194 tr_trans(str
, src
, repl
, 1);
8201 * str.count([other_str]+) -> integer
8203 * Each +other_str+ parameter defines a set of characters to count. The
8204 * intersection of these sets defines the characters to count in +str+. Any
8205 * +other_str+ that starts with a caret <code>^</code> is negated. The
8206 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8207 * backslash character <code>\\</code> can be used to escape <code>^</code> or
8208 * <code>-</code> and is otherwise ignored unless it appears at the end of a
8209 * sequence or the end of an +other_str+.
8212 * a.count "lo" #=> 5
8213 * a.count "lo", "o" #=> 2
8214 * a.count "hello", "^l" #=> 4
8215 * a.count "ej-m" #=> 4
8217 * "hello^world".count "\\^aeiou" #=> 4
8218 * "hello-world".count "a\\-eo" #=> 4
8220 * c = "hello world\\r\\n"
8221 * c.count "\\" #=> 2
8222 * c.count "\\A" #=> 0
8223 * c.count "X-\\w" #=> 3
8227 rb_str_count(int argc
, VALUE
*argv
, VALUE str
)
8229 char table
[TR_TABLE_SIZE
];
8230 rb_encoding
*enc
= 0;
8231 VALUE del
= 0, nodel
= 0, tstr
;
8237 rb_check_arity(argc
, 1, UNLIMITED_ARGUMENTS
);
8241 enc
= rb_enc_check(str
, tstr
);
8244 if (RSTRING_LEN(tstr
) == 1 && rb_enc_asciicompat(enc
) &&
8245 (ptstr
= RSTRING_PTR(tstr
),
8246 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc
, (const unsigned char *)ptstr
, (const unsigned char *)ptstr
+1)) &&
8247 !is_broken_string(str
)) {
8249 unsigned char c
= rb_enc_codepoint_len(ptstr
, ptstr
+1, &clen
, enc
);
8251 s
= RSTRING_PTR(str
);
8252 if (!s
|| RSTRING_LEN(str
) == 0) return INT2FIX(0);
8253 send
= RSTRING_END(str
);
8255 if (*(unsigned char*)s
++ == c
) n
++;
8257 return SIZET2NUM(n
);
8261 tr_setup_table(tstr
, table
, TRUE
, &del
, &nodel
, enc
);
8262 for (i
=1; i
<argc
; i
++) {
8265 enc
= rb_enc_check(str
, tstr
);
8266 tr_setup_table(tstr
, table
, FALSE
, &del
, &nodel
, enc
);
8269 s
= RSTRING_PTR(str
);
8270 if (!s
|| RSTRING_LEN(str
) == 0) return INT2FIX(0);
8271 send
= RSTRING_END(str
);
8272 ascompat
= rb_enc_asciicompat(enc
);
8276 if (ascompat
&& (c
= *(unsigned char*)s
) < 0x80) {
8284 c
= rb_enc_codepoint_len(s
, send
, &clen
, enc
);
8285 if (tr_find(c
, table
, del
, nodel
)) {
8292 return SIZET2NUM(n
);
/*
 * Validates a field-separator value.
 * nil, String and Regexp values skip the conversion branch.  Anything else
 * is passed through rb_check_string_type(), and 0 is returned when that
 * yields nil.  The success return is not visible in this excerpt.
 */
8296 rb_fs_check(VALUE val
)
8298 if (!NIL_P(val
) && !RB_TYPE_P(val
, T_STRING
) && !RB_TYPE_P(val
, T_REGEXP
)) {
8299 val
= rb_check_string_type(val
);
8300 if (NIL_P(val
)) return 0;
/*
 * Byte-indexed lookup table, 16 entries per row.
 * The only non-zero entries are indices 9..13 (first row) and index 32
 * (third row), i.e. '\t', '\n', '\v', '\f', '\r' and ' ' in ASCII.
 */
8305 static const char isspacetable
[256] = {
8306 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
8307 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8308 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8309 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8310 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8311 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8312 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8313 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8314 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8315 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8316 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8317 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8318 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8319 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* The cast to unsigned char keeps the index within 0..255 even when c is a negative char. */
8324 #define ascii_isspace(c) isspacetable[(unsigned char)(c)]
/*
 * Emits one split field, str[beg, len], and returns the updated count of
 * deferred empty fields.
 *
 * With empty_count >= 0, a zero-length field is not emitted: the count is
 * simply bumped and returned.  Before a non-empty field is emitted, any
 * deferred empties are flushed as fresh empty strings, either pushed onto
 * result or yielded.  The condition choosing between push and yield, and
 * the final return, are not visible in this excerpt.
 */
8327 split_string(VALUE result
, VALUE str
, long beg
, long len
, long empty_count
)
8329 if (empty_count
>= 0 && len
== 0) {
8330 return empty_count
+ 1;
8332 if (empty_count
> 0) {
8333 /* make different substrings */
8336 rb_ary_push(result
, str_new_empty_String(str
));
8337 } while (--empty_count
> 0);
8341 rb_yield(str_new_empty_String(str
));
8342 } while (--empty_count
> 0);
/* From here on str names the extracted field, not the whole input. */
8345 str
= rb_str_subseq(str
, beg
, len
);
8347 rb_ary_push(result
, str
);
8356 SPLIT_TYPE_AWK
, SPLIT_TYPE_STRING
, SPLIT_TYPE_REGEXP
, SPLIT_TYPE_CHARS
/*
 * Classifies a literal split pattern.
 * Returns SPLIT_TYPE_CHARS for the "split into chars" special case (its
 * condition is not visible in this excerpt), SPLIT_TYPE_AWK when the whole
 * pattern is a single ' ' character, and default_type otherwise.
 */
8360 literal_split_pattern(VALUE spat
, split_type_t default_type
)
8362 rb_encoding
*enc
= STR_ENC_GET(spat
);
8365 RSTRING_GETMEM(spat
, ptr
, len
);
8367 /* Special case - split into chars */
8368 return SPLIT_TYPE_CHARS
;
/* ASCII-compatible encodings: a one-byte pattern can be compared directly. */
8370 else if (rb_enc_asciicompat(enc
)) {
8371 if (len
== 1 && ptr
[0] == ' ') {
8372 return SPLIT_TYPE_AWK
;
/* Other encodings: decode the first character and require it to span the whole pattern. */
8377 if (rb_enc_ascget(ptr
, ptr
+ len
, &l
, enc
) == ' ' && len
== l
) {
8378 return SPLIT_TYPE_AWK
;
8381 return default_type
;
8386 * str.split(pattern=nil, [limit]) -> an_array
8387 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8389 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8390 * of these substrings.
8392 * If <i>pattern</i> is a String, then its contents are used as
8393 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8394 * space, <i>str</i> is split on whitespace, with leading and trailing
8395 * whitespace and runs of contiguous whitespace characters ignored.
8397 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8398 * pattern matches. Whenever the pattern matches a zero-length string,
8399 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8400 * groups, the respective matches will be returned in the array as well.
8402 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8403 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8404 * split on whitespace as if ' ' were specified.
8406 * If the <i>limit</i> parameter is omitted, trailing null fields are
8407 * suppressed. If <i>limit</i> is a positive number, at most that number
8408 * of split substrings will be returned (captured groups will be returned
8409 * as well, but are not counted towards the limit).
8410 * If <i>limit</i> is <code>1</code>, the entire
8411 * string is returned as the only entry in an array. If negative, there is no
8412 * limit to the number of fields returned, and trailing null fields are not
8415 * When the input +str+ is empty an empty Array is returned as the string is
8416 * considered to have no fields to split.
8418 * " now's  the time ".split #=> ["now's", "the", "time"]
8419 * " now's  the time ".split(' ') #=> ["now's", "the", "time"]
8420 * " now's  the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8421 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8422 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8423 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8424 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8426 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8427 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8428 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8429 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8431 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8433 * "".split(',', -1) #=> []
8435 * If a block is given, invoke the block with each split substring.
8440 rb_str_split_m(int argc
, VALUE
*argv
, VALUE str
)
8445 split_type_t split_type
;
8446 long beg
, end
, i
= 0, empty_count
= -1;
8450 result
= rb_block_given_p() ? Qfalse
: Qnil
;
8451 if (rb_scan_args(argc
, argv
, "02", &spat
, &limit
) == 2) {
8452 lim
= NUM2INT(limit
);
8453 if (lim
<= 0) limit
= Qnil
;
8454 else if (lim
== 1) {
8455 if (RSTRING_LEN(str
) == 0)
8456 return result
? rb_ary_new2(0) : str
;
8457 tmp
= str_duplicate(rb_cString
, str
);
8462 return rb_ary_new3(1, tmp
);
8466 if (NIL_P(limit
) && !lim
) empty_count
= 0;
8468 enc
= STR_ENC_GET(str
);
8469 split_type
= SPLIT_TYPE_REGEXP
;
8471 spat
= get_pat_quoted(spat
, 0);
8473 else if (NIL_P(spat
= rb_fs
)) {
8474 split_type
= SPLIT_TYPE_AWK
;
8476 else if (!(spat
= rb_fs_check(spat
))) {
8477 rb_raise(rb_eTypeError
, "value of $; must be String or Regexp");
8480 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "$; is set to non-nil value");
8482 if (split_type
!= SPLIT_TYPE_AWK
) {
8483 switch (BUILTIN_TYPE(spat
)) {
8485 rb_reg_options(spat
); /* check if uninitialized */
8486 tmp
= RREGEXP_SRC(spat
);
8487 split_type
= literal_split_pattern(tmp
, SPLIT_TYPE_REGEXP
);
8488 if (split_type
== SPLIT_TYPE_AWK
) {
8490 split_type
= SPLIT_TYPE_STRING
;
8495 mustnot_broken(spat
);
8496 split_type
= literal_split_pattern(spat
, SPLIT_TYPE_STRING
);
8500 UNREACHABLE_RETURN(Qnil
);
8504 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8506 if (result
) result
= rb_ary_new();
8508 char *ptr
= RSTRING_PTR(str
);
8509 char *eptr
= RSTRING_END(str
);
8510 if (split_type
== SPLIT_TYPE_AWK
) {
8516 if (is_ascii_string(str
)) {
8517 while (ptr
< eptr
) {
8518 c
= (unsigned char)*ptr
++;
8520 if (ascii_isspace(c
)) {
8526 if (!NIL_P(limit
) && lim
<= i
) break;
8529 else if (ascii_isspace(c
)) {
8530 SPLIT_STR(beg
, end
-beg
);
8533 if (!NIL_P(limit
)) ++i
;
8541 while (ptr
< eptr
) {
8544 c
= rb_enc_codepoint_len(ptr
, eptr
, &n
, enc
);
8547 if (rb_isspace(c
)) {
8553 if (!NIL_P(limit
) && lim
<= i
) break;
8556 else if (rb_isspace(c
)) {
8557 SPLIT_STR(beg
, end
-beg
);
8560 if (!NIL_P(limit
)) ++i
;
8568 else if (split_type
== SPLIT_TYPE_STRING
) {
8569 char *str_start
= ptr
;
8570 char *substr_start
= ptr
;
8571 char *sptr
= RSTRING_PTR(spat
);
8572 long slen
= RSTRING_LEN(spat
);
8574 mustnot_broken(str
);
8575 enc
= rb_enc_check(str
, spat
);
8576 while (ptr
< eptr
&&
8577 (end
= rb_memsearch(sptr
, slen
, ptr
, eptr
- ptr
, enc
)) >= 0) {
8578 /* Check we are at the start of a char */
8579 char *t
= rb_enc_right_char_head(ptr
, ptr
+ end
, eptr
, enc
);
8580 if (t
!= ptr
+ end
) {
8584 SPLIT_STR(substr_start
- str_start
, (ptr
+end
) - substr_start
);
8587 if (!NIL_P(limit
) && lim
<= ++i
) break;
8589 beg
= ptr
- str_start
;
8591 else if (split_type
== SPLIT_TYPE_CHARS
) {
8592 char *str_start
= ptr
;
8595 mustnot_broken(str
);
8596 enc
= rb_enc_get(str
);
8597 while (ptr
< eptr
&&
8598 (n
= rb_enc_precise_mbclen(ptr
, eptr
, enc
)) > 0) {
8599 SPLIT_STR(ptr
- str_start
, n
);
8601 if (!NIL_P(limit
) && lim
<= ++i
) break;
8603 beg
= ptr
- str_start
;
8606 long len
= RSTRING_LEN(str
);
8610 struct re_registers
*regs
;
8613 for (; rb_reg_search(spat
, str
, start
, 0) >= 0;
8614 (match
? (rb_match_unbusy(match
), rb_backref_set(match
)) : (void)0)) {
8615 match
= rb_backref_get();
8616 if (!result
) rb_match_busy(match
);
8617 regs
= RMATCH_REGS(match
);
8619 if (start
== end
&& BEG(0) == END(0)) {
8624 else if (last_null
== 1) {
8625 SPLIT_STR(beg
, rb_enc_fast_mbclen(ptr
+beg
, eptr
, enc
));
8632 start
+= rb_enc_fast_mbclen(ptr
+start
,eptr
,enc
);
8638 SPLIT_STR(beg
, end
-beg
);
8639 beg
= start
= END(0);
8643 for (idx
=1; idx
< regs
->num_regs
; idx
++) {
8644 if (BEG(idx
) == -1) continue;
8645 SPLIT_STR(BEG(idx
), END(idx
)-BEG(idx
));
8647 if (!NIL_P(limit
) && lim
<= ++i
) break;
8649 if (match
) rb_match_unbusy(match
);
8651 if (RSTRING_LEN(str
) > 0 && (!NIL_P(limit
) || RSTRING_LEN(str
) > beg
|| lim
< 0)) {
8652 SPLIT_STR(beg
, RSTRING_LEN(str
)-beg
);
8655 return result
? result
: str
;
/*
 * C-string convenience wrapper: turns sep0 into a Ruby string with
 * rb_str_new_cstr() and calls rb_str_split_m() with it as the single
 * argument (no limit).
 */
8659 rb_str_split(VALUE str
, const char *sep0
)
8664 sep
= rb_str_new_cstr(sep0
);
8665 return rb_str_split_m(1, &sep
, str
);
/*
 * Evaluates to a new array with capacity `size` when no block is given, and
 * to 0 when a block is given.  The `m` argument is not used by the expansion.
 */
8668 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
/*
 * Delivers one enumerated element.  The only statement visible in this
 * excerpt appends e to ary; the surrounding condition, any alternative
 * branch and the return value are not visible here.
 * NOTE(review): callers test the result of ENUM_ELEM() in `if`, so a
 * meaningful int-like return is presumably produced -- confirm in the full
 * source.
 */
8671 enumerator_element(VALUE ary
, VALUE e
)
8674 rb_ary_push(ary
, e
);
/* Shorthand used by the rb_str_enumerate_* helpers. */
8683 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
/*
 * Inspects the tail of the range [p, e) for a line terminator.
 * The character just before e is tested with rb_enc_is_newline(); if it is
 * a newline, the preceding character is tested for '\r'.  The pointer
 * adjustments between these steps and the returned value are not visible in
 * this excerpt.
 */
8686 chomp_newline(const char *p
, const char *e
, rb_encoding
*enc
)
8688 const char *prev
= rb_enc_prev_char(p
, e
, e
, enc
);
8689 if (rb_enc_is_newline(prev
, e
, enc
)) {
8691 prev
= rb_enc_prev_char(p
, e
, e
, enc
);
/* prev is checked for NULL before it is decoded. */
8692 if (prev
&& rb_enc_ascget(prev
, e
, NULL
, enc
) == '\r')
8703 (!RB_TYPE_P(rs
, T_STRING
) ||
8704 RSTRING_LEN(rs
) != 1 ||
8705 RSTRING_PTR(rs
)[0] != '\n')) {
8706 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "$/ is set to non-default value");
8711 #define rb_rs get_rs()
8714 rb_str_enumerate_lines(int argc
, VALUE
*argv
, VALUE str
, VALUE ary
)
8717 VALUE line
, rs
, orig
= str
, opts
= Qnil
, chomp
= Qfalse
;
8718 const char *ptr
, *pend
, *subptr
, *subend
, *rsptr
, *hit
, *adjusted
;
8719 long pos
, len
, rslen
;
8722 if (rb_scan_args(argc
, argv
, "01:", &rs
, &opts
) == 0)
8725 static ID keywords
[1];
8727 keywords
[0] = rb_intern_const("chomp");
8729 rb_get_kwargs(opts
, keywords
, 0, 1, &chomp
);
8730 chomp
= (chomp
!= Qundef
&& RTEST(chomp
));
8734 if (!ENUM_ELEM(ary
, str
)) {
8742 if (!RSTRING_LEN(str
)) goto end
;
8743 str
= rb_str_new_frozen(str
);
8744 ptr
= subptr
= RSTRING_PTR(str
);
8745 pend
= RSTRING_END(str
);
8746 len
= RSTRING_LEN(str
);
8748 rslen
= RSTRING_LEN(rs
);
8750 if (rs
== rb_default_rs
)
8751 enc
= rb_enc_get(str
);
8753 enc
= rb_enc_check(str
, rs
);
8756 /* paragraph mode */
8758 const char *eol
= NULL
;
8760 while (subend
< pend
) {
8762 if (rb_enc_ascget(subend
, pend
, &n
, enc
) != '\r')
8764 rslen
= n
+ rb_enc_mbclen(subend
+ n
, pend
, enc
);
8765 if (rb_enc_is_newline(subend
+ n
, pend
, enc
)) {
8766 if (eol
== subend
) break;
8768 if (subptr
) eol
= subend
;
8771 if (!subptr
) subptr
= subend
;
8775 } while (subend
< pend
);
8777 line
= rb_str_subseq(str
, subptr
- ptr
,
8778 subend
- subptr
+ (chomp
? 0 : rslen
));
8779 if (ENUM_ELEM(ary
, line
)) {
8780 str_mod_check(str
, ptr
, len
);
8782 subptr
= eol
= NULL
;
8787 rsptr
= RSTRING_PTR(rs
);
8788 if (RSTRING_LEN(rs
) == rb_enc_mbminlen(enc
) &&
8789 rb_enc_is_newline(rsptr
, rsptr
+ RSTRING_LEN(rs
), enc
)) {
8794 if ((rs
== rb_default_rs
) && !rb_enc_asciicompat(enc
)) {
8795 rs
= rb_str_new(rsptr
, rslen
);
8796 rs
= rb_str_encode(rs
, rb_enc_from_encoding(enc
), 0, Qnil
);
8797 rsptr
= RSTRING_PTR(rs
);
8798 rslen
= RSTRING_LEN(rs
);
8801 while (subptr
< pend
) {
8802 pos
= rb_memsearch(rsptr
, rslen
, subptr
, pend
- subptr
, enc
);
8805 adjusted
= rb_enc_right_char_head(subptr
, hit
, pend
, enc
);
8806 if (hit
!= adjusted
) {
8810 subend
= hit
+= rslen
;
8813 subend
= chomp_newline(subptr
, subend
, enc
);
8819 line
= rb_str_subseq(str
, subptr
- ptr
, subend
- subptr
);
8820 if (ENUM_ELEM(ary
, line
)) {
8821 str_mod_check(str
, ptr
, len
);
8826 if (subptr
!= pend
) {
8829 pend
= chomp_newline(subptr
, pend
, enc
);
8831 else if (pend
- subptr
>= rslen
&&
8832 memcmp(pend
- rslen
, rsptr
, rslen
) == 0) {
8836 line
= rb_str_subseq(str
, subptr
- ptr
, pend
- subptr
);
8837 ENUM_ELEM(ary
, line
);
8850 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8851 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8853 * Splits <i>str</i> using the supplied parameter as the record
8854 * separator (<code>$/</code> by default), passing each substring in
8855 * turn to the supplied block. If a zero-length record separator is
8856 * supplied, the string is split into paragraphs delimited by
8857 * multiple successive newlines.
8859 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8862 * If no block is given, an enumerator is returned instead.
8864 * "hello\nworld".each_line {|s| p s}
8869 * "hello\nworld".each_line('l') {|s| p s}
8876 * "hello\n\n\nworld".each_line('') {|s| p s}
8881 * "hello\nworld".each_line(chomp: true) {|s| p s}
8886 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8896 rb_str_each_line(int argc
, VALUE
*argv
, VALUE str
)
8898 RETURN_SIZED_ENUMERATOR(str
, argc
, argv
, 0);
8899 return rb_str_enumerate_lines(argc
, argv
, str
, 0);
8904 * str.lines(separator=$/, chomp: false) -> an_array
8906 * Returns an array of lines in <i>str</i> split using the supplied
8907 * record separator (<code>$/</code> by default). This is a
8908 * shorthand for <code>str.each_line(separator, chomp: chomp).to_a</code>.
8910 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8913 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8914 * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8915 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8917 * If a block is given, which is a deprecated form, works the same as
8918 * <code>each_line</code>.
8922 rb_str_lines(int argc
, VALUE
*argv
, VALUE str
)
8924 VALUE ary
= WANTARRAY("lines", 0);
8925 return rb_str_enumerate_lines(argc
, argv
, str
, ary
);
/*
 * Size callback handed to RETURN_SIZED_ENUMERATOR by rb_str_each_byte:
 * returns the byte length of +str+ (RSTRING_LEN) converted with LONG2FIX.
 * +args+ and +eobj+ are not referenced in the visible body.
 */
8929 rb_str_each_byte_size(VALUE str
, VALUE args
, VALUE eobj
)
8931 return LONG2FIX(RSTRING_LEN(str
));
/*
 * Shared worker for String#each_byte (called with ary == 0) and
 * String#bytes (called with the WANTARRAY result).
 *
 * Walks +str+ one byte at a time and hands each byte, read as an
 * unsigned char and wrapped with INT2FIX, to ENUM_ELEM together with +ary+.
 * Both RSTRING_LEN(str) and RSTRING_PTR(str) are re-evaluated on every
 * iteration rather than cached.
 *
 * NOTE(review): ENUM_ELEM is defined outside this chunk; presumably it
 * appends to +ary+ when one is given and yields to the block otherwise --
 * confirm. The return statement is not visible here.
 */
8935 rb_str_enumerate_bytes(VALUE str
, VALUE ary
)
8939 for (i
=0; i
<RSTRING_LEN(str
); i
++) {
8940 ENUM_ELEM(ary
, INT2FIX((unsigned char)RSTRING_PTR(str
)[i
]));
8950 * str.each_byte {|integer| block } -> str
8951 * str.each_byte -> an_enumerator
8953 * Passes each byte in <i>str</i> to the given block, or returns an
8954 * enumerator if no block is given.
8956 * "hello".each_byte {|c| print c, ' ' }
8958 * <em>produces:</em>
8960 * 104 101 108 108 111
8964 rb_str_each_byte(VALUE str
)
8966 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_byte_size
);
8967 return rb_str_enumerate_bytes(str
, 0);
8972 * str.bytes -> an_array
8974 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8975 * <code>str.each_byte.to_a</code>.
8977 * If a block is given, which is a deprecated form, works the same as
8978 * <code>each_byte</code>.
8982 rb_str_bytes(VALUE str
)
8984 VALUE ary
= WANTARRAY("bytes", RSTRING_LEN(str
));
8985 return rb_str_enumerate_bytes(str
, ary
);
/*
 * Size callback handed to RETURN_SIZED_ENUMERATOR by both rb_str_each_char
 * and rb_str_each_codepoint: simply returns rb_str_length(str).
 * +args+ and +eobj+ are not referenced in the visible body.
 */
8989 rb_str_each_char_size(VALUE str
, VALUE args
, VALUE eobj
)
8991 return rb_str_length(str
);
/*
 * Shared worker for String#each_char (ary == 0) and String#chars.
 *
 * Each character is produced as rb_str_subseq(str, i, n), where +i+ is the
 * byte offset of the character and +n+ its byte length, and is handed to
 * ENUM_ELEM together with +ary+. It is also used as the fallback of
 * rb_str_enumerate_grapheme_clusters for non-Unicode encodings.
 *
 * NOTE(review): ENUM_ELEM is defined outside this chunk, and the local
 * declarations and the return statement are not visible here.
 */
8995 rb_str_enumerate_chars(VALUE str
, VALUE ary
)
/* From here on +str+ names the object returned by rb_str_new_frozen(), so
 * ptr/len/enc below describe that object, not the caller's string. */
9002 str
= rb_str_new_frozen(str
);
9003 ptr
= RSTRING_PTR(str
);
9004 len
= RSTRING_LEN(str
);
9005 enc
= rb_enc_get(str
);
/* Code range reported clean: character lengths come from rb_enc_fast_mbclen(). */
9007 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str
))) {
9008 for (i
= 0; i
< len
; i
+= n
) {
9009 n
= rb_enc_fast_mbclen(ptr
+ i
, ptr
+ len
, enc
);
9010 ENUM_ELEM(ary
, rb_str_subseq(str
, i
, n
));
/* Second loop: same walk, but lengths come from rb_enc_mbclen(). */
9014 for (i
= 0; i
< len
; i
+= n
) {
9015 n
= rb_enc_mbclen(ptr
+ i
, ptr
+ len
, enc
);
9016 ENUM_ELEM(ary
, rb_str_subseq(str
, i
, n
));
9028 * str.each_char {|cstr| block } -> str
9029 * str.each_char -> an_enumerator
9031 * Passes each character in <i>str</i> to the given block, or returns
9032 * an enumerator if no block is given.
9034 * "hello".each_char {|c| print c, ' ' }
9036 * <em>produces:</em>
9042 rb_str_each_char(VALUE str
)
9044 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_char_size
);
9045 return rb_str_enumerate_chars(str
, 0);
9050 * str.chars -> an_array
9052 * Returns an array of characters in <i>str</i>. This is a shorthand
9053 * for <code>str.each_char.to_a</code>.
9055 * If a block is given, which is a deprecated form, works the same as
9056 * <code>each_char</code>.
9060 rb_str_chars(VALUE str
)
9062 VALUE ary
= WANTARRAY("chars", rb_str_strlen(str
));
9063 return rb_str_enumerate_chars(str
, ary
);
/*
 * Shared worker for String#each_codepoint (ary == 0) and String#codepoints.
 *
 * When single_byte_optimizable(str) holds, the work is delegated to
 * rb_str_enumerate_bytes(). Otherwise each character is decoded with
 * rb_enc_codepoint_len() and the code point, wrapped with UINT2NUM, is
 * handed to ENUM_ELEM together with +ary+.
 *
 * NOTE(review): the loop header that advances +ptr+ by +n+ and the return
 * statement are not visible in this chunk.
 */
9067 rb_str_enumerate_codepoints(VALUE str
, VALUE ary
)
9072 const char *ptr
, *end
;
9075 if (single_byte_optimizable(str
))
9076 return rb_str_enumerate_bytes(str
, ary
);
/* +str+ is rebound to the rb_str_new_frozen() result before its buffer is read. */
9078 str
= rb_str_new_frozen(str
);
9079 ptr
= RSTRING_PTR(str
);
9080 end
= RSTRING_END(str
);
9081 enc
= STR_ENC_GET(str
);
9084 c
= rb_enc_codepoint_len(ptr
, end
, &n
, enc
);
9085 ENUM_ELEM(ary
, UINT2NUM(c
));
9097 * str.each_codepoint {|integer| block } -> str
9098 * str.each_codepoint -> an_enumerator
9100 * Passes the Integer ordinal of each character in <i>str</i>
9101 * (also known as a <i>codepoint</i> when applied to Unicode strings) to the
9102 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9103 * values are directly derived from the binary representation
9104 * of each character.
9106 * If no block is given, an enumerator is returned instead.
9108 * "hello\u0639".each_codepoint {|c| print c, ' ' }
9110 * <em>produces:</em>
9112 * 104 101 108 108 111 1593
9116 rb_str_each_codepoint(VALUE str
)
9118 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_char_size
);
9119 return rb_str_enumerate_codepoints(str
, 0);
9124 * str.codepoints -> an_array
9126 * Returns an array of the Integer ordinals of the
9127 * characters in <i>str</i>. This is a shorthand for
9128 * <code>str.each_codepoint.to_a</code>.
9130 * If a block is given, which is a deprecated form, works the same as
9131 * <code>each_codepoint</code>.
9135 rb_str_codepoints(VALUE str
)
9137 VALUE ary
= WANTARRAY("codepoints", rb_str_strlen(str
));
9138 return rb_str_enumerate_codepoints(str
, ary
);
/*
 * Returns a compiled regexp for the pattern \X (one extended grapheme
 * cluster) in encoding +enc+.
 *
 * The pattern source defaults to the ASCII bytes of "\X"; for the
 * UTF-16BE/LE and UTF-32BE/LE encoding indexes the CASE_UTF macro supplies
 * the same two characters spelled out in that encoding's byte layout.
 * The regexp compiled for UTF-8 is remembered in a function-local static
 * and reused by later calls; for other encodings onig_new() is called on
 * every invocation. A compilation failure is reported through rb_fatal().
 */
9142 get_reg_grapheme_cluster(rb_encoding
*enc
)
9144 int encidx
= rb_enc_to_index(enc
);
9145 regex_t
*reg_grapheme_cluster
= NULL
;
9146 static regex_t
*reg_grapheme_cluster_utf8
= NULL
;
/* Reuse the previously compiled UTF-8 regexp when there is one. */
9149 if (encidx
== rb_utf8_encindex() && reg_grapheme_cluster_utf8
) {
9150 reg_grapheme_cluster
= reg_grapheme_cluster_utf8
;
9152 if (!reg_grapheme_cluster
) {
9153 const OnigUChar source_ascii
[] = "\\X";
9154 OnigErrorInfo einfo
;
9155 const OnigUChar
*source
= source_ascii
;
9156 size_t source_len
= sizeof(source_ascii
) - 1;
9158 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9159 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9160 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9161 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9162 #define CASE_UTF(e) \
9163 case ENCINDEX_UTF_##e: { \
9164 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9165 source = source_UTF_##e; \
9166 source_len = sizeof(source_UTF_##e); \
9169 CASE_UTF(16BE
); CASE_UTF(16LE
); CASE_UTF(32BE
); CASE_UTF(32LE
);
/* onig_new() stores the compiled regexp through its first argument, so the
 * address of the local pointer is passed. */
9176 int r
= onig_new(&reg_grapheme_cluster
, source
, source
+ source_len
,
9177 ONIG_OPTION_DEFAULT
, enc
, OnigDefaultSyntax
, &einfo
);
9179 UChar message
[ONIG_MAX_ERROR_MESSAGE_LEN
];
9180 onig_error_code_to_str(message
, r
, &einfo
);
9181 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message
);
/* Remember the UTF-8 regexp for subsequent calls. */
9183 if (encidx
== rb_utf8_encindex()) {
9184 reg_grapheme_cluster_utf8
= reg_grapheme_cluster
;
9187 return reg_grapheme_cluster
;
/*
 * Size callback handed to RETURN_SIZED_ENUMERATOR by
 * rb_str_each_grapheme_cluster.
 *
 * For encodings that are not Unicode (rb_enc_unicode_p) the result is
 * rb_str_length(str). Otherwise the regexp from get_reg_grapheme_cluster()
 * is matched at +ptr+ with onig_match(); every match with a positive length
 * increments the counter, and a result <= 0 ends the loop. The count is
 * returned via SIZET2NUM. +args+ and +eobj+ are not referenced in the
 * visible body.
 *
 * NOTE(review): the loop header and the statement advancing +ptr+ by the
 * match length are not visible in this chunk.
 */
9191 rb_str_each_grapheme_cluster_size(VALUE str
, VALUE args
, VALUE eobj
)
9193 size_t grapheme_cluster_count
= 0;
9194 regex_t
*reg_grapheme_cluster
= NULL
;
9195 rb_encoding
*enc
= rb_enc_from_index(ENCODING_GET(str
));
9196 const char *ptr
, *end
;
9198 if (!rb_enc_unicode_p(enc
)) {
9199 return rb_str_length(str
);
9202 reg_grapheme_cluster
= get_reg_grapheme_cluster(enc
);
9203 ptr
= RSTRING_PTR(str
);
9204 end
= RSTRING_END(str
);
9207 OnigPosition len
= onig_match(reg_grapheme_cluster
,
9208 (const OnigUChar
*)ptr
, (const OnigUChar
*)end
,
9209 (const OnigUChar
*)ptr
, NULL
, 0);
9210 if (len
<= 0) break;
9211 grapheme_cluster_count
++;
9215 return SIZET2NUM(grapheme_cluster_count
);
/*
 * Shared worker for String#each_grapheme_cluster (ary == 0) and
 * String#grapheme_clusters.
 *
 * Non-Unicode encodings are delegated to rb_str_enumerate_chars().
 * Otherwise the regexp from get_reg_grapheme_cluster() is matched at +ptr+;
 * each match of positive length is handed to ENUM_ELEM as
 * rb_str_subseq(str, ptr - ptr0, len), and a result <= 0 ends the loop.
 *
 * NOTE(review): the loop header, the statement advancing +ptr+, and the
 * return statement are not visible in this chunk.
 */
9219 rb_str_enumerate_grapheme_clusters(VALUE str
, VALUE ary
)
9222 regex_t
*reg_grapheme_cluster
= NULL
;
9223 rb_encoding
*enc
= rb_enc_from_index(ENCODING_GET(str
));
9224 const char *ptr0
, *ptr
, *end
;
9226 if (!rb_enc_unicode_p(enc
)) {
9227 return rb_str_enumerate_chars(str
, ary
);
/* Only when no array was supplied is +str+ replaced by rb_str_new_frozen(str). */
9230 if (!ary
) str
= rb_str_new_frozen(str
);
9231 reg_grapheme_cluster
= get_reg_grapheme_cluster(enc
);
/* ptr0 keeps the buffer start so offsets can be computed as ptr - ptr0. */
9232 ptr0
= ptr
= RSTRING_PTR(str
);
9233 end
= RSTRING_END(str
);
9236 OnigPosition len
= onig_match(reg_grapheme_cluster
,
9237 (const OnigUChar
*)ptr
, (const OnigUChar
*)end
,
9238 (const OnigUChar
*)ptr
, NULL
, 0);
9239 if (len
<= 0) break;
9240 ENUM_ELEM(ary
, rb_str_subseq(str
, ptr
-ptr0
, len
));
9252 * str.each_grapheme_cluster {|cstr| block } -> str
9253 * str.each_grapheme_cluster -> an_enumerator
9255 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9256 * an enumerator if no block is given.
9257 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9258 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9260 * "a\u0300".each_char.to_a.size #=> 2
9261 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9266 rb_str_each_grapheme_cluster(VALUE str
)
9268 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_grapheme_cluster_size
);
9269 return rb_str_enumerate_grapheme_clusters(str
, 0);
9274 * str.grapheme_clusters -> an_array
9276 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9277 * for <code>str.each_grapheme_cluster.to_a</code>.
9279 * If a block is given, which is a deprecated form, works the same as
9280 * <code>each_grapheme_cluster</code>.
9284 rb_str_grapheme_clusters(VALUE str
)
9286 VALUE ary
= WANTARRAY("grapheme_clusters", rb_str_strlen(str
));
9287 return rb_str_enumerate_grapheme_clusters(str
, ary
);
/*
 * Helper used by rb_str_chop_bang and rb_str_chop to compute the length
 * +str+ should have after its last character is removed.
 *
 * Returns 0 for an empty string. Otherwise +p+ is set to the start of the
 * last character (rb_enc_prev_char from the end); if that character is
 * "\n" and the character before it is "\r", +p+ is moved back to the "\r"
 * so that both are dropped.
 *
 * NOTE(review): the final return is not visible in this chunk; presumably
 * it is the offset p - beg -- confirm.
 */
9291 chopped_length(VALUE str
)
9293 rb_encoding
*enc
= STR_ENC_GET(str
);
9294 const char *p
, *p2
, *beg
, *end
;
9296 beg
= RSTRING_PTR(str
);
9297 end
= beg
+ RSTRING_LEN(str
);
9298 if (beg
>= end
) return 0;
9299 p
= rb_enc_prev_char(beg
, end
, end
, enc
);
/* A trailing "\n" that is not the first character: look one character
 * further back for a "\r". */
9301 if (p
> beg
&& rb_enc_ascget(p
, end
, 0, enc
) == '\n') {
9302 p2
= rb_enc_prev_char(beg
, p
, end
, enc
);
9303 if (p2
&& rb_enc_ascget(p2
, end
, 0, enc
) == '\r') p
= p2
;
9310 * str.chop! -> str or nil
9312 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9313 * <code>nil</code> if <i>str</i> is the empty string. See also
9318 rb_str_chop_bang(VALUE str
)
9320 str_modify_keep_cr(str
);
9321 if (RSTRING_LEN(str
) > 0) {
9323 len
= chopped_length(str
);
9324 STR_SET_LEN(str
, len
);
9325 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
9326 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
9327 ENC_CODERANGE_CLEAR(str
);
9337 * str.chop -> new_str
9339 * Returns a new String with the last character removed. If the
9340 * string ends with <code>\r\n</code>, both characters are
9341 * removed. Applying <code>chop</code> to an empty string returns an
9342 * empty string. String#chomp is often a safer alternative, as it
9343 * leaves the string unchanged if it doesn't end in a record
9346 * "string\r\n".chop #=> "string"
9347 * "string\n\r".chop #=> "string\n"
9348 * "string\n".chop #=> "string"
9349 * "string".chop #=> "strin"
9350 * "x".chop.chop #=> ""
9354 rb_str_chop(VALUE str
)
9356 return rb_str_subseq(str
, 0, chopped_length(str
));
/*
 * Helper called from chompped_length as smart_chomp(str, e, p) when the
 * separator is rb_default_rs or a single newline character.
 *
 * For encodings whose minimum character width is greater than one byte the
 * last character is located with rb_enc_left_char_head() and tested with
 * rb_enc_is_newline(), and the character before it is compared against
 * "\r". For all other encodings the last byte, *(e-1), is examined
 * directly, and a "\r" before it is checked after stepping +e+ back.
 *
 * NOTE(review): the statements that adjust +e+ inside the branches and the
 * return are not visible in this chunk; presumably the result is the
 * chomped length e - p -- confirm. What +p+ and +e+ point at is set up by
 * the caller outside the visible lines.
 */
9360 smart_chomp(VALUE str
, const char *e
, const char *p
)
9362 rb_encoding
*enc
= rb_enc_get(str
);
9363 if (rb_enc_mbminlen(enc
) > 1) {
9364 const char *pp
= rb_enc_left_char_head(p
, e
-rb_enc_mbminlen(enc
), e
, enc
);
9365 if (rb_enc_is_newline(pp
, e
, enc
)) {
9368 pp
= e
- rb_enc_mbminlen(enc
);
9370 pp
= rb_enc_left_char_head(p
, pp
, e
, enc
);
9371 if (rb_enc_ascget(pp
, e
, 0, enc
) == '\r') {
9377 switch (*(e
-1)) { /* not e[-1] to get rid of VC bug */
9379 if (--e
> p
&& *(e
-1) == '\r') {
9392 chompped_length(VALUE str
, VALUE rs
)
9396 char *pp
, *e
, *rsptr
;
9398 char *const p
= RSTRING_PTR(str
);
9399 long len
= RSTRING_LEN(str
);
9401 if (len
== 0) return 0;
9403 if (rs
== rb_default_rs
) {
9404 return smart_chomp(str
, e
, p
);
9407 enc
= rb_enc_get(str
);
9408 RSTRING_GETMEM(rs
, rsptr
, rslen
);
9410 if (rb_enc_mbminlen(enc
) > 1) {
9412 pp
= rb_enc_left_char_head(p
, e
-rb_enc_mbminlen(enc
), e
, enc
);
9413 if (!rb_enc_is_newline(pp
, e
, enc
)) break;
9415 pp
-= rb_enc_mbminlen(enc
);
9417 pp
= rb_enc_left_char_head(p
, pp
, e
, enc
);
9418 if (rb_enc_ascget(pp
, e
, 0, enc
) == '\r') {
9425 while (e
> p
&& *(e
-1) == '\n') {
9427 if (e
> p
&& *(e
-1) == '\r')
9433 if (rslen
> len
) return len
;
9435 enc
= rb_enc_get(rs
);
9436 newline
= rsptr
[rslen
-1];
9437 if (rslen
== rb_enc_mbminlen(enc
)) {
9439 if (newline
== '\n')
9440 return smart_chomp(str
, e
, p
);
9443 if (rb_enc_is_newline(rsptr
, rsptr
+rslen
, enc
))
9444 return smart_chomp(str
, e
, p
);
9448 enc
= rb_enc_check(str
, rs
);
9449 if (is_broken_string(rs
)) {
9453 if (p
[len
-1] == newline
&&
9455 memcmp(rsptr
, pp
, rslen
) == 0)) {
9456 if (rb_enc_left_char_head(p
, pp
, e
, enc
) == pp
)
9464 * Returns the separator for arguments of rb_str_chomp.
9466 * @return returns rb_ps ($/) as default, the default value of rb_ps ($/) is "\n".
9469 chomp_rs(int argc
, const VALUE
*argv
)
9471 rb_check_arity(argc
, 0, 1);
9474 if (!NIL_P(rs
)) StringValue(rs
);
/*
 * In-place chomp of +str+ using separator +rs+; called by
 * rb_str_chomp_bang once the separator has been resolved.
 *
 * The target length comes from chompped_length(). If it is not shorter
 * than the current length nothing is modified and Qnil is returned.
 * Otherwise the string is made modifiable (str_modify_keep_cr), its length
 * is set, the terminator is rewritten with TERM_FILL, and the code range
 * is cleared unless it was ENC_CODERANGE_7BIT.
 *
 * NOTE(review): the return on the modifying path is not visible in this
 * chunk; presumably +str+ is returned -- confirm.
 */
9483 rb_str_chomp_string(VALUE str
, VALUE rs
)
9485 long olen
= RSTRING_LEN(str
);
9486 long len
= chompped_length(str
, rs
);
9487 if (len
>= olen
) return Qnil
;
9488 str_modify_keep_cr(str
);
9489 STR_SET_LEN(str
, len
);
9490 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
9491 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
9492 ENC_CODERANGE_CLEAR(str
);
9499 * str.chomp!(separator=$/) -> str or nil
9501 * Modifies <i>str</i> in place as described for String#chomp,
9502 * returning <i>str</i>, or <code>nil</code> if no modifications were
9507 rb_str_chomp_bang(int argc
, VALUE
*argv
, VALUE str
)
9510 str_modifiable(str
);
9511 if (RSTRING_LEN(str
) == 0) return Qnil
;
9512 rs
= chomp_rs(argc
, argv
);
9513 if (NIL_P(rs
)) return Qnil
;
9514 return rb_str_chomp_string(str
, rs
);
9520 * str.chomp(separator=$/) -> new_str
9522 * Returns a new String with the given record separator removed
9523 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9524 * changed from the default Ruby record separator, then <code>chomp</code> also
9525 * removes carriage return characters (that is, it will remove <code>\n</code>,
9526 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9527 * it will remove all trailing newlines from the string.
9529 * "hello".chomp #=> "hello"
9530 * "hello\n".chomp #=> "hello"
9531 * "hello\r\n".chomp #=> "hello"
9532 * "hello\n\r".chomp #=> "hello\n"
9533 * "hello\r".chomp #=> "hello"
9534 * "hello \n there".chomp #=> "hello \n there"
9535 * "hello".chomp("llo") #=> "he"
9536 * "hello\r\n\r\n".chomp('') #=> "hello"
9537 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9541 rb_str_chomp(int argc
, VALUE
*argv
, VALUE str
)
9543 VALUE rs
= chomp_rs(argc
, argv
);
9544 if (NIL_P(rs
)) return str_duplicate(rb_cString
, str
);
9545 return rb_str_subseq(str
, 0, chompped_length(str
, rs
));
/*
 * Helper for rb_str_lstrip, rb_str_lstrip_bang, rb_str_strip and
 * rb_str_strip_bang: scans forward over the range [s, e) of +str+ past
 * leading whitespace and NUL characters.
 *
 * Returns 0 when +s+ is NULL or the range is empty. For strings where
 * single_byte_optimizable() holds, bytes are skipped while they are '\0'
 * or satisfy ascii_isspace(). Otherwise characters are decoded with
 * rb_enc_codepoint_len() and the scan stops at the first code point that
 * is non-zero and not rb_isspace().
 *
 * NOTE(review): the final return is not visible in this chunk; callers use
 * the result as a byte offset from +s+, so presumably it is s - start --
 * confirm.
 */
9549 lstrip_offset(VALUE str
, const char *s
, const char *e
, rb_encoding
*enc
)
9551 const char *const start
= s
;
9553 if (!s
|| s
>= e
) return 0;
9555 /* remove spaces at head */
9556 if (single_byte_optimizable(str
)) {
9557 while (s
< e
&& (*s
== '\0' || ascii_isspace(*s
))) s
++;
9562 unsigned int cc
= rb_enc_codepoint_len(s
, e
, &n
, enc
);
9564 if (cc
&& !rb_isspace(cc
)) break;
9573 * str.lstrip! -> self or nil
9575 * Removes leading whitespace from the receiver.
9576 * Returns the altered receiver, or +nil+ if no change was made.
9577 * See also String#rstrip! and String#strip!.
9579 * Refer to String#strip for the definition of whitespace.
9581 * " hello ".lstrip! #=> "hello "
9582 * "hello ".lstrip! #=> nil
9583 * "hello".lstrip! #=> nil
9587 rb_str_lstrip_bang(VALUE str
)
9593 str_modify_keep_cr(str
);
9594 enc
= STR_ENC_GET(str
);
9595 RSTRING_GETMEM(str
, start
, olen
);
9596 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9598 long len
= olen
-loffset
;
9599 s
= start
+ loffset
;
9600 memmove(start
, s
, len
);
9601 STR_SET_LEN(str
, len
);
9602 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9611 * str.lstrip -> new_str
9613 * Returns a copy of the receiver with leading whitespace removed.
9614 * See also String#rstrip and String#strip.
9616 * Refer to String#strip for the definition of whitespace.
9618 * " hello ".lstrip #=> "hello "
9619 * "hello".lstrip #=> "hello"
9623 rb_str_lstrip(VALUE str
)
9627 RSTRING_GETMEM(str
, start
, len
);
9628 loffset
= lstrip_offset(str
, start
, start
+len
, STR_ENC_GET(str
));
9629 if (loffset
<= 0) return str_duplicate(rb_cString
, str
);
9630 return rb_str_subseq(str
, loffset
, len
- loffset
);
/*
 * Helper for rb_str_rstrip, rb_str_rstrip_bang, rb_str_strip and
 * rb_str_strip_bang: scans backward from +e+ over trailing whitespace and
 * NUL characters in the range [s, e) of +str+.
 *
 * rb_str_check_dummy_enc(enc) is called first. Returns 0 when +s+ is NULL
 * or the range is empty. For strings where single_byte_optimizable()
 * holds, +t+ steps back one byte at a time while the preceding byte is
 * '\0' or satisfies ascii_isspace(). Otherwise rb_enc_prev_char() walks
 * back one character at a time and the scan stops at the first code point
 * that is non-zero and not rb_isspace().
 *
 * NOTE(review): the initialisation of +t+ and the final return are not
 * visible in this chunk; callers subtract the result from the length, so
 * presumably it is the number of trailing bytes to drop (e - t) -- confirm.
 */
9634 rstrip_offset(VALUE str
, const char *s
, const char *e
, rb_encoding
*enc
)
9638 rb_str_check_dummy_enc(enc
);
9639 if (!s
|| s
>= e
) return 0;
9642 /* remove trailing spaces or '\0's */
9643 if (single_byte_optimizable(str
)) {
9645 while (s
< t
&& ((c
= *(t
-1)) == '\0' || ascii_isspace(c
))) t
--;
9650 while ((tp
= rb_enc_prev_char(s
, t
, e
, enc
)) != NULL
) {
9651 unsigned int c
= rb_enc_codepoint(tp
, e
, enc
);
9652 if (c
&& !rb_isspace(c
)) break;
9661 * str.rstrip! -> self or nil
9663 * Removes trailing whitespace from the receiver.
9664 * Returns the altered receiver, or +nil+ if no change was made.
9665 * See also String#lstrip! and String#strip!.
9667 * Refer to String#strip for the definition of whitespace.
9669 * " hello ".rstrip! #=> " hello"
9670 * " hello".rstrip! #=> nil
9671 * "hello".rstrip! #=> nil
9675 rb_str_rstrip_bang(VALUE str
)
9681 str_modify_keep_cr(str
);
9682 enc
= STR_ENC_GET(str
);
9683 RSTRING_GETMEM(str
, start
, olen
);
9684 roffset
= rstrip_offset(str
, start
, start
+olen
, enc
);
9686 long len
= olen
- roffset
;
9688 STR_SET_LEN(str
, len
);
9689 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9698 * str.rstrip -> new_str
9700 * Returns a copy of the receiver with trailing whitespace removed.
9701 * See also String#lstrip and String#strip.
9703 * Refer to String#strip for the definition of whitespace.
9705 * " hello ".rstrip #=> " hello"
9706 * "hello".rstrip #=> "hello"
9710 rb_str_rstrip(VALUE str
)
9716 enc
= STR_ENC_GET(str
);
9717 RSTRING_GETMEM(str
, start
, olen
);
9718 roffset
= rstrip_offset(str
, start
, start
+olen
, enc
);
9720 if (roffset
<= 0) return str_duplicate(rb_cString
, str
);
9721 return rb_str_subseq(str
, 0, olen
-roffset
);
9727 * str.strip! -> self or nil
9729 * Removes leading and trailing whitespace from the receiver.
9730 * Returns the altered receiver, or +nil+ if there was no change.
9732 * Refer to String#strip for the definition of whitespace.
9734 * " hello ".strip! #=> "hello"
9735 * "hello".strip! #=> nil
9739 rb_str_strip_bang(VALUE str
)
9742 long olen
, loffset
, roffset
;
9745 str_modify_keep_cr(str
);
9746 enc
= STR_ENC_GET(str
);
9747 RSTRING_GETMEM(str
, start
, olen
);
9748 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9749 roffset
= rstrip_offset(str
, start
+loffset
, start
+olen
, enc
);
9751 if (loffset
> 0 || roffset
> 0) {
9752 long len
= olen
-roffset
;
9755 memmove(start
, start
+ loffset
, len
);
9757 STR_SET_LEN(str
, len
);
9758 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9767 * str.strip -> new_str
9769 * Returns a copy of the receiver with leading and trailing whitespace removed.
9771 * Whitespace is defined as any of the following characters:
9772 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9774 * " hello ".strip #=> "hello"
9775 * "\tgoodbye\r\n".strip #=> "goodbye"
9776 * "\x00\t\n\v\f\r ".strip #=> ""
9777 * "hello".strip #=> "hello"
9781 rb_str_strip(VALUE str
)
9784 long olen
, loffset
, roffset
;
9785 rb_encoding
*enc
= STR_ENC_GET(str
);
9787 RSTRING_GETMEM(str
, start
, olen
);
9788 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9789 roffset
= rstrip_offset(str
, start
+loffset
, start
+olen
, enc
);
9791 if (loffset
<= 0 && roffset
<= 0) return str_duplicate(rb_cString
, str
);
9792 return rb_str_subseq(str
, loffset
, olen
-loffset
-roffset
);
/*
 * Performs one step of String#scan: searches +str+ for +pat+ beginning at
 * byte offset *start and builds the result for that match.
 *
 * The search is done with rb_pat_search(), which also receives
 * +set_backref_str+. For a String pattern the match end is
 * pos + RSTRING_LEN(pat); otherwise the match registers are taken from the
 * current backref. On the visible path *start is moved to one character
 * past +end+ (when +end+ is still inside the string) so that the scan
 * always makes progress.
 *
 * The result is the matched substring when there are no registers or only
 * register 0; otherwise it is an array with one entry per group 1..n-1.
 * rb_str_scan loops until this function returns nil.
 *
 * NOTE(review): the conditions guarding the *start update, the handling of
 * unmatched groups, and the return statements are not visible in this
 * chunk.
 */
9796 scan_once(VALUE str
, VALUE pat
, long *start
, int set_backref_str
)
9798 VALUE result
, match
;
9799 struct re_registers
*regs
;
9801 long end
, pos
= rb_pat_search(pat
, str
, *start
, set_backref_str
);
9803 if (BUILTIN_TYPE(pat
) == T_STRING
) {
9805 end
= pos
+ RSTRING_LEN(pat
);
9808 match
= rb_backref_get();
9809 regs
= RMATCH_REGS(match
);
9814 rb_encoding
*enc
= STR_ENC_GET(str
);
9816 * Always consume at least one character of the input string
9818 if (RSTRING_LEN(str
) > end
)
9819 *start
= end
+ rb_enc_fast_mbclen(RSTRING_PTR(str
) + end
,
9820 RSTRING_END(str
), enc
);
9827 if (!regs
|| regs
->num_regs
== 1) {
9828 result
= rb_str_subseq(str
, pos
, end
- pos
);
9831 result
= rb_ary_new2(regs
->num_regs
);
9832 for (i
=1; i
< regs
->num_regs
; i
++) {
9835 s
= rb_str_subseq(str
, BEG(i
), END(i
)-BEG(i
));
9837 rb_ary_push(result
, s
);
9848 * str.scan(pattern) -> array
9849 * str.scan(pattern) {|match, ...| block } -> str
9851 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9852 * Regexp or a String). For each match, a result is
9853 * generated and either added to the result array or passed to the block. If
9854 * the pattern contains no groups, each individual result consists of the
9855 * matched string, <code>$&</code>. If the pattern contains groups, each
9856 * individual result is itself an array containing one entry per group.
9859 * a.scan(/\w+/) #=> ["cruel", "world"]
9860 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9861 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9862 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9864 * And the block form:
9866 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9868 * a.scan(/(.)(.)/) {|x,y| print y, x }
9871 * <em>produces:</em>
9873 * <<cruel>> <<world>>
9878 rb_str_scan(VALUE str
, VALUE pat
)
9882 long last
= -1, prev
= 0;
9883 char *p
= RSTRING_PTR(str
); long len
= RSTRING_LEN(str
);
9885 pat
= get_pat_quoted(pat
, 1);
9886 mustnot_broken(str
);
9887 if (!rb_block_given_p()) {
9888 VALUE ary
= rb_ary_new();
9890 while (!NIL_P(result
= scan_once(str
, pat
, &start
, 0))) {
9893 rb_ary_push(ary
, result
);
9895 if (last
>= 0) rb_pat_search(pat
, str
, last
, 1);
9896 else rb_backref_set(Qnil
);
9900 while (!NIL_P(result
= scan_once(str
, pat
, &start
, 1))) {
9904 str_mod_check(str
, p
, len
);
9906 if (last
>= 0) rb_pat_search(pat
, str
, last
, 1);
9913 * str.hex -> integer
9915 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9916 * (with an optional sign and an optional <code>0x</code>) and returns the
9917 * corresponding number. Zero is returned on error.
9920 * "-1234".hex #=> -4660
9922 * "wombat".hex #=> 0
9926 rb_str_hex(VALUE str
)
9928 return rb_str_to_inum(str
, 16, FALSE
);
9934 * str.oct -> integer
9936 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9937 * optional sign) and returns the corresponding number. Returns 0 if the
9941 * "-377".oct #=> -255
9943 * "0377bad".oct #=> 255
9945 * If +str+ starts with <code>0</code>, radix indicators are honored.
9946 * See Kernel#Integer.
9950 rb_str_oct(VALUE str
)
9952 return rb_str_to_inum(str
, -8, FALSE
);
9955 #ifndef HAVE_CRYPT_R
9956 # include "ruby/thread_native.h"
9957 # include "ruby/atomic.h"
9960 rb_atomic_t initialized
;
9961 rb_nativethread_lock_t lock
;
/*
 * atexit() handler registered by crypt_mutex_initialize: asserts that
 * crypt_mutex.initialized is 1, destroys the native lock, and resets the
 * flag to 0. Only compiled when HAVE_CRYPT_R is not defined.
 */
9965 crypt_mutex_destroy(void)
9967 RUBY_ASSERT_ALWAYS(crypt_mutex
.initialized
== 1);
9968 rb_nativethread_lock_destroy(&crypt_mutex
.lock
);
9969 crypt_mutex
.initialized
= 0;
/*
 * Sets up crypt_mutex before rb_str_crypt takes the lock around crypt(3).
 * Only compiled when HAVE_CRYPT_R is not defined.
 *
 * The visible code uses crypt_mutex.initialized as a three-valued flag:
 * a compare-and-swap from 0 to 2 is retried for as long as it reports 2,
 * the lock is then initialised and crypt_mutex_destroy is registered with
 * atexit(), and a second compare-and-swap moves the flag from 2 to 1
 * (the value crypt_mutex_destroy asserts). rb_bug() reports an unexpected
 * flag value.
 *
 * NOTE(review): the branch structure that selects between these statements
 * based on +i+ is not visible in this chunk -- confirm against the full
 * source before relying on the description above.
 */
9973 crypt_mutex_initialize(void)
/* Busy-wait while the CAS reports that the flag is 2. */
9976 while ((i
= RUBY_ATOMIC_CAS(crypt_mutex
.initialized
, 0, 2)) == 2);
9979 rb_nativethread_lock_initialize(&crypt_mutex
.lock
);
9980 atexit(crypt_mutex_destroy
);
9981 RUBY_ASSERT(crypt_mutex
.initialized
== 2);
9982 RUBY_ATOMIC_CAS(crypt_mutex
.initialized
, 2, 1);
9987 rb_bug("crypt_mutex.initialized: %d->%d", i
, crypt_mutex
.initialized
);
9994 * str.crypt(salt_str) -> new_str
9996 * Returns the string generated by calling <code>crypt(3)</code>
9997 * standard library function with <code>str</code> and
9998 * <code>salt_str</code>, in this order, as its arguments. Please do
9999 * not use this method any longer. It is legacy; provided only for
10000 * backward compatibility with ruby scripts in earlier days. It is
10001 * bad to use in contemporary programs for several reasons:
10003 * * Behaviour of C's <code>crypt(3)</code> depends on the OS on which
10004 * it is run. The generated string lacks data portability.
10006 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10007 * (i.e. silently ends up in unexpected results).
10009 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10012 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10013 * very very weak. According to its manpage, Linux's traditional
10014 * <code>crypt(3)</code> output has only 2**56 variations; too
10015 * easy to brute force today. And this is the default behaviour.
10017 * * In order to make things robust some OSes implement so-called
10018 * "modular" usage. To go through, you have to do a complex
10019 * build-up of the <code>salt_str</code> parameter, by hand.
10020 * Failure in generation of a proper salt string tends not to
10021 * yield any errors; typos in parameters are normally not
10024 * * For instance, in the following example, the second invocation
10025 * of String#crypt is wrong; it has a typo in "round=" (lacks
10026 * "s"). However the call does not fail and something unexpected
10029 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10030 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10032 * * Even in the "modular" mode, some hash functions are considered
10033 * archaic and no longer recommended at all; for instance module
10034 * <code>$1$</code> is officially abandoned by its author: see
10035 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10036 * instance module <code>$3$</code> is considered completely
10037 * broken: see the manpage of FreeBSD.
10039 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
10040 * written above, <code>crypt(3)</code> on Mac OS never fails.
10041 * This means even if you build up a proper salt string it
10042 * generates a traditional DES hash anyway, and there is no way
10043 * for you to be aware of it.
10045 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10047 * If for some reason you cannot migrate to other secure contemporary
10048 * password hashing algorithms, install the string-crypt gem and
10049 * <code>require 'string/crypt'</code> to continue using it.
10053 rb_str_crypt(VALUE str
, VALUE salt
)
10055 #ifdef HAVE_CRYPT_R
10057 struct crypt_data
*data
;
10058 # define CRYPT_END() ALLOCV_END(databuf)
10060 extern char *crypt(const char *, const char *);
10061 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10064 const char *s
, *saltp
;
10066 #ifdef BROKEN_CRYPT
10067 char salt_8bit_clean
[3];
10071 mustnot_wchar(str
);
10072 mustnot_wchar(salt
);
10073 s
= StringValueCStr(str
);
10074 saltp
= RSTRING_PTR(salt
);
10075 if (RSTRING_LEN(salt
) < 2 || !saltp
[0] || !saltp
[1]) {
10076 rb_raise(rb_eArgError
, "salt too short (need >=2 bytes)");
10079 #ifdef BROKEN_CRYPT
10080 if (!ISASCII((unsigned char)saltp
[0]) || !ISASCII((unsigned char)saltp
[1])) {
10081 salt_8bit_clean
[0] = saltp
[0] & 0x7f;
10082 salt_8bit_clean
[1] = saltp
[1] & 0x7f;
10083 salt_8bit_clean
[2] = '\0';
10084 saltp
= salt_8bit_clean
;
10087 #ifdef HAVE_CRYPT_R
10088 data
= ALLOCV(databuf
, sizeof(struct crypt_data
));
10089 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10090 data
->initialized
= 0;
10092 res
= crypt_r(s
, saltp
, data
);
10094 crypt_mutex_initialize();
10095 rb_nativethread_lock_lock(&crypt_mutex
.lock
);
10096 res
= crypt(s
, saltp
);
10101 rb_syserr_fail(err
, "crypt");
10103 result
= rb_str_new_cstr(res
);
10111 * str.ord -> integer
10113 * Returns the Integer ordinal of a one-character string.
10119 rb_str_ord(VALUE s
)
10123 c
= rb_enc_codepoint(RSTRING_PTR(s
), RSTRING_END(s
), STR_ENC_GET(s
));
10124 return UINT2NUM(c
);
10128 * str.sum(n=16) -> integer
10130 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10131 * where <em>n</em> is the optional Integer parameter, defaulting
10132 * to 16. The result is simply the sum of the binary value of each byte in
10133 * <i>str</i> modulo <code>2**n</code>. This is not a particularly good
10138 rb_str_sum(int argc
, VALUE
*argv
, VALUE str
)
10141 char *ptr
, *p
, *pend
;
10143 VALUE sum
= INT2FIX(0);
10144 unsigned long sum0
= 0;
10146 if (rb_check_arity(argc
, 0, 1) && (bits
= NUM2INT(argv
[0])) < 0) {
10149 ptr
= p
= RSTRING_PTR(str
);
10150 len
= RSTRING_LEN(str
);
10154 if (FIXNUM_MAX
- UCHAR_MAX
< sum0
) {
10155 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10156 str_mod_check(str
, ptr
, len
);
10159 sum0
+= (unsigned char)*p
;
10165 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10169 if (sum
== INT2FIX(0)) {
10170 if (bits
< (int)sizeof(long)*CHAR_BIT
) {
10171 sum0
&= (((unsigned long)1)<<bits
)-1;
10173 sum
= LONG2FIX(sum0
);
10179 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10182 mod
= rb_funcall(INT2FIX(1), idLTLT
, 1, INT2FIX(bits
));
10183 mod
= rb_funcall(mod
, '-', 1, INT2FIX(1));
10184 sum
= rb_funcall(sum
, '&', 1, mod
);
10191 rb_str_justify(int argc
, VALUE
*argv
, VALUE str
, char jflag
)
10195 long width
, len
, flen
= 1, fclen
= 1;
10198 const char *f
= " ";
10199 long n
, size
, llen
, rlen
, llen2
= 0, rlen2
= 0;
10201 int singlebyte
= 1, cr
;
10204 rb_scan_args(argc
, argv
, "11", &w
, &pad
);
10205 enc
= STR_ENC_GET(str
);
10206 termlen
= rb_enc_mbminlen(enc
);
10207 width
= NUM2LONG(w
);
10210 enc
= rb_enc_check(str
, pad
);
10211 f
= RSTRING_PTR(pad
);
10212 flen
= RSTRING_LEN(pad
);
10213 fclen
= str_strlen(pad
, enc
); /* rb_enc_check */
10214 singlebyte
= single_byte_optimizable(pad
);
10215 if (flen
== 0 || fclen
== 0) {
10216 rb_raise(rb_eArgError
, "zero width padding");
10219 len
= str_strlen(str
, enc
); /* rb_enc_check */
10220 if (width
< 0 || len
>= width
) return str_duplicate(rb_cString
, str
);
10222 llen
= (jflag
== 'l') ? 0 : ((jflag
== 'r') ? n
: n
/2);
10224 cr
= ENC_CODERANGE(str
);
10226 llen2
= str_offset(f
, f
+ flen
, llen
% fclen
, enc
, singlebyte
);
10227 rlen2
= str_offset(f
, f
+ flen
, rlen
% fclen
, enc
, singlebyte
);
10229 size
= RSTRING_LEN(str
);
10230 if ((len
= llen
/ fclen
+ rlen
/ fclen
) >= LONG_MAX
/ flen
||
10231 (len
*= flen
) >= LONG_MAX
- llen2
- rlen2
||
10232 (len
+= llen2
+ rlen2
) >= LONG_MAX
- size
) {
10233 rb_raise(rb_eArgError
, "argument too big");
10236 res
= str_new0(rb_cString
, 0, len
, termlen
);
10237 p
= RSTRING_PTR(res
);
10239 memset(p
, *f
, llen
);
10243 while (llen
>= fclen
) {
10249 memcpy(p
, f
, llen2
);
10253 memcpy(p
, RSTRING_PTR(str
), size
);
10256 memset(p
, *f
, rlen
);
10260 while (rlen
>= fclen
) {
10266 memcpy(p
, f
, rlen2
);
10270 TERM_FILL(p
, termlen
);
10271 STR_SET_LEN(res
, p
-RSTRING_PTR(res
));
10272 rb_enc_associate(res
, enc
);
10274 cr
= ENC_CODERANGE_AND(cr
, ENC_CODERANGE(pad
));
10275 if (cr
!= ENC_CODERANGE_BROKEN
)
10276 ENC_CODERANGE_SET(res
, cr
);
10285 * str.ljust(integer, padstr=' ') -> new_str
10287 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10288 * String of length <i>integer</i> with <i>str</i> left justified
10289 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10291 * "hello".ljust(4) #=> "hello"
10292 * "hello".ljust(20) #=> "hello "
10293 * "hello".ljust(20, '1234') #=> "hello123412341234123"
10297 rb_str_ljust(int argc
, VALUE
*argv
, VALUE str
)
10299 return rb_str_justify(argc
, argv
, str
, 'l');
10305 * str.rjust(integer, padstr=' ') -> new_str
10307 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10308 * String of length <i>integer</i> with <i>str</i> right justified
10309 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10311 * "hello".rjust(4) #=> "hello"
10312 * "hello".rjust(20) #=> " hello"
10313 * "hello".rjust(20, '1234') #=> "123412341234123hello"
/* String#rjust entry point: forwards its arguments unchanged to
 * rb_str_justify with the 'r' flag, for which rb_str_justify sets the
 * left-padding length to n (rather than 0 for 'l' or n/2 otherwise). */
10317 rb_str_rjust(int argc
, VALUE
*argv
, VALUE str
)
10319 return rb_str_justify(argc
, argv
, str
, 'r');
10325 * str.center(width, padstr=' ') -> new_str
10327 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10328 * returns a new String of length +width+ with +str+ centered and padded with
10329 * +padstr+; otherwise, returns +str+.
10331 * "hello".center(4) #=> "hello"
10332 * "hello".center(20) #=> " hello "
10333 * "hello".center(20, '123') #=> "1231231hello12312312"
/* String#center entry point: forwards its arguments unchanged to
 * rb_str_justify with the 'c' flag; any flag other than 'l' or 'r' makes
 * rb_str_justify use n/2 as the left-padding length. */
10337 rb_str_center(int argc
, VALUE
*argv
, VALUE str
)
10339 return rb_str_justify(argc
, argv
, str
, 'c');
10344 * str.partition(sep) -> [head, sep, tail]
10345 * str.partition(regexp) -> [head, match, tail]
10347 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10348 * and returns the part before it, the match, and the part
10350 * If it is not found, returns <i>str</i> and two empty strings.
10352 * "hello".partition("l") #=> ["he", "l", "lo"]
10353 * "hello".partition("x") #=> ["hello", "", ""]
10354 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
/* String#partition: splits str around the first occurrence of sep and
 * returns a three-element array.  sep may end up as a Regexp or as a
 * plain string after get_pat_quoted; the two cases locate the separator
 * differently but share the same result construction. */
10358 rb_str_partition(VALUE str
, VALUE sep
)
10362 sep
= get_pat_quoted(sep
, 0);
/* Regexp separator: search forward from offset 0; a negative result
 * means no match. */
10363 if (RB_TYPE_P(sep
, T_REGEXP
)) {
10364 if (rb_reg_search(sep
, str
, 0, 0) < 0) {
/* On a match, the registers of the last match (END(0) is the end of
 * the whole match) are used to replace sep with the matched text. */
10367 VALUE match
= rb_backref_get();
10368 struct re_registers
*regs
= RMATCH_REGS(match
);
10371 sep
= rb_str_subseq(str
, pos
, END(0) - pos
);
/* Non-Regexp separator: plain substring search from the start. */
10374 pos
= rb_str_index(str
, sep
, 0);
10375 if (pos
< 0) goto failed
;
/* Found: the first element is str[0, pos]; the last element starts
 * right after the separator (pos + RSTRING_LEN(sep)) and runs to the
 * end of str. */
10377 return rb_ary_new3(3, rb_str_subseq(str
, 0, pos
),
10379 rb_str_subseq(str
, pos
+RSTRING_LEN(sep
),
10380 RSTRING_LEN(str
)-pos
-RSTRING_LEN(sep
)));
/* Not found: str_duplicate(str) first, then two empty strings. */
10383 return rb_ary_new3(3, str_duplicate(rb_cString
, str
), str_new_empty_String(str
), str_new_empty_String(str
));
10388 * str.rpartition(sep) -> [head, sep, tail]
10389 * str.rpartition(regexp) -> [head, match, tail]
10391 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10392 * of the string, and returns the part before it, the match, and the part
10394 * If it is not found, returns two empty strings and <i>str</i>.
10396 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10397 * "hello".rpartition("x") #=> ["", "", "hello"]
10398 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10400 * The match from the end means starting at the possible last position, not
10401 * the last of longest matches.
10403 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10405 * To partition at the last longest match, combine the pattern with a
10406 * negative lookbehind.
10408 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10410 * Or use String#partition with a negative lookahead.
10412 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
/* String#rpartition: like rb_str_partition, but searches from the end of
 * str and, when nothing is found, puts the two empty strings first and
 * the copy of str last. */
10416 rb_str_rpartition(VALUE str
, VALUE sep
)
/* The search starts at the very end of the string. */
10418 long pos
= RSTRING_LEN(str
);
10420 sep
= get_pat_quoted(sep
, 0);
/* Regexp separator: rb_reg_search is called with pos and a final
 * argument of 1 (0 in rb_str_partition, which searches forward). */
10421 if (RB_TYPE_P(sep
, T_REGEXP
)) {
10422 if (rb_reg_search(sep
, str
, pos
, 1) < 0) {
10425 VALUE match
= rb_backref_get();
10426 struct re_registers
*regs
= RMATCH_REGS(match
);
10429 sep
= rb_str_subseq(str
, pos
, END(0) - pos
);
/* Non-Regexp separator: pos is converted with rb_str_sublen before
 * rb_str_rindex and converted back with rb_str_offset afterwards.
 * NOTE(review): presumably byte offset -> character index -> byte
 * offset; confirm against those helpers. */
10432 pos
= rb_str_sublen(str
, pos
);
10433 pos
= rb_str_rindex(str
, sep
, pos
);
10437 pos
= rb_str_offset(str
, pos
);
/* Found: same slicing as rb_str_partition. */
10440 return rb_ary_new3(3, rb_str_subseq(str
, 0, pos
),
10442 rb_str_subseq(str
, pos
+RSTRING_LEN(sep
),
10443 RSTRING_LEN(str
)-pos
-RSTRING_LEN(sep
)));
/* Not found: two empty strings, then str_duplicate(str). */
10445 return rb_ary_new3(3, str_new_empty_String(str
), str_new_empty_String(str
), str_duplicate(rb_cString
, str
));
10450 * str.start_with?([prefixes]+) -> true or false
10452 * Returns true if +str+ starts with one of the +prefixes+ given.
10453 * Each of the +prefixes+ should be a String or a Regexp.
10455 * "hello".start_with?("hell") #=> true
10456 * "hello".start_with?(/H/i) #=> true
10458 * # returns true if one of the prefixes matches.
10459 * "hello".start_with?("heaven", "hell") #=> true
10460 * "hello".start_with?("heaven", "paradise") #=> false
/* String#start_with?: examines each of the argc prefixes in argv in
 * order.  Regexp prefixes are tested with rb_reg_start_with_p; any other
 * prefix is compared bytewise against the head of str. */
10464 rb_str_start_with(int argc
, VALUE
*argv
, VALUE str
)
10468 for (i
=0; i
<argc
; i
++) {
10469 VALUE tmp
= argv
[i
];
10470 if (RB_TYPE_P(tmp
, T_REGEXP
)) {
10471 if (rb_reg_start_with_p(tmp
, str
))
/* Non-Regexp prefix: check encoding compatibility first; a prefix
 * longer than str cannot match and is skipped before the memcmp over
 * the first RSTRING_LEN(tmp) bytes. */
10476 rb_enc_check(str
, tmp
);
10477 if (RSTRING_LEN(str
) < RSTRING_LEN(tmp
)) continue;
10478 if (memcmp(RSTRING_PTR(str
), RSTRING_PTR(tmp
), RSTRING_LEN(tmp
)) == 0)
10487 * str.end_with?([suffixes]+) -> true or false
10489 * Returns true if +str+ ends with one of the +suffixes+ given.
10491 * "hello".end_with?("ello") #=> true
10493 * # returns true if one of the +suffixes+ matches.
10494 * "hello".end_with?("heaven", "ello") #=> true
10495 * "hello".end_with?("heaven", "paradise") #=> false
/* String#end_with?: examines each of the argc suffixes in argv in order
 * and compares it bytewise against the tail of str. */
10499 rb_str_end_with(int argc
, VALUE
*argv
, VALUE str
)
10505 for (i
=0; i
<argc
; i
++) {
10506 VALUE tmp
= argv
[i
];
10509 enc
= rb_enc_check(str
, tmp
);
/* An empty suffix matches immediately; a suffix longer than str is
 * skipped. */
10510 if ((tlen
= RSTRING_LEN(tmp
)) == 0) return Qtrue
;
10511 if ((slen
= RSTRING_LEN(str
)) < tlen
) continue;
10512 p
= RSTRING_PTR(str
);
/* Before the bytewise comparison, check that the candidate start s is
 * itself the head of a character in enc (so the suffix does not begin
 * in the middle of a multibyte character). */
10515 if (rb_enc_left_char_head(p
, s
, e
, enc
) != s
)
10517 if (memcmp(s
, RSTRING_PTR(tmp
), RSTRING_LEN(tmp
)) == 0)
10524 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10525 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10527 * @param str the target
10528 * @param prefix the prefix
10529 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10530 * @retval Positive-Integer otherwise
/* Shared helper for delete_prefix / delete_prefix!: every early exit
 * below returns 0, which both callers treat as "nothing to delete". */
10533 deleted_prefix_length(VALUE str
, VALUE prefix
)
10535 char *strptr
, *prefixptr
;
10536 long olen
, prefixlen
;
/* Coerce prefix to a String; a prefix flagged by is_broken_string is
 * treated as not matching, and rb_enc_check validates that str and
 * prefix have compatible encodings. */
10538 StringValue(prefix
);
10539 if (is_broken_string(prefix
)) return 0;
10540 rb_enc_check(str
, prefix
);
10542 /* return 0 if str does not start with prefix */
10543 prefixlen
= RSTRING_LEN(prefix
);
/* An empty prefix, or one longer than str, cannot be deleted. */
10544 if (prefixlen
<= 0) return 0;
10545 olen
= RSTRING_LEN(str
);
10546 if (olen
< prefixlen
) return 0;
10547 strptr
= RSTRING_PTR(str
);
10548 prefixptr
= RSTRING_PTR(prefix
);
10549 if (memcmp(strptr
, prefixptr
, prefixlen
) != 0) return 0;
10556 * str.delete_prefix!(prefix) -> self or nil
10558 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10559 * <code>nil</code> if no change was made.
10561 * "hello".delete_prefix!("hel") #=> "lo"
10562 * "hello".delete_prefix!("llo") #=> nil
/* String#delete_prefix!: drops the leading prefix from str itself.
 * Returns nil when deleted_prefix_length reports no match (<= 0);
 * otherwise returns the result of rb_str_drop_bytes. */
10566 rb_str_delete_prefix_bang(VALUE str
, VALUE prefix
)
/* str_modify_keep_cr runs before the prefix is even examined, i.e. also
 * on the path that ends up returning nil. */
10569 str_modify_keep_cr(str
);
10571 prefixlen
= deleted_prefix_length(str
, prefix
);
10572 if (prefixlen
<= 0) return Qnil
;
10574 return rb_str_drop_bytes(str
, prefixlen
);
10579 * str.delete_prefix(prefix) -> new_str
10581 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10583 * "hello".delete_prefix("hel") #=> "lo"
10584 * "hello".delete_prefix("llo") #=> "hello"
/* String#delete_prefix: non-destructive variant.  When the prefix does
 * not match, returns str_duplicate(str); otherwise returns the part of
 * str that follows the first prefixlen bytes. */
10588 rb_str_delete_prefix(VALUE str
, VALUE prefix
)
10592 prefixlen
= deleted_prefix_length(str
, prefix
);
10593 if (prefixlen
<= 0) return str_duplicate(rb_cString
, str
);
10595 return rb_str_subseq(str
, prefixlen
, RSTRING_LEN(str
) - prefixlen
);
10599 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10600 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10602 * @param str the target
10603 * @param suffix the suffix
10604 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10605 * @retval Positive-Integer otherwise
/* Shared helper for delete_suffix / delete_suffix!: every early exit
 * below returns 0, which both callers treat as "nothing to delete". */
10608 deleted_suffix_length(VALUE str
, VALUE suffix
)
10610 char *strptr
, *suffixptr
, *s
;
10611 long olen
, suffixlen
;
/* Coerce suffix to a String; a suffix flagged by is_broken_string is
 * treated as not matching.  The encoding returned by rb_enc_check is
 * kept for the character-boundary test at the end. */
10614 StringValue(suffix
);
10615 if (is_broken_string(suffix
)) return 0;
10616 enc
= rb_enc_check(str
, suffix
);
10618 /* return 0 if str does not end with suffix */
10619 suffixlen
= RSTRING_LEN(suffix
);
/* An empty suffix, or one longer than str, cannot be deleted. */
10620 if (suffixlen
<= 0) return 0;
10621 olen
= RSTRING_LEN(str
);
10622 if (olen
< suffixlen
) return 0;
10623 strptr
= RSTRING_PTR(str
);
10624 suffixptr
= RSTRING_PTR(suffix
);
/* s is where the suffix would have to begin inside str. */
10625 s
= strptr
+ olen
- suffixlen
;
10626 if (memcmp(s
, suffixptr
, suffixlen
) != 0) return 0;
/* A byte-level match only counts if s is the head of a character in
 * enc, i.e. the match does not start inside a multibyte character. */
10627 if (rb_enc_left_char_head(strptr
, s
, strptr
+ olen
, enc
) != s
) return 0;
10634 * str.delete_suffix!(suffix) -> self or nil
10636 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10637 * <code>nil</code> if no change was made.
10639 * "hello".delete_suffix!("llo") #=> "he"
10640 * "hello".delete_suffix!("hel") #=> nil
/* String#delete_suffix!: truncates str in place by the length of the
 * matching suffix; returns nil when deleted_suffix_length reports no
 * match (<= 0). */
10644 rb_str_delete_suffix_bang(VALUE str
, VALUE suffix
)
10646 long olen
, suffixlen
, len
;
/* str_modifiable is checked up front; str_modify_keep_cr is only called
 * once a suffix is known to match. */
10647 str_modifiable(str
);
10649 suffixlen
= deleted_suffix_length(str
, suffix
);
10650 if (suffixlen
<= 0) return Qnil
;
10652 olen
= RSTRING_LEN(str
);
10653 str_modify_keep_cr(str
);
/* Shorten the string and rewrite the terminator at the new end. */
10654 len
= olen
- suffixlen
;
10655 STR_SET_LEN(str
, len
);
10656 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
/* The cached code range is kept only when it is 7BIT; any other cached
 * value is cleared after the truncation. */
10657 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
10658 ENC_CODERANGE_CLEAR(str
);
10665 * str.delete_suffix(suffix) -> new_str
10667 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10669 * "hello".delete_suffix("llo") #=> "he"
10670 * "hello".delete_suffix("hel") #=> "hello"
/* String#delete_suffix: non-destructive variant.  When the suffix does
 * not match, returns str_duplicate(str); otherwise returns the leading
 * RSTRING_LEN(str) - suffixlen bytes of str. */
10674 rb_str_delete_suffix(VALUE str
, VALUE suffix
)
10678 suffixlen
= deleted_suffix_length(str
, suffix
);
10679 if (suffixlen
<= 0) return str_duplicate(rb_cString
, str
);
10681 return rb_str_subseq(str
, 0, RSTRING_LEN(str
) - suffixlen
);
/* Setter hook taking (val, id, var): accepts only nil or a String and
 * raises TypeError, naming the variable via rb_id2str(id), for anything
 * else.  NOTE(review): presumably stores val through var afterwards;
 * confirm. */
10685 rb_str_setter(VALUE val
, ID id
, VALUE
*var
)
10687 if (!NIL_P(val
) && !RB_TYPE_P(val
, T_STRING
)) {
10688 rb_raise(rb_eTypeError
, "value of %"PRIsVALUE
" must be String", rb_id2str(id
));
/* Setter hook for the variable reported as `$;' in the deprecation
 * warning below: the new value is passed through rb_fs_check, a
 * TypeError with a "must be String or Regexp" message can be raised, and
 * rb_warn_deprecated is called for `$;'. */
10694 rb_fs_setter(VALUE val
, ID id
, VALUE
*var
)
10696 val
= rb_fs_check(val
);
10698 rb_raise(rb_eTypeError
,
10699 "value of %"PRIsVALUE
" must be String or Regexp",
10703 rb_warn_deprecated("`$;'", NULL
);
10711 * str.force_encoding(encoding) -> str
10713 * Changes the encoding to +encoding+ and returns self.
/* String#force_encoding: after the str_modifiable check, associates str
 * with the encoding resolved by rb_to_encoding(enc) and clears the cached
 * code range, since a value cached for the old encoding no longer
 * applies. */
10717 rb_str_force_encoding(VALUE str
, VALUE enc
)
10719 str_modifiable(str
);
10720 rb_enc_associate(str
, rb_to_encoding(enc
));
10721 ENC_CODERANGE_CLEAR(str
);
10729 * Returns a copied string whose encoding is ASCII-8BIT.
/* String#b: builds a second string object, str2, from str without
 * carrying over its encoding (str_replace_shared_without_enc). */
10733 rb_str_b(VALUE str
)
/* The new object is allocated to match str: a heap string when
 * STR_NOEMBED is set, otherwise an embedded one sized for the embedded
 * length plus the terminator. */
10736 if (FL_TEST(str
, STR_NOEMBED
)) {
10737 str2
= str_alloc_heap(rb_cString
);
10740 str2
= str_alloc_embed(rb_cString
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
10742 str_replace_shared_without_enc(str2
, str
);
/* Drop any cached code range on the new string. */
10743 ENC_CODERANGE_CLEAR(str2
);
10749 * str.valid_encoding? -> true or false
10751 * Returns true for a string which is encoded correctly.
10753 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10754 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10755 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
/* String#valid_encoding?: true unless the code range computed by
 * rb_enc_str_coderange is ENC_CODERANGE_BROKEN. */
10759 rb_str_valid_encoding_p(VALUE str
)
10761 int cr
= rb_enc_str_coderange(str
);
10763 return RBOOL(cr
!= ENC_CODERANGE_BROKEN
);
10768 * str.ascii_only? -> true or false
10770 * Returns true for a string which has only ASCII characters.
10772 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10773 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
/* String#ascii_only?: true exactly when the code range computed by
 * rb_enc_str_coderange is ENC_CODERANGE_7BIT. */
10777 rb_str_is_ascii_only_p(VALUE str
)
10779 int cr
= rb_enc_str_coderange(str
);
10781 return RBOOL(cr
== ENC_CODERANGE_7BIT
);
/* Shortens str to len characters, using "..." as the truncation marker.
 * Raises IndexError for a negative len.  Three cases are distinguished
 * below: str already fits, len is too small to keep any of str, and the
 * general case of "leading part of str + ellipsis". */
10785 rb_str_ellipsize(VALUE str
, long len
)
10787 static const char ellipsis
[] = "...";
/* sizeof includes the terminating NUL, hence the - 1. */
10788 const long ellipsislen
= sizeof(ellipsis
) - 1;
10789 rb_encoding
*const enc
= rb_enc_get(str
);
10790 const long blen
= RSTRING_LEN(str
);
10791 const char *const p
= RSTRING_PTR(str
), *e
= p
+ blen
;
10792 VALUE estr
, ret
= 0;
10794 if (len
< 0) rb_raise(rb_eIndexError
, "negative length %ld", len
);
/* Case 1: str fits.  Either len characters at the encoding's minimum
 * width already cover all blen bytes, or the position of the len-th
 * character (rb_enc_nth) is the end of the string.  Note that e is
 * updated to that position as a side effect. */
10795 if (len
* rb_enc_mbminlen(enc
) >= blen
||
10796 (e
= rb_enc_nth(p
, e
, len
, enc
)) - p
== blen
) {
/* Case 2: len does not exceed the ellipsis length, or stepping back
 * ellipsislen characters from e fails.  The result is then built from
 * the first len bytes of "..." only.  Note the assignment
 * len = ellipsislen inside the rb_enc_step_back call; it only happens
 * when the first operand of || is false. */
10799 else if (len
<= ellipsislen
||
10800 !(e
= rb_enc_step_back(p
, e
, e
, len
= ellipsislen
, enc
))) {
/* ASCII-compatible encodings can take the literal bytes directly;
 * others get a US-ASCII string transcoded to enc. */
10801 if (rb_enc_asciicompat(enc
)) {
10802 ret
= rb_str_new(ellipsis
, len
);
10803 rb_enc_associate(ret
, enc
);
10806 estr
= rb_usascii_str_new(ellipsis
, len
);
10807 ret
= rb_str_encode(estr
, rb_enc_from_encoding(enc
), 0, Qnil
);
/* Case 3: keep str[0, e - p] and append the ellipsis.  The comma
 * expression assigns ret first and then tests ASCII compatibility. */
10810 else if (ret
= rb_str_subseq(str
, 0, e
- p
), rb_enc_asciicompat(enc
)) {
10811 rb_str_cat(ret
, ellipsis
, ellipsislen
);
10814 estr
= rb_str_encode(rb_usascii_str_new(ellipsis
, ellipsislen
),
10815 rb_enc_from_encoding(enc
), 0, Qnil
);
10816 rb_str_append(ret
, estr
);
/* Validates a scrub replacement string against the target encoding enc.
 * Raises ArgumentError when the replacement itself is a broken byte
 * sequence, and Encoding::CompatibilityError when its encoding cannot be
 * used with enc. */
10822 str_compat_and_valid(VALUE str
, rb_encoding
*enc
)
10825 str
= StringValue(str
);
10826 cr
= rb_enc_str_coderange(str
);
10827 if (cr
== ENC_CODERANGE_BROKEN
) {
10828 rb_raise(rb_eArgError
, "replacement must be valid byte sequence '%+"PRIsVALUE
"'", str
);
/* Compatibility rule: a 7-bit replacement is rejected only when enc has
 * a minimum character length other than 1; any other replacement must
 * be in exactly the same encoding as enc. */
10831 rb_encoding
*e
= STR_ENC_GET(str
);
10832 if (cr
== ENC_CODERANGE_7BIT
? rb_enc_mbminlen(enc
) != 1 : enc
!= e
) {
10833 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
10834 rb_enc_name(enc
), rb_enc_name(e
));
/* Forward declaration: rb_str_scrub and rb_enc_str_scrub call
 * enc_str_scrub before its definition appears in the file. */
10840 static VALUE
enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
, int cr
);
/* Scrubs str in its own encoding: calls enc_str_scrub with
 * STR_ENC_GET(str) and the code range currently cached on str. */
10843 rb_str_scrub(VALUE str
, VALUE repl
)
10845 rb_encoding
*enc
= STR_ENC_GET(str
);
10846 return enc_str_scrub(enc
, str
, repl
, ENC_CODERANGE(str
));
/* Scrubs str as if it were encoded in enc.  The code range cached on str
 * is passed along only when enc is str's actual encoding; otherwise
 * enc_str_scrub is told the code range is unknown. */
10850 rb_enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
)
10852 int cr
= ENC_CODERANGE_UNKNOWN
;
10853 if (enc
== STR_ENC_GET(str
)) {
10854 /* cached coderange makes sense only when enc equals the
10855 * actual encoding of str */
10856 cr
= ENC_CODERANGE(str
);
10858 return enc_str_scrub(enc
, str
, repl
, cr
);
/* Core of the scrub family: walks str as text in encoding enc and copies
 * it into a new buffer (buf), substituting each invalid byte run with a
 * replacement.  The replacement is, in order of precedence in the code
 * below: the string returned by the block, the caller-supplied repl, or a
 * per-encoding default.  cr is the caller's cached code range for str;
 * the function keeps it updated to describe the output.  The callers in
 * str_scrub / str_scrub_bang treat a nil result as "nothing had to be
 * replaced". */
10862 enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
, int cr
)
10866 const char *rep
, *p
, *e
, *p1
, *sp
;
/* A block and an explicit replacement are mutually exclusive. */
10870 if (rb_block_given_p()) {
10872 rb_raise(rb_eArgError
, "both of block and replacement given");
/* A string whose cached code range is already clean needs no work. */
10876 if (ENC_CODERANGE_CLEAN_P(cr
))
/* A caller-supplied replacement is validated against enc up front. */
10879 if (!NIL_P(repl
)) {
10880 repl
= str_compat_and_valid(repl
, enc
);
10883 if (rb_enc_dummy_p(enc
)) {
10886 encidx
= rb_enc_to_index(enc
);
/* Helper macro: points rep/replen at a static copy of the given literal
 * (without its terminating NUL, hence sizeof(str)-1). */
10888 #define DEFAULT_REPLACE_CHAR(str) do { \
10889 static const char replace[sizeof(str)-1] = str; \
10890 rep = replace; replen = (int)sizeof(replace); \
10893 slen
= RSTRING_LEN(str
);
10894 p
= RSTRING_PTR(str
);
10895 e
= RSTRING_END(str
);
/* ---- ASCII-compatible encodings ---- */
10899 if (rb_enc_asciicompat(enc
)) {
10905 else if (!NIL_P(repl
)) {
10906 rep
= RSTRING_PTR(repl
);
10907 replen
= RSTRING_LEN(repl
);
/* Remember whether the replacement is pure 7-bit: inserting a non-7-bit
 * replacement upgrades the output code range to VALID. */
10908 rep7bit_p
= (ENC_CODERANGE(repl
) == ENC_CODERANGE_7BIT
);
/* Defaults: the bytes EF BF BD for UTF-8, "?" otherwise. */
10910 else if (encidx
== rb_utf8_encindex()) {
10911 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10915 DEFAULT_REPLACE_CHAR("?");
/* Start from the most restrictive code range and widen as needed. */
10918 cr
= ENC_CODERANGE_7BIT
;
/* search_nonascii skips ahead over plain ASCII. */
10920 p
= search_nonascii(p
, e
);
/* Classify the bytes at p: truncated character, complete character, or
 * invalid sequence. */
10925 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
10926 if (MBCLEN_NEEDMORE_P(ret
)) {
10929 else if (MBCLEN_CHARFOUND_P(ret
)) {
10930 cr
= ENC_CODERANGE_VALID
;
10931 p
+= MBCLEN_CHARFOUND_LEN(ret
);
10933 else if (MBCLEN_INVALID_P(ret
)) {
10935 * p1~p: valid ascii/multibyte chars
10936 * p ~e: invalid bytes + unknown bytes
/* The output buffer is created lazily, on the first invalid sequence;
 * the valid run p1..p is flushed into it. */
10938 long clen
= rb_enc_mbmaxlen(enc
);
10939 if (NIL_P(buf
)) buf
= rb_str_buf_new(RSTRING_LEN(str
));
10941 rb_str_buf_cat(buf
, p1
, p
- p1
);
/* Never look past the end of the string. */
10944 if (e
- p
< clen
) clen
= e
- p
;
/* Determine how many bytes (clen) belong to this invalid run by probing
 * progressively shorter windows. */
10951 for (; clen
> 1; clen
--) {
10952 ret
= rb_enc_precise_mbclen(q
, q
+ clen
, enc
);
10953 if (MBCLEN_NEEDMORE_P(ret
)) break;
10954 if (MBCLEN_INVALID_P(ret
)) continue;
/* Fixed replacement. */
10959 rb_str_buf_cat(buf
, rep
, replen
);
10960 if (!rep7bit_p
) cr
= ENC_CODERANGE_VALID
;
/* Block replacement: yield the invalid bytes, make sure the block did
 * not modify str (str_mod_check), then validate and append the result. */
10963 repl
= rb_yield(rb_enc_str_new(p
, clen
, enc
));
10964 str_mod_check(str
, sp
, slen
);
10965 repl
= str_compat_and_valid(repl
, enc
);
10966 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
10967 if (ENC_CODERANGE(repl
) == ENC_CODERANGE_VALID
)
10968 cr
= ENC_CODERANGE_VALID
;
10972 p
= search_nonascii(p
, e
);
/* The computed code range is cached back onto the input string. */
10984 ENC_CODERANGE_SET(str
, cr
);
10987 buf
= rb_str_buf_new(RSTRING_LEN(str
));
/* Flush the remaining valid run, then handle the bytes p..e left over at
 * the end the same way as an invalid run (fixed or block replacement). */
10990 rb_str_buf_cat(buf
, p1
, p
- p1
);
10994 rb_str_buf_cat(buf
, rep
, replen
);
10995 if (!rep7bit_p
) cr
= ENC_CODERANGE_VALID
;
10998 repl
= rb_yield(rb_enc_str_new(p
, e
-p
, enc
));
10999 str_mod_check(str
, sp
, slen
);
11000 repl
= str_compat_and_valid(repl
, enc
);
11001 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11002 if (ENC_CODERANGE(repl
) == ENC_CODERANGE_VALID
)
11003 cr
= ENC_CODERANGE_VALID
;
/* ---- ASCII-incompatible encodings: same structure, but there is no
 * ASCII fast path and windows shrink in mbminlen-sized steps. ---- */
11008 /* ASCII incompatible */
11009 long mbminlen
= rb_enc_mbminlen(enc
);
11013 else if (!NIL_P(repl
)) {
11014 rep
= RSTRING_PTR(repl
);
11015 replen
= RSTRING_LEN(repl
);
/* Defaults: FF FD / FD FF for UTF-16BE/LE, the 4-byte forms for
 * UTF-32BE/LE, and "?" for everything else. */
11017 else if (encidx
== ENCINDEX_UTF_16BE
) {
11018 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11020 else if (encidx
== ENCINDEX_UTF_16LE
) {
11021 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11023 else if (encidx
== ENCINDEX_UTF_32BE
) {
11024 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11026 else if (encidx
== ENCINDEX_UTF_32LE
) {
11027 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11030 DEFAULT_REPLACE_CHAR("?");
11034 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
11035 if (MBCLEN_NEEDMORE_P(ret
)) {
11038 else if (MBCLEN_CHARFOUND_P(ret
)) {
11039 p
+= MBCLEN_CHARFOUND_LEN(ret
);
11041 else if (MBCLEN_INVALID_P(ret
)) {
11043 long clen
= rb_enc_mbmaxlen(enc
);
11044 if (NIL_P(buf
)) buf
= rb_str_buf_new(RSTRING_LEN(str
));
11045 if (p
> p1
) rb_str_buf_cat(buf
, p1
, p
- p1
);
11047 if (e
- p
< clen
) clen
= e
- p
;
11048 if (clen
<= mbminlen
* 2) {
11053 for (; clen
> mbminlen
; clen
-=mbminlen
) {
11054 ret
= rb_enc_precise_mbclen(q
, q
+ clen
, enc
);
11055 if (MBCLEN_NEEDMORE_P(ret
)) break;
11056 if (MBCLEN_INVALID_P(ret
)) continue;
11061 rb_str_buf_cat(buf
, rep
, replen
);
11064 repl
= rb_yield(rb_enc_str_new(p
, clen
, enc
));
11065 str_mod_check(str
, sp
, slen
);
11066 repl
= str_compat_and_valid(repl
, enc
);
11067 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11078 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
11081 buf
= rb_str_buf_new(RSTRING_LEN(str
));
11084 rb_str_buf_cat(buf
, p1
, p
- p1
);
11088 rb_str_buf_cat(buf
, rep
, replen
);
11091 repl
= rb_yield(rb_enc_str_new(p
, e
-p
, enc
));
11092 str_mod_check(str
, sp
, slen
);
11093 repl
= str_compat_and_valid(repl
, enc
);
11094 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11097 cr
= ENC_CODERANGE_VALID
;
/* Stamp the output buffer with enc and the code range tracked above. */
11099 ENCODING_CODERANGE_SET(buf
, rb_enc_to_index(enc
), cr
);
11105 * str.scrub -> new_str
11106 * str.scrub(repl) -> new_str
11107 * str.scrub{|bytes|} -> new_str
11109 * If the string contains invalid byte sequences, replaces the invalid bytes with the given
11110 * replacement character; otherwise returns a copy of the string.
11111 * If block is given, replace invalid bytes with returned value of the block.
11113 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11114 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11115 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
/* String#scrub: takes at most one argument (the replacement; nil when
 * omitted).  rb_str_scrub yields nil when nothing had to be replaced, in
 * which case str_duplicate(str) is returned instead. */
11118 str_scrub(int argc
, VALUE
*argv
, VALUE str
)
/* The arity check is only reached when at least one argument was passed;
 * the comma expression then evaluates to argv[0]. */
11120 VALUE repl
= argc
? (rb_check_arity(argc
, 0, 1), argv
[0]) : Qnil
;
11121 VALUE
new = rb_str_scrub(str
, repl
);
11122 return NIL_P(new) ? str_duplicate(rb_cString
, str
): new;
11127 * str.scrub! -> str
11128 * str.scrub!(repl) -> str
11129 * str.scrub!{|bytes|} -> str
11131 * If the string contains invalid byte sequences, replaces the invalid bytes in place with the
11132 * given replacement character; otherwise leaves the string unchanged. Returns self.
11133 * If block is given, replace invalid bytes with returned value of the block.
11135 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11136 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11137 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
/* String#scrub!: same argument handling as str_scrub, but a non-nil
 * result from rb_str_scrub is written back into str with rb_str_replace;
 * a nil result leaves str untouched. */
11140 str_scrub_bang(int argc
, VALUE
*argv
, VALUE str
)
11142 VALUE repl
= argc
? (rb_check_arity(argc
, 0, 1), argv
[0]) : Qnil
;
11143 VALUE
new = rb_str_scrub(str
, repl
);
11144 if (!NIL_P(new)) rb_str_replace(str
, new);
/* File-scope state for the unicode_normalize family: the two method IDs
 * are passed to unicode_normalize_common by the wrappers below, and
 * mUnicodeNormalize is the receiver of the rb_funcallv call there. */
11148 static ID id_normalize
;
11149 static ID id_normalized_p
;
11150 static VALUE mUnicodeNormalize
;
/* Shared implementation behind unicode_normalize, unicode_normalize! and
 * unicode_normalized?: calls method `id` on mUnicodeNormalize, forwarding
 * the optional form argument. */
11153 unicode_normalize_common(int argc
, VALUE
*argv
, VALUE str
, ID id
)
/* The Ruby-level implementation is required lazily, on first use only;
 * the function-local static flag remembers that it has been loaded. */
11155 static int UnicodeNormalizeRequired
= 0;
11158 if (!UnicodeNormalizeRequired
) {
11159 rb_require("unicode_normalize/normalize.rb");
11160 UnicodeNormalizeRequired
= 1;
/* At most one argument is accepted; when present it becomes argv2[1] and
 * the call is made with argc+1 arguments.
 * NOTE(review): argv2[0] is presumably str; confirm. */
11163 if (rb_check_arity(argc
, 0, 1)) argv2
[1] = argv
[0];
11164 return rb_funcallv(mUnicodeNormalize
, id
, argc
+1, argv2
);
11169 * str.unicode_normalize(form=:nfc)
11171 * Unicode Normalization---Returns a normalized form of +str+,
11172 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11173 * The normalization form used is determined by +form+, which can
11174 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11175 * The default is +:nfc+.
11177 * If the string is not in a Unicode Encoding, then an Exception is raised.
11178 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11179 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11180 * Anything other than UTF-8 is implemented by converting to UTF-8,
11181 * which makes it slower than UTF-8.
11183 * "a\u0300".unicode_normalize #=> "\u00E0"
11184 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11185 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11186 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11187 * #=> Encoding::CompatibilityError raised
/* String#unicode_normalize: delegates to unicode_normalize_common with
 * id_normalize and returns its result. */
11190 rb_str_unicode_normalize(int argc
, VALUE
*argv
, VALUE str
)
11192 return unicode_normalize_common(argc
, argv
, str
, id_normalize
);
11197 * str.unicode_normalize!(form=:nfc)
11199 * Destructive version of String#unicode_normalize, doing Unicode
11200 * normalization in place.
/* String#unicode_normalize!: computes the normalized string exactly like
 * rb_str_unicode_normalize and then replaces the contents of str with it
 * via rb_str_replace. */
11203 rb_str_unicode_normalize_bang(int argc
, VALUE
*argv
, VALUE str
)
11205 return rb_str_replace(str
, unicode_normalize_common(argc
, argv
, str
, id_normalize
));
11209 * str.unicode_normalized?(form=:nfc)
11211 * Checks whether +str+ is in Unicode normalization form +form+,
11212 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11213 * The default is +:nfc+.
11215 * If the string is not in a Unicode Encoding, then an Exception is raised.
11216 * For details, see String#unicode_normalize.
11218 * "a\u0300".unicode_normalized? #=> false
11219 * "a\u0300".unicode_normalized?(:nfd) #=> true
11220 * "\u00E0".unicode_normalized? #=> true
11221 * "\u00E0".unicode_normalized?(:nfd) #=> false
11222 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11223 * #=> Encoding::CompatibilityError raised
/* String#unicode_normalized?: delegates to unicode_normalize_common with
 * id_normalized_p and returns its result. */
11226 rb_str_unicode_normalized_p(int argc
, VALUE
*argv
, VALUE str
)
11228 return unicode_normalize_common(argc
, argv
, str
, id_normalized_p
);
11231 /**********************************************************************
11232 * Document-class: Symbol
11234 * Symbol objects represent named identifiers inside the Ruby interpreter.
11236 * You can create a \Symbol object explicitly with:
11238 * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11240 * The same Symbol object will be
11241 * created for a given name or string for the duration of a program's
11242 * execution, regardless of the context or meaning of that name. Thus
11243 * if <code>Fred</code> is a constant in one context, a method in
11244 * another, and a class in a third, the Symbol <code>:Fred</code>
11245 * will be the same object in all three contexts.
11259 * $f1.object_id #=> 2514190
11260 * $f2.object_id #=> 2514190
11261 * $f3.object_id #=> 2514190
11263 * Constant, method, and variable names are returned as symbols:
11276 * One.instance_methods(true)
11278 * One.instance_variables
11280 * One.class_variables
11282 * global_variables.grep(/six/)
11287 * Symbol objects are different from String objects in that
11288 * Symbol objects represent identifiers, while String objects
11289 * represent text or data.
11293 * First, what's elsewhere. \Class \Symbol:
11295 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11296 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11298 * Here, class \Symbol provides methods that are useful for:
11300 * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11301 * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11302 * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11304 * === Methods for Querying
11306 * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11307 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11308 * in symbol that matches a given Regexp
11309 * or other object; returns +nil+ if no match is found.
11310 * - #[], #slice :: Returns a substring of symbol
11311 * determined by a given index, start/length, or range, or string.
11312 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11313 * - #encoding:: Returns the Encoding object that represents the encoding
11315 * - #end_with?:: Returns +true+ if symbol ends with
11316 * any of the given strings.
11317 * - #match:: Returns a MatchData object if symbol
11318 * matches a given Regexp; +nil+ otherwise.
11319 * - #match?:: Returns +true+ if symbol
11320 * matches a given Regexp; +false+ otherwise.
11321 * - #length, #size:: Returns the number of characters in symbol.
11322 * - #start_with?:: Returns +true+ if symbol starts with
11323 * any of the given strings.
11325 * === Methods for Comparing
11327 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11328 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11329 * has the same content and encoding.
11330 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11331 * symbol is smaller than, equal to, or larger than symbol.
11332 * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11333 * after Unicode case folding; +false+ otherwise.
11335 * === Methods for Converting
11337 * - #capitalize:: Returns symbol with the first character upcased
11338 * and all other characters downcased.
11339 * - #downcase:: Returns symbol with all characters downcased.
11340 * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11341 * - #name:: Returns the frozen string corresponding to symbol.
11342 * - #succ, #next:: Returns the symbol that is the successor to symbol.
11343 * - #swapcase:: Returns symbol with all upcase characters downcased
11344 * and all downcase characters upcased.
11345 * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11346 * - #to_s, #id2name:: Returns the string corresponding to +self+.
11347 * - #to_sym, #intern:: Returns +self+.
11348 * - #upcase:: Returns symbol with all characters upcased.
11355 * sym == obj -> true or false
11357 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11358 * symbol, returns <code>true</code>.
/* Symbol equality needs no code of its own: sym_equal is an alias for
 * rb_obj_equal. */
11361 #define sym_equal rb_obj_equal
/* Printability check over the bytes s..send in encoding enc.  Returns
 * FALSE as soon as a position does not hold a complete character or the
 * decoded code point is not printable (rb_enc_isprint). */
11364 sym_printable(const char *s
, const char *send
, rb_encoding
*enc
)
/* c first holds the mbclen result, then is reused for the code point. */
11368 int c
= rb_enc_precise_mbclen(s
, send
, enc
);
11370 if (!MBCLEN_CHARFOUND_P(c
)) return FALSE
;
11371 n
= MBCLEN_CHARFOUND_LEN(c
);
11372 c
= rb_enc_mbc_to_codepoint(s
, send
, enc
);
11373 if (!rb_enc_isprint(c
, enc
)) return FALSE
;
/* Tests whether the string sym can be shown as a bare symbol name.  The
 * condition below fails the test when any of these holds: the string is
 * in a different encoding than the result encoding and is not ASCII-only;
 * it contains an embedded NUL (len differs from strlen); it is not a
 * valid symbol name per rb_enc_symname2_p; or it has a non-printable
 * character. */
11380 rb_str_symname_p(VALUE sym
)
/* The result encoding is the default internal encoding, falling back to
 * the default external one when no internal encoding is set. */
11385 rb_encoding
*resenc
= rb_default_internal_encoding();
11387 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
11388 enc
= STR_ENC_GET(sym
);
11389 ptr
= RSTRING_PTR(sym
);
11390 len
= RSTRING_LEN(sym
);
11391 if ((resenc
!= enc
&& !rb_str_is_ascii_only_p(sym
)) || len
!= (long)strlen(ptr
) ||
11392 !rb_enc_symname2_p(ptr
, len
, enc
) || !sym_printable(ptr
, ptr
+ len
, enc
)) {
/* Returns rb_str_escape(str) when str cannot be shown as-is: either it is
 * in a different encoding than the result encoding and not ASCII-only, or
 * sym_printable rejects it.  Raises TypeError (Check_Type) when str is
 * not a String. */
11399 rb_str_quote_unprintable(VALUE str
)
11404 rb_encoding
*resenc
;
11406 Check_Type(str
, T_STRING
);
/* Result encoding: default internal, else default external. */
11407 resenc
= rb_default_internal_encoding();
11408 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
11409 enc
= STR_ENC_GET(str
);
11410 ptr
= RSTRING_PTR(str
);
11411 len
= RSTRING_LEN(str
);
11412 if ((resenc
!= enc
&& !rb_str_is_ascii_only_p(str
)) ||
11413 !sym_printable(ptr
, ptr
+ len
, enc
)) {
11414 return rb_str_escape(str
);
/* ID counterpart of rb_str_quote_unprintable: converts id to its name
 * with rb_id2str and returns the escaped form when rb_str_symname_p
 * rejects that name. */
11419 MJIT_FUNC_EXPORTED VALUE
11420 rb_id_quote_unprintable(ID id
)
11422 VALUE str
= rb_id2str(id
);
11423 if (!rb_str_symname_p(str
)) {
11424 return rb_str_escape(str
);
11431 * sym.inspect -> string
11433 * Returns the representation of <i>sym</i> as a symbol literal.
11435 * :fred.inspect #=> ":fred"
/* Symbol#inspect: builds the literal form of sym from its name.  Both
 * branches below produce a string one byte longer than the text they
 * copy, with the text placed at offset 1.
 * NOTE(review): the free byte at dest[0] is presumably filled with ':';
 * confirm. */
11439 sym_inspect(VALUE sym
)
11441 VALUE str
= rb_sym2str(sym
);
/* Name that is not a plain symbol name: start from the inspected
 * (quoted) string, grow it by one byte and shift the contents right. */
11446 if (!rb_str_symname_p(str
)) {
11447 str
= rb_str_inspect(str
);
11448 len
= RSTRING_LEN(str
);
11449 rb_str_resize(str
, len
+ 1);
11450 dest
= RSTRING_PTR(str
);
11451 memmove(dest
+ 1, dest
, len
);
/* Plain symbol name: allocate a fresh string of len + 1 bytes in the
 * same encoding and copy the name in at offset 1. */
11454 rb_encoding
*enc
= STR_ENC_GET(str
);
11455 RSTRING_GETMEM(str
, ptr
, len
);
11456 str
= rb_enc_str_new(0, len
+ 1, enc
);
11457 dest
= RSTRING_PTR(str
);
11458 memcpy(dest
+ 1, ptr
, len
);
11464 #if 0 /* for RDoc */
11467 * sym.name -> string
11469 * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11470 * returned string is frozen.
11472 * :fred.name #=> "fred"
11473 * :fred.name.frozen? #=> true
11474 * :fred.to_s #=> "fred"
11475 * :fred.to_s.frozen? #=> false
/* NOTE(review): this signature follows the `#if 0` marked "for RDoc", so
 * it appears to exist only to attach the Symbol#name documentation;
 * confirm where the compiled rb_sym2str is defined. */
11478 rb_sym2str(VALUE sym
)
11487 * sym.id2name -> string
11488 * sym.to_s -> string
11490 * Returns the name or string corresponding to <i>sym</i>.
11492 * :fred.id2name #=> "fred"
11493 * :ginger.to_s #=> "ginger"
11495 * Note that this string is not frozen (unlike the symbol itself).
11496 * To get a frozen string, use #name.
/* Symbol#to_s / #id2name: wraps the symbol's name string (rb_sym2str) in
 * a new String object created by str_new_shared. */
11501 rb_sym_to_s(VALUE sym
)
11503 return str_new_shared(rb_cString
, rb_sym2str(sym
));
11509 * sym.to_sym -> sym
11510 * sym.intern -> sym
11512 * In general, <code>to_sym</code> returns the Symbol corresponding
11513 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
/* NOTE(review): presumably the implementation of Symbol#to_sym / #intern
 * and returns sym itself; confirm against the function body. */
11518 sym_to_sym(VALUE sym
)
/* Invokes method mid on a receiver taken from the argument list: the
 * call passes argc - 1 arguments starting at argv + 1, together with
 * passed_proc as the block and the kw_splat flag.  Raises ArgumentError
 * ("no receiver given") when there is no receiver.
 * NOTE(review): obj is presumably argv[0]; confirm. */
11523 MJIT_FUNC_EXPORTED VALUE
11524 rb_sym_proc_call(ID mid
, int argc
, const VALUE
*argv
, int kw_splat
, VALUE passed_proc
)
11529 rb_raise(rb_eArgError
, "no receiver given");
11532 return rb_funcall_with_block_kw(obj
, mid
, argc
- 1, argv
+ 1, passed_proc
, kw_splat
);
11540 * Returns a _Proc_ object which calls the method named by _sym_ on its first argument.
11542 * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
/* NOTE(review): presumably the implementation of Symbol#to_proc; confirm
 * against the function body. */
11546 rb_sym_to_proc(VALUE sym
)
11556 * Same as <code>sym.to_s.succ.intern</code>.
/* Symbol#succ: takes the symbol's name, computes its successor with
 * rb_str_succ and interns the result back into a Symbol. */
11560 sym_succ(VALUE sym
)
11562 return rb_str_intern(rb_str_succ(rb_sym2str(sym
)));
11568 * symbol <=> other_symbol -> -1, 0, +1, or nil
11570 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11571 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11572 * less than, equal to, or greater than +other_symbol+.
11574 * +nil+ is returned if the two values are incomparable.
11576 * See String#<=> for more information.
/* Symbol#<=>: a non-Symbol `other` is handled separately by the guard
 * below; two Symbols are compared through rb_str_cmp_m on their name
 * strings. */
11580 sym_cmp(VALUE sym
, VALUE other
)
11582 if (!SYMBOL_P(other
)) {
11585 return rb_str_cmp_m(rb_sym2str(sym
), rb_sym2str(other
));
11590 * casecmp(other_symbol) -> -1, 0, 1, or nil
11592 * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11594 * :aBcDeF.casecmp(:abcde) # => 1
11595 * :aBcDeF.casecmp(:abcdef) # => 0
11596 * :aBcDeF.casecmp(:abcdefg) # => -1
11597 * :abcdef.casecmp(:ABCDEF) # => 0
11599 * Returns +nil+ if the two symbols have incompatible encodings,
11600 * or if +other_symbol+ is not a symbol:
11602 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11603 * other_sym = :"\u{c4 d6 dc}"
11604 * sym.casecmp(other_sym) # => nil
11605 * :foo.casecmp(2) # => nil
11607 * Currently, case-insensitivity only works on characters A-Z/a-z,
11608 * not all of Unicode. This is different from Symbol#casecmp?.
11610 * Related: Symbol#casecmp?.
/* Symbol#casecmp: a non-Symbol `other` is handled separately by the guard
 * below; two Symbols are compared through str_casecmp on their name
 * strings. */
11615 sym_casecmp(VALUE sym
, VALUE other
)
11617 if (!SYMBOL_P(other
)) {
11620 return str_casecmp(rb_sym2str(sym
), rb_sym2str(other
));
11625 * casecmp?(other_symbol) -> true, false, or nil
11627 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11628 * Unicode case folding, +false+ if they are not equal:
11630 * :aBcDeF.casecmp?(:abcde) # => false
11631 * :aBcDeF.casecmp?(:abcdef) # => true
11632 * :aBcDeF.casecmp?(:abcdefg) # => false
11633 * :abcdef.casecmp?(:ABCDEF) # => true
11634 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11636 * Returns +nil+ if the two symbols have incompatible encodings,
11637 * or if +other_symbol+ is not a symbol:
11639 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11640 * other_sym = :"\u{c4 d6 dc}"
11641 * sym.casecmp?(other_sym) # => nil
11642 * :foo.casecmp?(2) # => nil
11644 * See {Case Mapping}[doc/case_mapping_rdoc.html].
11646 * Related: Symbol#casecmp.
/*
 * Implementation of Symbol#casecmp?.  For a Symbol +other+, delegates to
 * str_casecmp_p on the rb_sym2str form of both operands and returns its
 * result.
 */
11651 sym_casecmp_p(VALUE sym
, VALUE other
)
/* Guard for a non-Symbol argument.  Its body is not visible in this excerpt;
 * the method documentation states that nil is returned in that case. */
11653 if (!SYMBOL_P(other
)) {
11656 return str_casecmp_p(rb_sym2str(sym
), rb_sym2str(other
));
11661 * sym =~ obj -> integer or nil
11663 * Returns <code>sym.to_s =~ obj</code>.
/*
 * Implementation of Symbol#=~.  Converts +sym+ with rb_sym2str and forwards
 * it, together with +other+ untouched, to rb_str_match; that function's
 * return value is passed straight back.
 */
11667 sym_match(VALUE sym
, VALUE other
)
11669 return rb_str_match(rb_sym2str(sym
), other
);
11674 * sym.match(pattern) -> matchdata or nil
11675 * sym.match(pattern, pos) -> matchdata or nil
11677 * Returns <code>sym.to_s.match</code>.
/*
 * Implementation of Symbol#match.  Forwards the argument vector (argc/argv)
 * unchanged to rb_str_match_m, using the rb_sym2str form of +sym+ as the
 * receiver, and returns whatever rb_str_match_m returns.
 */
11681 sym_match_m(int argc
, VALUE
*argv
, VALUE sym
)
11683 return rb_str_match_m(argc
, argv
, rb_sym2str(sym
));
11688 * sym.match?(pattern) -> true or false
11689 * sym.match?(pattern, pos) -> true or false
11691 * Returns <code>sym.to_s.match?</code>.
/*
 * Implementation of Symbol#match?.  Forwards argc/argv and the receiver to
 * rb_str_match_m_p and returns its result.
 *
 * NOTE(review): unlike the sibling wrappers in this group, +sym+ is passed
 * as-is rather than through rb_sym2str.  Presumably rb_str_match_m_p (or a
 * function it calls) accepts a Symbol receiver -- confirm before changing.
 */
11695 sym_match_m_p(int argc
, VALUE
*argv
, VALUE sym
)
11697 return rb_str_match_m_p(argc
, argv
, sym
);
11703 * sym[b, n] -> string
11704 * sym.slice(idx) -> char
11705 * sym.slice(b, n) -> string
11707 * Returns <code>sym.to_s[]</code>.
/*
 * Implementation of Symbol#[] / Symbol#slice.  Forwards argc/argv unchanged
 * to rb_str_aref_m with the rb_sym2str form of +sym+ as the receiver and
 * returns that call's result.
 */
11711 sym_aref(int argc
, VALUE
*argv
, VALUE sym
)
11713 return rb_str_aref_m(argc
, argv
, rb_sym2str(sym
));
11718 * sym.length -> integer
11719 * sym.size -> integer
11721 * Same as <code>sym.to_s.length</code>.
/*
 * Implementation of Symbol#length / Symbol#size.  Returns rb_str_length of
 * the string obtained from rb_sym2str(sym).
 */
11725 sym_length(VALUE sym
)
11727 return rb_str_length(rb_sym2str(sym
));
11732 * sym.empty? -> true or false
11734 * Returns whether _sym_ is :"" or not.
/*
 * Implementation of Symbol#empty?.  Returns rb_str_empty of the string
 * obtained from rb_sym2str(sym).
 */
11738 sym_empty(VALUE sym
)
11740 return rb_str_empty(rb_sym2str(sym
));
11745 * upcase(*options) -> symbol
11747 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11749 * See String#upcase.
/*
 * Implementation of Symbol#upcase.  The symbol is converted with rb_sym2str,
 * passed to rb_str_upcase along with the caller's options (argc/argv are
 * forwarded unchanged), and the resulting string is turned back into a
 * symbol by rb_str_intern.
 */
11754 sym_upcase(int argc
, VALUE
*argv
, VALUE sym
)
11756 return rb_str_intern(rb_str_upcase(argc
, argv
, rb_sym2str(sym
)));
11761 * downcase(*options) -> symbol
11763 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11765 * See String#downcase.
11767 * Related: Symbol#upcase.
/*
 * Implementation of Symbol#downcase.  The symbol is converted with
 * rb_sym2str, passed to rb_str_downcase along with the caller's options
 * (argc/argv are forwarded unchanged), and the resulting string is turned
 * back into a symbol by rb_str_intern.
 */
11772 sym_downcase(int argc
, VALUE
*argv
, VALUE sym
)
11774 return rb_str_intern(rb_str_downcase(argc
, argv
, rb_sym2str(sym
)));
11779 * capitalize(*options) -> symbol
11781 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11783 * See String#capitalize.
/*
 * Implementation of Symbol#capitalize.  The symbol is converted with
 * rb_sym2str, passed to rb_str_capitalize along with the caller's options
 * (argc/argv are forwarded unchanged), and the resulting string is turned
 * back into a symbol by rb_str_intern.
 */
11788 sym_capitalize(int argc
, VALUE
*argv
, VALUE sym
)
11790 return rb_str_intern(rb_str_capitalize(argc
, argv
, rb_sym2str(sym
)));
11795 * swapcase(*options) -> symbol
11797 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11799 * See String#swapcase.
/*
 * Implementation of Symbol#swapcase.  The symbol is converted with
 * rb_sym2str, passed to rb_str_swapcase along with the caller's options
 * (argc/argv are forwarded unchanged), and the resulting string is turned
 * back into a symbol by rb_str_intern.
 */
11804 sym_swapcase(int argc
, VALUE
*argv
, VALUE sym
)
11806 return rb_str_intern(rb_str_swapcase(argc
, argv
, rb_sym2str(sym
)));
11811 * sym.start_with?([prefixes]+) -> true or false
11813 * Returns true if +sym+ starts with one of the +prefixes+ given.
11814 * Each of the +prefixes+ should be a String or a Regexp.
11816 * :hello.start_with?("hell") #=> true
11817 * :hello.start_with?(/H/i) #=> true
11819 * # returns true if one of the prefixes matches.
11820 * :hello.start_with?("heaven", "hell") #=> true
11821 * :hello.start_with?("heaven", "paradise") #=> false
/*
 * Implementation of Symbol#start_with?.  Forwards the prefix arguments
 * (argc/argv) unchanged to rb_str_start_with, using the rb_sym2str form of
 * +sym+ as the receiver, and returns that call's result.
 */
11825 sym_start_with(int argc
, VALUE
*argv
, VALUE sym
)
11827 return rb_str_start_with(argc
, argv
, rb_sym2str(sym
));
11832 * sym.end_with?([suffixes]+) -> true or false
11834 * Returns true if +sym+ ends with one of the +suffixes+ given.
11836 * :hello.end_with?("ello") #=> true
11838 * # returns true if one of the +suffixes+ matches.
11839 * :hello.end_with?("heaven", "ello") #=> true
11840 * :hello.end_with?("heaven", "paradise") #=> false
/*
 * Implementation of Symbol#end_with?.  Forwards the suffix arguments
 * (argc/argv) unchanged to rb_str_end_with, using the rb_sym2str form of
 * +sym+ as the receiver, and returns that call's result.
 */
11844 sym_end_with(int argc
, VALUE
*argv
, VALUE sym
)
11846 return rb_str_end_with(argc
, argv
, rb_sym2str(sym
));
11851 * sym.encoding -> encoding
11853 * Returns the Encoding object that represents the encoding of _sym_.
/*
 * Implementation of Symbol#encoding.  Returns rb_obj_encoding of the string
 * obtained from rb_sym2str(sym).
 */
11857 sym_encoding(VALUE sym
)
11859 return rb_obj_encoding(rb_sym2str(sym
));
/*
 * Helper used by rb_to_id and rb_to_symbol to obtain a string from +name+
 * before interning.  Values that are not T_STRING are run through
 * rb_check_string_type; a TypeError with the message "... is not a symbol"
 * is raised on this path.
 *
 * NOTE(review): several lines of this function are not visible in this
 * excerpt (the condition guarding rb_raise, the rb_raise argument list and
 * the return statement).  Presumably the raise happens only when the
 * conversion fails and the converted string is returned otherwise -- confirm
 * against the full source.
 */
11863 string_for_symbol(VALUE name
)
11865 if (!RB_TYPE_P(name
, T_STRING
)) {
11866 VALUE tmp
= rb_check_string_type(name
);
11868 rb_raise(rb_eTypeError
, "%+"PRIsVALUE
" is not a symbol",
/*
 * Converts +name+ to an ID.
 *
 * A Symbol is mapped directly with SYM2ID.  Anything else is first passed
 * through string_for_symbol and the resulting string is handed to
 * rb_intern_str, whose result is returned.
 */
11877 rb_to_id(VALUE name
)
11879 if (SYMBOL_P(name
)) {
11880 return SYM2ID(name
);
11882 name
= string_for_symbol(name
);
11883 return rb_intern_str(name
);
/*
 * Converts +name+ to a Symbol.
 *
 * Non-Symbol values are passed through string_for_symbol and the resulting
 * string is converted with rb_str_intern.
 */
11887 rb_to_symbol(VALUE name
)
/* Fast path for values that already are Symbols.  The body of this branch is
 * not visible in this excerpt; presumably it returns +name+ unchanged --
 * confirm against the full source. */
11889 if (SYMBOL_P(name
)) {
11892 name
= string_for_symbol(name
);
11893 return rb_str_intern(name
);
11898 * Symbol.all_symbols => array
11900 * Returns an array of all the symbols currently in Ruby's symbol
11903 * Symbol.all_symbols.size #=> 903
11904 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11905 * :chown, :EOFError, :$;, :String,
11906 * :LOCK_SH, :"setuid?", :$<,
11907 * :default_proc, :compact, :extend,
11908 * :Tms, :getwd, :$=, :ThreadGroup,
/*
 * Implementation of Symbol.all_symbols.  The receiver argument (named +_+)
 * is not used; the function simply returns rb_sym_all_symbols().
 */
11913 sym_all_symbols(VALUE _
)
11915 return rb_sym_all_symbols();
/*
 * Returns the interned counterpart of +str+.  This is a thin wrapper that
 * forwards +str+ to rb_fstring and returns its result.
 */
11919 rb_str_to_interned_str(VALUE str
)
11921 return rb_fstring(str
);
/*
 * Returns an interned string for the +len+ bytes starting at +ptr+, tagged
 * with the US-ASCII encoding index (ENCINDEX_US_ASCII).
 *
 * A struct RString held in a local variable is initialised by setup_fake_str
 * to describe the buffer, and that object is passed to register_fstring.
 *
 * NOTE(review): the meaning of the TRUE argument to register_fstring is not
 * visible here; presumably it tells register_fstring that its argument is a
 * temporary that must not be stored as-is -- confirm.
 */
11925 rb_interned_str(const char *ptr
, long len
)
11927 struct RString fake_str
;
11928 return register_fstring(setup_fake_str(&fake_str
, ptr
, len
, ENCINDEX_US_ASCII
), TRUE
);
/*
 * Variant of rb_interned_str for C strings: the length is computed with
 * strlen, so +ptr+ must point to a NUL-terminated buffer.
 */
11932 rb_interned_str_cstr(const char *ptr
)
11934 return rb_interned_str(ptr
, strlen(ptr
));
/*
 * Returns an interned string for the +len+ bytes starting at +ptr+, tagged
 * with the encoding +enc+.
 *
 * A struct RString held in a local variable is initialised by
 * rb_setup_fake_str to describe the buffer, and that object is passed to
 * register_fstring (with TRUE as the second argument; its meaning is not
 * visible in this excerpt).
 */
11938 rb_enc_interned_str(const char *ptr
, long len
, rb_encoding
*enc
)
/* If rb_enc_autoload_p reports true for enc, rb_enc_autoload is called on it
 * before the encoding is used; UNLIKELY marks this as the uncommon case. */
11940 if (UNLIKELY(rb_enc_autoload_p(enc
))) {
11941 rb_enc_autoload(enc
);
11944 struct RString fake_str
;
11945 return register_fstring(rb_setup_fake_str(&fake_str
, ptr
, len
, enc
), TRUE
);
/*
 * Variant of rb_enc_interned_str for C strings: the length is computed with
 * strlen, so +ptr+ must point to a NUL-terminated buffer.  +enc+ is
 * forwarded unchanged.
 */
11949 rb_enc_interned_str_cstr(const char *ptr
, rb_encoding
*enc
)
11951 return rb_enc_interned_str(ptr
, strlen(ptr
), enc
);
11955 * A \String object has an arbitrary sequence of bytes,
11956 * typically representing text or binary data.
11957 * A \String object may be created using String::new or as literals.
11959 * String objects differ from Symbol objects in that Symbol objects are
11960 * designed to be used as identifiers, instead of text or data.
11962 * You can create a \String object explicitly with:
11964 * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11965 * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11967 * You can convert certain objects to Strings with:
11969 * - \Method {String}[Kernel.html#method-i-String].
11971 * Some \String methods modify +self+.
11972 * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11973 * and returns +self+;
11974 * often a similarly named method (without the <tt>!</tt>)
11975 * returns a new string.
11977 * In general, if there exist both bang and non-bang versions of a method,
11978 * the bang version mutates +self+ and the non-bang version does not.
11979 * However, a method without a bang can also mutate, such as String#replace.
11981 * == Substitution Methods
11983 * These methods perform substitutions:
11985 * - String#sub: One substitution (or none); returns a new string.
11986 * - String#sub!: One substitution (or none); returns +self+.
11987 * - String#gsub: Zero or more substitutions; returns a new string.
11988 * - String#gsub!: Zero or more substitutions; returns +self+.
11990 * Each of these methods takes:
11992 * - A first argument, +pattern+ (string or regexp),
11993 * that specifies the substring(s) to be replaced.
11995 * - Either of these:
11997 * - A second argument, +replacement+ (string or hash),
11998 * that determines the replacing string.
11999 * - A block that will determine the replacing string.
12001 * The examples in this section mostly use methods String#sub and String#gsub;
12002 * the principles illustrated apply to all four substitution methods.
12004 * <b>Argument +pattern+</b>
12006 * Argument +pattern+ is commonly a regular expression:
12009 * s.sub(/[aeiou]/, '*') # => "h*llo"
12010 * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12011 * s.gsub(/[aeiou]/, '') # => "hll"
12012 * s.sub(/ell/, 'al') # => "halo"
12013 * s.gsub(/xyzzy/, '*') # => "hello"
12014 * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12016 * When +pattern+ is a string, all its characters are treated
12017 * as ordinary characters (not as regexp special characters):
12019 * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12021 * <b>\String +replacement+</b>
12023 * If +replacement+ is a string, that string will determine
12024 * the replacing string that is to be substituted for the matched text.
12026 * Each of the examples above uses a simple string as the replacing string.
12028 * \String +replacement+ may contain back-references to the pattern's captures:
12030 * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12031 * - <tt>\k<name></tt> refers to the named capture +name+.
12033 * See rdoc-ref:regexp.rdoc for details.
12035 * Note that within the string +replacement+, a character combination
12036 * such as <tt>$&</tt> is treated as ordinary text, and not as
12037 * a special match variable.
12038 * However, you may refer to some special match variables using these
12041 * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12042 * which contains the complete matched text.
12043 * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12044 * which contains string after match.
12045 * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12046 * which contains string before match.
12047 * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12048 * which contains last capture group.
12050 * See rdoc-ref:regexp.rdoc for details.
12052 * Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12054 * Note also that a string literal consumes backslashes.
12055 * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12057 * A back-reference is typically preceded by an additional backslash.
12058 * For example, if you want to write a back-reference <tt>\&</tt> in
12059 * +replacement+ with a double-quoted string literal, you need to write
12060 * <tt>"..\\\\&.."</tt>.
12062 * If you want to write a non-back-reference string <tt>\&</tt> in
12063 * +replacement+, you need first to escape the backslash to prevent
12064 * this method from interpreting it as a back-reference, and then you
12065 * need to escape the backslashes again to prevent a string literal from
12066 * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12068 * You may want to use the block form to avoid a lot of backslashes.
12070 * <b>\Hash +replacement+</b>
12072 * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12073 * the replacing string is the value for that key:
12075 * h = {'foo' => 'bar', 'baz' => 'bat'}
12076 * 'food'.sub('foo', h) # => "bard"
12078 * Note that a symbol key does not match:
12080 * h = {foo: 'bar', baz: 'bat'}
12081 * 'food'.sub('foo', h) # => "d"
12085 * In the block form, the current match string is passed to the block;
12086 * the block's return value becomes the replacing string:
12089 * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12091 * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12092 * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12097 * First, what's elsewhere. \Class \String:
12099 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12100 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12102 * Here, class \String provides methods that are useful for:
12104 * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12105 * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12106 * - {Querying}[#class-String-label-Methods+for+Querying]
12107 * - {Comparing}[#class-String-label-Methods+for+Comparing]
12108 * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12109 * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12110 * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12111 * - {Iterating}[#class-String-label-Methods+for+Iterating]
12113 * === Methods for Creating a \String
12115 * - ::new:: Returns a new string.
12116 * - ::try_convert:: Returns a new string created from a given object.
12118 * === Methods for a Frozen/Unfrozen String
12120 * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12121 * +self+, if not frozen; +self.dup+ otherwise.
12122 * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12123 * +self+, if already frozen; +self.freeze+ otherwise.
12124 * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12126 * === Methods for Querying
12130 * - #length, #size:: Returns the count of characters (not bytes).
12131 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12132 * - #bytesize:: Returns the count of bytes.
12133 * - #count:: Returns the count of substrings matching given strings.
12137 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12138 * returns +nil+ if no match is found.
12139 * - #index:: Returns the index of the _first_ occurrence of a given substring;
12140 * returns +nil+ if none found.
12141 * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12142 * returns +nil+ if none found.
12143 * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12144 * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12145 * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12146 * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12147 * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12151 * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12152 * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12153 * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12154 * for its encoding.
12155 * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12159 * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12160 * - #hash:: Returns the integer hash code.
12162 * === Methods for Comparing
12164 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12165 * - #eql?:: Returns +true+ if the content is the same as the given other string.
12166 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12167 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+ is
12168 * smaller than, equal to, or larger than a given other string.
12169 * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12170 * +false+ otherwise.
12172 * === Methods for Modifying a \String
12174 * Each of these methods modifies +self+.
12178 * - #insert:: Returns +self+ with a given string inserted at a given offset.
12179 * - #<<:: Returns +self+ concatenated with a given string or integer.
12183 * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12184 * returns +self+ if any changes, +nil+ otherwise.
12185 * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12186 * returns +self+ if any changes, +nil+ otherwise.
12187 * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12188 * - #replace:: Returns +self+ with its entire content replaced by a given string.
12189 * - #reverse!:: Returns +self+ with its characters in reverse order.
12190 * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12191 * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12192 * returns +self+ if any changes, +nil+ otherwise.
12193 * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12194 * removing duplicates from the substrings that were modified;
12195 * returns +self+ if any changes, +nil+ otherwise.
12199 * - #capitalize!:: Upcases the initial character and downcases all others;
12200 * returns +self+ if any changes, +nil+ otherwise.
12201 * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12202 * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12203 * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12204 * returns +self+ if any changes, +nil+ otherwise.
12208 * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12209 * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12210 * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12211 * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12215 * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12216 * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12217 * - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12218 * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12219 * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12220 * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12221 * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12222 * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12223 * - #chop!:: Removes a trailing newline sequence (<tt>"\r\n"</tt>) if found, otherwise removes the last character;
12224 * returns +self+ if any changes, +nil+ otherwise.
12226 * === Methods for Converting to New \String
12228 * Each of these methods returns a new \String based on +self+,
12229 * often just a modified copy of +self+.
12233 * - #*:: Returns the concatenation of multiple copies of +self+.
12234 * - #+:: Returns the concatenation of +self+ and a given other string.
12235 * - #center:: Returns a copy of +self+ centered between pad substring.
12236 * - #concat:: Returns the concatenation of +self+ with given other strings.
12237 * - #prepend:: Returns the concatenation of a given other string with +self+.
12238 * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12239 * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12243 * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12244 * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12245 * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12246 * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12250 * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by \xHH notation
12251 * and all special characters escaped.
12252 * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12253 * and all escaped characters unescaped.
12254 * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12255 * replaced with a given replacement string.
12256 * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12257 * replaced with a given replacement string.
12258 * - #succ, #next:: Returns the string that is the successor to +self+.
12259 * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12260 * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12261 * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12262 * removing duplicates from the substrings that were modified.
12263 * - #%:: Returns the string resulting from formatting a given object into +self+
12267 * - #capitalize:: Returns a copy of +self+ with the first character upcased
12268 * and all other characters downcased.
12269 * - #downcase:: Returns a copy of +self+ with all characters downcased.
12270 * - #upcase:: Returns a copy of +self+ with all characters upcased.
12271 * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12272 * and all downcase characters upcased.
12276 * - #delete:: Returns a copy of +self+ with characters removed.
12277 * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12278 * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12279 * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12280 * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12281 * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12282 * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12283 * - #chop:: Returns a copy of +self+ with a trailing newline sequence (<tt>"\r\n"</tt>) or the last character removed.
12284 * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12285 * - #[], #slice:: Returns a substring determined by a given index, start/length, or range, or string.
12286 * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12287 * - #chr:: Returns the first character.
12291 * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12292 * otherwise, returns +self+.
12294 * === Methods for Converting to Non-\String
12296 * Each of these methods converts the contents of +self+ to a non-\String.
12298 * <em>Characters, Bytes, and Clusters</em>
12300 * - #bytes:: Returns an array of the bytes in +self+.
12301 * - #chars:: Returns an array of the characters in +self+.
12302 * - #codepoints:: Returns an array of the integer ordinals in +self+.
12303 * - #getbyte:: Returns an integer byte as determined by a given index.
12304 * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12308 * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12309 * - #partition:: Returns a 3-element array determined by the first substring that matches
12310 * a given substring or regexp.
12311 * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12312 * a given substring or regexp.
12313 * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12314 * or, if a block given, passes those substrings to the block.
12318 * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12319 * if a block given, passes each matching substring to the block.
12320 * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12321 * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12325 * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12326 * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12327 * - #ord:: Returns the integer ordinal of the first character in +self+.
12328 * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12329 * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12331 * <em>Strings and Symbols</em>
12333 * - #inspect:: Returns copy of +self+, enclosed in double-quotes, with special characters escaped.
12334 * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12336 * === Methods for Iterating
12338 * - #each_byte:: Calls the given block with each successive byte in +self+.
12339 * - #each_char:: Calls the given block with each successive character in +self+.
12340 * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12341 * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12342 * - #each_line:: Calls the given block with each successive line in +self+,
12343 * as determined by a given record separator.
12344 * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12350 rb_cString
= rb_define_class("String", rb_cObject
);
12351 assert(rb_vm_fstring_table());
12352 st_foreach(rb_vm_fstring_table(), fstring_set_class_i
, rb_cString
);
12353 rb_include_module(rb_cString
, rb_mComparable
);
12354 rb_define_alloc_func(rb_cString
, empty_str_alloc
);
12355 rb_define_singleton_method(rb_cString
, "try_convert", rb_str_s_try_convert
, 1);
12356 rb_define_method(rb_cString
, "initialize", rb_str_init
, -1);
12357 rb_define_method(rb_cString
, "initialize_copy", rb_str_replace
, 1);
12358 rb_define_method(rb_cString
, "<=>", rb_str_cmp_m
, 1);
12359 rb_define_method(rb_cString
, "==", rb_str_equal
, 1);
12360 rb_define_method(rb_cString
, "===", rb_str_equal
, 1);
12361 rb_define_method(rb_cString
, "eql?", rb_str_eql
, 1);
12362 rb_define_method(rb_cString
, "hash", rb_str_hash_m
, 0);
12363 rb_define_method(rb_cString
, "casecmp", rb_str_casecmp
, 1);
12364 rb_define_method(rb_cString
, "casecmp?", rb_str_casecmp_p
, 1);
12365 rb_define_method(rb_cString
, "+", rb_str_plus
, 1);
12366 rb_define_method(rb_cString
, "*", rb_str_times
, 1);
12367 rb_define_method(rb_cString
, "%", rb_str_format_m
, 1);
12368 rb_define_method(rb_cString
, "[]", rb_str_aref_m
, -1);
12369 rb_define_method(rb_cString
, "[]=", rb_str_aset_m
, -1);
12370 rb_define_method(rb_cString
, "insert", rb_str_insert
, 2);
12371 rb_define_method(rb_cString
, "length", rb_str_length
, 0);
12372 rb_define_method(rb_cString
, "size", rb_str_length
, 0);
12373 rb_define_method(rb_cString
, "bytesize", rb_str_bytesize
, 0);
12374 rb_define_method(rb_cString
, "empty?", rb_str_empty
, 0);
12375 rb_define_method(rb_cString
, "=~", rb_str_match
, 1);
12376 rb_define_method(rb_cString
, "match", rb_str_match_m
, -1);
12377 rb_define_method(rb_cString
, "match?", rb_str_match_m_p
, -1);
12378 rb_define_method(rb_cString
, "succ", rb_str_succ
, 0);
12379 rb_define_method(rb_cString
, "succ!", rb_str_succ_bang
, 0);
12380 rb_define_method(rb_cString
, "next", rb_str_succ
, 0);
12381 rb_define_method(rb_cString
, "next!", rb_str_succ_bang
, 0);
12382 rb_define_method(rb_cString
, "upto", rb_str_upto
, -1);
12383 rb_define_method(rb_cString
, "index", rb_str_index_m
, -1);
12384 rb_define_method(rb_cString
, "rindex", rb_str_rindex_m
, -1);
12385 rb_define_method(rb_cString
, "replace", rb_str_replace
, 1);
12386 rb_define_method(rb_cString
, "clear", rb_str_clear
, 0);
12387 rb_define_method(rb_cString
, "chr", rb_str_chr
, 0);
12388 rb_define_method(rb_cString
, "getbyte", rb_str_getbyte
, 1);
12389 rb_define_method(rb_cString
, "setbyte", rb_str_setbyte
, 2);
12390 rb_define_method(rb_cString
, "byteslice", rb_str_byteslice
, -1);
12391 rb_define_method(rb_cString
, "scrub", str_scrub
, -1);
12392 rb_define_method(rb_cString
, "scrub!", str_scrub_bang
, -1);
12393 rb_define_method(rb_cString
, "freeze", rb_str_freeze
, 0);
12394 rb_define_method(rb_cString
, "+@", str_uplus
, 0);
12395 rb_define_method(rb_cString
, "-@", str_uminus
, 0);
12397 rb_define_method(rb_cString
, "to_i", rb_str_to_i
, -1);
12398 rb_define_method(rb_cString
, "to_f", rb_str_to_f
, 0);
12399 rb_define_method(rb_cString
, "to_s", rb_str_to_s
, 0);
12400 rb_define_method(rb_cString
, "to_str", rb_str_to_s
, 0);
12401 rb_define_method(rb_cString
, "inspect", rb_str_inspect
, 0);
12402 rb_define_method(rb_cString
, "dump", rb_str_dump
, 0);
12403 rb_define_method(rb_cString
, "undump", str_undump
, 0);
12405 sym_ascii
= ID2SYM(rb_intern_const("ascii"));
12406 sym_turkic
= ID2SYM(rb_intern_const("turkic"));
12407 sym_lithuanian
= ID2SYM(rb_intern_const("lithuanian"));
12408 sym_fold
= ID2SYM(rb_intern_const("fold"));
12410 rb_define_method(rb_cString
, "upcase", rb_str_upcase
, -1);
12411 rb_define_method(rb_cString
, "downcase", rb_str_downcase
, -1);
12412 rb_define_method(rb_cString
, "capitalize", rb_str_capitalize
, -1);
12413 rb_define_method(rb_cString
, "swapcase", rb_str_swapcase
, -1);
12415 rb_define_method(rb_cString
, "upcase!", rb_str_upcase_bang
, -1);
12416 rb_define_method(rb_cString
, "downcase!", rb_str_downcase_bang
, -1);
12417 rb_define_method(rb_cString
, "capitalize!", rb_str_capitalize_bang
, -1);
12418 rb_define_method(rb_cString
, "swapcase!", rb_str_swapcase_bang
, -1);
12420 rb_define_method(rb_cString
, "hex", rb_str_hex
, 0);
12421 rb_define_method(rb_cString
, "oct", rb_str_oct
, 0);
12422 rb_define_method(rb_cString
, "split", rb_str_split_m
, -1);
12423 rb_define_method(rb_cString
, "lines", rb_str_lines
, -1);
12424 rb_define_method(rb_cString
, "bytes", rb_str_bytes
, 0);
12425 rb_define_method(rb_cString
, "chars", rb_str_chars
, 0);
12426 rb_define_method(rb_cString
, "codepoints", rb_str_codepoints
, 0);
12427 rb_define_method(rb_cString
, "grapheme_clusters", rb_str_grapheme_clusters
, 0);
12428 rb_define_method(rb_cString
, "reverse", rb_str_reverse
, 0);
12429 rb_define_method(rb_cString
, "reverse!", rb_str_reverse_bang
, 0);
12430 rb_define_method(rb_cString
, "concat", rb_str_concat_multi
, -1);
12431 rb_define_method(rb_cString
, "<<", rb_str_concat
, 1);
12432 rb_define_method(rb_cString
, "prepend", rb_str_prepend_multi
, -1);
12433 rb_define_method(rb_cString
, "crypt", rb_str_crypt
, 1);
12434 rb_define_method(rb_cString
, "intern", rb_str_intern
, 0); /* in symbol.c */
12435 rb_define_method(rb_cString
, "to_sym", rb_str_intern
, 0); /* in symbol.c */
12436 rb_define_method(rb_cString
, "ord", rb_str_ord
, 0);
12438 rb_define_method(rb_cString
, "include?", rb_str_include
, 1);
12439 rb_define_method(rb_cString
, "start_with?", rb_str_start_with
, -1);
12440 rb_define_method(rb_cString
, "end_with?", rb_str_end_with
, -1);
12442 rb_define_method(rb_cString
, "scan", rb_str_scan
, 1);
12444 rb_define_method(rb_cString
, "ljust", rb_str_ljust
, -1);
12445 rb_define_method(rb_cString
, "rjust", rb_str_rjust
, -1);
12446 rb_define_method(rb_cString
, "center", rb_str_center
, -1);
12448 rb_define_method(rb_cString
, "sub", rb_str_sub
, -1);
12449 rb_define_method(rb_cString
, "gsub", rb_str_gsub
, -1);
12450 rb_define_method(rb_cString
, "chop", rb_str_chop
, 0);
12451 rb_define_method(rb_cString
, "chomp", rb_str_chomp
, -1);
12452 rb_define_method(rb_cString
, "strip", rb_str_strip
, 0);
12453 rb_define_method(rb_cString
, "lstrip", rb_str_lstrip
, 0);
12454 rb_define_method(rb_cString
, "rstrip", rb_str_rstrip
, 0);
12455 rb_define_method(rb_cString
, "delete_prefix", rb_str_delete_prefix
, 1);
12456 rb_define_method(rb_cString
, "delete_suffix", rb_str_delete_suffix
, 1);
12458 rb_define_method(rb_cString
, "sub!", rb_str_sub_bang
, -1);
12459 rb_define_method(rb_cString
, "gsub!", rb_str_gsub_bang
, -1);
12460 rb_define_method(rb_cString
, "chop!", rb_str_chop_bang
, 0);
12461 rb_define_method(rb_cString
, "chomp!", rb_str_chomp_bang
, -1);
12462 rb_define_method(rb_cString
, "strip!", rb_str_strip_bang
, 0);
12463 rb_define_method(rb_cString
, "lstrip!", rb_str_lstrip_bang
, 0);
12464 rb_define_method(rb_cString
, "rstrip!", rb_str_rstrip_bang
, 0);
12465 rb_define_method(rb_cString
, "delete_prefix!", rb_str_delete_prefix_bang
, 1);
12466 rb_define_method(rb_cString
, "delete_suffix!", rb_str_delete_suffix_bang
, 1);
12468 rb_define_method(rb_cString
, "tr", rb_str_tr
, 2);
12469 rb_define_method(rb_cString
, "tr_s", rb_str_tr_s
, 2);
12470 rb_define_method(rb_cString
, "delete", rb_str_delete
, -1);
12471 rb_define_method(rb_cString
, "squeeze", rb_str_squeeze
, -1);
12472 rb_define_method(rb_cString
, "count", rb_str_count
, -1);
12474 rb_define_method(rb_cString
, "tr!", rb_str_tr_bang
, 2);
12475 rb_define_method(rb_cString
, "tr_s!", rb_str_tr_s_bang
, 2);
12476 rb_define_method(rb_cString
, "delete!", rb_str_delete_bang
, -1);
12477 rb_define_method(rb_cString
, "squeeze!", rb_str_squeeze_bang
, -1);
12479 rb_define_method(rb_cString
, "each_line", rb_str_each_line
, -1);
12480 rb_define_method(rb_cString
, "each_byte", rb_str_each_byte
, 0);
12481 rb_define_method(rb_cString
, "each_char", rb_str_each_char
, 0);
12482 rb_define_method(rb_cString
, "each_codepoint", rb_str_each_codepoint
, 0);
12483 rb_define_method(rb_cString
, "each_grapheme_cluster", rb_str_each_grapheme_cluster
, 0);
12485 rb_define_method(rb_cString
, "sum", rb_str_sum
, -1);
12487 rb_define_method(rb_cString
, "slice", rb_str_aref_m
, -1);
12488 rb_define_method(rb_cString
, "slice!", rb_str_slice_bang
, -1);
12490 rb_define_method(rb_cString
, "partition", rb_str_partition
, 1);
12491 rb_define_method(rb_cString
, "rpartition", rb_str_rpartition
, 1);
12493 rb_define_method(rb_cString
, "encoding", rb_obj_encoding
, 0); /* in encoding.c */
12494 rb_define_method(rb_cString
, "force_encoding", rb_str_force_encoding
, 1);
12495 rb_define_method(rb_cString
, "b", rb_str_b
, 0);
12496 rb_define_method(rb_cString
, "valid_encoding?", rb_str_valid_encoding_p
, 0);
12497 rb_define_method(rb_cString
, "ascii_only?", rb_str_is_ascii_only_p
, 0);
12499 /* define UnicodeNormalize module here so that we don't have to look it up */
12500 mUnicodeNormalize
= rb_define_module("UnicodeNormalize");
12501 id_normalize
= rb_intern_const("normalize");
12502 id_normalized_p
= rb_intern_const("normalized?");
12504 rb_define_method(rb_cString
, "unicode_normalize", rb_str_unicode_normalize
, -1);
12505 rb_define_method(rb_cString
, "unicode_normalize!", rb_str_unicode_normalize_bang
, -1);
12506 rb_define_method(rb_cString
, "unicode_normalized?", rb_str_unicode_normalized_p
, -1);
12509 rb_define_hooked_variable("$;", &rb_fs
, 0, rb_fs_setter
);
12510 rb_define_hooked_variable("$-F", &rb_fs
, 0, rb_fs_setter
);
12511 rb_gc_register_address(&rb_fs
);
12513 rb_cSymbol
= rb_define_class("Symbol", rb_cObject
);
12514 rb_include_module(rb_cSymbol
, rb_mComparable
);
12515 rb_undef_alloc_func(rb_cSymbol
);
12516 rb_undef_method(CLASS_OF(rb_cSymbol
), "new");
12517 rb_define_singleton_method(rb_cSymbol
, "all_symbols", sym_all_symbols
, 0);
12519 rb_define_method(rb_cSymbol
, "==", sym_equal
, 1);
12520 rb_define_method(rb_cSymbol
, "===", sym_equal
, 1);
12521 rb_define_method(rb_cSymbol
, "inspect", sym_inspect
, 0);
12522 rb_define_method(rb_cSymbol
, "to_s", rb_sym_to_s
, 0);
12523 rb_define_method(rb_cSymbol
, "id2name", rb_sym_to_s
, 0);
12524 rb_define_method(rb_cSymbol
, "name", rb_sym2str
, 0);
12525 rb_define_method(rb_cSymbol
, "intern", sym_to_sym
, 0);
12526 rb_define_method(rb_cSymbol
, "to_sym", sym_to_sym
, 0);
12527 rb_define_method(rb_cSymbol
, "to_proc", rb_sym_to_proc
, 0);
12528 rb_define_method(rb_cSymbol
, "succ", sym_succ
, 0);
12529 rb_define_method(rb_cSymbol
, "next", sym_succ
, 0);
12531 rb_define_method(rb_cSymbol
, "<=>", sym_cmp
, 1);
12532 rb_define_method(rb_cSymbol
, "casecmp", sym_casecmp
, 1);
12533 rb_define_method(rb_cSymbol
, "casecmp?", sym_casecmp_p
, 1);
12534 rb_define_method(rb_cSymbol
, "=~", sym_match
, 1);
12536 rb_define_method(rb_cSymbol
, "[]", sym_aref
, -1);
12537 rb_define_method(rb_cSymbol
, "slice", sym_aref
, -1);
12538 rb_define_method(rb_cSymbol
, "length", sym_length
, 0);
12539 rb_define_method(rb_cSymbol
, "size", sym_length
, 0);
12540 rb_define_method(rb_cSymbol
, "empty?", sym_empty
, 0);
12541 rb_define_method(rb_cSymbol
, "match", sym_match_m
, -1);
12542 rb_define_method(rb_cSymbol
, "match?", sym_match_m_p
, -1);
12544 rb_define_method(rb_cSymbol
, "upcase", sym_upcase
, -1);
12545 rb_define_method(rb_cSymbol
, "downcase", sym_downcase
, -1);
12546 rb_define_method(rb_cSymbol
, "capitalize", sym_capitalize
, -1);
12547 rb_define_method(rb_cSymbol
, "swapcase", sym_swapcase
, -1);
12549 rb_define_method(rb_cSymbol
, "start_with?", sym_start_with
, -1);
12550 rb_define_method(rb_cSymbol
, "end_with?", sym_end_with
, -1);
12552 rb_define_method(rb_cSymbol
, "encoding", sym_encoding
, 0);