1 /**********************************************************************
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
24 #include "debug_counter.h"
29 #include "internal/array.h"
30 #include "internal/compar.h"
31 #include "internal/compilers.h"
32 #include "internal/encoding.h"
33 #include "internal/error.h"
34 #include "internal/gc.h"
35 #include "internal/numeric.h"
36 #include "internal/object.h"
37 #include "internal/proc.h"
38 #include "internal/re.h"
39 #include "internal/sanitizers.h"
40 #include "internal/string.h"
41 #include "internal/transcode.h"
43 #include "ruby/encoding.h"
45 #include "ruby/util.h"
46 #include "ruby_assert.h"
49 #if defined HAVE_CRYPT_R
50 # if defined HAVE_CRYPT_H
53 #elif !defined HAVE_CRYPT
54 # include "missing/crypt.h"
55 # define HAVE_CRYPT_R 1
58 #define BEG(no) (regs->beg[(no)])
59 #define END(no) (regs->end[(no)])
62 #undef rb_usascii_str_new
63 #undef rb_utf8_str_new
65 #undef rb_str_new_cstr
66 #undef rb_tainted_str_new_cstr
67 #undef rb_usascii_str_new_cstr
68 #undef rb_utf8_str_new_cstr
69 #undef rb_enc_str_new_cstr
70 #undef rb_external_str_new_cstr
71 #undef rb_locale_str_new_cstr
72 #undef rb_str_dup_frozen
73 #undef rb_str_buf_new_cstr
75 #undef rb_str_buf_cat2
77 #undef rb_str_cat_cstr
78 #undef rb_fstring_cstr
86 * 2: STR_SHARED (== ELTS_SHARED)
87 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
88 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
89 * other strings that rely on this string's buffer)
90 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
91 * early, specific to rb_str_tmp_frozen_{acquire,release})
92 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
93 * such as read(2). Any modification and realloc is prohibited)
95 * 8-9: ENC_CODERANGE (2 bits)
96 * 10-16: ENCODING (7 bits == 128)
98 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
99 * used for a string object based on C string literal)
100 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
101 * object header is temporarily allocated on C stack)
104 #define RUBY_MAX_CHAR_LEN 16
105 #define STR_SHARED_ROOT FL_USER5
106 #define STR_BORROWED FL_USER6
107 #define STR_TMPLOCK FL_USER7
108 #define STR_NOFREE FL_USER18
109 #define STR_FAKESTR FL_USER19
111 #define STR_SET_NOEMBED(str) do {\
112 FL_SET((str), STR_NOEMBED);\
114 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
117 STR_SET_EMBED_LEN((str), 0);\
120 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
122 # define STR_SET_EMBED_LEN(str, n) do { \
123 assert(str_embed_capa(str) > (n));\
124 RSTRING(str)->as.embed.len = (n);\
127 # define STR_SET_EMBED_LEN(str, n) do { \
129 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
130 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
134 #define STR_SET_LEN(str, n) do { \
135 if (STR_EMBED_P(str)) {\
136 STR_SET_EMBED_LEN((str), (n));\
139 RSTRING(str)->as.heap.len = (n);\
143 #define STR_DEC_LEN(str) do {\
144 if (STR_EMBED_P(str)) {\
145 long n = RSTRING_LEN(str);\
147 STR_SET_EMBED_LEN((str), n);\
150 RSTRING(str)->as.heap.len--;\
154 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/*
 * Write a terminator of `termlen` bytes at `ptr`.  A single NUL byte is
 * always stored; wider terminators are zero-filled with memset().
 * Both arguments are evaluated exactly once.
 */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
    else {\
        term_fill_ptr[0] = '\0';\
    }\
} while (0)
/*
 * Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * required by the string's encoding (TERM_LEN).
 *
 * The macro-local variable carries a prefixed name so that it cannot
 * capture an identifier (e.g. a caller's own `termlen`) that appears in
 * the `capacity` argument.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int resize_capa_termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,resize_capa_termlen);\
} while (0)
/*
 * Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string: nothing happens while the inline capacity suffices;
 *   otherwise the contents are copied into a newly allocated heap buffer
 *   and the string is switched to the non-embedded representation.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be STR_SHARED (asserted).
 *
 * `capacity` and `termlen` are parenthesized at every use so that
 * expression arguments keep their intended precedence, and the
 * macro-local variables are prefixed so they cannot capture identifiers
 * appearing in the arguments.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const resize_capa_tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long resize_capa_len = RSTRING_LEN(str);\
            memcpy(resize_capa_tmp, RSTRING_PTR(str), resize_capa_len);\
            RSTRING(str)->as.heap.ptr = resize_capa_tmp;\
            RSTRING(str)->as.heap.len = resize_capa_len;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
187 #define STR_SET_SHARED(str, shared_str) do { \
188 if (!FL_TEST(str, STR_FAKESTR)) { \
189 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
190 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
191 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
192 FL_SET((str), STR_SHARED); \
193 FL_SET((shared_str), STR_SHARED_ROOT); \
194 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
195 FL_SET_RAW((shared_str), STR_BORROWED); \
199 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
200 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
201 /* TODO: include the terminator size in capa. */
203 #define STR_ENC_GET(str) get_encoding(str)
205 #if !defined SHARABLE_MIDDLE_SUBSTRING
206 # define SHARABLE_MIDDLE_SUBSTRING 0
208 #if !SHARABLE_MIDDLE_SUBSTRING
209 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
211 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
216 str_embed_capa(VALUE str
)
219 return rb_gc_obj_slot_size(str
) - offsetof(struct RString
, as
.embed
.ary
);
221 return RSTRING_EMBED_LEN_MAX
+ 1;
226 str_embed_size(long capa
)
228 return offsetof(struct RString
, as
.embed
.ary
) + capa
;
232 STR_EMBEDDABLE_P(long len
, long termlen
)
235 return rb_gc_size_allocatable_p(str_embed_size(len
+ termlen
));
237 return len
<= RSTRING_EMBED_LEN_MAX
+ 1 - termlen
;
241 static VALUE
str_replace_shared_without_enc(VALUE str2
, VALUE str
);
242 static VALUE
str_new_frozen(VALUE klass
, VALUE orig
);
243 static VALUE
str_new_frozen_buffer(VALUE klass
, VALUE orig
, int copy_encoding
);
244 static VALUE
str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
);
245 static VALUE
str_new(VALUE klass
, const char *ptr
, long len
);
246 static void str_make_independent_expand(VALUE str
, long len
, long expand
, const int termlen
);
247 static inline void str_modifiable(VALUE str
);
248 static VALUE
rb_str_downcase(int argc
, VALUE
*argv
, VALUE str
);
251 str_make_independent(VALUE str
)
253 long len
= RSTRING_LEN(str
);
254 int termlen
= TERM_LEN(str
);
255 str_make_independent_expand((str
), len
, 0L, termlen
);
258 static inline int str_dependent_p(VALUE str
);
261 rb_str_make_independent(VALUE str
)
263 if (str_dependent_p(str
)) {
264 str_make_independent(str
);
/*
 * Debugging aid: report on stderr that the function named `func` is about
 * to return a NULL string pointer.
 *
 * The message is newline-terminated so that whatever is printed next
 * (typically a crash report) starts on its own line instead of being
 * glued to this diagnostic.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
278 /* symbols for [up|down|swap]case/capitalize options */
279 static VALUE sym_ascii
, sym_turkic
, sym_lithuanian
, sym_fold
;
282 get_actual_encoding(const int encidx
, VALUE str
)
284 const unsigned char *q
;
287 case ENCINDEX_UTF_16
:
288 if (RSTRING_LEN(str
) < 2) break;
289 q
= (const unsigned char *)RSTRING_PTR(str
);
290 if (q
[0] == 0xFE && q
[1] == 0xFF) {
291 return rb_enc_get_from_index(ENCINDEX_UTF_16BE
);
293 if (q
[0] == 0xFF && q
[1] == 0xFE) {
294 return rb_enc_get_from_index(ENCINDEX_UTF_16LE
);
296 return rb_ascii8bit_encoding();
297 case ENCINDEX_UTF_32
:
298 if (RSTRING_LEN(str
) < 4) break;
299 q
= (const unsigned char *)RSTRING_PTR(str
);
300 if (q
[0] == 0 && q
[1] == 0 && q
[2] == 0xFE && q
[3] == 0xFF) {
301 return rb_enc_get_from_index(ENCINDEX_UTF_32BE
);
303 if (q
[3] == 0 && q
[2] == 0 && q
[1] == 0xFE && q
[0] == 0xFF) {
304 return rb_enc_get_from_index(ENCINDEX_UTF_32LE
);
306 return rb_ascii8bit_encoding();
308 return rb_enc_from_index(encidx
);
312 get_encoding(VALUE str
)
314 return get_actual_encoding(ENCODING_GET(str
), str
);
318 mustnot_broken(VALUE str
)
320 if (is_broken_string(str
)) {
321 rb_raise(rb_eArgError
, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str
)));
326 mustnot_wchar(VALUE str
)
328 rb_encoding
*enc
= STR_ENC_GET(str
);
329 if (rb_enc_mbminlen(enc
) > 1) {
330 rb_raise(rb_eArgError
, "wide char encoding: %s", rb_enc_name(enc
));
334 static int fstring_cmp(VALUE a
, VALUE b
);
336 static VALUE
register_fstring(VALUE str
, bool copy
);
338 const struct st_hash_type rb_fstring_hash_type
= {
343 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
345 struct fstr_update_arg
{
351 fstr_update_callback(st_data_t
*key
, st_data_t
*value
, st_data_t data
, int existing
)
354 struct fstr_update_arg
*arg
= (struct fstr_update_arg
*)data
;
355 VALUE str
= (VALUE
)*key
;
358 /* because of lazy sweep, str may be unmarked already and swept
361 if (rb_objspace_garbage_object_p(str
)) {
370 if (FL_TEST_RAW(str
, STR_FAKESTR
)) {
372 VALUE new_str
= str_new(rb_cString
, RSTRING(str
)->as
.heap
.ptr
, RSTRING(str
)->as
.heap
.len
);
373 rb_enc_copy(new_str
, str
);
377 str
= str_new_static(rb_cString
, RSTRING(str
)->as
.heap
.ptr
,
378 RSTRING(str
)->as
.heap
.len
,
384 if (!OBJ_FROZEN(str
))
385 str
= str_new_frozen(rb_cString
, str
);
386 if (STR_SHARED_P(str
)) { /* str should not be shared */
387 /* shared substring */
388 str_make_independent(str
);
389 assert(OBJ_FROZEN(str
));
391 if (!BARE_STRING_P(str
)) {
392 str
= str_new_frozen(rb_cString
, str
);
395 RBASIC(str
)->flags
|= RSTRING_FSTR
;
397 *key
= *value
= arg
->fstr
= str
;
404 rb_fstring(VALUE str
)
409 Check_Type(str
, T_STRING
);
411 if (FL_TEST(str
, RSTRING_FSTR
))
414 bare
= BARE_STRING_P(str
);
416 if (STR_EMBED_P(str
)) {
420 if (FL_TEST_RAW(str
, STR_NOEMBED
|STR_SHARED_ROOT
|STR_SHARED
) == (STR_NOEMBED
|STR_SHARED_ROOT
)) {
421 assert(OBJ_FROZEN(str
));
426 if (!OBJ_FROZEN(str
))
427 rb_str_resize(str
, RSTRING_LEN(str
));
429 fstr
= register_fstring(str
, FALSE
);
432 str_replace_shared_without_enc(str
, fstr
);
440 register_fstring(VALUE str
, bool copy
)
442 struct fstr_update_arg args
;
447 st_table
*frozen_strings
= rb_vm_fstring_table();
450 st_update(frozen_strings
, (st_data_t
)str
, fstr_update_callback
, (st_data_t
)&args
);
451 } while (args
.fstr
== Qundef
);
455 assert(OBJ_FROZEN(args
.fstr
));
456 assert(!FL_TEST_RAW(args
.fstr
, STR_FAKESTR
));
457 assert(!FL_TEST_RAW(args
.fstr
, FL_EXIVAR
));
458 assert(RBASIC_CLASS(args
.fstr
) == rb_cString
);
463 setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, int encidx
)
465 fake_str
->basic
.flags
= T_STRING
|RSTRING_NOEMBED
|STR_NOFREE
|STR_FAKESTR
;
466 /* SHARED to be allocated by the callback */
469 RUBY_ASSERT_ALWAYS(len
== 0);
473 ENCODING_SET_INLINED((VALUE
)fake_str
, encidx
);
475 RBASIC_SET_CLASS_RAW((VALUE
)fake_str
, rb_cString
);
476 fake_str
->as
.heap
.len
= len
;
477 fake_str
->as
.heap
.ptr
= (char *)name
;
478 fake_str
->as
.heap
.aux
.capa
= len
;
479 return (VALUE
)fake_str
;
483 * Set up a fake string that refers to a static string literal.
486 rb_setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, rb_encoding
*enc
)
488 return setup_fake_str(fake_str
, name
, len
, rb_enc_to_index(enc
));
492 * The rb_fstring_new and rb_fstring_cstr family creates or looks up a
493 * frozen shared string that refers to a static string literal. `ptr`
494 * must point to a constant string.
496 MJIT_FUNC_EXPORTED VALUE
497 rb_fstring_new(const char *ptr
, long len
)
499 struct RString fake_str
;
500 return register_fstring(setup_fake_str(&fake_str
, ptr
, len
, ENCINDEX_US_ASCII
), FALSE
);
504 rb_fstring_enc_new(const char *ptr
, long len
, rb_encoding
*enc
)
506 struct RString fake_str
;
507 return register_fstring(rb_setup_fake_str(&fake_str
, ptr
, len
, enc
), FALSE
);
511 rb_fstring_cstr(const char *ptr
)
513 return rb_fstring_new(ptr
, strlen(ptr
));
517 fstring_set_class_i(st_data_t key
, st_data_t val
, st_data_t arg
)
519 RBASIC_SET_CLASS((VALUE
)key
, (VALUE
)arg
);
524 fstring_cmp(VALUE a
, VALUE b
)
527 const char *aptr
, *bptr
;
528 RSTRING_GETMEM(a
, aptr
, alen
);
529 RSTRING_GETMEM(b
, bptr
, blen
);
530 return (alen
!= blen
||
531 ENCODING_GET(a
) != ENCODING_GET(b
) ||
532 memcmp(aptr
, bptr
, alen
) != 0);
536 single_byte_optimizable(VALUE str
)
540 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
541 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
)
544 enc
= STR_ENC_GET(str
);
545 if (rb_enc_mbmaxlen(enc
) == 1)
548 /* Conservative. Possibly single byte.
549 * "\xa1" in Shift_JIS for example. */
555 static inline const char *
556 search_nonascii(const char *p
, const char *e
)
558 const uintptr_t *s
, *t
;
560 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
561 # if SIZEOF_UINTPTR_T == 8
562 # define NONASCII_MASK UINT64_C(0x8080808080808080)
563 # elif SIZEOF_UINTPTR_T == 4
564 # define NONASCII_MASK UINT32_C(0x80808080)
566 # error "don't know what to do."
569 # if SIZEOF_UINTPTR_T == 8
570 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
571 # elif SIZEOF_UINTPTR_T == 4
572 # define NONASCII_MASK 0x80808080UL /* or...? */
574 # error "don't know what to do."
578 if (UNALIGNED_WORD_ACCESS
|| e
- p
>= SIZEOF_VOIDP
) {
579 #if !UNALIGNED_WORD_ACCESS
580 if ((uintptr_t)p
% SIZEOF_VOIDP
) {
581 int l
= SIZEOF_VOIDP
- (uintptr_t)p
% SIZEOF_VOIDP
;
584 default: UNREACHABLE
;
586 case 7: if (p
[-7]&0x80) return p
-7;
587 case 6: if (p
[-6]&0x80) return p
-6;
588 case 5: if (p
[-5]&0x80) return p
-5;
589 case 4: if (p
[-4]&0x80) return p
-4;
591 case 3: if (p
[-3]&0x80) return p
-3;
592 case 2: if (p
[-2]&0x80) return p
-2;
593 case 1: if (p
[-1]&0x80) return p
-1;
598 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
599 #define aligned_ptr(value) \
600 __builtin_assume_aligned((value), sizeof(uintptr_t))
602 #define aligned_ptr(value) (uintptr_t *)(value)
605 t
= (uintptr_t *)(e
- (SIZEOF_VOIDP
-1));
608 if (*s
& NONASCII_MASK
) {
609 #ifdef WORDS_BIGENDIAN
610 return (const char *)s
+ (nlz_intptr(*s
&NONASCII_MASK
)>>3);
612 return (const char *)s
+ (ntz_intptr(*s
&NONASCII_MASK
)>>3);
620 default: UNREACHABLE
;
622 case 7: if (e
[-7]&0x80) return e
-7;
623 case 6: if (e
[-6]&0x80) return e
-6;
624 case 5: if (e
[-5]&0x80) return e
-5;
625 case 4: if (e
[-4]&0x80) return e
-4;
627 case 3: if (e
[-3]&0x80) return e
-3;
628 case 2: if (e
[-2]&0x80) return e
-2;
629 case 1: if (e
[-1]&0x80) return e
-1;
635 coderange_scan(const char *p
, long len
, rb_encoding
*enc
)
637 const char *e
= p
+ len
;
639 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
640 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
641 p
= search_nonascii(p
, e
);
642 return p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
645 if (rb_enc_asciicompat(enc
)) {
646 p
= search_nonascii(p
, e
);
647 if (!p
) return ENC_CODERANGE_7BIT
;
649 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
650 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
651 p
+= MBCLEN_CHARFOUND_LEN(ret
);
653 p
= search_nonascii(p
, e
);
659 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
660 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
661 p
+= MBCLEN_CHARFOUND_LEN(ret
);
664 return ENC_CODERANGE_VALID
;
668 rb_str_coderange_scan_restartable(const char *s
, const char *e
, rb_encoding
*enc
, int *cr
)
672 if (*cr
== ENC_CODERANGE_BROKEN
)
675 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
676 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
677 if (*cr
== ENC_CODERANGE_VALID
) return e
- s
;
678 p
= search_nonascii(p
, e
);
679 *cr
= p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
682 else if (rb_enc_asciicompat(enc
)) {
683 p
= search_nonascii(p
, e
);
685 if (*cr
!= ENC_CODERANGE_VALID
) *cr
= ENC_CODERANGE_7BIT
;
689 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
690 if (!MBCLEN_CHARFOUND_P(ret
)) {
691 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
694 p
+= MBCLEN_CHARFOUND_LEN(ret
);
696 p
= search_nonascii(p
, e
);
702 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
703 if (!MBCLEN_CHARFOUND_P(ret
)) {
704 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
707 p
+= MBCLEN_CHARFOUND_LEN(ret
);
710 *cr
= ENC_CODERANGE_VALID
;
715 str_enc_copy(VALUE str1
, VALUE str2
)
717 rb_enc_set_index(str1
, ENCODING_GET(str2
));
721 rb_enc_cr_str_copy_for_substr(VALUE dest
, VALUE src
)
723 /* this function is designed for copying encoding and coderange
724 * from src to new string "dest" which is made from the part of src.
726 str_enc_copy(dest
, src
);
727 if (RSTRING_LEN(dest
) == 0) {
728 if (!rb_enc_asciicompat(STR_ENC_GET(src
)))
729 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
731 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
734 switch (ENC_CODERANGE(src
)) {
735 case ENC_CODERANGE_7BIT
:
736 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
738 case ENC_CODERANGE_VALID
:
739 if (!rb_enc_asciicompat(STR_ENC_GET(src
)) ||
740 search_nonascii(RSTRING_PTR(dest
), RSTRING_END(dest
)))
741 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
743 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
751 rb_enc_cr_str_exact_copy(VALUE dest
, VALUE src
)
753 str_enc_copy(dest
, src
);
754 ENC_CODERANGE_SET(dest
, ENC_CODERANGE(src
));
758 enc_coderange_scan(VALUE str
, rb_encoding
*enc
, int encidx
)
760 if (rb_enc_mbminlen(enc
) > 1 && rb_enc_dummy_p(enc
) &&
761 rb_enc_mbminlen(enc
= get_actual_encoding(encidx
, str
)) == 1) {
762 return ENC_CODERANGE_BROKEN
;
765 return coderange_scan(RSTRING_PTR(str
), RSTRING_LEN(str
), enc
);
770 rb_enc_str_coderange_scan(VALUE str
, rb_encoding
*enc
)
772 return enc_coderange_scan(str
, enc
, rb_enc_to_index(enc
));
776 rb_enc_str_coderange(VALUE str
)
778 int cr
= ENC_CODERANGE(str
);
780 if (cr
== ENC_CODERANGE_UNKNOWN
) {
781 int encidx
= ENCODING_GET(str
);
782 rb_encoding
*enc
= rb_enc_from_index(encidx
);
783 cr
= enc_coderange_scan(str
, enc
, encidx
);
784 ENC_CODERANGE_SET(str
, cr
);
790 rb_enc_str_asciionly_p(VALUE str
)
792 rb_encoding
*enc
= STR_ENC_GET(str
);
794 if (!rb_enc_asciicompat(enc
))
796 else if (rb_enc_str_coderange(str
) == ENC_CODERANGE_7BIT
)
802 str_mod_check(VALUE s
, const char *p
, long len
)
804 if (RSTRING_PTR(s
) != p
|| RSTRING_LEN(s
) != len
){
805 rb_raise(rb_eRuntimeError
, "string modified");
810 str_capacity(VALUE str
, const int termlen
)
812 if (STR_EMBED_P(str
)) {
814 return str_embed_capa(str
) - termlen
;
816 return (RSTRING_EMBED_LEN_MAX
+ 1 - termlen
);
819 else if (FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
820 return RSTRING(str
)->as
.heap
.len
;
823 return RSTRING(str
)->as
.heap
.aux
.capa
;
828 rb_str_capacity(VALUE str
)
830 return str_capacity(str
, TERM_LEN(str
));
834 must_not_null(const char *ptr
)
837 rb_raise(rb_eArgError
, "NULL pointer given");
842 str_alloc(VALUE klass
, size_t size
)
845 RVARGC_NEWOBJ_OF(str
, struct RString
, klass
,
846 T_STRING
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), size
);
851 str_alloc_embed(VALUE klass
, size_t capa
)
853 size_t size
= str_embed_size(capa
);
854 assert(rb_gc_size_allocatable_p(size
));
856 assert(size
<= sizeof(struct RString
));
858 return str_alloc(klass
, size
);
862 str_alloc_heap(VALUE klass
)
864 return str_alloc(klass
, sizeof(struct RString
));
868 empty_str_alloc(VALUE klass
)
870 RUBY_DTRACE_CREATE_HOOK(STRING
, 0);
871 VALUE str
= str_alloc_embed(klass
, 0);
872 memset(RSTRING(str
)->as
.embed
.ary
, 0, str_embed_capa(str
));
877 str_new0(VALUE klass
, const char *ptr
, long len
, int termlen
)
882 rb_raise(rb_eArgError
, "negative string size (or size too big)");
885 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
887 if (STR_EMBEDDABLE_P(len
, termlen
)) {
888 str
= str_alloc_embed(klass
, len
+ termlen
);
890 ENC_CODERANGE_SET(str
, ENC_CODERANGE_7BIT
);
894 str
= str_alloc_heap(klass
);
895 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
896 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
897 * integer overflow. If we can STATIC_ASSERT that, the following
898 * mul_add_mul can be reverted to a simple ALLOC_N. */
899 RSTRING(str
)->as
.heap
.ptr
=
900 rb_xmalloc_mul_add_mul(sizeof(char), len
, sizeof(char), termlen
);
901 STR_SET_NOEMBED(str
);
904 memcpy(RSTRING_PTR(str
), ptr
, len
);
906 STR_SET_LEN(str
, len
);
907 TERM_FILL(RSTRING_PTR(str
) + len
, termlen
);
912 str_new(VALUE klass
, const char *ptr
, long len
)
914 return str_new0(klass
, ptr
, len
, 1);
918 rb_str_new(const char *ptr
, long len
)
920 return str_new(rb_cString
, ptr
, len
);
924 rb_usascii_str_new(const char *ptr
, long len
)
926 VALUE str
= rb_str_new(ptr
, len
);
927 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
932 rb_utf8_str_new(const char *ptr
, long len
)
934 VALUE str
= str_new(rb_cString
, ptr
, len
);
935 rb_enc_associate_index(str
, rb_utf8_encindex());
940 rb_enc_str_new(const char *ptr
, long len
, rb_encoding
*enc
)
944 if (!enc
) return rb_str_new(ptr
, len
);
946 str
= str_new0(rb_cString
, ptr
, len
, rb_enc_mbminlen(enc
));
947 rb_enc_associate(str
, enc
);
952 rb_str_new_cstr(const char *ptr
)
955 /* rb_str_new_cstr() can take pointer from non-malloc-generated
956 * memory regions, and that cannot be detected by the MSAN. Just
957 * trust the programmer that the argument passed here is a sane C
959 __msan_unpoison_string(ptr
);
960 return rb_str_new(ptr
, strlen(ptr
));
964 rb_usascii_str_new_cstr(const char *ptr
)
966 VALUE str
= rb_str_new_cstr(ptr
);
967 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
972 rb_utf8_str_new_cstr(const char *ptr
)
974 VALUE str
= rb_str_new_cstr(ptr
);
975 rb_enc_associate_index(str
, rb_utf8_encindex());
980 rb_enc_str_new_cstr(const char *ptr
, rb_encoding
*enc
)
983 if (rb_enc_mbminlen(enc
) != 1) {
984 rb_raise(rb_eArgError
, "wchar encoding given");
986 return rb_enc_str_new(ptr
, strlen(ptr
), enc
);
990 str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
)
995 rb_raise(rb_eArgError
, "negative string size (or size too big)");
999 rb_encoding
*enc
= rb_enc_get_from_index(encindex
);
1000 str
= str_new0(klass
, ptr
, len
, rb_enc_mbminlen(enc
));
1003 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
1004 str
= str_alloc_heap(klass
);
1005 RSTRING(str
)->as
.heap
.len
= len
;
1006 RSTRING(str
)->as
.heap
.ptr
= (char *)ptr
;
1007 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
1008 STR_SET_NOEMBED(str
);
1009 RBASIC(str
)->flags
|= STR_NOFREE
;
1011 rb_enc_associate_index(str
, encindex
);
1016 rb_str_new_static(const char *ptr
, long len
)
1018 return str_new_static(rb_cString
, ptr
, len
, 0);
1022 rb_usascii_str_new_static(const char *ptr
, long len
)
1024 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_US_ASCII
);
1028 rb_utf8_str_new_static(const char *ptr
, long len
)
1030 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_UTF_8
);
1034 rb_enc_str_new_static(const char *ptr
, long len
, rb_encoding
*enc
)
1036 return str_new_static(rb_cString
, ptr
, len
, rb_enc_to_index(enc
));
1040 rb_tainted_str_new(const char *ptr
, long len
)
1042 rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new", NULL
);
1043 return rb_str_new(ptr
, len
);
1047 rb_tainted_str_new_cstr(const char *ptr
)
1049 rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new_cstr", NULL
);
1050 return rb_str_new_cstr(ptr
);
1053 static VALUE
str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1054 rb_encoding
*from
, rb_encoding
*to
,
1055 int ecflags
, VALUE ecopts
);
1058 is_enc_ascii_string(VALUE str
, rb_encoding
*enc
)
1060 int encidx
= rb_enc_to_index(enc
);
1061 if (rb_enc_get_index(str
) == encidx
)
1062 return is_ascii_string(str
);
1063 return enc_coderange_scan(str
, enc
, encidx
) == ENC_CODERANGE_7BIT
;
1067 rb_str_conv_enc_opts(VALUE str
, rb_encoding
*from
, rb_encoding
*to
, int ecflags
, VALUE ecopts
)
1073 if (!to
) return str
;
1074 if (!from
) from
= rb_enc_get(str
);
1075 if (from
== to
) return str
;
1076 if ((rb_enc_asciicompat(to
) && is_enc_ascii_string(str
, from
)) ||
1077 to
== rb_ascii8bit_encoding()) {
1078 if (STR_ENC_GET(str
) != to
) {
1079 str
= rb_str_dup(str
);
1080 rb_enc_associate(str
, to
);
1085 RSTRING_GETMEM(str
, ptr
, len
);
1086 newstr
= str_cat_conv_enc_opts(rb_str_buf_new(len
), 0, ptr
, len
,
1087 from
, to
, ecflags
, ecopts
);
1088 if (NIL_P(newstr
)) {
1089 /* some error, return original */
1096 rb_str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1097 rb_encoding
*from
, int ecflags
, VALUE ecopts
)
1101 olen
= RSTRING_LEN(newstr
);
1102 if (ofs
< -olen
|| olen
< ofs
)
1103 rb_raise(rb_eIndexError
, "index %ld out of string", ofs
);
1104 if (ofs
< 0) ofs
+= olen
;
1106 STR_SET_LEN(newstr
, ofs
);
1107 return rb_str_cat(newstr
, ptr
, len
);
1110 rb_str_modify(newstr
);
1111 return str_cat_conv_enc_opts(newstr
, ofs
, ptr
, len
, from
,
1117 rb_str_initialize(VALUE str
, const char *ptr
, long len
, rb_encoding
*enc
)
1119 STR_SET_LEN(str
, 0);
1120 rb_enc_associate(str
, enc
);
1121 rb_str_cat(str
, ptr
, len
);
1126 str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1127 rb_encoding
*from
, rb_encoding
*to
,
1128 int ecflags
, VALUE ecopts
)
1131 rb_econv_result_t ret
;
1133 VALUE econv_wrapper
;
1134 const unsigned char *start
, *sp
;
1135 unsigned char *dest
, *dp
;
1136 size_t converted_output
= (size_t)ofs
;
1138 olen
= rb_str_capacity(newstr
);
1140 econv_wrapper
= rb_obj_alloc(rb_cEncodingConverter
);
1141 RBASIC_CLEAR_CLASS(econv_wrapper
);
1142 ec
= rb_econv_open_opts(from
->name
, to
->name
, ecflags
, ecopts
);
1143 if (!ec
) return Qnil
;
1144 DATA_PTR(econv_wrapper
) = ec
;
1146 sp
= (unsigned char*)ptr
;
1148 while ((dest
= (unsigned char*)RSTRING_PTR(newstr
)),
1149 (dp
= dest
+ converted_output
),
1150 (ret
= rb_econv_convert(ec
, &sp
, start
+ len
, &dp
, dest
+ olen
, 0)),
1151 ret
== econv_destination_buffer_full
) {
1152 /* destination buffer short */
1153 size_t converted_input
= sp
- start
;
1154 size_t rest
= len
- converted_input
;
1155 converted_output
= dp
- dest
;
1156 rb_str_set_len(newstr
, converted_output
);
1157 if (converted_input
&& converted_output
&&
1158 rest
< (LONG_MAX
/ converted_output
)) {
1159 rest
= (rest
* converted_output
) / converted_input
;
1164 olen
+= rest
< 2 ? 2 : rest
;
1165 rb_str_resize(newstr
, olen
);
1167 DATA_PTR(econv_wrapper
) = 0;
1170 case econv_finished
:
1171 len
= dp
- (unsigned char*)RSTRING_PTR(newstr
);
1172 rb_str_set_len(newstr
, len
);
1173 rb_enc_associate(newstr
, to
);
1182 rb_str_conv_enc(VALUE str
, rb_encoding
*from
, rb_encoding
*to
)
1184 return rb_str_conv_enc_opts(str
, from
, to
, 0, Qnil
);
1188 rb_external_str_new_with_enc(const char *ptr
, long len
, rb_encoding
*eenc
)
1192 const int eidx
= rb_enc_to_index(eenc
);
1195 return rb_enc_str_new(ptr
, len
, eenc
);
1198 /* ASCII-8BIT case, no conversion */
1199 if ((eidx
== rb_ascii8bit_encindex()) ||
1200 (eidx
== rb_usascii_encindex() && search_nonascii(ptr
, ptr
+ len
))) {
1201 return rb_str_new(ptr
, len
);
1203 /* no default_internal or same encoding, no conversion */
1204 ienc
= rb_default_internal_encoding();
1205 if (!ienc
|| eenc
== ienc
) {
1206 return rb_enc_str_new(ptr
, len
, eenc
);
1208 /* ASCII compatible, and ASCII only string, no conversion in
1209 * default_internal */
1210 if ((eidx
== rb_ascii8bit_encindex()) ||
1211 (eidx
== rb_usascii_encindex()) ||
1212 (rb_enc_asciicompat(eenc
) && !search_nonascii(ptr
, ptr
+ len
))) {
1213 return rb_enc_str_new(ptr
, len
, ienc
);
1215 /* convert from the given encoding to default_internal */
1216 str
= rb_enc_str_new(NULL
, 0, ienc
);
1217 /* when the conversion failed for some reason, just ignore the
1218 * default_internal and result in the given encoding as-is. */
1219 if (NIL_P(rb_str_cat_conv_enc_opts(str
, 0, ptr
, len
, eenc
, 0, Qnil
))) {
1220 rb_str_initialize(str
, ptr
, len
, eenc
);
1226 rb_external_str_with_enc(VALUE str
, rb_encoding
*eenc
)
1228 int eidx
= rb_enc_to_index(eenc
);
1229 if (eidx
== rb_usascii_encindex() &&
1230 rb_enc_str_coderange(str
) != ENC_CODERANGE_7BIT
) {
1231 rb_enc_associate_index(str
, rb_ascii8bit_encindex());
1234 rb_enc_associate_index(str
, eidx
);
1235 return rb_str_conv_enc(str
, eenc
, rb_default_internal_encoding());
1239 rb_external_str_new(const char *ptr
, long len
)
1241 return rb_external_str_new_with_enc(ptr
, len
, rb_default_external_encoding());
1245 rb_external_str_new_cstr(const char *ptr
)
1247 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_default_external_encoding());
1251 rb_locale_str_new(const char *ptr
, long len
)
1253 return rb_external_str_new_with_enc(ptr
, len
, rb_locale_encoding());
1257 rb_locale_str_new_cstr(const char *ptr
)
1259 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_locale_encoding());
1263 rb_filesystem_str_new(const char *ptr
, long len
)
1265 return rb_external_str_new_with_enc(ptr
, len
, rb_filesystem_encoding());
1269 rb_filesystem_str_new_cstr(const char *ptr
)
1271 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_filesystem_encoding());
1275 rb_str_export(VALUE str
)
1277 return rb_str_export_to_enc(str
, rb_default_external_encoding());
1281 rb_str_export_locale(VALUE str
)
1283 return rb_str_export_to_enc(str
, rb_locale_encoding());
1287 rb_str_export_to_enc(VALUE str
, rb_encoding
*enc
)
1289 return rb_str_conv_enc(str
, STR_ENC_GET(str
), enc
);
1293 str_replace_shared_without_enc(VALUE str2
, VALUE str
)
1295 const int termlen
= TERM_LEN(str
);
1299 RSTRING_GETMEM(str
, ptr
, len
);
1300 if (str_embed_capa(str2
) >= len
+ termlen
) {
1301 char *ptr2
= RSTRING(str2
)->as
.embed
.ary
;
1302 STR_SET_EMBED(str2
);
1303 memcpy(ptr2
, RSTRING_PTR(str
), len
);
1304 STR_SET_EMBED_LEN(str2
, len
);
1305 TERM_FILL(ptr2
+len
, termlen
);
1309 if (STR_SHARED_P(str
)) {
1310 root
= RSTRING(str
)->as
.heap
.aux
.shared
;
1311 RSTRING_GETMEM(str
, ptr
, len
);
1314 root
= rb_str_new_frozen(str
);
1315 RSTRING_GETMEM(root
, ptr
, len
);
1317 assert(OBJ_FROZEN(root
));
1318 if (!STR_EMBED_P(str2
) && !FL_TEST_RAW(str2
, STR_SHARED
|STR_NOFREE
)) {
1319 if (FL_TEST_RAW(str2
, STR_SHARED_ROOT
)) {
1320 rb_fatal("about to free a possible shared root");
1322 char *ptr2
= STR_HEAP_PTR(str2
);
1324 ruby_sized_xfree(ptr2
, STR_HEAP_SIZE(str2
));
1327 FL_SET(str2
, STR_NOEMBED
);
1328 RSTRING(str2
)->as
.heap
.len
= len
;
1329 RSTRING(str2
)->as
.heap
.ptr
= ptr
;
1330 STR_SET_SHARED(str2
, root
);
1336 str_replace_shared(VALUE str2
, VALUE str
)
1338 str_replace_shared_without_enc(str2
, str
);
1339 rb_enc_cr_str_exact_copy(str2
, str
);
1344 str_new_shared(VALUE klass
, VALUE str
)
1346 return str_replace_shared(str_alloc_heap(klass
), str
);
1350 rb_str_new_shared(VALUE str
)
1352 return str_new_shared(rb_obj_class(str
), str
);
1356 rb_str_new_frozen(VALUE orig
)
1358 if (OBJ_FROZEN(orig
)) return orig
;
1359 return str_new_frozen(rb_obj_class(orig
), orig
);
1363 rb_str_new_frozen_String(VALUE orig
)
1365 if (OBJ_FROZEN(orig
) && rb_obj_class(orig
) == rb_cString
) return orig
;
1366 return str_new_frozen(rb_cString
, orig
);
1370 rb_str_tmp_frozen_acquire(VALUE orig
)
1372 if (OBJ_FROZEN_RAW(orig
)) return orig
;
1373 return str_new_frozen_buffer(0, orig
, FALSE
);
/*
 * Releases tmp, a temporary obtained for orig.  In the visible paths: an
 * embedded tmp is only asserted to be frozen; when orig is shared (and is
 * neither temp-locked nor frozen), its shared root is tmp, and tmp is not
 * STR_BORROWED, orig takes the buffer back (STR_SHARED cleared, capa and
 * STR_NOFREE inherited from tmp) and tmp is emptied.
 */
1377 rb_str_tmp_frozen_release(VALUE orig
, VALUE tmp
)
/* NOTE(review): the statement guarded by this class test is not visible in
 * this excerpt; a tmp that has a class is presumably left untouched. */
1379 if (RBASIC_CLASS(tmp
) != 0)
1382 if (STR_EMBED_P(tmp
)) {
1383 assert(OBJ_FROZEN_RAW(tmp
));
1385 else if (FL_TEST_RAW(orig
, STR_SHARED
) &&
1386 !FL_TEST_RAW(orig
, STR_TMPLOCK
|RUBY_FL_FREEZE
)) {
1387 VALUE shared
= RSTRING(orig
)->as
.heap
.aux
.shared
;
1389 if (shared
== tmp
&& !FL_TEST_RAW(tmp
, STR_BORROWED
)) {
/* orig and tmp must still describe the very same buffer. */
1390 assert(RSTRING(orig
)->as
.heap
.ptr
== RSTRING(tmp
)->as
.heap
.ptr
);
1391 assert(RSTRING(orig
)->as
.heap
.len
== RSTRING(tmp
)->as
.heap
.len
);
1393 /* Unshare orig since the root (tmp) only has this one child. */
1394 FL_UNSET_RAW(orig
, STR_SHARED
);
1395 RSTRING(orig
)->as
.heap
.aux
.capa
= RSTRING(tmp
)->as
.heap
.aux
.capa
;
1396 RBASIC(orig
)->flags
|= RBASIC(tmp
)->flags
& STR_NOFREE
;
1397 assert(OBJ_FROZEN_RAW(tmp
));
1399 /* Make tmp embedded and empty so it is safe for sweeping. */
1401 STR_SET_EMBED_LEN(tmp
, 0);
/* Shorthand for str_new_frozen_buffer(klass, orig, TRUE), i.e. with
 * copy_encoding enabled. */
1407 str_new_frozen(VALUE klass
, VALUE orig
)
1409 return str_new_frozen_buffer(klass
, orig
, TRUE
);
/*
 * Creates a new heap string `str` of class klass that takes over orig's
 * buffer (len, ptr, capa and the STR_NOFREE flag), then makes orig a shared
 * string whose root is `str`.  orig must be neither embedded nor already
 * shared (asserted).
 */
1413 heap_str_make_shared(VALUE klass
, VALUE orig
)
1415 assert(!STR_EMBED_P(orig
));
1416 assert(!STR_SHARED_P(orig
));
1418 VALUE str
= str_alloc_heap(klass
);
1419 STR_SET_NOEMBED(str
);
1420 RSTRING(str
)->as
.heap
.len
= RSTRING_LEN(orig
);
1421 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(orig
);
1422 RSTRING(str
)->as
.heap
.aux
.capa
= RSTRING(orig
)->as
.heap
.aux
.capa
;
/* The no-free property moves from orig to the new root. */
1423 RBASIC(str
)->flags
|= RBASIC(orig
)->flags
& STR_NOFREE
;
1424 RBASIC(orig
)->flags
&= ~STR_NOFREE
;
1425 STR_SET_SHARED(orig
, str
);
1427 FL_UNSET_RAW(str
, STR_BORROWED
);
/*
 * Builds the string that backs a frozen version of orig.  Visible cases:
 *  - orig is embedded, or len is embeddable with a 1-byte terminator:
 *    a fresh copy is made with str_new() (asserted to be embedded);
 *  - orig is shared: when orig covers only part of the root (ofs/rest > 0)
 *    or class/encoding differ, a new shared view of the root is created and
 *    trimmed to orig's range; otherwise a class-less root gets STR_BORROWED;
 *  - orig is embeddable with its own terminator length: bytes are copied
 *    into a new embedded string;
 *  - otherwise heap_str_make_shared() is used.
 * With copy_encoding set, encoding/coderange are copied from orig.
 * NOTE(review): closing braces, returns and any final freeze are not visible
 * in this excerpt, so the exact nesting above should be confirmed.
 */
1432 str_new_frozen_buffer(VALUE klass
, VALUE orig
, int copy_encoding
)
1436 long len
= RSTRING_LEN(orig
);
1438 if (STR_EMBED_P(orig
) || STR_EMBEDDABLE_P(len
, 1)) {
1439 str
= str_new(klass
, RSTRING_PTR(orig
), len
);
1440 assert(STR_EMBED_P(str
));
1443 if (FL_TEST_RAW(orig
, STR_SHARED
)) {
1444 VALUE shared
= RSTRING(orig
)->as
.heap
.aux
.shared
;
/* ofs: bytes of the root before orig's start; rest: bytes after its end. */
1445 long ofs
= RSTRING(orig
)->as
.heap
.ptr
- RSTRING_PTR(shared
);
1446 long rest
= RSTRING_LEN(shared
) - ofs
- RSTRING(orig
)->as
.heap
.len
;
1449 assert(ofs
+ rest
<= RSTRING_LEN(shared
));
1451 assert(!STR_EMBED_P(shared
));
1453 assert(OBJ_FROZEN(shared
));
1455 if ((ofs
> 0) || (rest
> 0) ||
1456 (klass
!= RBASIC(shared
)->klass
) ||
1457 ENCODING_GET(shared
) != ENCODING_GET(orig
)) {
1458 str
= str_new_shared(klass
, shared
);
1459 assert(!STR_EMBED_P(str
));
1460 RSTRING(str
)->as
.heap
.ptr
+= ofs
;
1461 RSTRING(str
)->as
.heap
.len
-= ofs
+ rest
;
1464 if (RBASIC_CLASS(shared
) == 0)
1465 FL_SET_RAW(shared
, STR_BORROWED
);
1469 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig
), TERM_LEN(orig
))) {
1470 str
= str_alloc_embed(klass
, RSTRING_LEN(orig
) + TERM_LEN(orig
));
1472 memcpy(RSTRING_PTR(str
), RSTRING_PTR(orig
), RSTRING_LEN(orig
));
1473 STR_SET_EMBED_LEN(str
, RSTRING_LEN(orig
));
1474 TERM_FILL(RSTRING_END(str
), TERM_LEN(orig
));
1477 str
= heap_str_make_shared(klass
, orig
);
1481 if (copy_encoding
) rb_enc_cr_str_exact_copy(str
, orig
);
/*
 * Creates a string from ptr/len through str_new0(), using obj's class and
 * obj's terminator length (TERM_LEN(obj)).
 */
1487 rb_str_new_with_class(VALUE obj
, const char *ptr
, long len
)
1489 return str_new0(rb_obj_class(obj
), ptr
, len
, TERM_LEN(obj
));
/* Creates an empty string with rb_str_new(0, 0) and copies str's encoding
 * onto it with rb_enc_copy(). */
1493 str_new_empty_String(VALUE str
)
1495 VALUE v
= rb_str_new(0, 0);
1496 rb_enc_copy(v
, str
);
/* Lower bound applied to requested buffer capacities in this file; it is
 * statically asserted to be larger than RSTRING_EMBED_LEN_MAX. */
1500 #define STR_BUF_MIN_SIZE 63
1502 STATIC_ASSERT(STR_BUF_MIN_SIZE
, STR_BUF_MIN_SIZE
> RSTRING_EMBED_LEN_MAX
);
/*
 * Creates an rb_cString with room for capa bytes.  Embeddable capacities get
 * an embedded string (capa + 1 bytes, the extra byte for the terminator);
 * otherwise a heap buffer of capa + 1 bytes is allocated and its first byte
 * is set to NUL.  Small capacities are raised to STR_BUF_MIN_SIZE on the heap
 * path (NOTE(review): any preprocessor guard around that clamp is not
 * visible in this excerpt).
 */
1506 rb_str_buf_new(long capa
)
1508 if (STR_EMBEDDABLE_P(capa
, 1)) {
1509 return str_alloc_embed(rb_cString
, capa
+ 1);
1512 VALUE str
= str_alloc_heap(rb_cString
);
1515 if (capa
< STR_BUF_MIN_SIZE
) {
1516 capa
= STR_BUF_MIN_SIZE
;
1519 FL_SET(str
, STR_NOEMBED
);
1520 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
1521 RSTRING(str
)->as
.heap
.ptr
= ALLOC_N(char, (size_t)capa
+ 1);
1522 RSTRING(str
)->as
.heap
.ptr
[0] = '\0';
/*
 * Creates a buffer string sized for the NUL-terminated C string ptr
 * (strlen(ptr)) and appends ptr's bytes to it with rb_str_buf_cat().
 */
1528 rb_str_buf_new_cstr(const char *ptr
)
1531 long len
= strlen(ptr
);
1533 str
= rb_str_buf_new(len
);
1534 rb_str_buf_cat(str
, ptr
, len
);
/* Creates a string of len bytes through str_new() with klass 0 and a null
 * source pointer. */
1540 rb_str_tmp_new(long len
)
1542 return str_new(0, 0, len
);
/*
 * Releases what str owns.  A string flagged RSTRING_FSTR is first removed
 * from the VM fstring table.  The heap buffer is freed only when str is
 * neither embedded nor STR_SHARED/STR_NOFREE.  A debug counter is bumped on
 * every path.
 */
1546 rb_str_free(VALUE str
)
1548 if (FL_TEST(str
, RSTRING_FSTR
)) {
1549 st_data_t fstr
= (st_data_t
)str
;
1553 st_delete(rb_vm_fstring_table(), &fstr
, NULL
);
1554 RB_DEBUG_COUNTER_INC(obj_str_fstr
);
1559 if (STR_EMBED_P(str
)) {
1560 RB_DEBUG_COUNTER_INC(obj_str_embed
);
1562 else if (FL_TEST(str
, STR_SHARED
| STR_NOFREE
)) {
1563 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared
, FL_TEST(str
, STR_SHARED
));
/* NOTE(review): STR_NOFREE strings are also counted under obj_str_shared;
 * confirm this is intended rather than a missing dedicated counter. */
1564 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared
, FL_TEST(str
, STR_NOFREE
));
1567 RB_DEBUG_COUNTER_INC(obj_str_ptr
);
1568 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
/*
 * Reports the heap memory attributed to str: STR_HEAP_SIZE(str) when str
 * owns its heap buffer (STR_NOEMBED set, STR_SHARED and STR_NOFREE clear).
 * NOTE(review): the value returned in the other cases is not visible here.
 */
1572 RUBY_FUNC_EXPORTED
size_t
1573 rb_str_memsize(VALUE str
)
1575 if (FL_TEST(str
, STR_NOEMBED
|STR_SHARED
|STR_NOFREE
) == STR_NOEMBED
) {
1576 return STR_HEAP_SIZE(str
);
/* Converts str to a T_STRING through rb_convert_type_with_id(), using the
 * to_str method id (idTo_str). */
1584 rb_str_to_str(VALUE str
)
1586 return rb_convert_type_with_id(str
, T_STRING
, "String", idTo_str
);
/* Forward declarations of helpers defined further down in this file. */
1589 static inline void str_discard(VALUE str
);
1590 static void str_shared_replace(VALUE str
, VALUE str2
);
/* Wrapper around str_shared_replace() that does nothing when str and str2
 * are the same object. */
1593 rb_str_shared_replace(VALUE str
, VALUE str2
)
1595 if (str
!= str2
) str_shared_replace(str
, str2
);
/*
 * Transfers str2's contents, encoding and coderange into str.  str and str2
 * must be distinct (asserted).
 *  - If str's embedded capacity can hold str2's bytes plus the terminator,
 *    the bytes are copied inline.
 *  - Otherwise an embedded str2 is first converted to a heap string; str then
 *    adopts str2's pointer/length and either its shared root or its capa, and
 *    str2 is reset to an empty embedded string.
 * NOTE(review): braces/else lines and some statements are not visible in this
 * excerpt; confirm the branch structure described above.
 */
1599 str_shared_replace(VALUE str
, VALUE str2
)
1605 RUBY_ASSERT(str2
!= str
);
/* Capture str2's encoding state before anything is overwritten. */
1606 enc
= STR_ENC_GET(str2
);
1607 cr
= ENC_CODERANGE(str2
);
1609 termlen
= rb_enc_mbminlen(enc
);
1611 if (str_embed_capa(str
) >= RSTRING_LEN(str2
) + termlen
) {
1613 memcpy(RSTRING_PTR(str
), RSTRING_PTR(str2
), (size_t)RSTRING_LEN(str2
) + termlen
);
1614 STR_SET_EMBED_LEN(str
, RSTRING_LEN(str2
));
1615 rb_enc_associate(str
, enc
);
1616 ENC_CODERANGE_SET(str
, cr
);
1620 if (STR_EMBED_P(str2
)) {
1621 assert(!FL_TEST(str2
, STR_SHARED
));
1622 long len
= RSTRING(str2
)->as
.embed
.len
;
1623 assert(len
+ termlen
<= str_embed_capa(str2
));
/* Move str2's embedded bytes into a heap buffer so str can adopt it. */
1625 char *new_ptr
= ALLOC_N(char, len
+ termlen
);
1626 memcpy(new_ptr
, RSTRING(str2
)->as
.embed
.ary
, len
+ termlen
);
1627 RSTRING(str2
)->as
.heap
.ptr
= new_ptr
;
1628 RSTRING(str2
)->as
.heap
.len
= len
;
1629 RSTRING(str2
)->as
.heap
.aux
.capa
= len
;
1630 STR_SET_NOEMBED(str2
);
1634 STR_SET_NOEMBED(str
);
1635 FL_UNSET(str
, STR_SHARED
);
1636 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(str2
);
1637 RSTRING(str
)->as
.heap
.len
= RSTRING_LEN(str2
);
1639 if (FL_TEST(str2
, STR_SHARED
)) {
1640 VALUE shared
= RSTRING(str2
)->as
.heap
.aux
.shared
;
1641 STR_SET_SHARED(str
, shared
);
1644 RSTRING(str
)->as
.heap
.aux
.capa
= RSTRING(str2
)->as
.heap
.aux
.capa
;
/* str2 no longer owns the buffer: make it an empty embedded string. */
1648 STR_SET_EMBED(str2
);
1649 RSTRING_PTR(str2
)[0] = 0;
1650 STR_SET_EMBED_LEN(str2
, 0);
1651 rb_enc_associate(str
, enc
);
1652 ENC_CODERANGE_SET(str
, cr
);
/*
 * Returns a String for obj.  A T_STRING takes a fast path (its statement is
 * not visible in this excerpt -- presumably obj itself is returned; confirm).
 * Anything else is converted by calling to_s (idTo_s) and the result is
 * checked by rb_obj_as_string_result().
 */
1657 rb_obj_as_string(VALUE obj
)
1661 if (RB_TYPE_P(obj
, T_STRING
)) {
1664 str
= rb_funcall(obj
, idTo_s
, 0);
1665 return rb_obj_as_string_result(str
, obj
);
/*
 * Checks the value produced by obj.to_s: when str is not a T_STRING, the
 * generic representation rb_any_to_s(obj) is returned instead.
 * NOTE(review): the return for a valid String is not visible here.
 */
1668 MJIT_FUNC_EXPORTED VALUE
1669 rb_obj_as_string_result(VALUE str
, VALUE obj
)
1671 if (!RB_TYPE_P(str
, T_STRING
))
1672 return rb_any_to_s(obj
);
/*
 * Replaces str's contents with str2's.  When str2 is itself shared, str is
 * pointed at the same bytes (same ptr/len) under the same shared root, which
 * is asserted to be frozen, and encoding/coderange are copied.  Otherwise
 * str_replace_shared() is used.
 */
1677 str_replace(VALUE str
, VALUE str2
)
1681 len
= RSTRING_LEN(str2
);
1682 if (STR_SHARED_P(str2
)) {
1683 VALUE shared
= RSTRING(str2
)->as
.heap
.aux
.shared
;
1684 assert(OBJ_FROZEN(shared
));
1685 STR_SET_NOEMBED(str
);
1686 RSTRING(str
)->as
.heap
.len
= len
;
1687 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(str2
);
1688 STR_SET_SHARED(str
, shared
);
1689 rb_enc_cr_str_exact_copy(str
, str2
);
1692 str_replace_shared(str
, str2
);
/*
 * Allocates a T_STRING object of class klass with the given slot size using
 * the execution context ec (RB_RVARGC_EC_NEWOBJ_OF).  FL_WB_PROTECTED is
 * added to the flags when RGENGC_WB_PROTECTED_STRING is non-zero.
 */
1699 ec_str_alloc(struct rb_execution_context_struct
*ec
, VALUE klass
, size_t size
)
1702 RB_RVARGC_EC_NEWOBJ_OF(ec
, str
, struct RString
, klass
,
1703 T_STRING
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), size
);
/*
 * Allocates, through ec, a string object sized by str_embed_size(capa).
 * NOTE(review): the two size assertions below look like alternatives chosen
 * by a preprocessor guard that is not visible in this excerpt.
 */
1708 ec_str_alloc_embed(struct rb_execution_context_struct
*ec
, VALUE klass
, size_t capa
)
1710 size_t size
= str_embed_size(capa
);
1711 assert(rb_gc_size_allocatable_p(size
));
1713 assert(size
<= sizeof(struct RString
));
1715 return ec_str_alloc(ec
, klass
, size
);
/* Allocates, through ec, a string object of exactly sizeof(struct RString). */
1719 ec_str_alloc_heap(struct rb_execution_context_struct
*ec
, VALUE klass
)
1721 return ec_str_alloc(ec
, klass
, sizeof(struct RString
));
/*
 * Fills the already-allocated dup so that it duplicates str.
 *  - Embedded str: length and bytes (len + 1) are copied into dup.
 *  - Otherwise a root is determined: str's shared root if it has one, or --
 *    when str is not frozen -- a new frozen string from str_new_frozen(),
 *    which then also replaces str.  An embedded root is copied byte-wise;
 *    a heap root is referenced (dup gets str's ptr/len and a write-barriered
 *    pointer to root, plus RSTRING_NOEMBED|STR_SHARED).
 * The flags selected by flag_mask are transferred to dup without FL_FREEZE.
 * When the inline encoding index is saturated (ENCODING_INLINE_MAX), the
 * real index is looked up and associated explicitly.
 * NOTE(review): part of flag_mask and some else lines are not visible here.
 */
1725 str_duplicate_setup(VALUE klass
, VALUE str
, VALUE dup
)
1727 const VALUE flag_mask
=
1729 RSTRING_NOEMBED
| RSTRING_EMBED_LEN_MASK
|
1731 ENC_CODERANGE_MASK
| ENCODING_MASK
|
1734 VALUE flags
= FL_TEST_RAW(str
, flag_mask
);
1736 if (STR_EMBED_P(str
)) {
1737 long len
= RSTRING_EMBED_LEN(str
);
1739 assert(str_embed_capa(dup
) >= len
+ 1);
1740 STR_SET_EMBED_LEN(dup
, len
);
1741 MEMCPY(RSTRING(dup
)->as
.embed
.ary
, RSTRING(str
)->as
.embed
.ary
, char, len
+ 1);
1745 if (FL_TEST_RAW(str
, STR_SHARED
)) {
1746 root
= RSTRING(str
)->as
.heap
.aux
.shared
;
1748 else if (UNLIKELY(!(flags
& FL_FREEZE
))) {
1749 root
= str
= str_new_frozen(klass
, str
);
/* str changed above, so its flags must be re-read. */
1750 flags
= FL_TEST_RAW(str
, flag_mask
);
1752 assert(!STR_SHARED_P(root
));
1753 assert(RB_OBJ_FROZEN_RAW(root
));
1757 if (STR_EMBED_P(root
)) {
1758 MEMCPY(RSTRING(dup
)->as
.embed
.ary
, RSTRING(root
)->as
.embed
.ary
,
1759 char, RSTRING_EMBED_LEN_MAX
+ 1);
1763 RSTRING(dup
)->as
.heap
.len
= RSTRING_LEN(str
);
1764 RSTRING(dup
)->as
.heap
.ptr
= RSTRING_PTR(str
);
1765 RB_OBJ_WRITE(dup
, &RSTRING(dup
)->as
.heap
.aux
.shared
, root
);
1766 flags
|= RSTRING_NOEMBED
| STR_SHARED
;
1770 if ((flags
& ENCODING_MASK
) == (ENCODING_INLINE_MAX
<<ENCODING_SHIFT
)) {
1771 encidx
= rb_enc_get_index(str
);
1772 flags
&= ~ENCODING_MASK
;
1774 FL_SET_RAW(dup
, flags
& ~FL_FREEZE
);
1775 if (encidx
) rb_enc_associate_index(dup
, encidx
);
/*
 * Duplicates str as an object of class klass, allocating through ec.
 * dup uses the heap layout when USE_RVARGC is off or str is STR_NOEMBED;
 * otherwise it is embedded with room for str's embedded length plus its
 * terminator.  The contents are then set up by str_duplicate_setup().
 */
1780 ec_str_duplicate(struct rb_execution_context_struct
*ec
, VALUE klass
, VALUE str
)
1783 if (!USE_RVARGC
|| FL_TEST(str
, STR_NOEMBED
)) {
1784 dup
= ec_str_alloc_heap(ec
, klass
);
1787 dup
= ec_str_alloc_embed(ec
, klass
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
1790 return str_duplicate_setup(klass
, str
, dup
);
/*
 * Duplicates str as an object of class klass.  dup uses the heap layout when
 * USE_RVARGC is off or str is STR_NOEMBED; otherwise it is embedded with
 * room for str's embedded length plus its terminator.  The contents are then
 * set up by str_duplicate_setup().
 */
1794 str_duplicate(VALUE klass
, VALUE str
)
1797 if (!USE_RVARGC
|| FL_TEST(str
, STR_NOEMBED
)) {
1798 dup
= str_alloc_heap(klass
);
1801 dup
= str_alloc_embed(klass
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
1804 return str_duplicate_setup(klass
, str
, dup
);
/* Duplicates str, keeping its class (rb_obj_class(str)). */
1808 rb_str_dup(VALUE str
)
1810 return str_duplicate(rb_obj_class(str
), str
);
/* Fires the string-create DTrace hook with str's byte length, then returns a
 * duplicate of str whose class is rb_cString. */
1814 rb_str_resurrect(VALUE str
)
1816 RUBY_DTRACE_CREATE_HOOK(STRING
, RSTRING_LEN(str
));
1817 return str_duplicate(rb_cString
, str
);
/* Same as rb_str_resurrect(), but the duplicate is allocated through the
 * given execution context ec (ec_str_duplicate). */
1821 rb_ec_str_resurrect(struct rb_execution_context_struct
*ec
, VALUE str
)
1823 RUBY_DTRACE_CREATE_HOOK(STRING
, RSTRING_LEN(str
));
1824 return ec_str_duplicate(ec
, rb_cString
, str
);
1829 * String.new(string = '') -> new_string
1830 * String.new(string = '', encoding: encoding) -> new_string
1831 * String.new(string = '', capacity: size) -> new_string
1833 * Returns a new \String that is a copy of +string+.
1835 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1838 * s.encoding # => #<Encoding:ASCII-8BIT>
1840 * With the single \String argument +string+, returns a copy of +string+
1841 * with the same encoding as +string+:
1842 * s = String.new("Que veut dire \u{e7}a?")
1843 * s # => "Que veut dire \u{e7}a?"
1844 * s.encoding # => #<Encoding:UTF-8>
1846 * Literal strings like <tt>""</tt> or here-documents always use
1847 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1849 * With keyword +encoding+, returns a copy of +string+
1850 * with the specified encoding:
1851 * s = String.new(encoding: 'ASCII')
1852 * s.encoding # => #<Encoding:US-ASCII>
1853 * s = String.new('foo', encoding: 'ASCII')
1854 * s.encoding # => #<Encoding:US-ASCII>
1856 * Note that these are equivalent:
1857 * s0 = String.new('foo', encoding: 'ASCII')
1858 * s1 = 'foo'.force_encoding('ASCII')
1859 * s0.encoding == s1.encoding # => true
1861 * With keyword +capacity+, returns a copy of +string+;
1862 * the given +capacity+ may set the size of the internal buffer,
1863 * which may affect performance:
1864 * String.new(capacity: 1) # => ""
1865 * String.new(capacity: 4096) # => ""
1867 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1869 * String.new('hello', encoding: 'UTF-8', capacity: 25)
/*
 * Initializer for the String.new forms documented above.  Accepts an
 * optional source string plus the encoding: and capacity: keywords
 * (keyword ids are resolved once and cached in a function-local static).
 *
 * With capacity:, str is always put into heap (non-embedded) form with a
 * buffer of capa + termlen bytes: an embedded str is copied out, a
 * shared/no-free str gets a private copy, and an owned heap buffer is
 * reallocated when its size differs.  Without capacity:, a given source
 * simply replaces str via rb_str_replace().  A given encoding is associated
 * with str and the cached coderange is cleared.
 *
 * NOTE(review): several conditions, braces and declarations are not visible
 * in this excerpt; confirm the branch structure against the full source.
 */
1874 rb_str_init(int argc
, VALUE
*argv
, VALUE str
)
1876 static ID keyword_ids
[2];
1877 VALUE orig
, opt
, venc
, vcapa
;
1879 rb_encoding
*enc
= 0;
1882 if (!keyword_ids
[0]) {
1883 keyword_ids
[0] = rb_id_encoding();
1884 CONST_ID(keyword_ids
[1], "capacity");
1887 n
= rb_scan_args(argc
, argv
, "01:", &orig
, &opt
);
1889 rb_get_kwargs(opt
, keyword_ids
, 0, 2, kwargs
);
1892 if (venc
!= Qundef
&& !NIL_P(venc
)) {
1893 enc
= rb_to_encoding(venc
);
1895 if (vcapa
!= Qundef
&& !NIL_P(vcapa
)) {
1896 long capa
= NUM2LONG(vcapa
);
/* Terminator width follows the requested encoding, defaulting to 1 byte. */
1898 int termlen
= enc
? rb_enc_mbminlen(enc
) : 1;
1900 if (capa
< STR_BUF_MIN_SIZE
) {
1901 capa
= STR_BUF_MIN_SIZE
;
1905 len
= RSTRING_LEN(orig
);
/* Initialising a string from itself: nothing to copy. */
1909 if (orig
== str
) n
= 0;
1911 str_modifiable(str
);
1912 if (STR_EMBED_P(str
)) { /* make noembed always */
1913 char *new_ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
/* NOTE(review): two alternative copies follow; the preprocessor guard that
 * selects between them is not visible in this excerpt. */
1915 assert(RSTRING(str
)->as
.embed
.len
+ 1 <= str_embed_capa(str
));
1916 memcpy(new_ptr
, RSTRING(str
)->as
.embed
.ary
, RSTRING(str
)->as
.embed
.len
+ 1);
1918 memcpy(new_ptr
, RSTRING(str
)->as
.embed
.ary
, RSTRING_EMBED_LEN_MAX
+ 1);
1920 RSTRING(str
)->as
.heap
.ptr
= new_ptr
;
1922 else if (FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
1923 const size_t size
= (size_t)capa
+ termlen
;
1924 const char *const old_ptr
= RSTRING_PTR(str
);
1925 const size_t osize
= RSTRING(str
)->as
.heap
.len
+ TERM_LEN(str
);
1926 char *new_ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
/* Copy no more than the new buffer can hold. */
1927 memcpy(new_ptr
, old_ptr
, osize
< size
? osize
: size
);
1928 FL_UNSET_RAW(str
, STR_SHARED
|STR_NOFREE
);
1929 RSTRING(str
)->as
.heap
.ptr
= new_ptr
;
1931 else if (STR_HEAP_SIZE(str
) != (size_t)capa
+ termlen
) {
1932 SIZED_REALLOC_N(RSTRING(str
)->as
.heap
.ptr
, char,
1933 (size_t)capa
+ termlen
, STR_HEAP_SIZE(str
));
1935 RSTRING(str
)->as
.heap
.len
= len
;
1936 TERM_FILL(&RSTRING(str
)->as
.heap
.ptr
[len
], termlen
);
1938 memcpy(RSTRING(str
)->as
.heap
.ptr
, RSTRING_PTR(orig
), len
);
1939 rb_enc_cr_str_exact_copy(str
, orig
);
1941 FL_SET(str
, STR_NOEMBED
);
1942 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
1945 rb_str_replace(str
, orig
);
1948 rb_enc_associate(str
, enc
);
1949 ENC_CODERANGE_CLEAR(str
);
1953 rb_str_replace(str
, orig
);
1958 #ifdef NONASCII_MASK
/* True unless c has the bit pattern 10xxxxxx in its top two bits. */
1959 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1962 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1963 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1964 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1966 * if (!(byte & 0x80))
1967 * byte |= 0x40; // turn on bit6
1968 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1970 * This function calculates whether a byte is leading or not for all bytes
1971 * in the argument word by concurrently using the above logic, and then
1972 * adds up the number of leading bytes in the word.
/* Returns the number of UTF-8 lead bytes in the machine word at *s, using the
 * bit trick described in the comment above. */
1974 static inline uintptr_t
1975 count_utf8_lead_bytes_with_word(const uintptr_t *s
)
1979 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1980 d
= (d
>>6) | (~d
>>7);
1981 d
&= NONASCII_MASK
>> 7;
1983 /* Gather all bytes. */
1984 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1985 /* use only if it can use POPCNT */
1986 return rb_popcount_intptr(d
);
/* NOTE(review): the fallback summation for builds without POPCNT is not
 * visible in this excerpt. */
1990 # if SIZEOF_VOIDP == 8
/*
 * Counts the characters in the byte range [p, e) for encoding enc, given the
 * coderange cr of the data.  Visible strategies:
 *  - fixed-width encodings (mbmaxlen == mbminlen): byte length divided by
 *    the character width, counting a trailing partial character as one;
 *  - valid UTF-8 (when NONASCII_MASK is defined): lead bytes are counted,
 *    a word at a time for the aligned middle part of longer inputs;
 *  - ASCII-compatible encodings: ASCII runs are located with
 *    search_nonascii() and other characters stepped with
 *    rb_enc_fast_mbclen() (clean coderange) or rb_enc_mbclen();
 *  - anything else: one rb_enc_mbclen() step per character.
 */
1999 enc_strlen(const char *p
, const char *e
, rb_encoding
*enc
, int cr
)
2004 if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2005 long diff
= (long)(e
- p
);
2006 return diff
/ rb_enc_mbminlen(enc
) + !!(diff
% rb_enc_mbminlen(enc
));
2008 #ifdef NONASCII_MASK
2009 else if (cr
== ENC_CODERANGE_VALID
&& enc
== rb_utf8_encoding()) {
2011 if ((int)sizeof(uintptr_t) * 2 < e
- p
) {
2012 const uintptr_t *s
, *t
;
2013 const uintptr_t lowbits
= sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2014 s
= (const uintptr_t*)(~lowbits
& ((uintptr_t)p
+ lowbits
));
2015 t
= (const uintptr_t*)(~lowbits
& (uintptr_t)e
);
2016 while (p
< (const char *)s
) {
2017 if (is_utf8_lead_byte(*p
)) len
++;
2021 len
+= count_utf8_lead_bytes_with_word(s
);
2024 p
= (const char *)s
;
2027 if (is_utf8_lead_byte(*p
)) len
++;
2033 else if (rb_enc_asciicompat(enc
)) {
2035 if (ENC_CODERANGE_CLEAN_P(cr
)) {
2038 q
= search_nonascii(p
, e
);
2044 p
+= rb_enc_fast_mbclen(p
, e
, enc
);
2051 q
= search_nonascii(p
, e
);
2057 p
+= rb_enc_mbclen(p
, e
, enc
);
2064 for (c
=0; p
<e
; c
++) {
2065 p
+= rb_enc_mbclen(p
, e
, enc
);
/* Character count of [p, e) in enc, computed by enc_strlen() with the
 * coderange passed as ENC_CODERANGE_UNKNOWN. */
2071 rb_enc_strlen(const char *p
, const char *e
, rb_encoding
*enc
)
2073 return enc_strlen(p
, e
, enc
, ENC_CODERANGE_UNKNOWN
);
2076 /* To get strlen with cr
2077 * Note that given cr is not used.
/*
 * Counts the characters in [p, e) for enc and reports the coderange found
 * while scanning through *cr: ENC_CODERANGE_VALID is OR-ed in for each
 * well-formed character, ENC_CODERANGE_BROKEN is stored when
 * rb_enc_precise_mbclen() rejects a sequence, and a *cr that is still zero
 * at the end of a path becomes ENC_CODERANGE_7BIT.  Fixed-width encodings
 * return early with the byte length divided by the character width
 * (a trailing partial character counts as one).
 */
2080 rb_enc_strlen_cr(const char *p
, const char *e
, rb_encoding
*enc
, int *cr
)
2087 if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2088 long diff
= (long)(e
- p
);
2089 return diff
/ rb_enc_mbminlen(enc
) + !!(diff
% rb_enc_mbminlen(enc
));
2091 else if (rb_enc_asciicompat(enc
)) {
2095 q
= search_nonascii(p
, e
);
2097 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2103 ret
= rb_enc_precise_mbclen(p
, e
, enc
);
2104 if (MBCLEN_CHARFOUND_P(ret
)) {
2105 *cr
|= ENC_CODERANGE_VALID
;
2106 p
+= MBCLEN_CHARFOUND_LEN(ret
);
2109 *cr
= ENC_CODERANGE_BROKEN
;
2114 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2118 for (c
=0; p
<e
; c
++) {
2119 ret
= rb_enc_precise_mbclen(p
, e
, enc
);
2120 if (MBCLEN_CHARFOUND_P(ret
)) {
2121 *cr
|= ENC_CODERANGE_VALID
;
2122 p
+= MBCLEN_CHARFOUND_LEN(ret
);
2125 *cr
= ENC_CODERANGE_BROKEN
;
/* On a broken sequence, skip one minimum-width unit if it still fits. */
2126 if (p
+ rb_enc_mbminlen(enc
) <= e
)
2127 p
+= rb_enc_mbminlen(enc
);
2132 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2136 /* enc must be str's enc or rb_enc_check(str, str2) */
/*
 * Character length of str.  Single-byte-optimizable strings return their
 * byte length directly.  A NULL enc means str's own encoding.  When the
 * cached coderange is unknown, rb_enc_strlen_cr() is used so the coderange
 * found during the scan can be stored back on str; otherwise enc_strlen()
 * runs with the known coderange.
 */
2138 str_strlen(VALUE str
, rb_encoding
*enc
)
2143 if (single_byte_optimizable(str
)) return RSTRING_LEN(str
);
2144 if (!enc
) enc
= STR_ENC_GET(str
);
2145 p
= RSTRING_PTR(str
);
2146 e
= RSTRING_END(str
);
2147 cr
= ENC_CODERANGE(str
);
2149 if (cr
== ENC_CODERANGE_UNKNOWN
) {
2150 long n
= rb_enc_strlen_cr(p
, e
, enc
, &cr
);
2151 if (cr
) ENC_CODERANGE_SET(str
, cr
);
2155 return enc_strlen(p
, e
, enc
, cr
);
/* Character length of str in its own encoding, as a C long. */
2160 rb_str_strlen(VALUE str
)
2162 return str_strlen(str
, NULL
);
2169 * Returns the count of characters (not bytes) in +self+:
2171 * "\x80\u3042".length # => 2
2172 * "hello".length # => 5
2174 * String#size is an alias for String#length.
2176 * Related: String#bytesize.
/* Returns str's character count (str_strlen with str's own encoding)
 * converted to a Ruby Integer. */
2180 rb_str_length(VALUE str
)
2182 return LONG2NUM(str_strlen(str
, NULL
));
2187 * bytesize -> integer
2189 * Returns the count of bytes in +self+:
2191 * "\x80\u3042".bytesize # => 4
2192 * "hello".bytesize # => 5
2194 * Related: String#length.
/* Returns str's byte length (RSTRING_LEN) converted to a Ruby Integer. */
2198 rb_str_bytesize(VALUE str
)
2200 return LONG2NUM(RSTRING_LEN(str
));
2205 * empty? -> true or false
2207 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2209 * "hello".empty? # => false
2210 * " ".empty? # => false
2211 * "".empty? # => true
/* Returns a Ruby boolean telling whether str's byte length is zero. */
2216 rb_str_empty(VALUE str
)
2218 return RBOOL(RSTRING_LEN(str
) == 0);
2223 * string + other_string -> new_string
2225 * Returns a new \String containing +other_string+ concatenated to +self+:
2227 * "Hello from " + self.to_s # => "Hello from main"
/*
 * Concatenates str1 and str2 into a new rb_cString.  The operands' encodings
 * are checked/combined with rb_enc_check_str(); ArgumentError is raised when
 * len1 + len2 would overflow a long.  The result is allocated with the
 * terminator width of the combined encoding, both operands are copied in,
 * the terminator is written, and encoding plus the combined coderange
 * (ENC_CODERANGE_AND of both operands) are set.
 */
2232 rb_str_plus(VALUE str1
, VALUE str2
)
2236 char *ptr1
, *ptr2
, *ptr3
;
2241 enc
= rb_enc_check_str(str1
, str2
);
2242 RSTRING_GETMEM(str1
, ptr1
, len1
);
2243 RSTRING_GETMEM(str2
, ptr2
, len2
);
2244 termlen
= rb_enc_mbminlen(enc
);
/* Overflow-safe form of: len1 + len2 > LONG_MAX. */
2245 if (len1
> LONG_MAX
- len2
) {
2246 rb_raise(rb_eArgError
, "string size too big");
2248 str3
= str_new0(rb_cString
, 0, len1
+len2
, termlen
);
2249 ptr3
= RSTRING_PTR(str3
);
2250 memcpy(ptr3
, ptr1
, len1
);
2251 memcpy(ptr3
+len1
, ptr2
, len2
);
2252 TERM_FILL(&ptr3
[len1
+len2
], termlen
);
2254 ENCODING_CODERANGE_SET(str3
, rb_enc_to_index(enc
),
2255 ENC_CODERANGE_AND(ENC_CODERANGE(str1
), ENC_CODERANGE(str2
)));
2261 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
/*
 * Guarded entry to rb_str_plus() for two plain rb_cString operands
 * (asserted).  The visible guards test for a negative encoding index on
 * str2, differing encoding indexes, and length overflow before falling
 * through to rb_str_plus().
 * NOTE(review): the first guard and the bodies of all guards are not visible
 * in this excerpt; per the comment above they presumably return Qundef.
 */
2262 MJIT_FUNC_EXPORTED VALUE
2263 rb_str_opt_plus(VALUE str1
, VALUE str2
)
2265 assert(RBASIC_CLASS(str1
) == rb_cString
);
2266 assert(RBASIC_CLASS(str2
) == rb_cString
);
2268 MAYBE_UNUSED(char) *ptr1
, *ptr2
;
2269 RSTRING_GETMEM(str1
, ptr1
, len1
);
2270 RSTRING_GETMEM(str2
, ptr2
, len2
);
2271 int enc1
= rb_enc_get_index(str1
);
2272 int enc2
= rb_enc_get_index(str2
);
2277 else if (enc2
< 0) {
2280 else if (enc1
!= enc2
) {
2283 else if (len1
> LONG_MAX
- len2
) {
2287 return rb_str_plus(str1
, str2
);
2294 * string * integer -> new_string
2296 * Returns a new \String containing +integer+ copies of +self+:
2298 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2299 * "Ho! " * 0 # => ""
/*
 * Builds a new rb_cString holding `times` copies of str.
 *  - times == 1: a duplicate of str.
 *  - times == 0: an empty embedded string carrying str's encoding.
 *  - A negative count raises ArgumentError "negative argument" (the test
 *    itself is not visible in this excerpt).
 *  - str consisting of a single NUL byte: a zero-filled buffer of the
 *    requested length is produced directly (memset / ZALLOC_N).
 *  - Otherwise the total length is overflow-checked ("argument too big") and
 *    the buffer is filled by repeated memcpy from its own prefix, with a
 *    final copy for the remainder.
 */
2304 rb_str_times(VALUE str
, VALUE times
)
2311 if (times
== INT2FIX(1)) {
2312 return str_duplicate(rb_cString
, str
);
2314 if (times
== INT2FIX(0)) {
2315 str2
= str_alloc_embed(rb_cString
, 0);
2316 rb_enc_copy(str2
, str
);
2319 len
= NUM2LONG(times
);
2321 rb_raise(rb_eArgError
, "negative argument");
2323 if (RSTRING_LEN(str
) == 1 && RSTRING_PTR(str
)[0] == 0) {
2324 if (STR_EMBEDDABLE_P(len
, 1)) {
2325 str2
= str_alloc_embed(rb_cString
, len
+ 1);
2326 memset(RSTRING_PTR(str2
), 0, len
+ 1);
2329 str2
= str_alloc_heap(rb_cString
);
2330 RSTRING(str2
)->as
.heap
.aux
.capa
= len
;
2331 RSTRING(str2
)->as
.heap
.ptr
= ZALLOC_N(char, (size_t)len
+ 1);
2332 STR_SET_NOEMBED(str2
);
2334 STR_SET_LEN(str2
, len
);
2335 rb_enc_copy(str2
, str
);
/* Overflow-safe form of: len * RSTRING_LEN(str) > LONG_MAX. */
2338 if (len
&& LONG_MAX
/len
< RSTRING_LEN(str
)) {
2339 rb_raise(rb_eArgError
, "argument too big");
2342 len
*= RSTRING_LEN(str
);
2343 termlen
= TERM_LEN(str
);
2344 str2
= str_new0(rb_cString
, 0, len
, termlen
);
2345 ptr2
= RSTRING_PTR(str2
);
2347 n
= RSTRING_LEN(str
);
2348 memcpy(ptr2
, RSTRING_PTR(str
), n
);
/* Extend the filled prefix by copying it onto itself while it fits. */
2349 while (n
<= len
/2) {
2350 memcpy(ptr2
+ n
, ptr2
, n
);
2353 memcpy(ptr2
+ n
, ptr2
, len
-n
);
2355 STR_SET_LEN(str2
, len
);
2356 TERM_FILL(&ptr2
[len
], termlen
);
2357 rb_enc_cr_str_copy_for_substr(str2
, str
);
2364 * string % object -> new_string
2366 * Returns the result of formatting +object+ into the format specification +self+
2367 * (see Kernel#sprintf for formatting details):
2369 * "%05d" % 123 # => "00123"
2371 * If +self+ contains multiple substitutions, +object+ must be
2372 * an \Array or \Hash containing the values to be substituted:
2374 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2375 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2376 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
/*
 * Formats arg with str as the format string.  arg is first tried as an Array
 * (rb_check_array_type): its elements are then passed as the argument list;
 * otherwise arg is passed as the single argument.
 * NOTE(review): the test on tmp is not visible in this excerpt.
 */
2381 rb_str_format_m(VALUE str
, VALUE arg
)
2383 VALUE tmp
= rb_check_array_type(arg
);
2386 return rb_str_format(RARRAY_LENINT(tmp
), RARRAY_CONST_PTR(tmp
), str
);
2388 return rb_str_format(1, &arg
, str
);
/* Raises RuntimeError when str carries the STR_TMPLOCK flag. */
2392 rb_check_lockedtmp(VALUE str
)
2394 if (FL_TEST(str
, STR_TMPLOCK
)) {
2395 rb_raise(rb_eRuntimeError
, "can't modify string; temporarily locked");
/* Raises unless str may be modified: checks the temporary lock first, then
 * frozenness (rb_check_frozen). */
2400 str_modifiable(VALUE str
)
2402 rb_check_lockedtmp(str
);
2403 rb_check_frozen(str
);
/*
 * Predicate on whether str relies on a buffer it does not own.  The visible
 * condition singles out strings that are embedded or have neither STR_SHARED
 * nor STR_NOFREE set.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably that case yields false and everything else true -- confirm.
 */
2407 str_dependent_p(VALUE str
)
2409 if (STR_EMBED_P(str
) || !FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
/* Checks that str may be modified (may raise via str_modifiable), then
 * returns the negation of str_dependent_p(str). */
2418 str_independent(VALUE str
)
2420 str_modifiable(str
);
2421 return !str_dependent_p(str
);
/*
 * Gives str its own storage with capacity len + expand (plus termlen bytes
 * for the terminator), keeping the first len bytes.  len is clamped to the
 * new capacity.  If str is a heap string whose embedded area can hold
 * capa + termlen bytes, the bytes are moved inline; otherwise a new heap
 * buffer is allocated, filled, terminated and installed with STR_SHARED and
 * STR_NOFREE cleared.
 * NOTE(review): the statement run when str already owned its old heap buffer
 * (the flags == STR_NOEMBED test) is not visible in this excerpt; presumably
 * the old buffer is freed there.
 */
2425 str_make_independent_expand(VALUE str
, long len
, long expand
, const int termlen
)
2429 long capa
= len
+ expand
;
2431 if (len
> capa
) len
= capa
;
2433 if (!STR_EMBED_P(str
) && str_embed_capa(str
) >= capa
+ termlen
) {
2434 ptr
= RSTRING(str
)->as
.heap
.ptr
;
2436 memcpy(RSTRING(str
)->as
.embed
.ary
, ptr
, len
);
2437 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
2438 STR_SET_EMBED_LEN(str
, len
);
2442 ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
2443 oldptr
= RSTRING_PTR(str
);
2445 memcpy(ptr
, oldptr
, len
);
2447 if (FL_TEST_RAW(str
, STR_NOEMBED
|STR_NOFREE
|STR_SHARED
) == STR_NOEMBED
) {
2450 STR_SET_NOEMBED(str
);
2451 FL_UNSET(str
, STR_SHARED
|STR_NOFREE
);
2452 TERM_FILL(ptr
+ len
, termlen
);
2453 RSTRING(str
)->as
.heap
.ptr
= ptr
;
2454 RSTRING(str
)->as
.heap
.len
= len
;
2455 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
/*
 * Prepares str for in-place modification: str_independent() verifies it is
 * modifiable (may raise), a dependent buffer is made private with
 * str_make_independent(), and the cached coderange is cleared.
 */
2459 rb_str_modify(VALUE str
)
2461 if (!str_independent(str
))
2462 str_make_independent(str
);
2463 ENC_CODERANGE_CLEAR(str
);
/*
 * Like rb_str_modify(), additionally ensuring room for `expand` more bytes.
 * Raises ArgumentError for a negative expand (the test itself is not visible
 * in this excerpt) and when len + expand would reach LONG_MAX.  A dependent
 * string gets a private, enlarged buffer; an independent one is resized only
 * when expand > 0.  The cached coderange is cleared.
 */
2467 rb_str_modify_expand(VALUE str
, long expand
)
2469 int termlen
= TERM_LEN(str
);
2470 long len
= RSTRING_LEN(str
);
2473 rb_raise(rb_eArgError
, "negative expanding string size");
2475 if (expand
>= LONG_MAX
- len
) {
2476 rb_raise(rb_eArgError
, "string size too big");
2479 if (!str_independent(str
)) {
2480 str_make_independent_expand(str
, len
, expand
, termlen
);
2482 else if (expand
> 0) {
2483 RESIZE_CAPA_TERM(str
, len
+ expand
, termlen
);
2485 ENC_CODERANGE_CLEAR(str
);
2488 /* As rb_str_modify(), but don't clear coderange */
/*
 * Makes str modifiable and independent without discarding a usable cached
 * coderange; only a BROKEN coderange is cleared.
 */
2490 str_modify_keep_cr(VALUE str
)
2492 if (!str_independent(str
))
2493 str_make_independent(str
);
2494 if (ENC_CODERANGE(str
) == ENC_CODERANGE_BROKEN
)
2495 /* Force re-scan later */
2496 ENC_CODERANGE_CLEAR(str
);
/*
 * Drops str's contents after checking it is modifiable: a heap buffer that
 * str owns (not embedded, not STR_SHARED/STR_NOFREE) is freed and the heap
 * ptr/len fields are zeroed.
 */
2500 str_discard(VALUE str
)
2502 str_modifiable(str
);
2503 if (!STR_EMBED_P(str
) && !FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
2504 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
2505 RSTRING(str
)->as
.heap
.ptr
= 0;
2506 RSTRING(str
)->as
.heap
.len
= 0;
/* Raises rb_eEncCompatError, naming the encoding, when str's encoding is not
 * ASCII-compatible. */
2511 rb_must_asciicompat(VALUE str
)
2513 rb_encoding
*enc
= rb_enc_get(str
);
2514 if (!rb_enc_asciicompat(enc
)) {
2515 rb_raise(rb_eEncCompatError
, "ASCII incompatible encoding: %s", rb_enc_name(enc
));
/*
 * Ensures the VALUE behind ptr is a String: a non-T_STRING value is
 * converted with rb_str_to_str().
 * NOTE(review): the load of s from *ptr, the store back and the return are
 * not visible in this excerpt.
 */
2520 rb_string_value(volatile VALUE
*ptr
)
2523 if (!RB_TYPE_P(s
, T_STRING
)) {
2524 s
= rb_str_to_str(s
);
/* Runs rb_string_value() on ptr and returns the resulting string's byte
 * pointer (RSTRING_PTR). */
2531 rb_string_value_ptr(volatile VALUE
*ptr
)
2533 VALUE str
= rb_string_value(ptr
);
2534 return RSTRING_PTR(str
);
/*
 * Walks n bytes starting at s.
 * NOTE(review): the loop body and return values are not visible in this
 * excerpt; judging by the name and the callers in this file it presumably
 * reports whether all n bytes are zero -- confirm.
 */
2538 zero_filled(const char *s
, int n
)
2540 for (; n
> 0; --n
) {
/*
 * Scans [s, s + len) one character at a time (rb_enc_mbclen) and returns a
 * pointer to the first position whose leading minlen bytes satisfy
 * zero_filled().  The scan stops once fewer than minlen bytes remain.
 * NOTE(review): the return for "not found" is not visible in this excerpt.
 */
2547 str_null_char(const char *s
, long len
, const int minlen
, rb_encoding
*enc
)
2549 const char *e
= s
+ len
;
2551 for (; s
+ minlen
<= e
; s
+= rb_enc_mbclen(s
, e
, enc
)) {
2552 if (zero_filled(s
, minlen
)) return s
;
/*
 * Makes sure str is followed by termlen terminator bytes.  s/len are str's
 * current pointer and length.  A dependent string whose terminator bytes are
 * not already zero is made independent (which may move the buffer, hence the
 * fresh RSTRING_PTR on return); the other visible path writes the terminator
 * in place with TERM_FILL.
 * NOTE(review): the else/return lines between the two paths are not visible
 * in this excerpt.
 */
2558 str_fill_term(VALUE str
, char *s
, long len
, int termlen
)
2560 /* This function assumes that (capa + termlen) bytes of memory
2561 * is allocated, like many other functions in this file.
2563 if (str_dependent_p(str
)) {
2564 if (!zero_filled(s
+ len
, termlen
))
2565 str_make_independent_expand(str
, len
, 0L, termlen
);
2568 TERM_FILL(s
+ len
, termlen
);
2571 return RSTRING_PTR(str
);
/*
 * Adapts str's buffer when its terminator width changes from oldtermlen to
 * termlen.  capa here is the total room including the old terminator.
 *  - Not enough room after len for the new terminator: the temporary lock is
 *    checked and the string is reallocated independently.
 *  - Dependent string with a wider terminator: made independent.
 *  - Otherwise, for a heap string, only the recorded capacity is adjusted.
 * A wider terminator is then written at the end of the contents.
 * NOTE(review): else/brace lines are not visible here; confirm nesting.
 */
2575 rb_str_change_terminator_length(VALUE str
, const int oldtermlen
, const int termlen
)
2577 long capa
= str_capacity(str
, oldtermlen
) + oldtermlen
;
2578 long len
= RSTRING_LEN(str
);
2580 assert(capa
>= len
);
2581 if (capa
- len
< termlen
) {
2582 rb_check_lockedtmp(str
);
2583 str_make_independent_expand(str
, len
, 0L, termlen
);
2585 else if (str_dependent_p(str
)) {
2586 if (termlen
> oldtermlen
)
2587 str_make_independent_expand(str
, len
, 0L, termlen
);
2590 if (!STR_EMBED_P(str
)) {
2591 /* modify capa instead of realloc */
2592 assert(!FL_TEST((str
), STR_SHARED
));
2593 RSTRING(str
)->as
.heap
.aux
.capa
= capa
- termlen
;
2595 if (termlen
> oldtermlen
) {
2596 TERM_FILL(RSTRING_PTR(str
) + len
, termlen
);
/*
 * Checks str for embedded NUL characters and makes sure it is terminated.
 * Two detection paths are visible: an encoding-aware scan (str_null_char)
 * and a byte-wise memchr() scan (which also triggers on a null pointer);
 * when nothing is found the terminator is ensured with str_fill_term().
 * NOTE(review): the condition choosing between the paths, the writes to *w
 * and the failure returns are not visible in this excerpt.
 */
2604 str_null_check(VALUE str
, int *w
)
2606 char *s
= RSTRING_PTR(str
);
2607 long len
= RSTRING_LEN(str
);
2608 rb_encoding
*enc
= rb_enc_get(str
);
2609 const int minlen
= rb_enc_mbminlen(enc
);
2613 if (str_null_char(s
, len
, minlen
, enc
)) {
2616 return str_fill_term(str
, s
, len
, minlen
);
2619 if (!s
|| memchr(s
, 0, len
)) {
2623 s
= str_fill_term(str
, s
, len
, minlen
);
/* Returns the result of str_null_check() for str; the width flag written to
 * the local w is discarded. */
2629 rb_str_to_cstr(VALUE str
)
2632 return str_null_check(str
, &w
);
/*
 * Converts the VALUE behind ptr to a String and returns its bytes as a C
 * string after str_null_check().  Raises ArgumentError with "string contains
 * null char" or "string contains null byte".
 * NOTE(review): the conditions selecting between the two messages (based on
 * the check's result and w) are not visible in this excerpt.
 */
2636 rb_string_value_cstr(volatile VALUE
*ptr
)
2638 VALUE str
= rb_string_value(ptr
);
2640 char *s
= str_null_check(str
, &w
);
2643 rb_raise(rb_eArgError
, "string contains null char");
2645 rb_raise(rb_eArgError
, "string contains null byte");
/* Ensures str is followed by newminlen terminator bytes (str_fill_term) and
 * returns the string's, possibly relocated, byte pointer. */
2651 rb_str_fill_terminator(VALUE str
, const int newminlen
)
2653 char *s
= RSTRING_PTR(str
);
2654 long len
= RSTRING_LEN(str
);
2655 return str_fill_term(str
, s
, len
, newminlen
);
/*
 * Attempts the to_str conversion of str to T_STRING through
 * rb_check_convert_type_with_id().
 * NOTE(review): the statements after the conversion are not visible here.
 */
2659 rb_check_string_type(VALUE str
)
2661 str
= rb_check_convert_type_with_id(str
, T_STRING
, "String", idTo_str
);
2667 * String.try_convert(object) -> object, new_string, or nil
2669 * If +object+ is a \String object, returns +object+.
2671 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2672 * calls <tt>object.to_str</tt> and returns the result.
2674 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2676 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
/* Returns rb_check_string_type(str); the receiver argument (dummy) is
 * unused. */
2679 rb_str_s_try_convert(VALUE dummy
, VALUE str
)
2681 return rb_check_string_type(str
);
/*
 * Advances p through [p, e) by up to *nthp characters of encoding enc.
 * Visible strategies: single-byte encodings (mbmaxlen == 1), fixed-width
 * encodings (pointer arithmetic with the character width), ASCII-compatible
 * encodings (ASCII runs located with search_nonascii(), other characters
 * stepped with rb_enc_mbclen()), and a generic per-character loop.
 * NOTE(review): bounds clamping, the update of *nthp and the return value are
 * not visible in this excerpt.
 */
2685 str_nth_len(const char *p
, const char *e
, long *nthp
, rb_encoding
*enc
)
2688 if (rb_enc_mbmaxlen(enc
) == 1) {
2691 else if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2692 p
+= nth
* rb_enc_mbmaxlen(enc
);
2694 else if (rb_enc_asciicompat(enc
)) {
2695 const char *p2
, *e2
;
2698 while (p
< e
&& 0 < nth
) {
2705 p2
= search_nonascii(p
, e2
);
2714 n
= rb_enc_mbclen(p
, e
, enc
);
2725 while (p
< e
&& nth
--) {
2726 p
+= rb_enc_mbclen(p
, e
, enc
);
/* Returns str_nth_len()'s result for nth characters into [p, e); the updated
 * count left in the local nth is discarded. */
2735 rb_enc_nth(const char *p
, const char *e
, long nth
, rb_encoding
*enc
)
2737 return str_nth_len(p
, e
, &nth
, enc
);
/*
 * Locates the nth character of [p, e) through str_nth_len().
 * NOTE(review): the handling of the singlebyte flag and of out-of-range
 * results is not visible in this excerpt; str_offset() in this file tests
 * the result against NULL.
 */
2741 str_nth(const char *p
, const char *e
, long nth
, rb_encoding
*enc
, int singlebyte
)
2746 p
= str_nth_len(p
, e
, &nth
, enc
);
2753 /* char offset to byte offset */
/*
 * Converts a character offset nth into a byte offset within [p, e).  When
 * str_nth() yields NULL, the full byte length e - p is returned.
 * NOTE(review): the return for the found case is not visible here.
 */
2755 str_offset(const char *p
, const char *e
, long nth
, rb_encoding
*enc
, int singlebyte
)
2757 const char *pp
= str_nth(p
, e
, nth
, enc
, singlebyte
);
2758 if (!pp
) return e
- p
;
/* Character offset pos -> byte offset for str, using str's own encoding and
 * its single-byte-optimizable status. */
2763 rb_str_offset(VALUE str
, long pos
)
2765 return str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
2766 STR_ENC_GET(str
), single_byte_optimizable(str
));
2769 #ifdef NONASCII_MASK
/*
 * UTF-8 specific search for the *nthp-th character of [p, e).  For inputs
 * and counts larger than two machine words, lead bytes are first counted
 * byte-wise up to a word boundary and then a word at a time
 * (count_utf8_lead_bytes_with_word) while at least SIZEOF_VOIDP characters
 * remain; the rest is scanned byte-wise, stopping at the lead byte where the
 * count reaches zero.
 * NOTE(review): pointer increments, the store to *nthp and the return are
 * not visible in this excerpt.
 */
2771 str_utf8_nth(const char *p
, const char *e
, long *nthp
)
2774 if ((int)SIZEOF_VOIDP
* 2 < e
- p
&& (int)SIZEOF_VOIDP
* 2 < nth
) {
2775 const uintptr_t *s
, *t
;
2776 const uintptr_t lowbits
= SIZEOF_VOIDP
- 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2777 s
= (const uintptr_t*)(~lowbits
& ((uintptr_t)p
+ lowbits
));
2778 t
= (const uintptr_t*)(~lowbits
& (uintptr_t)e
);
2779 while (p
< (const char *)s
) {
2780 if (is_utf8_lead_byte(*p
)) nth
--;
2784 nth
-= count_utf8_lead_bytes_with_word(s
);
2786 } while (s
< t
&& (int)SIZEOF_VOIDP
<= nth
);
2790 if (is_utf8_lead_byte(*p
)) {
2791 if (nth
== 0) break;
/*
 * UTF-8 counterpart of str_offset(): locates the nth character with
 * str_utf8_nth().
 * NOTE(review): the conversion of pp into the returned offset is not visible
 * in this excerpt.
 */
2801 str_utf8_offset(const char *p
, const char *e
, long nth
)
2803 const char *pp
= str_utf8_nth(p
, e
, &nth
);
2808 /* byte offset to char offset */
/*
 * Converts a byte offset pos into a character offset by counting the
 * characters in str's first pos bytes (enc_strlen with str's encoding and
 * coderange).  Single-byte-optimizable strings and negative pos take a
 * shortcut whose statement is not visible here (presumably pos is returned
 * unchanged -- confirm).
 */
2810 rb_str_sublen(VALUE str
, long pos
)
2812 if (single_byte_optimizable(str
) || pos
< 0)
2815 char *p
= RSTRING_PTR(str
);
2816 return enc_strlen(p
, p
+ pos
, STR_ENC_GET(str
), ENC_CODERANGE(str
));
/*
 * Returns the byte subsequence [beg, beg + len) of str as a new string.
 * When the result is too long to embed and SHARABLE_SUBSTRING_P holds, the
 * result shares the buffer of a frozen rb_cString version of str (ptr
 * advanced by beg, len shortened when needed); otherwise the bytes are
 * copied with rb_str_new().  Encoding/coderange are derived from str with
 * rb_enc_cr_str_copy_for_substr().
 */
2821 rb_str_subseq(VALUE str
, long beg
, long len
)
2825 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) &&
2826 SHARABLE_SUBSTRING_P(beg
, len
, RSTRING_LEN(str
))) {
2828 str2
= rb_str_new_shared(rb_str_new_frozen_String(str
));
2829 RSTRING(str2
)->as
.heap
.ptr
+= beg
;
2830 olen
= RSTRING(str2
)->as
.heap
.len
;
2831 if (olen
> len
) RSTRING(str2
)->as
.heap
.len
= len
;
2834 str2
= rb_str_new(RSTRING_PTR(str
)+beg
, len
);
2838 rb_enc_cr_str_copy_for_substr(str2
, str
);
/*
 * Resolves the character range (beg, *lenp) of str into a byte position.
 * Several guards return 0 (a null pointer) for ranges that cannot be
 * satisfied, e.g. a negative length or a start beyond the string.
 * Visible strategies: direct arithmetic for single-byte-optimizable strings;
 * backward scanning with rb_enc_prev_char() for negative beg close to the
 * end; full character counting (str_strlen) otherwise; and, for positive
 * beg, a UTF-8 fast path (str_utf8_nth / str_utf8_offset), fixed-width
 * arithmetic, or the generic str_nth_len() / str_offset() pair.
 * NOTE(review): many short statements (assignments, else lines, the final
 * store to *lenp and the return) are not visible in this excerpt, so the
 * exact control flow must be confirmed against the full source.
 */
2844 rb_str_subpos(VALUE str
, long beg
, long *lenp
)
2848 long blen
= RSTRING_LEN(str
);
2849 rb_encoding
*enc
= STR_ENC_GET(str
);
2850 char *p
, *s
= RSTRING_PTR(str
), *e
= s
+ blen
;
2852 if (len
< 0) return 0;
2856 if (single_byte_optimizable(str
)) {
2857 if (beg
> blen
) return 0;
2860 if (beg
< 0) return 0;
2862 if (len
> blen
- beg
)
2864 if (len
< 0) return 0;
2869 if (len
> -beg
) len
= -beg
;
/* Scan backwards only when the range is near the end of the string. */
2870 if (-beg
* rb_enc_mbmaxlen(enc
) < RSTRING_LEN(str
) / 8) {
2872 while (beg
-- > len
&& (e
= rb_enc_prev_char(s
, e
, e
, enc
)) != 0);
2875 while (len
-- > 0 && (p
= rb_enc_prev_char(s
, p
, e
, enc
)) != 0);
2881 slen
= str_strlen(str
, enc
);
2883 if (beg
< 0) return 0;
2885 if (len
== 0) goto end
;
2888 else if (beg
> 0 && beg
> RSTRING_LEN(str
)) {
2892 if (beg
> str_strlen(str
, enc
)) return 0; /* str's enc */
2895 #ifdef NONASCII_MASK
2896 else if (ENC_CODERANGE(str
) == ENC_CODERANGE_VALID
&&
2897 enc
== rb_utf8_encoding()) {
2898 p
= str_utf8_nth(s
, e
, &beg
);
2899 if (beg
> 0) return 0;
2900 len
= str_utf8_offset(p
, e
, len
);
2903 else if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2904 int char_sz
= rb_enc_mbmaxlen(enc
);
2906 p
= s
+ beg
* char_sz
;
2910 else if (len
* char_sz
> e
- p
)
2915 else if ((p
= str_nth_len(s
, e
, &beg
, enc
)) == e
) {
2916 if (beg
> 0) return 0;
2920 len
= str_offset(p
, e
, len
, enc
, 0);
/* Forward declaration; the definition follows rb_str_substr() below. */
2928 static VALUE
str_substr(VALUE str
, long beg
, long len
, int empty
);
/* Character-based substring of str; calls str_substr() with empty == TRUE. */
2931 rb_str_substr(VALUE str
, long beg
, long len
)
2933 return str_substr(str
, beg
, len
, TRUE
);
2937 str_substr(VALUE str
, long beg
, long len
, int empty
)
2940 char *p
= rb_str_subpos(str
, beg
, &len
);
2942 if (!p
) return Qnil
;
2943 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) &&
2944 SHARABLE_SUBSTRING_P(p
, len
, RSTRING_END(str
))) {
2945 long ofs
= p
- RSTRING_PTR(str
);
2946 str2
= rb_str_new_frozen(str
);
2947 str2
= str_new_shared(rb_cString
, str2
);
2948 RSTRING(str2
)->as
.heap
.ptr
+= ofs
;
2949 RSTRING(str2
)->as
.heap
.len
= len
;
2950 ENC_CODERANGE_CLEAR(str2
);
2953 if (!len
&& !empty
) return Qnil
;
2954 str2
= rb_str_new(p
, len
);
2957 rb_enc_cr_str_copy_for_substr(str2
, str
);
2963 rb_str_freeze(VALUE str
)
2965 if (OBJ_FROZEN(str
)) return str
;
2966 rb_str_resize(str
, RSTRING_LEN(str
));
2967 return rb_obj_freeze(str
);
2973 * +string -> new_string or self
2975 * Returns +self+ if +self+ is not frozen.
2977 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2980 str_uplus(VALUE str
)
2982 if (OBJ_FROZEN(str
)) {
2983 return rb_str_dup(str
);
2992 * -string -> frozen_string
2994 * Returns a frozen, possibly pre-existing copy of the string.
2996 * The returned \String will be deduplicated as long as it does not have
2997 * any instance variables set on it.
3000 str_uminus(VALUE str
)
3002 if (!BARE_STRING_P(str
) && !rb_obj_frozen_p(str
)) {
3003 str
= rb_str_dup(str
);
3005 return rb_fstring(str
);
3008 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str
), rb_str_new_frozen
, (str
))
3009 #define rb_str_dup_frozen rb_str_new_frozen
3012 rb_str_locktmp(VALUE str
)
3014 if (FL_TEST(str
, STR_TMPLOCK
)) {
3015 rb_raise(rb_eRuntimeError
, "temporal locking already locked string");
3017 FL_SET(str
, STR_TMPLOCK
);
3022 rb_str_unlocktmp(VALUE str
)
3024 if (!FL_TEST(str
, STR_TMPLOCK
)) {
3025 rb_raise(rb_eRuntimeError
, "temporal unlocking already unlocked string");
3027 FL_UNSET(str
, STR_TMPLOCK
);
3031 RUBY_FUNC_EXPORTED VALUE
3032 rb_str_locktmp_ensure(VALUE str
, VALUE (*func
)(VALUE
), VALUE arg
)
3034 rb_str_locktmp(str
);
3035 return rb_ensure(func
, arg
, rb_str_unlocktmp
, str
);
3039 rb_str_set_len(VALUE str
, long len
)
3042 const int termlen
= TERM_LEN(str
);
3044 str_modifiable(str
);
3045 if (STR_SHARED_P(str
)) {
3046 rb_raise(rb_eRuntimeError
, "can't set length of shared string");
3048 if (len
> (capa
= (long)str_capacity(str
, termlen
)) || len
< 0) {
3049 rb_bug("probable buffer overflow: %ld for %ld", len
, capa
);
3051 STR_SET_LEN(str
, len
);
3052 TERM_FILL(&RSTRING_PTR(str
)[len
], termlen
);
3056 rb_str_resize(VALUE str
, long len
)
3062 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3065 independent
= str_independent(str
);
3066 ENC_CODERANGE_CLEAR(str
);
3067 slen
= RSTRING_LEN(str
);
3071 const int termlen
= TERM_LEN(str
);
3072 if (STR_EMBED_P(str
)) {
3073 if (len
== slen
) return str
;
3074 if (str_embed_capa(str
) >= len
+ termlen
) {
3075 STR_SET_EMBED_LEN(str
, len
);
3076 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
3079 str_make_independent_expand(str
, slen
, len
- slen
, termlen
);
3081 else if (str_embed_capa(str
) >= len
+ termlen
) {
3082 char *ptr
= STR_HEAP_PTR(str
);
3084 if (slen
> len
) slen
= len
;
3085 if (slen
> 0) MEMCPY(RSTRING(str
)->as
.embed
.ary
, ptr
, char, slen
);
3086 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
3087 STR_SET_EMBED_LEN(str
, len
);
3088 if (independent
) ruby_xfree(ptr
);
3091 else if (!independent
) {
3092 if (len
== slen
) return str
;
3093 str_make_independent_expand(str
, slen
, len
- slen
, termlen
);
3095 else if ((capa
= RSTRING(str
)->as
.heap
.aux
.capa
) < len
||
3096 (capa
- len
) > (len
< 1024 ? len
: 1024)) {
3097 SIZED_REALLOC_N(RSTRING(str
)->as
.heap
.ptr
, char,
3098 (size_t)len
+ termlen
, STR_HEAP_SIZE(str
));
3099 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
3101 else if (len
== slen
) return str
;
3102 RSTRING(str
)->as
.heap
.len
= len
;
3103 TERM_FILL(RSTRING(str
)->as
.heap
.ptr
+ len
, termlen
); /* sentinel */
3109 str_buf_cat(VALUE str
, const char *ptr
, long len
)
3111 long capa
, total
, olen
, off
= -1;
3113 const int termlen
= TERM_LEN(str
);
3115 assert(termlen
< RSTRING_EMBED_LEN_MAX
+ 1); /* < (LONG_MAX/2) */
3118 RSTRING_GETMEM(str
, sptr
, olen
);
3119 if (ptr
>= sptr
&& ptr
<= sptr
+ olen
) {
3123 if (len
== 0) return 0;
3124 if (STR_EMBED_P(str
)) {
3125 capa
= str_embed_capa(str
) - termlen
;
3126 sptr
= RSTRING(str
)->as
.embed
.ary
;
3127 olen
= RSTRING_EMBED_LEN(str
);
3130 capa
= RSTRING(str
)->as
.heap
.aux
.capa
;
3131 sptr
= RSTRING(str
)->as
.heap
.ptr
;
3132 olen
= RSTRING(str
)->as
.heap
.len
;
3134 if (olen
> LONG_MAX
- len
) {
3135 rb_raise(rb_eArgError
, "string sizes too big");
3139 if (total
>= LONG_MAX
/ 2) {
3142 while (total
> capa
) {
3143 capa
= 2 * capa
+ termlen
; /* == 2*(capa+termlen)-termlen */
3145 RESIZE_CAPA_TERM(str
, capa
, termlen
);
3146 sptr
= RSTRING_PTR(str
);
3151 memcpy(sptr
+ olen
, ptr
, len
);
3152 STR_SET_LEN(str
, total
);
3153 TERM_FILL(sptr
+ total
, termlen
); /* sentinel */
3158 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3161 rb_str_cat(VALUE str
, const char *ptr
, long len
)
3163 if (len
== 0) return str
;
3165 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3167 return str_buf_cat(str
, ptr
, len
);
3171 rb_str_cat_cstr(VALUE str
, const char *ptr
)
3174 return rb_str_buf_cat(str
, ptr
, strlen(ptr
));
3177 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str
, const char *ptr
, long len
), rb_str_cat
, (str
, ptr
, len
))
3178 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str
, const char *ptr
), rb_str_cat_cstr
, (str
, ptr
))
3179 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str
, const char *ptr
), rb_str_cat_cstr
, (str
, ptr
))
3182 rb_enc_cr_str_buf_cat(VALUE str
, const char *ptr
, long len
,
3183 int ptr_encindex
, int ptr_cr
, int *ptr_cr_ret
)
3185 int str_encindex
= ENCODING_GET(str
);
3188 rb_encoding
*str_enc
, *ptr_enc
;
3190 str_cr
= RSTRING_LEN(str
) ? ENC_CODERANGE(str
) : ENC_CODERANGE_7BIT
;
3192 if (str_encindex
== ptr_encindex
) {
3193 if (str_cr
!= ENC_CODERANGE_UNKNOWN
&& ptr_cr
== ENC_CODERANGE_UNKNOWN
) {
3194 ptr_cr
= coderange_scan(ptr
, len
, rb_enc_from_index(ptr_encindex
));
3198 str_enc
= rb_enc_from_index(str_encindex
);
3199 ptr_enc
= rb_enc_from_index(ptr_encindex
);
3200 if (!rb_enc_asciicompat(str_enc
) || !rb_enc_asciicompat(ptr_enc
)) {
3203 if (RSTRING_LEN(str
) == 0) {
3204 rb_str_buf_cat(str
, ptr
, len
);
3205 ENCODING_CODERANGE_SET(str
, ptr_encindex
, ptr_cr
);
3210 if (ptr_cr
== ENC_CODERANGE_UNKNOWN
) {
3211 ptr_cr
= coderange_scan(ptr
, len
, ptr_enc
);
3213 if (str_cr
== ENC_CODERANGE_UNKNOWN
) {
3214 if (ENCODING_IS_ASCII8BIT(str
) || ptr_cr
!= ENC_CODERANGE_7BIT
) {
3215 str_cr
= rb_enc_str_coderange(str
);
3220 *ptr_cr_ret
= ptr_cr
;
3222 if (str_encindex
!= ptr_encindex
&&
3223 str_cr
!= ENC_CODERANGE_7BIT
&&
3224 ptr_cr
!= ENC_CODERANGE_7BIT
) {
3225 str_enc
= rb_enc_from_index(str_encindex
);
3226 ptr_enc
= rb_enc_from_index(ptr_encindex
);
3230 if (str_cr
== ENC_CODERANGE_UNKNOWN
) {
3231 res_encindex
= str_encindex
;
3232 res_cr
= ENC_CODERANGE_UNKNOWN
;
3234 else if (str_cr
== ENC_CODERANGE_7BIT
) {
3235 if (ptr_cr
== ENC_CODERANGE_7BIT
) {
3236 res_encindex
= str_encindex
;
3237 res_cr
= ENC_CODERANGE_7BIT
;
3240 res_encindex
= ptr_encindex
;
3244 else if (str_cr
== ENC_CODERANGE_VALID
) {
3245 res_encindex
= str_encindex
;
3246 if (ENC_CODERANGE_CLEAN_P(ptr_cr
))
3251 else { /* str_cr == ENC_CODERANGE_BROKEN */
3252 res_encindex
= str_encindex
;
3254 if (0 < len
) res_cr
= ENC_CODERANGE_UNKNOWN
;
3258 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3260 str_buf_cat(str
, ptr
, len
);
3261 ENCODING_CODERANGE_SET(str
, res_encindex
, res_cr
);
3265 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
3266 rb_enc_name(str_enc
), rb_enc_name(ptr_enc
));
3267 UNREACHABLE_RETURN(Qundef
);
3271 rb_enc_str_buf_cat(VALUE str
, const char *ptr
, long len
, rb_encoding
*ptr_enc
)
3273 return rb_enc_cr_str_buf_cat(str
, ptr
, len
,
3274 rb_enc_to_index(ptr_enc
), ENC_CODERANGE_UNKNOWN
, NULL
);
3278 rb_str_buf_cat_ascii(VALUE str
, const char *ptr
)
3280 /* ptr must reference NUL terminated ASCII string. */
3281 int encindex
= ENCODING_GET(str
);
3282 rb_encoding
*enc
= rb_enc_from_index(encindex
);
3283 if (rb_enc_asciicompat(enc
)) {
3284 return rb_enc_cr_str_buf_cat(str
, ptr
, strlen(ptr
),
3285 encindex
, ENC_CODERANGE_7BIT
, 0);
3288 char *buf
= ALLOCA_N(char, rb_enc_mbmaxlen(enc
));
3290 unsigned int c
= (unsigned char)*ptr
;
3291 int len
= rb_enc_codelen(c
, enc
);
3292 rb_enc_mbcput(c
, buf
, enc
);
3293 rb_enc_cr_str_buf_cat(str
, buf
, len
,
3294 encindex
, ENC_CODERANGE_VALID
, 0);
3302 rb_str_buf_append(VALUE str
, VALUE str2
)
3306 str2_cr
= ENC_CODERANGE(str2
);
3308 rb_enc_cr_str_buf_cat(str
, RSTRING_PTR(str2
), RSTRING_LEN(str2
),
3309 ENCODING_GET(str2
), str2_cr
, &str2_cr
);
3311 ENC_CODERANGE_SET(str2
, str2_cr
);
3317 rb_str_append(VALUE str
, VALUE str2
)
3320 return rb_str_buf_append(str
, str2
);
3323 #define MIN_PRE_ALLOC_SIZE 48
3325 MJIT_FUNC_EXPORTED VALUE
3326 rb_str_concat_literals(size_t num
, const VALUE
*strary
)
3332 if (UNLIKELY(!num
)) return rb_str_new(0, 0);
3333 if (UNLIKELY(num
== 1)) return rb_str_resurrect(strary
[0]);
3335 for (i
= 0; i
< num
; ++i
) { len
+= RSTRING_LEN(strary
[i
]); }
3336 if (LIKELY(len
< MIN_PRE_ALLOC_SIZE
)) {
3337 str
= rb_str_resurrect(strary
[0]);
3341 str
= rb_str_buf_new(len
);
3342 rb_enc_copy(str
, strary
[0]);
3346 for (i
= s
; i
< num
; ++i
) {
3347 const VALUE v
= strary
[i
];
3348 int encidx
= ENCODING_GET(v
);
3350 rb_enc_cr_str_buf_cat(str
, RSTRING_PTR(v
), RSTRING_LEN(v
),
3351 encidx
, ENC_CODERANGE(v
), NULL
);
3352 if (encidx
!= ENCINDEX_US_ASCII
) {
3353 if (ENCODING_GET_INLINED(str
) == ENCINDEX_US_ASCII
)
3354 rb_enc_set_index(str
, encidx
);
3362 * concat(*objects) -> string
3364 * Concatenates each object in +objects+ to +self+ and returns +self+:
3367 * s.concat('bar', 'baz') # => "foobarbaz"
3368 * s # => "foobarbaz"
3370 * For each given object +object+ that is an \Integer,
3371 * the value is considered a codepoint and converted to a character before concatenation:
3374 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3376 * Related: String#<<, which takes a single argument.
3379 rb_str_concat_multi(int argc
, VALUE
*argv
, VALUE str
)
3381 str_modifiable(str
);
3384 return rb_str_concat(str
, argv
[0]);
3386 else if (argc
> 1) {
3388 VALUE arg_str
= rb_str_tmp_new(0);
3389 rb_enc_copy(arg_str
, str
);
3390 for (i
= 0; i
< argc
; i
++) {
3391 rb_str_concat(arg_str
, argv
[i
]);
3393 rb_str_buf_append(str
, arg_str
);
3401 * string << object -> string
3403 * Concatenates +object+ to +self+ and returns +self+:
3406 * s << 'bar' # => "foobar"
3409 * If +object+ is an \Integer,
3410 * the value is considered a codepoint and converted to a character before concatenation:
3413 * s << 33 # => "foo!"
3415 * Related: String#concat, which takes multiple arguments.
3418 rb_str_concat(VALUE str1
, VALUE str2
)
3421 rb_encoding
*enc
= STR_ENC_GET(str1
);
3424 if (RB_INTEGER_TYPE_P(str2
)) {
3425 if (rb_num_to_uint(str2
, &code
) == 0) {
3427 else if (FIXNUM_P(str2
)) {
3428 rb_raise(rb_eRangeError
, "%ld out of char range", FIX2LONG(str2
));
3431 rb_raise(rb_eRangeError
, "bignum out of char range");
3435 return rb_str_append(str1
, str2
);
3438 encidx
= rb_enc_to_index(enc
);
3439 if (encidx
== ENCINDEX_ASCII
|| encidx
== ENCINDEX_US_ASCII
) {
3440 /* US-ASCII automatically extended to ASCII-8BIT */
3442 buf
[0] = (char)code
;
3444 rb_raise(rb_eRangeError
, "%u out of char range", code
);
3446 rb_str_cat(str1
, buf
, 1);
3447 if (encidx
== ENCINDEX_US_ASCII
&& code
> 127) {
3448 rb_enc_associate_index(str1
, ENCINDEX_ASCII
);
3449 ENC_CODERANGE_SET(str1
, ENC_CODERANGE_VALID
);
3453 long pos
= RSTRING_LEN(str1
);
3454 int cr
= ENC_CODERANGE(str1
);
3458 switch (len
= rb_enc_codelen(code
, enc
)) {
3459 case ONIGERR_INVALID_CODE_POINT_VALUE
:
3460 rb_raise(rb_eRangeError
, "invalid codepoint 0x%X in %s", code
, rb_enc_name(enc
));
3462 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
:
3464 rb_raise(rb_eRangeError
, "%u out of char range", code
);
3467 buf
= ALLOCA_N(char, len
+ 1);
3468 rb_enc_mbcput(code
, buf
, enc
);
3469 if (rb_enc_precise_mbclen(buf
, buf
+ len
+ 1, enc
) != len
) {
3470 rb_raise(rb_eRangeError
, "invalid codepoint 0x%X in %s", code
, rb_enc_name(enc
));
3472 rb_str_resize(str1
, pos
+len
);
3473 memcpy(RSTRING_PTR(str1
) + pos
, buf
, len
);
3474 if (cr
== ENC_CODERANGE_7BIT
&& code
> 127)
3475 cr
= ENC_CODERANGE_VALID
;
3476 ENC_CODERANGE_SET(str1
, cr
);
3483 * prepend(*other_strings) -> string
3485 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3488 * s.prepend('bar', 'baz') # => "barbazfoo"
3489 * s # => "barbazfoo"
3491 * Related: String#concat.
3495 rb_str_prepend_multi(int argc
, VALUE
*argv
, VALUE str
)
3497 str_modifiable(str
);
3500 rb_str_update(str
, 0L, 0L, argv
[0]);
3502 else if (argc
> 1) {
3504 VALUE arg_str
= rb_str_tmp_new(0);
3505 rb_enc_copy(arg_str
, str
);
3506 for (i
= 0; i
< argc
; i
++) {
3507 rb_str_append(arg_str
, argv
[i
]);
3509 rb_str_update(str
, 0L, 0L, arg_str
);
3516 rb_str_hash(VALUE str
)
3518 int e
= ENCODING_GET(str
);
3519 if (e
&& rb_enc_str_coderange(str
) == ENC_CODERANGE_7BIT
) {
3522 return rb_memhash((const void *)RSTRING_PTR(str
), RSTRING_LEN(str
)) ^ e
;
3526 rb_str_hash_cmp(VALUE str1
, VALUE str2
)
3529 const char *ptr1
, *ptr2
;
3530 RSTRING_GETMEM(str1
, ptr1
, len1
);
3531 RSTRING_GETMEM(str2
, ptr2
, len2
);
3532 return (len1
!= len2
||
3533 !rb_str_comparable(str1
, str2
) ||
3534 memcmp(ptr1
, ptr2
, len1
) != 0);
3541 * Returns the integer hash value for +self+.
3542 * The value is based on the length, content and encoding of +self+.
3544 * Related: Object#hash.
3548 rb_str_hash_m(VALUE str
)
3550 st_index_t hval
= rb_str_hash(str
);
3551 return ST2FIX(hval
);
3554 #define lesser(a,b) (((a)>(b))?(b):(a))
3557 rb_str_comparable(VALUE str1
, VALUE str2
)
3562 if (RSTRING_LEN(str1
) == 0) return TRUE
;
3563 if (RSTRING_LEN(str2
) == 0) return TRUE
;
3564 idx1
= ENCODING_GET(str1
);
3565 idx2
= ENCODING_GET(str2
);
3566 if (idx1
== idx2
) return TRUE
;
3567 rc1
= rb_enc_str_coderange(str1
);
3568 rc2
= rb_enc_str_coderange(str2
);
3569 if (rc1
== ENC_CODERANGE_7BIT
) {
3570 if (rc2
== ENC_CODERANGE_7BIT
) return TRUE
;
3571 if (rb_enc_asciicompat(rb_enc_from_index(idx2
)))
3574 if (rc2
== ENC_CODERANGE_7BIT
) {
3575 if (rb_enc_asciicompat(rb_enc_from_index(idx1
)))
3582 rb_str_cmp(VALUE str1
, VALUE str2
)
3585 const char *ptr1
, *ptr2
;
3588 if (str1
== str2
) return 0;
3589 RSTRING_GETMEM(str1
, ptr1
, len1
);
3590 RSTRING_GETMEM(str2
, ptr2
, len2
);
3591 if (ptr1
== ptr2
|| (retval
= memcmp(ptr1
, ptr2
, lesser(len1
, len2
))) == 0) {
3593 if (!rb_str_comparable(str1
, str2
)) {
3594 if (ENCODING_GET(str1
) > ENCODING_GET(str2
))
3600 if (len1
> len2
) return 1;
3603 if (retval
> 0) return 1;
3609 * string == object -> true or false
3610 * string === object -> true or false
3612 * Returns +true+ if +object+ has the same length and content
3613 * as +self+; +false+ otherwise:
3616 * s == 'foo' # => true
3617 * s == 'food' # => false
3618 * s == 'FOO' # => false
3620 * Returns +false+ if the two strings' encodings are not compatible:
3621 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3623 * If +object+ is not an instance of \String but responds to +to_str+, then the
3624 * two strings are compared using <code>object.==</code>.
3628 rb_str_equal(VALUE str1
, VALUE str2
)
3630 if (str1
== str2
) return Qtrue
;
3631 if (!RB_TYPE_P(str2
, T_STRING
)) {
3632 if (!rb_respond_to(str2
, idTo_str
)) {
3635 return rb_equal(str2
, str1
);
3637 return rb_str_eql_internal(str1
, str2
);
3642 * eql?(object) -> true or false
3644 * Returns +true+ if +object+ has the same length and content
3645 * as +self+; +false+ otherwise:
3648 * s.eql?('foo') # => true
3649 * s.eql?('food') # => false
3650 * s.eql?('FOO') # => false
3652 * Returns +false+ if the two strings' encodings are not compatible:
3654 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3658 MJIT_FUNC_EXPORTED VALUE
3659 rb_str_eql(VALUE str1
, VALUE str2
)
3661 if (str1
== str2
) return Qtrue
;
3662 if (!RB_TYPE_P(str2
, T_STRING
)) return Qfalse
;
3663 return rb_str_eql_internal(str1
, str2
);
3668 * string <=> other_string -> -1, 0, 1, or nil
3670 * Compares +self+ and +other_string+, returning:
3672 * - -1 if +other_string+ is larger.
3673 * - 0 if the two are equal.
3674 * - 1 if +other_string+ is smaller.
3675 * - +nil+ if the two are incomparable.
3679 * 'foo' <=> 'foo' # => 0
3680 * 'foo' <=> 'food' # => -1
3681 * 'food' <=> 'foo' # => 1
3682 * 'FOO' <=> 'foo' # => -1
3683 * 'foo' <=> 'FOO' # => 1
3684 * 'foo' <=> 1 # => nil
3689 rb_str_cmp_m(VALUE str1
, VALUE str2
)
3692 VALUE s
= rb_check_string_type(str2
);
3694 return rb_invcmp(str1
, str2
);
3696 result
= rb_str_cmp(str1
, s
);
3697 return INT2FIX(result
);
3700 static VALUE
str_casecmp(VALUE str1
, VALUE str2
);
3701 static VALUE
str_casecmp_p(VALUE str1
, VALUE str2
);
3705 * casecmp(other_string) -> -1, 0, 1, or nil
3707 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3709 * - -1 if <tt>other_string.downcase</tt> is larger.
3710 * - 0 if the two are equal.
3711 * - 1 if <tt>other_string.downcase</tt> is smaller.
3712 * - +nil+ if the two are incomparable.
3716 * 'foo'.casecmp('foo') # => 0
3717 * 'foo'.casecmp('food') # => -1
3718 * 'food'.casecmp('foo') # => 1
3719 * 'FOO'.casecmp('foo') # => 0
3720 * 'foo'.casecmp('FOO') # => 0
3721 * 'foo'.casecmp(1) # => nil
3723 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3725 * Related: String#casecmp?.
3730 rb_str_casecmp(VALUE str1
, VALUE str2
)
3732 VALUE s
= rb_check_string_type(str2
);
3736 return str_casecmp(str1
, s
);
3740 str_casecmp(VALUE str1
, VALUE str2
)
3744 const char *p1
, *p1end
, *p2
, *p2end
;
3746 enc
= rb_enc_compatible(str1
, str2
);
3751 p1
= RSTRING_PTR(str1
); p1end
= RSTRING_END(str1
);
3752 p2
= RSTRING_PTR(str2
); p2end
= RSTRING_END(str2
);
3753 if (single_byte_optimizable(str1
) && single_byte_optimizable(str2
)) {
3754 while (p1
< p1end
&& p2
< p2end
) {
3756 unsigned int c1
= TOLOWER(*p1
& 0xff);
3757 unsigned int c2
= TOLOWER(*p2
& 0xff);
3759 return INT2FIX(c1
< c2
? -1 : 1);
3766 while (p1
< p1end
&& p2
< p2end
) {
3767 int l1
, c1
= rb_enc_ascget(p1
, p1end
, &l1
, enc
);
3768 int l2
, c2
= rb_enc_ascget(p2
, p2end
, &l2
, enc
);
3770 if (0 <= c1
&& 0 <= c2
) {
3774 return INT2FIX(c1
< c2
? -1 : 1);
3778 l1
= rb_enc_mbclen(p1
, p1end
, enc
);
3779 l2
= rb_enc_mbclen(p2
, p2end
, enc
);
3780 len
= l1
< l2
? l1
: l2
;
3781 r
= memcmp(p1
, p2
, len
);
3783 return INT2FIX(r
< 0 ? -1 : 1);
3785 return INT2FIX(l1
< l2
? -1 : 1);
3791 if (RSTRING_LEN(str1
) == RSTRING_LEN(str2
)) return INT2FIX(0);
3792 if (RSTRING_LEN(str1
) > RSTRING_LEN(str2
)) return INT2FIX(1);
3798 * casecmp?(other_string) -> true, false, or nil
3800 * Returns +true+ if +self+ and +other_string+ are equal after
3801 * Unicode case folding, otherwise +false+:
3803 * 'foo'.casecmp?('foo') # => true
3804 * 'foo'.casecmp?('food') # => false
3805 * 'food'.casecmp?('foo') # => false
3806 * 'FOO'.casecmp?('foo') # => true
3807 * 'foo'.casecmp?('FOO') # => true
3809 * Returns +nil+ if the two values are incomparable:
3811 * 'foo'.casecmp?(1) # => nil
3813 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3815 * Related: String#casecmp.
3820 rb_str_casecmp_p(VALUE str1
, VALUE str2
)
3822 VALUE s
= rb_check_string_type(str2
);
3826 return str_casecmp_p(str1
, s
);
3830 str_casecmp_p(VALUE str1
, VALUE str2
)
3833 VALUE folded_str1
, folded_str2
;
3834 VALUE fold_opt
= sym_fold
;
3836 enc
= rb_enc_compatible(str1
, str2
);
3841 folded_str1
= rb_str_downcase(1, &fold_opt
, str1
);
3842 folded_str2
= rb_str_downcase(1, &fold_opt
, str2
);
3844 return rb_str_eql(folded_str1
, folded_str2
);
3848 strseq_core(const char *str_ptr
, const char *str_ptr_end
, long str_len
,
3849 const char *sub_ptr
, long sub_len
, long offset
, rb_encoding
*enc
)
3851 const char *search_start
= str_ptr
;
3852 long pos
, search_len
= str_len
- offset
;
3856 pos
= rb_memsearch(sub_ptr
, sub_len
, search_start
, search_len
, enc
);
3857 if (pos
< 0) return pos
;
3858 t
= rb_enc_right_char_head(search_start
, search_start
+pos
, str_ptr_end
, enc
);
3859 if (t
== search_start
+ pos
) break;
3860 search_len
-= t
- search_start
;
3861 if (search_len
<= 0) return -1;
3862 offset
+= t
- search_start
;
3865 return pos
+ offset
;
3868 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3871 rb_strseq_index(VALUE str
, VALUE sub
, long offset
, int in_byte
)
3873 const char *str_ptr
, *str_ptr_end
, *sub_ptr
;
3874 long str_len
, sub_len
;
3877 enc
= rb_enc_check(str
, sub
);
3878 if (is_broken_string(sub
)) return -1;
3880 str_ptr
= RSTRING_PTR(str
);
3881 str_ptr_end
= RSTRING_END(str
);
3882 str_len
= RSTRING_LEN(str
);
3883 sub_ptr
= RSTRING_PTR(sub
);
3884 sub_len
= RSTRING_LEN(sub
);
3886 if (str_len
< sub_len
) return -1;
3889 long str_len_char
, sub_len_char
;
3890 int single_byte
= single_byte_optimizable(str
);
3891 str_len_char
= (in_byte
|| single_byte
) ? str_len
: str_strlen(str
, enc
);
3892 sub_len_char
= in_byte
? sub_len
: str_strlen(sub
, enc
);
3894 offset
+= str_len_char
;
3895 if (offset
< 0) return -1;
3897 if (str_len_char
- offset
< sub_len_char
) return -1;
3898 if (!in_byte
) offset
= str_offset(str_ptr
, str_ptr_end
, offset
, enc
, single_byte
);
3901 if (sub_len
== 0) return offset
;
3903 /* need proceed one character at a time */
3904 return strseq_core(str_ptr
, str_ptr_end
, str_len
, sub_ptr
, sub_len
, offset
, enc
);
3910 * index(substring, offset = 0) -> integer or nil
3911 * index(regexp, offset = 0) -> integer or nil
3913 * Returns the \Integer index of the first occurrence of the given +substring+,
3914 * or +nil+ if none found:
3916 * 'foo'.index('f') # => 0
3917 * 'foo'.index('o') # => 1
3918 * 'foo'.index('oo') # => 1
3919 * 'foo'.index('ooo') # => nil
3921 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3922 * or +nil+ if none found:
3924 * 'foo'.index(/f/) # => 0
3925 * 'foo'.index(/o/) # => 1
3926 * 'foo'.index(/oo/) # => 1
3927 * 'foo'.index(/ooo/) # => nil
3929 * \Integer argument +offset+, if given, specifies the position in the
3930 * string to begin the search:
3932 * 'foo'.index('o', 1) # => 1
3933 * 'foo'.index('o', 2) # => 2
3934 * 'foo'.index('o', 3) # => nil
3936 * If +offset+ is negative, counts backward from the end of +self+:
3938 * 'foo'.index('o', -1) # => 2
3939 * 'foo'.index('o', -2) # => 1
3940 * 'foo'.index('o', -3) # => 1
3941 * 'foo'.index('o', -4) # => nil
3943 * Related: String#rindex.
3947 rb_str_index_m(int argc
, VALUE
*argv
, VALUE str
)
3953 if (rb_scan_args(argc
, argv
, "11", &sub
, &initpos
) == 2) {
3954 pos
= NUM2LONG(initpos
);
3960 pos
+= str_strlen(str
, NULL
);
3962 if (RB_TYPE_P(sub
, T_REGEXP
)) {
3963 rb_backref_set(Qnil
);
3969 if (RB_TYPE_P(sub
, T_REGEXP
)) {
3970 if (pos
> str_strlen(str
, NULL
))
3972 pos
= str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
3973 rb_enc_check(str
, sub
), single_byte_optimizable(str
));
3975 if (rb_reg_search(sub
, str
, pos
, 0) < 0) {
3979 VALUE match
= rb_backref_get();
3980 struct re_registers
*regs
= RMATCH_REGS(match
);
3981 pos
= rb_str_sublen(str
, BEG(0));
3982 return LONG2NUM(pos
);
3987 pos
= rb_str_index(str
, sub
, pos
);
3988 pos
= rb_str_sublen(str
, pos
);
3991 if (pos
== -1) return Qnil
;
3992 return LONG2NUM(pos
);
3997 str_rindex(VALUE str
, VALUE sub
, const char *s
, long pos
, rb_encoding
*enc
)
3999 char *hit
, *adjusted
;
4001 long slen
, searchlen
;
4004 slen
= RSTRING_LEN(sub
);
4005 if (slen
== 0) return pos
;
4006 sbeg
= RSTRING_PTR(str
);
4007 e
= RSTRING_END(str
);
4008 t
= RSTRING_PTR(sub
);
4010 searchlen
= s
- sbeg
+ 1;
4013 hit
= memrchr(sbeg
, c
, searchlen
);
4015 adjusted
= rb_enc_left_char_head(sbeg
, hit
, e
, enc
);
4016 if (hit
!= adjusted
) {
4017 searchlen
= adjusted
- sbeg
;
4020 if (memcmp(hit
, t
, slen
) == 0)
4021 return rb_str_sublen(str
, hit
- sbeg
);
4022 searchlen
= adjusted
- sbeg
;
4023 } while (searchlen
> 0);
4029 str_rindex(VALUE str
, VALUE sub
, const char *s
, long pos
, rb_encoding
*enc
)
4034 sbeg
= RSTRING_PTR(str
);
4035 e
= RSTRING_END(str
);
4036 t
= RSTRING_PTR(sub
);
4037 slen
= RSTRING_LEN(sub
);
4040 if (memcmp(s
, t
, slen
) == 0) {
4043 if (pos
== 0) break;
4045 s
= rb_enc_prev_char(sbeg
, s
, e
, enc
);
4053 rb_str_rindex(VALUE str
, VALUE sub
, long pos
)
4060 enc
= rb_enc_check(str
, sub
);
4061 if (is_broken_string(sub
)) return -1;
4062 singlebyte
= single_byte_optimizable(str
);
4063 len
= singlebyte
? RSTRING_LEN(str
) : str_strlen(str
, enc
); /* rb_enc_check */
4064 slen
= str_strlen(sub
, enc
); /* rb_enc_check */
4066 /* substring longer than string */
4067 if (len
< slen
) return -1;
4068 if (len
- pos
< slen
) pos
= len
- slen
;
4069 if (len
== 0) return pos
;
4071 sbeg
= RSTRING_PTR(str
);
4074 if (memcmp(sbeg
, RSTRING_PTR(sub
), RSTRING_LEN(sub
)) == 0)
4080 s
= str_nth(sbeg
, RSTRING_END(str
), pos
, enc
, singlebyte
);
4081 return str_rindex(str
, sub
, s
, pos
, enc
);
4086 * rindex(substring, offset = self.length) -> integer or nil
4087 * rindex(regexp, offset = self.length) -> integer or nil
4089 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4090 * or +nil+ if none found:
4092 * 'foo'.rindex('f') # => 0
4093 * 'foo'.rindex('o') # => 2
4094 * 'foo'.rindex('oo') # => 1
4095 * 'foo'.rindex('ooo') # => nil
4097 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4098 * or +nil+ if none found:
4100 * 'foo'.rindex(/f/) # => 0
4101 * 'foo'.rindex(/o/) # => 2
4102 * 'foo'.rindex(/oo/) # => 1
4103 * 'foo'.rindex(/ooo/) # => nil
4105 * The _last_ match means starting at the possible last position, not
4106 * the last of longest matches.
4108 * 'foo'.rindex(/o+/) # => 2
4109 * $~ #=> #<MatchData "o">
4111 * To get the last longest match, needs to combine with negative
4114 * 'foo'.rindex(/(?<!o)o+/) # => 1
4115 * $~ #=> #<MatchData "oo">
4117 * Or String#index with negative lookahead:
4119 * 'foo'.index(/o+(?!.*o)/) # => 1
4120 * $~ #=> #<MatchData "oo">
4122 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4123 * string to _end_ the search:
4125 * 'foo'.rindex('o', 0) # => nil
4126 * 'foo'.rindex('o', 1) # => 1
4127 * 'foo'.rindex('o', 2) # => 2
4128 * 'foo'.rindex('o', 3) # => 2
4130 * If +offset+ is a negative \Integer, the maximum starting position in the
4131 * string to _end_ the search is the sum of the string's length and +offset+:
4133 * 'foo'.rindex('o', -1) # => 2
4134 * 'foo'.rindex('o', -2) # => 1
4135 * 'foo'.rindex('o', -3) # => nil
4136 * 'foo'.rindex('o', -4) # => nil
4138 * Related: String#index.
4142 rb_str_rindex_m(int argc
, VALUE
*argv
, VALUE str
)
4146 rb_encoding
*enc
= STR_ENC_GET(str
);
4147 long pos
, len
= str_strlen(str
, enc
); /* str's enc */
4149 if (rb_scan_args(argc
, argv
, "11", &sub
, &vpos
) == 2) {
4150 pos
= NUM2LONG(vpos
);
4154 if (RB_TYPE_P(sub
, T_REGEXP
)) {
4155 rb_backref_set(Qnil
);
4160 if (pos
> len
) pos
= len
;
4166 if (RB_TYPE_P(sub
, T_REGEXP
)) {
4167 /* enc = rb_get_check(str, sub); */
4168 pos
= str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
4169 enc
, single_byte_optimizable(str
));
4171 if (rb_reg_search(sub
, str
, pos
, 1) >= 0) {
4172 VALUE match
= rb_backref_get();
4173 struct re_registers
*regs
= RMATCH_REGS(match
);
4174 pos
= rb_str_sublen(str
, BEG(0));
4175 return LONG2NUM(pos
);
4180 pos
= rb_str_rindex(str
, sub
, pos
);
4181 if (pos
>= 0) return LONG2NUM(pos
);
4188 * string =~ regexp -> integer or nil
4189 * string =~ object -> integer or nil
4191 * Returns the \Integer index of the first substring that matches
4192 * the given +regexp+, or +nil+ if no match found:
4194 * 'foo' =~ /f/ # => 0
4195 * 'foo' =~ /o/ # => 1
4196 * 'foo' =~ /x/ # => nil
4198 * Note: also updates
4199 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4201 * If the given +object+ is not a \Regexp, returns the value
4202 * returned by <tt>object =~ self</tt>.
4204 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4205 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4208 * "no. 9" =~ /(?<number>\d+)/
4209 * number # => nil (not assigned)
4210 * /(?<number>\d+)/ =~ "no. 9"
4216 rb_str_match(VALUE x
, VALUE y
)
4218 switch (OBJ_BUILTIN_TYPE(y
)) {
4220 rb_raise(rb_eTypeError
, "type mismatch: String given");
4223 return rb_reg_match(y
, x
);
4226 return rb_funcall(y
, idEqTilde
, 1, x
);
4231 static VALUE
get_pat(VALUE
);
4236 * match(pattern, offset = 0) -> matchdata or nil
4237 * match(pattern, offset = 0) {|matchdata| ... } -> object
4239 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4241 * Note: also updates
4242 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4244 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4245 * regexp = Regexp.new(pattern)
4246 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4247 * (see Regexp#match):
4248 * matchdata = regexp.match(self)
4250 * With no block given, returns the computed +matchdata+:
4252 * 'foo'.match('f') # => #<MatchData "f">
4253 * 'foo'.match('o') # => #<MatchData "o">
4254 * 'foo'.match('x') # => nil
4256 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4258 * 'foo'.match('f', 1) # => nil
4259 * 'foo'.match('o', 1) # => #<MatchData "o">
4261 * With a block given, calls the block with the computed +matchdata+
4262 * and returns the block's return value:
4264 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4265 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4266 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4271 rb_str_match_m(int argc
, VALUE
*argv
, VALUE str
)
4275 rb_check_arity(argc
, 1, 2);
4278 result
= rb_funcallv(get_pat(re
), rb_intern("match"), argc
, argv
);
4279 if (!NIL_P(result
) && rb_block_given_p()) {
4280 return rb_yield(result
);
4287 * match?(pattern, offset = 0) -> true or false
4289 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4291 * Note: does not update
4292 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4294 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4295 * regexp = Regexp.new(pattern)
4297 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4298 * +false+ otherwise:
4300 * 'foo'.match?(/o/) # => true
4301 * 'foo'.match?('o') # => true
4302 * 'foo'.match?(/x/) # => false
4304 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4305 * 'foo'.match?('f', 1) # => false
4306 * 'foo'.match?('o', 1) # => true
4311 rb_str_match_m_p(int argc
, VALUE
*argv
, VALUE str
)
4314 rb_check_arity(argc
, 1, 2);
4315 re
= get_pat(argv
[0]);
4316 return rb_reg_match_p(re
, str
, argc
> 1 ? NUM2LONG(argv
[1]) : 0);
4319 enum neighbor_char
{
/*
 * Overwrites the len-byte character at p, in place, with a following
 * character in encoding enc, and reports the outcome as a neighbor_char.
 *
 * Two strategies are visible:
 *  - encodings whose minimum character length is > 1: take the code point,
 *    add one, and re-encode it into p;
 *  - all other encodings: increment the bytes directly, skipping trailing
 *    0xff bytes, then validate the result with rb_enc_precise_mbclen().
 *
 * NOTE(review): several lines (local declarations, closing braces, some
 * branch bodies) are missing from this listing; the comments below describe
 * only the statements that are visible.
 */
4325 static enum neighbor_char
4326 enc_succ_char(char *p
, long len
, rb_encoding
*enc
)
4331 if (rb_enc_mbminlen(enc
) > 1) {
4332 /* wchar, trivial case */
4333 int r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
), c
;
/* The input itself must be a complete, valid character. */
4334 if (!MBCLEN_CHARFOUND_P(r
)) {
4335 return NEIGHBOR_NOT_CHAR
;
/* Next code point. */
4337 c
= rb_enc_mbc_to_codepoint(p
, p
+ len
, enc
) + 1;
4338 l
= rb_enc_code_to_mbclen(c
, enc
);
4339 if (!l
) return NEIGHBOR_NOT_CHAR
;
/* A different encoded length is reported as a wrap-around. */
4340 if (l
!= len
) return NEIGHBOR_WRAPPED
;
4341 rb_enc_mbcput(c
, p
, enc
);
/* Re-validate what was just written. */
4342 r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
);
4343 if (!MBCLEN_CHARFOUND_P(r
)) {
4344 return NEIGHBOR_NOT_CHAR
;
4346 return NEIGHBOR_FOUND
;
/* Byte-wise path: scan from the last byte towards the first, skipping 0xff bytes. */
4349 for (i
= len
-1; 0 <= i
&& (unsigned char)p
[i
] == 0xff; i
--)
4352 return NEIGHBOR_WRAPPED
;
/* Increment the byte the scan stopped on. */
4353 ++((unsigned char*)p
)[i
];
4354 l
= rb_enc_precise_mbclen(p
, p
+len
, enc
);
4355 if (MBCLEN_CHARFOUND_P(l
)) {
4356 l
= MBCLEN_CHARFOUND_LEN(l
);
4358 return NEIGHBOR_FOUND
;
/* Fill the bytes after the first l bytes with 0xff. */
4361 memset(p
+l
, 0xff, len
-l
);
4364 if (MBCLEN_INVALID_P(l
) && i
< len
-1) {
/* Probe shorter prefixes of p, from len-1 bytes down to 1. */
4367 for (len2
= len
-1; 0 < len2
; len2
--) {
4368 l2
= rb_enc_precise_mbclen(p
, p
+len2
, enc
);
4369 if (!MBCLEN_INVALID_P(l2
))
/* Fill everything after byte len2 with 0xff. */
4372 memset(p
+len2
+1, 0xff, len
-(len2
+1));
/*
 * Counterpart of enc_succ_char that moves towards the preceding character:
 * overwrites the len-byte character at p, in place, and reports the outcome
 * as a neighbor_char.
 *
 * Two strategies are visible:
 *  - encodings whose minimum character length is > 1: work on the code
 *    point (a code point of 0 is rejected with NEIGHBOR_NOT_CHAR);
 *  - all other encodings: decrement the bytes directly, skipping trailing
 *    zero bytes, then validate the result with rb_enc_precise_mbclen().
 *
 * NOTE(review): several lines (local declarations, the code point
 * decrement, closing braces, some branch bodies) are missing from this
 * listing; the comments below describe only the statements that are visible.
 */
4377 static enum neighbor_char
4378 enc_pred_char(char *p
, long len
, rb_encoding
*enc
)
4382 if (rb_enc_mbminlen(enc
) > 1) {
4383 /* wchar, trivial case */
4384 int r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
), c
;
/* The input itself must be a complete, valid character. */
4385 if (!MBCLEN_CHARFOUND_P(r
)) {
4386 return NEIGHBOR_NOT_CHAR
;
4388 c
= rb_enc_mbc_to_codepoint(p
, p
+ len
, enc
);
/* Code point 0 is rejected. */
4389 if (!c
) return NEIGHBOR_NOT_CHAR
;
4391 l
= rb_enc_code_to_mbclen(c
, enc
);
4392 if (!l
) return NEIGHBOR_NOT_CHAR
;
/* A different encoded length is reported as a wrap-around. */
4393 if (l
!= len
) return NEIGHBOR_WRAPPED
;
4394 rb_enc_mbcput(c
, p
, enc
);
/* Re-validate what was just written. */
4395 r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
);
4396 if (!MBCLEN_CHARFOUND_P(r
)) {
4397 return NEIGHBOR_NOT_CHAR
;
4399 return NEIGHBOR_FOUND
;
/* Byte-wise path: scan from the last byte towards the first, skipping zero bytes. */
4402 for (i
= len
-1; 0 <= i
&& (unsigned char)p
[i
] == 0; i
--)
4405 return NEIGHBOR_WRAPPED
;
/* Decrement the byte the scan stopped on. */
4406 --((unsigned char*)p
)[i
];
4407 l
= rb_enc_precise_mbclen(p
, p
+len
, enc
);
4408 if (MBCLEN_CHARFOUND_P(l
)) {
4409 l
= MBCLEN_CHARFOUND_LEN(l
);
4411 return NEIGHBOR_FOUND
;
/* Zero the bytes after the first l bytes. */
4414 memset(p
+l
, 0, len
-l
);
4417 if (MBCLEN_INVALID_P(l
) && i
< len
-1) {
/* Probe shorter prefixes of p, from len-1 bytes down to 1. */
4420 for (len2
= len
-1; 0 < len2
; len2
--) {
4421 l2
= rb_enc_precise_mbclen(p
, p
+len2
, enc
);
4422 if (!MBCLEN_INVALID_P(l2
))
/* Zero everything after byte len2. */
4425 memset(p
+len2
+1, 0, len
-(len2
+1));
4431 Overwrites +p+ with the succeeding letter in +enc+ and returns
4432 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4433 When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into carry.
4434 This assumes that each range is contiguous and that mbclen
4435 never changes within a range.
4436 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
/*
 * Successor of a single alphanumeric character, staying inside its class.
 *
 * p/len  the character to overwrite in place
 * enc    its encoding
 * carry  output buffer that receives the carried-out character when
 *        NEIGHBOR_WRAPPED is returned
 *
 * Visible flow:
 *  1. classify the character as DIGIT or ALPHA (anything else returns
 *     NEIGHBOR_NOT_CHAR);
 *  2. try enc_succ_char() up to max_gaps + 1 times, accepting the first
 *     result that is still of the same ctype;
 *  3. otherwise restore p and walk backwards with enc_pred_char() while the
 *     character stays in the same ctype;
 *  4. copy the resulting character into carry; for digits carry is advanced
 *     once more with enc_succ_char(); return NEIGHBOR_WRAPPED.
 *
 * NOTE(review): loop headers, some declarations and closing braces are
 * missing from this listing, so the exact loop structure of step 3 should be
 * confirmed against the complete source.
 */
4439 static enum neighbor_char
4440 enc_succ_alnum_char(char *p
, long len
, rb_encoding
*enc
, char *carry
)
4442 enum neighbor_char ret
;
/* Scratch copy used to undo a rejected step. */
4446 char save
[ONIGENC_CODE_TO_MBC_MAXLEN
];
4448 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4450 const int max_gaps
= 1;
4452 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4453 if (rb_enc_isctype(c
, ONIGENC_CTYPE_DIGIT
, enc
))
4454 ctype
= ONIGENC_CTYPE_DIGIT
;
4455 else if (rb_enc_isctype(c
, ONIGENC_CTYPE_ALPHA
, enc
))
4456 ctype
= ONIGENC_CTYPE_ALPHA
;
4458 return NEIGHBOR_NOT_CHAR
;
4460 MEMCPY(save
, p
, char, len
);
/* Forward search, tolerating up to max_gaps characters of another ctype. */
4461 for (try = 0; try <= max_gaps
; ++try) {
4462 ret
= enc_succ_char(p
, len
, enc
);
4463 if (ret
== NEIGHBOR_FOUND
) {
4464 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4465 if (rb_enc_isctype(c
, ctype
, enc
))
4466 return NEIGHBOR_FOUND
;
/* No successor of the same ctype: restore the original character. */
4469 MEMCPY(p
, save
, char, len
);
/* Backward walk with enc_pred_char while the ctype is preserved. */
4472 MEMCPY(save
, p
, char, len
);
4473 ret
= enc_pred_char(p
, len
, enc
);
4474 if (ret
== NEIGHBOR_FOUND
) {
4475 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4476 if (!rb_enc_isctype(c
, ctype
, enc
)) {
/* Stepped out of the ctype: undo the last step. */
4477 MEMCPY(p
, save
, char, len
);
4482 MEMCPY(p
, save
, char, len
);
4488 return NEIGHBOR_NOT_CHAR
;
/* Non-digits: the carry is the character now stored in p. */
4491 if (ctype
!= ONIGENC_CTYPE_DIGIT
) {
4492 MEMCPY(carry
, p
, char, len
);
4493 return NEIGHBOR_WRAPPED
;
/* Digits: the carry is the successor of the character now stored in p. */
4496 MEMCPY(carry
, p
, char, len
);
4497 enc_succ_char(carry
, len
, enc
);
4498 return NEIGHBOR_WRAPPED
;
/* Forward declaration: str_succ() is called by rb_str_succ() before its definition appears. */
4502 static VALUE
str_succ(VALUE str
);
4508 * Returns the successor to +self+. The successor is calculated by
4509 * incrementing characters.
4511 * The first character to be incremented is the rightmost alphanumeric,
4512 * or, if no alphanumerics, the rightmost character:
4514 * 'THX1138'.succ # => "THX1139"
4515 * '<<koala>>'.succ # => "<<koalb>>"
4516 * '***'.succ # => '**+'
4518 * The successor to a digit is another digit, "carrying" to the next-left
4519 * character for a "rollover" from 9 to 0, and prepending another digit
4522 * '00'.succ # => "01"
4523 * '09'.succ # => "10"
4524 * '99'.succ # => "100"
4526 * The successor to a letter is another letter of the same case,
4527 * carrying to the next-left character for a rollover,
4528 * and prepending another same-case letter if necessary:
4530 * 'aa'.succ # => "ab"
4531 * 'az'.succ # => "ba"
4532 * 'zz'.succ # => "aaa"
4533 * 'AA'.succ # => "AB"
4534 * 'AZ'.succ # => "BA"
4535 * 'ZZ'.succ # => "AAA"
4537 * The successor to a non-alphanumeric character is the next character
4538 * in the underlying character set's collating sequence,
4539 * carrying to the next-left character for a rollover,
4540 * and prepending another character if necessary:
4543 * s # => "\x00\x00\x00"
4544 * s.succ # => "\x00\x00\x01"
4546 * s # => "\xFF\xFF\xFF"
4547 * s.succ # => "\x01\x00\x00\x00"
4549 * Carrying can occur between and among mixtures of alphanumeric characters:
4552 * s.succ # => "aaa00aa00"
4554 * s.succ # => "100aa00aa"
4556 * The successor to an empty \String is a new empty \String:
4560 * String#next is an alias for String#succ.
/*
 * Non-destructive successor: builds a new string from orig's bytes, copies
 * encoding/coderange information for a substring from orig, and returns the
 * result of str_succ() applied to that copy.
 */
4564 rb_str_succ(VALUE orig
)
4567 str
= rb_str_new(RSTRING_PTR(orig
), RSTRING_LEN(orig
));
4568 rb_enc_cr_str_copy_for_substr(str
, orig
);
4569 return str_succ(str
);
/*
 * Body of the in-place successor routine called by rb_str_succ().
 * NOTE(review): the signature line, several declarations, switch headers,
 * break statements and closing braces are missing from this listing.
 *
 * Visible flow:
 *  - an empty string is returned unchanged;
 *  - first pass: walk the characters from the end with rb_enc_prev_char()
 *    and apply enc_succ_alnum_char() to each valid character, remembering
 *    where a carry must be inserted (carry_pos);
 *  - second pass, only when no alphanumeric was found: walk again from the
 *    end applying enc_succ_char() to a temporary copy of each character;
 *  - finally grow the buffer, insert the carry bytes at carry_pos, set the
 *    new length, terminate, and recompute the coderange.
 */
4576 char *sbeg
, *s
, *e
, *last_alnum
= 0;
4577 int found_alnum
= 0;
/* Default carry is the single byte 0x01. */
4579 char carry
[ONIGENC_CODE_TO_MBC_MAXLEN
] = "\1";
4580 long carry_pos
= 0, carry_len
= 1;
4581 enum neighbor_char neighbor
= NEIGHBOR_FOUND
;
4583 slen
= RSTRING_LEN(str
);
4584 if (slen
== 0) return str
;
4586 enc
= STR_ENC_GET(str
);
4587 sbeg
= RSTRING_PTR(str
);
4588 s
= e
= sbeg
+ slen
;
/* First pass: alphanumerics, rightmost character first. */
4590 while ((s
= rb_enc_prev_char(sbeg
, s
, e
, enc
)) != 0) {
4591 if (neighbor
== NEIGHBOR_NOT_CHAR
&& last_alnum
) {
/* True when the previous alnum and this character are alpha/digit of opposite kinds. */
4592 if (ISALPHA(*last_alnum
) ? ISDIGIT(*s
) :
4593 ISDIGIT(*last_alnum
) ? ISALPHA(*s
) : 0) {
4597 l
= rb_enc_precise_mbclen(s
, e
, enc
);
/* Skip byte sequences that are not a complete valid character. */
4598 if (!ONIGENC_MBCLEN_CHARFOUND_P(l
)) continue;
4599 l
= ONIGENC_MBCLEN_CHARFOUND_LEN(l
);
4600 neighbor
= enc_succ_alnum_char(s
, l
, enc
, carry
);
4602 case NEIGHBOR_NOT_CHAR
:
4604 case NEIGHBOR_FOUND
:
4606 case NEIGHBOR_WRAPPED
:
4611 carry_pos
= s
- sbeg
;
4614 if (!found_alnum
) { /* str contains no alnum */
/* Second pass: plain character successor, rightmost character first. */
4616 while ((s
= rb_enc_prev_char(sbeg
, s
, e
, enc
)) != 0) {
4617 enum neighbor_char neighbor
;
4618 char tmp
[ONIGENC_CODE_TO_MBC_MAXLEN
];
4619 l
= rb_enc_precise_mbclen(s
, e
, enc
);
4620 if (!ONIGENC_MBCLEN_CHARFOUND_P(l
)) continue;
4621 l
= ONIGENC_MBCLEN_CHARFOUND_LEN(l
);
/* Work on a copy so the string is only updated for selected outcomes. */
4622 MEMCPY(tmp
, s
, char, l
);
4623 neighbor
= enc_succ_char(tmp
, l
, enc
);
4625 case NEIGHBOR_FOUND
:
4626 MEMCPY(s
, tmp
, char, l
);
4629 case NEIGHBOR_WRAPPED
:
4630 MEMCPY(s
, tmp
, char, l
);
4632 case NEIGHBOR_NOT_CHAR
:
4635 if (rb_enc_precise_mbclen(s
, s
+l
, enc
) != l
) {
4636 /* wrapped to \0...\0. search next valid char. */
4637 enc_succ_char(s
, l
, enc
);
/* For non-ASCII-compatible encodings the carry is taken from the character itself. */
4639 if (!rb_enc_asciicompat(enc
)) {
4640 MEMCPY(carry
, s
, char, l
);
4643 carry_pos
= s
- sbeg
;
4645 ENC_CODERANGE_SET(str
, ENC_CODERANGE_UNKNOWN
);
/* Make room for the carry and insert it at carry_pos. */
4647 RESIZE_CAPA(str
, slen
+ carry_len
);
/* Re-read the buffer pointer after resizing. */
4648 sbeg
= RSTRING_PTR(str
);
4649 s
= sbeg
+ carry_pos
;
4650 memmove(s
+ carry_len
, s
, slen
- carry_pos
);
4651 memmove(s
, carry
, carry_len
);
4653 STR_SET_LEN(str
, slen
);
4654 TERM_FILL(&sbeg
[slen
], rb_enc_mbminlen(enc
));
4655 rb_enc_str_coderange(str
);
4664 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4666 * String#next! is an alias for String#succ!.
/*
 * Entry point for the in-place successor method described in the
 * documentation fragment that precedes it.
 * NOTE(review): only the signature is visible in this listing; the body
 * presumably modifies str and delegates to str_succ() -- confirm.
 */
4670 rb_str_succ_bang(VALUE str
)
/*
 * Predicate over the len bytes starting at s.  The visible statement returns
 * 0 as soon as a byte fails ISDIGIT.
 * NOTE(review): the loop header and the final return are not visible in
 * this listing; presumably every byte is tested and 1 is returned when all
 * are digits -- confirm.
 */
4678 all_digits_p(const char *s
, long len
)
4681 if (!ISDIGIT(*s
)) return 0;
/*
 * Per-element callback that rb_str_upto() hands to rb_str_upto_each().
 * NOTE(review): only the signature is visible in this listing; presumably
 * it yields str to the block -- confirm.
 */
4688 str_upto_i(VALUE str
, VALUE arg
)
4696 * upto(other_string, exclusive = false) {|string| ... } -> self
4697 * upto(other_string, exclusive = false) -> new_enumerator
4699 * With a block given, calls the block with each \String value
4700 * returned by successive calls to String#succ;
4701 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4702 * the sequence terminates when value +other_string+ is reached;
4705 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4708 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4710 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4712 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4716 * a8 a9 b0 b1 b2 b3 b4 b5
4718 * If +other_string+ would not be reached, does not call the block:
4720 * '25'.upto('5') {|s| fail s }
4721 * 'aa'.upto('a') {|s| fail s }
4723 * With no block given, returns a new \Enumerator:
4725 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
/*
 * Implementation behind the "upto" documentation fragment that precedes it.
 * Parses one required argument (end) and one optional argument (exclusive),
 * returns an enumerator when no block is given, and otherwise iterates via
 * rb_str_upto_each() using str_upto_i as the per-element callback.
 */
4730 rb_str_upto(int argc
, VALUE
*argv
, VALUE beg
)
4732 VALUE end
, exclusive
;
/* "11": one mandatory and one optional argument. */
4734 rb_scan_args(argc
, argv
, "11", &end
, &exclusive
);
4735 RETURN_ENUMERATOR(beg
, argc
, argv
);
4736 return rb_str_upto_each(beg
, end
, RTEST(exclusive
), str_upto_i
, Qnil
);
/*
 * Calls each(string, arg) for every string from beg up to end.
 *
 * beg, end  range boundaries (their encodings must be compatible)
 * excl      non-zero to leave out end itself
 * each      callback; a non-zero return value stops the iteration
 * arg       opaque value passed through to each
 *
 * Three strategies are visible:
 *  1. both boundaries are single ASCII characters: iterate over char values;
 *  2. both boundaries consist only of ASCII digits: iterate numerically and
 *     format each value zero-padded to the width of beg (a long-based loop
 *     when both values are fixnums, otherwise generic method calls);
 *  3. general case: repeatedly call the "succ" method until the successor of
 *     end is reached or the current string becomes longer than end or empty.
 *
 * NOTE(review): declarations, loop headers, increments and the return
 * statements of the individual paths are missing from this listing.
 */
4740 rb_str_upto_each(VALUE beg
, VALUE end
, int excl
, int (*each
)(VALUE
, VALUE
), VALUE arg
)
4742 VALUE current
, after_end
;
4747 CONST_ID(succ
, "succ");
4749 enc
= rb_enc_check(beg
, end
);
4750 ascii
= (is_ascii_string(beg
) && is_ascii_string(end
));
4751 /* single character */
4752 if (RSTRING_LEN(beg
) == 1 && RSTRING_LEN(end
) == 1 && ascii
) {
4753 char c
= RSTRING_PTR(beg
)[0];
4754 char e
= RSTRING_PTR(end
)[0];
/* Empty range: nothing to iterate. */
4756 if (c
> e
|| (excl
&& c
== e
)) return beg
;
4758 if ((*each
)(rb_enc_str_new(&c
, 1, enc
), arg
)) break;
4759 if (!excl
&& c
== e
) break;
4761 if (excl
&& c
== e
) break;
4765 /* both edges are all digits */
4766 if (ascii
&& ISDIGIT(RSTRING_PTR(beg
)[0]) && ISDIGIT(RSTRING_PTR(end
)[0]) &&
4767 all_digits_p(RSTRING_PTR(beg
), RSTRING_LEN(beg
)) &&
4768 all_digits_p(RSTRING_PTR(end
), RSTRING_LEN(end
))) {
/* Output is padded to the width of beg. */
4772 width
= RSTRING_LENINT(beg
);
4773 b
= rb_str_to_inum(beg
, 10, FALSE
);
4774 e
= rb_str_to_inum(end
, 10, FALSE
);
/* Fast path: both boundaries fit in a fixnum. */
4775 if (FIXNUM_P(b
) && FIXNUM_P(e
)) {
4776 long bi
= FIX2LONG(b
);
4777 long ei
= FIX2LONG(e
);
4778 rb_encoding
*usascii
= rb_usascii_encoding();
4781 if (excl
&& bi
== ei
) break;
4782 if ((*each
)(rb_enc_sprintf(usascii
, "%.*ld", width
, bi
), arg
)) break;
/* Generic path: compare and advance through method calls. */
4787 ID op
= excl
? '<' : idLE
;
4788 VALUE args
[2], fmt
= rb_fstring_lit("%.*d");
4790 args
[0] = INT2FIX(width
);
4791 while (rb_funcall(b
, op
, 1, e
)) {
4793 if ((*each
)(rb_str_format(numberof(args
), args
, fmt
), arg
)) break;
4794 b
= rb_funcallv(b
, succ
, 0, 0);
/* General case: compare the boundaries first. */
4800 n
= rb_str_cmp(beg
, end
);
4801 if (n
> 0 || (excl
&& n
== 0)) return beg
;
/* Iterate until the successor of end is reached. */
4803 after_end
= rb_funcallv(end
, succ
, 0, 0);
4804 current
= str_duplicate(rb_cString
, beg
);
4805 while (!rb_str_equal(current
, after_end
)) {
/* The successor is computed before current is handed to the callback. */
4807 if (excl
|| !rb_str_equal(current
, end
))
4808 next
= rb_funcallv(current
, succ
, 0, 0);
4809 if ((*each
)(current
, arg
)) break;
4810 if (NIL_P(next
)) break;
4812 StringValue(current
);
4813 if (excl
&& rb_str_equal(current
, end
)) break;
4814 if (RSTRING_LEN(current
) > RSTRING_LEN(end
) || RSTRING_LEN(current
) == 0)
/*
 * Variant of rb_str_upto_each without an upper boundary: calls
 * each(string, arg) for beg and its successors until the callback returns
 * non-zero.
 *
 * Two strategies are visible:
 *  1. beg consists only of ASCII digits: iterate numerically, formatting
 *     each value zero-padded to the width of beg (a long-based loop while
 *     the value is FIXABLE, then generic "succ" method calls);
 *  2. otherwise: repeatedly call the "succ" method; an empty current string
 *     is tested for at the end of each step.
 *
 * NOTE(review): declarations, loop headers, increments and return
 * statements are missing from this listing.
 */
4822 rb_str_upto_endless_each(VALUE beg
, int (*each
)(VALUE
, VALUE
), VALUE arg
)
4827 CONST_ID(succ
, "succ");
4828 /* both edges are all digits */
4829 if (is_ascii_string(beg
) && ISDIGIT(RSTRING_PTR(beg
)[0]) &&
4830 all_digits_p(RSTRING_PTR(beg
), RSTRING_LEN(beg
))) {
4831 VALUE b
, args
[2], fmt
= rb_fstring_lit("%.*d");
/* Output is padded to the width of beg. */
4832 int width
= RSTRING_LENINT(beg
);
4833 b
= rb_str_to_inum(beg
, 10, FALSE
);
4835 long bi
= FIX2LONG(b
);
4836 rb_encoding
*usascii
= rb_usascii_encoding();
/* Long-based loop for as long as the value still fits a fixnum. */
4838 while (FIXABLE(bi
)) {
4839 if ((*each
)(rb_enc_sprintf(usascii
, "%.*ld", width
, bi
), arg
)) break;
4844 args
[0] = INT2FIX(width
);
4847 if ((*each
)(rb_str_format(numberof(args
), args
, fmt
), arg
)) break;
4848 b
= rb_funcallv(b
, succ
, 0, 0);
/* General case: walk the successors of beg. */
4852 current
= str_duplicate(rb_cString
, beg
);
/* The successor is computed before current is handed to the callback. */
4854 VALUE next
= rb_funcallv(current
, succ
, 0, 0);
4855 if ((*each
)(current
, arg
)) break;
4857 StringValue(current
);
4858 if (RSTRING_LEN(current
) == 0)
/*
 * Per-element callback used by rb_str_include_range_p().  arg is the
 * address of a VALUE holding the string being searched for; elements that
 * are not rb_equal to it return 0 so that the iteration continues.
 * NOTE(review): the statements executed on a match are not visible in this
 * listing; the caller tests NIL_P on that VALUE afterwards, so a match
 * presumably stores nil there and returns non-zero -- confirm.
 */
4866 include_range_i(VALUE str
, VALUE arg
)
4868 VALUE
*argp
= (VALUE
*)arg
;
4869 if (!rb_equal(str
, *argp
)) return 0;
/*
 * Tests whether val belongs to the string range beg..end (or beg...end when
 * exclusive is truthy).
 *
 * Visible flow:
 *  - beg and end are replaced by frozen copies;
 *  - a nil val, or a val that cannot be converted to a string, yields Qfalse;
 *  - when all three strings use ASCII-compatible encodings, single-character
 *    boundaries get a direct character comparison, and all-digit boundaries
 *    get a dedicated branch (its body is not visible in this listing);
 *  - otherwise the range is enumerated with rb_str_upto_each() using
 *    include_range_i, and the result is true when val ended up nil.
 *
 * NOTE(review): several declarations, branch bodies and closing braces are
 * missing from this listing.
 */
4875 rb_str_include_range_p(VALUE beg
, VALUE end
, VALUE val
, VALUE exclusive
)
4877 beg
= rb_str_new_frozen(beg
);
4879 end
= rb_str_new_frozen(end
);
4880 if (NIL_P(val
)) return Qfalse
;
4881 val
= rb_check_string_type(val
);
4882 if (NIL_P(val
)) return Qfalse
;
4883 if (rb_enc_asciicompat(STR_ENC_GET(beg
)) &&
4884 rb_enc_asciicompat(STR_ENC_GET(end
)) &&
4885 rb_enc_asciicompat(STR_ENC_GET(val
))) {
4886 const char *bp
= RSTRING_PTR(beg
);
4887 const char *ep
= RSTRING_PTR(end
);
4888 const char *vp
= RSTRING_PTR(val
);
/* Single-character boundaries. */
4889 if (RSTRING_LEN(beg
) == 1 && RSTRING_LEN(end
) == 1) {
4890 if (RSTRING_LEN(val
) == 0 || RSTRING_LEN(val
) > 1)
4897 if (ISASCII(b
) && ISASCII(e
) && ISASCII(v
)) {
/* Strictly inside the range, or equal to the end of an inclusive range. */
4898 if (b
<= v
&& v
< e
) return Qtrue
;
4899 return RBOOL(!RTEST(exclusive
) && v
== e
);
4904 /* both edges are all digits */
4905 if (ISDIGIT(*bp
) && ISDIGIT(*ep
) &&
4906 all_digits_p(bp
, RSTRING_LEN(beg
)) &&
4907 all_digits_p(ep
, RSTRING_LEN(end
))) {
/* Fallback: enumerate the range; the callback receives the address of val. */
4912 rb_str_upto_each(beg
, end
, RTEST(exclusive
), include_range_i
, (VALUE
)&val
);
4914 return RBOOL(NIL_P(val
));
/*
 * Searches str for regexp re starting at position 0 and, on a match,
 * returns the capture selected by backref: the back-reference is resolved
 * to a group number with rb_reg_backref_number() and fetched with
 * rb_reg_nth_match().
 * NOTE(review): the return value for the no-match case is not visible in
 * this listing.
 */
4918 rb_str_subpat(VALUE str
, VALUE re
, VALUE backref
)
4920 if (rb_reg_search(re
, str
, 0, 0) >= 0) {
4921 VALUE match
= rb_backref_get();
4922 int nth
= rb_reg_backref_number(match
, backref
);
4923 return rb_reg_nth_match(nth
, match
);
/*
 * Single-argument form of String#[]; dispatches on the type of indx:
 *  - Fixnum: one-character substring at that index;
 *  - Regexp: whole match (capture 0) via rb_str_subpat();
 *  - String: a duplicate of indx when it occurs in str;
 *  - otherwise: try to interpret indx as a Range, and failing that convert
 *    it with NUM2LONG and treat it as an index.
 * NOTE(review): the switch cases of the Range branch and the nil returns are
 * not visible in this listing.
 */
4929 rb_str_aref(VALUE str
, VALUE indx
)
4933 if (FIXNUM_P(indx
)) {
4934 idx
= FIX2LONG(indx
);
4936 else if (RB_TYPE_P(indx
, T_REGEXP
)) {
4937 return rb_str_subpat(str
, indx
, INT2FIX(0));
4939 else if (RB_TYPE_P(indx
, T_STRING
)) {
4940 if (rb_str_index(str
, indx
, 0) != -1)
4941 return str_duplicate(rb_cString
, indx
);
4945 /* check if indx is Range */
/* len starts out as the character length of str and is updated by rb_range_beg_len. */
4946 long beg
, len
= str_strlen(str
, NULL
);
4947 switch (rb_range_beg_len(indx
, &beg
, &len
, len
, 0)) {
4953 return rb_str_substr(str
, beg
, len
);
4955 idx
= NUM2LONG(indx
);
/* Integer index: substring of length 1. */
4958 return str_substr(str
, idx
, 1, FALSE
);
4964 * string[index] -> new_string or nil
4965 * string[start, length] -> new_string or nil
4966 * string[range] -> new_string or nil
4967 * string[regexp, capture = 0] -> new_string or nil
4968 * string[substring] -> new_string or nil
4970 * Returns the substring of +self+ specified by the arguments.
4972 * When the single \Integer argument +index+ is given,
4973 * returns the 1-character substring found in +self+ at offset +index+:
4977 * Counts backward from the end of +self+ if +index+ is negative:
4979 * 'foo'[-3] # => "f"
4981 * Returns +nil+ if +index+ is out of range:
4984 * 'foo'[-4] # => nil
4986 * When the two \Integer arguments +start+ and +length+ are given,
4987 * returns the substring of the given +length+ found in +self+ at offset +start+:
4989 * 'foo'[0, 2] # => "fo"
4990 * 'foo'[0, 0] # => ""
4992 * Counts backward from the end of +self+ if +start+ is negative:
4994 * 'foo'[-2, 2] # => "oo"
4996 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4998 * 'foo'[3, 2] # => ""
5000 * Returns +nil+ if +start+ is out of range:
5002 * 'foo'[4, 2] # => nil
5003 * 'foo'[-4, 2] # => nil
5005 * Returns the trailing substring of +self+ if +length+ is large:
5007 * 'foo'[1, 50] # => "oo"
5009 * Returns +nil+ if +length+ is negative:
5011 * 'foo'[0, -1] # => nil
5013 * When the single \Range argument +range+ is given,
5014 * derives +start+ and +length+ values from the given +range+,
5015 * and returns values as above:
5017 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5018 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5020 * When the \Regexp argument +regexp+ is given,
5021 * and the +capture+ argument is <tt>0</tt>,
5022 * returns the first matching substring found in +self+,
5023 * or +nil+ if none found:
5025 * 'foo'[/o/] # => "o"
5026 * 'foo'[/x/] # => nil
5028 * s[/[aeiou](.)\1/] # => "ell"
5029 * s[/[aeiou](.)\1/, 0] # => "ell"
5031 * If argument +capture+ is given and not <tt>0</tt>,
5032 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5033 * the method call returns only the specified capture
5034 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5037 * s[/[aeiou](.)\1/, 1] # => "l"
5038 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5039 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5041 * If an invalid capture group index is given, +nil+ is returned. If an invalid
5042 * capture group name is given, +IndexError+ is raised.
5044 * When the single \String argument +substring+ is given,
5045 * returns the substring from +self+ if found, otherwise +nil+:
5047 * 'foo'['oo'] # => "oo"
5048 * 'foo'['xx'] # => nil
5050 * String#slice is an alias for String#[].
/*
 * Method entry point for String#[] / String#slice.
 * With a Regexp first argument and a second argument, returns the requested
 * capture via rb_str_subpat(); with two other arguments, treats them as
 * (start, length) and returns rb_str_substr(); otherwise checks that 1..2
 * arguments were given and delegates to rb_str_aref().
 * NOTE(review): the argc test that selects the two-argument branches is not
 * visible in this listing.
 */
5054 rb_str_aref_m(int argc
, VALUE
*argv
, VALUE str
)
5057 if (RB_TYPE_P(argv
[0], T_REGEXP
)) {
5058 return rb_str_subpat(str
, argv
[0], argv
[1]);
5061 long beg
= NUM2LONG(argv
[0]);
5062 long len
= NUM2LONG(argv
[1]);
5063 return rb_str_substr(str
, beg
, len
);
5066 rb_check_arity(argc
, 1, 2);
5067 return rb_str_aref(str
, argv
[0]);
/*
 * Removes the first len bytes of str in place (len is clamped to the
 * current length).
 *
 * Two outcomes are visible:
 *  - the remainder fits the embedded buffer: the string is switched to its
 *    embedded representation, the tail is moved to the front, and the old
 *    heap buffer is freed when the flags were exactly STR_NOEMBED;
 *  - otherwise: the string is made shared if it is not already, and the heap
 *    pointer is advanced by len while the length becomes nlen.
 * The coderange is cleared afterwards.
 *
 * NOTE(review): the computation of nlen, the declaration of oldptr and the
 * final statements are not visible in this listing.
 */
5071 rb_str_drop_bytes(VALUE str
, long len
)
5073 char *ptr
= RSTRING_PTR(str
);
5074 long olen
= RSTRING_LEN(str
), nlen
;
5076 str_modifiable(str
);
/* Never drop more than the string holds. */
5077 if (len
> olen
) len
= olen
;
5079 if (str_embed_capa(str
) >= nlen
+ TERM_LEN(str
)) {
/* Remember the storage flags before switching to the embedded form. */
5081 int fl
= (int)(RBASIC(str
)->flags
& (STR_NOEMBED
|STR_SHARED
|STR_NOFREE
));
5083 STR_SET_EMBED_LEN(str
, nlen
);
5084 ptr
= RSTRING(str
)->as
.embed
.ary
;
5085 memmove(ptr
, oldptr
+ len
, nlen
);
/* Free the old buffer only when the flags were exactly STR_NOEMBED (neither shared nor no-free). */
5086 if (fl
== STR_NOEMBED
) xfree(oldptr
);
5089 if (!STR_SHARED_P(str
)) {
5090 VALUE shared
= heap_str_make_shared(rb_obj_class(str
), str
);
5091 rb_enc_cr_str_exact_copy(shared
, str
);
/* Drop the prefix by advancing the heap pointer instead of copying. */
5094 ptr
= RSTRING(str
)->as
.heap
.ptr
+= len
;
5095 RSTRING(str
)->as
.heap
.len
= nlen
;
5098 ENC_CODERANGE_CLEAR(str
);
/*
 * Byte-level splice: replaces len bytes of str starting at byte offset beg
 * with the bytes of val.  beg and len are physical (byte) values; callers
 * convert character positions beforehand.
 *
 * Visible flow:
 *  - removing a prefix with an empty replacement is delegated to
 *    rb_str_drop_bytes();
 *  - otherwise the string is made modifiable, resized, the tail is moved
 *    with memmove, the bytes of val are copied to beg, and the length,
 *    terminator and coderange are updated.
 *
 * NOTE(review): the conditions guarding the resize and the tail move, and
 * the update of slen, are not visible in this listing.
 */
5103 rb_str_splice_0(VALUE str
, long beg
, long len
, VALUE val
)
5106 long slen
, vlen
= RSTRING_LEN(val
);
/* Deleting from the very start with nothing to insert. */
5109 if (beg
== 0 && vlen
== 0) {
5110 rb_str_drop_bytes(str
, len
);
5114 str_modify_keep_cr(str
);
5115 RSTRING_GETMEM(str
, sptr
, slen
);
5118 RESIZE_CAPA(str
, slen
+ vlen
- len
);
/* Re-read the buffer pointer after resizing. */
5119 sptr
= RSTRING_PTR(str
);
/* A 7-bit receiver takes the coderange of val; otherwise it becomes unknown. */
5122 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
)
5123 cr
= rb_enc_str_coderange(val
);
5125 cr
= ENC_CODERANGE_UNKNOWN
;
/* Shift the tail that follows the replaced region. */
5128 memmove(sptr
+ beg
+ vlen
,
5130 slen
- (beg
+ len
));
5132 if (vlen
< beg
&& len
< 0) {
5133 MEMZERO(sptr
+ slen
, char, -len
);
/* Copy in the replacement bytes. */
5136 memmove(sptr
+ beg
, RSTRING_PTR(val
), vlen
);
5139 STR_SET_LEN(str
, slen
);
5140 TERM_FILL(&sptr
[slen
], TERM_LEN(str
));
5141 ENC_CODERANGE_SET(str
, cr
);
/*
 * Character-level splice: replaces len characters of str starting at
 * character index beg with val.
 *
 * Raises IndexError for a negative len and for a beg that lies outside the
 * string (beyond its length, or further back than its start when negative).
 * The character range is converted to a byte range with str_nth() and the
 * actual replacement is done by rb_str_splice_0(); afterwards the encoding
 * checked by rb_enc_check() is associated with str and the coderange is
 * combined from str and val unless the combination is BROKEN.
 *
 * NOTE(review): the normalisation of a negative beg and the clamping of len
 * are not visible in this listing.
 */
5145 rb_str_update(VALUE str
, long beg
, long len
, VALUE val
)
5150 int singlebyte
= single_byte_optimizable(str
);
5153 if (len
< 0) rb_raise(rb_eIndexError
, "negative length %ld", len
);
5156 enc
= rb_enc_check(str
, val
);
5157 slen
= str_strlen(str
, enc
); /* rb_enc_check */
5159 if ((slen
< beg
) || ((beg
< 0) && (beg
+ slen
< 0))) {
5160 rb_raise(rb_eIndexError
, "index %ld out of string", beg
);
5166 assert(beg
<= slen
);
5167 if (len
> slen
- beg
) {
5170 str_modify_keep_cr(str
);
/* Translate the character range [beg, beg+len) into byte pointers p and e. */
5171 p
= str_nth(RSTRING_PTR(str
), RSTRING_END(str
), beg
, enc
, singlebyte
);
5172 if (!p
) p
= RSTRING_END(str
);
5173 e
= str_nth(p
, RSTRING_END(str
), len
, enc
, singlebyte
);
5174 if (!e
) e
= RSTRING_END(str
);
5176 beg
= p
- RSTRING_PTR(str
); /* physical position */
5177 len
= e
- p
; /* physical length */
5178 rb_str_splice_0(str
, beg
, len
, val
);
5179 rb_enc_associate(str
, enc
);
5180 cr
= ENC_CODERANGE_AND(ENC_CODERANGE(str
), ENC_CODERANGE(val
));
5181 if (cr
!= ENC_CODERANGE_BROKEN
)
5182 ENC_CODERANGE_SET(str
, cr
);
/* rb_str_splice is a file-local alias that expands to rb_str_update with the same arguments. */
5185 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
/*
 * Replaces the part of str matched by capture backref of regexp re with val.
 *
 * Raises IndexError when the regexp does not match ("regexp not matched"),
 * when the capture index is outside the registers ("index %d out of
 * regexp"), or when the selected group took no part in the match ("regexp
 * group %d not matched").  A negative index is offset by the number of
 * registers.  The replacement is a byte-level rb_str_splice_0() followed by
 * associating the encoding returned by rb_enc_check_str().
 *
 * NOTE(review): the computation of start and len from the registers is not
 * visible in this listing.
 */
5188 rb_str_subpat_set(VALUE str
, VALUE re
, VALUE backref
, VALUE val
)
5192 long start
, end
, len
;
5194 struct re_registers
*regs
;
5196 if (rb_reg_search(re
, str
, 0, 0) < 0) {
5197 rb_raise(rb_eIndexError
, "regexp not matched");
5199 match
= rb_backref_get();
5200 nth
= rb_reg_backref_number(match
, backref
);
5201 regs
= RMATCH_REGS(match
);
/* Reject capture indexes outside the available registers. */
5202 if ((nth
>= regs
->num_regs
) || ((nth
< 0) && (-nth
>= regs
->num_regs
))) {
5203 rb_raise(rb_eIndexError
, "index %d out of regexp", nth
);
/* A negative index is offset by the number of registers. */
5206 nth
+= regs
->num_regs
;
5211 rb_raise(rb_eIndexError
, "regexp group %d not matched", nth
);
5216 enc
= rb_enc_check_str(str
, val
);
5217 rb_str_splice_0(str
, start
, len
, val
);
5218 rb_enc_associate(str
, enc
);
/*
 * Two-argument form of String#[]=; dispatches on the type of indx:
 *  - Regexp: replace the whole match (capture 0) via rb_str_subpat_set();
 *  - String: locate indx in str (IndexError "string not matched" when
 *    absent), convert the byte offset to a character offset, and splice;
 *  - Range: splice the range computed by rb_range_beg_len();
 *  - otherwise: convert indx with NUM2LONG and replace one character.
 * NOTE(review): the case labels, declarations and return statements are not
 * visible in this listing.
 */
5222 rb_str_aset(VALUE str
, VALUE indx
, VALUE val
)
5226 switch (TYPE(indx
)) {
5228 rb_str_subpat_set(str
, indx
, INT2FIX(0), val
);
5232 beg
= rb_str_index(str
, indx
, 0);
5234 rb_raise(rb_eIndexError
, "string not matched");
/* rb_str_splice expects a character offset, so convert the byte offset. */
5236 beg
= rb_str_sublen(str
, beg
);
5237 rb_str_splice(str
, beg
, str_strlen(indx
, NULL
), val
);
5241 /* check if indx is Range */
5244 if (rb_range_beg_len(indx
, &beg
, &len
, str_strlen(str
, NULL
), 2)) {
5245 rb_str_splice(str
, beg
, len
, val
);
5252 idx
= NUM2LONG(indx
);
5253 rb_str_splice(str
, idx
, 1, val
);
5260 * str[integer] = new_str
5261 * str[integer, integer] = new_str
5262 * str[range] = aString
5263 * str[regexp] = new_str
5264 * str[regexp, integer] = new_str
5265 * str[regexp, name] = new_str
5266 * str[other_str] = new_str
5268 * Element Assignment---Replaces some or all of the content of
5269 * <i>str</i>. The portion of the string affected is determined using
5270 * the same criteria as String#[]. If the replacement string is not
5271 * the same length as the text it is replacing, the string will be
5272 * adjusted accordingly. If the regular expression or string used
5273 * as the index doesn't match a position in the string, IndexError is
5274 * raised. If the regular expression form is used, the optional
5275 * second Integer allows you to specify which portion of the match to
5276 * replace (effectively using the MatchData indexing rules). The forms
5277 * that take an Integer will raise an IndexError if the value is out
5278 * of range; the Range form will raise a RangeError, and the Regexp
5279 * and String will raise an IndexError on negative match.
/*
 * Method entry point for String#[]=.
 * With three arguments: a Regexp first argument replaces the capture named
 * by argv[1] via rb_str_subpat_set(); otherwise argv[0] and argv[1] are
 * converted with NUM2LONG and used as (start, length) for rb_str_splice().
 * Otherwise checks that 2..3 arguments were given and delegates to
 * rb_str_aset().
 * NOTE(review): the argc test and the return of the three-argument branch
 * are not visible in this listing.
 */
5283 rb_str_aset_m(int argc
, VALUE
*argv
, VALUE str
)
5286 if (RB_TYPE_P(argv
[0], T_REGEXP
)) {
5287 rb_str_subpat_set(str
, argv
[0], argv
[1], argv
[2]);
5290 rb_str_splice(str
, NUM2LONG(argv
[0]), NUM2LONG(argv
[1]), argv
[2]);
5294 rb_check_arity(argc
, 2, 3);
5295 return rb_str_aset(str
, argv
[0], argv
[1]);
5300 * insert(index, other_string) -> self
5302 * Inserts the given +other_string+ into +self+; returns +self+.
5304 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5306 * 'foo'.insert(1, 'bar') # => "fbaroo"
5308 * If the \Integer +index+ is negative, counts backward from the end of +self+
5309 * and inserts +other_string+ at offset <tt>index+1</tt>
5310 * (that is, _after_ <tt>self[index]</tt>):
5312 * 'foo'.insert(-2, 'bar') # => "fobaro"
/*
 * Implementation behind the "insert" documentation fragment that precedes
 * it.  idx is converted with NUM2LONG; one visible branch appends str2 to
 * str, the other inserts str2 at pos by splicing with a replaced length of 0.
 * NOTE(review): the conditions that select the append branch and adjust a
 * negative pos are not visible in this listing.
 */
5317 rb_str_insert(VALUE str
, VALUE idx
, VALUE str2
)
5319 long pos
= NUM2LONG(idx
);
5322 return rb_str_append(str
, str2
);
/* Length 0: nothing is replaced, str2 is inserted at pos. */
5327 rb_str_splice(str
, pos
, 0, str2
);
5334 * slice!(index) -> new_string or nil
5335 * slice!(start, length) -> new_string or nil
5336 * slice!(range) -> new_string or nil
5337 * slice!(regexp, capture = 0) -> new_string or nil
5338 * slice!(substring) -> new_string or nil
5340 * Removes the substring of +self+ specified by the arguments;
5341 * returns the removed substring.
5343 * See String#[] for details about the arguments that specify the substring.
5347 * string = "This is a string"
5348 * string.slice!(2) #=> "i"
5349 * string.slice!(3..6) #=> " is "
5350 * string.slice!(/s.*t/) #=> "sa st"
5351 * string.slice!("r") #=> "r"
5352 * string #=> "Thing"
/*
 * Implementation behind the "slice!" documentation fragment that precedes
 * it: removes the selected part of str and returns it.
 *
 * The byte range (beg, len) to remove is derived from the first argument:
 *  - Regexp: the registers of the match, optionally for the capture given
 *    as second argument (nil when there is no match or the capture index is
 *    out of range);
 *  - two arguments: start and length converted with NUM2LONG;
 *  - Fixnum: a single character located with rb_str_subpos();
 *  - String: the first occurrence found by rb_str_index(); the result is a
 *    duplicate of the argument;
 *  - otherwise: a Range via rb_range_beg_len(), or a NUM2LONG conversion.
 * Unless already set, the result is built from the selected bytes; the
 * bytes are then removed with rb_str_drop_bytes() or by moving the tail and
 * shrinking the length.
 *
 * NOTE(review): many lines (declarations, case labels, the conditions that
 * choose between the two removal strategies, closing braces) are missing
 * from this listing.
 */
5357 rb_str_slice_bang(int argc
, VALUE
*argv
, VALUE str
)
5359 VALUE result
= Qnil
;
5364 rb_check_arity(argc
, 1, 2);
5365 str_modify_keep_cr(str
);
5367 if (RB_TYPE_P(indx
, T_REGEXP
)) {
5368 if (rb_reg_search(indx
, str
, 0, 0) < 0) return Qnil
;
5369 VALUE match
= rb_backref_get();
5370 struct re_registers
*regs
= RMATCH_REGS(match
);
/* Negative capture numbers are offset by the number of registers. */
5372 if (argc
> 1 && (nth
= rb_reg_backref_number(match
, argv
[1])) < 0) {
5373 if ((nth
+= regs
->num_regs
) <= 0) return Qnil
;
5375 else if (nth
>= regs
->num_regs
) return Qnil
;
5377 len
= END(nth
) - beg
;
5380 else if (argc
== 2) {
5381 beg
= NUM2LONG(indx
);
5382 len
= NUM2LONG(argv
[1]);
5385 else if (FIXNUM_P(indx
)) {
5386 beg
= FIX2LONG(indx
);
5387 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5388 if (!len
) return Qnil
;
/* Convert the pointer returned by rb_str_subpos into a byte offset. */
5389 beg
= p
- RSTRING_PTR(str
);
5392 else if (RB_TYPE_P(indx
, T_STRING
)) {
5393 beg
= rb_str_index(str
, indx
, 0);
5394 if (beg
== -1) return Qnil
;
5395 len
= RSTRING_LEN(indx
);
5396 result
= str_duplicate(rb_cString
, indx
);
5400 switch (rb_range_beg_len(indx
, &beg
, &len
, str_strlen(str
, NULL
), 0)) {
5404 beg
= NUM2LONG(indx
);
5405 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5406 if (!len
) return Qnil
;
5407 beg
= p
- RSTRING_PTR(str
);
5415 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5416 beg
= p
- RSTRING_PTR(str
);
/* Build the returned substring from the selected bytes. */
5419 result
= rb_str_new(RSTRING_PTR(str
)+beg
, len
);
5420 rb_enc_cr_str_copy_for_substr(result
, str
);
5425 rb_str_drop_bytes(str
, len
);
5428 char *sptr
= RSTRING_PTR(str
);
5429 long slen
= RSTRING_LEN(str
);
5430 if (beg
+ len
> slen
) /* pathological check */
5434 slen
- (beg
+ len
));
5436 STR_SET_LEN(str
, slen
);
5437 TERM_FILL(&sptr
[slen
], TERM_LEN(str
));
/*
 * Body of a pattern-coercion helper that turns pat into a Regexp.
 * NOTE(review): the signature line is not visible in this listing; this is
 * presumably get_pat(), which the match methods in this file call -- confirm.
 * Visible steps: dispatch on the builtin type of pat, try a String
 * conversion with rb_check_string_type(), enforce T_REGEXP with Check_Type
 * on one branch, and return rb_reg_regcomp(pat).
 */
5448 switch (OBJ_BUILTIN_TYPE(pat
)) {
5456 val
= rb_check_string_type(pat
);
5458 Check_Type(pat
, T_REGEXP
);
5463 return rb_reg_regcomp(pat
);
/*
 * Pattern-coercion helper used by the substitution methods (called as
 * get_pat_quoted(argv[0], 1) by rb_str_sub_bang and str_gsub).
 * Visible steps: dispatch on the builtin type of pat, try a String
 * conversion with rb_check_string_type(), enforce T_REGEXP with Check_Type
 * on one branch, and, when check is non-zero and pat is a broken string,
 * raise the exception produced by rb_reg_check_preprocess().
 * NOTE(review): the case labels and the return statements are not visible
 * in this listing.
 */
5467 get_pat_quoted(VALUE pat
, int check
)
5471 switch (OBJ_BUILTIN_TYPE(pat
)) {
5479 val
= rb_check_string_type(pat
);
5481 Check_Type(pat
, T_REGEXP
);
5485 if (check
&& is_broken_string(pat
)) {
5486 rb_exc_raise(rb_reg_check_preprocess(pat
));
/*
 * Searches str for pat starting at pos.
 *
 * A String pattern is located with rb_strseq_index(); when set_backref_str
 * is non-zero the back-reference is updated as well: to a string match on a
 * frozen copy of str in one visible branch, to nil in the other.
 * Any other pattern is searched with rb_reg_search0().
 * NOTE(review): the condition that separates the two back-reference
 * branches and the return of the String path are not visible in this
 * listing.
 */
5492 rb_pat_search(VALUE pat
, VALUE str
, long pos
, int set_backref_str
)
5494 if (BUILTIN_TYPE(pat
) == T_STRING
) {
5495 pos
= rb_strseq_index(str
, pat
, pos
, 1);
5496 if (set_backref_str
) {
5498 str
= rb_str_new_frozen_String(str
);
5499 rb_backref_set_string(str
, pos
, RSTRING_LEN(pat
));
5502 rb_backref_set(Qnil
);
5508 return rb_reg_search0(pat
, str
, pos
, 0, set_backref_str
);
5515 * sub!(pattern, replacement) -> self or nil
5516 * sub!(pattern) {|match| ... } -> self or nil
5518 * Returns +self+ with only the first occurrence
5519 * (not all occurrences) of the given +pattern+ replaced.
5521 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5523 * Related: String#sub, String#gsub, String#gsub!.
/*
 * Implementation behind the "sub!" documentation fragment that precedes it:
 * replaces the first occurrence of the pattern in str, in place.
 *
 * Visible flow:
 *  - one argument is enough when a block is given, otherwise two are
 *    required; argv[1] is tested for being a Hash;
 *  - the pattern is coerced with get_pat_quoted() and searched from
 *    position 0 with the back-reference being set;
 *  - the replacement comes from the block, from the hash lookup of the
 *    matched text, or from rb_reg_regsub() expansion of the replacement;
 *    after running user code the string is checked for modification and
 *    frozenness;
 *  - encodings are reconciled: when rb_enc_compatible() finds none, the
 *    parts of str outside the match must be 7-bit, otherwise
 *    EncCompatError is raised;
 *  - the buffer is resized, the tail moved, the replacement copied in, and
 *    the length, terminator and coderange updated.
 *
 * NOTE(review): many lines (declarations, the argc switch, several
 * conditions and the return statements) are missing from this listing.
 */
5528 rb_str_sub_bang(int argc
, VALUE
*argv
, VALUE str
)
5530 VALUE pat
, repl
, hash
= Qnil
;
/* With a block the replacement argument is optional. */
5533 int min_arity
= rb_block_given_p() ? 1 : 2;
5536 rb_check_arity(argc
, min_arity
, 2);
5542 hash
= rb_check_hash_type(argv
[1]);
5548 pat
= get_pat_quoted(argv
[0], 1);
5550 str_modifiable(str
);
5551 beg
= rb_pat_search(pat
, str
, 0, 1);
5554 int cr
= ENC_CODERANGE(str
);
5556 VALUE match
, match0
= Qnil
;
5557 struct re_registers
*regs
;
5561 match
= rb_backref_get();
5562 regs
= RMATCH_REGS(match
);
/* A String pattern matches exactly its own length. */
5563 if (RB_TYPE_P(pat
, T_STRING
)) {
5565 end0
= beg0
+ RSTRING_LEN(pat
);
5571 if (iter
) match0
= rb_reg_nth_match(0, match
);
5574 if (iter
|| !NIL_P(hash
)) {
/* Remember buffer and length so a modification by user code can be detected. */
5575 p
= RSTRING_PTR(str
); len
= RSTRING_LEN(str
);
5578 repl
= rb_obj_as_string(rb_yield(match0
));
5581 repl
= rb_hash_aref(hash
, rb_str_subseq(str
, beg0
, end0
- beg0
));
5582 repl
= rb_obj_as_string(repl
);
5584 str_mod_check(str
, p
, len
);
5585 rb_check_frozen(str
);
5588 repl
= rb_reg_regsub(repl
, str
, regs
, RB_TYPE_P(pat
, T_STRING
) ? Qnil
: pat
);
5591 enc
= rb_enc_compatible(str
, repl
);
5593 rb_encoding
*str_enc
= STR_ENC_GET(str
);
5594 p
= RSTRING_PTR(str
); len
= RSTRING_LEN(str
);
/* Both the part before and the part after the match must be 7-bit. */
5595 if (coderange_scan(p
, beg0
, str_enc
) != ENC_CODERANGE_7BIT
||
5596 coderange_scan(p
+end0
, len
-end0
, str_enc
) != ENC_CODERANGE_7BIT
) {
5597 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
5598 rb_enc_name(str_enc
),
5599 rb_enc_name(STR_ENC_GET(repl
)));
5601 enc
= STR_ENC_GET(repl
);
5604 rb_enc_associate(str
, enc
);
/* Only a known, non-broken coderange of str is refined using that of repl. */
5605 if (ENC_CODERANGE_UNKNOWN
< cr
&& cr
< ENC_CODERANGE_BROKEN
) {
5606 int cr2
= ENC_CODERANGE(repl
);
5607 if (cr2
== ENC_CODERANGE_BROKEN
||
5608 (cr
== ENC_CODERANGE_VALID
&& cr2
== ENC_CODERANGE_7BIT
))
5609 cr
= ENC_CODERANGE_UNKNOWN
;
5614 rlen
= RSTRING_LEN(repl
);
5615 len
= RSTRING_LEN(str
);
5617 RESIZE_CAPA(str
, len
+ rlen
- plen
);
/* Re-read the buffer pointer after a possible resize. */
5619 p
= RSTRING_PTR(str
);
/* Move the tail so that it follows the replacement. */
5621 memmove(p
+ beg0
+ rlen
, p
+ beg0
+ plen
, len
- beg0
- plen
);
5623 rp
= RSTRING_PTR(repl
);
5624 memmove(p
+ beg0
, rp
, rlen
);
5626 STR_SET_LEN(str
, len
);
5627 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
5628 ENC_CODERANGE_SET(str
, cr
);
5638 * sub(pattern, replacement) -> new_string
5639 * sub(pattern) {|match| ... } -> new_string
5641 * Returns a copy of +self+ with only the first occurrence
5642 * (not all occurrences) of the given +pattern+ replaced.
5644 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5646 * Related: String#sub!, String#gsub, String#gsub!.
/*
 * Non-destructive variant of rb_str_sub_bang: duplicates str as a String
 * and applies rb_str_sub_bang() to the duplicate.
 * NOTE(review): the return statement is not visible in this listing.
 */
5651 rb_str_sub(int argc
, VALUE
*argv
, VALUE str
)
5653 str
= str_duplicate(rb_cString
, str
);
5654 rb_str_sub_bang(argc
, argv
, str
);
/*
 * Shared implementation of gsub and gsub!: replaces every occurrence of the
 * pattern in str.  bang is 1 when called from rb_str_gsub_bang and 0 when
 * called from rb_str_gsub.
 *
 * Visible flow:
 *  - the replacement mode is one of STR, ITER or MAP; without a replacement
 *    an enumerator is returned, and argv[1] is tested for being a Hash;
 *  - when the pattern does not match at all, the bang form returns nil and
 *    the other form returns a duplicate of str;
 *  - otherwise a destination buffer is built: for each match the text
 *    before it and the replacement are appended, the replacement coming
 *    from the block, from the hash, or from rb_reg_regsub();
 *  - after an empty match at least one character of input is consumed to
 *    prevent an infinite loop;
 *  - the remaining tail is appended and the back-reference is re-established
 *    with a final rb_pat_search at `last`;
 *  - the bang form replaces the contents of str with the buffer.
 *
 * NOTE(review): many lines (the argc switch, loop headers, several
 * conditions and the return statements) are missing from this listing.
 */
5659 str_gsub(int argc
, VALUE
*argv
, VALUE str
, int bang
)
5661 VALUE pat
, val
= Qnil
, repl
, match
, match0
= Qnil
, dest
, hash
= Qnil
;
5662 struct re_registers
*regs
;
5663 long beg
, beg0
, end0
;
5664 long offset
, blen
, slen
, len
, last
;
5665 enum {STR
, ITER
, MAP
} mode
= STR
;
/* -1: not yet known whether the replacement string refers to back-references. */
5667 int need_backref
= -1;
5668 rb_encoding
*str_enc
;
5672 RETURN_ENUMERATOR(str
, argc
, argv
);
5677 hash
= rb_check_hash_type(argv
[1]);
5686 rb_error_arity(argc
, 1, 2);
5689 pat
= get_pat_quoted(argv
[0], 1);
5690 beg
= rb_pat_search(pat
, str
, 0, need_backref
);
5692 if (bang
) return Qnil
; /* no match, no substitution */
5693 return str_duplicate(rb_cString
, str
);
5697 blen
= RSTRING_LEN(str
) + 30; /* len + margin */
5698 dest
= rb_str_buf_new(blen
);
/* Remember buffer and length so a modification by user code can be detected. */
5699 sp
= RSTRING_PTR(str
);
5700 slen
= RSTRING_LEN(str
);
5702 str_enc
= STR_ENC_GET(str
);
5703 rb_enc_associate(dest
, str_enc
);
5704 ENC_CODERANGE_SET(dest
, rb_enc_asciicompat(str_enc
) ? ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
);
5707 match
= rb_backref_get();
5708 regs
= RMATCH_REGS(match
);
/* A String pattern matches exactly its own length. */
5709 if (RB_TYPE_P(pat
, T_STRING
)) {
5711 end0
= beg0
+ RSTRING_LEN(pat
);
5717 if (mode
== ITER
) match0
= rb_reg_nth_match(0, match
);
5722 val
= rb_obj_as_string(rb_yield(match0
));
5725 val
= rb_hash_aref(hash
, rb_str_subseq(str
, beg0
, end0
- beg0
));
5726 val
= rb_obj_as_string(val
);
5728 str_mod_check(str
, sp
, slen
);
5729 if (val
== dest
) { /* paranoid check [ruby-dev:24827] */
5730 rb_raise(rb_eRuntimeError
, "block should not cheat");
5733 else if (need_backref
) {
5734 val
= rb_reg_regsub(repl
, str
, regs
, RB_TYPE_P(pat
, T_STRING
) ? Qnil
: pat
);
/* First expansion decides whether later matches need back-references at all. */
5735 if (need_backref
< 0) {
5736 need_backref
= val
!= repl
;
5743 len
= beg0
- offset
; /* copy pre-match substr */
5745 rb_enc_str_buf_cat(dest
, cp
, len
, str_enc
);
5748 rb_str_buf_append(dest
, val
);
5754 * Always consume at least one character of the input string
5755 * in order to prevent infinite loops.
5757 if (RSTRING_LEN(str
) <= end0
) break;
5758 len
= rb_enc_fast_mbclen(RSTRING_PTR(str
)+end0
, RSTRING_END(str
), str_enc
);
5759 rb_enc_str_buf_cat(dest
, RSTRING_PTR(str
)+end0
, len
, str_enc
);
5760 offset
= end0
+ len
;
5762 cp
= RSTRING_PTR(str
) + offset
;
5763 if (offset
> RSTRING_LEN(str
)) break;
5764 beg
= rb_pat_search(pat
, str
, offset
, need_backref
);
/* Append whatever follows the last match. */
5766 if (RSTRING_LEN(str
) > offset
) {
5767 rb_enc_str_buf_cat(dest
, cp
, RSTRING_LEN(str
) - offset
, str_enc
);
/* Search once more at `last` with back-reference setting enabled. */
5769 rb_pat_search(pat
, str
, last
, 1);
5771 str_shared_replace(str
, dest
);
5783 * gsub!(pattern, replacement) -> self or nil
5784 * gsub!(pattern) {|match| ... } -> self or nil
5785 * gsub!(pattern) -> an_enumerator
5787 * Performs the specified substring replacement(s) on +self+;
5788 * returns +self+ if any replacement occurred, +nil+ otherwise.
5790 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5792 * Returns an Enumerator if no +replacement+ and no block given.
5794 * Related: String#sub, String#gsub, String#sub!.
5799 rb_str_gsub_bang(int argc
, VALUE
*argv
, VALUE str
)
5801 str_modify_keep_cr(str
);
5802 return str_gsub(argc
, argv
, str
, 1);
5808 * gsub(pattern, replacement) -> new_string
5809 * gsub(pattern) {|match| ... } -> new_string
5810 * gsub(pattern) -> enumerator
5812 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5814 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5816 * Returns an Enumerator if no +replacement+ and no block given.
5818 * Related: String#sub, String#sub!, String#gsub!.
5823 rb_str_gsub(int argc
, VALUE
*argv
, VALUE str
)
5825 return str_gsub(argc
, argv
, str
, 0);
5831 * replace(other_string) -> self
5833 * Replaces the contents of +self+ with the contents of +other_string+:
5835 * s = 'foo' # => "foo"
5836 * s.replace('bar') # => "bar"
5841 rb_str_replace(VALUE str
, VALUE str2
)
5843 str_modifiable(str
);
5844 if (str
== str2
) return str
;
5848 return str_replace(str
, str2
);
5855 * Removes the contents of +self+:
5857 * s = 'foo' # => "foo"
5863 rb_str_clear(VALUE str
)
5867 STR_SET_EMBED_LEN(str
, 0);
5868 RSTRING_PTR(str
)[0] = 0;
5869 if (rb_enc_asciicompat(STR_ENC_GET(str
)))
5870 ENC_CODERANGE_SET(str
, ENC_CODERANGE_7BIT
);
5872 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
5880 * Returns a string containing the first character of +self+:
5882 * s = 'foo' # => "foo"
5888 rb_str_chr(VALUE str
)
5890 return rb_str_substr(str
, 0, 1);
5895 * getbyte(index) -> integer
5897 * Returns the byte at zero-based +index+ as an integer:
5899 * s = 'abcde' # => "abcde"
5900 * s.getbyte(0) # => 97
5901 * s.getbyte(1) # => 98
5903 * Related: String#setbyte.
5906 rb_str_getbyte(VALUE str
, VALUE index
)
5908 long pos
= NUM2LONG(index
);
5911 pos
+= RSTRING_LEN(str
);
5912 if (pos
< 0 || RSTRING_LEN(str
) <= pos
)
5915 return INT2FIX((unsigned char)RSTRING_PTR(str
)[pos
]);
5920 * setbyte(index, integer) -> integer
5922 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5924 * s = 'abcde' # => "abcde"
5925 * s.setbyte(0, 98) # => 98
5928 * Related: String#getbyte.
5931 rb_str_setbyte(VALUE str
, VALUE index
, VALUE value
)
5933 long pos
= NUM2LONG(index
);
5934 long len
= RSTRING_LEN(str
);
5935 char *ptr
, *head
, *left
= 0;
5937 int cr
= ENC_CODERANGE_UNKNOWN
, width
, nlen
;
5939 if (pos
< -len
|| len
<= pos
)
5940 rb_raise(rb_eIndexError
, "index %ld out of string", pos
);
5944 VALUE v
= rb_to_int(value
);
5945 VALUE w
= rb_int_and(v
, INT2FIX(0xff));
5946 char byte
= (char)(NUM2INT(w
) & 0xFF);
5948 if (!str_independent(str
))
5949 str_make_independent(str
);
5950 enc
= STR_ENC_GET(str
);
5951 head
= RSTRING_PTR(str
);
5953 if (!STR_EMBED_P(str
)) {
5954 cr
= ENC_CODERANGE(str
);
5956 case ENC_CODERANGE_7BIT
:
5959 if (ISASCII(byte
)) goto end
;
5960 nlen
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5961 if (!MBCLEN_CHARFOUND_P(nlen
))
5962 ENC_CODERANGE_SET(str
, ENC_CODERANGE_BROKEN
);
5964 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
5966 case ENC_CODERANGE_VALID
:
5967 left
= rb_enc_left_char_head(head
, ptr
, head
+len
, enc
);
5968 width
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5970 nlen
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5971 if (!MBCLEN_CHARFOUND_P(nlen
))
5972 ENC_CODERANGE_SET(str
, ENC_CODERANGE_BROKEN
);
5973 else if (MBCLEN_CHARFOUND_LEN(nlen
) != width
|| ISASCII(byte
))
5974 ENC_CODERANGE_CLEAR(str
);
5978 ENC_CODERANGE_CLEAR(str
);
5986 str_byte_substr(VALUE str
, long beg
, long len
, int empty
)
5988 char *p
, *s
= RSTRING_PTR(str
);
5989 long n
= RSTRING_LEN(str
);
5992 if (beg
> n
|| len
< 0) return Qnil
;
5995 if (beg
< 0) return Qnil
;
6000 if (!empty
) return Qnil
;
6007 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) && SHARABLE_SUBSTRING_P(beg
, len
, n
)) {
6008 str2
= rb_str_new_frozen(str
);
6009 str2
= str_new_shared(rb_cString
, str2
);
6010 RSTRING(str2
)->as
.heap
.ptr
+= beg
;
6011 RSTRING(str2
)->as
.heap
.len
= len
;
6014 str2
= rb_str_new(p
, len
);
6017 str_enc_copy(str2
, str
);
6019 if (RSTRING_LEN(str2
) == 0) {
6020 if (!rb_enc_asciicompat(STR_ENC_GET(str
)))
6021 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_VALID
);
6023 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_7BIT
);
6026 switch (ENC_CODERANGE(str
)) {
6027 case ENC_CODERANGE_7BIT
:
6028 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_7BIT
);
6031 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_UNKNOWN
);
6040 str_byte_aref(VALUE str
, VALUE indx
)
6043 if (FIXNUM_P(indx
)) {
6044 idx
= FIX2LONG(indx
);
6047 /* check if indx is Range */
6048 long beg
, len
= RSTRING_LEN(str
);
6050 switch (rb_range_beg_len(indx
, &beg
, &len
, len
, 0)) {
6056 return str_byte_substr(str
, beg
, len
, TRUE
);
6059 idx
= NUM2LONG(indx
);
6061 return str_byte_substr(str
, idx
, 1, FALSE
);
6066 * byteslice(index, length = 1) -> string or nil
6067 * byteslice(range) -> string or nil
6069 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6071 * With integer arguments +index+ and +length+ given,
6072 * returns the substring beginning at the given +index+
6073 * of the given +length+ (if possible),
6074 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6076 * s = '0123456789' # => "0123456789"
6077 * s.byteslice(2) # => "2"
6078 * s.byteslice(200) # => nil
6079 * s.byteslice(4, 3) # => "456"
6080 * s.byteslice(4, 30) # => "456789"
6081 * s.byteslice(4, -1) # => nil
6082 * s.byteslice(40, 2) # => nil
6084 * In either case above, counts backwards from the end of +self+
6085 * if +index+ is negative:
6087 * s = '0123456789' # => "0123456789"
6088 * s.byteslice(-4) # => "6"
6089 * s.byteslice(-4, 3) # => "678"
6091 * With Range argument +range+ given, returns
6092 * <tt>byteslice(range.begin, range.size)</tt>:
6094 * s = '0123456789' # => "0123456789"
6095 * s.byteslice(4..6) # => "456"
6096 * s.byteslice(-6..-4) # => "456"
6097 * s.byteslice(5..2) # => "" # range.size is zero.
6098 * s.byteslice(40..42) # => nil
6100 * In all cases, a returned string has the same encoding as +self+:
6102 * s.encoding # => #<Encoding:UTF-8>
6103 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6108 rb_str_byteslice(int argc
, VALUE
*argv
, VALUE str
)
6111 long beg
= NUM2LONG(argv
[0]);
6112 long end
= NUM2LONG(argv
[1]);
6113 return str_byte_substr(str
, beg
, end
, TRUE
);
6115 rb_check_arity(argc
, 1, 2);
6116 return str_byte_aref(str
, argv
[0]);
6123 * Returns a new string with the characters from +self+ in reverse order.
6125 * 'stressed'.reverse # => "desserts"
6130 rb_str_reverse(VALUE str
)
6137 if (RSTRING_LEN(str
) <= 1) return str_duplicate(rb_cString
, str
);
6138 enc
= STR_ENC_GET(str
);
6139 rev
= rb_str_new(0, RSTRING_LEN(str
));
6140 s
= RSTRING_PTR(str
); e
= RSTRING_END(str
);
6141 p
= RSTRING_END(rev
);
6142 cr
= ENC_CODERANGE(str
);
6144 if (RSTRING_LEN(str
) > 1) {
6145 if (single_byte_optimizable(str
)) {
6150 else if (cr
== ENC_CODERANGE_VALID
) {
6152 int clen
= rb_enc_fast_mbclen(s
, e
, enc
);
6160 cr
= rb_enc_asciicompat(enc
) ?
6161 ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
;
6163 int clen
= rb_enc_mbclen(s
, e
, enc
);
6165 if (clen
> 1 || (*s
& 0x80)) cr
= ENC_CODERANGE_UNKNOWN
;
6172 STR_SET_LEN(rev
, RSTRING_LEN(str
));
6173 str_enc_copy(rev
, str
);
6174 ENC_CODERANGE_SET(rev
, cr
);
6184 * Returns +self+ with its characters reversed:
6187 * s.reverse! # => "desserts"
6193 rb_str_reverse_bang(VALUE str
)
6195 if (RSTRING_LEN(str
) > 1) {
6196 if (single_byte_optimizable(str
)) {
6199 str_modify_keep_cr(str
);
6200 s
= RSTRING_PTR(str
);
6201 e
= RSTRING_END(str
) - 1;
6209 str_shared_replace(str
, rb_str_reverse(str
));
6213 str_modify_keep_cr(str
);
6221 * include? other_string -> true or false
6223 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6226 * s.include?('f') # => true
6227 * s.include?('fo') # => true
6228 * s.include?('food') # => false
6233 rb_str_include(VALUE str
, VALUE arg
)
6238 i
= rb_str_index(str
, arg
, 0);
6240 return RBOOL(i
!= -1);
6246 * to_i(base = 10) -> integer
6248 * Returns the result of interpreting leading characters in +self+
6249 * as an integer in the given +base+ (which must be in (2..36)):
6251 * '123456'.to_i # => 123456
6252 * '123def'.to_i(16) # => 1195503
6254 * Characters past a leading valid number (in the given +base+) are ignored:
6256 * '12.345'.to_i # => 12
6257 * '12345'.to_i(2) # => 1
6259 * Returns zero if there is no leading valid number:
6261 * 'abcdef'.to_i # => 0
6262 * '2'.to_i(2) # => 0
6267 rb_str_to_i(int argc
, VALUE
*argv
, VALUE str
)
6271 if (rb_check_arity(argc
, 0, 1) && (base
= NUM2INT(argv
[0])) < 0) {
6272 rb_raise(rb_eArgError
, "invalid radix %d", base
);
6274 return rb_str_to_inum(str
, base
, FALSE
);
6282 * Returns the result of interpreting leading characters in +self+ as a Float:
6284 * '3.14159'.to_f # => 3.14159
6285 * '1.234e-2'.to_f # => 0.01234
6287 * Characters past a leading valid number are ignored:
6289 * '3.14 (pi to two places)'.to_f # => 3.14
6291 * Returns zero if there is no leading valid number:
6293 * 'abcdef'.to_f # => 0.0
6298 rb_str_to_f(VALUE str
)
6300 return DBL2NUM(rb_str_to_dbl(str
, FALSE
));
6306 * to_s -> self or string
6308 * Returns +self+ if +self+ is a \String,
6309 * or +self+ converted to a \String if +self+ is an instance of a subclass of \String.
6311 * String#to_str is an alias for String#to_s.
6316 rb_str_to_s(VALUE str
)
6318 if (rb_obj_class(str
) != rb_cString
) {
6319 return str_duplicate(rb_cString
, str
);
6326 str_cat_char(VALUE str
, unsigned int c
, rb_encoding
*enc
)
6328 char s
[RUBY_MAX_CHAR_LEN
];
6329 int n
= rb_enc_codelen(c
, enc
);
6331 rb_enc_mbcput(c
, s
, enc
);
6332 rb_enc_str_buf_cat(str
, s
, n
, enc
);
6336 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6339 rb_str_buf_cat_escaped_char(VALUE result
, unsigned int c
, int unicode_p
)
6341 char buf
[CHAR_ESC_LEN
+ 1];
6348 if (c
< 0x7F && ISPRINT(c
)) {
6349 snprintf(buf
, CHAR_ESC_LEN
, "%c", c
);
6351 else if (c
< 0x10000) {
6352 snprintf(buf
, CHAR_ESC_LEN
, "\\u%04X", c
);
6355 snprintf(buf
, CHAR_ESC_LEN
, "\\u{%X}", c
);
6360 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", c
);
6363 snprintf(buf
, CHAR_ESC_LEN
, "\\x{%X}", c
);
6366 l
= (int)strlen(buf
); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6367 rb_str_buf_cat(result
, buf
, l
);
const char *
ruby_escaped_char(int c)
{
    /* Characters that have a dedicated backslash notation, paired with
     * the text that represents them. */
    static const struct {
        int ch;
        const char *text;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].ch == c) return escapes[i].text;
    }

    /* No dedicated notation for this character. */
    return NULL;
}
6390 rb_str_escape(VALUE str
)
6392 int encidx
= ENCODING_GET(str
);
6393 rb_encoding
*enc
= rb_enc_from_index(encidx
);
6394 const char *p
= RSTRING_PTR(str
);
6395 const char *pend
= RSTRING_END(str
);
6396 const char *prev
= p
;
6397 char buf
[CHAR_ESC_LEN
+ 1];
6398 VALUE result
= rb_str_buf_new(0);
6399 int unicode_p
= rb_enc_unicode_p(enc
);
6400 int asciicompat
= rb_enc_asciicompat(enc
);
6405 int n
= rb_enc_precise_mbclen(p
, pend
, enc
);
6406 if (!MBCLEN_CHARFOUND_P(n
)) {
6407 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6408 n
= rb_enc_mbminlen(enc
);
6410 n
= (int)(pend
- p
);
6412 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", *p
& 0377);
6413 str_buf_cat(result
, buf
, strlen(buf
));
6418 n
= MBCLEN_CHARFOUND_LEN(n
);
6419 c
= rb_enc_mbc_to_codepoint(p
, pend
, enc
);
6421 cc
= ruby_escaped_char(c
);
6423 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6424 str_buf_cat(result
, cc
, strlen(cc
));
6427 else if (asciicompat
&& rb_enc_isascii(c
, enc
) && ISPRINT(c
)) {
6430 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6431 rb_str_buf_cat_escaped_char(result
, c
, unicode_p
);
6435 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6436 ENCODING_CODERANGE_SET(result
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
6445 * Returns a printable version of +self+, enclosed in double-quotes,
6446 * and with special characters escaped:
6448 * s = "foo\tbar\tbaz\n"
6449 * # => "foo\tbar\tbaz\n"
6451 * # => "\"foo\\tbar\\tbaz\\n\""
6456 rb_str_inspect(VALUE str
)
6458 int encidx
= ENCODING_GET(str
);
6459 rb_encoding
*enc
= rb_enc_from_index(encidx
), *actenc
;
6460 const char *p
, *pend
, *prev
;
6461 char buf
[CHAR_ESC_LEN
+ 1];
6462 VALUE result
= rb_str_buf_new(0);
6463 rb_encoding
*resenc
= rb_default_internal_encoding();
6464 int unicode_p
= rb_enc_unicode_p(enc
);
6465 int asciicompat
= rb_enc_asciicompat(enc
);
6467 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
6468 if (!rb_enc_asciicompat(resenc
)) resenc
= rb_usascii_encoding();
6469 rb_enc_associate(result
, resenc
);
6470 str_buf_cat2(result
, "\"");
6472 p
= RSTRING_PTR(str
); pend
= RSTRING_END(str
);
6474 actenc
= get_actual_encoding(encidx
, str
);
6475 if (actenc
!= enc
) {
6477 if (unicode_p
) unicode_p
= rb_enc_unicode_p(enc
);
6483 n
= rb_enc_precise_mbclen(p
, pend
, enc
);
6484 if (!MBCLEN_CHARFOUND_P(n
)) {
6485 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6486 n
= rb_enc_mbminlen(enc
);
6488 n
= (int)(pend
- p
);
6490 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", *p
& 0377);
6491 str_buf_cat(result
, buf
, strlen(buf
));
6496 n
= MBCLEN_CHARFOUND_LEN(n
);
6497 c
= rb_enc_mbc_to_codepoint(p
, pend
, enc
);
6499 if ((asciicompat
|| unicode_p
) &&
6500 (c
== '"'|| c
== '\\' ||
6503 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p
,pend
,enc
)) &&
6504 (cc
= rb_enc_codepoint(p
,pend
,enc
),
6505 (cc
== '$' || cc
== '@' || cc
== '{'))))) {
6506 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6507 str_buf_cat2(result
, "\\");
6508 if (asciicompat
|| enc
== resenc
) {
6514 case '\n': cc
= 'n'; break;
6515 case '\r': cc
= 'r'; break;
6516 case '\t': cc
= 't'; break;
6517 case '\f': cc
= 'f'; break;
6518 case '\013': cc
= 'v'; break;
6519 case '\010': cc
= 'b'; break;
6520 case '\007': cc
= 'a'; break;
6521 case 033: cc
= 'e'; break;
6522 default: cc
= 0; break;
6525 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6528 str_buf_cat(result
, buf
, 2);
6532 if ((enc
== resenc
&& rb_enc_isprint(c
, enc
)) ||
6533 (asciicompat
&& rb_enc_isascii(c
, enc
) && ISPRINT(c
))) {
6537 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6538 rb_str_buf_cat_escaped_char(result
, c
, unicode_p
);
6543 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6544 str_buf_cat2(result
, "\"");
6549 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6555 * Returns a printable version of +self+, enclosed in double-quotes,
6556 * with special characters escaped, and with non-printing characters
6557 * replaced by hexadecimal notation:
6559 * "hello \n ''".dump # => "\"hello \\n ''\""
6560 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6562 * Related: String#undump (inverse of String#dump).
6567 rb_str_dump(VALUE str
)
6569 int encidx
= rb_enc_get_index(str
);
6570 rb_encoding
*enc
= rb_enc_from_index(encidx
);
6572 const char *p
, *pend
;
6575 int u8
= (encidx
== rb_utf8_encindex());
6576 static const char nonascii_suffix
[] = ".dup.force_encoding(\"%s\")";
6579 if (!rb_enc_asciicompat(enc
)) {
6580 len
+= strlen(nonascii_suffix
) - rb_strlen_lit("%s");
6581 len
+= strlen(enc
->name
);
6584 p
= RSTRING_PTR(str
); pend
= p
+ RSTRING_LEN(str
);
6587 unsigned char c
= *p
++;
6590 case '"': case '\\':
6591 case '\n': case '\r':
6592 case '\t': case '\f':
6593 case '\013': case '\010': case '\007': case '\033':
6598 clen
= IS_EVSTR(p
, pend
) ? 2 : 1;
6606 if (u8
&& c
> 0x7F) { /* \u notation */
6607 int n
= rb_enc_precise_mbclen(p
-1, pend
, enc
);
6608 if (MBCLEN_CHARFOUND_P(n
)) {
6609 unsigned int cc
= rb_enc_mbc_to_codepoint(p
-1, pend
, enc
);
6611 clen
= 6; /* \uXXXX */
6612 else if (cc
<= 0xFFFFF)
6613 clen
= 9; /* \u{XXXXX} */
6615 clen
= 10; /* \u{XXXXXX} */
6616 p
+= MBCLEN_CHARFOUND_LEN(n
)-1;
6620 clen
= 4; /* \xNN */
6625 if (clen
> LONG_MAX
- len
) {
6626 rb_raise(rb_eRuntimeError
, "string size too big");
6631 result
= rb_str_new(0, len
);
6632 p
= RSTRING_PTR(str
); pend
= p
+ RSTRING_LEN(str
);
6633 q
= RSTRING_PTR(result
); qend
= q
+ len
+ 1;
6637 unsigned char c
= *p
++;
6639 if (c
== '"' || c
== '\\') {
6643 else if (c
== '#') {
6644 if (IS_EVSTR(p
, pend
)) *q
++ = '\\';
6647 else if (c
== '\n') {
6651 else if (c
== '\r') {
6655 else if (c
== '\t') {
6659 else if (c
== '\f') {
6663 else if (c
== '\013') {
6667 else if (c
== '\010') {
6671 else if (c
== '\007') {
6675 else if (c
== '\033') {
6679 else if (ISPRINT(c
)) {
6685 int n
= rb_enc_precise_mbclen(p
-1, pend
, enc
) - 1;
6686 if (MBCLEN_CHARFOUND_P(n
)) {
6687 int cc
= rb_enc_mbc_to_codepoint(p
-1, pend
, enc
);
6690 snprintf(q
, qend
-q
, "u%04X", cc
); /* \uXXXX */
6692 snprintf(q
, qend
-q
, "u{%X}", cc
); /* \u{XXXXX} or \u{XXXXXX} */
6697 snprintf(q
, qend
-q
, "x%02X", c
);
6703 if (!rb_enc_asciicompat(enc
)) {
6704 snprintf(q
, qend
-q
, nonascii_suffix
, enc
->name
);
6705 encidx
= rb_ascii8bit_encindex();
6707 /* result from dump is ASCII */
6708 rb_enc_associate_index(result
, encidx
);
6709 ENC_CODERANGE_SET(result
, ENC_CODERANGE_7BIT
);
6714 unescape_ascii(unsigned int c
)
6734 UNREACHABLE_RETURN(-1);
6738 undump_after_backslash(VALUE undumped
, const char **ss
, const char *s_end
, rb_encoding
**penc
, bool *utf8
, bool *binary
)
6740 const char *s
= *ss
;
6744 unsigned char buf
[6];
6745 static rb_encoding
*enc_utf8
= NULL
;
6751 rb_str_cat(undumped
, s
, 1); /* cat itself */
6762 *buf
= unescape_ascii(*s
);
6763 rb_str_cat(undumped
, (char *)buf
, 1);
6768 rb_raise(rb_eRuntimeError
, "hex escape and Unicode escape are mixed");
6772 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6774 if (enc_utf8
== NULL
) enc_utf8
= rb_utf8_encoding();
6775 if (*penc
!= enc_utf8
) {
6777 rb_enc_associate(undumped
, enc_utf8
);
6779 if (*s
== '{') { /* handle \u{...} form */
6783 rb_raise(rb_eRuntimeError
, "unterminated Unicode escape");
6793 c
= scan_hex(s
, s_end
-s
, &hexlen
);
6794 if (hexlen
== 0 || hexlen
> 6) {
6795 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6798 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint (too large)");
6800 if (0xd800 <= c
&& c
<= 0xdfff) {
6801 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint");
6803 codelen
= rb_enc_mbcput(c
, (char *)buf
, *penc
);
6804 rb_str_cat(undumped
, (char *)buf
, codelen
);
6808 else { /* handle \uXXXX form */
6809 c
= scan_hex(s
, 4, &hexlen
);
6811 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6813 if (0xd800 <= c
&& c
<= 0xdfff) {
6814 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint");
6816 codelen
= rb_enc_mbcput(c
, (char *)buf
, *penc
);
6817 rb_str_cat(undumped
, (char *)buf
, codelen
);
6823 rb_raise(rb_eRuntimeError
, "hex escape and Unicode escape are mixed");
6827 rb_raise(rb_eRuntimeError
, "invalid hex escape");
6829 *buf
= scan_hex(s
, 2, &hexlen
);
6831 rb_raise(rb_eRuntimeError
, "invalid hex escape");
6833 rb_str_cat(undumped
, (char *)buf
, 1);
6837 rb_str_cat(undumped
, s
-1, 2);
6844 static VALUE
rb_str_is_ascii_only_p(VALUE str
);
6850 * Returns an unescaped version of +self+:
6852 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6853 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6854 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6855 * s_undumped == s_orig # => true
6857 * Related: String#dump (inverse of String#undump).
6862 str_undump(VALUE str
)
6864 const char *s
= RSTRING_PTR(str
);
6865 const char *s_end
= RSTRING_END(str
);
6866 rb_encoding
*enc
= rb_enc_get(str
);
6867 VALUE undumped
= rb_enc_str_new(s
, 0L, enc
);
6869 bool binary
= false;
6872 rb_must_asciicompat(str
);
6873 if (rb_str_is_ascii_only_p(str
) == Qfalse
) {
6874 rb_raise(rb_eRuntimeError
, "non-ASCII character detected");
6876 if (!str_null_check(str
, &w
)) {
6877 rb_raise(rb_eRuntimeError
, "string contains null byte");
6879 if (RSTRING_LEN(str
) < 2) goto invalid_format
;
6880 if (*s
!= '"') goto invalid_format
;
6882 /* strip '"' at the start */
6887 rb_raise(rb_eRuntimeError
, "unterminated dumped string");
6894 /* ascii compatible dumped string */
6898 static const char force_encoding_suffix
[] = ".force_encoding(\""; /* "\")" */
6899 static const char dup_suffix
[] = ".dup";
6900 const char *encname
;
6904 /* check separately for strings dumped by older versions */
6905 size
= sizeof(dup_suffix
) - 1;
6906 if (s_end
- s
> size
&& memcmp(s
, dup_suffix
, size
) == 0) s
+= size
;
6908 size
= sizeof(force_encoding_suffix
) - 1;
6909 if (s_end
- s
<= size
) goto invalid_format
;
6910 if (memcmp(s
, force_encoding_suffix
, size
) != 0) goto invalid_format
;
6914 rb_raise(rb_eRuntimeError
, "dumped string contained Unicode escape but used force_encoding");
6918 s
= memchr(s
, '"', s_end
-s
);
6920 if (!s
) goto invalid_format
;
6921 if (s_end
- s
!= 2) goto invalid_format
;
6922 if (s
[0] != '"' || s
[1] != ')') goto invalid_format
;
6924 encidx
= rb_enc_find_index2(encname
, (long)size
);
6926 rb_raise(rb_eRuntimeError
, "dumped string has unknown encoding name");
6928 rb_enc_associate_index(undumped
, encidx
);
6936 rb_raise(rb_eRuntimeError
, "invalid escape");
6938 undump_after_backslash(undumped
, &s
, s_end
, &enc
, &utf8
, &binary
);
6941 rb_str_cat(undumped
, s
++, 1);
6947 rb_raise(rb_eRuntimeError
, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6951 rb_str_check_dummy_enc(rb_encoding
*enc
)
6953 if (rb_enc_dummy_p(enc
)) {
6954 rb_raise(rb_eEncCompatError
, "incompatible encoding with this operation: %s",
6959 static rb_encoding
*
6960 str_true_enc(VALUE str
)
6962 rb_encoding
*enc
= STR_ENC_GET(str
);
6963 rb_str_check_dummy_enc(enc
);
6967 static OnigCaseFoldType
6968 check_case_options(int argc
, VALUE
*argv
, OnigCaseFoldType flags
)
6973 rb_raise(rb_eArgError
, "too many options");
6974 if (argv
[0]==sym_turkic
) {
6975 flags
|= ONIGENC_CASE_FOLD_TURKISH_AZERI
;
6977 if (argv
[1]==sym_lithuanian
)
6978 flags
|= ONIGENC_CASE_FOLD_LITHUANIAN
;
6980 rb_raise(rb_eArgError
, "invalid second option");
6983 else if (argv
[0]==sym_lithuanian
) {
6984 flags
|= ONIGENC_CASE_FOLD_LITHUANIAN
;
6986 if (argv
[1]==sym_turkic
)
6987 flags
|= ONIGENC_CASE_FOLD_TURKISH_AZERI
;
6989 rb_raise(rb_eArgError
, "invalid second option");
6993 rb_raise(rb_eArgError
, "too many options");
6994 else if (argv
[0]==sym_ascii
)
6995 flags
|= ONIGENC_CASE_ASCII_ONLY
;
6996 else if (argv
[0]==sym_fold
) {
6997 if ((flags
& (ONIGENC_CASE_UPCASE
|ONIGENC_CASE_DOWNCASE
)) == ONIGENC_CASE_DOWNCASE
)
6998 flags
^= ONIGENC_CASE_FOLD
|ONIGENC_CASE_DOWNCASE
;
7000 rb_raise(rb_eArgError
, "option :fold only allowed for downcasing");
7003 rb_raise(rb_eArgError
, "invalid option");
7008 case_option_single_p(OnigCaseFoldType flags
, rb_encoding
*enc
, VALUE str
)
7010 if ((flags
& ONIGENC_CASE_ASCII_ONLY
) && (enc
==rb_utf8_encoding() || rb_enc_mbmaxlen(enc
) == 1))
7012 return !(flags
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) && ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
;
7015 /* 16 should be long enough to absorb any kind of single character length increase; the value defined below (20) leaves some extra margin */
7016 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7017 #ifndef CASEMAP_DEBUG
7018 # define CASEMAP_DEBUG 0
7021 struct mapping_buffer
;
7022 typedef struct mapping_buffer
{
7025 struct mapping_buffer
*next
;
7026 OnigUChar space
[FLEX_ARY_LEN
];
7030 mapping_buffer_free(void *p
)
7032 mapping_buffer
*previous_buffer
;
7033 mapping_buffer
*current_buffer
= p
;
7034 while (current_buffer
) {
7035 previous_buffer
= current_buffer
;
7036 current_buffer
= current_buffer
->next
;
7037 ruby_sized_xfree(previous_buffer
, previous_buffer
->capa
);
7041 static const rb_data_type_t mapping_buffer_type
= {
7043 {0, mapping_buffer_free
,}
7047 rb_str_casemap(VALUE source
, OnigCaseFoldType
*flags
, rb_encoding
*enc
)
7051 const OnigUChar
*source_current
, *source_end
;
7052 int target_length
= 0;
7053 VALUE buffer_anchor
;
7054 mapping_buffer
*current_buffer
= 0;
7055 mapping_buffer
**pre_buffer
;
7056 size_t buffer_count
= 0;
7057 int buffer_length_or_invalid
;
7059 if (RSTRING_LEN(source
) == 0) return str_duplicate(rb_cString
, source
);
7061 source_current
= (OnigUChar
*)RSTRING_PTR(source
);
7062 source_end
= (OnigUChar
*)RSTRING_END(source
);
7064 buffer_anchor
= TypedData_Wrap_Struct(0, &mapping_buffer_type
, 0);
7065 pre_buffer
= (mapping_buffer
**)&DATA_PTR(buffer_anchor
);
7066 while (source_current
< source_end
) {
7067 /* increase multiplier using buffer count to converge quickly */
7068 size_t capa
= (size_t)(source_end
-source_current
)*++buffer_count
+ CASE_MAPPING_ADDITIONAL_LENGTH
;
7069 if (CASEMAP_DEBUG
) {
7070 fprintf(stderr
, "Buffer allocation, capa is %"PRIuSIZE
"\n", capa
); /* for tuning */
7072 current_buffer
= xmalloc(offsetof(mapping_buffer
, space
) + capa
);
7073 *pre_buffer
= current_buffer
;
7074 pre_buffer
= ¤t_buffer
->next
;
7075 current_buffer
->next
= NULL
;
7076 current_buffer
->capa
= capa
;
7077 buffer_length_or_invalid
= enc
->case_map(flags
,
7078 &source_current
, source_end
,
7079 current_buffer
->space
,
7080 current_buffer
->space
+current_buffer
->capa
,
7082 if (buffer_length_or_invalid
< 0) {
7083 current_buffer
= DATA_PTR(buffer_anchor
);
7084 DATA_PTR(buffer_anchor
) = 0;
7085 mapping_buffer_free(current_buffer
);
7086 rb_raise(rb_eArgError
, "input string invalid");
7088 target_length
+= current_buffer
->used
= buffer_length_or_invalid
;
7090 if (CASEMAP_DEBUG
) {
7091 fprintf(stderr
, "Buffer count is %"PRIuSIZE
"\n", buffer_count
); /* for tuning */
7094 if (buffer_count
==1) {
7095 target
= rb_str_new((const char*)current_buffer
->space
, target_length
);
7098 char *target_current
;
7100 target
= rb_str_new(0, target_length
);
7101 target_current
= RSTRING_PTR(target
);
7102 current_buffer
= DATA_PTR(buffer_anchor
);
7103 while (current_buffer
) {
7104 memcpy(target_current
, current_buffer
->space
, current_buffer
->used
);
7105 target_current
+= current_buffer
->used
;
7106 current_buffer
= current_buffer
->next
;
7109 current_buffer
= DATA_PTR(buffer_anchor
);
7110 DATA_PTR(buffer_anchor
) = 0;
7111 mapping_buffer_free(current_buffer
);
7113 /* TODO: check about string terminator character */
7114 str_enc_copy(target
, source
);
7115 /*ENC_CODERANGE_SET(mapped, cr);*/
7121 rb_str_ascii_casemap(VALUE source
, VALUE target
, OnigCaseFoldType
*flags
, rb_encoding
*enc
)
7123 const OnigUChar
*source_current
, *source_end
;
7124 OnigUChar
*target_current
, *target_end
;
7125 long old_length
= RSTRING_LEN(source
);
7126 int length_or_invalid
;
7128 if (old_length
== 0) return Qnil
;
7130 source_current
= (OnigUChar
*)RSTRING_PTR(source
);
7131 source_end
= (OnigUChar
*)RSTRING_END(source
);
7132 if (source
== target
) {
7133 target_current
= (OnigUChar
*)source_current
;
7134 target_end
= (OnigUChar
*)source_end
;
7137 target_current
= (OnigUChar
*)RSTRING_PTR(target
);
7138 target_end
= (OnigUChar
*)RSTRING_END(target
);
7141 length_or_invalid
= onigenc_ascii_only_case_map(flags
,
7142 &source_current
, source_end
,
7143 target_current
, target_end
, enc
);
7144 if (length_or_invalid
< 0)
7145 rb_raise(rb_eArgError
, "input string invalid");
7146 if (CASEMAP_DEBUG
&& length_or_invalid
!= old_length
) {
7147 fprintf(stderr
, "problem with rb_str_ascii_casemap"
7148 "; old_length=%ld, new_length=%d\n", old_length
, length_or_invalid
);
7149 rb_raise(rb_eArgError
, "internal problem with rb_str_ascii_casemap"
7150 "; old_length=%ld, new_length=%d\n", old_length
, length_or_invalid
);
7153 str_enc_copy(target
, source
);
7159 upcase_single(VALUE str
)
7161 char *s
= RSTRING_PTR(str
), *send
= RSTRING_END(str
);
7162 bool modified
= false;
7165 unsigned int c
= *(unsigned char*)s
;
7167 if ('a' <= c
&& c
<= 'z') {
7168 *s
= 'A' + (c
- 'a');
7178 * upcase!(*options) -> self or nil
7180 * Upcases the characters in +self+;
7181 * returns +self+ if any changes were made, +nil+ otherwise:
7183 * s = 'Hello World!' # => "Hello World!"
7184 * s.upcase! # => "HELLO WORLD!"
7185 * s # => "HELLO WORLD!"
7186 * s.upcase! # => nil
7188 * The casing may be affected by the given +options+;
7189 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7191 * Related: String#upcase, String#downcase, String#downcase!.
7196 rb_str_upcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7199 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
;
7201 flags
= check_case_options(argc
, argv
, flags
);
7202 str_modify_keep_cr(str
);
7203 enc
= str_true_enc(str
);
7204 if (case_option_single_p(flags
, enc
, str
)) {
7205 if (upcase_single(str
))
7206 flags
|= ONIGENC_CASE_MODIFIED
;
7208 else if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7209 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7211 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7213 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7220 * upcase(*options) -> string
7222 * Returns a string containing the upcased characters in +self+:
7224 * s = 'Hello World!' # => "Hello World!"
7225 * s.upcase # => "HELLO WORLD!"
7227 * The casing may be affected by the given +options+;
7228 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7230 * Related: String#upcase!, String#downcase, String#downcase!.
7235 rb_str_upcase(int argc
, VALUE
*argv
, VALUE str
)
7238 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
;
7241 flags
= check_case_options(argc
, argv
, flags
);
7242 enc
= str_true_enc(str
);
7243 if (case_option_single_p(flags
, enc
, str
)) {
7244 ret
= rb_str_new(RSTRING_PTR(str
), RSTRING_LEN(str
));
7245 str_enc_copy(ret
, str
);
7248 else if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7249 ret
= rb_str_new(0, RSTRING_LEN(str
));
7250 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7253 ret
= rb_str_casemap(str
, &flags
, enc
);
7260 downcase_single(VALUE str
)
7262 char *s
= RSTRING_PTR(str
), *send
= RSTRING_END(str
);
7263 bool modified
= false;
7266 unsigned int c
= *(unsigned char*)s
;
7268 if ('A' <= c
&& c
<= 'Z') {
7269 *s
= 'a' + (c
- 'A');
7280 * downcase!(*options) -> self or nil
7282 * Downcases the characters in +self+;
7283 * returns +self+ if any changes were made, +nil+ otherwise:
7285 * s = 'Hello World!' # => "Hello World!"
7286 * s.downcase! # => "hello world!"
7287 * s # => "hello world!"
7288 * s.downcase! # => nil
7290 * The casing may be affected by the given +options+;
7291 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7293 * Related: String#downcase, String#upcase, String#upcase!.
7298 rb_str_downcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7301 OnigCaseFoldType flags
= ONIGENC_CASE_DOWNCASE
;
7303 flags
= check_case_options(argc
, argv
, flags
);
7304 str_modify_keep_cr(str
);
7305 enc
= str_true_enc(str
);
7306 if (case_option_single_p(flags
, enc
, str
)) {
7307 if (downcase_single(str
))
7308 flags
|= ONIGENC_CASE_MODIFIED
;
7310 else if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7311 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7313 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7315 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7322 * downcase(*options) -> string
7324 * Returns a string containing the downcased characters in +self+:
7326 * s = 'Hello World!' # => "Hello World!"
7327 * s.downcase # => "hello world!"
7329 * The casing may be affected by the given +options+;
7330 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7332 * Related: String#downcase!, String#upcase, String#upcase!.
7337 rb_str_downcase(int argc
, VALUE
*argv
, VALUE str
)
7340 OnigCaseFoldType flags
= ONIGENC_CASE_DOWNCASE
;
7343 flags
= check_case_options(argc
, argv
, flags
);
7344 enc
= str_true_enc(str
);
7345 if (case_option_single_p(flags
, enc
, str
)) {
7346 ret
= rb_str_new(RSTRING_PTR(str
), RSTRING_LEN(str
));
7347 str_enc_copy(ret
, str
);
7348 downcase_single(ret
);
7350 else if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7351 ret
= rb_str_new(0, RSTRING_LEN(str
));
7352 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7355 ret
= rb_str_casemap(str
, &flags
, enc
);
7364 * capitalize!(*options) -> self or nil
7366 * Upcases the first character in +self+;
7367 * downcases the remaining characters;
7368 * returns +self+ if any changes were made, +nil+ otherwise:
7370 * s = 'hello World!' # => "hello World!"
7371 * s.capitalize! # => "Hello world!"
7372 * s # => "Hello world!"
7373 * s.capitalize! # => nil
7375 * The casing may be affected by the given +options+;
7376 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7378 * Related: String#capitalize.
7383 rb_str_capitalize_bang(int argc
, VALUE
*argv
, VALUE str
)
7386 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_TITLECASE
;
7388 flags
= check_case_options(argc
, argv
, flags
);
7389 str_modify_keep_cr(str
);
7390 enc
= str_true_enc(str
);
7391 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7392 if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7393 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7395 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7397 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7404 * capitalize(*options) -> string
7406 * Returns a string containing the characters in +self+;
7407 * the first character is upcased;
7408 * the remaining characters are downcased:
7410 * s = 'hello World!' # => "hello World!"
7411 * s.capitalize # => "Hello world!"
7413 * The casing may be affected by the given +options+;
7414 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7416 * Related: String#capitalize!.
/*
 * rb_str_capitalize (excerpt): non-destructive capitalize.
 * argc/argv carry the case-mapping options; str is the receiver.
 * The declarations of enc/ret, the else keyword and the final return are
 * not visible in this excerpt.
 */
7421 rb_str_capitalize(int argc
, VALUE
*argv
, VALUE str
)
/* Default mapping: upcase combined with titlecase. */
7424 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_TITLECASE
;
/* Fold the caller-supplied options into the default flags. */
7427 flags
= check_case_options(argc
, argv
, flags
);
7428 enc
= str_true_enc(str
);
/* Empty (or buffer-less) receiver: nothing to map.
 * NOTE(review): this returns the receiver itself, whereas the equivalent
 * guard in rb_str_swapcase returns str_duplicate(rb_cString, str).
 * Confirm whether handing back the same object from a non-bang method is
 * intended. */
7429 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return str
;
/* ASCII-only option: map into a fresh buffer of the same byte length. */
7430 if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7431 ret
= rb_str_new(0, RSTRING_LEN(str
));
7432 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
/* Otherwise (the else keyword is not visible here) the result comes from
 * rb_str_casemap. */
7435 ret
= rb_str_casemap(str
, &flags
, enc
);
7443 * swapcase!(*options) -> self or nil
7445 * Upcases each lowercase character in +self+;
7446 * downcases each uppercase character;
7447 * returns +self+ if any changes were made, +nil+ otherwise:
7449 * s = 'Hello World!' # => "Hello World!"
7450 * s.swapcase! # => "hELLO wORLD!"
7451 * s # => "hELLO wORLD!"
7452 * ''.swapcase! # => nil
7454 * The casing may be affected by the given +options+;
7455 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7457 * Related: String#swapcase.
7462 rb_str_swapcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7465 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_DOWNCASE
;
7467 flags
= check_case_options(argc
, argv
, flags
);
7468 str_modify_keep_cr(str
);
7469 enc
= str_true_enc(str
);
7470 if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7471 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7473 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7475 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7482 * swapcase(*options) -> string
7484 * Returns a string containing the characters in +self+, with cases reversed;
7485 * each uppercase character is downcased;
7486 * each lowercase character is upcased:
7488 * s = 'Hello World!' # => "Hello World!"
7489 * s.swapcase # => "hELLO wORLD!"
7491 * The casing may be affected by the given +options+;
7492 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7494 * Related: String#swapcase!.
7499 rb_str_swapcase(int argc
, VALUE
*argv
, VALUE str
)
7502 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_DOWNCASE
;
7505 flags
= check_case_options(argc
, argv
, flags
);
7506 enc
= str_true_enc(str
);
7507 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return str_duplicate(rb_cString
, str
);
7508 if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7509 ret
= rb_str_new(0, RSTRING_LEN(str
));
7510 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7513 ret
= rb_str_casemap(str
, &flags
, enc
);
7518 typedef unsigned char *USTR
;
7522 unsigned int now
, max
;
7527 trnext(struct tr
*t
, rb_encoding
*enc
)
7534 if (t
->p
== t
->pend
) return -1;
7535 if (rb_enc_ascget(t
->p
, t
->pend
, &n
, enc
) == '\\' && t
->p
+ n
< t
->pend
) {
7538 t
->now
= rb_enc_codepoint_len(t
->p
, t
->pend
, &n
, enc
);
7540 if (rb_enc_ascget(t
->p
, t
->pend
, &n
, enc
) == '-' && t
->p
+ n
< t
->pend
) {
7542 if (t
->p
< t
->pend
) {
7543 unsigned int c
= rb_enc_codepoint_len(t
->p
, t
->pend
, &n
, enc
);
7546 if (t
->now
< 0x80 && c
< 0x80) {
7547 rb_raise(rb_eArgError
,
7548 "invalid range \"%c-%c\" in string transliteration",
7552 rb_raise(rb_eArgError
, "invalid range in string transliteration");
7554 continue; /* not reached */
7563 while (ONIGENC_CODE_TO_MBCLEN(enc
, ++t
->now
) <= 0) {
7564 if (t
->now
== t
->max
) {
7569 if (t
->now
< t
->max
) {
7580 static VALUE
rb_str_delete_bang(int,VALUE
*,VALUE
);
7583 tr_trans(VALUE str
, VALUE src
, VALUE repl
, int sflag
)
7585 const unsigned int errc
= -1;
7586 unsigned int trans
[256];
7587 rb_encoding
*enc
, *e1
, *e2
;
7588 struct tr trsrc
, trrepl
;
7590 unsigned int c
, c0
, last
= 0;
7591 int modify
= 0, i
, l
;
7592 unsigned char *s
, *send
;
7594 int singlebyte
= single_byte_optimizable(str
);
7598 #define CHECK_IF_ASCII(c) \
7599 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7600 (cr = ENC_CODERANGE_VALID) : 0)
7604 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7605 if (RSTRING_LEN(repl
) == 0) {
7606 return rb_str_delete_bang(1, &src
, str
);
7609 cr
= ENC_CODERANGE(str
);
7610 e1
= rb_enc_check(str
, src
);
7611 e2
= rb_enc_check(str
, repl
);
7616 enc
= rb_enc_check(src
, repl
);
7618 trsrc
.p
= RSTRING_PTR(src
); trsrc
.pend
= trsrc
.p
+ RSTRING_LEN(src
);
7619 if (RSTRING_LEN(src
) > 1 &&
7620 rb_enc_ascget(trsrc
.p
, trsrc
.pend
, &l
, enc
) == '^' &&
7621 trsrc
.p
+ l
< trsrc
.pend
) {
7625 trrepl
.p
= RSTRING_PTR(repl
);
7626 trrepl
.pend
= trrepl
.p
+ RSTRING_LEN(repl
);
7627 trsrc
.gen
= trrepl
.gen
= 0;
7628 trsrc
.now
= trrepl
.now
= 0;
7629 trsrc
.max
= trrepl
.max
= 0;
7632 for (i
=0; i
<256; i
++) {
7635 while ((c
= trnext(&trsrc
, enc
)) != errc
) {
7640 if (!hash
) hash
= rb_hash_new();
7641 rb_hash_aset(hash
, UINT2NUM(c
), Qtrue
);
7644 while ((c
= trnext(&trrepl
, enc
)) != errc
)
7645 /* retrieve last replacer */;
7647 for (i
=0; i
<256; i
++) {
7648 if (trans
[i
] != errc
) {
7656 for (i
=0; i
<256; i
++) {
7659 while ((c
= trnext(&trsrc
, enc
)) != errc
) {
7660 r
= trnext(&trrepl
, enc
);
7661 if (r
== errc
) r
= trrepl
.now
;
7664 if (rb_enc_codelen(r
, enc
) != 1) singlebyte
= 0;
7667 if (!hash
) hash
= rb_hash_new();
7668 rb_hash_aset(hash
, UINT2NUM(c
), UINT2NUM(r
));
7673 if (cr
== ENC_CODERANGE_VALID
&& rb_enc_asciicompat(e1
))
7674 cr
= ENC_CODERANGE_7BIT
;
7675 str_modify_keep_cr(str
);
7676 s
= (unsigned char *)RSTRING_PTR(str
); send
= (unsigned char *)RSTRING_END(str
);
7677 termlen
= rb_enc_mbminlen(enc
);
7680 long offset
, max
= RSTRING_LEN(str
);
7681 unsigned int save
= -1;
7682 unsigned char *buf
= ALLOC_N(unsigned char, max
+ termlen
), *t
= buf
;
7687 c0
= c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, e1
);
7688 tlen
= enc
== e1
? clen
: rb_enc_codelen(c
, enc
);
7695 VALUE tmp
= rb_hash_lookup(hash
, UINT2NUM(c
));
7697 if (cflag
) c
= last
;
7700 else if (cflag
) c
= errc
;
7701 else c
= NUM2INT(tmp
);
7706 if (c
!= (unsigned int)-1) {
7712 tlen
= rb_enc_codelen(c
, enc
);
7718 if (enc
!= e1
) may_modify
= 1;
7720 if ((offset
= t
- buf
) + tlen
> max
) {
7721 size_t MAYBE_UNUSED(old
) = max
+ termlen
;
7722 max
= offset
+ tlen
+ (send
- s
);
7723 SIZED_REALLOC_N(buf
, unsigned char, max
+ termlen
, old
);
7726 rb_enc_mbcput(c
, t
, enc
);
7727 if (may_modify
&& memcmp(s
, t
, tlen
) != 0) {
7733 if (!STR_EMBED_P(str
)) {
7734 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
7736 TERM_FILL((char *)t
, termlen
);
7737 RSTRING(str
)->as
.heap
.ptr
= (char *)buf
;
7738 RSTRING(str
)->as
.heap
.len
= t
- buf
;
7739 STR_SET_NOEMBED(str
);
7740 RSTRING(str
)->as
.heap
.aux
.capa
= max
;
7742 else if (rb_enc_mbmaxlen(enc
) == 1 || (singlebyte
&& !hash
)) {
7744 c
= (unsigned char)*s
;
7745 if (trans
[c
] != errc
) {
7762 long offset
, max
= (long)((send
- s
) * 1.2);
7763 unsigned char *buf
= ALLOC_N(unsigned char, max
+ termlen
), *t
= buf
;
7767 c0
= c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, e1
);
7768 tlen
= enc
== e1
? clen
: rb_enc_codelen(c
, enc
);
7774 VALUE tmp
= rb_hash_lookup(hash
, UINT2NUM(c
));
7776 if (cflag
) c
= last
;
7779 else if (cflag
) c
= errc
;
7780 else c
= NUM2INT(tmp
);
7783 c
= cflag
? last
: errc
;
7786 tlen
= rb_enc_codelen(c
, enc
);
7791 if (enc
!= e1
) may_modify
= 1;
7793 if ((offset
= t
- buf
) + tlen
> max
) {
7794 size_t MAYBE_UNUSED(old
) = max
+ termlen
;
7795 max
= offset
+ tlen
+ (long)((send
- s
) * 1.2);
7796 SIZED_REALLOC_N(buf
, unsigned char, max
+ termlen
, old
);
7800 rb_enc_mbcput(c
, t
, enc
);
7801 if (may_modify
&& memcmp(s
, t
, tlen
) != 0) {
7809 if (!STR_EMBED_P(str
)) {
7810 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
7812 TERM_FILL((char *)t
, termlen
);
7813 RSTRING(str
)->as
.heap
.ptr
= (char *)buf
;
7814 RSTRING(str
)->as
.heap
.len
= t
- buf
;
7815 STR_SET_NOEMBED(str
);
7816 RSTRING(str
)->as
.heap
.aux
.capa
= max
;
7820 if (cr
!= ENC_CODERANGE_BROKEN
)
7821 ENC_CODERANGE_SET(str
, cr
);
7822 rb_enc_associate(str
, enc
);
7831 * str.tr!(from_str, to_str) -> str or nil
7833 * Translates <i>str</i> in place, using the same rules as
7834 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
/*
 * rb_str_tr_bang: in-place tr. Delegates to tr_trans with sflag = 0 and
 * returns whatever tr_trans returns.
 */
7839 rb_str_tr_bang(VALUE str
, VALUE src
, VALUE repl
)
7841 return tr_trans(str
, src
, repl
, 0);
7847 * str.tr(from_str, to_str) -> new_str
7849 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7850 * corresponding characters in +to_str+. If +to_str+ is shorter than
7851 * +from_str+, it is padded with its last character in order to maintain the
7854 * "hello".tr('el', 'ip') #=> "hippo"
7855 * "hello".tr('aeiou', '*') #=> "h*ll*"
7856 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7858 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7859 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7860 * all characters except those listed.
7862 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7863 * "hello".tr('^aeiou', '*') #=> "*e**o"
7865 * The backslash character <code>\\</code> can be used to escape
7866 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7867 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7869 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7870 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7872 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7873 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7874 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7876 * "X['\\b']".tr("X\\", "") #=> "['b']"
7877 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
/*
 * rb_str_tr (excerpt): non-destructive tr. Works on a duplicate of the
 * receiver so the original string is left untouched. The return statement
 * is not visible in this excerpt.
 */
7881 rb_str_tr(VALUE str
, VALUE src
, VALUE repl
)
/* Rebind str to a copy created via str_duplicate(rb_cString, ...). */
7883 str
= str_duplicate(rb_cString
, str
);
/* Translate the copy with sflag = 0; tr_trans's result is not used here. */
7884 tr_trans(str
, src
, repl
, 0);
7888 #define TR_TABLE_MAX (UCHAR_MAX+1)
7889 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7891 tr_setup_table(VALUE str
, char stable
[TR_TABLE_SIZE
], int first
,
7892 VALUE
*tablep
, VALUE
*ctablep
, rb_encoding
*enc
)
7894 const unsigned int errc
= -1;
7895 char buf
[TR_TABLE_MAX
];
7898 VALUE table
= 0, ptable
= 0;
7899 int i
, l
, cflag
= 0;
7901 tr
.p
= RSTRING_PTR(str
); tr
.pend
= tr
.p
+ RSTRING_LEN(str
);
7902 tr
.gen
= tr
.now
= tr
.max
= 0;
7904 if (RSTRING_LEN(str
) > 1 && rb_enc_ascget(tr
.p
, tr
.pend
, &l
, enc
) == '^') {
7909 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7912 stable
[TR_TABLE_MAX
] = cflag
;
7914 else if (stable
[TR_TABLE_MAX
] && !cflag
) {
7915 stable
[TR_TABLE_MAX
] = 0;
7917 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7921 while ((c
= trnext(&tr
, enc
)) != errc
) {
7922 if (c
< TR_TABLE_MAX
) {
7923 buf
[(unsigned char)c
] = !cflag
;
7926 VALUE key
= UINT2NUM(c
);
7928 if (!table
&& (first
|| *tablep
|| stable
[TR_TABLE_MAX
])) {
7931 table
= ptable
? ptable
: rb_hash_new();
7935 table
= rb_hash_new();
7940 if (table
&& (!ptable
|| (cflag
^ !NIL_P(rb_hash_aref(ptable
, key
))))) {
7941 rb_hash_aset(table
, key
, Qtrue
);
7945 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7946 stable
[i
] = stable
[i
] && buf
[i
];
7948 if (!table
&& !cflag
) {
/*
 * tr_find (excerpt): tests whether code point c is selected by a
 * character set given as a byte table plus the optional del/nodel hashes.
 * The bodies of the two hash branches are not visible in this excerpt.
 */
7955 tr_find(unsigned int c
, const char table
[TR_TABLE_SIZE
], VALUE del
, VALUE nodel
)
/* Code points below TR_TABLE_MAX are answered directly from the table. */
7957 if (c
< TR_TABLE_MAX
) {
7958 return table
[c
] != 0;
/* Larger code points are boxed so they can be used as hash keys. */
7961 VALUE v
= UINT2NUM(c
);
/* Branch taken when c is present in del and either nodel is unset or c
 * is absent from nodel. */
7964 if (!NIL_P(rb_hash_lookup(del
, v
)) &&
7965 (!nodel
|| NIL_P(rb_hash_lookup(nodel
, v
)))) {
/* Branch taken when nodel is set and contains c. */
7969 else if (nodel
&& !NIL_P(rb_hash_lookup(nodel
, v
))) {
/* Otherwise the answer comes from the extra slot at index TR_TABLE_MAX,
 * normalised to TRUE/FALSE. */
7972 return table
[TR_TABLE_MAX
] ? TRUE
: FALSE
;
7978 * str.delete!([other_str]+) -> str or nil
7980 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7981 * <code>nil</code> if <i>str</i> was not modified.
7985 rb_str_delete_bang(int argc
, VALUE
*argv
, VALUE str
)
7987 char squeez
[TR_TABLE_SIZE
];
7988 rb_encoding
*enc
= 0;
7990 VALUE del
= 0, nodel
= 0;
7992 int i
, ascompat
, cr
;
7994 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7995 rb_check_arity(argc
, 1, UNLIMITED_ARGUMENTS
);
7996 for (i
=0; i
<argc
; i
++) {
8000 enc
= rb_enc_check(str
, s
);
8001 tr_setup_table(s
, squeez
, i
==0, &del
, &nodel
, enc
);
8004 str_modify_keep_cr(str
);
8005 ascompat
= rb_enc_asciicompat(enc
);
8006 s
= t
= RSTRING_PTR(str
);
8007 send
= RSTRING_END(str
);
8008 cr
= ascompat
? ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
;
8013 if (ascompat
&& (c
= *(unsigned char*)s
) < 0x80) {
8024 c
= rb_enc_codepoint_len(s
, send
, &clen
, enc
);
8026 if (tr_find(c
, squeez
, del
, nodel
)) {
8030 if (t
!= s
) rb_enc_mbcput(c
, t
, enc
);
8032 if (cr
== ENC_CODERANGE_7BIT
) cr
= ENC_CODERANGE_VALID
;
8037 TERM_FILL(t
, TERM_LEN(str
));
8038 STR_SET_LEN(str
, t
- RSTRING_PTR(str
));
8039 ENC_CODERANGE_SET(str
, cr
);
8041 if (modify
) return str
;
8048 * str.delete([other_str]+) -> new_str
8050 * Returns a copy of <i>str</i> with all characters in the intersection of its
8051 * arguments deleted. Uses the same rules for building the set of characters as
8054 * "hello".delete "l","lo" #=> "heo"
8055 * "hello".delete "lo" #=> "he"
8056 * "hello".delete "aeiou", "^e" #=> "hell"
8057 * "hello".delete "ej-m" #=> "ho"
/*
 * rb_str_delete (excerpt): non-destructive delete. Runs
 * rb_str_delete_bang on a duplicate of the receiver. The return statement
 * is not visible in this excerpt.
 */
8061 rb_str_delete(int argc
, VALUE
*argv
, VALUE str
)
/* Rebind str to a copy so the caller's string is not modified. */
8063 str
= str_duplicate(rb_cString
, str
);
/* The result of the bang variant is not used here. */
8064 rb_str_delete_bang(argc
, argv
, str
);
8071 * str.squeeze!([other_str]*) -> str or nil
8073 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8074 * <code>nil</code> if no changes were made.
8078 rb_str_squeeze_bang(int argc
, VALUE
*argv
, VALUE str
)
8080 char squeez
[TR_TABLE_SIZE
];
8081 rb_encoding
*enc
= 0;
8082 VALUE del
= 0, nodel
= 0;
8083 unsigned char *s
, *send
, *t
;
8085 int ascompat
, singlebyte
= single_byte_optimizable(str
);
8089 enc
= STR_ENC_GET(str
);
8092 for (i
=0; i
<argc
; i
++) {
8096 enc
= rb_enc_check(str
, s
);
8097 if (singlebyte
&& !single_byte_optimizable(s
))
8099 tr_setup_table(s
, squeez
, i
==0, &del
, &nodel
, enc
);
8103 str_modify_keep_cr(str
);
8104 s
= t
= (unsigned char *)RSTRING_PTR(str
);
8105 if (!s
|| RSTRING_LEN(str
) == 0) return Qnil
;
8106 send
= (unsigned char *)RSTRING_END(str
);
8108 ascompat
= rb_enc_asciicompat(enc
);
8112 unsigned int c
= *s
++;
8113 if (c
!= save
|| (argc
> 0 && !squeez
[c
])) {
8123 if (ascompat
&& (c
= *s
) < 0x80) {
8124 if (c
!= save
|| (argc
> 0 && !squeez
[c
])) {
8130 c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, enc
);
8132 if (c
!= save
|| (argc
> 0 && !tr_find(c
, squeez
, del
, nodel
))) {
8133 if (t
!= s
) rb_enc_mbcput(c
, t
, enc
);
8142 TERM_FILL((char *)t
, TERM_LEN(str
));
8143 if ((char *)t
- RSTRING_PTR(str
) != RSTRING_LEN(str
)) {
8144 STR_SET_LEN(str
, (char *)t
- RSTRING_PTR(str
));
8148 if (modify
) return str
;
8155 * str.squeeze([other_str]*) -> new_str
8157 * Builds a set of characters from the <i>other_str</i> parameter(s)
8158 * using the procedure described for String#count. Returns a new
8159 * string where runs of the same character that occur in this set are
8160 * replaced by a single character. If no arguments are given, all
8161 * runs of identical characters are replaced by a single character.
8163 * "yellow moon".squeeze #=> "yelow mon"
8164 * " now is the".squeeze(" ") #=> " now is the"
8165 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
/*
 * rb_str_squeeze (excerpt): non-destructive squeeze. Runs
 * rb_str_squeeze_bang on a duplicate of the receiver. The return statement
 * is not visible in this excerpt.
 */
8169 rb_str_squeeze(int argc
, VALUE
*argv
, VALUE str
)
/* Rebind str to a copy so the caller's string is not modified. */
8171 str
= str_duplicate(rb_cString
, str
);
/* The result of the bang variant is not used here. */
8172 rb_str_squeeze_bang(argc
, argv
, str
);
8179 * str.tr_s!(from_str, to_str) -> str or nil
8181 * Performs String#tr_s processing on <i>str</i> in place,
8182 * returning <i>str</i>, or <code>nil</code> if no changes were made.
/*
 * rb_str_tr_s_bang: in-place tr_s. Delegates to tr_trans with sflag = 1
 * (rb_str_tr_bang passes 0) and returns whatever tr_trans returns.
 */
8186 rb_str_tr_s_bang(VALUE str
, VALUE src
, VALUE repl
)
8188 return tr_trans(str
, src
, repl
, 1);
8194 * str.tr_s(from_str, to_str) -> new_str
8196 * Processes a copy of <i>str</i> as described under String#tr, then
8197 * removes duplicate characters in regions that were affected by the
8200 * "hello".tr_s('l', 'r') #=> "hero"
8201 * "hello".tr_s('el', '*') #=> "h*o"
8202 * "hello".tr_s('el', 'hx') #=> "hhxo"
/*
 * rb_str_tr_s (excerpt): non-destructive tr_s. Runs tr_trans with
 * sflag = 1 on a duplicate of the receiver. The return statement is not
 * visible in this excerpt.
 */
8206 rb_str_tr_s(VALUE str
, VALUE src
, VALUE repl
)
/* Rebind str to a copy so the caller's string is not modified. */
8208 str
= str_duplicate(rb_cString
, str
);
/* tr_trans's result is not used here. */
8209 tr_trans(str
, src
, repl
, 1);
8216 * str.count([other_str]+) -> integer
8218 * Each +other_str+ parameter defines a set of characters to count. The
8219 * intersection of these sets defines the characters to count in +str+. Any
8220 * +other_str+ that starts with a caret <code>^</code> is negated. The
8221 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8222 * backslash character <code>\\</code> can be used to escape <code>^</code> or
8223 * <code>-</code> and is otherwise ignored unless it appears at the end of a
8224 * sequence or the end of an +other_str+.
8227 * a.count "lo" #=> 5
8228 * a.count "lo", "o" #=> 2
8229 * a.count "hello", "^l" #=> 4
8230 * a.count "ej-m" #=> 4
8232 * "hello^world".count "\\^aeiou" #=> 4
8233 * "hello-world".count "a\\-eo" #=> 4
8235 * c = "hello world\\r\\n"
8236 * c.count "\\" #=> 2
8237 * c.count "\\A" #=> 0
8238 * c.count "X-\\w" #=> 3
8242 rb_str_count(int argc
, VALUE
*argv
, VALUE str
)
8244 char table
[TR_TABLE_SIZE
];
8245 rb_encoding
*enc
= 0;
8246 VALUE del
= 0, nodel
= 0, tstr
;
8252 rb_check_arity(argc
, 1, UNLIMITED_ARGUMENTS
);
8256 enc
= rb_enc_check(str
, tstr
);
8259 if (RSTRING_LEN(tstr
) == 1 && rb_enc_asciicompat(enc
) &&
8260 (ptstr
= RSTRING_PTR(tstr
),
8261 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc
, (const unsigned char *)ptstr
, (const unsigned char *)ptstr
+1)) &&
8262 !is_broken_string(str
)) {
8264 unsigned char c
= rb_enc_codepoint_len(ptstr
, ptstr
+1, &clen
, enc
);
8266 s
= RSTRING_PTR(str
);
8267 if (!s
|| RSTRING_LEN(str
) == 0) return INT2FIX(0);
8268 send
= RSTRING_END(str
);
8270 if (*(unsigned char*)s
++ == c
) n
++;
8272 return SIZET2NUM(n
);
8276 tr_setup_table(tstr
, table
, TRUE
, &del
, &nodel
, enc
);
8277 for (i
=1; i
<argc
; i
++) {
8280 enc
= rb_enc_check(str
, tstr
);
8281 tr_setup_table(tstr
, table
, FALSE
, &del
, &nodel
, enc
);
8284 s
= RSTRING_PTR(str
);
8285 if (!s
|| RSTRING_LEN(str
) == 0) return INT2FIX(0);
8286 send
= RSTRING_END(str
);
8287 ascompat
= rb_enc_asciicompat(enc
);
8291 if (ascompat
&& (c
= *(unsigned char*)s
) < 0x80) {
8299 c
= rb_enc_codepoint_len(s
, send
, &clen
, enc
);
8300 if (tr_find(c
, table
, del
, nodel
)) {
8307 return SIZET2NUM(n
);
/*
 * rb_fs_check (excerpt): validates a field-separator value.
 * nil, String and Regexp values skip the conversion below; anything else
 * is passed through rb_check_string_type, and 0 is returned when that
 * yields nil. The final return is not visible in this excerpt.
 */
8311 rb_fs_check(VALUE val
)
8313 if (!NIL_P(val
) && !RB_TYPE_P(val
, T_STRING
) && !RB_TYPE_P(val
, T_REGEXP
)) {
/* Attempt a string conversion of the foreign object. */
8314 val
= rb_check_string_type(val
);
/* Not convertible: report failure with 0. */
8315 if (NIL_P(val
)) return 0;
8320 static const char isspacetable
[256] = {
8321 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
8322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8323 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8324 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8331 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8333 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8334 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8335 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8336 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
8339 #define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8342 split_string(VALUE result
, VALUE str
, long beg
, long len
, long empty_count
)
8344 if (empty_count
>= 0 && len
== 0) {
8345 return empty_count
+ 1;
8347 if (empty_count
> 0) {
8348 /* make different substrings */
8351 rb_ary_push(result
, str_new_empty_String(str
));
8352 } while (--empty_count
> 0);
8356 rb_yield(str_new_empty_String(str
));
8357 } while (--empty_count
> 0);
8360 str
= rb_str_subseq(str
, beg
, len
);
8362 rb_ary_push(result
, str
);
8371 SPLIT_TYPE_AWK
, SPLIT_TYPE_STRING
, SPLIT_TYPE_REGEXP
, SPLIT_TYPE_CHARS
8375 literal_split_pattern(VALUE spat
, split_type_t default_type
)
8377 rb_encoding
*enc
= STR_ENC_GET(spat
);
8380 RSTRING_GETMEM(spat
, ptr
, len
);
8382 /* Special case - split into chars */
8383 return SPLIT_TYPE_CHARS
;
8385 else if (rb_enc_asciicompat(enc
)) {
8386 if (len
== 1 && ptr
[0] == ' ') {
8387 return SPLIT_TYPE_AWK
;
8392 if (rb_enc_ascget(ptr
, ptr
+ len
, &l
, enc
) == ' ' && len
== l
) {
8393 return SPLIT_TYPE_AWK
;
8396 return default_type
;
8401 * str.split(pattern=nil, [limit]) -> an_array
8402 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8404 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8405 * of these substrings.
8407 * If <i>pattern</i> is a String, then its contents are used as
8408 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8409 * space, <i>str</i> is split on whitespace, with leading and trailing
8410 * whitespace and runs of contiguous whitespace characters ignored.
8412 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8413 * pattern matches. Whenever the pattern matches a zero-length string,
8414 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8415 * groups, the respective matches will be returned in the array as well.
8417 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8418 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8419 * split on whitespace as if ' ' were specified.
8421 * If the <i>limit</i> parameter is omitted, trailing null fields are
8422 * suppressed. If <i>limit</i> is a positive number, at most that number
8423 * of split substrings will be returned (captured groups will be returned
8424 * as well, but are not counted towards the limit).
8425 * If <i>limit</i> is <code>1</code>, the entire
8426 * string is returned as the only entry in an array. If negative, there is no
8427 * limit to the number of fields returned, and trailing null fields are not
8430 * When the input +str+ is empty, an empty Array is returned, as the string is
8431 * considered to have no fields to split.
8433 * " now's the time ".split #=> ["now's", "the", "time"]
8434 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
8435 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8436 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8437 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8438 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8439 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8441 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8442 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8443 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8444 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8446 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8448 * "".split(',', -1) #=> []
8450 * If a block is given, invoke the block with each split substring.
8455 rb_str_split_m(int argc
, VALUE
*argv
, VALUE str
)
8460 split_type_t split_type
;
8461 long beg
, end
, i
= 0, empty_count
= -1;
8465 result
= rb_block_given_p() ? Qfalse
: Qnil
;
8466 if (rb_scan_args(argc
, argv
, "02", &spat
, &limit
) == 2) {
8467 lim
= NUM2INT(limit
);
8468 if (lim
<= 0) limit
= Qnil
;
8469 else if (lim
== 1) {
8470 if (RSTRING_LEN(str
) == 0)
8471 return result
? rb_ary_new2(0) : str
;
8472 tmp
= str_duplicate(rb_cString
, str
);
8477 return rb_ary_new3(1, tmp
);
8481 if (NIL_P(limit
) && !lim
) empty_count
= 0;
8483 enc
= STR_ENC_GET(str
);
8484 split_type
= SPLIT_TYPE_REGEXP
;
8486 spat
= get_pat_quoted(spat
, 0);
8488 else if (NIL_P(spat
= rb_fs
)) {
8489 split_type
= SPLIT_TYPE_AWK
;
8491 else if (!(spat
= rb_fs_check(spat
))) {
8492 rb_raise(rb_eTypeError
, "value of $; must be String or Regexp");
8495 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "$; is set to non-nil value");
8497 if (split_type
!= SPLIT_TYPE_AWK
) {
8498 switch (BUILTIN_TYPE(spat
)) {
8500 rb_reg_options(spat
); /* check if uninitialized */
8501 tmp
= RREGEXP_SRC(spat
);
8502 split_type
= literal_split_pattern(tmp
, SPLIT_TYPE_REGEXP
);
8503 if (split_type
== SPLIT_TYPE_AWK
) {
8505 split_type
= SPLIT_TYPE_STRING
;
8510 mustnot_broken(spat
);
8511 split_type
= literal_split_pattern(spat
, SPLIT_TYPE_STRING
);
8515 UNREACHABLE_RETURN(Qnil
);
8519 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8521 if (result
) result
= rb_ary_new();
8523 char *ptr
= RSTRING_PTR(str
);
8524 char *eptr
= RSTRING_END(str
);
8525 if (split_type
== SPLIT_TYPE_AWK
) {
8531 if (is_ascii_string(str
)) {
8532 while (ptr
< eptr
) {
8533 c
= (unsigned char)*ptr
++;
8535 if (ascii_isspace(c
)) {
8541 if (!NIL_P(limit
) && lim
<= i
) break;
8544 else if (ascii_isspace(c
)) {
8545 SPLIT_STR(beg
, end
-beg
);
8548 if (!NIL_P(limit
)) ++i
;
8556 while (ptr
< eptr
) {
8559 c
= rb_enc_codepoint_len(ptr
, eptr
, &n
, enc
);
8562 if (rb_isspace(c
)) {
8568 if (!NIL_P(limit
) && lim
<= i
) break;
8571 else if (rb_isspace(c
)) {
8572 SPLIT_STR(beg
, end
-beg
);
8575 if (!NIL_P(limit
)) ++i
;
8583 else if (split_type
== SPLIT_TYPE_STRING
) {
8584 char *str_start
= ptr
;
8585 char *substr_start
= ptr
;
8586 char *sptr
= RSTRING_PTR(spat
);
8587 long slen
= RSTRING_LEN(spat
);
8589 mustnot_broken(str
);
8590 enc
= rb_enc_check(str
, spat
);
8591 while (ptr
< eptr
&&
8592 (end
= rb_memsearch(sptr
, slen
, ptr
, eptr
- ptr
, enc
)) >= 0) {
8593 /* Check we are at the start of a char */
8594 char *t
= rb_enc_right_char_head(ptr
, ptr
+ end
, eptr
, enc
);
8595 if (t
!= ptr
+ end
) {
8599 SPLIT_STR(substr_start
- str_start
, (ptr
+end
) - substr_start
);
8602 if (!NIL_P(limit
) && lim
<= ++i
) break;
8604 beg
= ptr
- str_start
;
8606 else if (split_type
== SPLIT_TYPE_CHARS
) {
8607 char *str_start
= ptr
;
8610 mustnot_broken(str
);
8611 enc
= rb_enc_get(str
);
8612 while (ptr
< eptr
&&
8613 (n
= rb_enc_precise_mbclen(ptr
, eptr
, enc
)) > 0) {
8614 SPLIT_STR(ptr
- str_start
, n
);
8616 if (!NIL_P(limit
) && lim
<= ++i
) break;
8618 beg
= ptr
- str_start
;
8621 long len
= RSTRING_LEN(str
);
8625 struct re_registers
*regs
;
8628 for (; rb_reg_search(spat
, str
, start
, 0) >= 0;
8629 (match
? (rb_match_unbusy(match
), rb_backref_set(match
)) : (void)0)) {
8630 match
= rb_backref_get();
8631 if (!result
) rb_match_busy(match
);
8632 regs
= RMATCH_REGS(match
);
8634 if (start
== end
&& BEG(0) == END(0)) {
8639 else if (last_null
== 1) {
8640 SPLIT_STR(beg
, rb_enc_fast_mbclen(ptr
+beg
, eptr
, enc
));
8647 start
+= rb_enc_fast_mbclen(ptr
+start
,eptr
,enc
);
8653 SPLIT_STR(beg
, end
-beg
);
8654 beg
= start
= END(0);
8658 for (idx
=1; idx
< regs
->num_regs
; idx
++) {
8659 if (BEG(idx
) == -1) continue;
8660 SPLIT_STR(BEG(idx
), END(idx
)-BEG(idx
));
8662 if (!NIL_P(limit
) && lim
<= ++i
) break;
8664 if (match
) rb_match_unbusy(match
);
8666 if (RSTRING_LEN(str
) > 0 && (!NIL_P(limit
) || RSTRING_LEN(str
) > beg
|| lim
< 0)) {
8667 SPLIT_STR(beg
, RSTRING_LEN(str
)-beg
);
8670 return result
? result
: str
;
/*
 * rb_str_split (excerpt): convenience wrapper that splits str on a C
 * string separator. The declaration of sep is not visible in this excerpt.
 */
8674 rb_str_split(VALUE str
, const char *sep0
)
/* Wrap the C string in a Ruby String so it can be passed as an argument. */
8679 sep
= rb_str_new_cstr(sep0
);
/* Call rb_str_split_m with argc = 1 and sep as the only argument. */
8680 return rb_str_split_m(1, &sep
, str
);
8683 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8686 enumerator_element(VALUE ary
, VALUE e
)
8689 rb_ary_push(ary
, e
);
8698 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8701 chomp_newline(const char *p
, const char *e
, rb_encoding
*enc
)
8703 const char *prev
= rb_enc_prev_char(p
, e
, e
, enc
);
8704 if (rb_enc_is_newline(prev
, e
, enc
)) {
8706 prev
= rb_enc_prev_char(p
, e
, e
, enc
);
8707 if (prev
&& rb_enc_ascget(prev
, e
, NULL
, enc
) == '\r')
8718 (!RB_TYPE_P(rs
, T_STRING
) ||
8719 RSTRING_LEN(rs
) != 1 ||
8720 RSTRING_PTR(rs
)[0] != '\n')) {
8721 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "$/ is set to non-default value");
8726 #define rb_rs get_rs()
8729 rb_str_enumerate_lines(int argc
, VALUE
*argv
, VALUE str
, VALUE ary
)
8732 VALUE line
, rs
, orig
= str
, opts
= Qnil
, chomp
= Qfalse
;
8733 const char *ptr
, *pend
, *subptr
, *subend
, *rsptr
, *hit
, *adjusted
;
8734 long pos
, len
, rslen
;
8737 if (rb_scan_args(argc
, argv
, "01:", &rs
, &opts
) == 0)
8740 static ID keywords
[1];
8742 keywords
[0] = rb_intern_const("chomp");
8744 rb_get_kwargs(opts
, keywords
, 0, 1, &chomp
);
8745 chomp
= (chomp
!= Qundef
&& RTEST(chomp
));
8749 if (!ENUM_ELEM(ary
, str
)) {
8757 if (!RSTRING_LEN(str
)) goto end
;
8758 str
= rb_str_new_frozen(str
);
8759 ptr
= subptr
= RSTRING_PTR(str
);
8760 pend
= RSTRING_END(str
);
8761 len
= RSTRING_LEN(str
);
8763 rslen
= RSTRING_LEN(rs
);
8765 if (rs
== rb_default_rs
)
8766 enc
= rb_enc_get(str
);
8768 enc
= rb_enc_check(str
, rs
);
8771 /* paragraph mode */
8773 const char *eol
= NULL
;
8775 while (subend
< pend
) {
8777 if (rb_enc_ascget(subend
, pend
, &n
, enc
) != '\r')
8779 rslen
= n
+ rb_enc_mbclen(subend
+ n
, pend
, enc
);
8780 if (rb_enc_is_newline(subend
+ n
, pend
, enc
)) {
8781 if (eol
== subend
) break;
8783 if (subptr
) eol
= subend
;
8786 if (!subptr
) subptr
= subend
;
8790 } while (subend
< pend
);
8792 line
= rb_str_subseq(str
, subptr
- ptr
,
8793 subend
- subptr
+ (chomp
? 0 : rslen
));
8794 if (ENUM_ELEM(ary
, line
)) {
8795 str_mod_check(str
, ptr
, len
);
8797 subptr
= eol
= NULL
;
8802 rsptr
= RSTRING_PTR(rs
);
8803 if (RSTRING_LEN(rs
) == rb_enc_mbminlen(enc
) &&
8804 rb_enc_is_newline(rsptr
, rsptr
+ RSTRING_LEN(rs
), enc
)) {
8809 if ((rs
== rb_default_rs
) && !rb_enc_asciicompat(enc
)) {
8810 rs
= rb_str_new(rsptr
, rslen
);
8811 rs
= rb_str_encode(rs
, rb_enc_from_encoding(enc
), 0, Qnil
);
8812 rsptr
= RSTRING_PTR(rs
);
8813 rslen
= RSTRING_LEN(rs
);
8816 while (subptr
< pend
) {
8817 pos
= rb_memsearch(rsptr
, rslen
, subptr
, pend
- subptr
, enc
);
8820 adjusted
= rb_enc_right_char_head(subptr
, hit
, pend
, enc
);
8821 if (hit
!= adjusted
) {
8825 subend
= hit
+= rslen
;
8828 subend
= chomp_newline(subptr
, subend
, enc
);
8834 line
= rb_str_subseq(str
, subptr
- ptr
, subend
- subptr
);
8835 if (ENUM_ELEM(ary
, line
)) {
8836 str_mod_check(str
, ptr
, len
);
8841 if (subptr
!= pend
) {
8844 pend
= chomp_newline(subptr
, pend
, enc
);
8846 else if (pend
- subptr
>= rslen
&&
8847 memcmp(pend
- rslen
, rsptr
, rslen
) == 0) {
8851 line
= rb_str_subseq(str
, subptr
- ptr
, pend
- subptr
);
8852 ENUM_ELEM(ary
, line
);
8865 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8866 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8868 * Splits <i>str</i> using the supplied parameter as the record
8869 * separator (<code>$/</code> by default), passing each substring in
8870 * turn to the supplied block. If a zero-length record separator is
8871 * supplied, the string is split into paragraphs delimited by
8872 * multiple successive newlines.
8874 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8877 * If no block is given, an enumerator is returned instead.
8879 * "hello\nworld".each_line {|s| p s}
8884 * "hello\nworld".each_line('l') {|s| p s}
8891 * "hello\n\n\nworld".each_line('') {|s| p s}
8896 * "hello\nworld".each_line(chomp: true) {|s| p s}
8901 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8911 rb_str_each_line(int argc
, VALUE
*argv
, VALUE str
)
8913 RETURN_SIZED_ENUMERATOR(str
, argc
, argv
, 0);
8914 return rb_str_enumerate_lines(argc
, argv
, str
, 0);
8919 * str.lines(separator=$/, chomp: false) -> an_array
8921 * Returns an array of lines in <i>str</i> split using the supplied
8922 * record separator (<code>$/</code> by default). This is a
8923 * shorthand for <code>str.each_line(separator, chomp: chomp).to_a</code>.
8925 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8928 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8929 * "hello  world".lines(' ') #=> ["hello ", " ", "world"]
8930 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8932 * If a block is given, which is a deprecated form, works the same as
8933 * <code>each_line</code>.
8937 rb_str_lines(int argc
, VALUE
*argv
, VALUE str
)
8939 VALUE ary
= WANTARRAY("lines", 0);
8940 return rb_str_enumerate_lines(argc
, argv
, str
, ary
);
8944 rb_str_each_byte_size(VALUE str
, VALUE args
, VALUE eobj
)
8946 return LONG2FIX(RSTRING_LEN(str
));
8950 rb_str_enumerate_bytes(VALUE str
, VALUE ary
)
8954 for (i
=0; i
<RSTRING_LEN(str
); i
++) {
8955 ENUM_ELEM(ary
, INT2FIX((unsigned char)RSTRING_PTR(str
)[i
]));
8965 * str.each_byte {|integer| block } -> str
8966 * str.each_byte -> an_enumerator
8968 * Passes each byte in <i>str</i> to the given block, or returns an
8969 * enumerator if no block is given.
8971 * "hello".each_byte {|c| print c, ' ' }
8973 * <em>produces:</em>
8975 * 104 101 108 108 111
8979 rb_str_each_byte(VALUE str
)
8981 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_byte_size
);
8982 return rb_str_enumerate_bytes(str
, 0);
8987 * str.bytes -> an_array
8989 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8990 * <code>str.each_byte.to_a</code>.
8992 * If a block is given, which is a deprecated form, works the same as
8993 * <code>each_byte</code>.
8997 rb_str_bytes(VALUE str
)
8999 VALUE ary
= WANTARRAY("bytes", RSTRING_LEN(str
));
9000 return rb_str_enumerate_bytes(str
, ary
);
9004 rb_str_each_char_size(VALUE str
, VALUE args
, VALUE eobj
)
9006 return rb_str_length(str
);
9010 rb_str_enumerate_chars(VALUE str
, VALUE ary
)
9017 str
= rb_str_new_frozen(str
);
9018 ptr
= RSTRING_PTR(str
);
9019 len
= RSTRING_LEN(str
);
9020 enc
= rb_enc_get(str
);
9022 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str
))) {
9023 for (i
= 0; i
< len
; i
+= n
) {
9024 n
= rb_enc_fast_mbclen(ptr
+ i
, ptr
+ len
, enc
);
9025 ENUM_ELEM(ary
, rb_str_subseq(str
, i
, n
));
9029 for (i
= 0; i
< len
; i
+= n
) {
9030 n
= rb_enc_mbclen(ptr
+ i
, ptr
+ len
, enc
);
9031 ENUM_ELEM(ary
, rb_str_subseq(str
, i
, n
));
9043 * str.each_char {|cstr| block } -> str
9044 * str.each_char -> an_enumerator
9046 * Passes each character in <i>str</i> to the given block, or returns
9047 * an enumerator if no block is given.
9049 * "hello".each_char {|c| print c, ' ' }
9051 * <em>produces:</em>
9057 rb_str_each_char(VALUE str
)
9059 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_char_size
);
9060 return rb_str_enumerate_chars(str
, 0);
9065 * str.chars -> an_array
9067 * Returns an array of characters in <i>str</i>. This is a shorthand
9068 * for <code>str.each_char.to_a</code>.
9070 * If a block is given, which is a deprecated form, works the same as
9071 * <code>each_char</code>.
9075 rb_str_chars(VALUE str
)
9077 VALUE ary
= WANTARRAY("chars", rb_str_strlen(str
));
9078 return rb_str_enumerate_chars(str
, ary
);
9082 rb_str_enumerate_codepoints(VALUE str
, VALUE ary
)
9087 const char *ptr
, *end
;
9090 if (single_byte_optimizable(str
))
9091 return rb_str_enumerate_bytes(str
, ary
);
9093 str
= rb_str_new_frozen(str
);
9094 ptr
= RSTRING_PTR(str
);
9095 end
= RSTRING_END(str
);
9096 enc
= STR_ENC_GET(str
);
9099 c
= rb_enc_codepoint_len(ptr
, end
, &n
, enc
);
9100 ENUM_ELEM(ary
, UINT2NUM(c
));
9112 * str.each_codepoint {|integer| block } -> str
9113 * str.each_codepoint -> an_enumerator
9115 * Passes the Integer ordinal of each character in <i>str</i>,
9116 * also known as a <i>codepoint</i> when applied to Unicode strings, to the
9117 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9118 * values are directly derived from the binary representation
9119 * of each character.
9121 * If no block is given, an enumerator is returned instead.
9123 * "hello\u0639".each_codepoint {|c| print c, ' ' }
9125 * <em>produces:</em>
9127 * 104 101 108 108 111 1593
9131 rb_str_each_codepoint(VALUE str
)
9133 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_char_size
);
9134 return rb_str_enumerate_codepoints(str
, 0);
9139 * str.codepoints -> an_array
9141 * Returns an array of the Integer ordinals of the
9142 * characters in <i>str</i>. This is a shorthand for
9143 * <code>str.each_codepoint.to_a</code>.
9145 * If a block is given, which is a deprecated form, works the same as
9146 * <code>each_codepoint</code>.
9150 rb_str_codepoints(VALUE str
)
9152 VALUE ary
= WANTARRAY("codepoints", rb_str_strlen(str
));
9153 return rb_str_enumerate_codepoints(str
, ary
);
9157 get_reg_grapheme_cluster(rb_encoding
*enc
)
9159 int encidx
= rb_enc_to_index(enc
);
9160 regex_t
*reg_grapheme_cluster
= NULL
;
9161 static regex_t
*reg_grapheme_cluster_utf8
= NULL
;
9164 if (encidx
== rb_utf8_encindex() && reg_grapheme_cluster_utf8
) {
9165 reg_grapheme_cluster
= reg_grapheme_cluster_utf8
;
9167 if (!reg_grapheme_cluster
) {
9168 const OnigUChar source_ascii
[] = "\\X";
9169 OnigErrorInfo einfo
;
9170 const OnigUChar
*source
= source_ascii
;
9171 size_t source_len
= sizeof(source_ascii
) - 1;
9173 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9174 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9175 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9176 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9177 #define CASE_UTF(e) \
9178 case ENCINDEX_UTF_##e: { \
9179 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9180 source = source_UTF_##e; \
9181 source_len = sizeof(source_UTF_##e); \
9184 CASE_UTF(16BE
); CASE_UTF(16LE
); CASE_UTF(32BE
); CASE_UTF(32LE
);
9191 int r
= onig_new(®_grapheme_cluster
, source
, source
+ source_len
,
9192 ONIG_OPTION_DEFAULT
, enc
, OnigDefaultSyntax
, &einfo
);
9194 UChar message
[ONIG_MAX_ERROR_MESSAGE_LEN
];
9195 onig_error_code_to_str(message
, r
, &einfo
);
9196 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message
);
9198 if (encidx
== rb_utf8_encindex()) {
9199 reg_grapheme_cluster_utf8
= reg_grapheme_cluster
;
9202 return reg_grapheme_cluster
;
9206 rb_str_each_grapheme_cluster_size(VALUE str
, VALUE args
, VALUE eobj
)
9208 size_t grapheme_cluster_count
= 0;
9209 regex_t
*reg_grapheme_cluster
= NULL
;
9210 rb_encoding
*enc
= rb_enc_from_index(ENCODING_GET(str
));
9211 const char *ptr
, *end
;
9213 if (!rb_enc_unicode_p(enc
)) {
9214 return rb_str_length(str
);
9217 reg_grapheme_cluster
= get_reg_grapheme_cluster(enc
);
9218 ptr
= RSTRING_PTR(str
);
9219 end
= RSTRING_END(str
);
9222 OnigPosition len
= onig_match(reg_grapheme_cluster
,
9223 (const OnigUChar
*)ptr
, (const OnigUChar
*)end
,
9224 (const OnigUChar
*)ptr
, NULL
, 0);
9225 if (len
<= 0) break;
9226 grapheme_cluster_count
++;
9230 return SIZET2NUM(grapheme_cluster_count
);
9234 rb_str_enumerate_grapheme_clusters(VALUE str
, VALUE ary
)
9237 regex_t
*reg_grapheme_cluster
= NULL
;
9238 rb_encoding
*enc
= rb_enc_from_index(ENCODING_GET(str
));
9239 const char *ptr0
, *ptr
, *end
;
9241 if (!rb_enc_unicode_p(enc
)) {
9242 return rb_str_enumerate_chars(str
, ary
);
9245 if (!ary
) str
= rb_str_new_frozen(str
);
9246 reg_grapheme_cluster
= get_reg_grapheme_cluster(enc
);
9247 ptr0
= ptr
= RSTRING_PTR(str
);
9248 end
= RSTRING_END(str
);
9251 OnigPosition len
= onig_match(reg_grapheme_cluster
,
9252 (const OnigUChar
*)ptr
, (const OnigUChar
*)end
,
9253 (const OnigUChar
*)ptr
, NULL
, 0);
9254 if (len
<= 0) break;
9255 ENUM_ELEM(ary
, rb_str_subseq(str
, ptr
-ptr0
, len
));
9267 * str.each_grapheme_cluster {|cstr| block } -> str
9268 * str.each_grapheme_cluster -> an_enumerator
9270 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9271 * an enumerator if no block is given.
9272 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9273 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9275 * "a\u0300".each_char.to_a.size #=> 2
9276 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9281 rb_str_each_grapheme_cluster(VALUE str
)
9283 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_grapheme_cluster_size
);
9284 return rb_str_enumerate_grapheme_clusters(str
, 0);
9289 * str.grapheme_clusters -> an_array
9291 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9292 * for <code>str.each_grapheme_cluster.to_a</code>.
9294 * If a block is given, which is a deprecated form, works the same as
9295 * <code>each_grapheme_cluster</code>.
9299 rb_str_grapheme_clusters(VALUE str
)
9301 VALUE ary
= WANTARRAY("grapheme_clusters", rb_str_strlen(str
));
9302 return rb_str_enumerate_grapheme_clusters(str
, ary
);
9306 chopped_length(VALUE str
)
9308 rb_encoding
*enc
= STR_ENC_GET(str
);
9309 const char *p
, *p2
, *beg
, *end
;
9311 beg
= RSTRING_PTR(str
);
9312 end
= beg
+ RSTRING_LEN(str
);
9313 if (beg
>= end
) return 0;
9314 p
= rb_enc_prev_char(beg
, end
, end
, enc
);
9316 if (p
> beg
&& rb_enc_ascget(p
, end
, 0, enc
) == '\n') {
9317 p2
= rb_enc_prev_char(beg
, p
, end
, enc
);
9318 if (p2
&& rb_enc_ascget(p2
, end
, 0, enc
) == '\r') p
= p2
;
9325 * str.chop! -> str or nil
9327 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9328 * <code>nil</code> if <i>str</i> is the empty string. See also
9333 rb_str_chop_bang(VALUE str
)
9335 str_modify_keep_cr(str
);
9336 if (RSTRING_LEN(str
) > 0) {
9338 len
= chopped_length(str
);
9339 STR_SET_LEN(str
, len
);
9340 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
9341 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
9342 ENC_CODERANGE_CLEAR(str
);
9352 * str.chop -> new_str
9354 * Returns a new String with the last character removed. If the
9355 * string ends with <code>\r\n</code>, both characters are
9356 * removed. Applying <code>chop</code> to an empty string returns an
9357 * empty string. String#chomp is often a safer alternative, as it
9358 * leaves the string unchanged if it doesn't end in a record
9361 * "string\r\n".chop #=> "string"
9362 * "string\n\r".chop #=> "string\n"
9363 * "string\n".chop #=> "string"
9364 * "string".chop #=> "strin"
9365 * "x".chop.chop #=> ""
9369 rb_str_chop(VALUE str
)
9371 return rb_str_subseq(str
, 0, chopped_length(str
));
9375 smart_chomp(VALUE str
, const char *e
, const char *p
)
9377 rb_encoding
*enc
= rb_enc_get(str
);
9378 if (rb_enc_mbminlen(enc
) > 1) {
9379 const char *pp
= rb_enc_left_char_head(p
, e
-rb_enc_mbminlen(enc
), e
, enc
);
9380 if (rb_enc_is_newline(pp
, e
, enc
)) {
9383 pp
= e
- rb_enc_mbminlen(enc
);
9385 pp
= rb_enc_left_char_head(p
, pp
, e
, enc
);
9386 if (rb_enc_ascget(pp
, e
, 0, enc
) == '\r') {
9392 switch (*(e
-1)) { /* not e[-1] to get rid of VC bug */
9394 if (--e
> p
&& *(e
-1) == '\r') {
9407 chompped_length(VALUE str
, VALUE rs
)
9411 char *pp
, *e
, *rsptr
;
9413 char *const p
= RSTRING_PTR(str
);
9414 long len
= RSTRING_LEN(str
);
9416 if (len
== 0) return 0;
9418 if (rs
== rb_default_rs
) {
9419 return smart_chomp(str
, e
, p
);
9422 enc
= rb_enc_get(str
);
9423 RSTRING_GETMEM(rs
, rsptr
, rslen
);
9425 if (rb_enc_mbminlen(enc
) > 1) {
9427 pp
= rb_enc_left_char_head(p
, e
-rb_enc_mbminlen(enc
), e
, enc
);
9428 if (!rb_enc_is_newline(pp
, e
, enc
)) break;
9430 pp
-= rb_enc_mbminlen(enc
);
9432 pp
= rb_enc_left_char_head(p
, pp
, e
, enc
);
9433 if (rb_enc_ascget(pp
, e
, 0, enc
) == '\r') {
9440 while (e
> p
&& *(e
-1) == '\n') {
9442 if (e
> p
&& *(e
-1) == '\r')
9448 if (rslen
> len
) return len
;
9450 enc
= rb_enc_get(rs
);
9451 newline
= rsptr
[rslen
-1];
9452 if (rslen
== rb_enc_mbminlen(enc
)) {
9454 if (newline
== '\n')
9455 return smart_chomp(str
, e
, p
);
9458 if (rb_enc_is_newline(rsptr
, rsptr
+rslen
, enc
))
9459 return smart_chomp(str
, e
, p
);
9463 enc
= rb_enc_check(str
, rs
);
9464 if (is_broken_string(rs
)) {
9468 if (p
[len
-1] == newline
&&
9470 memcmp(rsptr
, pp
, rslen
) == 0)) {
9471 if (rb_enc_left_char_head(p
, pp
, e
, enc
) == pp
)
9479 * Returns the separator for arguments of rb_str_chomp.
9481 * @return rb_ps ($/) by default; the default value of rb_ps ($/) is "\n".
9484 chomp_rs(int argc
, const VALUE
*argv
)
9486 rb_check_arity(argc
, 0, 1);
9489 if (!NIL_P(rs
)) StringValue(rs
);
9498 rb_str_chomp_string(VALUE str
, VALUE rs
)
9500 long olen
= RSTRING_LEN(str
);
9501 long len
= chompped_length(str
, rs
);
9502 if (len
>= olen
) return Qnil
;
9503 str_modify_keep_cr(str
);
9504 STR_SET_LEN(str
, len
);
9505 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
9506 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
9507 ENC_CODERANGE_CLEAR(str
);
9514 * str.chomp!(separator=$/) -> str or nil
9516 * Modifies <i>str</i> in place as described for String#chomp,
9517 * returning <i>str</i>, or <code>nil</code> if no modifications were
9522 rb_str_chomp_bang(int argc
, VALUE
*argv
, VALUE str
)
9525 str_modifiable(str
);
9526 if (RSTRING_LEN(str
) == 0) return Qnil
;
9527 rs
= chomp_rs(argc
, argv
);
9528 if (NIL_P(rs
)) return Qnil
;
9529 return rb_str_chomp_string(str
, rs
);
9535 * str.chomp(separator=$/) -> new_str
9537 * Returns a new String with the given record separator removed
9538 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9539 * changed from the default Ruby record separator, then <code>chomp</code> also
9540 * removes carriage return characters (that is, it will remove <code>\n</code>,
9541 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9542 * it will remove all trailing newlines from the string.
9544 * "hello".chomp #=> "hello"
9545 * "hello\n".chomp #=> "hello"
9546 * "hello\r\n".chomp #=> "hello"
9547 * "hello\n\r".chomp #=> "hello\n"
9548 * "hello\r".chomp #=> "hello"
9549 * "hello \n there".chomp #=> "hello \n there"
9550 * "hello".chomp("llo") #=> "he"
9551 * "hello\r\n\r\n".chomp('') #=> "hello"
9552 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9556 rb_str_chomp(int argc
, VALUE
*argv
, VALUE str
)
9558 VALUE rs
= chomp_rs(argc
, argv
);
9559 if (NIL_P(rs
)) return str_duplicate(rb_cString
, str
);
9560 return rb_str_subseq(str
, 0, chompped_length(str
, rs
));
9564 lstrip_offset(VALUE str
, const char *s
, const char *e
, rb_encoding
*enc
)
9566 const char *const start
= s
;
9568 if (!s
|| s
>= e
) return 0;
9570 /* remove spaces at head */
9571 if (single_byte_optimizable(str
)) {
9572 while (s
< e
&& (*s
== '\0' || ascii_isspace(*s
))) s
++;
9577 unsigned int cc
= rb_enc_codepoint_len(s
, e
, &n
, enc
);
9579 if (cc
&& !rb_isspace(cc
)) break;
9588 * str.lstrip! -> self or nil
9590 * Removes leading whitespace from the receiver.
9591 * Returns the altered receiver, or +nil+ if no change was made.
9592 * See also String#rstrip! and String#strip!.
9594 * Refer to String#strip for the definition of whitespace.
9596 * " hello ".lstrip! #=> "hello "
9597 * "hello ".lstrip! #=> nil
9598 * "hello".lstrip! #=> nil
9602 rb_str_lstrip_bang(VALUE str
)
9608 str_modify_keep_cr(str
);
9609 enc
= STR_ENC_GET(str
);
9610 RSTRING_GETMEM(str
, start
, olen
);
9611 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9613 long len
= olen
-loffset
;
9614 s
= start
+ loffset
;
9615 memmove(start
, s
, len
);
9616 STR_SET_LEN(str
, len
);
9617 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9626 * str.lstrip -> new_str
9628 * Returns a copy of the receiver with leading whitespace removed.
9629 * See also String#rstrip and String#strip.
9631 * Refer to String#strip for the definition of whitespace.
9633 * " hello ".lstrip #=> "hello "
9634 * "hello".lstrip #=> "hello"
9638 rb_str_lstrip(VALUE str
)
9642 RSTRING_GETMEM(str
, start
, len
);
9643 loffset
= lstrip_offset(str
, start
, start
+len
, STR_ENC_GET(str
));
9644 if (loffset
<= 0) return str_duplicate(rb_cString
, str
);
9645 return rb_str_subseq(str
, loffset
, len
- loffset
);
9649 rstrip_offset(VALUE str
, const char *s
, const char *e
, rb_encoding
*enc
)
9653 rb_str_check_dummy_enc(enc
);
9654 if (!s
|| s
>= e
) return 0;
9657 /* remove trailing spaces or '\0's */
9658 if (single_byte_optimizable(str
)) {
9660 while (s
< t
&& ((c
= *(t
-1)) == '\0' || ascii_isspace(c
))) t
--;
9665 while ((tp
= rb_enc_prev_char(s
, t
, e
, enc
)) != NULL
) {
9666 unsigned int c
= rb_enc_codepoint(tp
, e
, enc
);
9667 if (c
&& !rb_isspace(c
)) break;
9676 * str.rstrip! -> self or nil
9678 * Removes trailing whitespace from the receiver.
9679 * Returns the altered receiver, or +nil+ if no change was made.
9680 * See also String#lstrip! and String#strip!.
9682 * Refer to String#strip for the definition of whitespace.
9684 * " hello ".rstrip! #=> " hello"
9685 * " hello".rstrip! #=> nil
9686 * "hello".rstrip! #=> nil
9690 rb_str_rstrip_bang(VALUE str
)
9696 str_modify_keep_cr(str
);
9697 enc
= STR_ENC_GET(str
);
9698 RSTRING_GETMEM(str
, start
, olen
);
9699 roffset
= rstrip_offset(str
, start
, start
+olen
, enc
);
9701 long len
= olen
- roffset
;
9703 STR_SET_LEN(str
, len
);
9704 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9713 * str.rstrip -> new_str
9715 * Returns a copy of the receiver with trailing whitespace removed.
9716 * See also String#lstrip and String#strip.
9718 * Refer to String#strip for the definition of whitespace.
9720 * " hello ".rstrip #=> " hello"
9721 * "hello".rstrip #=> "hello"
9725 rb_str_rstrip(VALUE str
)
9731 enc
= STR_ENC_GET(str
);
9732 RSTRING_GETMEM(str
, start
, olen
);
9733 roffset
= rstrip_offset(str
, start
, start
+olen
, enc
);
9735 if (roffset
<= 0) return str_duplicate(rb_cString
, str
);
9736 return rb_str_subseq(str
, 0, olen
-roffset
);
9742 * str.strip! -> self or nil
9744 * Removes leading and trailing whitespace from the receiver.
9745 * Returns the altered receiver, or +nil+ if there was no change.
9747 * Refer to String#strip for the definition of whitespace.
9749 * " hello ".strip! #=> "hello"
9750 * "hello".strip! #=> nil
9754 rb_str_strip_bang(VALUE str
)
9757 long olen
, loffset
, roffset
;
9760 str_modify_keep_cr(str
);
9761 enc
= STR_ENC_GET(str
);
9762 RSTRING_GETMEM(str
, start
, olen
);
9763 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9764 roffset
= rstrip_offset(str
, start
+loffset
, start
+olen
, enc
);
9766 if (loffset
> 0 || roffset
> 0) {
9767 long len
= olen
-roffset
;
9770 memmove(start
, start
+ loffset
, len
);
9772 STR_SET_LEN(str
, len
);
9773 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9782 * str.strip -> new_str
9784 * Returns a copy of the receiver with leading and trailing whitespace removed.
9786 * Whitespace is defined as any of the following characters:
9787 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9789 * " hello ".strip #=> "hello"
9790 * "\tgoodbye\r\n".strip #=> "goodbye"
9791 * "\x00\t\n\v\f\r ".strip #=> ""
9792 * "hello".strip #=> "hello"
9796 rb_str_strip(VALUE str
)
9799 long olen
, loffset
, roffset
;
9800 rb_encoding
*enc
= STR_ENC_GET(str
);
9802 RSTRING_GETMEM(str
, start
, olen
);
9803 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9804 roffset
= rstrip_offset(str
, start
+loffset
, start
+olen
, enc
);
9806 if (loffset
<= 0 && roffset
<= 0) return str_duplicate(rb_cString
, str
);
9807 return rb_str_subseq(str
, loffset
, olen
-loffset
-roffset
);
9811 scan_once(VALUE str
, VALUE pat
, long *start
, int set_backref_str
)
9813 VALUE result
, match
;
9814 struct re_registers
*regs
;
9816 long end
, pos
= rb_pat_search(pat
, str
, *start
, set_backref_str
);
9818 if (BUILTIN_TYPE(pat
) == T_STRING
) {
9820 end
= pos
+ RSTRING_LEN(pat
);
9823 match
= rb_backref_get();
9824 regs
= RMATCH_REGS(match
);
9829 rb_encoding
*enc
= STR_ENC_GET(str
);
9831 * Always consume at least one character of the input string
9833 if (RSTRING_LEN(str
) > end
)
9834 *start
= end
+ rb_enc_fast_mbclen(RSTRING_PTR(str
) + end
,
9835 RSTRING_END(str
), enc
);
9842 if (!regs
|| regs
->num_regs
== 1) {
9843 result
= rb_str_subseq(str
, pos
, end
- pos
);
9846 result
= rb_ary_new2(regs
->num_regs
);
9847 for (i
=1; i
< regs
->num_regs
; i
++) {
9850 s
= rb_str_subseq(str
, BEG(i
), END(i
)-BEG(i
));
9852 rb_ary_push(result
, s
);
9863 * str.scan(pattern) -> array
9864 * str.scan(pattern) {|match, ...| block } -> str
9866 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9867 * Regexp or a String). For each match, a result is
9868 * generated and either added to the result array or passed to the block. If
9869 * the pattern contains no groups, each individual result consists of the
9870 * matched string, <code>$&</code>. If the pattern contains groups, each
9871 * individual result is itself an array containing one entry per group.
9874 * a.scan(/\w+/) #=> ["cruel", "world"]
9875 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9876 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9877 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9879 * And the block form:
9881 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9883 * a.scan(/(.)(.)/) {|x,y| print y, x }
9886 * <em>produces:</em>
9888 * <<cruel>> <<world>>
9893 rb_str_scan(VALUE str
, VALUE pat
)
9897 long last
= -1, prev
= 0;
9898 char *p
= RSTRING_PTR(str
); long len
= RSTRING_LEN(str
);
9900 pat
= get_pat_quoted(pat
, 1);
9901 mustnot_broken(str
);
9902 if (!rb_block_given_p()) {
9903 VALUE ary
= rb_ary_new();
9905 while (!NIL_P(result
= scan_once(str
, pat
, &start
, 0))) {
9908 rb_ary_push(ary
, result
);
9910 if (last
>= 0) rb_pat_search(pat
, str
, last
, 1);
9911 else rb_backref_set(Qnil
);
9915 while (!NIL_P(result
= scan_once(str
, pat
, &start
, 1))) {
9919 str_mod_check(str
, p
, len
);
9921 if (last
>= 0) rb_pat_search(pat
, str
, last
, 1);
9928 * str.hex -> integer
9930 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9931 * (with an optional sign and an optional <code>0x</code>) and returns the
9932 * corresponding number. Zero is returned on error.
9935 * "-1234".hex #=> -4660
9937 * "wombat".hex #=> 0
9941 rb_str_hex(VALUE str
)
9943 return rb_str_to_inum(str
, 16, FALSE
);
9949 * str.oct -> integer
9951 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9952 * optional sign) and returns the corresponding number. Returns 0 if the
9956 * "-377".oct #=> -255
9958 * "0377bad".oct #=> 255
9960 * If +str+ starts with <code>0</code>, radix indicators are honored.
9961 * See Kernel#Integer.
9965 rb_str_oct(VALUE str
)
9967 return rb_str_to_inum(str
, -8, FALSE
);
9970 #ifndef HAVE_CRYPT_R
9971 # include "ruby/thread_native.h"
9972 # include "ruby/atomic.h"
9975 rb_atomic_t initialized
;
9976 rb_nativethread_lock_t lock
;
9980 crypt_mutex_destroy(void)
9982 RUBY_ASSERT_ALWAYS(crypt_mutex
.initialized
== 1);
9983 rb_nativethread_lock_destroy(&crypt_mutex
.lock
);
9984 crypt_mutex
.initialized
= 0;
9988 crypt_mutex_initialize(void)
9991 while ((i
= RUBY_ATOMIC_CAS(crypt_mutex
.initialized
, 0, 2)) == 2);
9994 rb_nativethread_lock_initialize(&crypt_mutex
.lock
);
9995 atexit(crypt_mutex_destroy
);
9996 RUBY_ASSERT(crypt_mutex
.initialized
== 2);
9997 RUBY_ATOMIC_CAS(crypt_mutex
.initialized
, 2, 1);
10002 rb_bug("crypt_mutex.initialized: %d->%d", i
, crypt_mutex
.initialized
);
10009 * str.crypt(salt_str) -> new_str
10011 * Returns the string generated by calling <code>crypt(3)</code>
10012 * standard library function with <code>str</code> and
10013 * <code>salt_str</code>, in this order, as its arguments. Please do
10014 * not use this method any longer. It is legacy; provided only for
10015 * backward compatibility with ruby scripts in earlier days. It is
10016 * bad to use in contemporary programs for several reasons:
10018 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10019 * run on. The generated string lacks data portability.
10021 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10022 * (i.e. silently ends up in unexpected results).
10024 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10027 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10028 * very very weak. According to its manpage, Linux's traditional
10029 * <code>crypt(3)</code> output has only 2**56 variations; too
10030 * easy to brute force today. And this is the default behaviour.
10032 * * In order to make things robust some OSes implement so-called
10033 * "modular" usage. To go through, you have to do a complex
10034 * build-up of the <code>salt_str</code> parameter, by hand.
10035 * Failure in generation of a proper salt string tends not to
10036 * yield any errors; typos in parameters are normally not
10039 * * For instance, in the following example, the second invocation
10040 * of String#crypt is wrong; it has a typo in "round=" (lacks
10041 * "s"). However the call does not fail and something unexpected
10044 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10045 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10047 * * Even in the "modular" mode, some hash functions are considered
10048 * archaic and no longer recommended at all; for instance module
10049 * <code>$1$</code> is officially abandoned by its author: see
10050 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10051 * instance module <code>$3$</code> is considered completely
10052 * broken: see the manpage of FreeBSD.
10054 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
10055 * written above, <code>crypt(3)</code> on Mac OS never fails.
10056 * This means even if you build up a proper salt string it
10057 * generates a traditional DES hash anyways, and there is no way
10058 * for you to be aware of.
10060 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10062 * If for some reason you cannot migrate to other secure contemporary
10063 * password hashing algorithms, install the string-crypt gem and
10064 * <code>require 'string/crypt'</code> to continue using it.
10068 rb_str_crypt(VALUE str
, VALUE salt
)
10070 #ifdef HAVE_CRYPT_R
10072 struct crypt_data
*data
;
10073 # define CRYPT_END() ALLOCV_END(databuf)
10075 extern char *crypt(const char *, const char *);
10076 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10079 const char *s
, *saltp
;
10081 #ifdef BROKEN_CRYPT
10082 char salt_8bit_clean
[3];
10086 mustnot_wchar(str
);
10087 mustnot_wchar(salt
);
10088 s
= StringValueCStr(str
);
10089 saltp
= RSTRING_PTR(salt
);
10090 if (RSTRING_LEN(salt
) < 2 || !saltp
[0] || !saltp
[1]) {
10091 rb_raise(rb_eArgError
, "salt too short (need >=2 bytes)");
10094 #ifdef BROKEN_CRYPT
10095 if (!ISASCII((unsigned char)saltp
[0]) || !ISASCII((unsigned char)saltp
[1])) {
10096 salt_8bit_clean
[0] = saltp
[0] & 0x7f;
10097 salt_8bit_clean
[1] = saltp
[1] & 0x7f;
10098 salt_8bit_clean
[2] = '\0';
10099 saltp
= salt_8bit_clean
;
10102 #ifdef HAVE_CRYPT_R
10103 data
= ALLOCV(databuf
, sizeof(struct crypt_data
));
10104 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10105 data
->initialized
= 0;
10107 res
= crypt_r(s
, saltp
, data
);
10109 crypt_mutex_initialize();
10110 rb_nativethread_lock_lock(&crypt_mutex
.lock
);
10111 res
= crypt(s
, saltp
);
10116 rb_syserr_fail(err
, "crypt");
10118 result
= rb_str_new_cstr(res
);
10126 * str.ord -> integer
10128 * Returns the Integer ordinal of a one-character string.
10134 rb_str_ord(VALUE s
)
10138 c
= rb_enc_codepoint(RSTRING_PTR(s
), RSTRING_END(s
), STR_ENC_GET(s
));
10139 return UINT2NUM(c
);
10143 * str.sum(n=16) -> integer
10145 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10146 * where <em>n</em> is the optional Integer parameter, defaulting
10147 * to 16. The result is simply the sum of the binary value of each byte in
10148 * <i>str</i> modulo <code>2**n</code>. This is not a particularly good
10153 rb_str_sum(int argc
, VALUE
*argv
, VALUE str
)
10156 char *ptr
, *p
, *pend
;
10158 VALUE sum
= INT2FIX(0);
10159 unsigned long sum0
= 0;
10161 if (rb_check_arity(argc
, 0, 1) && (bits
= NUM2INT(argv
[0])) < 0) {
10164 ptr
= p
= RSTRING_PTR(str
);
10165 len
= RSTRING_LEN(str
);
10169 if (FIXNUM_MAX
- UCHAR_MAX
< sum0
) {
10170 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10171 str_mod_check(str
, ptr
, len
);
10174 sum0
+= (unsigned char)*p
;
10180 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10184 if (sum
== INT2FIX(0)) {
10185 if (bits
< (int)sizeof(long)*CHAR_BIT
) {
10186 sum0
&= (((unsigned long)1)<<bits
)-1;
10188 sum
= LONG2FIX(sum0
);
10194 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10197 mod
= rb_funcall(INT2FIX(1), idLTLT
, 1, INT2FIX(bits
));
10198 mod
= rb_funcall(mod
, '-', 1, INT2FIX(1));
10199 sum
= rb_funcall(sum
, '&', 1, mod
);
10206 rb_str_justify(int argc
, VALUE
*argv
, VALUE str
, char jflag
)
10210 long width
, len
, flen
= 1, fclen
= 1;
10213 const char *f
= " ";
10214 long n
, size
, llen
, rlen
, llen2
= 0, rlen2
= 0;
10216 int singlebyte
= 1, cr
;
10219 rb_scan_args(argc
, argv
, "11", &w
, &pad
);
10220 enc
= STR_ENC_GET(str
);
10221 termlen
= rb_enc_mbminlen(enc
);
10222 width
= NUM2LONG(w
);
10225 enc
= rb_enc_check(str
, pad
);
10226 f
= RSTRING_PTR(pad
);
10227 flen
= RSTRING_LEN(pad
);
10228 fclen
= str_strlen(pad
, enc
); /* rb_enc_check */
10229 singlebyte
= single_byte_optimizable(pad
);
10230 if (flen
== 0 || fclen
== 0) {
10231 rb_raise(rb_eArgError
, "zero width padding");
10234 len
= str_strlen(str
, enc
); /* rb_enc_check */
10235 if (width
< 0 || len
>= width
) return str_duplicate(rb_cString
, str
);
10237 llen
= (jflag
== 'l') ? 0 : ((jflag
== 'r') ? n
: n
/2);
10239 cr
= ENC_CODERANGE(str
);
10241 llen2
= str_offset(f
, f
+ flen
, llen
% fclen
, enc
, singlebyte
);
10242 rlen2
= str_offset(f
, f
+ flen
, rlen
% fclen
, enc
, singlebyte
);
10244 size
= RSTRING_LEN(str
);
10245 if ((len
= llen
/ fclen
+ rlen
/ fclen
) >= LONG_MAX
/ flen
||
10246 (len
*= flen
) >= LONG_MAX
- llen2
- rlen2
||
10247 (len
+= llen2
+ rlen2
) >= LONG_MAX
- size
) {
10248 rb_raise(rb_eArgError
, "argument too big");
10251 res
= str_new0(rb_cString
, 0, len
, termlen
);
10252 p
= RSTRING_PTR(res
);
10254 memset(p
, *f
, llen
);
10258 while (llen
>= fclen
) {
10264 memcpy(p
, f
, llen2
);
10268 memcpy(p
, RSTRING_PTR(str
), size
);
10271 memset(p
, *f
, rlen
);
10275 while (rlen
>= fclen
) {
10281 memcpy(p
, f
, rlen2
);
10285 TERM_FILL(p
, termlen
);
10286 STR_SET_LEN(res
, p
-RSTRING_PTR(res
));
10287 rb_enc_associate(res
, enc
);
10289 cr
= ENC_CODERANGE_AND(cr
, ENC_CODERANGE(pad
));
10290 if (cr
!= ENC_CODERANGE_BROKEN
)
10291 ENC_CODERANGE_SET(res
, cr
);
10300 * str.ljust(integer, padstr=' ') -> new_str
10302 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10303 * String of length <i>integer</i> with <i>str</i> left justified
10304 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10306 * "hello".ljust(4) #=> "hello"
10307 * "hello".ljust(20) #=> "hello               "
10308 * "hello".ljust(20, '1234') #=> "hello123412341234123"
/* String#ljust entry point: forwards its arguments unchanged to
 * rb_str_justify with jflag 'l'. */
10312 rb_str_ljust(int argc
, VALUE
*argv
, VALUE str
)
10314 return rb_str_justify(argc
, argv
, str
, 'l');
10320 * str.rjust(integer, padstr=' ') -> new_str
10322 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10323 * String of length <i>integer</i> with <i>str</i> right justified
10324 * and padded with <i>padstr</i>; otherwise, returns a copy of <i>str</i>.
10326 * "hello".rjust(4) #=> "hello"
10327 * "hello".rjust(20) #=> " hello"
10328 * "hello".rjust(20, '1234') #=> "123412341234123hello"
/* String#rjust entry point: forwards its arguments unchanged to
 * rb_str_justify with jflag 'r'. */
10332 rb_str_rjust(int argc
, VALUE
*argv
, VALUE str
)
10334 return rb_str_justify(argc
, argv
, str
, 'r');
10340 * str.center(width, padstr=' ') -> new_str
10342 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10343 * returns a new String of length +width+ with +str+ centered and padded with
10344 * +padstr+; otherwise, returns a copy of +str+.
10346 * "hello".center(4) #=> "hello"
10347 * "hello".center(20) #=> " hello "
10348 * "hello".center(20, '123') #=> "1231231hello12312312"
/* String#center entry point: forwards its arguments unchanged to
 * rb_str_justify with jflag 'c'. */
10352 rb_str_center(int argc
, VALUE
*argv
, VALUE str
)
10354 return rb_str_justify(argc
, argv
, str
, 'c');
10359 * str.partition(sep) -> [head, sep, tail]
10360 * str.partition(regexp) -> [head, match, tail]
10362 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10363 * and returns the part before it, the match, and the part
10365 * If it is not found, returns <i>str</i> followed by two empty strings.
10367 * "hello".partition("l") #=> ["he", "l", "lo"]
10368 * "hello".partition("x") #=> ["hello", "", ""]
10369 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
/* String#partition: splits str around the first occurrence of sep and
 * returns a 3-element array. sep may end up as a Regexp or a String after
 * get_pat_quoted. */
10373 rb_str_partition(VALUE str
, VALUE sep
)
/* NOTE(review): get_pat_quoted is defined elsewhere; presumably it coerces
 * sep to a String or Regexp -- confirm. */
10377 sep
= get_pat_quoted(sep
, 0);
10378 if (RB_TYPE_P(sep
, T_REGEXP
)) {
/* Regexp separator: search from offset 0; a negative result means no match. */
10379 if (rb_reg_search(sep
, str
, 0, 0) < 0) {
/* The match registers of the last match give the matched byte range. */
10382 VALUE match
= rb_backref_get();
10383 struct re_registers
*regs
= RMATCH_REGS(match
);
/* From here on sep is the matched text itself, taken from pos up to END(0). */
10386 sep
= rb_str_subseq(str
, pos
, END(0) - pos
);
/* String separator: look it up starting at offset 0. */
10389 pos
= rb_str_index(str
, sep
, 0);
10390 if (pos
< 0) goto failed
;
/* Found: the first element is str[0, pos]; the last element starts right
 * after the separator and runs to the end of str. */
10392 return rb_ary_new3(3, rb_str_subseq(str
, 0, pos
),
10394 rb_str_subseq(str
, pos
+RSTRING_LEN(sep
),
10395 RSTRING_LEN(str
)-pos
-RSTRING_LEN(sep
)));
/* Not found: a duplicate of str followed by two empty strings. */
10398 return rb_ary_new3(3, str_duplicate(rb_cString
, str
), str_new_empty_String(str
), str_new_empty_String(str
));
10403 * str.rpartition(sep) -> [head, sep, tail]
10404 * str.rpartition(regexp) -> [head, match, tail]
10406 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10407 * of the string, and returns the part before it, the match, and the part
10409 * If it is not found, returns two empty strings and <i>str</i>.
10411 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10412 * "hello".rpartition("x") #=> ["", "", "hello"]
10413 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10415 * The match from the end means starting at the possible last position, not
10416 * the last of longest matches.
10418 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10420 * To partition at the last longest match, combine the pattern with a
10421 * negative lookbehind.
10423 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10425 * Or use String#partition with a negative lookahead.
10427 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
/* String#rpartition: splits str around a match of sep located by searching
 * from the end of str, and returns a 3-element array. */
10431 rb_str_rpartition(VALUE str
, VALUE sep
)
/* Start from the byte length of str, i.e. its end. */
10433 long pos
= RSTRING_LEN(str
);
/* NOTE(review): get_pat_quoted is defined elsewhere; presumably it coerces
 * sep to a String or Regexp -- confirm. */
10435 sep
= get_pat_quoted(sep
, 0);
10436 if (RB_TYPE_P(sep
, T_REGEXP
)) {
/* NOTE(review): the last argument (1) presumably requests a backward
 * search starting at pos -- confirm against rb_reg_search. */
10437 if (rb_reg_search(sep
, str
, pos
, 1) < 0) {
10440 VALUE match
= rb_backref_get();
10441 struct re_registers
*regs
= RMATCH_REGS(match
);
/* From here on sep is the matched text itself, taken from pos up to END(0). */
10444 sep
= rb_str_subseq(str
, pos
, END(0) - pos
);
/* String separator: pos is passed through rb_str_sublen before
 * rb_str_rindex and through rb_str_offset afterwards.
 * NOTE(review): presumably byte offset -> character index -> byte offset;
 * confirm against those helpers. */
10447 pos
= rb_str_sublen(str
, pos
);
10448 pos
= rb_str_rindex(str
, sep
, pos
);
10452 pos
= rb_str_offset(str
, pos
);
/* Found: the first element is str[0, pos]; the last element starts right
 * after the separator and runs to the end of str. */
10455 return rb_ary_new3(3, rb_str_subseq(str
, 0, pos
),
10457 rb_str_subseq(str
, pos
+RSTRING_LEN(sep
),
10458 RSTRING_LEN(str
)-pos
-RSTRING_LEN(sep
)));
/* Not found: two empty strings followed by a duplicate of str. */
10460 return rb_ary_new3(3, str_new_empty_String(str
), str_new_empty_String(str
), str_duplicate(rb_cString
, str
));
10465 * str.start_with?([prefixes]+) -> true or false
10467 * Returns true if +str+ starts with one of the +prefixes+ given.
10468 * Each of the +prefixes+ should be a String or a Regexp.
10470 * "hello".start_with?("hell") #=> true
10471 * "hello".start_with?(/H/i) #=> true
10473 * # returns true if one of the prefixes matches.
10474 * "hello".start_with?("heaven", "hell") #=> true
10475 * "hello".start_with?("heaven", "paradise") #=> false
/* String#start_with?: tests each of the argc arguments in argv against the
 * beginning of str. */
10479 rb_str_start_with(int argc
, VALUE
*argv
, VALUE str
)
10483 for (i
=0; i
<argc
; i
++) {
10484 VALUE tmp
= argv
[i
];
/* Regexp prefixes are delegated to rb_reg_start_with_p. */
10485 if (RB_TYPE_P(tmp
, T_REGEXP
)) {
10486 if (rb_reg_start_with_p(tmp
, str
))
/* Other prefixes: run the encoding check, skip candidates longer than str,
 * then compare the leading RSTRING_LEN(tmp) bytes. */
10491 rb_enc_check(str
, tmp
);
10492 if (RSTRING_LEN(str
) < RSTRING_LEN(tmp
)) continue;
10493 if (memcmp(RSTRING_PTR(str
), RSTRING_PTR(tmp
), RSTRING_LEN(tmp
)) == 0)
10502 * str.end_with?([suffixes]+) -> true or false
10504 * Returns true if +str+ ends with one of the +suffixes+ given.
10506 * "hello".end_with?("ello") #=> true
10508 * # returns true if one of the +suffixes+ matches.
10509 * "hello".end_with?("heaven", "ello") #=> true
10510 * "hello".end_with?("heaven", "paradise") #=> false
/* String#end_with?: tests each of the argc arguments in argv against the
 * end of str. */
10514 rb_str_end_with(int argc
, VALUE
*argv
, VALUE str
)
10520 for (i
=0; i
<argc
; i
++) {
10521 VALUE tmp
= argv
[i
];
10524 enc
= rb_enc_check(str
, tmp
);
/* An empty suffix matches immediately. */
10525 if ((tlen
= RSTRING_LEN(tmp
)) == 0) return Qtrue
;
/* A suffix longer than str cannot match; try the next candidate. */
10526 if ((slen
= RSTRING_LEN(str
)) < tlen
) continue;
10527 p
= RSTRING_PTR(str
);
/* NOTE(review): presumably rejects a candidate whose start position s falls
 * inside a multibyte character -- confirm against rb_enc_left_char_head. */
10530 if (rb_enc_left_char_head(p
, s
, e
, enc
) != s
)
/* Byte-wise comparison of the tail of str with the suffix. */
10532 if (memcmp(s
, RSTRING_PTR(tmp
), RSTRING_LEN(tmp
)) == 0)
10539 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10540 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10542 * @param str the target
10543 * @param prefix the prefix
10544 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10545 * @retval Positive-Integer otherwise
/* Helper for delete_prefix / delete_prefix!: returns 0 on every visible
 * "no match" path below. */
10548 deleted_prefix_length(VALUE str
, VALUE prefix
)
10550 char *strptr
, *prefixptr
;
10551 long olen
, prefixlen
;
10553 StringValue(prefix
);
/* A prefix flagged by is_broken_string never matches. */
10554 if (is_broken_string(prefix
)) return 0;
10555 rb_enc_check(str
, prefix
);
10557 /* return 0 if str does not start with prefix */
10558 prefixlen
= RSTRING_LEN(prefix
);
/* An empty prefix deletes nothing. */
10559 if (prefixlen
<= 0) return 0;
10560 olen
= RSTRING_LEN(str
);
/* A prefix longer than str cannot match. */
10561 if (olen
< prefixlen
) return 0;
10562 strptr
= RSTRING_PTR(str
);
10563 prefixptr
= RSTRING_PTR(prefix
);
/* Byte-wise comparison of the first prefixlen bytes. */
10564 if (memcmp(strptr
, prefixptr
, prefixlen
) != 0) return 0;
10571 * str.delete_prefix!(prefix) -> self or nil
10573 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10574 * <code>nil</code> if no change was made.
10576 * "hello".delete_prefix!("hel") #=> "lo"
10577 * "hello".delete_prefix!("llo") #=> nil
/* String#delete_prefix!: removes a leading prefix from str in place. */
10581 rb_str_delete_prefix_bang(VALUE str
, VALUE prefix
)
/* str_modify_keep_cr is called before the prefix is even checked. */
10584 str_modify_keep_cr(str
);
10586 prefixlen
= deleted_prefix_length(str
, prefix
);
/* Nothing to delete: report "no change" with nil. */
10587 if (prefixlen
<= 0) return Qnil
;
/* Otherwise drop the first prefixlen bytes and return that call's result. */
10589 return rb_str_drop_bytes(str
, prefixlen
);
10594 * str.delete_prefix(prefix) -> new_str
10596 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10598 * "hello".delete_prefix("hel") #=> "lo"
10599 * "hello".delete_prefix("llo") #=> "hello"
/* String#delete_prefix: non-destructive variant; str itself is not changed
 * by the calls visible here. */
10603 rb_str_delete_prefix(VALUE str
, VALUE prefix
)
10607 prefixlen
= deleted_prefix_length(str
, prefix
);
/* No matching prefix: return a duplicate of str. */
10608 if (prefixlen
<= 0) return str_duplicate(rb_cString
, str
);
/* Otherwise return the bytes that follow the prefix. */
10610 return rb_str_subseq(str
, prefixlen
, RSTRING_LEN(str
) - prefixlen
);
10614 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10615 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10617 * @param str the target
10618 * @param suffix the suffix
10619 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10620 * @retval Positive-Integer otherwise
/* Helper for delete_suffix / delete_suffix!: returns 0 on every visible
 * "no match" path below. */
10623 deleted_suffix_length(VALUE str
, VALUE suffix
)
10625 char *strptr
, *suffixptr
, *s
;
10626 long olen
, suffixlen
;
10629 StringValue(suffix
);
/* A suffix flagged by is_broken_string never matches. */
10630 if (is_broken_string(suffix
)) return 0;
10631 enc
= rb_enc_check(str
, suffix
);
10633 /* return 0 if str does not end with suffix */
10634 suffixlen
= RSTRING_LEN(suffix
);
/* An empty suffix deletes nothing. */
10635 if (suffixlen
<= 0) return 0;
10636 olen
= RSTRING_LEN(str
);
/* A suffix longer than str cannot match. */
10637 if (olen
< suffixlen
) return 0;
10638 strptr
= RSTRING_PTR(str
);
10639 suffixptr
= RSTRING_PTR(suffix
);
/* s is where the suffix would have to begin inside str. */
10640 s
= strptr
+ olen
- suffixlen
;
/* Byte-wise comparison of the last suffixlen bytes. */
10641 if (memcmp(s
, suffixptr
, suffixlen
) != 0) return 0;
/* NOTE(review): presumably rejects a byte match that begins in the middle
 * of a multibyte character -- confirm against rb_enc_left_char_head. */
10642 if (rb_enc_left_char_head(strptr
, s
, strptr
+ olen
, enc
) != s
) return 0;
10649 * str.delete_suffix!(suffix) -> self or nil
10651 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10652 * <code>nil</code> if no change was made.
10654 * "hello".delete_suffix!("llo") #=> "he"
10655 * "hello".delete_suffix!("hel") #=> nil
/* String#delete_suffix!: removes a trailing suffix from str in place. */
10659 rb_str_delete_suffix_bang(VALUE str
, VALUE suffix
)
10661 long olen
, suffixlen
, len
;
/* str_modifiable is called up front, before the suffix is checked. */
10662 str_modifiable(str
);
10664 suffixlen
= deleted_suffix_length(str
, suffix
);
/* Nothing to delete: report "no change" with nil. */
10665 if (suffixlen
<= 0) return Qnil
;
10667 olen
= RSTRING_LEN(str
);
/* str_modify_keep_cr is only called once a suffix is known to match. */
10668 str_modify_keep_cr(str
);
/* Truncate: set the shorter length and rewrite the terminator there. */
10669 len
= olen
- suffixlen
;
10670 STR_SET_LEN(str
, len
);
10671 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
/* The cached coderange is cleared unless it is 7BIT. */
10672 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
10673 ENC_CODERANGE_CLEAR(str
);
10680 * str.delete_suffix(suffix) -> new_str
10682 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10684 * "hello".delete_suffix("llo") #=> "he"
10685 * "hello".delete_suffix("hel") #=> "hello"
/* String#delete_suffix: non-destructive variant; str itself is not changed
 * by the calls visible here. */
10689 rb_str_delete_suffix(VALUE str
, VALUE suffix
)
10693 suffixlen
= deleted_suffix_length(str
, suffix
);
/* No matching suffix: return a duplicate of str. */
10694 if (suffixlen
<= 0) return str_duplicate(rb_cString
, str
);
/* Otherwise return the leading bytes that precede the suffix. */
10696 return rb_str_subseq(str
, 0, RSTRING_LEN(str
) - suffixlen
);
/* Variable setter taking (val, id, var): only nil or a String is accepted. */
10700 rb_str_setter(VALUE val
, ID id
, VALUE
*var
)
/* Any non-nil, non-String value raises TypeError naming the variable. */
10702 if (!NIL_P(val
) && !RB_TYPE_P(val
, T_STRING
)) {
10703 rb_raise(rb_eTypeError
, "value of %"PRIsVALUE
" must be String", rb_id2str(id
));
/* Variable setter taking (val, id, var). The value is first passed through
 * rb_fs_check. NOTE(review): judging by the messages below, this is the
 * setter for the field-separator variable and accepts String or Regexp --
 * confirm the exact conditions. */
10709 rb_fs_setter(VALUE val
, ID id
, VALUE
*var
)
10711 val
= rb_fs_check(val
);
10713 rb_raise(rb_eTypeError
,
10714 "value of %"PRIsVALUE
" must be String or Regexp",
/* Emits a deprecation warning for the variable. */
10718 rb_warn_deprecated("`$;'", NULL
);
10726 * str.force_encoding(encoding) -> str
10728 * Changes the encoding to +encoding+ and returns self.
/* String#force_encoding: re-labels str with another encoding. */
10732 rb_str_force_encoding(VALUE str
, VALUE enc
)
/* Check that str may be modified before touching it. */
10734 str_modifiable(str
);
/* enc is resolved with rb_to_encoding and associated with str. */
10735 rb_enc_associate(str
, rb_to_encoding(enc
));
/* The cached coderange is cleared after the encoding change. */
10736 ENC_CODERANGE_CLEAR(str
);
10744 * Returns a copied string whose encoding is ASCII-8BIT.
/* String#b: builds a new string str2 from str. */
10748 rb_str_b(VALUE str
)
/* str2 is heap-allocated when str has STR_NOEMBED set ... */
10751 if (FL_TEST(str
, STR_NOEMBED
)) {
10752 str2
= str_alloc_heap(rb_cString
);
/* ... otherwise it is allocated embedded, sized for str's embedded length
 * plus its terminator length. */
10755 str2
= str_alloc_embed(rb_cString
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
/* NOTE(review): presumably takes over str's contents without copying its
 * encoding -- confirm against str_replace_shared_without_enc. */
10757 str_replace_shared_without_enc(str2
, str
);
10758 ENC_CODERANGE_CLEAR(str2
);
10764 * str.valid_encoding? -> true or false
10766 * Returns true for a string which is encoded correctly.
10768 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10769 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10770 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
/* String#valid_encoding?: true unless the coderange reported by
 * rb_enc_str_coderange is BROKEN. */
10774 rb_str_valid_encoding_p(VALUE str
)
10776 int cr
= rb_enc_str_coderange(str
);
10778 return RBOOL(cr
!= ENC_CODERANGE_BROKEN
);
10783 * str.ascii_only? -> true or false
10785 * Returns true for a string which has only ASCII characters.
10787 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10788 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
/* String#ascii_only?: true exactly when the coderange reported by
 * rb_enc_str_coderange is 7BIT. */
10792 rb_str_is_ascii_only_p(VALUE str
)
10794 int cr
= rb_enc_str_coderange(str
);
10796 return RBOOL(cr
== ENC_CODERANGE_7BIT
);
/* Shortens str to a budget of len characters, using "..." as the ellipsis.
 * Raises IndexError when len is negative. */
10800 rb_str_ellipsize(VALUE str
, long len
)
10802 static const char ellipsis
[] = "...";
/* Length of the ellipsis without its terminating NUL. */
10803 const long ellipsislen
= sizeof(ellipsis
) - 1;
10804 rb_encoding
*const enc
= rb_enc_get(str
);
10805 const long blen
= RSTRING_LEN(str
);
10806 const char *const p
= RSTRING_PTR(str
), *e
= p
+ blen
;
10807 VALUE estr
, ret
= 0;
10809 if (len
< 0) rb_raise(rb_eIndexError
, "negative length %ld", len
);
/* Case 1: even at the minimum bytes-per-character len covers the whole
 * byte length, or the position of the len-th character is the end of str. */
10810 if (len
* rb_enc_mbminlen(enc
) >= blen
||
10811 (e
= rb_enc_nth(p
, e
, len
, enc
)) - p
== blen
) {
/* Case 2: the budget is no larger than the ellipsis, or stepping back
 * ellipsislen characters from e fails. The result is built from the first
 * len bytes of the ellipsis alone. Note that len is overwritten with
 * ellipsislen when the second operand is evaluated. */
10814 else if (len
<= ellipsislen
||
10815 !(e
= rb_enc_step_back(p
, e
, e
, len
= ellipsislen
, enc
))) {
/* ASCII-compatible encodings can use the literal bytes directly ... */
10816 if (rb_enc_asciicompat(enc
)) {
10817 ret
= rb_str_new(ellipsis
, len
);
10818 rb_enc_associate(ret
, enc
);
/* ... other encodings get a US-ASCII string transcoded into enc. */
10821 estr
= rb_usascii_str_new(ellipsis
, len
);
10822 ret
= rb_str_encode(estr
, rb_enc_from_encoding(enc
), 0, Qnil
);
/* Case 3: keep the bytes of str up to e and append the ellipsis. The comma
 * expression assigns ret first; the branch tests only ASCII compatibility. */
10825 else if (ret
= rb_str_subseq(str
, 0, e
- p
), rb_enc_asciicompat(enc
)) {
10826 rb_str_cat(ret
, ellipsis
, ellipsislen
);
/* Non-ASCII-compatible encoding: append a transcoded ellipsis instead. */
10829 estr
= rb_str_encode(rb_usascii_str_new(ellipsis
, ellipsislen
),
10830 rb_enc_from_encoding(enc
), 0, Qnil
);
10831 rb_str_append(ret
, estr
);
/* Validates a replacement string against the target encoding enc (used by
 * the scrub code that follows). */
10837 str_compat_and_valid(VALUE str
, rb_encoding
*enc
)
10840 str
= StringValue(str
);
10841 cr
= rb_enc_str_coderange(str
);
/* A replacement with a BROKEN coderange is rejected with ArgumentError. */
10842 if (cr
== ENC_CODERANGE_BROKEN
) {
10843 rb_raise(rb_eArgError
, "replacement must be valid byte sequence '%+"PRIsVALUE
"'", str
);
10846 rb_encoding
*e
= STR_ENC_GET(str
);
/* Compatibility rule: a 7BIT replacement needs enc to have a minimum
 * character length of 1; any other replacement must use exactly enc. */
10847 if (cr
== ENC_CODERANGE_7BIT
? rb_enc_mbminlen(enc
) != 1 : enc
!= e
) {
10848 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
10849 rb_enc_name(enc
), rb_enc_name(e
));
/* Forward declaration of the scrub worker taking an explicit encoding and
 * coderange. */
10855 static VALUE
enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
, int cr
);
/* Scrubs str using its own encoding (STR_ENC_GET) and its cached coderange,
 * returning whatever enc_str_scrub returns. */
10858 rb_str_scrub(VALUE str
, VALUE repl
)
10860 rb_encoding
*enc
= STR_ENC_GET(str
);
10861 return enc_str_scrub(enc
, str
, repl
, ENC_CODERANGE(str
));
/* Scrubs str as if it were encoded in enc, which may differ from the
 * encoding str actually carries. */
10865 rb_enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
)
/* Default to UNKNOWN so a coderange cached for another encoding is not
 * trusted. */
10867 int cr
= ENC_CODERANGE_UNKNOWN
;
10868 if (enc
== STR_ENC_GET(str
)) {
10869 /* cached coderange makes sense only when enc equals the
10870 * actual encoding of str */
10871 cr
= ENC_CODERANGE(str
);
10873 return enc_str_scrub(enc
, str
, repl
, cr
);
/* Scrub worker: scans str as enc and builds buf, copying valid runs and
 * substituting either a replacement byte sequence (rep/replen) or the value
 * returned by the block for each invalid run. cr is the caller's view of
 * the coderange of str. */
10877 enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
, int cr
)
10881 const char *rep
, *p
, *e
, *p1
, *sp
;
/* A block and an explicit replacement are mutually exclusive. */
10885 if (rb_block_given_p()) {
10887 rb_raise(rb_eArgError
, "both of block and replacement given");
/* NOTE(review): presumably an early exit when the coderange is already
 * known to be clean -- confirm. */
10891 if (ENC_CODERANGE_CLEAN_P(cr
))
/* A given replacement must itself be valid and compatible with enc. */
10894 if (!NIL_P(repl
)) {
10895 repl
= str_compat_and_valid(repl
, enc
);
10898 if (rb_enc_dummy_p(enc
)) {
10901 encidx
= rb_enc_to_index(enc
);
/* Sets rep/replen to a static copy of the literal, without its NUL. */
10903 #define DEFAULT_REPLACE_CHAR(str) do { \
10904 static const char replace[sizeof(str)-1] = str; \
10905 rep = replace; replen = (int)sizeof(replace); \
10908 slen
= RSTRING_LEN(str
);
10909 p
= RSTRING_PTR(str
);
10910 e
= RSTRING_END(str
);
/* ---- ASCII-compatible encodings ---- */
10914 if (rb_enc_asciicompat(enc
)) {
/* Replacement selection: explicit repl, else the UTF-8 bytes EF BF BD for
 * UTF-8, else "?". */
10920 else if (!NIL_P(repl
)) {
10921 rep
= RSTRING_PTR(repl
);
10922 replen
= RSTRING_LEN(repl
);
10923 rep7bit_p
= (ENC_CODERANGE(repl
) == ENC_CODERANGE_7BIT
);
10925 else if (encidx
== rb_utf8_encindex()) {
10926 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10930 DEFAULT_REPLACE_CHAR("?");
/* The result coderange starts at 7BIT and is raised to VALID below when a
 * multibyte character or a non-7bit replacement is emitted. */
10933 cr
= ENC_CODERANGE_7BIT
;
10935 p
= search_nonascii(p
, e
);
10940 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
10941 if (MBCLEN_NEEDMORE_P(ret
)) {
10944 else if (MBCLEN_CHARFOUND_P(ret
)) {
10945 cr
= ENC_CODERANGE_VALID
;
10946 p
+= MBCLEN_CHARFOUND_LEN(ret
);
10948 else if (MBCLEN_INVALID_P(ret
)) {
10950 * p1~p: valid ascii/multibyte chars
10951 * p ~e: invalid bytes + unknown bytes
10953 long clen
= rb_enc_mbmaxlen(enc
);
/* buf is created lazily, on the first invalid sequence. */
10954 if (NIL_P(buf
)) buf
= rb_str_buf_new(RSTRING_LEN(str
));
10956 rb_str_buf_cat(buf
, p1
, p
- p1
);
/* Never look past the end of the string. */
10959 if (e
- p
< clen
) clen
= e
- p
;
/* Shrink clen while probing how many bytes belong to the invalid run. */
10966 for (; clen
> 1; clen
--) {
10967 ret
= rb_enc_precise_mbclen(q
, q
+ clen
, enc
);
10968 if (MBCLEN_NEEDMORE_P(ret
)) break;
10969 if (MBCLEN_INVALID_P(ret
)) continue;
10974 rb_str_buf_cat(buf
, rep
, replen
);
10975 if (!rep7bit_p
) cr
= ENC_CODERANGE_VALID
;
/* Block form: yield the invalid bytes, verify str was not modified by the
 * block (str_mod_check), then validate and append the block's result. */
10978 repl
= rb_yield(rb_enc_str_new(p
, clen
, enc
));
10979 str_mod_check(str
, sp
, slen
);
10980 repl
= str_compat_and_valid(repl
, enc
);
10981 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
10982 if (ENC_CODERANGE(repl
) == ENC_CODERANGE_VALID
)
10983 cr
= ENC_CODERANGE_VALID
;
10987 p
= search_nonascii(p
, e
);
10999 ENC_CODERANGE_SET(str
, cr
);
11002 buf
= rb_str_buf_new(RSTRING_LEN(str
));
11005 rb_str_buf_cat(buf
, p1
, p
- p1
);
11009 rb_str_buf_cat(buf
, rep
, replen
);
11010 if (!rep7bit_p
) cr
= ENC_CODERANGE_VALID
;
/* Same block protocol as above, applied to the remaining bytes p..e. */
11013 repl
= rb_yield(rb_enc_str_new(p
, e
-p
, enc
));
11014 str_mod_check(str
, sp
, slen
);
11015 repl
= str_compat_and_valid(repl
, enc
);
11016 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11017 if (ENC_CODERANGE(repl
) == ENC_CODERANGE_VALID
)
11018 cr
= ENC_CODERANGE_VALID
;
11023 /* ASCII incompatible */
11024 long mbminlen
= rb_enc_mbminlen(enc
);
/* Replacement selection: explicit repl, else a per-encoding byte sequence
 * for UTF-16BE/LE and UTF-32BE/LE, else "?". */
11028 else if (!NIL_P(repl
)) {
11029 rep
= RSTRING_PTR(repl
);
11030 replen
= RSTRING_LEN(repl
);
11032 else if (encidx
== ENCINDEX_UTF_16BE
) {
11033 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11035 else if (encidx
== ENCINDEX_UTF_16LE
) {
11036 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11038 else if (encidx
== ENCINDEX_UTF_32BE
) {
11039 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11041 else if (encidx
== ENCINDEX_UTF_32LE
) {
11042 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11045 DEFAULT_REPLACE_CHAR("?");
11049 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
11050 if (MBCLEN_NEEDMORE_P(ret
)) {
11053 else if (MBCLEN_CHARFOUND_P(ret
)) {
11054 p
+= MBCLEN_CHARFOUND_LEN(ret
);
11056 else if (MBCLEN_INVALID_P(ret
)) {
11058 long clen
= rb_enc_mbmaxlen(enc
);
/* buf is created lazily; the pending valid run p1..p is flushed first. */
11059 if (NIL_P(buf
)) buf
= rb_str_buf_new(RSTRING_LEN(str
));
11060 if (p
> p1
) rb_str_buf_cat(buf
, p1
, p
- p1
);
11062 if (e
- p
< clen
) clen
= e
- p
;
11063 if (clen
<= mbminlen
* 2) {
/* Here the probe shrinks in steps of mbminlen rather than single bytes. */
11068 for (; clen
> mbminlen
; clen
-=mbminlen
) {
11069 ret
= rb_enc_precise_mbclen(q
, q
+ clen
, enc
);
11070 if (MBCLEN_NEEDMORE_P(ret
)) break;
11071 if (MBCLEN_INVALID_P(ret
)) continue;
11076 rb_str_buf_cat(buf
, rep
, replen
);
11079 repl
= rb_yield(rb_enc_str_new(p
, clen
, enc
));
11080 str_mod_check(str
, sp
, slen
);
11081 repl
= str_compat_and_valid(repl
, enc
);
11082 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11093 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
11096 buf
= rb_str_buf_new(RSTRING_LEN(str
));
11099 rb_str_buf_cat(buf
, p1
, p
- p1
);
11103 rb_str_buf_cat(buf
, rep
, replen
);
11106 repl
= rb_yield(rb_enc_str_new(p
, e
-p
, enc
));
11107 str_mod_check(str
, sp
, slen
);
11108 repl
= str_compat_and_valid(repl
, enc
);
11109 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11112 cr
= ENC_CODERANGE_VALID
;
/* Stamp the result buffer with enc and the coderange computed above. */
11114 ENCODING_CODERANGE_SET(buf
, rb_enc_to_index(enc
), cr
);
11120 * str.scrub -> new_str
11121 * str.scrub(repl) -> new_str
11122 * str.scrub{|bytes|} -> new_str
11124 * If the string contains an invalid byte sequence, replaces the invalid bytes with the given
11125 * replacement character; otherwise returns a copy of the string.
11126 * If block is given, replace invalid bytes with returned value of the block.
11128 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11129 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11130 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
/* String#scrub: non-destructive entry point taking at most one argument. */
11133 str_scrub(int argc
, VALUE
*argv
, VALUE str
)
/* With arguments, check the arity (0..1) and use argv[0] as the
 * replacement; without arguments the replacement is nil. */
11135 VALUE repl
= argc
? (rb_check_arity(argc
, 0, 1), argv
[0]) : Qnil
;
11136 VALUE
new = rb_str_scrub(str
, repl
);
/* A nil result from rb_str_scrub is turned into a duplicate of str. */
11137 return NIL_P(new) ? str_duplicate(rb_cString
, str
): new;
11142 * str.scrub! -> str
11143 * str.scrub!(repl) -> str
11144 * str.scrub!{|bytes|} -> str
11146 * If the string contains an invalid byte sequence, replaces the invalid bytes in place with the
11147 * given replacement character. Returns self.
11148 * If block is given, replace invalid bytes with returned value of the block.
11150 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11151 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11152 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
/* String#scrub!: destructive entry point taking at most one argument. */
11155 str_scrub_bang(int argc
, VALUE
*argv
, VALUE str
)
/* With arguments, check the arity (0..1) and use argv[0] as the
 * replacement; without arguments the replacement is nil. */
11157 VALUE repl
= argc
? (rb_check_arity(argc
, 0, 1), argv
[0]) : Qnil
;
11158 VALUE
new = rb_str_scrub(str
, repl
);
/* str is only overwritten when rb_str_scrub produced a non-nil result. */
11159 if (!NIL_P(new)) rb_str_replace(str
, new);
/* Method IDs and receiver used by the Unicode normalization wrappers:
 * unicode_normalize_common calls one of these IDs on mUnicodeNormalize. */
11163 static ID id_normalize
;
11164 static ID id_normalized_p
;
11165 static VALUE mUnicodeNormalize
;
/* Shared implementation of the Unicode normalization methods: calls method
 * id on mUnicodeNormalize with the arguments collected in argv2. */
11168 unicode_normalize_common(int argc
, VALUE
*argv
, VALUE str
, ID id
)
/* The Ruby-level implementation is required lazily; the static flag keeps
 * rb_require from being called again once it has been set. */
11170 static int UnicodeNormalizeRequired
= 0;
11173 if (!UnicodeNormalizeRequired
) {
11174 rb_require("unicode_normalize/normalize.rb");
11175 UnicodeNormalizeRequired
= 1;
/* NOTE(review): rb_check_arity presumably returns argc, so argv[0] (the
 * optional form) is forwarded only when one was given -- confirm. */
11178 if (rb_check_arity(argc
, 0, 1)) argv2
[1] = argv
[0];
/* argc+1 arguments are passed from argv2. */
11179 return rb_funcallv(mUnicodeNormalize
, id
, argc
+1, argv2
);
11184 * str.unicode_normalize(form=:nfc)
11186 * Unicode Normalization---Returns a normalized form of +str+,
11187 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11188 * The normalization form used is determined by +form+, which can
11189 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11190 * The default is +:nfc+.
11192 * If the string is not in a Unicode Encoding, then an Exception is raised.
11193 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11194 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11195 * Anything other than UTF-8 is implemented by converting to UTF-8,
11196 * which makes it slower than UTF-8.
11198 * "a\u0300".unicode_normalize #=> "\u00E0"
11199 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11200 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11201 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11202 * #=> Encoding::CompatibilityError raised
/* String#unicode_normalize: returns the result of unicode_normalize_common
 * called with id_normalize. */
11205 rb_str_unicode_normalize(int argc
, VALUE
*argv
, VALUE str
)
11207 return unicode_normalize_common(argc
, argv
, str
, id_normalize
);
11212 * str.unicode_normalize!(form=:nfc)
11214 * Destructive version of String#unicode_normalize, doing Unicode
11215 * normalization in place.
/* String#unicode_normalize!: computes the normalized string with
 * id_normalize and replaces the contents of str with it. */
11218 rb_str_unicode_normalize_bang(int argc
, VALUE
*argv
, VALUE str
)
11220 return rb_str_replace(str
, unicode_normalize_common(argc
, argv
, str
, id_normalize
));
11224 * str.unicode_normalized?(form=:nfc)
11226 * Checks whether +str+ is in Unicode normalization form +form+,
11227 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11228 * The default is +:nfc+.
11230 * If the string is not in a Unicode Encoding, then an Exception is raised.
11231 * For details, see String#unicode_normalize.
11233 * "a\u0300".unicode_normalized? #=> false
11234 * "a\u0300".unicode_normalized?(:nfd) #=> true
11235 * "\u00E0".unicode_normalized? #=> true
11236 * "\u00E0".unicode_normalized?(:nfd) #=> false
11237 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11238 * #=> Encoding::CompatibilityError raised
/* String#unicode_normalized?: returns the result of
 * unicode_normalize_common called with id_normalized_p. */
11241 rb_str_unicode_normalized_p(int argc
, VALUE
*argv
, VALUE str
)
11243 return unicode_normalize_common(argc
, argv
, str
, id_normalized_p
);
11246 /**********************************************************************
11247 * Document-class: Symbol
11249 * Symbol objects represent named identifiers inside the Ruby interpreter.
11251 * You can create a \Symbol object explicitly with:
11253 * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11255 * The same Symbol object will be
11256 * created for a given name or string for the duration of a program's
11257 * execution, regardless of the context or meaning of that name. Thus
11258 * if <code>Fred</code> is a constant in one context, a method in
11259 * another, and a class in a third, the Symbol <code>:Fred</code>
11260 * will be the same object in all three contexts.
11274 * $f1.object_id #=> 2514190
11275 * $f2.object_id #=> 2514190
11276 * $f3.object_id #=> 2514190
11278 * Constant, method, and variable names are returned as symbols:
11291 * One.instance_methods(true)
11293 * One.instance_variables
11295 * One.class_variables
11297 * global_variables.grep(/six/)
11302 * Symbol objects are different from String objects in that
11303 * Symbol objects represent identifiers, while String objects
11304 * represent text or data.
11308 * First, what's elsewhere. \Class \Symbol:
11310 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11311 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11313 * Here, class \Symbol provides methods that are useful for:
11315 * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11316 * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11317 * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11319 * === Methods for Querying
11321 * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11322 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11323 * in symbol that matches a given Regexp
11324 * or other object; returns +nil+ if no match is found.
11325 * - #[], #slice :: Returns a substring of symbol
11326 * determined by a given index, start/length, or range, or string.
11327 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11328 * - #encoding:: Returns the Encoding object that represents the encoding
11330 * - #end_with?:: Returns +true+ if symbol ends with
11331 * any of the given strings.
11332 * - #match:: Returns a MatchData object if symbol
11333 * matches a given Regexp; +nil+ otherwise.
11334 * - #match?:: Returns +true+ if symbol
11335 * matches a given Regexp; +false+ otherwise.
11336 * - #length, #size:: Returns the number of characters in symbol.
11337 * - #start_with?:: Returns +true+ if symbol starts with
11338 * any of the given strings.
11340 * === Methods for Comparing
11342 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as symbol is smaller than, equal to, or larger than a given symbol.
11343 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11344 * has the same content and encoding.
11345 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as symbol
11346 * is smaller than, equal to, or larger than a given symbol.
11347 * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11348 * after Unicode case folding; +false+ otherwise.
11350 * === Methods for Converting
11352 * - #capitalize:: Returns symbol with the first character upcased
11353 * and all other characters downcased.
11354 * - #downcase:: Returns symbol with all characters downcased.
11355 * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11356 * - #name:: Returns the frozen string corresponding to symbol.
11357 * - #succ, #next:: Returns the symbol that is the successor to symbol.
11358 * - #swapcase:: Returns symbol with all upcase characters downcased
11359 * and all downcase characters upcased.
11360 * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11361 * - #to_s, #id2name:: Returns the string corresponding to +self+.
11362 * - #to_sym, #intern:: Returns +self+.
11363 * - #upcase:: Returns symbol with all characters upcased.
11370 * sym == obj -> true or false
11372 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11373 * symbol, returns <code>true</code>.
/* sym_equal is an alias for rb_obj_equal. */
11376 #define sym_equal rb_obj_equal
/* Checks the characters between s and send (encoded in enc); returns FALSE
 * as soon as one of the two tests below fails. */
11379 sym_printable(const char *s
, const char *send
, rb_encoding
*enc
)
11383 int c
= rb_enc_precise_mbclen(s
, send
, enc
);
/* Anything that is not a complete, valid character is not printable. */
11385 if (!MBCLEN_CHARFOUND_P(c
)) return FALSE
;
11386 n
= MBCLEN_CHARFOUND_LEN(c
);
/* c is reused: from here on it holds the code point, not the length info. */
11387 c
= rb_enc_mbc_to_codepoint(s
, send
, enc
);
11388 if (!rb_enc_isprint(c
, enc
)) return FALSE
;
/* Tests whether the string sym passes a series of checks on its encoding,
 * contents and printability (see the condition below). */
11395 rb_str_symname_p(VALUE sym
)
/* The reference encoding is the default internal encoding, falling back to
 * the default external encoding when no internal one is set. */
11400 rb_encoding
*resenc
= rb_default_internal_encoding();
11402 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
11403 enc
= STR_ENC_GET(sym
);
11404 ptr
= RSTRING_PTR(sym
);
11405 len
= RSTRING_LEN(sym
);
/* The branch is taken when any of these holds:
 *  - the encoding differs from resenc and the string is not ASCII-only;
 *  - strlen(ptr) differs from len (an embedded NUL byte);
 *  - rb_enc_symname2_p rejects the bytes;
 *  - sym_printable rejects a character. */
11406 if ((resenc
!= enc
&& !rb_str_is_ascii_only_p(sym
)) || len
!= (long)strlen(ptr
) ||
11407 !rb_enc_symname2_p(ptr
, len
, enc
) || !sym_printable(ptr
, ptr
+ len
, enc
)) {
/* Returns rb_str_escape(str) when str fails the encoding or printability
 * check below. str must be a T_STRING (Check_Type). */
11414 rb_str_quote_unprintable(VALUE str
)
11419 rb_encoding
*resenc
;
11421 Check_Type(str
, T_STRING
);
/* The reference encoding is the default internal encoding, falling back to
 * the default external encoding when no internal one is set. */
11422 resenc
= rb_default_internal_encoding();
11423 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
11424 enc
= STR_ENC_GET(str
);
11425 ptr
= RSTRING_PTR(str
);
11426 len
= RSTRING_LEN(str
);
/* Escape when the encoding differs from resenc and the string is not
 * ASCII-only, or when sym_printable rejects a character. */
11427 if ((resenc
!= enc
&& !rb_str_is_ascii_only_p(str
)) ||
11428 !sym_printable(ptr
, ptr
+ len
, enc
)) {
11429 return rb_str_escape(str
);
/* Looks up the name string of id and returns its escaped form when
 * rb_str_symname_p rejects it. */
11434 MJIT_FUNC_EXPORTED VALUE
11435 rb_id_quote_unprintable(ID id
)
11437 VALUE str
= rb_id2str(id
);
11438 if (!rb_str_symname_p(str
)) {
11439 return rb_str_escape(str
);
11446 * sym.inspect -> string
11448 * Returns the representation of <i>sym</i> as a symbol literal.
11450 * :fred.inspect #=> ":fred"
/* Symbol#inspect: builds a string one byte longer than the (possibly
 * inspected) name, with the name placed at offset 1.
 * NOTE(review): the first byte is presumably filled with ':' -- confirm. */
11454 sym_inspect(VALUE sym
)
11456 VALUE str
= rb_sym2str(sym
);
/* Names rejected by rb_str_symname_p: take the String#inspect form, grow
 * it by one byte and shift the contents right by one. */
11461 if (!rb_str_symname_p(str
)) {
11462 str
= rb_str_inspect(str
);
11463 len
= RSTRING_LEN(str
);
11464 rb_str_resize(str
, len
+ 1);
11465 dest
= RSTRING_PTR(str
);
/* memmove, not memcpy: source and destination overlap. */
11466 memmove(dest
+ 1, dest
, len
);
/* Otherwise: allocate a new string of len + 1 bytes in the same encoding
 * and copy the name to offset 1. */
11469 rb_encoding
*enc
= STR_ENC_GET(str
);
11470 RSTRING_GETMEM(str
, ptr
, len
);
11471 str
= rb_enc_str_new(0, len
+ 1, enc
);
11472 dest
= RSTRING_PTR(str
);
11473 memcpy(dest
+ 1, ptr
, len
);
11479 #if 0 /* for RDoc */
11482 * sym.name -> string
11484 * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11485 * returned string is frozen.
11487 * :fred.name #=> "fred"
11488 * :fred.name.frozen? #=> true
11489 * :fred.to_s #=> "fred"
11490 * :fred.to_s.frozen? #=> false
11493 rb_sym2str(VALUE sym
)
11502 * sym.id2name -> string
11503 * sym.to_s -> string
11505 * Returns the name or string corresponding to <i>sym</i>.
11507 * :fred.id2name #=> "fred"
11508 * :ginger.to_s #=> "ginger"
11510 * Note that this string is not frozen (unlike the symbol itself).
11511 * To get a frozen string, use #name.
/* Symbol#to_s / #id2name: returns a String of class rb_cString built by
 * str_new_shared from the symbol's name string. */
11516 rb_sym_to_s(VALUE sym
)
11518 return str_new_shared(rb_cString
, rb_sym2str(sym
));
11524 * sym.to_sym -> sym
11525 * sym.intern -> sym
11527 * In general, <code>to_sym</code> returns the Symbol corresponding
11528 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11533 sym_to_sym(VALUE sym
)
/* Calls method mid on obj with the arguments after the first one,
 * forwarding passed_proc as the block and kw_splat as the keyword flag.
 * NOTE(review): obj is presumably argv[0], and the ArgumentError is
 * presumably raised when argc is 0 -- confirm. */
11538 MJIT_FUNC_EXPORTED VALUE
11539 rb_sym_proc_call(ID mid
, int argc
, const VALUE
*argv
, int kw_splat
, VALUE passed_proc
)
11544 rb_raise(rb_eArgError
, "no receiver given");
/* The first argument is consumed, hence argc - 1 and argv + 1. */
11547 return rb_funcall_with_block_kw(obj
, mid
, argc
- 1, argv
+ 1, passed_proc
, kw_splat
);
11555 * Returns a _Proc_ object which calls the method named by _sym_ on its first argument.
11557 * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
11561 rb_sym_to_proc(VALUE sym
)
11571 * Same as <code>sym.to_s.succ.intern</code>.
/* Symbol#succ: applies rb_str_succ to the symbol's name string and interns
 * the result. */
11575 sym_succ(VALUE sym
)
11577 return rb_str_intern(rb_str_succ(rb_sym2str(sym
)));
11583 * symbol <=> other_symbol -> -1, 0, +1, or nil
11585 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11586 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11587 * less than, equal to, or greater than +other_symbol+.
11589 * +nil+ is returned if the two values are incomparable.
11591 * See String#<=> for more information.
/* Symbol#<=>: non-Symbol arguments take the branch below; two symbols are
 * compared through rb_str_cmp_m on their name strings. */
11595 sym_cmp(VALUE sym
, VALUE other
)
11597 if (!SYMBOL_P(other
)) {
11600 return rb_str_cmp_m(rb_sym2str(sym
), rb_sym2str(other
));
11605 * casecmp(other_symbol) -> -1, 0, 1, or nil
11607 * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11609 * :aBcDeF.casecmp(:abcde) # => 1
11610 * :aBcDeF.casecmp(:abcdef) # => 0
11611 * :aBcDeF.casecmp(:abcdefg) # => -1
11612 * :abcdef.casecmp(:ABCDEF) # => 0
11614 * Returns +nil+ if the two symbols have incompatible encodings,
11615 * or if +other_symbol+ is not a symbol:
11617 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11618 * other_sym = :"\u{c4 d6 dc}"
11619 * sym.casecmp(other_sym) # => nil
11620 * :foo.casecmp(2) # => nil
11622 * Currently, case-insensitivity only works on characters A-Z/a-z,
11623 * not all of Unicode. This is different from Symbol#casecmp?.
11625 * Related: Symbol#casecmp?.
/* Symbol#casecmp: non-Symbol arguments take the branch below; two symbols
 * are compared through str_casecmp on their name strings. */
11630 sym_casecmp(VALUE sym
, VALUE other
)
11632 if (!SYMBOL_P(other
)) {
11635 return str_casecmp(rb_sym2str(sym
), rb_sym2str(other
));
11640 * casecmp?(other_symbol) -> true, false, or nil
11642 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11643 * Unicode case folding, +false+ if they are not equal:
11645 * :aBcDeF.casecmp?(:abcde) # => false
11646 * :aBcDeF.casecmp?(:abcdef) # => true
11647 * :aBcDeF.casecmp?(:abcdefg) # => false
11648 * :abcdef.casecmp?(:ABCDEF) # => true
11649 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11651 * Returns +nil+ if the two symbols have incompatible encodings,
11652 * or if +other_symbol+ is not a symbol:
11654 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11655 * other_sym = :"\u{c4 d6 dc}"
11656 * sym.casecmp?(other_sym) # => nil
11657 * :foo.casecmp?(2) # => nil
11659 * See {Case Mapping}[doc/case_mapping_rdoc.html].
11661 * Related: Symbol#casecmp.
/* Case-folding equality test for two symbols.  When other is a Symbol,
 * both operands are converted with rb_sym2str and passed to
 * str_casecmp_p.  The body of the !SYMBOL_P(other) branch is not present
 * in this listing; the doc above says nil is returned when other is not
 * a symbol. */
11666 sym_casecmp_p(VALUE sym
, VALUE other
)
11668 if (!SYMBOL_P(other
)) {
11671 return str_casecmp_p(rb_sym2str(sym
), rb_sym2str(other
));
11676 * sym =~ obj -> integer or nil
11678 * Returns <code>sym.to_s =~ obj</code>.
/* Symbol form of =~: converts sym with rb_sym2str and delegates to
 * rb_str_match with other passed through unchanged. */
11682 sym_match(VALUE sym
, VALUE other
)
11684 return rb_str_match(rb_sym2str(sym
), other
);
11689 * sym.match(pattern) -> matchdata or nil
11690 * sym.match(pattern, pos) -> matchdata or nil
11692 * Returns <code>sym.to_s.match</code>.
/* Symbol form of #match: forwards argc/argv untouched to rb_str_match_m,
 * with the receiver replaced by the symbol's string (rb_sym2str). */
11696 sym_match_m(int argc
, VALUE
*argv
, VALUE sym
)
11698 return rb_str_match_m(argc
, argv
, rb_sym2str(sym
));
11703 * sym.match?(pattern) -> true or false
11704 * sym.match?(pattern, pos) -> true or false
11706 * Returns <code>sym.to_s.match?</code>.
/* Symbol form of #match?: forwards argc/argv to rb_str_match_m_p.
 * NOTE(review): unlike sym_match_m and the other wrappers here, the
 * receiver is passed as the Symbol itself rather than rb_sym2str(sym).
 * Confirm that rb_str_match_m_p (or what it calls) accepts a Symbol. */
11710 sym_match_m_p(int argc
, VALUE
*argv
, VALUE sym
)
11712 return rb_str_match_m_p(argc
, argv
, sym
);
11718 * sym[b, n] -> string
11719 * sym.slice(idx) -> char
11720 * sym.slice(b, n) -> string
11722 * Returns <code>sym.to_s[]</code>.
/* Symbol form of #[] / #slice: forwards argc/argv to rb_str_aref_m with
 * the symbol's string (rb_sym2str) as the receiver. */
11726 sym_aref(int argc
, VALUE
*argv
, VALUE sym
)
11728 return rb_str_aref_m(argc
, argv
, rb_sym2str(sym
));
11733 * sym.length -> integer
11734 * sym.size -> integer
11736 * Same as <code>sym.to_s.length</code>.
/* Length of a symbol: rb_str_length applied to the symbol's string. */
11740 sym_length(VALUE sym
)
11742 return rb_str_length(rb_sym2str(sym
));
11747 * sym.empty? -> true or false
11749 * Returns whether _sym_ is :"" or not.
/* Emptiness test for a symbol: rb_str_empty applied to the symbol's
 * string. */
11753 sym_empty(VALUE sym
)
11755 return rb_str_empty(rb_sym2str(sym
));
11760 * upcase(*options) -> symbol
11762 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11764 * See String#upcase.
/* Symbol#upcase: passes the option arguments (argc/argv) and the
 * symbol's string to rb_str_upcase, then interns the resulting string
 * with rb_str_intern. */
11769 sym_upcase(int argc
, VALUE
*argv
, VALUE sym
)
11771 return rb_str_intern(rb_str_upcase(argc
, argv
, rb_sym2str(sym
)));
11776 * downcase(*options) -> symbol
11778 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11780 * See String#downcase.
11782 * Related: Symbol#upcase.
/* Symbol#downcase: passes the option arguments (argc/argv) and the
 * symbol's string to rb_str_downcase, then interns the resulting string
 * with rb_str_intern. */
11787 sym_downcase(int argc
, VALUE
*argv
, VALUE sym
)
11789 return rb_str_intern(rb_str_downcase(argc
, argv
, rb_sym2str(sym
)));
11794 * capitalize(*options) -> symbol
11796 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11798 * See String#capitalize.
/* Symbol#capitalize: passes the option arguments (argc/argv) and the
 * symbol's string to rb_str_capitalize, then interns the resulting
 * string with rb_str_intern. */
11803 sym_capitalize(int argc
, VALUE
*argv
, VALUE sym
)
11805 return rb_str_intern(rb_str_capitalize(argc
, argv
, rb_sym2str(sym
)));
11810 * swapcase(*options) -> symbol
11812 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11814 * See String#swapcase.
/* Symbol#swapcase: passes the option arguments (argc/argv) and the
 * symbol's string to rb_str_swapcase, then interns the resulting string
 * with rb_str_intern. */
11819 sym_swapcase(int argc
, VALUE
*argv
, VALUE sym
)
11821 return rb_str_intern(rb_str_swapcase(argc
, argv
, rb_sym2str(sym
)));
11826 * sym.start_with?([prefixes]+) -> true or false
11828 * Returns true if +sym+ starts with one of the +prefixes+ given.
11829 * Each of the +prefixes+ should be a String or a Regexp.
11831 * :hello.start_with?("hell") #=> true
11832 * :hello.start_with?(/H/i) #=> true
11834 * # returns true if one of the prefixes matches.
11835 * :hello.start_with?("heaven", "hell") #=> true
11836 * :hello.start_with?("heaven", "paradise") #=> false
/* Symbol#start_with?: forwards the prefix arguments (argc/argv) to
 * rb_str_start_with with the symbol's string as the receiver. */
11840 sym_start_with(int argc
, VALUE
*argv
, VALUE sym
)
11842 return rb_str_start_with(argc
, argv
, rb_sym2str(sym
));
11847 * sym.end_with?([suffixes]+) -> true or false
11849 * Returns true if +sym+ ends with one of the +suffixes+ given.
11851 * :hello.end_with?("ello") #=> true
11853 * # returns true if one of the +suffixes+ matches.
11854 * :hello.end_with?("heaven", "ello") #=> true
11855 * :hello.end_with?("heaven", "paradise") #=> false
/* Symbol#end_with?: forwards the suffix arguments (argc/argv) to
 * rb_str_end_with with the symbol's string as the receiver. */
11859 sym_end_with(int argc
, VALUE
*argv
, VALUE sym
)
11861 return rb_str_end_with(argc
, argv
, rb_sym2str(sym
));
11866 * sym.encoding -> encoding
11868 * Returns the Encoding object that represents the encoding of _sym_.
/* Encoding of a symbol: rb_obj_encoding applied to the symbol's string
 * (rb_sym2str). */
11872 sym_encoding(VALUE sym
)
11874 return rb_obj_encoding(rb_sym2str(sym
));
/* Helper used by rb_to_id and rb_to_symbol to obtain a String from name.
 * A value that is not already T_STRING is run through
 * rb_check_string_type; this path can raise TypeError with the message
 * "<name> is not a symbol".
 * NOTE(review): the condition guarding rb_raise, its remaining arguments
 * and the rest of the function are not present in this listing.
 * Presumably the raise happens only when the conversion yields nil and the
 * converted string is returned otherwise; confirm against the full
 * source. */
11878 string_for_symbol(VALUE name
)
11880 if (!RB_TYPE_P(name
, T_STRING
)) {
11881 VALUE tmp
= rb_check_string_type(name
);
11883 rb_raise(rb_eTypeError
, "%+"PRIsVALUE
" is not a symbol",
/* Converts name to an ID.  A Symbol is unwrapped directly with SYM2ID;
 * anything else is first normalised by string_for_symbol and then
 * interned with rb_intern_str. */
11892 rb_to_id(VALUE name
)
11894 if (SYMBOL_P(name
)) {
11895 return SYM2ID(name
);
11897 name
= string_for_symbol(name
);
11898 return rb_intern_str(name
);
/* Converts name to a Symbol.  Non-Symbol input is normalised by
 * string_for_symbol and interned with rb_str_intern.
 * NOTE(review): the body of the SYMBOL_P(name) branch is not present in
 * this listing; presumably it returns name unchanged.  Confirm. */
11902 rb_to_symbol(VALUE name
)
11904 if (SYMBOL_P(name
)) {
11907 name
= string_for_symbol(name
);
11908 return rb_str_intern(name
);
11913 * Symbol.all_symbols => array
11915 * Returns an array of all the symbols currently in Ruby's symbol
11918 * Symbol.all_symbols.size #=> 903
11919 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11920 * :chown, :EOFError, :$;, :String,
11921 * :LOCK_SH, :"setuid?", :$<,
11922 * :default_proc, :compact, :extend,
11923 * :Tms, :getwd, :$=, :ThreadGroup,
/* Symbol.all_symbols: the receiver argument is unused (named `_`); the
 * result comes straight from rb_sym_all_symbols(). */
11928 sym_all_symbols(VALUE _
)
11930 return rb_sym_all_symbols();
/* Thin wrapper: returns whatever rb_fstring(str) returns. */
11934 rb_str_to_interned_str(VALUE str
)
11936 return rb_fstring(str
);
/* Registers an fstring for the len bytes at ptr.  A local struct RString
 * is initialised by setup_fake_str to describe ptr/len with encoding
 * index ENCINDEX_US_ASCII, and that temporary is handed to
 * register_fstring.
 * NOTE(review): the meaning of the TRUE flag is defined by
 * register_fstring, which is not visible here. */
11940 rb_interned_str(const char *ptr
, long len
)
11942 struct RString fake_str
;
11943 return register_fstring(setup_fake_str(&fake_str
, ptr
, len
, ENCINDEX_US_ASCII
), TRUE
);
/* C-string convenience form of rb_interned_str: the length is taken with
 * strlen, so ptr must be NUL-terminated. */
11947 rb_interned_str_cstr(const char *ptr
)
11949 return rb_interned_str(ptr
, strlen(ptr
));
/* Encoding-aware variant of rb_interned_str.  If rb_enc_autoload_p(enc)
 * reports true, rb_enc_autoload(enc) is called first.  A local struct
 * RString is then initialised by rb_setup_fake_str to describe ptr/len
 * with enc, and passed to register_fstring.
 * NOTE(review): the meaning of the TRUE flag is defined by
 * register_fstring, which is not visible here. */
11953 rb_enc_interned_str(const char *ptr
, long len
, rb_encoding
*enc
)
11955 if (UNLIKELY(rb_enc_autoload_p(enc
))) {
11956 rb_enc_autoload(enc
);
11959 struct RString fake_str
;
11960 return register_fstring(rb_setup_fake_str(&fake_str
, ptr
, len
, enc
), TRUE
);
/* C-string convenience form of rb_enc_interned_str: the length is taken
 * with strlen, so ptr must be NUL-terminated. */
11964 rb_enc_interned_str_cstr(const char *ptr
, rb_encoding
*enc
)
11966 return rb_enc_interned_str(ptr
, strlen(ptr
), enc
);
11970 * A \String object has an arbitrary sequence of bytes,
11971 * typically representing text or binary data.
11972 * A \String object may be created using String::new or as literals.
11974 * String objects differ from Symbol objects in that Symbol objects are
11975 * designed to be used as identifiers, instead of text or data.
11977 * You can create a \String object explicitly with:
11979 * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11980 * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11982 * You can convert certain objects to Strings with:
11984 * - \Method {String}[Kernel.html#method-i-String].
11986 * Some \String methods modify +self+.
11987 * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11988 * and returns +self+;
11989 * often a similarly named method (without the <tt>!</tt>)
11990 * returns a new string.
11992 * In general, if both bang and non-bang versions of a method exist,
11993 * the bang version mutates +self+ and the non-bang version does not.
11994 * However, a method without a bang can also mutate, such as String#replace.
11996 * == Substitution Methods
11998 * These methods perform substitutions:
12000 * - String#sub: One substitution (or none); returns a new string.
12001 * - String#sub!: One substitution (or none); returns +self+.
12002 * - String#gsub: Zero or more substitutions; returns a new string.
12003 * - String#gsub!: Zero or more substitutions; returns +self+.
12005 * Each of these methods takes:
12007 * - A first argument, +pattern+ (string or regexp),
12008 * that specifies the substring(s) to be replaced.
12010 * - Either of these:
12012 * - A second argument, +replacement+ (string or hash),
12013 * that determines the replacing string.
12014 * - A block that will determine the replacing string.
12016 * The examples in this section mostly use methods String#sub and String#gsub;
12017 * the principles illustrated apply to all four substitution methods.
12019 * <b>Argument +pattern+</b>
12021 * Argument +pattern+ is commonly a regular expression:
12024 * s.sub(/[aeiou]/, '*') # => "h*llo"
12025 * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12026 * s.gsub(/[aeiou]/, '') # => "hll"
12027 * s.sub(/ell/, 'al') # => "halo"
12028 * s.gsub(/xyzzy/, '*') # => "hello"
12029 * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12031 * When +pattern+ is a string, all its characters are treated
12032 * as ordinary characters (not as regexp special characters):
12034 * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12036 * <b>\String +replacement+</b>
12038 * If +replacement+ is a string, that string will determine
12039 * the replacing string that is to be substituted for the matched text.
12041 * Each of the examples above uses a simple string as the replacing string.
12043 * \String +replacement+ may contain back-references to the pattern's captures:
12045 * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12046 * - <tt>\k<name></tt> refers to the named capture +name+.
12048 * See rdoc-ref:regexp.rdoc for details.
12050 * Note that within the string +replacement+, a character combination
12051 * such as <tt>$&</tt> is treated as ordinary text, and not as
12052 * a special match variable.
12053 * However, you may refer to some special match variables using these
12056 * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12057 * which contains the complete matched text.
12058 * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12059 * which contains the string after the match.
12060 * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12061 * which contains the string before the match.
12062 * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12063 * which contains the last capture group.
12065 * See rdoc-ref:regexp.rdoc for details.
12067 * Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12069 * Note also that a string literal consumes backslashes.
12070 * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12072 * A back-reference is typically preceded by an additional backslash.
12073 * For example, if you want to write a back-reference <tt>\&</tt> in
12074 * +replacement+ with a double-quoted string literal, you need to write
12075 * <tt>"..\\\\&.."</tt>.
12077 * If you want to write a non-back-reference string <tt>\&</tt> in
12078 * +replacement+, you need first to escape the backslash to prevent
12079 * this method from interpreting it as a back-reference, and then you
12080 * need to escape the backslashes again to prevent a string literal from
12081 * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12083 * You may want to use the block form to avoid a lot of backslashes.
12085 * <b>\Hash +replacement+</b>
12087 * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12088 * the replacing string is the value for that key:
12090 * h = {'foo' => 'bar', 'baz' => 'bat'}
12091 * 'food'.sub('foo', h) # => "bard"
12093 * Note that a symbol key does not match:
12095 * h = {foo: 'bar', baz: 'bat'}
12096 * 'food'.sub('foo', h) # => "d"
12100 * In the block form, the current match string is passed to the block;
12101 * the block's return value becomes the replacing string:
12104 * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12106 * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12107 * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12112 * First, what's elsewhere. \Class \String:
12114 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12115 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12117 * Here, class \String provides methods that are useful for:
12119 * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12120 * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12121 * - {Querying}[#class-String-label-Methods+for+Querying]
12122 * - {Comparing}[#class-String-label-Methods+for+Comparing]
12123 * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12124 * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12125 * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12126 * - {Iterating}[#class-String-label-Methods+for+Iterating]
12128 * === Methods for Creating a \String
12130 * - ::new:: Returns a new string.
12131 * - ::try_convert:: Returns a new string created from a given object.
12133 * === Methods for a Frozen/Unfrozen String
12135 * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12136 * +self+, if not frozen; +self.dup+ otherwise.
12137 * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12138 * +self+, if already frozen; +self.freeze+ otherwise.
12139 * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12141 * === Methods for Querying
12145 * - #length, #size:: Returns the count of characters (not bytes).
12146 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12147 * - #bytesize:: Returns the count of bytes.
12148 * - #count:: Returns the count of substrings matching given strings.
12152 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12153 * returns +nil+ if no match is found.
12154 * - #index:: Returns the index of the _first_ occurrence of a given substring;
12155 * returns +nil+ if none found.
12156 * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12157 * returns +nil+ if none found.
12158 * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12159 * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12160 * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12161 * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12162 * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12166 * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12167 * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12168 * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12169 * for its encoding.
12170 * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12174 * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12175 * - #hash:: Returns the integer hash code.
12177 * === Methods for Comparing
12179 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12180 * - #eql?:: Returns +true+ if the content is the same as the given other string.
12181 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12182 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+ is
12183 * smaller than, equal to, or larger than a given other string.
12184 * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12185 * +false+ otherwise.
12187 * === Methods for Modifying a \String
12189 * Each of these methods modifies +self+.
12193 * - #insert:: Returns +self+ with a given string inserted at a given offset.
12194 * - #<<:: Returns +self+ concatenated with a given string or integer.
12198 * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12199 * returns +self+ if any changes, +nil+ otherwise.
12200 * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12201 * returns +self+ if any changes, +nil+ otherwise.
12202 * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12203 * - #replace:: Returns +self+ with its entire content replaced by a given string.
12204 * - #reverse!:: Returns +self+ with its characters in reverse order.
12205 * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12206 * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12207 * returns +self+ if any changes, +nil+ otherwise.
12208 * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12209 * removing duplicates from the substrings that were modified;
12210 * returns +self+ if any changes, +nil+ otherwise.
12214 * - #capitalize!:: Upcases the initial character and downcases all others;
12215 * returns +self+ if any changes, +nil+ otherwise.
12216 * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12217 * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12218 * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12219 * returns +self+ if any changes, +nil+ otherwise.
12223 * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12224 * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12225 * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12226 * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12230 * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12231 * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12232 * - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12233 * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12234 * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12235 * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12236 * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12237 * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12238 * - #chop!:: Removes the last character, or both characters of a trailing carriage-return/newline pair;
12239 * returns +self+ if any changes, +nil+ otherwise.
12241 * === Methods for Converting to New \String
12243 * Each of these methods returns a new \String based on +self+,
12244 * often just a modified copy of +self+.
12248 * - #*:: Returns the concatenation of multiple copies of +self+.
12249 * - #+:: Returns the concatenation of +self+ and a given other string.
12250 * - #center:: Returns a copy of +self+ centered between pad substring.
12251 * - #concat:: Returns the concatenation of +self+ with given other strings.
12252 * - #prepend:: Returns the concatenation of a given other string with +self+.
12253 * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12254 * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12258 * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12259 * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12260 * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12261 * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12265 * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by <tt>\xHH</tt> notation
12266 * and all special characters escaped.
12267 * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12268 * and all escaped characters unescaped.
12269 * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12270 * replaced with a given replacement string.
12271 * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12272 * replaced with a given replacement string.
12273 * - #succ, #next:: Returns the string that is the successor to +self+.
12274 * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12275 * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12276 * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12277 * removing duplicates from the substrings that were modified.
12278 * - #%:: Returns the string resulting from formatting a given object into +self+
12282 * - #capitalize:: Returns a copy of +self+ with the first character upcased
12283 * and all other characters downcased.
12284 * - #downcase:: Returns a copy of +self+ with all characters downcased.
12285 * - #upcase:: Returns a copy of +self+ with all characters upcased.
12286 * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12287 * and all downcase characters upcased.
12291 * - #delete:: Returns a copy of +self+ with characters removed.
12292 * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12293 * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12294 * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12295 * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12296 * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12297 * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12298 * - #chop:: Returns a copy of +self+ with the last character, or a trailing carriage-return/newline pair, removed.
12299 * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12300 * - #[], #slice:: Returns a substring determined by a given index, start/length, range, or string.
12301 * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12302 * - #chr:: Returns the first character.
12306 * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12307 * otherwise, returns +self+.
12309 * === Methods for Converting to Non-\String
12311 * Each of these methods converts the contents of +self+ to a non-\String.
12313 * <em>Characters, Bytes, and Clusters</em>
12315 * - #bytes:: Returns an array of the bytes in +self+.
12316 * - #chars:: Returns an array of the characters in +self+.
12317 * - #codepoints:: Returns an array of the integer ordinals in +self+.
12318 * - #getbyte:: Returns an integer byte as determined by a given index.
12319 * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12323 * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12324 * - #partition:: Returns a 3-element array determined by the first substring that matches
12325 * a given substring or regexp.
12326 * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12327 * a given substring or regexp.
12328 * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12329 * or, if a block given, passes those substrings to the block.
12333 * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12334 * if a block given, passes each matching substring to the block.
12335 * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12336 * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12340 * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12341 * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12342 * - #ord:: Returns the integer ordinal of the first character in +self+.
12343 * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12344 * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12346 * <em>Strings and Symbols</em>
12348 * - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12349 * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12351 * === Methods for Iterating
12353 * - #each_byte:: Calls the given block with each successive byte in +self+.
12354 * - #each_char:: Calls the given block with each successive character in +self+.
12355 * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12356 * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12357 * - #each_line:: Calls the given block with each successive line in +self+,
12358 * as determined by a given record separator.
12359 * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12365 rb_cString
= rb_define_class("String", rb_cObject
);
12366 assert(rb_vm_fstring_table());
12367 st_foreach(rb_vm_fstring_table(), fstring_set_class_i
, rb_cString
);
12368 rb_include_module(rb_cString
, rb_mComparable
);
12369 rb_define_alloc_func(rb_cString
, empty_str_alloc
);
12370 rb_define_singleton_method(rb_cString
, "try_convert", rb_str_s_try_convert
, 1);
12371 rb_define_method(rb_cString
, "initialize", rb_str_init
, -1);
12372 rb_define_method(rb_cString
, "initialize_copy", rb_str_replace
, 1);
12373 rb_define_method(rb_cString
, "<=>", rb_str_cmp_m
, 1);
12374 rb_define_method(rb_cString
, "==", rb_str_equal
, 1);
12375 rb_define_method(rb_cString
, "===", rb_str_equal
, 1);
12376 rb_define_method(rb_cString
, "eql?", rb_str_eql
, 1);
12377 rb_define_method(rb_cString
, "hash", rb_str_hash_m
, 0);
12378 rb_define_method(rb_cString
, "casecmp", rb_str_casecmp
, 1);
12379 rb_define_method(rb_cString
, "casecmp?", rb_str_casecmp_p
, 1);
12380 rb_define_method(rb_cString
, "+", rb_str_plus
, 1);
12381 rb_define_method(rb_cString
, "*", rb_str_times
, 1);
12382 rb_define_method(rb_cString
, "%", rb_str_format_m
, 1);
12383 rb_define_method(rb_cString
, "[]", rb_str_aref_m
, -1);
12384 rb_define_method(rb_cString
, "[]=", rb_str_aset_m
, -1);
12385 rb_define_method(rb_cString
, "insert", rb_str_insert
, 2);
12386 rb_define_method(rb_cString
, "length", rb_str_length
, 0);
12387 rb_define_method(rb_cString
, "size", rb_str_length
, 0);
12388 rb_define_method(rb_cString
, "bytesize", rb_str_bytesize
, 0);
12389 rb_define_method(rb_cString
, "empty?", rb_str_empty
, 0);
12390 rb_define_method(rb_cString
, "=~", rb_str_match
, 1);
12391 rb_define_method(rb_cString
, "match", rb_str_match_m
, -1);
12392 rb_define_method(rb_cString
, "match?", rb_str_match_m_p
, -1);
12393 rb_define_method(rb_cString
, "succ", rb_str_succ
, 0);
12394 rb_define_method(rb_cString
, "succ!", rb_str_succ_bang
, 0);
12395 rb_define_method(rb_cString
, "next", rb_str_succ
, 0);
12396 rb_define_method(rb_cString
, "next!", rb_str_succ_bang
, 0);
12397 rb_define_method(rb_cString
, "upto", rb_str_upto
, -1);
12398 rb_define_method(rb_cString
, "index", rb_str_index_m
, -1);
12399 rb_define_method(rb_cString
, "rindex", rb_str_rindex_m
, -1);
12400 rb_define_method(rb_cString
, "replace", rb_str_replace
, 1);
12401 rb_define_method(rb_cString
, "clear", rb_str_clear
, 0);
12402 rb_define_method(rb_cString
, "chr", rb_str_chr
, 0);
12403 rb_define_method(rb_cString
, "getbyte", rb_str_getbyte
, 1);
12404 rb_define_method(rb_cString
, "setbyte", rb_str_setbyte
, 2);
12405 rb_define_method(rb_cString
, "byteslice", rb_str_byteslice
, -1);
12406 rb_define_method(rb_cString
, "scrub", str_scrub
, -1);
12407 rb_define_method(rb_cString
, "scrub!", str_scrub_bang
, -1);
12408 rb_define_method(rb_cString
, "freeze", rb_str_freeze
, 0);
12409 rb_define_method(rb_cString
, "+@", str_uplus
, 0);
12410 rb_define_method(rb_cString
, "-@", str_uminus
, 0);
12412 rb_define_method(rb_cString
, "to_i", rb_str_to_i
, -1);
12413 rb_define_method(rb_cString
, "to_f", rb_str_to_f
, 0);
12414 rb_define_method(rb_cString
, "to_s", rb_str_to_s
, 0);
12415 rb_define_method(rb_cString
, "to_str", rb_str_to_s
, 0);
12416 rb_define_method(rb_cString
, "inspect", rb_str_inspect
, 0);
12417 rb_define_method(rb_cString
, "dump", rb_str_dump
, 0);
12418 rb_define_method(rb_cString
, "undump", str_undump
, 0);
12420 sym_ascii
= ID2SYM(rb_intern_const("ascii"));
12421 sym_turkic
= ID2SYM(rb_intern_const("turkic"));
12422 sym_lithuanian
= ID2SYM(rb_intern_const("lithuanian"));
12423 sym_fold
= ID2SYM(rb_intern_const("fold"));
12425 rb_define_method(rb_cString
, "upcase", rb_str_upcase
, -1);
12426 rb_define_method(rb_cString
, "downcase", rb_str_downcase
, -1);
12427 rb_define_method(rb_cString
, "capitalize", rb_str_capitalize
, -1);
12428 rb_define_method(rb_cString
, "swapcase", rb_str_swapcase
, -1);
12430 rb_define_method(rb_cString
, "upcase!", rb_str_upcase_bang
, -1);
12431 rb_define_method(rb_cString
, "downcase!", rb_str_downcase_bang
, -1);
12432 rb_define_method(rb_cString
, "capitalize!", rb_str_capitalize_bang
, -1);
12433 rb_define_method(rb_cString
, "swapcase!", rb_str_swapcase_bang
, -1);
12435 rb_define_method(rb_cString
, "hex", rb_str_hex
, 0);
12436 rb_define_method(rb_cString
, "oct", rb_str_oct
, 0);
12437 rb_define_method(rb_cString
, "split", rb_str_split_m
, -1);
12438 rb_define_method(rb_cString
, "lines", rb_str_lines
, -1);
12439 rb_define_method(rb_cString
, "bytes", rb_str_bytes
, 0);
12440 rb_define_method(rb_cString
, "chars", rb_str_chars
, 0);
12441 rb_define_method(rb_cString
, "codepoints", rb_str_codepoints
, 0);
12442 rb_define_method(rb_cString
, "grapheme_clusters", rb_str_grapheme_clusters
, 0);
12443 rb_define_method(rb_cString
, "reverse", rb_str_reverse
, 0);
12444 rb_define_method(rb_cString
, "reverse!", rb_str_reverse_bang
, 0);
12445 rb_define_method(rb_cString
, "concat", rb_str_concat_multi
, -1);
12446 rb_define_method(rb_cString
, "<<", rb_str_concat
, 1);
12447 rb_define_method(rb_cString
, "prepend", rb_str_prepend_multi
, -1);
12448 rb_define_method(rb_cString
, "crypt", rb_str_crypt
, 1);
12449 rb_define_method(rb_cString
, "intern", rb_str_intern
, 0); /* in symbol.c */
12450 rb_define_method(rb_cString
, "to_sym", rb_str_intern
, 0); /* in symbol.c */
12451 rb_define_method(rb_cString
, "ord", rb_str_ord
, 0);
12453 rb_define_method(rb_cString
, "include?", rb_str_include
, 1);
12454 rb_define_method(rb_cString
, "start_with?", rb_str_start_with
, -1);
12455 rb_define_method(rb_cString
, "end_with?", rb_str_end_with
, -1);
12457 rb_define_method(rb_cString
, "scan", rb_str_scan
, 1);
12459 rb_define_method(rb_cString
, "ljust", rb_str_ljust
, -1);
12460 rb_define_method(rb_cString
, "rjust", rb_str_rjust
, -1);
12461 rb_define_method(rb_cString
, "center", rb_str_center
, -1);
12463 rb_define_method(rb_cString
, "sub", rb_str_sub
, -1);
12464 rb_define_method(rb_cString
, "gsub", rb_str_gsub
, -1);
12465 rb_define_method(rb_cString
, "chop", rb_str_chop
, 0);
12466 rb_define_method(rb_cString
, "chomp", rb_str_chomp
, -1);
12467 rb_define_method(rb_cString
, "strip", rb_str_strip
, 0);
12468 rb_define_method(rb_cString
, "lstrip", rb_str_lstrip
, 0);
12469 rb_define_method(rb_cString
, "rstrip", rb_str_rstrip
, 0);
12470 rb_define_method(rb_cString
, "delete_prefix", rb_str_delete_prefix
, 1);
12471 rb_define_method(rb_cString
, "delete_suffix", rb_str_delete_suffix
, 1);
12473 rb_define_method(rb_cString
, "sub!", rb_str_sub_bang
, -1);
12474 rb_define_method(rb_cString
, "gsub!", rb_str_gsub_bang
, -1);
12475 rb_define_method(rb_cString
, "chop!", rb_str_chop_bang
, 0);
12476 rb_define_method(rb_cString
, "chomp!", rb_str_chomp_bang
, -1);
12477 rb_define_method(rb_cString
, "strip!", rb_str_strip_bang
, 0);
12478 rb_define_method(rb_cString
, "lstrip!", rb_str_lstrip_bang
, 0);
12479 rb_define_method(rb_cString
, "rstrip!", rb_str_rstrip_bang
, 0);
12480 rb_define_method(rb_cString
, "delete_prefix!", rb_str_delete_prefix_bang
, 1);
12481 rb_define_method(rb_cString
, "delete_suffix!", rb_str_delete_suffix_bang
, 1);
12483 rb_define_method(rb_cString
, "tr", rb_str_tr
, 2);
12484 rb_define_method(rb_cString
, "tr_s", rb_str_tr_s
, 2);
12485 rb_define_method(rb_cString
, "delete", rb_str_delete
, -1);
12486 rb_define_method(rb_cString
, "squeeze", rb_str_squeeze
, -1);
12487 rb_define_method(rb_cString
, "count", rb_str_count
, -1);
12489 rb_define_method(rb_cString
, "tr!", rb_str_tr_bang
, 2);
12490 rb_define_method(rb_cString
, "tr_s!", rb_str_tr_s_bang
, 2);
12491 rb_define_method(rb_cString
, "delete!", rb_str_delete_bang
, -1);
12492 rb_define_method(rb_cString
, "squeeze!", rb_str_squeeze_bang
, -1);
12494 rb_define_method(rb_cString
, "each_line", rb_str_each_line
, -1);
12495 rb_define_method(rb_cString
, "each_byte", rb_str_each_byte
, 0);
12496 rb_define_method(rb_cString
, "each_char", rb_str_each_char
, 0);
12497 rb_define_method(rb_cString
, "each_codepoint", rb_str_each_codepoint
, 0);
12498 rb_define_method(rb_cString
, "each_grapheme_cluster", rb_str_each_grapheme_cluster
, 0);
12500 rb_define_method(rb_cString
, "sum", rb_str_sum
, -1);
12502 rb_define_method(rb_cString
, "slice", rb_str_aref_m
, -1);
12503 rb_define_method(rb_cString
, "slice!", rb_str_slice_bang
, -1);
12505 rb_define_method(rb_cString
, "partition", rb_str_partition
, 1);
12506 rb_define_method(rb_cString
, "rpartition", rb_str_rpartition
, 1);
12508 rb_define_method(rb_cString
, "encoding", rb_obj_encoding
, 0); /* in encoding.c */
12509 rb_define_method(rb_cString
, "force_encoding", rb_str_force_encoding
, 1);
12510 rb_define_method(rb_cString
, "b", rb_str_b
, 0);
12511 rb_define_method(rb_cString
, "valid_encoding?", rb_str_valid_encoding_p
, 0);
12512 rb_define_method(rb_cString
, "ascii_only?", rb_str_is_ascii_only_p
, 0);
12514 /* define UnicodeNormalize module here so that we don't have to look it up */
12515 mUnicodeNormalize
= rb_define_module("UnicodeNormalize");
12516 id_normalize
= rb_intern_const("normalize");
12517 id_normalized_p
= rb_intern_const("normalized?");
12519 rb_define_method(rb_cString
, "unicode_normalize", rb_str_unicode_normalize
, -1);
12520 rb_define_method(rb_cString
, "unicode_normalize!", rb_str_unicode_normalize_bang
, -1);
12521 rb_define_method(rb_cString
, "unicode_normalized?", rb_str_unicode_normalized_p
, -1);
12524 rb_define_hooked_variable("$;", &rb_fs
, 0, rb_fs_setter
);
12525 rb_define_hooked_variable("$-F", &rb_fs
, 0, rb_fs_setter
);
12526 rb_gc_register_address(&rb_fs
);
12528 rb_cSymbol
= rb_define_class("Symbol", rb_cObject
);
12529 rb_include_module(rb_cSymbol
, rb_mComparable
);
12530 rb_undef_alloc_func(rb_cSymbol
);
12531 rb_undef_method(CLASS_OF(rb_cSymbol
), "new");
12532 rb_define_singleton_method(rb_cSymbol
, "all_symbols", sym_all_symbols
, 0);
12534 rb_define_method(rb_cSymbol
, "==", sym_equal
, 1);
12535 rb_define_method(rb_cSymbol
, "===", sym_equal
, 1);
12536 rb_define_method(rb_cSymbol
, "inspect", sym_inspect
, 0);
12537 rb_define_method(rb_cSymbol
, "to_s", rb_sym_to_s
, 0);
12538 rb_define_method(rb_cSymbol
, "id2name", rb_sym_to_s
, 0);
12539 rb_define_method(rb_cSymbol
, "name", rb_sym2str
, 0);
12540 rb_define_method(rb_cSymbol
, "intern", sym_to_sym
, 0);
12541 rb_define_method(rb_cSymbol
, "to_sym", sym_to_sym
, 0);
12542 rb_define_method(rb_cSymbol
, "to_proc", rb_sym_to_proc
, 0);
12543 rb_define_method(rb_cSymbol
, "succ", sym_succ
, 0);
12544 rb_define_method(rb_cSymbol
, "next", sym_succ
, 0);
12546 rb_define_method(rb_cSymbol
, "<=>", sym_cmp
, 1);
12547 rb_define_method(rb_cSymbol
, "casecmp", sym_casecmp
, 1);
12548 rb_define_method(rb_cSymbol
, "casecmp?", sym_casecmp_p
, 1);
12549 rb_define_method(rb_cSymbol
, "=~", sym_match
, 1);
12551 rb_define_method(rb_cSymbol
, "[]", sym_aref
, -1);
12552 rb_define_method(rb_cSymbol
, "slice", sym_aref
, -1);
12553 rb_define_method(rb_cSymbol
, "length", sym_length
, 0);
12554 rb_define_method(rb_cSymbol
, "size", sym_length
, 0);
12555 rb_define_method(rb_cSymbol
, "empty?", sym_empty
, 0);
12556 rb_define_method(rb_cSymbol
, "match", sym_match_m
, -1);
12557 rb_define_method(rb_cSymbol
, "match?", sym_match_m_p
, -1);
12559 rb_define_method(rb_cSymbol
, "upcase", sym_upcase
, -1);
12560 rb_define_method(rb_cSymbol
, "downcase", sym_downcase
, -1);
12561 rb_define_method(rb_cSymbol
, "capitalize", sym_capitalize
, -1);
12562 rb_define_method(rb_cSymbol
, "swapcase", sym_swapcase
, -1);
12564 rb_define_method(rb_cSymbol
, "start_with?", sym_start_with
, -1);
12565 rb_define_method(rb_cSymbol
, "end_with?", sym_end_with
, -1);
12567 rb_define_method(rb_cSymbol
, "encoding", sym_encoding
, 0);