Fix typos [ci skip]
[ruby-80x24.org.git] / string.c
blob24acdfae0e4bdb42defa96fb4581402fd31cb83a
1 /**********************************************************************
3 string.c -
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "gc.h"
27 #include "id.h"
28 #include "internal.h"
29 #include "internal/array.h"
30 #include "internal/compar.h"
31 #include "internal/compilers.h"
32 #include "internal/encoding.h"
33 #include "internal/error.h"
34 #include "internal/gc.h"
35 #include "internal/numeric.h"
36 #include "internal/object.h"
37 #include "internal/proc.h"
38 #include "internal/re.h"
39 #include "internal/sanitizers.h"
40 #include "internal/string.h"
41 #include "internal/transcode.h"
42 #include "probes.h"
43 #include "ruby/encoding.h"
44 #include "ruby/re.h"
45 #include "ruby/util.h"
46 #include "ruby_assert.h"
47 #include "vm_sync.h"
49 #if defined HAVE_CRYPT_R
50 # if defined HAVE_CRYPT_H
51 # include <crypt.h>
52 # endif
53 #elif !defined HAVE_CRYPT
54 # include "missing/crypt.h"
55 # define HAVE_CRYPT_R 1
56 #endif
58 #define BEG(no) (regs->beg[(no)])
59 #define END(no) (regs->end[(no)])
61 #undef rb_str_new
62 #undef rb_usascii_str_new
63 #undef rb_utf8_str_new
64 #undef rb_enc_str_new
65 #undef rb_str_new_cstr
66 #undef rb_tainted_str_new_cstr
67 #undef rb_usascii_str_new_cstr
68 #undef rb_utf8_str_new_cstr
69 #undef rb_enc_str_new_cstr
70 #undef rb_external_str_new_cstr
71 #undef rb_locale_str_new_cstr
72 #undef rb_str_dup_frozen
73 #undef rb_str_buf_new_cstr
74 #undef rb_str_buf_cat
75 #undef rb_str_buf_cat2
76 #undef rb_str_cat2
77 #undef rb_str_cat_cstr
78 #undef rb_fstring_cstr
80 VALUE rb_cString;
81 VALUE rb_cSymbol;
83 /* FLAGS of RString
85 * 1: RSTRING_NOEMBED
86 * 2: STR_SHARED (== ELTS_SHARED)
87 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
88 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
89 * other strings that rely on this string's buffer)
90 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
91 * early, specific to rb_str_tmp_frozen_{acquire,release})
92 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
93 * such as read(2). Any modification and realloc is prohibited)
95 * 8-9: ENC_CODERANGE (2 bits)
96 * 10-16: ENCODING (7 bits == 128)
97 * 17: RSTRING_FSTR
98 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
99 * used for a string object based on C string literal)
100 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
101 * object header is temporarily allocated on C stack)
104 #define RUBY_MAX_CHAR_LEN 16
105 #define STR_SHARED_ROOT FL_USER5
106 #define STR_BORROWED FL_USER6
107 #define STR_TMPLOCK FL_USER7
108 #define STR_NOFREE FL_USER18
109 #define STR_FAKESTR FL_USER19
/* Flag `str` as non-embedded.  Under USE_RVARGC the STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED flags are cleared; otherwise the
 * embedded length is reset to 0. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (!USE_RVARGC) {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
    else {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
} while (0)
/* Flag `str` as embedded by clearing STR_NOEMBED and STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))

/* Store the length `n` of an embedded string. */
#if USE_RVARGC
/* The length lives in the embed area; it must be below the slot capacity. */
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#else
/* The length lives in the RSTRING_EMBED_LEN bits of the flags word. */
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#endif
/* Set the length of `str` to `n`, in whichever place the current
 * representation (heap or embedded) keeps it. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)
/* Shrink the length of `str` by one byte. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        long n = RSTRING_LEN(str) - 1;\
        STR_SET_EMBED_LEN((str), n);\
    }\
} while (0)
/* Terminator width of `str`: the minimum character length of its encoding. */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))

/* Write a `termlen`-byte zero terminator at `ptr`.  The first byte is
 * always stored directly; memset() is only used for wide terminators. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus the terminator of
 * the string's own encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string: nothing happens while the embed area is large
 *   enough; otherwise the contents are copied into a freshly allocated
 *   heap buffer and the string is switched to the heap representation.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be STR_SHARED (asserted).
 *
 * Macro arguments are parenthesized wherever they appear inside an
 * expression so that callers may pass compound expressions safely. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
/* Make `str` share the buffer owned by `shared_str`.
 *
 * Nothing is done for STR_FAKESTR strings.  Otherwise the buffer of `str`
 * is asserted to lie within the buffer of `shared_str`, the reference is
 * stored through the write barrier, `str` is flagged STR_SHARED and
 * `shared_str` is flagged STR_SHARED_ROOT.  A root with no class is also
 * flagged STR_BORROWED. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Allocated size of the heap buffer: capacity plus terminator width. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding of `str`, resolved through get_encoding(). */
#define STR_ENC_GET(str) get_encoding(str)

/* When SHARABLE_MIDDLE_SUBSTRING is off (the default), a substring is
 * considered sharable only if it ends exactly at `end`; when it is on,
 * every substring is considered sharable. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
215 static inline long
216 str_embed_capa(VALUE str)
218 #if USE_RVARGC
219 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
220 #else
221 return RSTRING_EMBED_LEN_MAX + 1;
222 #endif
225 static inline size_t
226 str_embed_size(long capa)
228 return offsetof(struct RString, as.embed.ary) + capa;
231 static inline bool
232 STR_EMBEDDABLE_P(long len, long termlen)
234 #if USE_RVARGC
235 return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
236 #else
237 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
238 #endif
241 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
242 static VALUE str_new_frozen(VALUE klass, VALUE orig);
243 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
244 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
245 static VALUE str_new(VALUE klass, const char *ptr, long len);
246 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
247 static inline void str_modifiable(VALUE str);
248 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
250 static inline void
251 str_make_independent(VALUE str)
253 long len = RSTRING_LEN(str);
254 int termlen = TERM_LEN(str);
255 str_make_independent_expand((str), len, 0L, termlen);
258 static inline int str_dependent_p(VALUE str);
260 void
261 rb_str_make_independent(VALUE str)
263 if (str_dependent_p(str)) {
264 str_make_independent(str);
/*
 * Print a diagnostic to stderr saying that `func` is about to return
 * NULL.
 *
 * func: name of the reporting function; printed verbatim via "%s".
 *
 * The message is split over two lines and ends with a newline so that
 * whatever is written to stderr next starts on a fresh line.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
278 /* symbols for [up|down|swap]case/capitalize options */
279 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
281 static rb_encoding *
282 get_actual_encoding(const int encidx, VALUE str)
284 const unsigned char *q;
286 switch (encidx) {
287 case ENCINDEX_UTF_16:
288 if (RSTRING_LEN(str) < 2) break;
289 q = (const unsigned char *)RSTRING_PTR(str);
290 if (q[0] == 0xFE && q[1] == 0xFF) {
291 return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
293 if (q[0] == 0xFF && q[1] == 0xFE) {
294 return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
296 return rb_ascii8bit_encoding();
297 case ENCINDEX_UTF_32:
298 if (RSTRING_LEN(str) < 4) break;
299 q = (const unsigned char *)RSTRING_PTR(str);
300 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
301 return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
303 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
304 return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
306 return rb_ascii8bit_encoding();
308 return rb_enc_from_index(encidx);
311 static rb_encoding *
312 get_encoding(VALUE str)
314 return get_actual_encoding(ENCODING_GET(str), str);
317 static void
318 mustnot_broken(VALUE str)
320 if (is_broken_string(str)) {
321 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
325 static void
326 mustnot_wchar(VALUE str)
328 rb_encoding *enc = STR_ENC_GET(str);
329 if (rb_enc_mbminlen(enc) > 1) {
330 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
334 static int fstring_cmp(VALUE a, VALUE b);
336 static VALUE register_fstring(VALUE str, bool copy);
338 const struct st_hash_type rb_fstring_hash_type = {
339 fstring_cmp,
340 rb_str_hash,
343 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
345 struct fstr_update_arg {
346 VALUE fstr;
347 bool copy;
350 static int
351 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
354 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
355 VALUE str = (VALUE)*key;
357 if (existing) {
358 /* because of lazy sweep, str may be unmarked already and swept
359 * at next time */
361 if (rb_objspace_garbage_object_p(str)) {
362 arg->fstr = Qundef;
363 return ST_DELETE;
366 arg->fstr = str;
367 return ST_STOP;
369 else {
370 if (FL_TEST_RAW(str, STR_FAKESTR)) {
371 if (arg->copy) {
372 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
373 rb_enc_copy(new_str, str);
374 str = new_str;
376 else {
377 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
378 RSTRING(str)->as.heap.len,
379 ENCODING_GET(str));
381 OBJ_FREEZE_RAW(str);
383 else {
384 if (!OBJ_FROZEN(str))
385 str = str_new_frozen(rb_cString, str);
386 if (STR_SHARED_P(str)) { /* str should not be shared */
387 /* shared substring */
388 str_make_independent(str);
389 assert(OBJ_FROZEN(str));
391 if (!BARE_STRING_P(str)) {
392 str = str_new_frozen(rb_cString, str);
395 RBASIC(str)->flags |= RSTRING_FSTR;
397 *key = *value = arg->fstr = str;
398 return ST_CONTINUE;
402 RUBY_FUNC_EXPORTED
403 VALUE
404 rb_fstring(VALUE str)
406 VALUE fstr;
407 int bare;
409 Check_Type(str, T_STRING);
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE_RAW(str);
418 return str;
420 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
421 assert(OBJ_FROZEN(str));
422 return str;
426 if (!OBJ_FROZEN(str))
427 rb_str_resize(str, RSTRING_LEN(str));
429 fstr = register_fstring(str, FALSE);
431 if (!bare) {
432 str_replace_shared_without_enc(str, fstr);
433 OBJ_FREEZE_RAW(str);
434 return str;
436 return fstr;
439 static VALUE
440 register_fstring(VALUE str, bool copy)
442 struct fstr_update_arg args;
443 args.copy = copy;
445 RB_VM_LOCK_ENTER();
447 st_table *frozen_strings = rb_vm_fstring_table();
448 do {
449 args.fstr = str;
450 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
451 } while (args.fstr == Qundef);
453 RB_VM_LOCK_LEAVE();
455 assert(OBJ_FROZEN(args.fstr));
456 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
457 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
458 assert(RBASIC_CLASS(args.fstr) == rb_cString);
459 return args.fstr;
462 static VALUE
463 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
465 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
466 /* SHARED to be allocated by the callback */
468 if (!name) {
469 RUBY_ASSERT_ALWAYS(len == 0);
470 name = "";
473 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
475 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
476 fake_str->as.heap.len = len;
477 fake_str->as.heap.ptr = (char *)name;
478 fake_str->as.heap.aux.capa = len;
479 return (VALUE)fake_str;
483 * set up a fake string which refers a static string literal.
485 VALUE
486 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
488 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
492 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
493 * shared string which refers a static string literal. `ptr` must
494 * point a constant string.
496 MJIT_FUNC_EXPORTED VALUE
497 rb_fstring_new(const char *ptr, long len)
499 struct RString fake_str;
500 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
503 VALUE
504 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
506 struct RString fake_str;
507 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
510 VALUE
511 rb_fstring_cstr(const char *ptr)
513 return rb_fstring_new(ptr, strlen(ptr));
516 static int
517 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
519 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
520 return ST_CONTINUE;
523 static int
524 fstring_cmp(VALUE a, VALUE b)
526 long alen, blen;
527 const char *aptr, *bptr;
528 RSTRING_GETMEM(a, aptr, alen);
529 RSTRING_GETMEM(b, bptr, blen);
530 return (alen != blen ||
531 ENCODING_GET(a) != ENCODING_GET(b) ||
532 memcmp(aptr, bptr, alen) != 0);
535 static inline int
536 single_byte_optimizable(VALUE str)
538 rb_encoding *enc;
540 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
541 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
542 return 1;
544 enc = STR_ENC_GET(str);
545 if (rb_enc_mbmaxlen(enc) == 1)
546 return 1;
548 /* Conservative. Possibly single byte.
549 * "\xa1" in Shift_JIS for example. */
550 return 0;
553 VALUE rb_fs;
555 static inline const char *
556 search_nonascii(const char *p, const char *e)
558 const uintptr_t *s, *t;
560 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
561 # if SIZEOF_UINTPTR_T == 8
562 # define NONASCII_MASK UINT64_C(0x8080808080808080)
563 # elif SIZEOF_UINTPTR_T == 4
564 # define NONASCII_MASK UINT32_C(0x80808080)
565 # else
566 # error "don't know what to do."
567 # endif
568 #else
569 # if SIZEOF_UINTPTR_T == 8
570 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
571 # elif SIZEOF_UINTPTR_T == 4
572 # define NONASCII_MASK 0x80808080UL /* or...? */
573 # else
574 # error "don't know what to do."
575 # endif
576 #endif
578 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
579 #if !UNALIGNED_WORD_ACCESS
580 if ((uintptr_t)p % SIZEOF_VOIDP) {
581 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
582 p += l;
583 switch (l) {
584 default: UNREACHABLE;
585 #if SIZEOF_VOIDP > 4
586 case 7: if (p[-7]&0x80) return p-7;
587 case 6: if (p[-6]&0x80) return p-6;
588 case 5: if (p[-5]&0x80) return p-5;
589 case 4: if (p[-4]&0x80) return p-4;
590 #endif
591 case 3: if (p[-3]&0x80) return p-3;
592 case 2: if (p[-2]&0x80) return p-2;
593 case 1: if (p[-1]&0x80) return p-1;
594 case 0: break;
597 #endif
598 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
599 #define aligned_ptr(value) \
600 __builtin_assume_aligned((value), sizeof(uintptr_t))
601 #else
602 #define aligned_ptr(value) (uintptr_t *)(value)
603 #endif
604 s = aligned_ptr(p);
605 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
606 #undef aligned_ptr
607 for (;s < t; s++) {
608 if (*s & NONASCII_MASK) {
609 #ifdef WORDS_BIGENDIAN
610 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
611 #else
612 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
613 #endif
616 p = (const char *)s;
619 switch (e - p) {
620 default: UNREACHABLE;
621 #if SIZEOF_VOIDP > 4
622 case 7: if (e[-7]&0x80) return e-7;
623 case 6: if (e[-6]&0x80) return e-6;
624 case 5: if (e[-5]&0x80) return e-5;
625 case 4: if (e[-4]&0x80) return e-4;
626 #endif
627 case 3: if (e[-3]&0x80) return e-3;
628 case 2: if (e[-2]&0x80) return e-2;
629 case 1: if (e[-1]&0x80) return e-1;
630 case 0: return NULL;
634 static int
635 coderange_scan(const char *p, long len, rb_encoding *enc)
637 const char *e = p + len;
639 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
640 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
641 p = search_nonascii(p, e);
642 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
645 if (rb_enc_asciicompat(enc)) {
646 p = search_nonascii(p, e);
647 if (!p) return ENC_CODERANGE_7BIT;
648 for (;;) {
649 int ret = rb_enc_precise_mbclen(p, e, enc);
650 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
651 p += MBCLEN_CHARFOUND_LEN(ret);
652 if (p == e) break;
653 p = search_nonascii(p, e);
654 if (!p) break;
657 else {
658 while (p < e) {
659 int ret = rb_enc_precise_mbclen(p, e, enc);
660 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
661 p += MBCLEN_CHARFOUND_LEN(ret);
664 return ENC_CODERANGE_VALID;
667 long
668 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
670 const char *p = s;
672 if (*cr == ENC_CODERANGE_BROKEN)
673 return e - s;
675 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
676 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
677 if (*cr == ENC_CODERANGE_VALID) return e - s;
678 p = search_nonascii(p, e);
679 *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
680 return e - s;
682 else if (rb_enc_asciicompat(enc)) {
683 p = search_nonascii(p, e);
684 if (!p) {
685 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
686 return e - s;
688 for (;;) {
689 int ret = rb_enc_precise_mbclen(p, e, enc);
690 if (!MBCLEN_CHARFOUND_P(ret)) {
691 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
692 return p - s;
694 p += MBCLEN_CHARFOUND_LEN(ret);
695 if (p == e) break;
696 p = search_nonascii(p, e);
697 if (!p) break;
700 else {
701 while (p < e) {
702 int ret = rb_enc_precise_mbclen(p, e, enc);
703 if (!MBCLEN_CHARFOUND_P(ret)) {
704 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
705 return p - s;
707 p += MBCLEN_CHARFOUND_LEN(ret);
710 *cr = ENC_CODERANGE_VALID;
711 return e - s;
714 static inline void
715 str_enc_copy(VALUE str1, VALUE str2)
717 rb_enc_set_index(str1, ENCODING_GET(str2));
720 static void
721 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
723 /* this function is designed for copying encoding and coderange
724 * from src to new string "dest" which is made from the part of src.
726 str_enc_copy(dest, src);
727 if (RSTRING_LEN(dest) == 0) {
728 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
729 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
730 else
731 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
732 return;
734 switch (ENC_CODERANGE(src)) {
735 case ENC_CODERANGE_7BIT:
736 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
737 break;
738 case ENC_CODERANGE_VALID:
739 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
740 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
741 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
742 else
743 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
744 break;
745 default:
746 break;
750 static void
751 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
753 str_enc_copy(dest, src);
754 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
757 static int
758 enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
760 if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
761 rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
762 return ENC_CODERANGE_BROKEN;
764 else {
765 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
770 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
772 return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
776 rb_enc_str_coderange(VALUE str)
778 int cr = ENC_CODERANGE(str);
780 if (cr == ENC_CODERANGE_UNKNOWN) {
781 int encidx = ENCODING_GET(str);
782 rb_encoding *enc = rb_enc_from_index(encidx);
783 cr = enc_coderange_scan(str, enc, encidx);
784 ENC_CODERANGE_SET(str, cr);
786 return cr;
790 rb_enc_str_asciionly_p(VALUE str)
792 rb_encoding *enc = STR_ENC_GET(str);
794 if (!rb_enc_asciicompat(enc))
795 return FALSE;
796 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
797 return TRUE;
798 return FALSE;
801 static inline void
802 str_mod_check(VALUE s, const char *p, long len)
804 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
805 rb_raise(rb_eRuntimeError, "string modified");
809 static size_t
810 str_capacity(VALUE str, const int termlen)
812 if (STR_EMBED_P(str)) {
813 #if USE_RVARGC
814 return str_embed_capa(str) - termlen;
815 #else
816 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
817 #endif
819 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
820 return RSTRING(str)->as.heap.len;
822 else {
823 return RSTRING(str)->as.heap.aux.capa;
827 size_t
828 rb_str_capacity(VALUE str)
830 return str_capacity(str, TERM_LEN(str));
833 static inline void
834 must_not_null(const char *ptr)
836 if (!ptr) {
837 rb_raise(rb_eArgError, "NULL pointer given");
841 static inline VALUE
842 str_alloc(VALUE klass, size_t size)
844 assert(size > 0);
845 RVARGC_NEWOBJ_OF(str, struct RString, klass,
846 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
847 return (VALUE)str;
850 static inline VALUE
851 str_alloc_embed(VALUE klass, size_t capa)
853 size_t size = str_embed_size(capa);
854 assert(rb_gc_size_allocatable_p(size));
855 #if !USE_RVARGC
856 assert(size <= sizeof(struct RString));
857 #endif
858 return str_alloc(klass, size);
861 static inline VALUE
862 str_alloc_heap(VALUE klass)
864 return str_alloc(klass, sizeof(struct RString));
867 static inline VALUE
868 empty_str_alloc(VALUE klass)
870 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
871 VALUE str = str_alloc_embed(klass, 0);
872 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
873 return str;
876 static VALUE
877 str_new0(VALUE klass, const char *ptr, long len, int termlen)
879 VALUE str;
881 if (len < 0) {
882 rb_raise(rb_eArgError, "negative string size (or size too big)");
885 RUBY_DTRACE_CREATE_HOOK(STRING, len);
887 if (STR_EMBEDDABLE_P(len, termlen)) {
888 str = str_alloc_embed(klass, len + termlen);
889 if (len == 0) {
890 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
893 else {
894 str = str_alloc_heap(klass);
895 RSTRING(str)->as.heap.aux.capa = len;
896 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
897 * integer overflow. If we can STATIC_ASSERT that, the following
898 * mul_add_mul can be reverted to a simple ALLOC_N. */
899 RSTRING(str)->as.heap.ptr =
900 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
901 STR_SET_NOEMBED(str);
903 if (ptr) {
904 memcpy(RSTRING_PTR(str), ptr, len);
906 STR_SET_LEN(str, len);
907 TERM_FILL(RSTRING_PTR(str) + len, termlen);
908 return str;
911 static VALUE
912 str_new(VALUE klass, const char *ptr, long len)
914 return str_new0(klass, ptr, len, 1);
917 VALUE
918 rb_str_new(const char *ptr, long len)
920 return str_new(rb_cString, ptr, len);
923 VALUE
924 rb_usascii_str_new(const char *ptr, long len)
926 VALUE str = rb_str_new(ptr, len);
927 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
928 return str;
931 VALUE
932 rb_utf8_str_new(const char *ptr, long len)
934 VALUE str = str_new(rb_cString, ptr, len);
935 rb_enc_associate_index(str, rb_utf8_encindex());
936 return str;
939 VALUE
940 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
942 VALUE str;
944 if (!enc) return rb_str_new(ptr, len);
946 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
947 rb_enc_associate(str, enc);
948 return str;
951 VALUE
952 rb_str_new_cstr(const char *ptr)
954 must_not_null(ptr);
955 /* rb_str_new_cstr() can take pointer from non-malloc-generated
956 * memory regions, and that cannot be detected by the MSAN. Just
957 * trust the programmer that the argument passed here is a sane C
958 * string. */
959 __msan_unpoison_string(ptr);
960 return rb_str_new(ptr, strlen(ptr));
963 VALUE
964 rb_usascii_str_new_cstr(const char *ptr)
966 VALUE str = rb_str_new_cstr(ptr);
967 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
968 return str;
971 VALUE
972 rb_utf8_str_new_cstr(const char *ptr)
974 VALUE str = rb_str_new_cstr(ptr);
975 rb_enc_associate_index(str, rb_utf8_encindex());
976 return str;
979 VALUE
980 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
982 must_not_null(ptr);
983 if (rb_enc_mbminlen(enc) != 1) {
984 rb_raise(rb_eArgError, "wchar encoding given");
986 return rb_enc_str_new(ptr, strlen(ptr), enc);
989 static VALUE
990 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
992 VALUE str;
994 if (len < 0) {
995 rb_raise(rb_eArgError, "negative string size (or size too big)");
998 if (!ptr) {
999 rb_encoding *enc = rb_enc_get_from_index(encindex);
1000 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1002 else {
1003 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1004 str = str_alloc_heap(klass);
1005 RSTRING(str)->as.heap.len = len;
1006 RSTRING(str)->as.heap.ptr = (char *)ptr;
1007 RSTRING(str)->as.heap.aux.capa = len;
1008 STR_SET_NOEMBED(str);
1009 RBASIC(str)->flags |= STR_NOFREE;
1011 rb_enc_associate_index(str, encindex);
1012 return str;
1015 VALUE
1016 rb_str_new_static(const char *ptr, long len)
1018 return str_new_static(rb_cString, ptr, len, 0);
1021 VALUE
1022 rb_usascii_str_new_static(const char *ptr, long len)
1024 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1027 VALUE
1028 rb_utf8_str_new_static(const char *ptr, long len)
1030 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1033 VALUE
1034 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1036 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1039 VALUE
1040 rb_tainted_str_new(const char *ptr, long len)
1042 rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new", NULL);
1043 return rb_str_new(ptr, len);
1046 VALUE
1047 rb_tainted_str_new_cstr(const char *ptr)
1049 rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new_cstr", NULL);
1050 return rb_str_new_cstr(ptr);
1053 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1054 rb_encoding *from, rb_encoding *to,
1055 int ecflags, VALUE ecopts);
1057 static inline bool
1058 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1060 int encidx = rb_enc_to_index(enc);
1061 if (rb_enc_get_index(str) == encidx)
1062 return is_ascii_string(str);
1063 return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1066 VALUE
1067 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1069 long len;
1070 const char *ptr;
1071 VALUE newstr;
1073 if (!to) return str;
1074 if (!from) from = rb_enc_get(str);
1075 if (from == to) return str;
1076 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1077 to == rb_ascii8bit_encoding()) {
1078 if (STR_ENC_GET(str) != to) {
1079 str = rb_str_dup(str);
1080 rb_enc_associate(str, to);
1082 return str;
1085 RSTRING_GETMEM(str, ptr, len);
1086 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1087 from, to, ecflags, ecopts);
1088 if (NIL_P(newstr)) {
1089 /* some error, return original */
1090 return str;
1092 return newstr;
1095 VALUE
1096 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1097 rb_encoding *from, int ecflags, VALUE ecopts)
1099 long olen;
1101 olen = RSTRING_LEN(newstr);
1102 if (ofs < -olen || olen < ofs)
1103 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1104 if (ofs < 0) ofs += olen;
1105 if (!from) {
1106 STR_SET_LEN(newstr, ofs);
1107 return rb_str_cat(newstr, ptr, len);
1110 rb_str_modify(newstr);
1111 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1112 rb_enc_get(newstr),
1113 ecflags, ecopts);
1116 VALUE
1117 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1119 STR_SET_LEN(str, 0);
1120 rb_enc_associate(str, enc);
1121 rb_str_cat(str, ptr, len);
1122 return str;
1125 static VALUE
1126 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1127 rb_encoding *from, rb_encoding *to,
1128 int ecflags, VALUE ecopts)
1130 rb_econv_t *ec;
1131 rb_econv_result_t ret;
1132 long olen;
1133 VALUE econv_wrapper;
1134 const unsigned char *start, *sp;
1135 unsigned char *dest, *dp;
1136 size_t converted_output = (size_t)ofs;
1138 olen = rb_str_capacity(newstr);
1140 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1141 RBASIC_CLEAR_CLASS(econv_wrapper);
1142 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1143 if (!ec) return Qnil;
1144 DATA_PTR(econv_wrapper) = ec;
1146 sp = (unsigned char*)ptr;
1147 start = sp;
1148 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1149 (dp = dest + converted_output),
1150 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1151 ret == econv_destination_buffer_full) {
1152 /* destination buffer short */
1153 size_t converted_input = sp - start;
1154 size_t rest = len - converted_input;
1155 converted_output = dp - dest;
1156 rb_str_set_len(newstr, converted_output);
1157 if (converted_input && converted_output &&
1158 rest < (LONG_MAX / converted_output)) {
1159 rest = (rest * converted_output) / converted_input;
1161 else {
1162 rest = olen;
1164 olen += rest < 2 ? 2 : rest;
1165 rb_str_resize(newstr, olen);
1167 DATA_PTR(econv_wrapper) = 0;
1168 rb_econv_close(ec);
1169 switch (ret) {
1170 case econv_finished:
1171 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1172 rb_str_set_len(newstr, len);
1173 rb_enc_associate(newstr, to);
1174 return newstr;
1176 default:
1177 return Qnil;
1181 VALUE
1182 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1184 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1187 VALUE
1188 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1190 rb_encoding *ienc;
1191 VALUE str;
1192 const int eidx = rb_enc_to_index(eenc);
1194 if (!ptr) {
1195 return rb_enc_str_new(ptr, len, eenc);
1198 /* ASCII-8BIT case, no conversion */
1199 if ((eidx == rb_ascii8bit_encindex()) ||
1200 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1201 return rb_str_new(ptr, len);
1203 /* no default_internal or same encoding, no conversion */
1204 ienc = rb_default_internal_encoding();
1205 if (!ienc || eenc == ienc) {
1206 return rb_enc_str_new(ptr, len, eenc);
1208 /* ASCII compatible, and ASCII only string, no conversion in
1209 * default_internal */
1210 if ((eidx == rb_ascii8bit_encindex()) ||
1211 (eidx == rb_usascii_encindex()) ||
1212 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1213 return rb_enc_str_new(ptr, len, ienc);
1215 /* convert from the given encoding to default_internal */
1216 str = rb_enc_str_new(NULL, 0, ienc);
1217 /* when the conversion failed for some reason, just ignore the
1218 * default_internal and result in the given encoding as-is. */
1219 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1220 rb_str_initialize(str, ptr, len, eenc);
1222 return str;
1225 VALUE
1226 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1228 int eidx = rb_enc_to_index(eenc);
1229 if (eidx == rb_usascii_encindex() &&
1230 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1231 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1232 return str;
1234 rb_enc_associate_index(str, eidx);
1235 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1238 VALUE
1239 rb_external_str_new(const char *ptr, long len)
1241 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1244 VALUE
1245 rb_external_str_new_cstr(const char *ptr)
1247 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1250 VALUE
1251 rb_locale_str_new(const char *ptr, long len)
1253 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1256 VALUE
1257 rb_locale_str_new_cstr(const char *ptr)
1259 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1262 VALUE
1263 rb_filesystem_str_new(const char *ptr, long len)
1265 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1268 VALUE
1269 rb_filesystem_str_new_cstr(const char *ptr)
1271 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1274 VALUE
1275 rb_str_export(VALUE str)
1277 return rb_str_export_to_enc(str, rb_default_external_encoding());
1280 VALUE
1281 rb_str_export_locale(VALUE str)
1283 return rb_str_export_to_enc(str, rb_locale_encoding());
1286 VALUE
1287 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1289 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
/*
 * Makes str2 hold the same bytes as str and returns str2.  Encoding and
 * coderange are not touched here (the caller str_replace_shared() copies
 * them afterwards).
 *
 * Two paths:
 *  - If str's length plus its terminator fits in str2's embed capacity,
 *    the bytes are copied into str2's embedded buffer and terminated.
 *  - Otherwise str2 is pointed at the buffer of a frozen "root" string
 *    (str's existing shared root, or rb_str_new_frozen(str)) and marked
 *    as shared with that root.
 */
1292 static VALUE
1293 str_replace_shared_without_enc(VALUE str2, VALUE str)
1295 const int termlen = TERM_LEN(str);
1296 char *ptr;
1297 long len;
1299 RSTRING_GETMEM(str, ptr, len);
1300 if (str_embed_capa(str2) >= len + termlen) {
/* ptr2 is taken before STR_SET_EMBED; it addresses str2's embed area. */
1301 char *ptr2 = RSTRING(str2)->as.embed.ary;
1302 STR_SET_EMBED(str2);
1303 memcpy(ptr2, RSTRING_PTR(str), len);
1304 STR_SET_EMBED_LEN(str2, len);
1305 TERM_FILL(ptr2+len, termlen);
1307 else {
1308 VALUE root;
1309 if (STR_SHARED_P(str)) {
/* str already shares a root: reuse it, keeping str's own ptr/len. */
1310 root = RSTRING(str)->as.heap.aux.shared;
1311 RSTRING_GETMEM(str, ptr, len);
1313 else {
/* otherwise obtain a frozen string for str and use its ptr/len. */
1314 root = rb_str_new_frozen(str);
1315 RSTRING_GETMEM(root, ptr, len);
1317 assert(OBJ_FROZEN(root));
/* str2 owns a heap buffer (not embedded, not shared, not nofree):
 * release it unless it is the very buffer we are about to point at. */
1318 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1319 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1320 rb_fatal("about to free a possible shared root");
1322 char *ptr2 = STR_HEAP_PTR(str2);
1323 if (ptr2 != ptr) {
1324 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1327 FL_SET(str2, STR_NOEMBED);
1328 RSTRING(str2)->as.heap.len = len;
1329 RSTRING(str2)->as.heap.ptr = ptr;
1330 STR_SET_SHARED(str2, root);
1332 return str2;
1335 static VALUE
1336 str_replace_shared(VALUE str2, VALUE str)
1338 str_replace_shared_without_enc(str2, str);
1339 rb_enc_cr_str_exact_copy(str2, str);
1340 return str2;
1343 static VALUE
1344 str_new_shared(VALUE klass, VALUE str)
1346 return str_replace_shared(str_alloc_heap(klass), str);
1349 VALUE
1350 rb_str_new_shared(VALUE str)
1352 return str_new_shared(rb_obj_class(str), str);
1355 VALUE
1356 rb_str_new_frozen(VALUE orig)
1358 if (OBJ_FROZEN(orig)) return orig;
1359 return str_new_frozen(rb_obj_class(orig), orig);
1362 static VALUE
1363 rb_str_new_frozen_String(VALUE orig)
1365 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1366 return str_new_frozen(rb_cString, orig);
1369 VALUE
1370 rb_str_tmp_frozen_acquire(VALUE orig)
1372 if (OBJ_FROZEN_RAW(orig)) return orig;
1373 return str_new_frozen_buffer(0, orig, FALSE);
/*
 * Releases a temporary frozen string previously obtained for orig.
 *
 * Nothing is done when tmp has a class (RBASIC_CLASS(tmp) != 0), or when
 * tmp is embedded (only its frozen state is asserted).  Otherwise, if
 * orig is shared, not temp-locked and not frozen, and its shared root is
 * exactly tmp and tmp is not flagged STR_BORROWED, ownership of the
 * buffer is handed back to orig: orig loses STR_SHARED, takes tmp's capa
 * and STR_NOFREE bit, and tmp is turned into an empty embedded string.
 */
1376 void
1377 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1379 if (RBASIC_CLASS(tmp) != 0)
1380 return;
1382 if (STR_EMBED_P(tmp)) {
1383 assert(OBJ_FROZEN_RAW(tmp));
1385 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1386 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1387 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1389 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1390 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1391 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1393 /* Unshare orig since the root (tmp) only has this one child. */
1394 FL_UNSET_RAW(orig, STR_SHARED);
/* aux is a union: with STR_SHARED cleared it now carries the capacity. */
1395 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1396 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1397 assert(OBJ_FROZEN_RAW(tmp));
1399 /* Make tmp embedded and empty so it is safe for sweeping. */
1400 STR_SET_EMBED(tmp);
1401 STR_SET_EMBED_LEN(tmp, 0);
1406 static VALUE
1407 str_new_frozen(VALUE klass, VALUE orig)
1409 return str_new_frozen_buffer(klass, orig, TRUE);
1412 static VALUE
1413 heap_str_make_shared(VALUE klass, VALUE orig)
1415 assert(!STR_EMBED_P(orig));
1416 assert(!STR_SHARED_P(orig));
1418 VALUE str = str_alloc_heap(klass);
1419 STR_SET_NOEMBED(str);
1420 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1421 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1422 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1423 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1424 RBASIC(orig)->flags &= ~STR_NOFREE;
1425 STR_SET_SHARED(orig, str);
1426 if (klass == 0)
1427 FL_UNSET_RAW(str, STR_BORROWED);
1428 return str;
/*
 * Returns a frozen string of class klass with the same bytes as orig.
 * When copy_encoding is non-zero, orig's encoding/coderange are copied
 * onto the result before it is frozen.
 *
 * Strategy, in order:
 *  - orig is embedded, or its length is embeddable: a fresh copy is made
 *    with str_new().
 *  - orig is shared: if orig is a strict sub-range of its root, or the
 *    class or encoding differ from the root's, a new shared string is
 *    created over the root and narrowed to orig's range; otherwise the
 *    root itself is returned (flagged STR_BORROWED when it has no class).
 *  - orig fits the embed size including its terminator: embedded copy.
 *  - otherwise orig's heap buffer is moved into a new shared root via
 *    heap_str_make_shared().
 */
1431 static VALUE
1432 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1434 VALUE str;
1436 long len = RSTRING_LEN(orig);
1438 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, 1)) {
1439 str = str_new(klass, RSTRING_PTR(orig), len);
1440 assert(STR_EMBED_P(str));
1442 else {
1443 if (FL_TEST_RAW(orig, STR_SHARED)) {
1444 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs: bytes of the root before orig's start; rest: bytes after its end. */
1445 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1446 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1447 assert(ofs >= 0);
1448 assert(rest >= 0);
1449 assert(ofs + rest <= RSTRING_LEN(shared));
1450 #if !USE_RVARGC
1451 assert(!STR_EMBED_P(shared));
1452 #endif
1453 assert(OBJ_FROZEN(shared));
1455 if ((ofs > 0) || (rest > 0) ||
1456 (klass != RBASIC(shared)->klass) ||
1457 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1458 str = str_new_shared(klass, shared);
1459 assert(!STR_EMBED_P(str));
/* narrow the new view of the root down to orig's byte range */
1460 RSTRING(str)->as.heap.ptr += ofs;
1461 RSTRING(str)->as.heap.len -= ofs + rest;
1463 else {
1464 if (RBASIC_CLASS(shared) == 0)
1465 FL_SET_RAW(shared, STR_BORROWED);
1466 return shared;
1469 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1470 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1471 STR_SET_EMBED(str);
1472 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1473 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1474 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1476 else {
1477 str = heap_str_make_shared(klass, orig);
1481 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1482 OBJ_FREEZE(str);
1483 return str;
1486 VALUE
1487 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1489 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1492 static VALUE
1493 str_new_empty_String(VALUE str)
1495 VALUE v = rb_str_new(0, 0);
1496 rb_enc_copy(v, str);
1497 return v;
1500 #define STR_BUF_MIN_SIZE 63
1501 #if !USE_RVARGC
1502 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1503 #endif
1505 VALUE
1506 rb_str_buf_new(long capa)
1508 if (STR_EMBEDDABLE_P(capa, 1)) {
1509 return str_alloc_embed(rb_cString, capa + 1);
1512 VALUE str = str_alloc_heap(rb_cString);
1514 #if !USE_RVARGC
1515 if (capa < STR_BUF_MIN_SIZE) {
1516 capa = STR_BUF_MIN_SIZE;
1518 #endif
1519 FL_SET(str, STR_NOEMBED);
1520 RSTRING(str)->as.heap.aux.capa = capa;
1521 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1522 RSTRING(str)->as.heap.ptr[0] = '\0';
1524 return str;
1527 VALUE
1528 rb_str_buf_new_cstr(const char *ptr)
1530 VALUE str;
1531 long len = strlen(ptr);
1533 str = rb_str_buf_new(len);
1534 rb_str_buf_cat(str, ptr, len);
1536 return str;
1539 VALUE
1540 rb_str_tmp_new(long len)
1542 return str_new(0, 0, len);
1545 void
1546 rb_str_free(VALUE str)
1548 if (FL_TEST(str, RSTRING_FSTR)) {
1549 st_data_t fstr = (st_data_t)str;
1551 RB_VM_LOCK_ENTER();
1553 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1554 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1556 RB_VM_LOCK_LEAVE();
1559 if (STR_EMBED_P(str)) {
1560 RB_DEBUG_COUNTER_INC(obj_str_embed);
1562 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1563 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1564 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1566 else {
1567 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1568 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1572 RUBY_FUNC_EXPORTED size_t
1573 rb_str_memsize(VALUE str)
1575 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1576 return STR_HEAP_SIZE(str);
1578 else {
1579 return 0;
1583 VALUE
1584 rb_str_to_str(VALUE str)
1586 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1589 static inline void str_discard(VALUE str);
1590 static void str_shared_replace(VALUE str, VALUE str2);
1592 void
1593 rb_str_shared_replace(VALUE str, VALUE str2)
1595 if (str != str2) str_shared_replace(str, str2);
/*
 * Moves the contents of str2 into str and leaves str2 as an empty
 * embedded string.  str and str2 must be different objects.
 *
 * str's old buffer is dropped with str_discard() first.  If str2's bytes
 * plus terminator fit str's embed capacity they are copied into str's
 * embedded buffer (str2 is left untouched on that path).  Otherwise str
 * takes over str2's heap pointer/length and either its shared root or
 * its capacity; under USE_RVARGC an embedded str2 is first copied out to
 * a freshly allocated heap buffer so that there is a pointer to take
 * over.  In both cases str receives str2's encoding and coderange.
 */
1598 static void
1599 str_shared_replace(VALUE str, VALUE str2)
1601 rb_encoding *enc;
1602 int cr;
1603 int termlen;
1605 RUBY_ASSERT(str2 != str);
1606 enc = STR_ENC_GET(str2);
1607 cr = ENC_CODERANGE(str2);
1608 str_discard(str);
1609 termlen = rb_enc_mbminlen(enc);
1611 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1612 STR_SET_EMBED(str);
1613 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1614 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1615 rb_enc_associate(str, enc);
1616 ENC_CODERANGE_SET(str, cr);
1618 else {
1619 #if USE_RVARGC
1620 if (STR_EMBED_P(str2)) {
1621 assert(!FL_TEST(str2, STR_SHARED));
1622 long len = RSTRING(str2)->as.embed.len;
1623 assert(len + termlen <= str_embed_capa(str2));
/* copy the embedded bytes out before the heap fields (same union)
 * are written */
1625 char *new_ptr = ALLOC_N(char, len + termlen);
1626 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1627 RSTRING(str2)->as.heap.ptr = new_ptr;
1628 RSTRING(str2)->as.heap.len = len;
1629 RSTRING(str2)->as.heap.aux.capa = len;
1630 STR_SET_NOEMBED(str2);
1632 #endif
1634 STR_SET_NOEMBED(str);
1635 FL_UNSET(str, STR_SHARED);
1636 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1637 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1639 if (FL_TEST(str2, STR_SHARED)) {
1640 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1641 STR_SET_SHARED(str, shared);
1643 else {
1644 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1647 /* abandon str2 */
1648 STR_SET_EMBED(str2);
1649 RSTRING_PTR(str2)[0] = 0;
1650 STR_SET_EMBED_LEN(str2, 0);
1651 rb_enc_associate(str, enc);
1652 ENC_CODERANGE_SET(str, cr);
1656 VALUE
1657 rb_obj_as_string(VALUE obj)
1659 VALUE str;
1661 if (RB_TYPE_P(obj, T_STRING)) {
1662 return obj;
1664 str = rb_funcall(obj, idTo_s, 0);
1665 return rb_obj_as_string_result(str, obj);
1668 MJIT_FUNC_EXPORTED VALUE
1669 rb_obj_as_string_result(VALUE str, VALUE obj)
1671 if (!RB_TYPE_P(str, T_STRING))
1672 return rb_any_to_s(obj);
1673 return str;
1676 static VALUE
1677 str_replace(VALUE str, VALUE str2)
1679 long len;
1681 len = RSTRING_LEN(str2);
1682 if (STR_SHARED_P(str2)) {
1683 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1684 assert(OBJ_FROZEN(shared));
1685 STR_SET_NOEMBED(str);
1686 RSTRING(str)->as.heap.len = len;
1687 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1688 STR_SET_SHARED(str, shared);
1689 rb_enc_cr_str_exact_copy(str, str2);
1691 else {
1692 str_replace_shared(str, str2);
1695 return str;
1698 static inline VALUE
1699 ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1701 assert(size > 0);
1702 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1703 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
1704 return (VALUE)str;
1707 static inline VALUE
1708 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1710 size_t size = str_embed_size(capa);
1711 assert(rb_gc_size_allocatable_p(size));
1712 #if !USE_RVARGC
1713 assert(size <= sizeof(struct RString));
1714 #endif
1715 return ec_str_alloc(ec, klass, size);
1718 static inline VALUE
1719 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1721 return ec_str_alloc(ec, klass, sizeof(struct RString));
/*
 * Fills the freshly allocated string dup so that it duplicates str, and
 * returns dup.
 *
 * - Embedded str: the len + 1 embedded bytes are copied into dup.
 * - Heap str: dup does not copy bytes; it points at str's buffer and
 *   records a frozen root in aux.shared.  The root is str's existing
 *   shared root, or - when str is neither shared nor frozen - a frozen
 *   string obtained from str_new_frozen(), which then also replaces str
 *   for the rest of the function.
 *
 * The coderange, encoding and (without USE_RVARGC) embed-related flag
 * bits are carried over from str.  FL_FREEZE is masked out when the
 * flags are applied, so dup is never frozen by this function.
 */
1724 static inline VALUE
1725 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1727 const VALUE flag_mask =
1728 #if !USE_RVARGC
1729 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1730 #endif
1731 ENC_CODERANGE_MASK | ENCODING_MASK |
1732 FL_FREEZE
1734 VALUE flags = FL_TEST_RAW(str, flag_mask);
1735 int encidx = 0;
1736 if (STR_EMBED_P(str)) {
1737 long len = RSTRING_EMBED_LEN(str);
1739 assert(str_embed_capa(dup) >= len + 1);
1740 STR_SET_EMBED_LEN(dup, len);
1741 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1743 else {
1744 VALUE root = str;
1745 if (FL_TEST_RAW(str, STR_SHARED)) {
1746 root = RSTRING(str)->as.heap.aux.shared;
1748 else if (UNLIKELY(!(flags & FL_FREEZE))) {
/* str is mutable and unshared: get a frozen string to act as root and
 * re-read the flags from it */
1749 root = str = str_new_frozen(klass, str);
1750 flags = FL_TEST_RAW(str, flag_mask);
1752 assert(!STR_SHARED_P(root));
1753 assert(RB_OBJ_FROZEN_RAW(root));
1754 #if USE_RVARGC
1755 if (1) {
1756 #else
1757 if (STR_EMBED_P(root)) {
1758 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1759 char, RSTRING_EMBED_LEN_MAX + 1);
1761 else {
1762 #endif
1763 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1764 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1765 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1766 flags |= RSTRING_NOEMBED | STR_SHARED;
/* The encoding bits equal ENCODING_INLINE_MAX: fetch the real index with
 * rb_enc_get_index() and set it separately after the flags are applied. */
1770 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1771 encidx = rb_enc_get_index(str);
1772 flags &= ~ENCODING_MASK;
1774 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1775 if (encidx) rb_enc_associate_index(dup, encidx);
1776 return dup;
1779 static inline VALUE
1780 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1782 VALUE dup;
1783 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1784 dup = ec_str_alloc_heap(ec, klass);
1786 else {
1787 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1790 return str_duplicate_setup(klass, str, dup);
1793 static inline VALUE
1794 str_duplicate(VALUE klass, VALUE str)
1796 VALUE dup;
1797 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1798 dup = str_alloc_heap(klass);
1800 else {
1801 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1804 return str_duplicate_setup(klass, str, dup);
1807 VALUE
1808 rb_str_dup(VALUE str)
1810 return str_duplicate(rb_obj_class(str), str);
1813 VALUE
1814 rb_str_resurrect(VALUE str)
1816 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1817 return str_duplicate(rb_cString, str);
1820 VALUE
1821 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1823 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1824 return ec_str_duplicate(ec, rb_cString, str);
1828 * call-seq:
1829 * String.new(string = '') -> new_string
1830 * String.new(string = '', encoding: encoding) -> new_string
1831 * String.new(string = '', capacity: size) -> new_string
1833 * Returns a new \String that is a copy of +string+.
1835 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1836 * s = String.new
1837 * s # => ""
1838 * s.encoding # => #<Encoding:ASCII-8BIT>
1840 * With the single \String argument +string+, returns a copy of +string+
1841 * with the same encoding as +string+:
1842 * s = String.new("Que veut dire \u{e7}a?")
1843 * s # => "Que veut dire \u{e7}a?"
1844 * s.encoding # => #<Encoding:UTF-8>
1846 * Literal strings like <tt>""</tt> or here-documents always use
1847 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1849 * With keyword +encoding+, returns a copy of +string+
1850 * with the specified encoding:
1851 * s = String.new(encoding: 'ASCII')
1852 * s.encoding # => #<Encoding:US-ASCII>
1853 * s = String.new('foo', encoding: 'ASCII')
1854 * s.encoding # => #<Encoding:US-ASCII>
1856 * Note that these are equivalent:
1857 * s0 = String.new('foo', encoding: 'ASCII')
1858 * s1 = 'foo'.force_encoding('ASCII')
1859 * s0.encoding == s1.encoding # => true
1861 * With keyword +capacity+, returns a copy of +string+;
1862 * the given +capacity+ may set the size of the internal buffer,
1863 * which may affect performance:
1864 * String.new(capacity: 1) # => ""
1865 * String.new(capacity: 4096) # => ""
1867 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1869 * String.new('hello', encoding: 'UTF-8', capacity: 25)
/*
 * C implementation behind the String.new call-seq documented above.
 *
 * argc/argv carry at most one positional argument (orig) plus an
 * optional keyword hash; the recognised keywords are encoding: and
 * capacity:.  str is the receiver being initialized and is returned.
 *
 * - With capacity: str is forced into heap (non-embedded) layout with a
 *   buffer of capa + termlen bytes, where capa is raised to at least
 *   STR_BUF_MIN_SIZE and at least the length of orig; orig's bytes and
 *   encoding/coderange are then copied in when orig was given and is not
 *   str itself.
 * - Without capacity but with orig: rb_str_replace(str, orig).
 * - With encoding: the encoding is associated last and the coderange
 *   cleared.
 */
1873 static VALUE
1874 rb_str_init(int argc, VALUE *argv, VALUE str)
1876 static ID keyword_ids[2];
1877 VALUE orig, opt, venc, vcapa;
1878 VALUE kwargs[2];
1879 rb_encoding *enc = 0;
1880 int n;
/* keyword IDs are looked up once and cached in the static array */
1882 if (!keyword_ids[0]) {
1883 keyword_ids[0] = rb_id_encoding();
1884 CONST_ID(keyword_ids[1], "capacity");
1887 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1888 if (!NIL_P(opt)) {
1889 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1890 venc = kwargs[0];
1891 vcapa = kwargs[1];
1892 if (venc != Qundef && !NIL_P(venc)) {
1893 enc = rb_to_encoding(venc);
1895 if (vcapa != Qundef && !NIL_P(vcapa)) {
1896 long capa = NUM2LONG(vcapa);
1897 long len = 0;
1898 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1900 if (capa < STR_BUF_MIN_SIZE) {
1901 capa = STR_BUF_MIN_SIZE;
1903 if (n == 1) {
1904 StringValue(orig);
1905 len = RSTRING_LEN(orig);
1906 if (capa < len) {
1907 capa = len;
/* initializing from itself: keep len but skip the copy below */
1909 if (orig == str) n = 0;
1911 str_modifiable(str);
1912 if (STR_EMBED_P(str)) { /* make noembed always */
1913 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
/* NOTE(review): the copy size below comes from str's embedded length,
 * while the buffer size comes from capa - confirm that the embedded
 * length can never exceed capa + termlen here. */
1914 #if USE_RVARGC
1915 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1916 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1917 #else
1918 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1919 #endif
1920 RSTRING(str)->as.heap.ptr = new_ptr;
1922 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* str does not own its buffer: copy into a private one and drop the
 * shared/nofree flags without freeing the old pointer */
1923 const size_t size = (size_t)capa + termlen;
1924 const char *const old_ptr = RSTRING_PTR(str);
1925 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1926 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1927 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1928 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1929 RSTRING(str)->as.heap.ptr = new_ptr;
1931 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1932 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1933 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1935 RSTRING(str)->as.heap.len = len;
1936 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1937 if (n == 1) {
1938 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1939 rb_enc_cr_str_exact_copy(str, orig);
1941 FL_SET(str, STR_NOEMBED);
1942 RSTRING(str)->as.heap.aux.capa = capa;
1944 else if (n == 1) {
1945 rb_str_replace(str, orig);
1947 if (enc) {
1948 rb_enc_associate(str, enc);
1949 ENC_CODERANGE_CLEAR(str);
1952 else if (n == 1) {
1953 rb_str_replace(str, orig);
1955 return str;
/*
 * Word-at-a-time UTF-8 character counting helpers, compiled only when
 * NONASCII_MASK is defined.
 *
 * is_utf8_lead_byte(c) is true for every byte that is not of the form
 * 10xxxxxx, i.e. not a UTF-8 continuation byte.
 */
1958 #ifdef NONASCII_MASK
1959 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1962 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1963 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1964 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1966 * if (!(byte & 0x80))
1967 * byte |= 0x40; // turn on bit6
1968 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1970 * This function calculates whether a byte is leading or not for all bytes
1971 * in the argument word by concurrently using the above logic, and then
1972 * adds up the number of leading bytes in the word.
/*
 * Returns the number of non-continuation (leading) bytes contained in
 * the uintptr_t-sized word at s.  Uses rb_popcount_intptr() when the
 * POPCNT-based builtin is available, otherwise sums the per-byte flag
 * bits by repeated shift-and-add.
 */
1974 static inline uintptr_t
1975 count_utf8_lead_bytes_with_word(const uintptr_t *s)
1977 uintptr_t d = *s;
1979 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1980 d = (d>>6) | (~d>>7);
1981 d &= NONASCII_MASK >> 7;
1983 /* Gather all bytes. */
1984 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1985 /* use only if it can use POPCNT */
1986 return rb_popcount_intptr(d);
1987 #else
/* fold the one-bit-per-byte flags down into the lowest byte */
1988 d += (d>>8);
1989 d += (d>>16);
1990 # if SIZEOF_VOIDP == 8
1991 d += (d>>32);
1992 # endif
1993 return (d&0xF);
1994 #endif
1996 #endif
/*
 * Counts the characters in the byte range [p, e) interpreted in enc.
 * cr is the caller's coderange knowledge and only selects a faster path.
 *
 *  - Fixed-width encodings (mbmaxlen == mbminlen): byte length divided
 *    by the width, rounded up.
 *  - Valid UTF-8 (only when NONASCII_MASK is defined): counts the
 *    non-continuation bytes, a word at a time in the aligned middle part.
 *  - Other ASCII-compatible encodings: ASCII runs are skipped with
 *    search_nonascii(); every other character is stepped over with
 *    rb_enc_fast_mbclen() when the coderange is clean, rb_enc_mbclen()
 *    otherwise.
 *  - Anything else: one rb_enc_mbclen() step per character.
 */
1998 static inline long
1999 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2001 long c;
2002 const char *q;
2004 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2005 long diff = (long)(e - p);
2006 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2008 #ifdef NONASCII_MASK
2009 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2010 uintptr_t len = 0;
/* the word-wise loop is only used for inputs longer than two words */
2011 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2012 const uintptr_t *s, *t;
2013 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one */
2014 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2015 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2016 while (p < (const char *)s) {
2017 if (is_utf8_lead_byte(*p)) len++;
2018 p++;
2020 while (s < t) {
2021 len += count_utf8_lead_bytes_with_word(s);
2022 s++;
2024 p = (const char *)s;
/* remaining bytes (or the whole input when it was short) */
2026 while (p < e) {
2027 if (is_utf8_lead_byte(*p)) len++;
2028 p++;
2030 return (long)len;
2032 #endif
2033 else if (rb_enc_asciicompat(enc)) {
2034 c = 0;
2035 if (ENC_CODERANGE_CLEAN_P(cr)) {
2036 while (p < e) {
2037 if (ISASCII(*p)) {
2038 q = search_nonascii(p, e);
2039 if (!q)
2040 return c + (e - p);
2041 c += q - p;
2042 p = q;
2044 p += rb_enc_fast_mbclen(p, e, enc);
2045 c++;
2048 else {
2049 while (p < e) {
2050 if (ISASCII(*p)) {
2051 q = search_nonascii(p, e);
2052 if (!q)
2053 return c + (e - p);
2054 c += q - p;
2055 p = q;
2057 p += rb_enc_mbclen(p, e, enc);
2058 c++;
2061 return c;
2064 for (c=0; p<e; c++) {
2065 p += rb_enc_mbclen(p, e, enc);
2067 return c;
2070 long
2071 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2073 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2076 /* To get strlen with cr
2077 * Note that given cr is not used.
2079 long
2080 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2082 long c;
2083 const char *q;
2084 int ret;
2086 *cr = 0;
2087 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2088 long diff = (long)(e - p);
2089 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2091 else if (rb_enc_asciicompat(enc)) {
2092 c = 0;
2093 while (p < e) {
2094 if (ISASCII(*p)) {
2095 q = search_nonascii(p, e);
2096 if (!q) {
2097 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2098 return c + (e - p);
2100 c += q - p;
2101 p = q;
2103 ret = rb_enc_precise_mbclen(p, e, enc);
2104 if (MBCLEN_CHARFOUND_P(ret)) {
2105 *cr |= ENC_CODERANGE_VALID;
2106 p += MBCLEN_CHARFOUND_LEN(ret);
2108 else {
2109 *cr = ENC_CODERANGE_BROKEN;
2110 p++;
2112 c++;
2114 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2115 return c;
2118 for (c=0; p<e; c++) {
2119 ret = rb_enc_precise_mbclen(p, e, enc);
2120 if (MBCLEN_CHARFOUND_P(ret)) {
2121 *cr |= ENC_CODERANGE_VALID;
2122 p += MBCLEN_CHARFOUND_LEN(ret);
2124 else {
2125 *cr = ENC_CODERANGE_BROKEN;
2126 if (p + rb_enc_mbminlen(enc) <= e)
2127 p += rb_enc_mbminlen(enc);
2128 else
2129 p = e;
2132 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2133 return c;
2136 /* enc must be str's enc or rb_enc_check(str, str2) */
2137 static long
2138 str_strlen(VALUE str, rb_encoding *enc)
2140 const char *p, *e;
2141 int cr;
2143 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2144 if (!enc) enc = STR_ENC_GET(str);
2145 p = RSTRING_PTR(str);
2146 e = RSTRING_END(str);
2147 cr = ENC_CODERANGE(str);
2149 if (cr == ENC_CODERANGE_UNKNOWN) {
2150 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2151 if (cr) ENC_CODERANGE_SET(str, cr);
2152 return n;
2154 else {
2155 return enc_strlen(p, e, enc, cr);
2159 long
2160 rb_str_strlen(VALUE str)
2162 return str_strlen(str, NULL);
2166 * call-seq:
2167 * length -> integer
2169 * Returns the count of characters (not bytes) in +self+:
2171 * "\x80\u3042".length # => 2
2172 * "hello".length # => 5
2174 * String#size is an alias for String#length.
2176 * Related: String#bytesize.
2179 VALUE
2180 rb_str_length(VALUE str)
2182 return LONG2NUM(str_strlen(str, NULL));
2186 * call-seq:
2187 * bytesize -> integer
2189 * Returns the count of bytes in +self+:
2191 * "\x80\u3042".bytesize # => 4
2192 * "hello".bytesize # => 5
2194 * Related: String#length.
2197 static VALUE
2198 rb_str_bytesize(VALUE str)
2200 return LONG2NUM(RSTRING_LEN(str));
2204 * call-seq:
2205 * empty? -> true or false
2207 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2209 * "hello".empty? # => false
2210 * " ".empty? # => false
2211 * "".empty? # => true
2215 static VALUE
2216 rb_str_empty(VALUE str)
2218 return RBOOL(RSTRING_LEN(str) == 0);
2222 * call-seq:
2223 * string + other_string -> new_string
2225 * Returns a new \String containing +other_string+ concatenated to +self+:
2227 * "Hello from " + self.to_s # => "Hello from main"
2231 VALUE
2232 rb_str_plus(VALUE str1, VALUE str2)
2234 VALUE str3;
2235 rb_encoding *enc;
2236 char *ptr1, *ptr2, *ptr3;
2237 long len1, len2;
2238 int termlen;
2240 StringValue(str2);
2241 enc = rb_enc_check_str(str1, str2);
2242 RSTRING_GETMEM(str1, ptr1, len1);
2243 RSTRING_GETMEM(str2, ptr2, len2);
2244 termlen = rb_enc_mbminlen(enc);
2245 if (len1 > LONG_MAX - len2) {
2246 rb_raise(rb_eArgError, "string size too big");
2248 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2249 ptr3 = RSTRING_PTR(str3);
2250 memcpy(ptr3, ptr1, len1);
2251 memcpy(ptr3+len1, ptr2, len2);
2252 TERM_FILL(&ptr3[len1+len2], termlen);
2254 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2255 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2256 RB_GC_GUARD(str1);
2257 RB_GC_GUARD(str2);
2258 return str3;
2261 /* A variant of rb_str_plus that does not raise but returns Qundef instead. */
2262 MJIT_FUNC_EXPORTED VALUE
2263 rb_str_opt_plus(VALUE str1, VALUE str2)
2265 assert(RBASIC_CLASS(str1) == rb_cString);
2266 assert(RBASIC_CLASS(str2) == rb_cString);
2267 long len1, len2;
2268 MAYBE_UNUSED(char) *ptr1, *ptr2;
2269 RSTRING_GETMEM(str1, ptr1, len1);
2270 RSTRING_GETMEM(str2, ptr2, len2);
2271 int enc1 = rb_enc_get_index(str1);
2272 int enc2 = rb_enc_get_index(str2);
2274 if (enc1 < 0) {
2275 return Qundef;
2277 else if (enc2 < 0) {
2278 return Qundef;
2280 else if (enc1 != enc2) {
2281 return Qundef;
2283 else if (len1 > LONG_MAX - len2) {
2284 return Qundef;
2286 else {
2287 return rb_str_plus(str1, str2);
2293 * call-seq:
2294 * string * integer -> new_string
2296 * Returns a new \String containing +integer+ copies of +self+:
2298 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2299 * "Ho! " * 0 # => ""
2303 VALUE
2304 rb_str_times(VALUE str, VALUE times)
2306 VALUE str2;
2307 long n, len;
2308 char *ptr2;
2309 int termlen;
2311 if (times == INT2FIX(1)) {
2312 return str_duplicate(rb_cString, str);
2314 if (times == INT2FIX(0)) {
2315 str2 = str_alloc_embed(rb_cString, 0);
2316 rb_enc_copy(str2, str);
2317 return str2;
2319 len = NUM2LONG(times);
2320 if (len < 0) {
2321 rb_raise(rb_eArgError, "negative argument");
2323 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2324 if (STR_EMBEDDABLE_P(len, 1)) {
2325 str2 = str_alloc_embed(rb_cString, len + 1);
2326 memset(RSTRING_PTR(str2), 0, len + 1);
2328 else {
2329 str2 = str_alloc_heap(rb_cString);
2330 RSTRING(str2)->as.heap.aux.capa = len;
2331 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2332 STR_SET_NOEMBED(str2);
2334 STR_SET_LEN(str2, len);
2335 rb_enc_copy(str2, str);
2336 return str2;
2338 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2339 rb_raise(rb_eArgError, "argument too big");
2342 len *= RSTRING_LEN(str);
2343 termlen = TERM_LEN(str);
2344 str2 = str_new0(rb_cString, 0, len, termlen);
2345 ptr2 = RSTRING_PTR(str2);
2346 if (len) {
2347 n = RSTRING_LEN(str);
2348 memcpy(ptr2, RSTRING_PTR(str), n);
2349 while (n <= len/2) {
2350 memcpy(ptr2 + n, ptr2, n);
2351 n *= 2;
2353 memcpy(ptr2 + n, ptr2, len-n);
2355 STR_SET_LEN(str2, len);
2356 TERM_FILL(&ptr2[len], termlen);
2357 rb_enc_cr_str_copy_for_substr(str2, str);
2359 return str2;
2363 * call-seq:
2364 * string % object -> new_string
2366 * Returns the result of formatting +object+ into the format specification +self+
2367 * (see Kernel#sprintf for formatting details):
2369 * "%05d" % 123 # => "00123"
2371 * If +self+ contains multiple substitutions, +object+ must be
2372 * an \Array or \Hash containing the values to be substituted:
2374 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2375 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2376 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2380 static VALUE
2381 rb_str_format_m(VALUE str, VALUE arg)
2383 VALUE tmp = rb_check_array_type(arg);
2385 if (!NIL_P(tmp)) {
2386 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2388 return rb_str_format(1, &arg, str);
2391 static inline void
2392 rb_check_lockedtmp(VALUE str)
2394 if (FL_TEST(str, STR_TMPLOCK)) {
2395 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2399 static inline void
2400 str_modifiable(VALUE str)
2402 rb_check_lockedtmp(str);
2403 rb_check_frozen(str);
2406 static inline int
2407 str_dependent_p(VALUE str)
2409 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2410 return 0;
2412 else {
2413 return 1;
2417 static inline int
2418 str_independent(VALUE str)
2420 str_modifiable(str);
2421 return !str_dependent_p(str);
2424 static void
2425 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2427 char *ptr;
2428 char *oldptr;
2429 long capa = len + expand;
2431 if (len > capa) len = capa;
2433 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2434 ptr = RSTRING(str)->as.heap.ptr;
2435 STR_SET_EMBED(str);
2436 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2437 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2438 STR_SET_EMBED_LEN(str, len);
2439 return;
2442 ptr = ALLOC_N(char, (size_t)capa + termlen);
2443 oldptr = RSTRING_PTR(str);
2444 if (oldptr) {
2445 memcpy(ptr, oldptr, len);
2447 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2448 xfree(oldptr);
2450 STR_SET_NOEMBED(str);
2451 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2452 TERM_FILL(ptr + len, termlen);
2453 RSTRING(str)->as.heap.ptr = ptr;
2454 RSTRING(str)->as.heap.len = len;
2455 RSTRING(str)->as.heap.aux.capa = capa;
2458 void
2459 rb_str_modify(VALUE str)
2461 if (!str_independent(str))
2462 str_make_independent(str);
2463 ENC_CODERANGE_CLEAR(str);
2466 void
2467 rb_str_modify_expand(VALUE str, long expand)
2469 int termlen = TERM_LEN(str);
2470 long len = RSTRING_LEN(str);
2472 if (expand < 0) {
2473 rb_raise(rb_eArgError, "negative expanding string size");
2475 if (expand >= LONG_MAX - len) {
2476 rb_raise(rb_eArgError, "string size too big");
2479 if (!str_independent(str)) {
2480 str_make_independent_expand(str, len, expand, termlen);
2482 else if (expand > 0) {
2483 RESIZE_CAPA_TERM(str, len + expand, termlen);
2485 ENC_CODERANGE_CLEAR(str);
2488 /* As rb_str_modify(), but don't clear coderange */
2489 static void
2490 str_modify_keep_cr(VALUE str)
2492 if (!str_independent(str))
2493 str_make_independent(str);
2494 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2495 /* Force re-scan later */
2496 ENC_CODERANGE_CLEAR(str);
2499 static inline void
2500 str_discard(VALUE str)
2502 str_modifiable(str);
2503 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2504 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2505 RSTRING(str)->as.heap.ptr = 0;
2506 RSTRING(str)->as.heap.len = 0;
2510 void
2511 rb_must_asciicompat(VALUE str)
2513 rb_encoding *enc = rb_enc_get(str);
2514 if (!rb_enc_asciicompat(enc)) {
2515 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2519 VALUE
2520 rb_string_value(volatile VALUE *ptr)
2522 VALUE s = *ptr;
2523 if (!RB_TYPE_P(s, T_STRING)) {
2524 s = rb_str_to_str(s);
2525 *ptr = s;
2527 return s;
2530 char *
2531 rb_string_value_ptr(volatile VALUE *ptr)
2533 VALUE str = rb_string_value(ptr);
2534 return RSTRING_PTR(str);
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero (trivially true
 * when +n+ is not positive), 0 otherwise.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) {
            return 0;
        }
    }
    return 1;
}
2546 static const char *
2547 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2549 const char *e = s + len;
2551 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2552 if (zero_filled(s, minlen)) return s;
2554 return 0;
2557 static char *
2558 str_fill_term(VALUE str, char *s, long len, int termlen)
2560 /* This function assumes that (capa + termlen) bytes of memory
2561 * is allocated, like many other functions in this file.
2563 if (str_dependent_p(str)) {
2564 if (!zero_filled(s + len, termlen))
2565 str_make_independent_expand(str, len, 0L, termlen);
2567 else {
2568 TERM_FILL(s + len, termlen);
2569 return s;
2571 return RSTRING_PTR(str);
2574 void
2575 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2577 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2578 long len = RSTRING_LEN(str);
2580 assert(capa >= len);
2581 if (capa - len < termlen) {
2582 rb_check_lockedtmp(str);
2583 str_make_independent_expand(str, len, 0L, termlen);
2585 else if (str_dependent_p(str)) {
2586 if (termlen > oldtermlen)
2587 str_make_independent_expand(str, len, 0L, termlen);
2589 else {
2590 if (!STR_EMBED_P(str)) {
2591 /* modify capa instead of realloc */
2592 assert(!FL_TEST((str), STR_SHARED));
2593 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2595 if (termlen > oldtermlen) {
2596 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2600 return;
2603 static char *
2604 str_null_check(VALUE str, int *w)
2606 char *s = RSTRING_PTR(str);
2607 long len = RSTRING_LEN(str);
2608 rb_encoding *enc = rb_enc_get(str);
2609 const int minlen = rb_enc_mbminlen(enc);
2611 if (minlen > 1) {
2612 *w = 1;
2613 if (str_null_char(s, len, minlen, enc)) {
2614 return NULL;
2616 return str_fill_term(str, s, len, minlen);
2618 *w = 0;
2619 if (!s || memchr(s, 0, len)) {
2620 return NULL;
2622 if (s[len]) {
2623 s = str_fill_term(str, s, len, minlen);
2625 return s;
2628 char *
2629 rb_str_to_cstr(VALUE str)
2631 int w;
2632 return str_null_check(str, &w);
2635 char *
2636 rb_string_value_cstr(volatile VALUE *ptr)
2638 VALUE str = rb_string_value(ptr);
2639 int w;
2640 char *s = str_null_check(str, &w);
2641 if (!s) {
2642 if (w) {
2643 rb_raise(rb_eArgError, "string contains null char");
2645 rb_raise(rb_eArgError, "string contains null byte");
2647 return s;
2650 char *
2651 rb_str_fill_terminator(VALUE str, const int newminlen)
2653 char *s = RSTRING_PTR(str);
2654 long len = RSTRING_LEN(str);
2655 return str_fill_term(str, s, len, newminlen);
2658 VALUE
2659 rb_check_string_type(VALUE str)
2661 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2662 return str;
2666 * call-seq:
2667 * String.try_convert(object) -> object, new_string, or nil
2669 * If +object+ is a \String object, returns +object+.
2671 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2672 * calls <tt>object.to_str</tt> and returns the result.
2674 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2676 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2678 static VALUE
2679 rb_str_s_try_convert(VALUE dummy, VALUE str)
2681 return rb_check_string_type(str);
2684 static char*
2685 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2687 long nth = *nthp;
2688 if (rb_enc_mbmaxlen(enc) == 1) {
2689 p += nth;
2691 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2692 p += nth * rb_enc_mbmaxlen(enc);
2694 else if (rb_enc_asciicompat(enc)) {
2695 const char *p2, *e2;
2696 int n;
2698 while (p < e && 0 < nth) {
2699 e2 = p + nth;
2700 if (e < e2) {
2701 *nthp = nth;
2702 return (char *)e;
2704 if (ISASCII(*p)) {
2705 p2 = search_nonascii(p, e2);
2706 if (!p2) {
2707 nth -= e2 - p;
2708 *nthp = nth;
2709 return (char *)e2;
2711 nth -= p2 - p;
2712 p = p2;
2714 n = rb_enc_mbclen(p, e, enc);
2715 p += n;
2716 nth--;
2718 *nthp = nth;
2719 if (nth != 0) {
2720 return (char *)e;
2722 return (char *)p;
2724 else {
2725 while (p < e && nth--) {
2726 p += rb_enc_mbclen(p, e, enc);
2729 if (p > e) p = e;
2730 *nthp = nth;
2731 return (char*)p;
2734 char*
2735 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2737 return str_nth_len(p, e, &nth, enc);
2740 static char*
2741 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2743 if (singlebyte)
2744 p += nth;
2745 else {
2746 p = str_nth_len(p, e, &nth, enc);
2748 if (!p) return 0;
2749 if (p > e) p = e;
2750 return (char *)p;
2753 /* char offset to byte offset */
2754 static long
2755 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2757 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2758 if (!pp) return e - p;
2759 return pp - p;
2762 long
2763 rb_str_offset(VALUE str, long pos)
2765 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2766 STR_ENC_GET(str), single_byte_optimizable(str));
#ifdef NONASCII_MASK
/*
 * Advances +p+ over +*nthp+ UTF-8 characters by counting lead bytes
 * (is_utf8_lead_byte()), stopping at +e+.  Returns the position reached
 * and stores the number of characters still outstanding in +*nthp+.
 *
 * When both the byte range and the count exceed two pointer widths, the
 * pointer-aligned middle part is counted a word at a time with
 * count_utf8_lead_bytes_with_word().
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;

    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up to a word boundary; t: e rounded down. */
        const uintptr_t *s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        const uintptr_t *t = (const uintptr_t*)(~lowbits & (uintptr_t)e);

        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }

    for (; p < e; p++) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
    }
    *nthp = nth;
    return (char *)p;
}

/*
 * Returns the byte distance from +p+ to the position str_utf8_nth()
 * reports for +nth+ characters.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *target = str_utf8_nth(p, e, &nth);

    return target - p;
}
#endif
2808 /* byte offset to char offset */
2809 long
2810 rb_str_sublen(VALUE str, long pos)
2812 if (single_byte_optimizable(str) || pos < 0)
2813 return pos;
2814 else {
2815 char *p = RSTRING_PTR(str);
2816 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2820 VALUE
2821 rb_str_subseq(VALUE str, long beg, long len)
2823 VALUE str2;
2825 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2826 SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2827 long olen;
2828 str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2829 RSTRING(str2)->as.heap.ptr += beg;
2830 olen = RSTRING(str2)->as.heap.len;
2831 if (olen > len) RSTRING(str2)->as.heap.len = len;
2833 else {
2834 str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2835 RB_GC_GUARD(str);
2838 rb_enc_cr_str_copy_for_substr(str2, str);
2840 return str2;
2843 char *
2844 rb_str_subpos(VALUE str, long beg, long *lenp)
2846 long len = *lenp;
2847 long slen = -1L;
2848 long blen = RSTRING_LEN(str);
2849 rb_encoding *enc = STR_ENC_GET(str);
2850 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2852 if (len < 0) return 0;
2853 if (!blen) {
2854 len = 0;
2856 if (single_byte_optimizable(str)) {
2857 if (beg > blen) return 0;
2858 if (beg < 0) {
2859 beg += blen;
2860 if (beg < 0) return 0;
2862 if (len > blen - beg)
2863 len = blen - beg;
2864 if (len < 0) return 0;
2865 p = s + beg;
2866 goto end;
2868 if (beg < 0) {
2869 if (len > -beg) len = -beg;
2870 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2871 beg = -beg;
2872 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2873 p = e;
2874 if (!p) return 0;
2875 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2876 if (!p) return 0;
2877 len = e - p;
2878 goto end;
2880 else {
2881 slen = str_strlen(str, enc);
2882 beg += slen;
2883 if (beg < 0) return 0;
2884 p = s + beg;
2885 if (len == 0) goto end;
2888 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2889 return 0;
2891 if (len == 0) {
2892 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2893 p = s + beg;
2895 #ifdef NONASCII_MASK
2896 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2897 enc == rb_utf8_encoding()) {
2898 p = str_utf8_nth(s, e, &beg);
2899 if (beg > 0) return 0;
2900 len = str_utf8_offset(p, e, len);
2902 #endif
2903 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2904 int char_sz = rb_enc_mbmaxlen(enc);
2906 p = s + beg * char_sz;
2907 if (p > e) {
2908 return 0;
2910 else if (len * char_sz > e - p)
2911 len = e - p;
2912 else
2913 len *= char_sz;
2915 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2916 if (beg > 0) return 0;
2917 len = 0;
2919 else {
2920 len = str_offset(p, e, len, enc, 0);
2922 end:
2923 *lenp = len;
2924 RB_GC_GUARD(str);
2925 return p;
2928 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2930 VALUE
2931 rb_str_substr(VALUE str, long beg, long len)
2933 return str_substr(str, beg, len, TRUE);
2936 static VALUE
2937 str_substr(VALUE str, long beg, long len, int empty)
2939 VALUE str2;
2940 char *p = rb_str_subpos(str, beg, &len);
2942 if (!p) return Qnil;
2943 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2944 SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2945 long ofs = p - RSTRING_PTR(str);
2946 str2 = rb_str_new_frozen(str);
2947 str2 = str_new_shared(rb_cString, str2);
2948 RSTRING(str2)->as.heap.ptr += ofs;
2949 RSTRING(str2)->as.heap.len = len;
2950 ENC_CODERANGE_CLEAR(str2);
2952 else {
2953 if (!len && !empty) return Qnil;
2954 str2 = rb_str_new(p, len);
2955 RB_GC_GUARD(str);
2957 rb_enc_cr_str_copy_for_substr(str2, str);
2959 return str2;
2962 VALUE
2963 rb_str_freeze(VALUE str)
2965 if (OBJ_FROZEN(str)) return str;
2966 rb_str_resize(str, RSTRING_LEN(str));
2967 return rb_obj_freeze(str);
2972 * call-seq:
2973 * +string -> new_string or self
2975 * Returns +self+ if +self+ is not frozen.
2977 * Otherwise. returns <tt>self.dup</tt>, which is not frozen.
2979 static VALUE
2980 str_uplus(VALUE str)
2982 if (OBJ_FROZEN(str)) {
2983 return rb_str_dup(str);
2985 else {
2986 return str;
2991 * call-seq:
2992 * -string -> frozen_string
2994 * Returns a frozen, possibly pre-existing copy of the string.
2996 * The returned \String will be deduplicated as long as it does not have
2997 * any instance variables set on it.
2999 static VALUE
3000 str_uminus(VALUE str)
3002 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3003 str = rb_str_dup(str);
3005 return rb_fstring(str);
3008 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3009 #define rb_str_dup_frozen rb_str_new_frozen
3011 VALUE
3012 rb_str_locktmp(VALUE str)
3014 if (FL_TEST(str, STR_TMPLOCK)) {
3015 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3017 FL_SET(str, STR_TMPLOCK);
3018 return str;
3021 VALUE
3022 rb_str_unlocktmp(VALUE str)
3024 if (!FL_TEST(str, STR_TMPLOCK)) {
3025 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3027 FL_UNSET(str, STR_TMPLOCK);
3028 return str;
3031 RUBY_FUNC_EXPORTED VALUE
3032 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3034 rb_str_locktmp(str);
3035 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3038 void
3039 rb_str_set_len(VALUE str, long len)
3041 long capa;
3042 const int termlen = TERM_LEN(str);
3044 str_modifiable(str);
3045 if (STR_SHARED_P(str)) {
3046 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3048 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3049 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3051 STR_SET_LEN(str, len);
3052 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3055 VALUE
3056 rb_str_resize(VALUE str, long len)
3058 long slen;
3059 int independent;
3061 if (len < 0) {
3062 rb_raise(rb_eArgError, "negative string size (or size too big)");
3065 independent = str_independent(str);
3066 ENC_CODERANGE_CLEAR(str);
3067 slen = RSTRING_LEN(str);
3070 long capa;
3071 const int termlen = TERM_LEN(str);
3072 if (STR_EMBED_P(str)) {
3073 if (len == slen) return str;
3074 if (str_embed_capa(str) >= len + termlen) {
3075 STR_SET_EMBED_LEN(str, len);
3076 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3077 return str;
3079 str_make_independent_expand(str, slen, len - slen, termlen);
3081 else if (str_embed_capa(str) >= len + termlen) {
3082 char *ptr = STR_HEAP_PTR(str);
3083 STR_SET_EMBED(str);
3084 if (slen > len) slen = len;
3085 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3086 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3087 STR_SET_EMBED_LEN(str, len);
3088 if (independent) ruby_xfree(ptr);
3089 return str;
3091 else if (!independent) {
3092 if (len == slen) return str;
3093 str_make_independent_expand(str, slen, len - slen, termlen);
3095 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3096 (capa - len) > (len < 1024 ? len : 1024)) {
3097 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3098 (size_t)len + termlen, STR_HEAP_SIZE(str));
3099 RSTRING(str)->as.heap.aux.capa = len;
3101 else if (len == slen) return str;
3102 RSTRING(str)->as.heap.len = len;
3103 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3105 return str;
3108 static VALUE
3109 str_buf_cat(VALUE str, const char *ptr, long len)
3111 long capa, total, olen, off = -1;
3112 char *sptr;
3113 const int termlen = TERM_LEN(str);
3114 #if !USE_RVARGC
3115 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3116 #endif
3118 RSTRING_GETMEM(str, sptr, olen);
3119 if (ptr >= sptr && ptr <= sptr + olen) {
3120 off = ptr - sptr;
3122 rb_str_modify(str);
3123 if (len == 0) return 0;
3124 if (STR_EMBED_P(str)) {
3125 capa = str_embed_capa(str) - termlen;
3126 sptr = RSTRING(str)->as.embed.ary;
3127 olen = RSTRING_EMBED_LEN(str);
3129 else {
3130 capa = RSTRING(str)->as.heap.aux.capa;
3131 sptr = RSTRING(str)->as.heap.ptr;
3132 olen = RSTRING(str)->as.heap.len;
3134 if (olen > LONG_MAX - len) {
3135 rb_raise(rb_eArgError, "string sizes too big");
3137 total = olen + len;
3138 if (capa < total) {
3139 if (total >= LONG_MAX / 2) {
3140 capa = total;
3142 while (total > capa) {
3143 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3145 RESIZE_CAPA_TERM(str, capa, termlen);
3146 sptr = RSTRING_PTR(str);
3148 if (off != -1) {
3149 ptr = sptr + off;
3151 memcpy(sptr + olen, ptr, len);
3152 STR_SET_LEN(str, total);
3153 TERM_FILL(sptr + total, termlen); /* sentinel */
3155 return str;
3158 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3160 VALUE
3161 rb_str_cat(VALUE str, const char *ptr, long len)
3163 if (len == 0) return str;
3164 if (len < 0) {
3165 rb_raise(rb_eArgError, "negative string size (or size too big)");
3167 return str_buf_cat(str, ptr, len);
3170 VALUE
3171 rb_str_cat_cstr(VALUE str, const char *ptr)
3173 must_not_null(ptr);
3174 return rb_str_buf_cat(str, ptr, strlen(ptr));
3177 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3178 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3179 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3181 static VALUE
3182 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3183 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3185 int str_encindex = ENCODING_GET(str);
3186 int res_encindex;
3187 int str_cr, res_cr;
3188 rb_encoding *str_enc, *ptr_enc;
3190 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3192 if (str_encindex == ptr_encindex) {
3193 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3194 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3197 else {
3198 str_enc = rb_enc_from_index(str_encindex);
3199 ptr_enc = rb_enc_from_index(ptr_encindex);
3200 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3201 if (len == 0)
3202 return str;
3203 if (RSTRING_LEN(str) == 0) {
3204 rb_str_buf_cat(str, ptr, len);
3205 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3206 return str;
3208 goto incompatible;
3210 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3211 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3213 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3214 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3215 str_cr = rb_enc_str_coderange(str);
3219 if (ptr_cr_ret)
3220 *ptr_cr_ret = ptr_cr;
3222 if (str_encindex != ptr_encindex &&
3223 str_cr != ENC_CODERANGE_7BIT &&
3224 ptr_cr != ENC_CODERANGE_7BIT) {
3225 str_enc = rb_enc_from_index(str_encindex);
3226 ptr_enc = rb_enc_from_index(ptr_encindex);
3227 goto incompatible;
3230 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3231 res_encindex = str_encindex;
3232 res_cr = ENC_CODERANGE_UNKNOWN;
3234 else if (str_cr == ENC_CODERANGE_7BIT) {
3235 if (ptr_cr == ENC_CODERANGE_7BIT) {
3236 res_encindex = str_encindex;
3237 res_cr = ENC_CODERANGE_7BIT;
3239 else {
3240 res_encindex = ptr_encindex;
3241 res_cr = ptr_cr;
3244 else if (str_cr == ENC_CODERANGE_VALID) {
3245 res_encindex = str_encindex;
3246 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3247 res_cr = str_cr;
3248 else
3249 res_cr = ptr_cr;
3251 else { /* str_cr == ENC_CODERANGE_BROKEN */
3252 res_encindex = str_encindex;
3253 res_cr = str_cr;
3254 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3257 if (len < 0) {
3258 rb_raise(rb_eArgError, "negative string size (or size too big)");
3260 str_buf_cat(str, ptr, len);
3261 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3262 return str;
3264 incompatible:
3265 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3266 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3267 UNREACHABLE_RETURN(Qundef);
3270 VALUE
3271 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3273 return rb_enc_cr_str_buf_cat(str, ptr, len,
3274 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3277 VALUE
3278 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3280 /* ptr must reference NUL terminated ASCII string. */
3281 int encindex = ENCODING_GET(str);
3282 rb_encoding *enc = rb_enc_from_index(encindex);
3283 if (rb_enc_asciicompat(enc)) {
3284 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3285 encindex, ENC_CODERANGE_7BIT, 0);
3287 else {
3288 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3289 while (*ptr) {
3290 unsigned int c = (unsigned char)*ptr;
3291 int len = rb_enc_codelen(c, enc);
3292 rb_enc_mbcput(c, buf, enc);
3293 rb_enc_cr_str_buf_cat(str, buf, len,
3294 encindex, ENC_CODERANGE_VALID, 0);
3295 ptr++;
3297 return str;
3301 VALUE
3302 rb_str_buf_append(VALUE str, VALUE str2)
3304 int str2_cr;
3306 str2_cr = ENC_CODERANGE(str2);
3308 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3309 ENCODING_GET(str2), str2_cr, &str2_cr);
3311 ENC_CODERANGE_SET(str2, str2_cr);
3313 return str;
3316 VALUE
3317 rb_str_append(VALUE str, VALUE str2)
3319 StringValue(str2);
3320 return rb_str_buf_append(str, str2);
3323 #define MIN_PRE_ALLOC_SIZE 48
3325 MJIT_FUNC_EXPORTED VALUE
3326 rb_str_concat_literals(size_t num, const VALUE *strary)
3328 VALUE str;
3329 size_t i, s;
3330 long len = 1;
3332 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3333 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3335 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3336 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3337 str = rb_str_resurrect(strary[0]);
3338 s = 1;
3340 else {
3341 str = rb_str_buf_new(len);
3342 rb_enc_copy(str, strary[0]);
3343 s = 0;
3346 for (i = s; i < num; ++i) {
3347 const VALUE v = strary[i];
3348 int encidx = ENCODING_GET(v);
3350 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3351 encidx, ENC_CODERANGE(v), NULL);
3352 if (encidx != ENCINDEX_US_ASCII) {
3353 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3354 rb_enc_set_index(str, encidx);
3357 return str;
3361 * call-seq:
3362 * concat(*objects) -> string
3364 * Concatenates each object in +objects+ to +self+ and returns +self+:
3366 * s = 'foo'
3367 * s.concat('bar', 'baz') # => "foobarbaz"
3368 * s # => "foobarbaz"
3370 * For each given object +object+ that is an \Integer,
3371 * the value is considered a codepoint and converted to a character before concatenation:
3373 * s = 'foo'
3374 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3376 * Related: String#<<, which takes a single argument.
3378 static VALUE
3379 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3381 str_modifiable(str);
3383 if (argc == 1) {
3384 return rb_str_concat(str, argv[0]);
3386 else if (argc > 1) {
3387 int i;
3388 VALUE arg_str = rb_str_tmp_new(0);
3389 rb_enc_copy(arg_str, str);
3390 for (i = 0; i < argc; i++) {
3391 rb_str_concat(arg_str, argv[i]);
3393 rb_str_buf_append(str, arg_str);
3396 return str;
3400 * call-seq:
3401 * string << object -> string
3403 * Concatenates +object+ to +self+ and returns +self+:
3405 * s = 'foo'
3406 * s << 'bar' # => "foobar"
3407 * s # => "foobar"
3409 * If +object+ is an \Integer,
3410 * the value is considered a codepoint and converted to a character before concatenation:
3412 * s = 'foo'
3413 * s << 33 # => "foo!"
3415 * Related: String#concat, which takes multiple arguments.
3417 VALUE
3418 rb_str_concat(VALUE str1, VALUE str2)
3420 unsigned int code;
3421 rb_encoding *enc = STR_ENC_GET(str1);
3422 int encidx;
3424 if (RB_INTEGER_TYPE_P(str2)) {
3425 if (rb_num_to_uint(str2, &code) == 0) {
3427 else if (FIXNUM_P(str2)) {
3428 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3430 else {
3431 rb_raise(rb_eRangeError, "bignum out of char range");
3434 else {
3435 return rb_str_append(str1, str2);
3438 encidx = rb_enc_to_index(enc);
3439 if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3440 /* US-ASCII automatically extended to ASCII-8BIT */
3441 char buf[1];
3442 buf[0] = (char)code;
3443 if (code > 0xFF) {
3444 rb_raise(rb_eRangeError, "%u out of char range", code);
3446 rb_str_cat(str1, buf, 1);
3447 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3448 rb_enc_associate_index(str1, ENCINDEX_ASCII);
3449 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3452 else {
3453 long pos = RSTRING_LEN(str1);
3454 int cr = ENC_CODERANGE(str1);
3455 int len;
3456 char *buf;
3458 switch (len = rb_enc_codelen(code, enc)) {
3459 case ONIGERR_INVALID_CODE_POINT_VALUE:
3460 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3461 break;
3462 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3463 case 0:
3464 rb_raise(rb_eRangeError, "%u out of char range", code);
3465 break;
3467 buf = ALLOCA_N(char, len + 1);
3468 rb_enc_mbcput(code, buf, enc);
3469 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3470 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3472 rb_str_resize(str1, pos+len);
3473 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3474 if (cr == ENC_CODERANGE_7BIT && code > 127)
3475 cr = ENC_CODERANGE_VALID;
3476 ENC_CODERANGE_SET(str1, cr);
3478 return str1;
3482 * call-seq:
3483 * prepend(*other_strings) -> string
3485 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3487 * s = 'foo'
3488 * s.prepend('bar', 'baz') # => "barbazfoo"
3489 * s # => "barbazfoo"
3491 * Related: String#concat.
3494 static VALUE
3495 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3497 str_modifiable(str);
3499 if (argc == 1) {
3500 rb_str_update(str, 0L, 0L, argv[0]);
3502 else if (argc > 1) {
3503 int i;
3504 VALUE arg_str = rb_str_tmp_new(0);
3505 rb_enc_copy(arg_str, str);
3506 for (i = 0; i < argc; i++) {
3507 rb_str_append(arg_str, argv[i]);
3509 rb_str_update(str, 0L, 0L, arg_str);
3512 return str;
3515 st_index_t
3516 rb_str_hash(VALUE str)
3518 int e = ENCODING_GET(str);
3519 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3520 e = 0;
3522 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3526 rb_str_hash_cmp(VALUE str1, VALUE str2)
3528 long len1, len2;
3529 const char *ptr1, *ptr2;
3530 RSTRING_GETMEM(str1, ptr1, len1);
3531 RSTRING_GETMEM(str2, ptr2, len2);
3532 return (len1 != len2 ||
3533 !rb_str_comparable(str1, str2) ||
3534 memcmp(ptr1, ptr2, len1) != 0);
3538 * call-seq:
3539 * hash -> integer
3541 * Returns the integer hash value for +self+.
3542 * The value is based on the length, content and encoding of +self+.
3544 * Related: Object#hash.
3547 static VALUE
3548 rb_str_hash_m(VALUE str)
3550 st_index_t hval = rb_str_hash(str);
3551 return ST2FIX(hval);
/* Smaller of the two arguments; yields +a+ when they compare equal. */
#define lesser(a,b) (((a) > (b)) ? (b) : (a))
3557 rb_str_comparable(VALUE str1, VALUE str2)
3559 int idx1, idx2;
3560 int rc1, rc2;
3562 if (RSTRING_LEN(str1) == 0) return TRUE;
3563 if (RSTRING_LEN(str2) == 0) return TRUE;
3564 idx1 = ENCODING_GET(str1);
3565 idx2 = ENCODING_GET(str2);
3566 if (idx1 == idx2) return TRUE;
3567 rc1 = rb_enc_str_coderange(str1);
3568 rc2 = rb_enc_str_coderange(str2);
3569 if (rc1 == ENC_CODERANGE_7BIT) {
3570 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3571 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3572 return TRUE;
3574 if (rc2 == ENC_CODERANGE_7BIT) {
3575 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3576 return TRUE;
3578 return FALSE;
3582 rb_str_cmp(VALUE str1, VALUE str2)
3584 long len1, len2;
3585 const char *ptr1, *ptr2;
3586 int retval;
3588 if (str1 == str2) return 0;
3589 RSTRING_GETMEM(str1, ptr1, len1);
3590 RSTRING_GETMEM(str2, ptr2, len2);
3591 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3592 if (len1 == len2) {
3593 if (!rb_str_comparable(str1, str2)) {
3594 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3595 return 1;
3596 return -1;
3598 return 0;
3600 if (len1 > len2) return 1;
3601 return -1;
3603 if (retval > 0) return 1;
3604 return -1;
3608 * call-seq:
3609 * string == object -> true or false
3610 * string === object -> true or false
3612 * Returns +true+ if +object+ has the same length and content;
3613 * as +self+; +false+ otherwise:
3615 * s = 'foo'
3616 * s == 'foo' # => true
3617 * s == 'food' # => false
3618 * s == 'FOO' # => false
3620 * Returns +false+ if the two strings' encodings are not compatible:
3621 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3623 * If +object+ is not an instance of \String but responds to +to_str+, then the
3624 * two strings are compared using <code>object.==</code>.
3627 VALUE
3628 rb_str_equal(VALUE str1, VALUE str2)
3630 if (str1 == str2) return Qtrue;
3631 if (!RB_TYPE_P(str2, T_STRING)) {
3632 if (!rb_respond_to(str2, idTo_str)) {
3633 return Qfalse;
3635 return rb_equal(str2, str1);
3637 return rb_str_eql_internal(str1, str2);
3641 * call-seq:
3642 * eql?(object) -> true or false
3644 * Returns +true+ if +object+ has the same length and content;
3645 * as +self+; +false+ otherwise:
3647 * s = 'foo'
3648 * s.eql?('foo') # => true
3649 * s.eql?('food') # => false
3650 * s.eql?('FOO') # => false
3652 * Returns +false+ if the two strings' encodings are not compatible:
3654 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3658 MJIT_FUNC_EXPORTED VALUE
3659 rb_str_eql(VALUE str1, VALUE str2)
3661 if (str1 == str2) return Qtrue;
3662 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3663 return rb_str_eql_internal(str1, str2);
3667 * call-seq:
3668 * string <=> other_string -> -1, 0, 1, or nil
3670 * Compares +self+ and +other_string+, returning:
3672 * - -1 if +other_string+ is larger.
3673 * - 0 if the two are equal.
3674 * - 1 if +other_string+ is smaller.
3675 * - +nil+ if the two are incomparable.
3677 * Examples:
3679 * 'foo' <=> 'foo' # => 0
3680 * 'foo' <=> 'food' # => -1
3681 * 'food' <=> 'foo' # => 1
3682 * 'FOO' <=> 'foo' # => -1
3683 * 'foo' <=> 'FOO' # => 1
3684 * 'foo' <=> 1 # => nil
3688 static VALUE
3689 rb_str_cmp_m(VALUE str1, VALUE str2)
3691 int result;
3692 VALUE s = rb_check_string_type(str2);
3693 if (NIL_P(s)) {
3694 return rb_invcmp(str1, str2);
3696 result = rb_str_cmp(str1, s);
3697 return INT2FIX(result);
3700 static VALUE str_casecmp(VALUE str1, VALUE str2);
3701 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3704 * call-seq:
3705 * casecmp(other_string) -> -1, 0, 1, or nil
3707 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3709 * - -1 if <tt>other_string.downcase</tt> is larger.
3710 * - 0 if the two are equal.
3711 * - 1 if <tt>other_string.downcase</tt> is smaller.
3712 * - +nil+ if the two are incomparable.
3714 * Examples:
3716 * 'foo'.casecmp('foo') # => 0
3717 * 'foo'.casecmp('food') # => -1
3718 * 'food'.casecmp('foo') # => 1
3719 * 'FOO'.casecmp('foo') # => 0
3720 * 'foo'.casecmp('FOO') # => 0
3721 * 'foo'.casecmp(1) # => nil
3723 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3725 * Related: String#casecmp?.
3729 static VALUE
3730 rb_str_casecmp(VALUE str1, VALUE str2)
3732 VALUE s = rb_check_string_type(str2);
3733 if (NIL_P(s)) {
3734 return Qnil;
3736 return str_casecmp(str1, s);
3739 static VALUE
3740 str_casecmp(VALUE str1, VALUE str2)
3742 long len;
3743 rb_encoding *enc;
3744 const char *p1, *p1end, *p2, *p2end;
3746 enc = rb_enc_compatible(str1, str2);
3747 if (!enc) {
3748 return Qnil;
3751 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3752 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3753 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3754 while (p1 < p1end && p2 < p2end) {
3755 if (*p1 != *p2) {
3756 unsigned int c1 = TOLOWER(*p1 & 0xff);
3757 unsigned int c2 = TOLOWER(*p2 & 0xff);
3758 if (c1 != c2)
3759 return INT2FIX(c1 < c2 ? -1 : 1);
3761 p1++;
3762 p2++;
3765 else {
3766 while (p1 < p1end && p2 < p2end) {
3767 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3768 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3770 if (0 <= c1 && 0 <= c2) {
3771 c1 = TOLOWER(c1);
3772 c2 = TOLOWER(c2);
3773 if (c1 != c2)
3774 return INT2FIX(c1 < c2 ? -1 : 1);
3776 else {
3777 int r;
3778 l1 = rb_enc_mbclen(p1, p1end, enc);
3779 l2 = rb_enc_mbclen(p2, p2end, enc);
3780 len = l1 < l2 ? l1 : l2;
3781 r = memcmp(p1, p2, len);
3782 if (r != 0)
3783 return INT2FIX(r < 0 ? -1 : 1);
3784 if (l1 != l2)
3785 return INT2FIX(l1 < l2 ? -1 : 1);
3787 p1 += l1;
3788 p2 += l2;
3791 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3792 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3793 return INT2FIX(-1);
3797 * call-seq:
3798 * casecmp?(other_string) -> true, false, or nil
3800 * Returns +true+ if +self+ and +other_string+ are equal after
3801 * Unicode case folding, otherwise +false+:
3803 * 'foo'.casecmp?('foo') # => true
3804 * 'foo'.casecmp?('food') # => false
3805 * 'food'.casecmp?('foo') # => false
3806 * 'FOO'.casecmp?('foo') # => true
3807 * 'foo'.casecmp?('FOO') # => true
3809 * Returns +nil+ if the two values are incomparable:
3811 * 'foo'.casecmp?(1) # => nil
3813 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3815 * Related: String#casecmp.
3819 static VALUE
3820 rb_str_casecmp_p(VALUE str1, VALUE str2)
3822 VALUE s = rb_check_string_type(str2);
3823 if (NIL_P(s)) {
3824 return Qnil;
3826 return str_casecmp_p(str1, s);
3829 static VALUE
3830 str_casecmp_p(VALUE str1, VALUE str2)
3832 rb_encoding *enc;
3833 VALUE folded_str1, folded_str2;
3834 VALUE fold_opt = sym_fold;
3836 enc = rb_enc_compatible(str1, str2);
3837 if (!enc) {
3838 return Qnil;
3841 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3842 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3844 return rb_str_eql(folded_str1, folded_str2);
3847 static long
3848 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3849 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3851 const char *search_start = str_ptr;
3852 long pos, search_len = str_len - offset;
3854 for (;;) {
3855 const char *t;
3856 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3857 if (pos < 0) return pos;
3858 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3859 if (t == search_start + pos) break;
3860 search_len -= t - search_start;
3861 if (search_len <= 0) return -1;
3862 offset += t - search_start;
3863 search_start = t;
3865 return pos + offset;
3868 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3870 static long
3871 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3873 const char *str_ptr, *str_ptr_end, *sub_ptr;
3874 long str_len, sub_len;
3875 rb_encoding *enc;
3877 enc = rb_enc_check(str, sub);
3878 if (is_broken_string(sub)) return -1;
3880 str_ptr = RSTRING_PTR(str);
3881 str_ptr_end = RSTRING_END(str);
3882 str_len = RSTRING_LEN(str);
3883 sub_ptr = RSTRING_PTR(sub);
3884 sub_len = RSTRING_LEN(sub);
3886 if (str_len < sub_len) return -1;
3888 if (offset != 0) {
3889 long str_len_char, sub_len_char;
3890 int single_byte = single_byte_optimizable(str);
3891 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3892 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3893 if (offset < 0) {
3894 offset += str_len_char;
3895 if (offset < 0) return -1;
3897 if (str_len_char - offset < sub_len_char) return -1;
3898 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3899 str_ptr += offset;
3901 if (sub_len == 0) return offset;
3903 /* need proceed one character at a time */
3904 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3909 * call-seq:
3910 * index(substring, offset = 0) -> integer or nil
3911 * index(regexp, offset = 0) -> integer or nil
3913 * Returns the \Integer index of the first occurrence of the given +substring+,
3914 * or +nil+ if none found:
3916 * 'foo'.index('f') # => 0
3917 * 'foo'.index('o') # => 1
3918 * 'foo'.index('oo') # => 1
3919 * 'foo'.index('ooo') # => nil
3921 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3922 * or +nil+ if none found:
3924 * 'foo'.index(/f/) # => 0
3925 * 'foo'.index(/o/) # => 1
3926 * 'foo'.index(/oo/) # => 1
3927 * 'foo'.index(/ooo/) # => nil
3929 * \Integer argument +offset+, if given, specifies the position in the
3930 * string to begin the search:
3932 * 'foo'.index('o', 1) # => 1
3933 * 'foo'.index('o', 2) # => 2
3934 * 'foo'.index('o', 3) # => nil
3936 * If +offset+ is negative, counts backward from the end of +self+:
3938 * 'foo'.index('o', -1) # => 2
3939 * 'foo'.index('o', -2) # => 1
3940 * 'foo'.index('o', -3) # => 1
3941 * 'foo'.index('o', -4) # => nil
3943 * Related: String#rindex.
3946 static VALUE
3947 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3949 VALUE sub;
3950 VALUE initpos;
3951 long pos;
3953 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3954 pos = NUM2LONG(initpos);
3956 else {
3957 pos = 0;
3959 if (pos < 0) {
3960 pos += str_strlen(str, NULL);
3961 if (pos < 0) {
3962 if (RB_TYPE_P(sub, T_REGEXP)) {
3963 rb_backref_set(Qnil);
3965 return Qnil;
3969 if (RB_TYPE_P(sub, T_REGEXP)) {
3970 if (pos > str_strlen(str, NULL))
3971 return Qnil;
3972 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3973 rb_enc_check(str, sub), single_byte_optimizable(str));
3975 if (rb_reg_search(sub, str, pos, 0) < 0) {
3976 return Qnil;
3978 else {
3979 VALUE match = rb_backref_get();
3980 struct re_registers *regs = RMATCH_REGS(match);
3981 pos = rb_str_sublen(str, BEG(0));
3982 return LONG2NUM(pos);
3985 else {
3986 StringValue(sub);
3987 pos = rb_str_index(str, sub, pos);
3988 pos = rb_str_sublen(str, pos);
3991 if (pos == -1) return Qnil;
3992 return LONG2NUM(pos);
3995 #ifdef HAVE_MEMRCHR
3996 static long
3997 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3999 char *hit, *adjusted;
4000 int c;
4001 long slen, searchlen;
4002 char *sbeg, *e, *t;
4004 slen = RSTRING_LEN(sub);
4005 if (slen == 0) return pos;
4006 sbeg = RSTRING_PTR(str);
4007 e = RSTRING_END(str);
4008 t = RSTRING_PTR(sub);
4009 c = *t & 0xff;
4010 searchlen = s - sbeg + 1;
4012 do {
4013 hit = memrchr(sbeg, c, searchlen);
4014 if (!hit) break;
4015 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4016 if (hit != adjusted) {
4017 searchlen = adjusted - sbeg;
4018 continue;
4020 if (memcmp(hit, t, slen) == 0)
4021 return rb_str_sublen(str, hit - sbeg);
4022 searchlen = adjusted - sbeg;
4023 } while (searchlen > 0);
4025 return -1;
4027 #else
4028 static long
4029 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
4031 long slen;
4032 char *sbeg, *e, *t;
4034 sbeg = RSTRING_PTR(str);
4035 e = RSTRING_END(str);
4036 t = RSTRING_PTR(sub);
4037 slen = RSTRING_LEN(sub);
4039 while (s) {
4040 if (memcmp(s, t, slen) == 0) {
4041 return pos;
4043 if (pos == 0) break;
4044 pos--;
4045 s = rb_enc_prev_char(sbeg, s, e, enc);
4048 return -1;
4050 #endif
4052 static long
4053 rb_str_rindex(VALUE str, VALUE sub, long pos)
4055 long len, slen;
4056 char *sbeg, *s;
4057 rb_encoding *enc;
4058 int singlebyte;
4060 enc = rb_enc_check(str, sub);
4061 if (is_broken_string(sub)) return -1;
4062 singlebyte = single_byte_optimizable(str);
4063 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4064 slen = str_strlen(sub, enc); /* rb_enc_check */
4066 /* substring longer than string */
4067 if (len < slen) return -1;
4068 if (len - pos < slen) pos = len - slen;
4069 if (len == 0) return pos;
4071 sbeg = RSTRING_PTR(str);
4073 if (pos == 0) {
4074 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4075 return 0;
4076 else
4077 return -1;
4080 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4081 return str_rindex(str, sub, s, pos, enc);
4085 * call-seq:
4086 * rindex(substring, offset = self.length) -> integer or nil
4087 * rindex(regexp, offset = self.length) -> integer or nil
4089 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4090 * or +nil+ if none found:
4092 * 'foo'.rindex('f') # => 0
4093 * 'foo'.rindex('o') # => 2
4094 * 'foo'.rindex('oo') # => 1
4095 * 'foo'.rindex('ooo') # => nil
4097 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4098 * or +nil+ if none found:
4100 * 'foo'.rindex(/f/) # => 0
4101 * 'foo'.rindex(/o/) # => 2
4102 * 'foo'.rindex(/oo/) # => 1
4103 * 'foo'.rindex(/ooo/) # => nil
4105 * The _last_ match means starting at the possible last position, not
4106 * the last of longest matches.
4108 * 'foo'.rindex(/o+/) # => 2
4109 * $~ #=> #<MatchData "o">
4111 * To get the last longest match, combine the pattern with a negative
4112 * lookbehind.
4114 * 'foo'.rindex(/(?<!o)o+/) # => 1
4115 * $~ #=> #<MatchData "oo">
4117 * Or use String#index with a negative lookahead.
4119 * 'foo'.index(/o+(?!.*o)/) # => 1
4120 * $~ #=> #<MatchData "oo">
4122 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4123 * string to _end_ the search:
4125 * 'foo'.rindex('o', 0) # => nil
4126 * 'foo'.rindex('o', 1) # => 1
4127 * 'foo'.rindex('o', 2) # => 2
4128 * 'foo'.rindex('o', 3) # => 2
4130 * If +offset+ is a negative \Integer, the maximum starting position in the
4131 * string to _end_ the search is the sum of the string's length and +offset+:
4133 * 'foo'.rindex('o', -1) # => 2
4134 * 'foo'.rindex('o', -2) # => 1
4135 * 'foo'.rindex('o', -3) # => nil
4136 * 'foo'.rindex('o', -4) # => nil
4138 * Related: String#index.
4141 static VALUE
4142 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4144 VALUE sub;
4145 VALUE vpos;
4146 rb_encoding *enc = STR_ENC_GET(str);
4147 long pos, len = str_strlen(str, enc); /* str's enc */
4149 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4150 pos = NUM2LONG(vpos);
4151 if (pos < 0) {
4152 pos += len;
4153 if (pos < 0) {
4154 if (RB_TYPE_P(sub, T_REGEXP)) {
4155 rb_backref_set(Qnil);
4157 return Qnil;
4160 if (pos > len) pos = len;
4162 else {
4163 pos = len;
4166 if (RB_TYPE_P(sub, T_REGEXP)) {
4167 /* enc = rb_get_check(str, sub); */
4168 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4169 enc, single_byte_optimizable(str));
4171 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4172 VALUE match = rb_backref_get();
4173 struct re_registers *regs = RMATCH_REGS(match);
4174 pos = rb_str_sublen(str, BEG(0));
4175 return LONG2NUM(pos);
4178 else {
4179 StringValue(sub);
4180 pos = rb_str_rindex(str, sub, pos);
4181 if (pos >= 0) return LONG2NUM(pos);
4183 return Qnil;
4187 * call-seq:
4188 * string =~ regexp -> integer or nil
4189 * string =~ object -> integer or nil
4191 * Returns the \Integer index of the first substring that matches
4192 * the given +regexp+, or +nil+ if no match found:
4194 * 'foo' =~ /f/ # => 0
4195 * 'foo' =~ /o/ # => 1
4196 * 'foo' =~ /x/ # => nil
4198 * Note: also updates
4199 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4201 * If the given +object+ is not a \Regexp, returns the value
4202 * returned by <tt>object =~ self</tt>.
4204 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4205 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4207 * number = nil
4208 * "no. 9" =~ /(?<number>\d+)/
4209 * number # => nil (not assigned)
4210 * /(?<number>\d+)/ =~ "no. 9"
4211 * number #=> "9"
4215 static VALUE
4216 rb_str_match(VALUE x, VALUE y)
4218 switch (OBJ_BUILTIN_TYPE(y)) {
4219 case T_STRING:
4220 rb_raise(rb_eTypeError, "type mismatch: String given");
4222 case T_REGEXP:
4223 return rb_reg_match(y, x);
4225 default:
4226 return rb_funcall(y, idEqTilde, 1, x);
4231 static VALUE get_pat(VALUE);
4235 * call-seq:
4236 * match(pattern, offset = 0) -> matchdata or nil
4237 * match(pattern, offset = 0) {|matchdata| ... } -> object
4239 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4241 * Note: also updates
4242 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4244 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4245 * regexp = Regexp.new(pattern)
4246 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4247 * (see Regexp#match):
4248 * matchdata = <tt>regexp.match(self)</tt>
4250 * With no block given, returns the computed +matchdata+:
4252 * 'foo'.match('f') # => #<MatchData "f">
4253 * 'foo'.match('o') # => #<MatchData "o">
4254 * 'foo'.match('x') # => nil
4256 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4258 * 'foo'.match('f', 1) # => nil
4259 * 'foo'.match('o', 1) # => #<MatchData "o">
4261 * With a block given, calls the block with the computed +matchdata+
4262 * and returns the block's return value:
4264 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4265 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4266 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4270 static VALUE
4271 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4273 VALUE re, result;
4274 if (argc < 1)
4275 rb_check_arity(argc, 1, 2);
4276 re = argv[0];
4277 argv[0] = str;
4278 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4279 if (!NIL_P(result) && rb_block_given_p()) {
4280 return rb_yield(result);
4282 return result;
4286 * call-seq:
4287 * match?(pattern, offset = 0) -> true or false
4289 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4291 * Note: does not update
4292 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4294 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4295 * regexp = Regexp.new(pattern)
4297 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4298 * +false+ otherwise:
4300 * 'foo'.match?(/o/) # => true
4301 * 'foo'.match?('o') # => true
4302 * 'foo'.match?(/x/) # => false
4304 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4305 * 'foo'.match?('f', 1) # => false
4306 * 'foo'.match?('o', 1) # => true
4310 static VALUE
4311 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4313 VALUE re;
4314 rb_check_arity(argc, 1, 2);
4315 re = get_pat(argv[0]);
4316 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
/* Outcome of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* the bytes do not form a usable character */
    NEIGHBOR_FOUND,     /* the neighbouring character was written in place */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
4325 static enum neighbor_char
4326 enc_succ_char(char *p, long len, rb_encoding *enc)
4328 long i;
4329 int l;
4331 if (rb_enc_mbminlen(enc) > 1) {
4332 /* wchar, trivial case */
4333 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4334 if (!MBCLEN_CHARFOUND_P(r)) {
4335 return NEIGHBOR_NOT_CHAR;
4337 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4338 l = rb_enc_code_to_mbclen(c, enc);
4339 if (!l) return NEIGHBOR_NOT_CHAR;
4340 if (l != len) return NEIGHBOR_WRAPPED;
4341 rb_enc_mbcput(c, p, enc);
4342 r = rb_enc_precise_mbclen(p, p + len, enc);
4343 if (!MBCLEN_CHARFOUND_P(r)) {
4344 return NEIGHBOR_NOT_CHAR;
4346 return NEIGHBOR_FOUND;
4348 while (1) {
4349 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4350 p[i] = '\0';
4351 if (i < 0)
4352 return NEIGHBOR_WRAPPED;
4353 ++((unsigned char*)p)[i];
4354 l = rb_enc_precise_mbclen(p, p+len, enc);
4355 if (MBCLEN_CHARFOUND_P(l)) {
4356 l = MBCLEN_CHARFOUND_LEN(l);
4357 if (l == len) {
4358 return NEIGHBOR_FOUND;
4360 else {
4361 memset(p+l, 0xff, len-l);
4364 if (MBCLEN_INVALID_P(l) && i < len-1) {
4365 long len2;
4366 int l2;
4367 for (len2 = len-1; 0 < len2; len2--) {
4368 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4369 if (!MBCLEN_INVALID_P(l2))
4370 break;
4372 memset(p+len2+1, 0xff, len-(len2+1));
4377 static enum neighbor_char
4378 enc_pred_char(char *p, long len, rb_encoding *enc)
4380 long i;
4381 int l;
4382 if (rb_enc_mbminlen(enc) > 1) {
4383 /* wchar, trivial case */
4384 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4385 if (!MBCLEN_CHARFOUND_P(r)) {
4386 return NEIGHBOR_NOT_CHAR;
4388 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4389 if (!c) return NEIGHBOR_NOT_CHAR;
4390 --c;
4391 l = rb_enc_code_to_mbclen(c, enc);
4392 if (!l) return NEIGHBOR_NOT_CHAR;
4393 if (l != len) return NEIGHBOR_WRAPPED;
4394 rb_enc_mbcput(c, p, enc);
4395 r = rb_enc_precise_mbclen(p, p + len, enc);
4396 if (!MBCLEN_CHARFOUND_P(r)) {
4397 return NEIGHBOR_NOT_CHAR;
4399 return NEIGHBOR_FOUND;
4401 while (1) {
4402 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4403 p[i] = '\xff';
4404 if (i < 0)
4405 return NEIGHBOR_WRAPPED;
4406 --((unsigned char*)p)[i];
4407 l = rb_enc_precise_mbclen(p, p+len, enc);
4408 if (MBCLEN_CHARFOUND_P(l)) {
4409 l = MBCLEN_CHARFOUND_LEN(l);
4410 if (l == len) {
4411 return NEIGHBOR_FOUND;
4413 else {
4414 memset(p+l, 0, len-l);
4417 if (MBCLEN_INVALID_P(l) && i < len-1) {
4418 long len2;
4419 int l2;
4420 for (len2 = len-1; 0 < len2; len2--) {
4421 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4422 if (!MBCLEN_INVALID_P(l2))
4423 break;
4425 memset(p+len2+1, 0, len-(len2+1));
4431 overwrite +p+ by succeeding letter in +enc+ and returns
4432 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4433 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4434 assuming each ranges are successive, and mbclen
4435 never change in each ranges.
4436 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4437 character.
4439 static enum neighbor_char
4440 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4442 enum neighbor_char ret;
4443 unsigned int c;
4444 int ctype;
4445 int range;
4446 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4448 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4449 int try;
4450 const int max_gaps = 1;
4452 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4453 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4454 ctype = ONIGENC_CTYPE_DIGIT;
4455 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4456 ctype = ONIGENC_CTYPE_ALPHA;
4457 else
4458 return NEIGHBOR_NOT_CHAR;
4460 MEMCPY(save, p, char, len);
4461 for (try = 0; try <= max_gaps; ++try) {
4462 ret = enc_succ_char(p, len, enc);
4463 if (ret == NEIGHBOR_FOUND) {
4464 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4465 if (rb_enc_isctype(c, ctype, enc))
4466 return NEIGHBOR_FOUND;
4469 MEMCPY(p, save, char, len);
4470 range = 1;
4471 while (1) {
4472 MEMCPY(save, p, char, len);
4473 ret = enc_pred_char(p, len, enc);
4474 if (ret == NEIGHBOR_FOUND) {
4475 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4476 if (!rb_enc_isctype(c, ctype, enc)) {
4477 MEMCPY(p, save, char, len);
4478 break;
4481 else {
4482 MEMCPY(p, save, char, len);
4483 break;
4485 range++;
4487 if (range == 1) {
4488 return NEIGHBOR_NOT_CHAR;
4491 if (ctype != ONIGENC_CTYPE_DIGIT) {
4492 MEMCPY(carry, p, char, len);
4493 return NEIGHBOR_WRAPPED;
4496 MEMCPY(carry, p, char, len);
4497 enc_succ_char(carry, len, enc);
4498 return NEIGHBOR_WRAPPED;
4502 static VALUE str_succ(VALUE str);
4505 * call-seq:
4506 * succ -> new_str
4508 * Returns the successor to +self+. The successor is calculated by
4509 * incrementing characters.
4511 * The first character to be incremented is the rightmost alphanumeric,
4512 * or, if no alphanumerics, the rightmost character:
4514 * 'THX1138'.succ # => "THX1139"
4515 * '<<koala>>'.succ # => "<<koalb>>"
4516 * '***'.succ # => "**+"
4518 * The successor to a digit is another digit, "carrying" to the next-left
4519 * character for a "rollover" from 9 to 0, and prepending another digit
4520 * if necessary:
4522 * '00'.succ # => "01"
4523 * '09'.succ # => "10"
4524 * '99'.succ # => "100"
4526 * The successor to a letter is another letter of the same case,
4527 * carrying to the next-left character for a rollover,
4528 * and prepending another same-case letter if necessary:
4530 * 'aa'.succ # => "ab"
4531 * 'az'.succ # => "ba"
4532 * 'zz'.succ # => "aaa"
4533 * 'AA'.succ # => "AB"
4534 * 'AZ'.succ # => "BA"
4535 * 'ZZ'.succ # => "AAA"
4537 * The successor to a non-alphanumeric character is the next character
4538 * in the underlying character set's collating sequence,
4539 * carrying to the next-left character for a rollover,
4540 * and prepending another character if necessary:
4542 * s = 0.chr * 3
4543 * s # => "\x00\x00\x00"
4544 * s.succ # => "\x00\x00\x01"
4545 * s = 255.chr * 3
4546 * s # => "\xFF\xFF\xFF"
4547 * s.succ # => "\x01\x00\x00\x00"
4549 * Carrying can occur between and among mixtures of alphanumeric characters:
4551 * s = 'zz99zz99'
4552 * s.succ # => "aaa00aa00"
4553 * s = '99zz99zz'
4554 * s.succ # => "100aa00aa"
4556 * The successor to an empty \String is a new empty \String:
4558 * ''.succ # => ""
4560 * String#next is an alias for String#succ.
4563 VALUE
4564 rb_str_succ(VALUE orig)
4566 VALUE str;
4567 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4568 rb_enc_cr_str_copy_for_substr(str, orig);
4569 return str_succ(str);
4572 static VALUE
4573 str_succ(VALUE str)
4575 rb_encoding *enc;
4576 char *sbeg, *s, *e, *last_alnum = 0;
4577 int found_alnum = 0;
4578 long l, slen;
4579 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4580 long carry_pos = 0, carry_len = 1;
4581 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4583 slen = RSTRING_LEN(str);
4584 if (slen == 0) return str;
4586 enc = STR_ENC_GET(str);
4587 sbeg = RSTRING_PTR(str);
4588 s = e = sbeg + slen;
4590 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4591 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4592 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4593 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4594 break;
4597 l = rb_enc_precise_mbclen(s, e, enc);
4598 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4599 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4600 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4601 switch (neighbor) {
4602 case NEIGHBOR_NOT_CHAR:
4603 continue;
4604 case NEIGHBOR_FOUND:
4605 return str;
4606 case NEIGHBOR_WRAPPED:
4607 last_alnum = s;
4608 break;
4610 found_alnum = 1;
4611 carry_pos = s - sbeg;
4612 carry_len = l;
4614 if (!found_alnum) { /* str contains no alnum */
4615 s = e;
4616 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4617 enum neighbor_char neighbor;
4618 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4619 l = rb_enc_precise_mbclen(s, e, enc);
4620 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4621 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4622 MEMCPY(tmp, s, char, l);
4623 neighbor = enc_succ_char(tmp, l, enc);
4624 switch (neighbor) {
4625 case NEIGHBOR_FOUND:
4626 MEMCPY(s, tmp, char, l);
4627 return str;
4628 break;
4629 case NEIGHBOR_WRAPPED:
4630 MEMCPY(s, tmp, char, l);
4631 break;
4632 case NEIGHBOR_NOT_CHAR:
4633 break;
4635 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4636 /* wrapped to \0...\0. search next valid char. */
4637 enc_succ_char(s, l, enc);
4639 if (!rb_enc_asciicompat(enc)) {
4640 MEMCPY(carry, s, char, l);
4641 carry_len = l;
4643 carry_pos = s - sbeg;
4645 ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
4647 RESIZE_CAPA(str, slen + carry_len);
4648 sbeg = RSTRING_PTR(str);
4649 s = sbeg + carry_pos;
4650 memmove(s + carry_len, s, slen - carry_pos);
4651 memmove(s, carry, carry_len);
4652 slen += carry_len;
4653 STR_SET_LEN(str, slen);
4654 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4655 rb_enc_str_coderange(str);
4656 return str;
4661 * call-seq:
4662 * succ! -> self
4664 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4666 * String#next! is an alias for String#succ!.
4669 static VALUE
4670 rb_str_succ_bang(VALUE str)
4672 rb_str_modify(str);
4673 str_succ(str);
4674 return str;
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies
 * ISDIGIT() (which includes the case len <= 0), and 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4687 static int
4688 str_upto_i(VALUE str, VALUE arg)
4690 rb_yield(str);
4691 return 0;
4695 * call-seq:
4696 * upto(other_string, exclusive = false) {|string| ... } -> self
4697 * upto(other_string, exclusive = false) -> new_enumerator
4699 * With a block given, calls the block with each \String value
4700 * returned by successive calls to String#succ;
4701 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4702 * the sequence terminates when value +other_string+ is reached;
4703 * returns +self+:
4705 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4706 * Output:
4708 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4710 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4712 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4714 * Output:
4716 * a8 a9 b0 b1 b2 b3 b4 b5
4718 * If +other_string+ would not be reached, does not call the block:
4720 * '25'.upto('5') {|s| fail s }
4721 * 'aa'.upto('a') {|s| fail s }
4723 * With no block given, returns a new \Enumerator:
4725 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4729 static VALUE
4730 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4732 VALUE end, exclusive;
4734 rb_scan_args(argc, argv, "11", &end, &exclusive);
4735 RETURN_ENUMERATOR(beg, argc, argv);
4736 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4739 VALUE
4740 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4742 VALUE current, after_end;
4743 ID succ;
4744 int n, ascii;
4745 rb_encoding *enc;
4747 CONST_ID(succ, "succ");
4748 StringValue(end);
4749 enc = rb_enc_check(beg, end);
4750 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4751 /* single character */
4752 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4753 char c = RSTRING_PTR(beg)[0];
4754 char e = RSTRING_PTR(end)[0];
4756 if (c > e || (excl && c == e)) return beg;
4757 for (;;) {
4758 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4759 if (!excl && c == e) break;
4760 c++;
4761 if (excl && c == e) break;
4763 return beg;
4765 /* both edges are all digits */
4766 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4767 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4768 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4769 VALUE b, e;
4770 int width;
4772 width = RSTRING_LENINT(beg);
4773 b = rb_str_to_inum(beg, 10, FALSE);
4774 e = rb_str_to_inum(end, 10, FALSE);
4775 if (FIXNUM_P(b) && FIXNUM_P(e)) {
4776 long bi = FIX2LONG(b);
4777 long ei = FIX2LONG(e);
4778 rb_encoding *usascii = rb_usascii_encoding();
4780 while (bi <= ei) {
4781 if (excl && bi == ei) break;
4782 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4783 bi++;
4786 else {
4787 ID op = excl ? '<' : idLE;
4788 VALUE args[2], fmt = rb_fstring_lit("%.*d");
4790 args[0] = INT2FIX(width);
4791 while (rb_funcall(b, op, 1, e)) {
4792 args[1] = b;
4793 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4794 b = rb_funcallv(b, succ, 0, 0);
4797 return beg;
4799 /* normal case */
4800 n = rb_str_cmp(beg, end);
4801 if (n > 0 || (excl && n == 0)) return beg;
4803 after_end = rb_funcallv(end, succ, 0, 0);
4804 current = str_duplicate(rb_cString, beg);
4805 while (!rb_str_equal(current, after_end)) {
4806 VALUE next = Qnil;
4807 if (excl || !rb_str_equal(current, end))
4808 next = rb_funcallv(current, succ, 0, 0);
4809 if ((*each)(current, arg)) break;
4810 if (NIL_P(next)) break;
4811 current = next;
4812 StringValue(current);
4813 if (excl && rb_str_equal(current, end)) break;
4814 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4815 break;
4818 return beg;
4821 VALUE
4822 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4824 VALUE current;
4825 ID succ;
4827 CONST_ID(succ, "succ");
4828 /* both edges are all digits */
4829 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4830 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4831 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4832 int width = RSTRING_LENINT(beg);
4833 b = rb_str_to_inum(beg, 10, FALSE);
4834 if (FIXNUM_P(b)) {
4835 long bi = FIX2LONG(b);
4836 rb_encoding *usascii = rb_usascii_encoding();
4838 while (FIXABLE(bi)) {
4839 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4840 bi++;
4842 b = LONG2NUM(bi);
4844 args[0] = INT2FIX(width);
4845 while (1) {
4846 args[1] = b;
4847 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4848 b = rb_funcallv(b, succ, 0, 0);
4851 /* normal case */
4852 current = str_duplicate(rb_cString, beg);
4853 while (1) {
4854 VALUE next = rb_funcallv(current, succ, 0, 0);
4855 if ((*each)(current, arg)) break;
4856 current = next;
4857 StringValue(current);
4858 if (RSTRING_LEN(current) == 0)
4859 break;
4862 return beg;
4865 static int
4866 include_range_i(VALUE str, VALUE arg)
4868 VALUE *argp = (VALUE *)arg;
4869 if (!rb_equal(str, *argp)) return 0;
4870 *argp = Qnil;
4871 return 1;
4874 VALUE
4875 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4877 beg = rb_str_new_frozen(beg);
4878 StringValue(end);
4879 end = rb_str_new_frozen(end);
4880 if (NIL_P(val)) return Qfalse;
4881 val = rb_check_string_type(val);
4882 if (NIL_P(val)) return Qfalse;
4883 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4884 rb_enc_asciicompat(STR_ENC_GET(end)) &&
4885 rb_enc_asciicompat(STR_ENC_GET(val))) {
4886 const char *bp = RSTRING_PTR(beg);
4887 const char *ep = RSTRING_PTR(end);
4888 const char *vp = RSTRING_PTR(val);
4889 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4890 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4891 return Qfalse;
4892 else {
4893 char b = *bp;
4894 char e = *ep;
4895 char v = *vp;
4897 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4898 if (b <= v && v < e) return Qtrue;
4899 return RBOOL(!RTEST(exclusive) && v == e);
4903 #if 0
4904 /* both edges are all digits */
4905 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4906 all_digits_p(bp, RSTRING_LEN(beg)) &&
4907 all_digits_p(ep, RSTRING_LEN(end))) {
4908 /* TODO */
4910 #endif
4912 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4914 return RBOOL(NIL_P(val));
4917 static VALUE
4918 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4920 if (rb_reg_search(re, str, 0, 0) >= 0) {
4921 VALUE match = rb_backref_get();
4922 int nth = rb_reg_backref_number(match, backref);
4923 return rb_reg_nth_match(nth, match);
4925 return Qnil;
4928 static VALUE
4929 rb_str_aref(VALUE str, VALUE indx)
4931 long idx;
4933 if (FIXNUM_P(indx)) {
4934 idx = FIX2LONG(indx);
4936 else if (RB_TYPE_P(indx, T_REGEXP)) {
4937 return rb_str_subpat(str, indx, INT2FIX(0));
4939 else if (RB_TYPE_P(indx, T_STRING)) {
4940 if (rb_str_index(str, indx, 0) != -1)
4941 return str_duplicate(rb_cString, indx);
4942 return Qnil;
4944 else {
4945 /* check if indx is Range */
4946 long beg, len = str_strlen(str, NULL);
4947 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4948 case Qfalse:
4949 break;
4950 case Qnil:
4951 return Qnil;
4952 default:
4953 return rb_str_substr(str, beg, len);
4955 idx = NUM2LONG(indx);
4958 return str_substr(str, idx, 1, FALSE);
4963 * call-seq:
4964 * string[index] -> new_string or nil
4965 * string[start, length] -> new_string or nil
4966 * string[range] -> new_string or nil
4967 * string[regexp, capture = 0] -> new_string or nil
4968 * string[substring] -> new_string or nil
4970 * Returns the substring of +self+ specified by the arguments.
4972 * When the single \Integer argument +index+ is given,
4973 * returns the 1-character substring found in +self+ at offset +index+:
4975 * 'bar'[2] # => "r"
4977 * Counts backward from the end of +self+ if +index+ is negative:
4979 * 'foo'[-3] # => "f"
4981 * Returns +nil+ if +index+ is out of range:
4983 * 'foo'[3] # => nil
4984 * 'foo'[-4] # => nil
4986 * When the two \Integer arguments +start+ and +length+ are given,
4987 * returns the substring of the given +length+ found in +self+ at offset +start+:
4989 * 'foo'[0, 2] # => "fo"
4990 * 'foo'[0, 0] # => ""
4992 * Counts backward from the end of +self+ if +start+ is negative:
4994 * 'foo'[-2, 2] # => "oo"
4996 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4998 * 'foo'[3, 2] # => ""
5000 * Returns +nil+ if +start+ is out of range:
5002 * 'foo'[4, 2] # => nil
5003 * 'foo'[-4, 2] # => nil
5005 * Returns the trailing substring of +self+ if +length+ is large:
5007 * 'foo'[1, 50] # => "oo"
5009 * Returns +nil+ if +length+ is negative:
5011 * 'foo'[0, -1] # => nil
5013 * When the single \Range argument +range+ is given,
5014 * derives +start+ and +length+ values from the given +range+,
5015 * and returns values as above:
5017 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5018 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5020 * When the \Regexp argument +regexp+ is given,
5021 * and the +capture+ argument is <tt>0</tt>,
5022 * returns the first matching substring found in +self+,
5023 * or +nil+ if none found:
5025 * 'foo'[/o/] # => "o"
5026 * 'foo'[/x/] # => nil
5027 * s = 'hello there'
5028 * s[/[aeiou](.)\1/] # => "ell"
5029 * s[/[aeiou](.)\1/, 0] # => "ell"
5031 * If argument +capture+ is given and not <tt>0</tt>,
5032 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5033 * the method call returns only the specified capture
5034 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5036 * s = 'hello there'
5037 * s[/[aeiou](.)\1/, 1] # => "l"
5038 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5039 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5041 * If an invalid capture group index is given, +nil+ is returned. If an invalid
5042 * capture group name is given, +IndexError+ is raised.
5044 * When the single \String argument +substring+ is given,
5045 * returns the substring from +self+ if found, otherwise +nil+:
5047 * 'foo'['oo'] # => "oo"
5048 * 'foo'['xx'] # => nil
5050 * String#slice is an alias for String#[].
5053 static VALUE
5054 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5056 if (argc == 2) {
5057 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5058 return rb_str_subpat(str, argv[0], argv[1]);
5060 else {
5061 long beg = NUM2LONG(argv[0]);
5062 long len = NUM2LONG(argv[1]);
5063 return rb_str_substr(str, beg, len);
5066 rb_check_arity(argc, 1, 2);
5067 return rb_str_aref(str, argv[0]);
/*
 * Removes the first +len+ bytes of +str+ in place and returns +str+.
 * +len+ is clamped to the current byte length, the byte after the new
 * content is set to 0, and the cached code range is cleared.
 */
5070 VALUE
5071 rb_str_drop_bytes(VALUE str, long len)
5073 char *ptr = RSTRING_PTR(str);
5074 long olen = RSTRING_LEN(str), nlen;
5076 str_modifiable(str);
5077 if (len > olen) len = olen;
5078 nlen = olen - len;
/* Case 1: the remaining bytes plus terminator fit in the embedded buffer,
 * so switch to embedded storage and move the tail there. */
5079 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5080 char *oldptr = ptr;
/* Capture the storage flags before STR_SET_EMBED is applied. */
5081 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5082 STR_SET_EMBED(str);
5083 STR_SET_EMBED_LEN(str, nlen);
5084 ptr = RSTRING(str)->as.embed.ary;
5085 memmove(ptr, oldptr + len, nlen);
/* Only free the old buffer when STR_NOEMBED was the sole flag set
 * (i.e. neither STR_SHARED nor STR_NOFREE). */
5086 if (fl == STR_NOEMBED) xfree(oldptr);
/* Case 2: stay on the heap and advance the data pointer instead of moving bytes. */
5088 else {
5089 if (!STR_SHARED_P(str)) {
/* NOTE(review): heap_str_make_shared presumably turns str into a sharer of
 * a new string holding the buffer, so that advancing as.heap.ptr is safe;
 * confirm against its definition. */
5090 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5091 rb_enc_cr_str_exact_copy(shared, str);
5092 OBJ_FREEZE(shared);
5094 ptr = RSTRING(str)->as.heap.ptr += len;
5095 RSTRING(str)->as.heap.len = nlen;
5097 ptr[nlen] = 0;
5098 ENC_CODERANGE_CLEAR(str);
5099 return str;
/*
 * Replaces the +len+ bytes of +str+ starting at byte offset +beg+ with the
 * bytes of +val+, in place.  +beg+ and +len+ are byte quantities here
 * (rb_str_update calls them "physical" position/length when it calls this).
 */
5102 static void
5103 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5105 char *sptr;
5106 long slen, vlen = RSTRING_LEN(val);
5107 int cr;
/* Deleting a prefix (empty replacement at offset 0) is delegated to
 * rb_str_drop_bytes. */
5109 if (beg == 0 && vlen == 0) {
5110 rb_str_drop_bytes(str, len);
5111 return;
5114 str_modify_keep_cr(str);
5115 RSTRING_GETMEM(str, sptr, slen);
5116 if (len < vlen) {
5117 /* expand string */
5118 RESIZE_CAPA(str, slen + vlen - len);
/* Re-read the pointer after RESIZE_CAPA. */
5119 sptr = RSTRING_PTR(str);
/* New code range: if str was 7-bit, use val's code range; otherwise
 * mark it unknown. */
5122 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5123 cr = rb_enc_str_coderange(val);
5124 else
5125 cr = ENC_CODERANGE_UNKNOWN;
/* Shift the tail (everything after the replaced span) to its new position. */
5127 if (vlen != len) {
5128 memmove(sptr + beg + vlen,
5129 sptr + beg + len,
5130 slen - (beg + len));
/* NOTE(review): only reachable with a negative len; the callers visible in
 * this file (rb_str_update, rb_str_subpat_set) do not appear to pass one. */
5132 if (vlen < beg && len < 0) {
5133 MEMZERO(sptr + slen, char, -len);
5135 if (vlen > 0) {
5136 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5138 slen += vlen - len;
5139 STR_SET_LEN(str, slen);
5140 TERM_FILL(&sptr[slen], TERM_LEN(str));
5141 ENC_CODERANGE_SET(str, cr);
5144 void
5145 rb_str_update(VALUE str, long beg, long len, VALUE val)
5147 long slen;
5148 char *p, *e;
5149 rb_encoding *enc;
5150 int singlebyte = single_byte_optimizable(str);
5151 int cr;
5153 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5155 StringValue(val);
5156 enc = rb_enc_check(str, val);
5157 slen = str_strlen(str, enc); /* rb_enc_check */
5159 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5160 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5162 if (beg < 0) {
5163 beg += slen;
5165 assert(beg >= 0);
5166 assert(beg <= slen);
5167 if (len > slen - beg) {
5168 len = slen - beg;
5170 str_modify_keep_cr(str);
5171 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5172 if (!p) p = RSTRING_END(str);
5173 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5174 if (!e) e = RSTRING_END(str);
5175 /* error check */
5176 beg = p - RSTRING_PTR(str); /* physical position */
5177 len = e - p; /* physical length */
5178 rb_str_splice_0(str, beg, len, val);
5179 rb_enc_associate(str, enc);
5180 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5181 if (cr != ENC_CODERANGE_BROKEN)
5182 ENC_CODERANGE_SET(str, cr);
5185 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5187 static void
5188 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5190 int nth;
5191 VALUE match;
5192 long start, end, len;
5193 rb_encoding *enc;
5194 struct re_registers *regs;
5196 if (rb_reg_search(re, str, 0, 0) < 0) {
5197 rb_raise(rb_eIndexError, "regexp not matched");
5199 match = rb_backref_get();
5200 nth = rb_reg_backref_number(match, backref);
5201 regs = RMATCH_REGS(match);
5202 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5203 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5205 if (nth < 0) {
5206 nth += regs->num_regs;
5209 start = BEG(nth);
5210 if (start == -1) {
5211 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5213 end = END(nth);
5214 len = end - start;
5215 StringValue(val);
5216 enc = rb_enc_check_str(str, val);
5217 rb_str_splice_0(str, start, len, val);
5218 rb_enc_associate(str, enc);
5221 static VALUE
5222 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5224 long idx, beg;
5226 switch (TYPE(indx)) {
5227 case T_REGEXP:
5228 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5229 return val;
5231 case T_STRING:
5232 beg = rb_str_index(str, indx, 0);
5233 if (beg < 0) {
5234 rb_raise(rb_eIndexError, "string not matched");
5236 beg = rb_str_sublen(str, beg);
5237 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5238 return val;
5240 default:
5241 /* check if indx is Range */
5243 long beg, len;
5244 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5245 rb_str_splice(str, beg, len, val);
5246 return val;
5249 /* FALLTHROUGH */
5251 case T_FIXNUM:
5252 idx = NUM2LONG(indx);
5253 rb_str_splice(str, idx, 1, val);
5254 return val;
5259 * call-seq:
5260 * str[integer] = new_str
5261 * str[integer, integer] = new_str
5262 * str[range] = new_str
5263 * str[regexp] = new_str
5264 * str[regexp, integer] = new_str
5265 * str[regexp, name] = new_str
5266 * str[other_str] = new_str
5268 * Element Assignment---Replaces some or all of the content of
5269 * <i>str</i>. The portion of the string affected is determined using
5270 * the same criteria as String#[]. If the replacement string is not
5271 * the same length as the text it is replacing, the string will be
5272 * adjusted accordingly. If the regular expression or string used
5273 * as the index doesn't match a position in the string, IndexError is
5274 * raised. If the regular expression form is used, the optional
5275 * second Integer allows you to specify which portion of the match to
5276 * replace (effectively using the MatchData indexing rules). The forms
5277 * that take an Integer will raise an IndexError if the value is out
5278 * of range; the Range form will raise a RangeError, and the Regexp
5279 * and String forms will raise an IndexError if there is no match.
5282 static VALUE
5283 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5285 if (argc == 3) {
5286 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5287 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5289 else {
5290 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5292 return argv[2];
5294 rb_check_arity(argc, 2, 3);
5295 return rb_str_aset(str, argv[0], argv[1]);
5299 * call-seq:
5300 * insert(index, other_string) -> self
5302 * Inserts the given +other_string+ into +self+; returns +self+.
5304 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5306 * 'foo'.insert(1, 'bar') # => "fbaroo"
5308 * If the \Integer +index+ is negative, counts backward from the end of +self+
5309 * and inserts +other_string+ at offset <tt>index+1</tt>
5310 * (that is, _after_ <tt>self[index]</tt>):
5312 * 'foo'.insert(-2, 'bar') # => "fobaro"
5316 static VALUE
5317 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5319 long pos = NUM2LONG(idx);
5321 if (pos == -1) {
5322 return rb_str_append(str, str2);
5324 else if (pos < 0) {
5325 pos++;
5327 rb_str_splice(str, pos, 0, str2);
5328 return str;
5333 * call-seq:
5334 * slice!(index) -> new_string or nil
5335 * slice!(start, length) -> new_string or nil
5336 * slice!(range) -> new_string or nil
5337 * slice!(regexp, capture = 0) -> new_string or nil
5338 * slice!(substring) -> new_string or nil
5340 * Removes the substring of +self+ specified by the arguments;
5341 * returns the removed substring.
5343 * See String#[] for details about the arguments that specify the substring.
5345 * A few examples:
5347 * string = "This is a string"
5348 * string.slice!(2) #=> "i"
5349 * string.slice!(3..6) #=> " is "
5350 * string.slice!(/s.*t/) #=> "sa st"
5351 * string.slice!("r") #=> "r"
5352 * string #=> "Thing"
5356 static VALUE
5357 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
/* Implementation outline: each argument form computes a byte offset `beg`
 * and byte length `len`, then jumps into a three-stage tail:
 *   num_index: convert a character-based beg/len via rb_str_subpos,
 *   subseq:    copy the selected bytes into `result`,
 *   squash:    remove those bytes from str.
 * Returns the removed string, or Qnil when nothing is selected. */
5359 VALUE result = Qnil;
5360 VALUE indx;
5361 long beg, len = 1;
5362 char *p;
5364 rb_check_arity(argc, 1, 2);
5365 str_modify_keep_cr(str);
5366 indx = argv[0];
/* Regexp form: BEG()/END() give the offsets of the selected group. */
5367 if (RB_TYPE_P(indx, T_REGEXP)) {
5368 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5369 VALUE match = rb_backref_get();
5370 struct re_registers *regs = RMATCH_REGS(match);
5371 int nth = 0;
/* A negative group number counts back from the last group; out-of-range
 * group numbers yield Qnil rather than an error. */
5372 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5373 if ((nth += regs->num_regs) <= 0) return Qnil;
5375 else if (nth >= regs->num_regs) return Qnil;
5376 beg = BEG(nth);
5377 len = END(nth) - beg;
5378 goto subseq;
/* (start, length) form. */
5380 else if (argc == 2) {
5381 beg = NUM2LONG(indx);
5382 len = NUM2LONG(argv[1]);
5383 goto num_index;
/* Single Fixnum: len keeps its initial value of 1 going into rb_str_subpos. */
5385 else if (FIXNUM_P(indx)) {
5386 beg = FIX2LONG(indx);
5387 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5388 if (!len) return Qnil;
5389 beg = p - RSTRING_PTR(str);
5390 goto subseq;
/* String form: the result is a duplicate of the argument itself, so the
 * subseq stage is skipped. */
5392 else if (RB_TYPE_P(indx, T_STRING)) {
5393 beg = rb_str_index(str, indx, 0);
5394 if (beg == -1) return Qnil;
5395 len = RSTRING_LEN(indx);
5396 result = str_duplicate(rb_cString, indx);
5397 goto squash;
/* Otherwise try a Range; Qfalse from rb_range_beg_len means "not a Range",
 * in which case the argument is converted with NUM2LONG. */
5399 else {
5400 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5401 case Qnil:
5402 return Qnil;
5403 case Qfalse:
5404 beg = NUM2LONG(indx);
5405 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5406 if (!len) return Qnil;
5407 beg = p - RSTRING_PTR(str);
5408 goto subseq;
5409 default:
5410 goto num_index;
5414 num_index:
5415 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5416 beg = p - RSTRING_PTR(str);
5418 subseq:
5419 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5420 rb_enc_cr_str_copy_for_substr(result, str);
5422 squash:
5423 if (len > 0) {
/* Removing a prefix is delegated to rb_str_drop_bytes. */
5424 if (beg == 0) {
5425 rb_str_drop_bytes(str, len);
/* Otherwise close the gap by moving the tail down and re-terminating. */
5427 else {
5428 char *sptr = RSTRING_PTR(str);
5429 long slen = RSTRING_LEN(str);
5430 if (beg + len > slen) /* pathological check */
5431 len = slen - beg;
5432 memmove(sptr + beg,
5433 sptr + beg + len,
5434 slen - (beg + len));
5435 slen -= len;
5436 STR_SET_LEN(str, slen);
5437 TERM_FILL(&sptr[slen], TERM_LEN(str));
5440 return result;
5443 static VALUE
5444 get_pat(VALUE pat)
5446 VALUE val;
5448 switch (OBJ_BUILTIN_TYPE(pat)) {
5449 case T_REGEXP:
5450 return pat;
5452 case T_STRING:
5453 break;
5455 default:
5456 val = rb_check_string_type(pat);
5457 if (NIL_P(val)) {
5458 Check_Type(pat, T_REGEXP);
5460 pat = val;
5463 return rb_reg_regcomp(pat);
5466 static VALUE
5467 get_pat_quoted(VALUE pat, int check)
5469 VALUE val;
5471 switch (OBJ_BUILTIN_TYPE(pat)) {
5472 case T_REGEXP:
5473 return pat;
5475 case T_STRING:
5476 break;
5478 default:
5479 val = rb_check_string_type(pat);
5480 if (NIL_P(val)) {
5481 Check_Type(pat, T_REGEXP);
5483 pat = val;
5485 if (check && is_broken_string(pat)) {
5486 rb_exc_raise(rb_reg_check_preprocess(pat));
5488 return pat;
5491 static long
5492 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5494 if (BUILTIN_TYPE(pat) == T_STRING) {
5495 pos = rb_strseq_index(str, pat, pos, 1);
5496 if (set_backref_str) {
5497 if (pos >= 0) {
5498 str = rb_str_new_frozen_String(str);
5499 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5501 else {
5502 rb_backref_set(Qnil);
5505 return pos;
5507 else {
5508 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5514 * call-seq:
5515 * sub!(pattern, replacement) -> self or nil
5516 * sub!(pattern) {|match| ... } -> self or nil
5518 * Returns +self+ with only the first occurrence
5519 * (not all occurrences) of the given +pattern+ replaced.
5521 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5523 * Related: String#sub, String#gsub, String#gsub!.
5527 static VALUE
5528 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
/* Replaces the first match of argv[0] in str, in place.  Returns str when
 * a match was found and Qnil otherwise.  With one argument the replacement
 * comes from the block (iter); with two it is either a Hash lookup (hash)
 * or a replacement String (repl). */
5530 VALUE pat, repl, hash = Qnil;
5531 int iter = 0;
5532 long plen;
/* A block makes the replacement argument optional. */
5533 int min_arity = rb_block_given_p() ? 1 : 2;
5534 long beg;
5536 rb_check_arity(argc, min_arity, 2);
5537 if (argc == 1) {
5538 iter = 1;
5540 else {
5541 repl = argv[1];
5542 hash = rb_check_hash_type(argv[1]);
5543 if (NIL_P(hash)) {
5544 StringValue(repl);
5548 pat = get_pat_quoted(argv[0], 1);
5550 str_modifiable(str);
5551 beg = rb_pat_search(pat, str, 0, 1);
5552 if (beg >= 0) {
5553 rb_encoding *enc;
5554 int cr = ENC_CODERANGE(str);
5555 long beg0, end0;
5556 VALUE match, match0 = Qnil;
5557 struct re_registers *regs;
5558 char *p, *rp;
5559 long len, rlen;
5561 match = rb_backref_get();
5562 regs = RMATCH_REGS(match);
/* [beg0, end0) is the byte span of the whole match. */
5563 if (RB_TYPE_P(pat, T_STRING)) {
5564 beg0 = beg;
5565 end0 = beg0 + RSTRING_LEN(pat);
5566 match0 = pat;
5568 else {
5569 beg0 = BEG(0);
5570 end0 = END(0);
5571 if (iter) match0 = rb_reg_nth_match(0, match);
/* Block and Hash replacements call back into other code (rb_yield /
 * rb_hash_aref), so the buffer pointer and length are snapshotted first and
 * re-checked afterwards with str_mod_check and rb_check_frozen. */
5574 if (iter || !NIL_P(hash)) {
5575 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5577 if (iter) {
5578 repl = rb_obj_as_string(rb_yield(match0));
5580 else {
5581 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5582 repl = rb_obj_as_string(repl);
5584 str_mod_check(str, p, len);
5585 rb_check_frozen(str);
5587 else {
5588 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
/* If the encodings are incompatible, the substitution is still allowed when
 * the parts of str before and after the match both scan as 7-bit; the
 * result then takes repl's encoding.  Otherwise raise EncCompatError. */
5591 enc = rb_enc_compatible(str, repl);
5592 if (!enc) {
5593 rb_encoding *str_enc = STR_ENC_GET(str);
5594 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5595 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5596 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5597 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5598 rb_enc_name(str_enc),
5599 rb_enc_name(STR_ENC_GET(repl)));
5601 enc = STR_ENC_GET(repl);
5603 rb_str_modify(str);
5604 rb_enc_associate(str, enc);
/* Merge code ranges only when str's own range (read before modification)
 * lies strictly between UNKNOWN and BROKEN. */
5605 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5606 int cr2 = ENC_CODERANGE(repl);
5607 if (cr2 == ENC_CODERANGE_BROKEN ||
5608 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5609 cr = ENC_CODERANGE_UNKNOWN;
5610 else
5611 cr = cr2;
/* Splice in place: grow if the replacement is longer than the match, shift
 * the tail, then copy the replacement bytes over the match position. */
5613 plen = end0 - beg0;
5614 rlen = RSTRING_LEN(repl);
5615 len = RSTRING_LEN(str);
5616 if (rlen > plen) {
5617 RESIZE_CAPA(str, len + rlen - plen);
5619 p = RSTRING_PTR(str);
5620 if (rlen != plen) {
5621 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5623 rp = RSTRING_PTR(repl);
5624 memmove(p + beg0, rp, rlen);
5625 len += rlen - plen;
5626 STR_SET_LEN(str, len);
5627 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5628 ENC_CODERANGE_SET(str, cr);
5630 return str;
5632 return Qnil;
5637 * call-seq:
5638 * sub(pattern, replacement) -> new_string
5639 * sub(pattern) {|match| ... } -> new_string
5641 * Returns a copy of +self+ with only the first occurrence
5642 * (not all occurrences) of the given +pattern+ replaced.
5644 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5646 * Related: String#sub!, String#gsub, String#gsub!.
5650 static VALUE
5651 rb_str_sub(int argc, VALUE *argv, VALUE str)
5653 str = str_duplicate(rb_cString, str);
5654 rb_str_sub_bang(argc, argv, str);
5655 return str;
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).  Builds the result in a separate buffer (dest) and either
 * returns it or replaces str's contents with it.  When nothing matches,
 * returns Qnil for the bang form and a duplicate of str otherwise.
 */
5658 static VALUE
5659 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5661 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5662 struct re_registers *regs;
5663 long beg, beg0, end0;
5664 long offset, blen, slen, len, last;
/* STR: replacement string; ITER: block; MAP: Hash lookup.  STR is 0, so
 * `if (mode)` below means "ITER or MAP". */
5665 enum {STR, ITER, MAP} mode = STR;
5666 char *sp, *cp;
/* Tri-state: -1 = not yet known, then set from the first rb_reg_regsub
 * result (non-zero when it returned something other than repl itself). */
5667 int need_backref = -1;
5668 rb_encoding *str_enc;
5670 switch (argc) {
5671 case 1:
5672 RETURN_ENUMERATOR(str, argc, argv);
5673 mode = ITER;
5674 break;
5675 case 2:
5676 repl = argv[1];
5677 hash = rb_check_hash_type(argv[1]);
5678 if (NIL_P(hash)) {
5679 StringValue(repl);
5681 else {
5682 mode = MAP;
5684 break;
5685 default:
5686 rb_error_arity(argc, 1, 2);
5689 pat = get_pat_quoted(argv[0], 1);
5690 beg = rb_pat_search(pat, str, 0, need_backref);
5691 if (beg < 0) {
5692 if (bang) return Qnil; /* no match, no substitution */
5693 return str_duplicate(rb_cString, str);
5696 offset = 0;
5697 blen = RSTRING_LEN(str) + 30; /* len + margin */
5698 dest = rb_str_buf_new(blen);
/* Snapshot pointer/length so str_mod_check can be applied after callbacks. */
5699 sp = RSTRING_PTR(str);
5700 slen = RSTRING_LEN(str);
5701 cp = sp;
5702 str_enc = STR_ENC_GET(str);
5703 rb_enc_associate(dest, str_enc);
5704 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
/* One iteration per match: append the unmatched text before the match,
 * then the replacement, then advance `offset` past the match. */
5706 do {
5707 match = rb_backref_get();
5708 regs = RMATCH_REGS(match);
5709 if (RB_TYPE_P(pat, T_STRING)) {
5710 beg0 = beg;
5711 end0 = beg0 + RSTRING_LEN(pat);
5712 match0 = pat;
5714 else {
5715 beg0 = BEG(0);
5716 end0 = END(0);
5717 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5720 if (mode) {
5721 if (mode == ITER) {
5722 val = rb_obj_as_string(rb_yield(match0));
5724 else {
5725 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5726 val = rb_obj_as_string(val);
5728 str_mod_check(str, sp, slen);
5729 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5730 rb_raise(rb_eRuntimeError, "block should not cheat");
5733 else if (need_backref) {
5734 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5735 if (need_backref < 0) {
5736 need_backref = val != repl;
/* need_backref == 0: the first rb_reg_regsub call returned repl itself,
 * so repl is reused as is. */
5739 else {
5740 val = repl;
5743 len = beg0 - offset; /* copy pre-match substr */
5744 if (len) {
5745 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5748 rb_str_buf_append(dest, val);
/* `last` remembers where this search started; it is used for the final
 * rb_pat_search after the loop. */
5750 last = offset;
5751 offset = end0;
5752 if (beg0 == end0) {
5754 * Always consume at least one character of the input string
5755 * in order to prevent infinite loops.
5757 if (RSTRING_LEN(str) <= end0) break;
5758 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5759 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5760 offset = end0 + len;
5762 cp = RSTRING_PTR(str) + offset;
5763 if (offset > RSTRING_LEN(str)) break;
5764 beg = rb_pat_search(pat, str, offset, need_backref);
5765 } while (beg >= 0);
/* Append whatever follows the last match. */
5766 if (RSTRING_LEN(str) > offset) {
5767 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
/* Repeat the last successful search with backref setting forced on.
 * NOTE(review): presumably so the backref reflects the final match even
 * when need_backref was 0 during the loop; confirm. */
5769 rb_pat_search(pat, str, last, 1);
5770 if (bang) {
5771 str_shared_replace(str, dest);
5773 else {
5774 str = dest;
5777 return str;
5782 * call-seq:
5783 * gsub!(pattern, replacement) -> self or nil
5784 * gsub!(pattern) {|match| ... } -> self or nil
5785 * gsub!(pattern) -> an_enumerator
5787 * Performs the specified substring replacement(s) on +self+;
5788 * returns +self+ if any replacement occurred, +nil+ otherwise.
5790 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5792 * Returns an Enumerator if no +replacement+ and no block given.
5794 * Related: String#sub, String#gsub, String#sub!.
5798 static VALUE
5799 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5801 str_modify_keep_cr(str);
5802 return str_gsub(argc, argv, str, 1);
5807 * call-seq:
5808 * gsub(pattern, replacement) -> new_string
5809 * gsub(pattern) {|match| ... } -> new_string
5810 * gsub(pattern) -> enumerator
5812 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5814 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5816 * Returns an Enumerator if no +replacement+ and no block given.
5818 * Related: String#sub, String#sub!, String#gsub!.
5822 static VALUE
5823 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5825 return str_gsub(argc, argv, str, 0);
5830 * call-seq:
5831 * replace(other_string) -> self
5833 * Replaces the contents of +self+ with the contents of +other_string+:
5835 * s = 'foo' # => "foo"
5836 * s.replace('bar') # => "bar"
5840 VALUE
5841 rb_str_replace(VALUE str, VALUE str2)
5843 str_modifiable(str);
5844 if (str == str2) return str;
5846 StringValue(str2);
5847 str_discard(str);
5848 return str_replace(str, str2);
5852 * call-seq:
5853 * clear -> self
5855 * Removes the contents of +self+:
5857 * s = 'foo' # => "foo"
5858 * s.clear # => ""
5862 static VALUE
5863 rb_str_clear(VALUE str)
5865 str_discard(str);
5866 STR_SET_EMBED(str);
5867 STR_SET_EMBED_LEN(str, 0);
5868 RSTRING_PTR(str)[0] = 0;
5869 if (rb_enc_asciicompat(STR_ENC_GET(str)))
5870 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5871 else
5872 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5873 return str;
5877 * call-seq:
5878 * chr -> string
5880 * Returns a string containing the first character of +self+:
5882 * s = 'foo' # => "foo"
5883 * s.chr # => "f"
5887 static VALUE
5888 rb_str_chr(VALUE str)
5890 return rb_str_substr(str, 0, 1);
5894 * call-seq:
5895 * getbyte(index) -> integer
5897 * Returns the byte at zero-based +index+ as an integer:
5899 * s = 'abcde' # => "abcde"
5900 * s.getbyte(0) # => 97
5901 * s.getbyte(1) # => 98
5903 * Related: String#setbyte.
5905 static VALUE
5906 rb_str_getbyte(VALUE str, VALUE index)
5908 long pos = NUM2LONG(index);
5910 if (pos < 0)
5911 pos += RSTRING_LEN(str);
5912 if (pos < 0 || RSTRING_LEN(str) <= pos)
5913 return Qnil;
5915 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5919 * call-seq:
5920 * setbyte(index, integer) -> integer
5922 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5924 * s = 'abcde' # => "abcde"
5925 * s.setbyte(0, 98) # => 98
5926 * s # => "bbcde"
5928 * Related: String#getbyte.
5930 static VALUE
5931 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
/* Stores the low 8 bits of +value+ at byte +index+ (negative counts from
 * the end; out of range raises IndexError) and returns +value+ unchanged.
 * Most of the body keeps the cached code range consistent with the write. */
5933 long pos = NUM2LONG(index);
5934 long len = RSTRING_LEN(str);
5935 char *ptr, *head, *left = 0;
5936 rb_encoding *enc;
5937 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5939 if (pos < -len || len <= pos)
5940 rb_raise(rb_eIndexError, "index %ld out of string", pos);
5941 if (pos < 0)
5942 pos += len;
/* Mask with 0xff on the Ruby integer first (rb_int_and), then narrow. */
5944 VALUE v = rb_to_int(value);
5945 VALUE w = rb_int_and(v, INT2FIX(0xff));
5946 char byte = (char)(NUM2INT(w) & 0xFF);
5948 if (!str_independent(str))
5949 str_make_independent(str);
5950 enc = STR_ENC_GET(str);
5951 head = RSTRING_PTR(str);
5952 ptr = &head[pos];
/* The incremental code-range update is only attempted for non-embedded
 * strings; embedded strings just get their code range cleared below. */
5953 if (!STR_EMBED_P(str)) {
5954 cr = ENC_CODERANGE(str);
5955 switch (cr) {
/* Was 7-bit: an ASCII byte keeps it 7-bit; otherwise re-check the single
 * character starting at the written byte. */
5956 case ENC_CODERANGE_7BIT:
5957 left = ptr;
5958 *ptr = byte;
5959 if (ISASCII(byte)) goto end;
5960 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5961 if (!MBCLEN_CHARFOUND_P(nlen))
5962 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5963 else
5964 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5965 goto end;
/* Was valid: compare the length of the character containing the byte
 * before and after the write. */
5966 case ENC_CODERANGE_VALID:
5967 left = rb_enc_left_char_head(head, ptr, head+len, enc);
5968 width = rb_enc_precise_mbclen(left, head+len, enc);
5969 *ptr = byte;
5970 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5971 if (!MBCLEN_CHARFOUND_P(nlen))
5972 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5973 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5974 ENC_CODERANGE_CLEAR(str);
5975 goto end;
/* Any other case: forget the code range and write the byte. */
5978 ENC_CODERANGE_CLEAR(str);
5979 *ptr = byte;
5981 end:
5982 return value;
5985 static VALUE
5986 str_byte_substr(VALUE str, long beg, long len, int empty)
5988 char *p, *s = RSTRING_PTR(str);
5989 long n = RSTRING_LEN(str);
5990 VALUE str2;
5992 if (beg > n || len < 0) return Qnil;
5993 if (beg < 0) {
5994 beg += n;
5995 if (beg < 0) return Qnil;
5997 if (len > n - beg)
5998 len = n - beg;
5999 if (len <= 0) {
6000 if (!empty) return Qnil;
6001 len = 0;
6002 p = 0;
6004 else
6005 p = s + beg;
6007 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
6008 str2 = rb_str_new_frozen(str);
6009 str2 = str_new_shared(rb_cString, str2);
6010 RSTRING(str2)->as.heap.ptr += beg;
6011 RSTRING(str2)->as.heap.len = len;
6013 else {
6014 str2 = rb_str_new(p, len);
6017 str_enc_copy(str2, str);
6019 if (RSTRING_LEN(str2) == 0) {
6020 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6021 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6022 else
6023 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6025 else {
6026 switch (ENC_CODERANGE(str)) {
6027 case ENC_CODERANGE_7BIT:
6028 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6029 break;
6030 default:
6031 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6032 break;
6036 return str2;
6039 static VALUE
6040 str_byte_aref(VALUE str, VALUE indx)
6042 long idx;
6043 if (FIXNUM_P(indx)) {
6044 idx = FIX2LONG(indx);
6046 else {
6047 /* check if indx is Range */
6048 long beg, len = RSTRING_LEN(str);
6050 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6051 case Qfalse:
6052 break;
6053 case Qnil:
6054 return Qnil;
6055 default:
6056 return str_byte_substr(str, beg, len, TRUE);
6059 idx = NUM2LONG(indx);
6061 return str_byte_substr(str, idx, 1, FALSE);
6065 * call-seq:
6066 * byteslice(index, length = 1) -> string or nil
6067 * byteslice(range) -> string or nil
6069 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6071 * With integer arguments +index+ and +length+ given,
6072 * returns the substring beginning at the given +index+
6073 * of the given +length+ (if possible),
6074 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6076 * s = '0123456789' # => "0123456789"
6077 * s.byteslice(2) # => "2"
6078 * s.byteslice(200) # => nil
6079 * s.byteslice(4, 3) # => "456"
6080 * s.byteslice(4, 30) # => "456789"
6081 * s.byteslice(4, -1) # => nil
6082 * s.byteslice(40, 2) # => nil
6084 * In either case above, counts backwards from the end of +self+
6085 * if +index+ is negative:
6087 * s = '0123456789' # => "0123456789"
6088 * s.byteslice(-4) # => "6"
6089 * s.byteslice(-4, 3) # => "678"
6091 * With Range argument +range+ given, returns
6092 * <tt>byteslice(range.begin, range.size)</tt>:
6094 * s = '0123456789' # => "0123456789"
6095 * s.byteslice(4..6) # => "456"
6096 * s.byteslice(-6..-4) # => "456"
6097 * s.byteslice(5..2) # => "" # range.size is zero.
6098 * s.byteslice(40..42) # => nil
6100 * In all cases, a returned string has the same encoding as +self+:
6102 * s.encoding # => #<Encoding:UTF-8>
6103 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6107 static VALUE
6108 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6110 if (argc == 2) {
6111 long beg = NUM2LONG(argv[0]);
6112 long end = NUM2LONG(argv[1]);
6113 return str_byte_substr(str, beg, end, TRUE);
6115 rb_check_arity(argc, 1, 2);
6116 return str_byte_aref(str, argv[0]);
6120 * call-seq:
6121 * reverse -> string
6123 * Returns a new string with the characters from +self+ in reverse order.
6125 * 'stressed'.reverse # => "desserts"
6129 static VALUE
6130 rb_str_reverse(VALUE str)
6132 rb_encoding *enc;
6133 VALUE rev;
6134 char *s, *e, *p;
6135 int cr;
6137 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6138 enc = STR_ENC_GET(str);
6139 rev = rb_str_new(0, RSTRING_LEN(str));
6140 s = RSTRING_PTR(str); e = RSTRING_END(str);
6141 p = RSTRING_END(rev);
6142 cr = ENC_CODERANGE(str);
6144 if (RSTRING_LEN(str) > 1) {
6145 if (single_byte_optimizable(str)) {
6146 while (s < e) {
6147 *--p = *s++;
6150 else if (cr == ENC_CODERANGE_VALID) {
6151 while (s < e) {
6152 int clen = rb_enc_fast_mbclen(s, e, enc);
6154 p -= clen;
6155 memcpy(p, s, clen);
6156 s += clen;
6159 else {
6160 cr = rb_enc_asciicompat(enc) ?
6161 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6162 while (s < e) {
6163 int clen = rb_enc_mbclen(s, e, enc);
6165 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6166 p -= clen;
6167 memcpy(p, s, clen);
6168 s += clen;
6172 STR_SET_LEN(rev, RSTRING_LEN(str));
6173 str_enc_copy(rev, str);
6174 ENC_CODERANGE_SET(rev, cr);
6176 return rev;
6181 * call-seq:
6182 * reverse! -> self
6184 * Returns +self+ with its characters reversed:
6186 * s = 'stressed'
6187 * s.reverse! # => "desserts"
6188 * s # => "desserts"
6192 static VALUE
6193 rb_str_reverse_bang(VALUE str)
6195 if (RSTRING_LEN(str) > 1) {
6196 if (single_byte_optimizable(str)) {
6197 char *s, *e, c;
6199 str_modify_keep_cr(str);
6200 s = RSTRING_PTR(str);
6201 e = RSTRING_END(str) - 1;
6202 while (s < e) {
6203 c = *s;
6204 *s++ = *e;
6205 *e-- = c;
6208 else {
6209 str_shared_replace(str, rb_str_reverse(str));
6212 else {
6213 str_modify_keep_cr(str);
6215 return str;
6220 * call-seq:
6221 * include? other_string -> true or false
6223 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6225 * s = 'foo'
6226 * s.include?('f') # => true
6227 * s.include?('fo') # => true
6228 * s.include?('food') # => false
6232 static VALUE
6233 rb_str_include(VALUE str, VALUE arg)
6235 long i;
6237 StringValue(arg);
6238 i = rb_str_index(str, arg, 0);
6240 return RBOOL(i != -1);
6245 * call-seq:
6246 * to_i(base = 10) -> integer
6248 * Returns the result of interpreting leading characters in +self+
6249 * as an integer in the given +base+ (which must be in (2..36)):
6251 * '123456'.to_i # => 123456
6252 * '123def'.to_i(16) # => 1195503
6254 * Characters past a leading valid number (in the given +base+) are ignored:
6256 * '12.345'.to_i # => 12
6257 * '12345'.to_i(2) # => 1
6259 * Returns zero if there is no leading valid number:
6261 * 'abcdef'.to_i # => 0
6262 * '2'.to_i(2) # => 0
6266 static VALUE
6267 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6269 int base = 10;
6271 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6272 rb_raise(rb_eArgError, "invalid radix %d", base);
6274 return rb_str_to_inum(str, base, FALSE);
6279 * call-seq:
6280 * to_f -> float
6282 * Returns the result of interpreting leading characters in +self+ as a Float:
6284 * '3.14159'.to_f # => 3.14159
6285 * '1.234e-2'.to_f # => 0.01234
6287 * Characters past a leading valid number are ignored:
6289 * '3.14 (pi to two places)'.to_f # => 3.14
6291 * Returns zero if there is no leading valid number:
6293 * 'abcdef'.to_f # => 0.0
6297 static VALUE
6298 rb_str_to_f(VALUE str)
6300 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6305 * call-seq:
6306 * to_s -> self or string
6308 * Returns +self+ if +self+ is a \String,
6309 * or +self+ converted to a \String if +self+ is a subclass of \String.
6311 * String#to_str is an alias for String#to_s.
6315 static VALUE
6316 rb_str_to_s(VALUE str)
6318 if (rb_obj_class(str) != rb_cString) {
6319 return str_duplicate(rb_cString, str);
6321 return str;
/* Compiled out (#if 0): appends the single character with code point c,
 * encoded in enc, to str. */
6324 #if 0
6325 static void
6326 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
6328 char s[RUBY_MAX_CHAR_LEN];
/* n is the encoded length of c in enc; s receives the encoded bytes. */
6329 int n = rb_enc_codelen(c, enc);
6331 rb_enc_mbcput(c, s, enc);
6332 rb_enc_str_buf_cat(str, s, n, enc);
6334 #endif
6336 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6339 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6341 char buf[CHAR_ESC_LEN + 1];
6342 int l;
6344 #if SIZEOF_INT > 4
6345 c &= 0xffffffff;
6346 #endif
6347 if (unicode_p) {
6348 if (c < 0x7F && ISPRINT(c)) {
6349 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6351 else if (c < 0x10000) {
6352 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6354 else {
6355 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6358 else {
6359 if (c < 0x100) {
6360 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6362 else {
6363 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6366 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6367 rb_str_buf_cat(result, buf, l);
6368 return l;
/*
 * Returns the backslash escape sequence used for the control character
 * +c+, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].escape;
        }
    }
    return NULL;
}
/*
 * Returns a new US-ASCII string holding an escaped copy of +str+.
 * Bytes that do not form a character become \xXX; characters known to
 * ruby_escaped_char use that escape; printable ASCII is copied through;
 * everything else goes through rb_str_buf_cat_escaped_char.
 */
6389 VALUE
6390 rb_str_escape(VALUE str)
6392 int encidx = ENCODING_GET(str);
6393 rb_encoding *enc = rb_enc_from_index(encidx);
6394 const char *p = RSTRING_PTR(str);
6395 const char *pend = RSTRING_END(str);
/* [prev, p) is the pending run of bytes that can be copied unescaped; it
 * is flushed to result whenever something needs escaping. */
6396 const char *prev = p;
6397 char buf[CHAR_ESC_LEN + 1];
6398 VALUE result = rb_str_buf_new(0);
6399 int unicode_p = rb_enc_unicode_p(enc);
6400 int asciicompat = rb_enc_asciicompat(enc);
6402 while (p < pend) {
6403 unsigned int c;
6404 const char *cc;
6405 int n = rb_enc_precise_mbclen(p, pend, enc);
/* No character found here: flush the pending run and emit up to
 * rb_enc_mbminlen(enc) bytes (bounded by the end of the string) as \xXX. */
6406 if (!MBCLEN_CHARFOUND_P(n)) {
6407 if (p > prev) str_buf_cat(result, prev, p - prev);
6408 n = rb_enc_mbminlen(enc);
6409 if (pend < p + n)
6410 n = (int)(pend - p);
6411 while (n--) {
6412 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6413 str_buf_cat(result, buf, strlen(buf));
6414 prev = ++p;
6416 continue;
6418 n = MBCLEN_CHARFOUND_LEN(n);
6419 c = rb_enc_mbc_to_codepoint(p, pend, enc);
/* p now points past the character; p - n is its first byte. */
6420 p += n;
6421 cc = ruby_escaped_char(c);
6422 if (cc) {
6423 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6424 str_buf_cat(result, cc, strlen(cc));
6425 prev = p;
/* Printable ASCII: nothing to do, the character stays in the pending run. */
6427 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6429 else {
6430 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6431 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6432 prev = p;
/* Flush whatever unescaped run remains. */
6435 if (p > prev) str_buf_cat(result, prev, p - prev);
6436 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6438 return result;
6442 * call-seq:
6443 * inspect -> string
6445 * Returns a printable version of +self+, enclosed in double-quotes,
6446 * and with special characters escaped:
6448 * s = "foo\tbar\tbaz\n"
6449 * # => "foo\tbar\tbaz\n"
6450 * s.inspect
6451 * # => "\"foo\\tbar\\tbaz\\n\""
6455 VALUE
6456 rb_str_inspect(VALUE str)
6458 int encidx = ENCODING_GET(str);
6459 rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
6460 const char *p, *pend, *prev;
6461 char buf[CHAR_ESC_LEN + 1];
6462 VALUE result = rb_str_buf_new(0);
6463 rb_encoding *resenc = rb_default_internal_encoding();
6464 int unicode_p = rb_enc_unicode_p(enc);
6465 int asciicompat = rb_enc_asciicompat(enc);
6467 if (resenc == NULL) resenc = rb_default_external_encoding();
6468 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6469 rb_enc_associate(result, resenc);
6470 str_buf_cat2(result, "\"");
6472 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6473 prev = p;
6474 actenc = get_actual_encoding(encidx, str);
6475 if (actenc != enc) {
6476 enc = actenc;
6477 if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
6479 while (p < pend) {
6480 unsigned int c, cc;
6481 int n;
6483 n = rb_enc_precise_mbclen(p, pend, enc);
6484 if (!MBCLEN_CHARFOUND_P(n)) {
6485 if (p > prev) str_buf_cat(result, prev, p - prev);
6486 n = rb_enc_mbminlen(enc);
6487 if (pend < p + n)
6488 n = (int)(pend - p);
6489 while (n--) {
6490 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6491 str_buf_cat(result, buf, strlen(buf));
6492 prev = ++p;
6494 continue;
6496 n = MBCLEN_CHARFOUND_LEN(n);
6497 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6498 p += n;
6499 if ((asciicompat || unicode_p) &&
6500 (c == '"'|| c == '\\' ||
6501 (c == '#' &&
6502 p < pend &&
6503 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6504 (cc = rb_enc_codepoint(p,pend,enc),
6505 (cc == '$' || cc == '@' || cc == '{'))))) {
6506 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6507 str_buf_cat2(result, "\\");
6508 if (asciicompat || enc == resenc) {
6509 prev = p - n;
6510 continue;
6513 switch (c) {
6514 case '\n': cc = 'n'; break;
6515 case '\r': cc = 'r'; break;
6516 case '\t': cc = 't'; break;
6517 case '\f': cc = 'f'; break;
6518 case '\013': cc = 'v'; break;
6519 case '\010': cc = 'b'; break;
6520 case '\007': cc = 'a'; break;
6521 case 033: cc = 'e'; break;
6522 default: cc = 0; break;
6524 if (cc) {
6525 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6526 buf[0] = '\\';
6527 buf[1] = (char)cc;
6528 str_buf_cat(result, buf, 2);
6529 prev = p;
6530 continue;
6532 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6533 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6534 continue;
6536 else {
6537 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6538 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6539 prev = p;
6540 continue;
6543 if (p > prev) str_buf_cat(result, prev, p - prev);
6544 str_buf_cat2(result, "\"");
6546 return result;
6549 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6552 * call-seq:
6553 * dump -> string
6555 * Returns a printable version of +self+, enclosed in double-quotes,
6556 * with special characters escaped, and with non-printing characters
6557 * replaced by hexadecimal notation:
6559 * "hello \n ''".dump # => "\"hello \\n ''\""
6560 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6562 * Related: String#undump (inverse of String#dump).
6566 VALUE
6567 rb_str_dump(VALUE str)
6569 int encidx = rb_enc_get_index(str);
6570 rb_encoding *enc = rb_enc_from_index(encidx);
6571 long len;
6572 const char *p, *pend;
6573 char *q, *qend;
6574 VALUE result;
6575 int u8 = (encidx == rb_utf8_encindex());
6576 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6578 len = 2; /* "" */
6579 if (!rb_enc_asciicompat(enc)) {
6580 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6581 len += strlen(enc->name);
6584 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6585 while (p < pend) {
6586 int clen;
6587 unsigned char c = *p++;
6589 switch (c) {
6590 case '"': case '\\':
6591 case '\n': case '\r':
6592 case '\t': case '\f':
6593 case '\013': case '\010': case '\007': case '\033':
6594 clen = 2;
6595 break;
6597 case '#':
6598 clen = IS_EVSTR(p, pend) ? 2 : 1;
6599 break;
6601 default:
6602 if (ISPRINT(c)) {
6603 clen = 1;
6605 else {
6606 if (u8 && c > 0x7F) { /* \u notation */
6607 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6608 if (MBCLEN_CHARFOUND_P(n)) {
6609 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6610 if (cc <= 0xFFFF)
6611 clen = 6; /* \uXXXX */
6612 else if (cc <= 0xFFFFF)
6613 clen = 9; /* \u{XXXXX} */
6614 else
6615 clen = 10; /* \u{XXXXXX} */
6616 p += MBCLEN_CHARFOUND_LEN(n)-1;
6617 break;
6620 clen = 4; /* \xNN */
6622 break;
6625 if (clen > LONG_MAX - len) {
6626 rb_raise(rb_eRuntimeError, "string size too big");
6628 len += clen;
6631 result = rb_str_new(0, len);
6632 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6633 q = RSTRING_PTR(result); qend = q + len + 1;
6635 *q++ = '"';
6636 while (p < pend) {
6637 unsigned char c = *p++;
6639 if (c == '"' || c == '\\') {
6640 *q++ = '\\';
6641 *q++ = c;
6643 else if (c == '#') {
6644 if (IS_EVSTR(p, pend)) *q++ = '\\';
6645 *q++ = '#';
6647 else if (c == '\n') {
6648 *q++ = '\\';
6649 *q++ = 'n';
6651 else if (c == '\r') {
6652 *q++ = '\\';
6653 *q++ = 'r';
6655 else if (c == '\t') {
6656 *q++ = '\\';
6657 *q++ = 't';
6659 else if (c == '\f') {
6660 *q++ = '\\';
6661 *q++ = 'f';
6663 else if (c == '\013') {
6664 *q++ = '\\';
6665 *q++ = 'v';
6667 else if (c == '\010') {
6668 *q++ = '\\';
6669 *q++ = 'b';
6671 else if (c == '\007') {
6672 *q++ = '\\';
6673 *q++ = 'a';
6675 else if (c == '\033') {
6676 *q++ = '\\';
6677 *q++ = 'e';
6679 else if (ISPRINT(c)) {
6680 *q++ = c;
6682 else {
6683 *q++ = '\\';
6684 if (u8) {
6685 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6686 if (MBCLEN_CHARFOUND_P(n)) {
6687 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6688 p += n;
6689 if (cc <= 0xFFFF)
6690 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6691 else
6692 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6693 q += strlen(q);
6694 continue;
6697 snprintf(q, qend-q, "x%02X", c);
6698 q += 3;
6701 *q++ = '"';
6702 *q = '\0';
6703 if (!rb_enc_asciicompat(enc)) {
6704 snprintf(q, qend-q, nonascii_suffix, enc->name);
6705 encidx = rb_ascii8bit_encindex();
6707 /* result from dump is ASCII */
6708 rb_enc_associate_index(result, encidx);
6709 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
6710 return result;
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it denotes.  Handles only n, r, t, f, v, b, a and e;
 * any other value reaches UNREACHABLE_RETURN(-1).
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\013';
      case 'b': return '\010';
      case 'a': return '\007';
      case 'e': return '\033';
    }
    UNREACHABLE_RETURN(-1);
}
6737 static void
6738 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6740 const char *s = *ss;
6741 unsigned int c;
6742 int codelen;
6743 size_t hexlen;
6744 unsigned char buf[6];
6745 static rb_encoding *enc_utf8 = NULL;
6747 switch (*s) {
6748 case '\\':
6749 case '"':
6750 case '#':
6751 rb_str_cat(undumped, s, 1); /* cat itself */
6752 s++;
6753 break;
6754 case 'n':
6755 case 'r':
6756 case 't':
6757 case 'f':
6758 case 'v':
6759 case 'b':
6760 case 'a':
6761 case 'e':
6762 *buf = unescape_ascii(*s);
6763 rb_str_cat(undumped, (char *)buf, 1);
6764 s++;
6765 break;
6766 case 'u':
6767 if (*binary) {
6768 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6770 *utf8 = true;
6771 if (++s >= s_end) {
6772 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6774 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6775 if (*penc != enc_utf8) {
6776 *penc = enc_utf8;
6777 rb_enc_associate(undumped, enc_utf8);
6779 if (*s == '{') { /* handle \u{...} form */
6780 s++;
6781 for (;;) {
6782 if (s >= s_end) {
6783 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6785 if (*s == '}') {
6786 s++;
6787 break;
6789 if (ISSPACE(*s)) {
6790 s++;
6791 continue;
6793 c = scan_hex(s, s_end-s, &hexlen);
6794 if (hexlen == 0 || hexlen > 6) {
6795 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6797 if (c > 0x10ffff) {
6798 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6800 if (0xd800 <= c && c <= 0xdfff) {
6801 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6803 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6804 rb_str_cat(undumped, (char *)buf, codelen);
6805 s += hexlen;
6808 else { /* handle \uXXXX form */
6809 c = scan_hex(s, 4, &hexlen);
6810 if (hexlen != 4) {
6811 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6813 if (0xd800 <= c && c <= 0xdfff) {
6814 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6816 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6817 rb_str_cat(undumped, (char *)buf, codelen);
6818 s += hexlen;
6820 break;
6821 case 'x':
6822 if (*utf8) {
6823 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6825 *binary = true;
6826 if (++s >= s_end) {
6827 rb_raise(rb_eRuntimeError, "invalid hex escape");
6829 *buf = scan_hex(s, 2, &hexlen);
6830 if (hexlen != 2) {
6831 rb_raise(rb_eRuntimeError, "invalid hex escape");
6833 rb_str_cat(undumped, (char *)buf, 1);
6834 s += hexlen;
6835 break;
6836 default:
6837 rb_str_cat(undumped, s-1, 2);
6838 s++;
6841 *ss = s;
6844 static VALUE rb_str_is_ascii_only_p(VALUE str);
6847 * call-seq:
6848 * undump -> string
6850 * Returns an unescaped version of +self+:
6852 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6853 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6854 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6855 * s_undumped == s_orig # => true
6857 * Related: String#dump (inverse of String#undump).
6861 static VALUE
6862 str_undump(VALUE str)
6864 const char *s = RSTRING_PTR(str);
6865 const char *s_end = RSTRING_END(str);
6866 rb_encoding *enc = rb_enc_get(str);
6867 VALUE undumped = rb_enc_str_new(s, 0L, enc);
6868 bool utf8 = false;
6869 bool binary = false;
6870 int w;
6872 rb_must_asciicompat(str);
6873 if (rb_str_is_ascii_only_p(str) == Qfalse) {
6874 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6876 if (!str_null_check(str, &w)) {
6877 rb_raise(rb_eRuntimeError, "string contains null byte");
6879 if (RSTRING_LEN(str) < 2) goto invalid_format;
6880 if (*s != '"') goto invalid_format;
6882 /* strip '"' at the start */
6883 s++;
6885 for (;;) {
6886 if (s >= s_end) {
6887 rb_raise(rb_eRuntimeError, "unterminated dumped string");
6890 if (*s == '"') {
6891 /* epilogue */
6892 s++;
6893 if (s == s_end) {
6894 /* ascii compatible dumped string */
6895 break;
6897 else {
6898 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6899 static const char dup_suffix[] = ".dup";
6900 const char *encname;
6901 int encidx;
6902 ptrdiff_t size;
6904 /* check separately for strings dumped by older versions */
6905 size = sizeof(dup_suffix) - 1;
6906 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6908 size = sizeof(force_encoding_suffix) - 1;
6909 if (s_end - s <= size) goto invalid_format;
6910 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6911 s += size;
6913 if (utf8) {
6914 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6917 encname = s;
6918 s = memchr(s, '"', s_end-s);
6919 size = s - encname;
6920 if (!s) goto invalid_format;
6921 if (s_end - s != 2) goto invalid_format;
6922 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6924 encidx = rb_enc_find_index2(encname, (long)size);
6925 if (encidx < 0) {
6926 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6928 rb_enc_associate_index(undumped, encidx);
6930 break;
6933 if (*s == '\\') {
6934 s++;
6935 if (s >= s_end) {
6936 rb_raise(rb_eRuntimeError, "invalid escape");
6938 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6940 else {
6941 rb_str_cat(undumped, s++, 1);
6945 return undumped;
6946 invalid_format:
6947 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6950 static void
6951 rb_str_check_dummy_enc(rb_encoding *enc)
6953 if (rb_enc_dummy_p(enc)) {
6954 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6955 rb_enc_name(enc));
6959 static rb_encoding *
6960 str_true_enc(VALUE str)
6962 rb_encoding *enc = STR_ENC_GET(str);
6963 rb_str_check_dummy_enc(enc);
6964 return enc;
6967 static OnigCaseFoldType
6968 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6970 if (argc==0)
6971 return flags;
6972 if (argc>2)
6973 rb_raise(rb_eArgError, "too many options");
6974 if (argv[0]==sym_turkic) {
6975 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6976 if (argc==2) {
6977 if (argv[1]==sym_lithuanian)
6978 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6979 else
6980 rb_raise(rb_eArgError, "invalid second option");
6983 else if (argv[0]==sym_lithuanian) {
6984 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6985 if (argc==2) {
6986 if (argv[1]==sym_turkic)
6987 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6988 else
6989 rb_raise(rb_eArgError, "invalid second option");
6992 else if (argc>1)
6993 rb_raise(rb_eArgError, "too many options");
6994 else if (argv[0]==sym_ascii)
6995 flags |= ONIGENC_CASE_ASCII_ONLY;
6996 else if (argv[0]==sym_fold) {
6997 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6998 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6999 else
7000 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7002 else
7003 rb_raise(rb_eArgError, "invalid option");
7004 return flags;
7007 static inline bool
7008 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7010 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7011 return true;
7012 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7015 /* 16 should be long enough to absorb any kind of single character length increase; the value defined below (20) adds some slack on top of that */
7016 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7017 #ifndef CASEMAP_DEBUG
7018 # define CASEMAP_DEBUG 0
7019 #endif
7021 struct mapping_buffer;
7022 typedef struct mapping_buffer {
7023 size_t capa;
7024 size_t used;
7025 struct mapping_buffer *next;
7026 OnigUChar space[FLEX_ARY_LEN];
7027 } mapping_buffer;
7029 static void
7030 mapping_buffer_free(void *p)
7032 mapping_buffer *previous_buffer;
7033 mapping_buffer *current_buffer = p;
7034 while (current_buffer) {
7035 previous_buffer = current_buffer;
7036 current_buffer = current_buffer->next;
7037 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7041 static const rb_data_type_t mapping_buffer_type = {
7042 "mapping_buffer",
7043 {0, mapping_buffer_free,}
7046 static VALUE
7047 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7049 VALUE target;
7051 const OnigUChar *source_current, *source_end;
7052 int target_length = 0;
7053 VALUE buffer_anchor;
7054 mapping_buffer *current_buffer = 0;
7055 mapping_buffer **pre_buffer;
7056 size_t buffer_count = 0;
7057 int buffer_length_or_invalid;
7059 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7061 source_current = (OnigUChar*)RSTRING_PTR(source);
7062 source_end = (OnigUChar*)RSTRING_END(source);
7064 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7065 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7066 while (source_current < source_end) {
7067 /* increase multiplier using buffer count to converge quickly */
7068 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7069 if (CASEMAP_DEBUG) {
7070 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7072 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7073 *pre_buffer = current_buffer;
7074 pre_buffer = &current_buffer->next;
7075 current_buffer->next = NULL;
7076 current_buffer->capa = capa;
7077 buffer_length_or_invalid = enc->case_map(flags,
7078 &source_current, source_end,
7079 current_buffer->space,
7080 current_buffer->space+current_buffer->capa,
7081 enc);
7082 if (buffer_length_or_invalid < 0) {
7083 current_buffer = DATA_PTR(buffer_anchor);
7084 DATA_PTR(buffer_anchor) = 0;
7085 mapping_buffer_free(current_buffer);
7086 rb_raise(rb_eArgError, "input string invalid");
7088 target_length += current_buffer->used = buffer_length_or_invalid;
7090 if (CASEMAP_DEBUG) {
7091 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7094 if (buffer_count==1) {
7095 target = rb_str_new((const char*)current_buffer->space, target_length);
7097 else {
7098 char *target_current;
7100 target = rb_str_new(0, target_length);
7101 target_current = RSTRING_PTR(target);
7102 current_buffer = DATA_PTR(buffer_anchor);
7103 while (current_buffer) {
7104 memcpy(target_current, current_buffer->space, current_buffer->used);
7105 target_current += current_buffer->used;
7106 current_buffer = current_buffer->next;
7109 current_buffer = DATA_PTR(buffer_anchor);
7110 DATA_PTR(buffer_anchor) = 0;
7111 mapping_buffer_free(current_buffer);
7113 /* TODO: check about string terminator character */
7114 str_enc_copy(target, source);
7115 /*ENC_CODERANGE_SET(mapped, cr);*/
7117 return target;
7120 static VALUE
7121 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7123 const OnigUChar *source_current, *source_end;
7124 OnigUChar *target_current, *target_end;
7125 long old_length = RSTRING_LEN(source);
7126 int length_or_invalid;
7128 if (old_length == 0) return Qnil;
7130 source_current = (OnigUChar*)RSTRING_PTR(source);
7131 source_end = (OnigUChar*)RSTRING_END(source);
7132 if (source == target) {
7133 target_current = (OnigUChar*)source_current;
7134 target_end = (OnigUChar*)source_end;
7136 else {
7137 target_current = (OnigUChar*)RSTRING_PTR(target);
7138 target_end = (OnigUChar*)RSTRING_END(target);
7141 length_or_invalid = onigenc_ascii_only_case_map(flags,
7142 &source_current, source_end,
7143 target_current, target_end, enc);
7144 if (length_or_invalid < 0)
7145 rb_raise(rb_eArgError, "input string invalid");
7146 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7147 fprintf(stderr, "problem with rb_str_ascii_casemap"
7148 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7149 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7150 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7153 str_enc_copy(target, source);
7155 return target;
7158 static bool
7159 upcase_single(VALUE str)
7161 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7162 bool modified = false;
7164 while (s < send) {
7165 unsigned int c = *(unsigned char*)s;
7167 if ('a' <= c && c <= 'z') {
7168 *s = 'A' + (c - 'a');
7169 modified = true;
7171 s++;
7173 return modified;
7177 * call-seq:
7178 * upcase!(*options) -> self or nil
7180 * Upcases the characters in +self+;
7181 * returns +self+ if any changes were made, +nil+ otherwise:
7183 * s = 'Hello World!' # => "Hello World!"
7184 * s.upcase! # => "HELLO WORLD!"
7185 * s # => "HELLO WORLD!"
7186 * s.upcase! # => nil
7188 * The casing may be affected by the given +options+;
7189 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7191 * Related: String#upcase, String#downcase, String#downcase!.
7195 static VALUE
7196 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7198 rb_encoding *enc;
7199 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7201 flags = check_case_options(argc, argv, flags);
7202 str_modify_keep_cr(str);
7203 enc = str_true_enc(str);
7204 if (case_option_single_p(flags, enc, str)) {
7205 if (upcase_single(str))
7206 flags |= ONIGENC_CASE_MODIFIED;
7208 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7209 rb_str_ascii_casemap(str, str, &flags, enc);
7210 else
7211 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7213 if (ONIGENC_CASE_MODIFIED&flags) return str;
7214 return Qnil;
7219 * call-seq:
7220 * upcase(*options) -> string
7222 * Returns a string containing the upcased characters in +self+:
7224 * s = 'Hello World!' # => "Hello World!"
7225 * s.upcase # => "HELLO WORLD!"
7227 * The casing may be affected by the given +options+;
7228 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7230 * Related: String#upcase!, String#downcase, String#downcase!.
7234 static VALUE
7235 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7237 rb_encoding *enc;
7238 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7239 VALUE ret;
7241 flags = check_case_options(argc, argv, flags);
7242 enc = str_true_enc(str);
7243 if (case_option_single_p(flags, enc, str)) {
7244 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7245 str_enc_copy(ret, str);
7246 upcase_single(ret);
7248 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7249 ret = rb_str_new(0, RSTRING_LEN(str));
7250 rb_str_ascii_casemap(str, ret, &flags, enc);
7252 else {
7253 ret = rb_str_casemap(str, &flags, enc);
7256 return ret;
7259 static bool
7260 downcase_single(VALUE str)
7262 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7263 bool modified = false;
7265 while (s < send) {
7266 unsigned int c = *(unsigned char*)s;
7268 if ('A' <= c && c <= 'Z') {
7269 *s = 'a' + (c - 'A');
7270 modified = true;
7272 s++;
7275 return modified;
7279 * call-seq:
7280 * downcase!(*options) -> self or nil
7282 * Downcases the characters in +self+;
7283 * returns +self+ if any changes were made, +nil+ otherwise:
7285 * s = 'Hello World!' # => "Hello World!"
7286 * s.downcase! # => "hello world!"
7287 * s # => "hello world!"
7288 * s.downcase! # => nil
7290 * The casing may be affected by the given +options+;
7291 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7293 * Related: String#downcase, String#upcase, String#upcase!.
7297 static VALUE
7298 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7300 rb_encoding *enc;
7301 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7303 flags = check_case_options(argc, argv, flags);
7304 str_modify_keep_cr(str);
7305 enc = str_true_enc(str);
7306 if (case_option_single_p(flags, enc, str)) {
7307 if (downcase_single(str))
7308 flags |= ONIGENC_CASE_MODIFIED;
7310 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7311 rb_str_ascii_casemap(str, str, &flags, enc);
7312 else
7313 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7315 if (ONIGENC_CASE_MODIFIED&flags) return str;
7316 return Qnil;
7321 * call-seq:
7322 * downcase(*options) -> string
7324 * Returns a string containing the downcased characters in +self+:
7326 * s = 'Hello World!' # => "Hello World!"
7327 * s.downcase # => "hello world!"
7329 * The casing may be affected by the given +options+;
7330 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7332 * Related: String#downcase!, String#upcase, String#upcase!.
7336 static VALUE
7337 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7339 rb_encoding *enc;
7340 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7341 VALUE ret;
7343 flags = check_case_options(argc, argv, flags);
7344 enc = str_true_enc(str);
7345 if (case_option_single_p(flags, enc, str)) {
7346 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7347 str_enc_copy(ret, str);
7348 downcase_single(ret);
7350 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7351 ret = rb_str_new(0, RSTRING_LEN(str));
7352 rb_str_ascii_casemap(str, ret, &flags, enc);
7354 else {
7355 ret = rb_str_casemap(str, &flags, enc);
7358 return ret;
7363 * call-seq:
7364 * capitalize!(*options) -> self or nil
7366 * Upcases the first character in +self+;
7367 * downcases the remaining characters;
7368 * returns +self+ if any changes were made, +nil+ otherwise:
7370 * s = 'hello World!' # => "hello World!"
7371 * s.capitalize! # => "Hello world!"
7372 * s # => "Hello world!"
7373 * s.capitalize! # => nil
7375 * The casing may be affected by the given +options+;
7376 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7378 * Related: String#capitalize.
7382 static VALUE
7383 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7385 rb_encoding *enc;
7386 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7388 flags = check_case_options(argc, argv, flags);
7389 str_modify_keep_cr(str);
7390 enc = str_true_enc(str);
7391 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7392 if (flags&ONIGENC_CASE_ASCII_ONLY)
7393 rb_str_ascii_casemap(str, str, &flags, enc);
7394 else
7395 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7397 if (ONIGENC_CASE_MODIFIED&flags) return str;
7398 return Qnil;
7403 * call-seq:
7404 * capitalize(*options) -> string
7406 * Returns a string containing the characters in +self+;
7407 * the first character is upcased;
7408 * the remaining characters are downcased:
7410 * s = 'hello World!' # => "hello World!"
7411 * s.capitalize # => "Hello world!"
7413 * The casing may be affected by the given +options+;
7414 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7416 * Related: String#capitalize!.
7420 static VALUE
7421 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7423 rb_encoding *enc;
7424 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7425 VALUE ret;
7427 flags = check_case_options(argc, argv, flags);
7428 enc = str_true_enc(str);
7429 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7430 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7431 ret = rb_str_new(0, RSTRING_LEN(str));
7432 rb_str_ascii_casemap(str, ret, &flags, enc);
7434 else {
7435 ret = rb_str_casemap(str, &flags, enc);
7437 return ret;
7442 * call-seq:
7443 * swapcase!(*options) -> self or nil
7445 * Upcases each lowercase character in +self+;
7446 * downcases uppercase character;
7447 * returns +self+ if any changes were made, +nil+ otherwise:
7449 * s = 'Hello World!' # => "Hello World!"
7450 * s.swapcase! # => "hELLO wORLD!"
7451 * s # => "Hello World!"
7452 * ''.swapcase! # => nil
7454 * The casing may be affected by the given +options+;
7455 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7457 * Related: String#swapcase.
7461 static VALUE
7462 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7464 rb_encoding *enc;
7465 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7467 flags = check_case_options(argc, argv, flags);
7468 str_modify_keep_cr(str);
7469 enc = str_true_enc(str);
7470 if (flags&ONIGENC_CASE_ASCII_ONLY)
7471 rb_str_ascii_casemap(str, str, &flags, enc);
7472 else
7473 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7475 if (ONIGENC_CASE_MODIFIED&flags) return str;
7476 return Qnil;
7481 * call-seq:
7482 * swapcase(*options) -> string
7484 * Returns a string containing the characters in +self+, with cases reversed;
7485 * each uppercase character is downcased;
7486 * each lowercase character is upcased:
7488 * s = 'Hello World!' # => "Hello World!"
7489 * s.swapcase # => "hELLO wORLD!"
7491 * The casing may be affected by the given +options+;
7492 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7494 * Related: String#swapcase!.
7498 static VALUE
7499 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7501 rb_encoding *enc;
7502 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7503 VALUE ret;
7505 flags = check_case_options(argc, argv, flags);
7506 enc = str_true_enc(str);
7507 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7508 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7509 ret = rb_str_new(0, RSTRING_LEN(str));
7510 rb_str_ascii_casemap(str, ret, &flags, enc);
7512 else {
7513 ret = rb_str_casemap(str, &flags, enc);
7515 return ret;
typedef unsigned char *USTR;

/*
 * Cursor over a transliteration specification, advanced by trnext().
 */
struct tr {
    int gen;			/* nonzero while a range (a-z) is being enumerated */
    unsigned int now;		/* code most recently read or generated */
    unsigned int max;		/* last code of the range being enumerated */
    char *p;			/* next unread byte of the specification */
    char *pend;			/* end of the specification */
};
7526 static unsigned int
7527 trnext(struct tr *t, rb_encoding *enc)
7529 int n;
7531 for (;;) {
7532 nextpart:
7533 if (!t->gen) {
7534 if (t->p == t->pend) return -1;
7535 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7536 t->p += n;
7538 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7539 t->p += n;
7540 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7541 t->p += n;
7542 if (t->p < t->pend) {
7543 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7544 t->p += n;
7545 if (t->now > c) {
7546 if (t->now < 0x80 && c < 0x80) {
7547 rb_raise(rb_eArgError,
7548 "invalid range \"%c-%c\" in string transliteration",
7549 t->now, c);
7551 else {
7552 rb_raise(rb_eArgError, "invalid range in string transliteration");
7554 continue; /* not reached */
7556 t->gen = 1;
7557 t->max = c;
7560 return t->now;
7562 else {
7563 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7564 if (t->now == t->max) {
7565 t->gen = 0;
7566 goto nextpart;
7569 if (t->now < t->max) {
7570 return t->now;
7572 else {
7573 t->gen = 0;
7574 return t->max;
7580 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7582 static VALUE
7583 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7585 const unsigned int errc = -1;
7586 unsigned int trans[256];
7587 rb_encoding *enc, *e1, *e2;
7588 struct tr trsrc, trrepl;
7589 int cflag = 0;
7590 unsigned int c, c0, last = 0;
7591 int modify = 0, i, l;
7592 unsigned char *s, *send;
7593 VALUE hash = 0;
7594 int singlebyte = single_byte_optimizable(str);
7595 int termlen;
7596 int cr;
7598 #define CHECK_IF_ASCII(c) \
7599 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7600 (cr = ENC_CODERANGE_VALID) : 0)
7602 StringValue(src);
7603 StringValue(repl);
7604 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7605 if (RSTRING_LEN(repl) == 0) {
7606 return rb_str_delete_bang(1, &src, str);
7609 cr = ENC_CODERANGE(str);
7610 e1 = rb_enc_check(str, src);
7611 e2 = rb_enc_check(str, repl);
7612 if (e1 == e2) {
7613 enc = e1;
7615 else {
7616 enc = rb_enc_check(src, repl);
7618 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7619 if (RSTRING_LEN(src) > 1 &&
7620 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7621 trsrc.p + l < trsrc.pend) {
7622 cflag = 1;
7623 trsrc.p += l;
7625 trrepl.p = RSTRING_PTR(repl);
7626 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7627 trsrc.gen = trrepl.gen = 0;
7628 trsrc.now = trrepl.now = 0;
7629 trsrc.max = trrepl.max = 0;
7631 if (cflag) {
7632 for (i=0; i<256; i++) {
7633 trans[i] = 1;
7635 while ((c = trnext(&trsrc, enc)) != errc) {
7636 if (c < 256) {
7637 trans[c] = errc;
7639 else {
7640 if (!hash) hash = rb_hash_new();
7641 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7644 while ((c = trnext(&trrepl, enc)) != errc)
7645 /* retrieve last replacer */;
7646 last = trrepl.now;
7647 for (i=0; i<256; i++) {
7648 if (trans[i] != errc) {
7649 trans[i] = last;
7653 else {
7654 unsigned int r;
7656 for (i=0; i<256; i++) {
7657 trans[i] = errc;
7659 while ((c = trnext(&trsrc, enc)) != errc) {
7660 r = trnext(&trrepl, enc);
7661 if (r == errc) r = trrepl.now;
7662 if (c < 256) {
7663 trans[c] = r;
7664 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7666 else {
7667 if (!hash) hash = rb_hash_new();
7668 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7673 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7674 cr = ENC_CODERANGE_7BIT;
7675 str_modify_keep_cr(str);
7676 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7677 termlen = rb_enc_mbminlen(enc);
7678 if (sflag) {
7679 int clen, tlen;
7680 long offset, max = RSTRING_LEN(str);
7681 unsigned int save = -1;
7682 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7684 while (s < send) {
7685 int may_modify = 0;
7687 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7688 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7690 s += clen;
7691 if (c < 256) {
7692 c = trans[c];
7694 else if (hash) {
7695 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7696 if (NIL_P(tmp)) {
7697 if (cflag) c = last;
7698 else c = errc;
7700 else if (cflag) c = errc;
7701 else c = NUM2INT(tmp);
7703 else {
7704 c = errc;
7706 if (c != (unsigned int)-1) {
7707 if (save == c) {
7708 CHECK_IF_ASCII(c);
7709 continue;
7711 save = c;
7712 tlen = rb_enc_codelen(c, enc);
7713 modify = 1;
7715 else {
7716 save = -1;
7717 c = c0;
7718 if (enc != e1) may_modify = 1;
7720 if ((offset = t - buf) + tlen > max) {
7721 size_t MAYBE_UNUSED(old) = max + termlen;
7722 max = offset + tlen + (send - s);
7723 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7724 t = buf + offset;
7726 rb_enc_mbcput(c, t, enc);
7727 if (may_modify && memcmp(s, t, tlen) != 0) {
7728 modify = 1;
7730 CHECK_IF_ASCII(c);
7731 t += tlen;
7733 if (!STR_EMBED_P(str)) {
7734 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7736 TERM_FILL((char *)t, termlen);
7737 RSTRING(str)->as.heap.ptr = (char *)buf;
7738 RSTRING(str)->as.heap.len = t - buf;
7739 STR_SET_NOEMBED(str);
7740 RSTRING(str)->as.heap.aux.capa = max;
7742 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7743 while (s < send) {
7744 c = (unsigned char)*s;
7745 if (trans[c] != errc) {
7746 if (!cflag) {
7747 c = trans[c];
7748 *s = c;
7749 modify = 1;
7751 else {
7752 *s = last;
7753 modify = 1;
7756 CHECK_IF_ASCII(c);
7757 s++;
7760 else {
7761 int clen, tlen;
7762 long offset, max = (long)((send - s) * 1.2);
7763 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7765 while (s < send) {
7766 int may_modify = 0;
7767 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7768 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7770 if (c < 256) {
7771 c = trans[c];
7773 else if (hash) {
7774 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7775 if (NIL_P(tmp)) {
7776 if (cflag) c = last;
7777 else c = errc;
7779 else if (cflag) c = errc;
7780 else c = NUM2INT(tmp);
7782 else {
7783 c = cflag ? last : errc;
7785 if (c != errc) {
7786 tlen = rb_enc_codelen(c, enc);
7787 modify = 1;
7789 else {
7790 c = c0;
7791 if (enc != e1) may_modify = 1;
7793 if ((offset = t - buf) + tlen > max) {
7794 size_t MAYBE_UNUSED(old) = max + termlen;
7795 max = offset + tlen + (long)((send - s) * 1.2);
7796 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7797 t = buf + offset;
7799 if (s != t) {
7800 rb_enc_mbcput(c, t, enc);
7801 if (may_modify && memcmp(s, t, tlen) != 0) {
7802 modify = 1;
7805 CHECK_IF_ASCII(c);
7806 s += clen;
7807 t += tlen;
7809 if (!STR_EMBED_P(str)) {
7810 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7812 TERM_FILL((char *)t, termlen);
7813 RSTRING(str)->as.heap.ptr = (char *)buf;
7814 RSTRING(str)->as.heap.len = t - buf;
7815 STR_SET_NOEMBED(str);
7816 RSTRING(str)->as.heap.aux.capa = max;
7819 if (modify) {
7820 if (cr != ENC_CODERANGE_BROKEN)
7821 ENC_CODERANGE_SET(str, cr);
7822 rb_enc_associate(str, enc);
7823 return str;
7825 return Qnil;
7830 * call-seq:
7831 * str.tr!(from_str, to_str) -> str or nil
7833 * Translates <i>str</i> in place, using the same rules as
7834 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7835 * were made.
7838 static VALUE
7839 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7841 return tr_trans(str, src, repl, 0);
7846 * call-seq:
7847 * str.tr(from_str, to_str) => new_str
7849 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7850 * corresponding characters in +to_str+. If +to_str+ is shorter than
7851 * +from_str+, it is padded with its last character in order to maintain the
7852 * correspondence.
7854 * "hello".tr('el', 'ip') #=> "hippo"
7855 * "hello".tr('aeiou', '*') #=> "h*ll*"
7856 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7858 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7859 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7860 * all characters except those listed.
7862 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7863 * "hello".tr('^aeiou', '*') #=> "*e**o"
7865 * The backslash character <code>\\</code> can be used to escape
7866 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7867 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7869 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7870 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7872 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7873 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7874 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7876 * "X['\\b']".tr("X\\", "") #=> "['b']"
7877 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7880 static VALUE
7881 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7883 str = str_duplicate(rb_cString, str);
7884 tr_trans(str, src, repl, 0);
7885 return str;
/* number of per-byte slots in a tr membership table (one per unsigned char value) */
#define TR_TABLE_MAX (UCHAR_MAX + 1)
/* full table length: the per-byte slots plus one extra flag slot at index TR_TABLE_MAX */
#define TR_TABLE_SIZE (TR_TABLE_MAX + 1)
7890 static void
7891 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7892 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7894 const unsigned int errc = -1;
7895 char buf[TR_TABLE_MAX];
7896 struct tr tr;
7897 unsigned int c;
7898 VALUE table = 0, ptable = 0;
7899 int i, l, cflag = 0;
7901 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7902 tr.gen = tr.now = tr.max = 0;
7904 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7905 cflag = 1;
7906 tr.p += l;
7908 if (first) {
7909 for (i=0; i<TR_TABLE_MAX; i++) {
7910 stable[i] = 1;
7912 stable[TR_TABLE_MAX] = cflag;
7914 else if (stable[TR_TABLE_MAX] && !cflag) {
7915 stable[TR_TABLE_MAX] = 0;
7917 for (i=0; i<TR_TABLE_MAX; i++) {
7918 buf[i] = cflag;
7921 while ((c = trnext(&tr, enc)) != errc) {
7922 if (c < TR_TABLE_MAX) {
7923 buf[(unsigned char)c] = !cflag;
7925 else {
7926 VALUE key = UINT2NUM(c);
7928 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
7929 if (cflag) {
7930 ptable = *ctablep;
7931 table = ptable ? ptable : rb_hash_new();
7932 *ctablep = table;
7934 else {
7935 table = rb_hash_new();
7936 ptable = *tablep;
7937 *tablep = table;
7940 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7941 rb_hash_aset(table, key, Qtrue);
7945 for (i=0; i<TR_TABLE_MAX; i++) {
7946 stable[i] = stable[i] && buf[i];
7948 if (!table && !cflag) {
7949 *tablep = 0;
7954 static int
7955 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7957 if (c < TR_TABLE_MAX) {
7958 return table[c] != 0;
7960 else {
7961 VALUE v = UINT2NUM(c);
7963 if (del) {
7964 if (!NIL_P(rb_hash_lookup(del, v)) &&
7965 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7966 return TRUE;
7969 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7970 return FALSE;
7972 return table[TR_TABLE_MAX] ? TRUE : FALSE;
7977 * call-seq:
7978 * str.delete!([other_str]+) -> str or nil
7980 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7981 * <code>nil</code> if <i>str</i> was not modified.
7984 static VALUE
7985 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7987 char squeez[TR_TABLE_SIZE];
7988 rb_encoding *enc = 0;
7989 char *s, *send, *t;
7990 VALUE del = 0, nodel = 0;
7991 int modify = 0;
7992 int i, ascompat, cr;
7994 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7995 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7996 for (i=0; i<argc; i++) {
7997 VALUE s = argv[i];
7999 StringValue(s);
8000 enc = rb_enc_check(str, s);
8001 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8004 str_modify_keep_cr(str);
8005 ascompat = rb_enc_asciicompat(enc);
8006 s = t = RSTRING_PTR(str);
8007 send = RSTRING_END(str);
8008 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8009 while (s < send) {
8010 unsigned int c;
8011 int clen;
8013 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8014 if (squeez[c]) {
8015 modify = 1;
8017 else {
8018 if (t != s) *t = c;
8019 t++;
8021 s++;
8023 else {
8024 c = rb_enc_codepoint_len(s, send, &clen, enc);
8026 if (tr_find(c, squeez, del, nodel)) {
8027 modify = 1;
8029 else {
8030 if (t != s) rb_enc_mbcput(c, t, enc);
8031 t += clen;
8032 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8034 s += clen;
8037 TERM_FILL(t, TERM_LEN(str));
8038 STR_SET_LEN(str, t - RSTRING_PTR(str));
8039 ENC_CODERANGE_SET(str, cr);
8041 if (modify) return str;
8042 return Qnil;
8047 * call-seq:
8048 * str.delete([other_str]+) -> new_str
8050 * Returns a copy of <i>str</i> with all characters in the intersection of its
8051 * arguments deleted. Uses the same rules for building the set of characters as
8052 * String#count.
8054 * "hello".delete "l","lo" #=> "heo"
8055 * "hello".delete "lo" #=> "he"
8056 * "hello".delete "aeiou", "^e" #=> "hell"
8057 * "hello".delete "ej-m" #=> "ho"
8060 static VALUE
8061 rb_str_delete(int argc, VALUE *argv, VALUE str)
8063 str = str_duplicate(rb_cString, str);
8064 rb_str_delete_bang(argc, argv, str);
8065 return str;
8070 * call-seq:
8071 * str.squeeze!([other_str]*) -> str or nil
8073 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8074 * <code>nil</code> if no changes were made.
8077 static VALUE
8078 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8080 char squeez[TR_TABLE_SIZE];
8081 rb_encoding *enc = 0;
8082 VALUE del = 0, nodel = 0;
8083 unsigned char *s, *send, *t;
8084 int i, modify = 0;
8085 int ascompat, singlebyte = single_byte_optimizable(str);
8086 unsigned int save;
8088 if (argc == 0) {
8089 enc = STR_ENC_GET(str);
8091 else {
8092 for (i=0; i<argc; i++) {
8093 VALUE s = argv[i];
8095 StringValue(s);
8096 enc = rb_enc_check(str, s);
8097 if (singlebyte && !single_byte_optimizable(s))
8098 singlebyte = 0;
8099 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8103 str_modify_keep_cr(str);
8104 s = t = (unsigned char *)RSTRING_PTR(str);
8105 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8106 send = (unsigned char *)RSTRING_END(str);
8107 save = -1;
8108 ascompat = rb_enc_asciicompat(enc);
8110 if (singlebyte) {
8111 while (s < send) {
8112 unsigned int c = *s++;
8113 if (c != save || (argc > 0 && !squeez[c])) {
8114 *t++ = save = c;
8118 else {
8119 while (s < send) {
8120 unsigned int c;
8121 int clen;
8123 if (ascompat && (c = *s) < 0x80) {
8124 if (c != save || (argc > 0 && !squeez[c])) {
8125 *t++ = save = c;
8127 s++;
8129 else {
8130 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8132 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8133 if (t != s) rb_enc_mbcput(c, t, enc);
8134 save = c;
8135 t += clen;
8137 s += clen;
8142 TERM_FILL((char *)t, TERM_LEN(str));
8143 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8144 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8145 modify = 1;
8148 if (modify) return str;
8149 return Qnil;
8154 * call-seq:
8155 * str.squeeze([other_str]*) -> new_str
8157 * Builds a set of characters from the <i>other_str</i> parameter(s)
8158 * using the procedure described for String#count. Returns a new
8159 * string where runs of the same character that occur in this set are
8160 * replaced by a single character. If no arguments are given, all
8161 * runs of identical characters are replaced by a single character.
8163 * "yellow moon".squeeze #=> "yelow mon"
8164 * " now is the".squeeze(" ") #=> " now is the"
8165 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8168 static VALUE
8169 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8171 str = str_duplicate(rb_cString, str);
8172 rb_str_squeeze_bang(argc, argv, str);
8173 return str;
8178 * call-seq:
8179 * str.tr_s!(from_str, to_str) -> str or nil
8181 * Performs String#tr_s processing on <i>str</i> in place,
8182 * returning <i>str</i>, or <code>nil</code> if no changes were made.
8185 static VALUE
8186 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8188 return tr_trans(str, src, repl, 1);
8193 * call-seq:
8194 * str.tr_s(from_str, to_str) -> new_str
8196 * Processes a copy of <i>str</i> as described under String#tr, then
8197 * removes duplicate characters in regions that were affected by the
8198 * translation.
8200 * "hello".tr_s('l', 'r') #=> "hero"
8201 * "hello".tr_s('el', '*') #=> "h*o"
8202 * "hello".tr_s('el', 'hx') #=> "hhxo"
8205 static VALUE
8206 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8208 str = str_duplicate(rb_cString, str);
8209 tr_trans(str, src, repl, 1);
8210 return str;
8215 * call-seq:
8216 * str.count([other_str]+) -> integer
8218 * Each +other_str+ parameter defines a set of characters to count. The
8219 * intersection of these sets defines the characters to count in +str+. Any
8220 * +other_str+ that starts with a caret <code>^</code> is negated. The
8221 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8222 * backslash character <code>\\</code> can be used to escape <code>^</code> or
8223 * <code>-</code> and is otherwise ignored unless it appears at the end of a
8224 * sequence or the end of a +other_str+.
8226 * a = "hello world"
8227 * a.count "lo" #=> 5
8228 * a.count "lo", "o" #=> 2
8229 * a.count "hello", "^l" #=> 4
8230 * a.count "ej-m" #=> 4
8232 * "hello^world".count "\\^aeiou" #=> 4
8233 * "hello-world".count "a\\-eo" #=> 4
8235 * c = "hello world\\r\\n"
8236 * c.count "\\" #=> 2
8237 * c.count "\\A" #=> 0
8238 * c.count "X-\\w" #=> 3
8241 static VALUE
8242 rb_str_count(int argc, VALUE *argv, VALUE str)
8244 char table[TR_TABLE_SIZE];
8245 rb_encoding *enc = 0;
8246 VALUE del = 0, nodel = 0, tstr;
8247 char *s, *send;
8248 int i;
8249 int ascompat;
8250 size_t n = 0;
8252 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8254 tstr = argv[0];
8255 StringValue(tstr);
8256 enc = rb_enc_check(str, tstr);
8257 if (argc == 1) {
8258 const char *ptstr;
8259 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8260 (ptstr = RSTRING_PTR(tstr),
8261 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8262 !is_broken_string(str)) {
8263 int clen;
8264 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8266 s = RSTRING_PTR(str);
8267 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8268 send = RSTRING_END(str);
8269 while (s < send) {
8270 if (*(unsigned char*)s++ == c) n++;
8272 return SIZET2NUM(n);
8276 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8277 for (i=1; i<argc; i++) {
8278 tstr = argv[i];
8279 StringValue(tstr);
8280 enc = rb_enc_check(str, tstr);
8281 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8284 s = RSTRING_PTR(str);
8285 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8286 send = RSTRING_END(str);
8287 ascompat = rb_enc_asciicompat(enc);
8288 while (s < send) {
8289 unsigned int c;
8291 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8292 if (table[c]) {
8293 n++;
8295 s++;
8297 else {
8298 int clen;
8299 c = rb_enc_codepoint_len(s, send, &clen, enc);
8300 if (tr_find(c, table, del, nodel)) {
8301 n++;
8303 s += clen;
8307 return SIZET2NUM(n);
8310 static VALUE
8311 rb_fs_check(VALUE val)
8313 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8314 val = rb_check_string_type(val);
8315 if (NIL_P(val)) return 0;
8317 return val;
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace
 * characters (TAB, LF, VT, FF, CR and SPACE), 0 for every other byte.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1
};

/* non-zero when byte +c+ is ASCII whitespace according to isspacetable */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8341 static long
8342 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8344 if (empty_count >= 0 && len == 0) {
8345 return empty_count + 1;
8347 if (empty_count > 0) {
8348 /* make different substrings */
8349 if (result) {
8350 do {
8351 rb_ary_push(result, str_new_empty_String(str));
8352 } while (--empty_count > 0);
8354 else {
8355 do {
8356 rb_yield(str_new_empty_String(str));
8357 } while (--empty_count > 0);
8360 str = rb_str_subseq(str, beg, len);
8361 if (result) {
8362 rb_ary_push(result, str);
8364 else {
8365 rb_yield(str);
8367 return empty_count;
/* strategies String#split chooses between, depending on the pattern */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
8374 static split_type_t
8375 literal_split_pattern(VALUE spat, split_type_t default_type)
8377 rb_encoding *enc = STR_ENC_GET(spat);
8378 const char *ptr;
8379 long len;
8380 RSTRING_GETMEM(spat, ptr, len);
8381 if (len == 0) {
8382 /* Special case - split into chars */
8383 return SPLIT_TYPE_CHARS;
8385 else if (rb_enc_asciicompat(enc)) {
8386 if (len == 1 && ptr[0] == ' ') {
8387 return SPLIT_TYPE_AWK;
8390 else {
8391 int l;
8392 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8393 return SPLIT_TYPE_AWK;
8396 return default_type;
8400 * call-seq:
8401 * str.split(pattern=nil, [limit]) -> an_array
8402 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8404 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8405 * of these substrings.
8407 * If <i>pattern</i> is a String, then its contents are used as
8408 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8409 * space, <i>str</i> is split on whitespace, with leading and trailing
8410 * whitespace and runs of contiguous whitespace characters ignored.
8412 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8413 * pattern matches. Whenever the pattern matches a zero-length string,
8414 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8415 * groups, the respective matches will be returned in the array as well.
8417 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8418 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8419 * split on whitespace as if ' ' were specified.
8421 * If the <i>limit</i> parameter is omitted, trailing null fields are
8422 * suppressed. If <i>limit</i> is a positive number, at most that number
8423 * of split substrings will be returned (captured groups will be returned
8424 * as well, but are not counted towards the limit).
8425 * If <i>limit</i> is <code>1</code>, the entire
8426 * string is returned as the only entry in an array. If negative, there is no
8427 * limit to the number of fields returned, and trailing null fields are not
8428 * suppressed.
8430 * When the input +str+ is empty an empty Array is returned as the string is
8431 * considered to have no fields to split.
8433 * " now's the time ".split #=> ["now's", "the", "time"]
8434 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
8435 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8436 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8437 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8438 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8439 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8441 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8442 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8443 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8444 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8446 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8448 * "".split(',', -1) #=> []
8450 * If a block is given, invoke the block with each split substring.
8454 static VALUE
8455 rb_str_split_m(int argc, VALUE *argv, VALUE str)
8457 rb_encoding *enc;
8458 VALUE spat;
8459 VALUE limit;
8460 split_type_t split_type;
8461 long beg, end, i = 0, empty_count = -1;
8462 int lim = 0;
8463 VALUE result, tmp;
8465 result = rb_block_given_p() ? Qfalse : Qnil;
8466 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8467 lim = NUM2INT(limit);
8468 if (lim <= 0) limit = Qnil;
8469 else if (lim == 1) {
8470 if (RSTRING_LEN(str) == 0)
8471 return result ? rb_ary_new2(0) : str;
8472 tmp = str_duplicate(rb_cString, str);
8473 if (!result) {
8474 rb_yield(tmp);
8475 return str;
8477 return rb_ary_new3(1, tmp);
8479 i = 1;
8481 if (NIL_P(limit) && !lim) empty_count = 0;
8483 enc = STR_ENC_GET(str);
8484 split_type = SPLIT_TYPE_REGEXP;
8485 if (!NIL_P(spat)) {
8486 spat = get_pat_quoted(spat, 0);
8488 else if (NIL_P(spat = rb_fs)) {
8489 split_type = SPLIT_TYPE_AWK;
8491 else if (!(spat = rb_fs_check(spat))) {
8492 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8494 else {
8495 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8497 if (split_type != SPLIT_TYPE_AWK) {
8498 switch (BUILTIN_TYPE(spat)) {
8499 case T_REGEXP:
8500 rb_reg_options(spat); /* check if uninitialized */
8501 tmp = RREGEXP_SRC(spat);
8502 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8503 if (split_type == SPLIT_TYPE_AWK) {
8504 spat = tmp;
8505 split_type = SPLIT_TYPE_STRING;
8507 break;
8509 case T_STRING:
8510 mustnot_broken(spat);
8511 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8512 break;
8514 default:
8515 UNREACHABLE_RETURN(Qnil);
8519 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8521 if (result) result = rb_ary_new();
8522 beg = 0;
8523 char *ptr = RSTRING_PTR(str);
8524 char *eptr = RSTRING_END(str);
8525 if (split_type == SPLIT_TYPE_AWK) {
8526 char *bptr = ptr;
8527 int skip = 1;
8528 unsigned int c;
8530 end = beg;
8531 if (is_ascii_string(str)) {
8532 while (ptr < eptr) {
8533 c = (unsigned char)*ptr++;
8534 if (skip) {
8535 if (ascii_isspace(c)) {
8536 beg = ptr - bptr;
8538 else {
8539 end = ptr - bptr;
8540 skip = 0;
8541 if (!NIL_P(limit) && lim <= i) break;
8544 else if (ascii_isspace(c)) {
8545 SPLIT_STR(beg, end-beg);
8546 skip = 1;
8547 beg = ptr - bptr;
8548 if (!NIL_P(limit)) ++i;
8550 else {
8551 end = ptr - bptr;
8555 else {
8556 while (ptr < eptr) {
8557 int n;
8559 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8560 ptr += n;
8561 if (skip) {
8562 if (rb_isspace(c)) {
8563 beg = ptr - bptr;
8565 else {
8566 end = ptr - bptr;
8567 skip = 0;
8568 if (!NIL_P(limit) && lim <= i) break;
8571 else if (rb_isspace(c)) {
8572 SPLIT_STR(beg, end-beg);
8573 skip = 1;
8574 beg = ptr - bptr;
8575 if (!NIL_P(limit)) ++i;
8577 else {
8578 end = ptr - bptr;
8583 else if (split_type == SPLIT_TYPE_STRING) {
8584 char *str_start = ptr;
8585 char *substr_start = ptr;
8586 char *sptr = RSTRING_PTR(spat);
8587 long slen = RSTRING_LEN(spat);
8589 mustnot_broken(str);
8590 enc = rb_enc_check(str, spat);
8591 while (ptr < eptr &&
8592 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8593 /* Check we are at the start of a char */
8594 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8595 if (t != ptr + end) {
8596 ptr = t;
8597 continue;
8599 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8600 ptr += end + slen;
8601 substr_start = ptr;
8602 if (!NIL_P(limit) && lim <= ++i) break;
8604 beg = ptr - str_start;
8606 else if (split_type == SPLIT_TYPE_CHARS) {
8607 char *str_start = ptr;
8608 int n;
8610 mustnot_broken(str);
8611 enc = rb_enc_get(str);
8612 while (ptr < eptr &&
8613 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8614 SPLIT_STR(ptr - str_start, n);
8615 ptr += n;
8616 if (!NIL_P(limit) && lim <= ++i) break;
8618 beg = ptr - str_start;
8620 else {
8621 long len = RSTRING_LEN(str);
8622 long start = beg;
8623 long idx;
8624 int last_null = 0;
8625 struct re_registers *regs;
8626 VALUE match = 0;
8628 for (; rb_reg_search(spat, str, start, 0) >= 0;
8629 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8630 match = rb_backref_get();
8631 if (!result) rb_match_busy(match);
8632 regs = RMATCH_REGS(match);
8633 end = BEG(0);
8634 if (start == end && BEG(0) == END(0)) {
8635 if (!ptr) {
8636 SPLIT_STR(0, 0);
8637 break;
8639 else if (last_null == 1) {
8640 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8641 beg = start;
8643 else {
8644 if (start == len)
8645 start++;
8646 else
8647 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8648 last_null = 1;
8649 continue;
8652 else {
8653 SPLIT_STR(beg, end-beg);
8654 beg = start = END(0);
8656 last_null = 0;
8658 for (idx=1; idx < regs->num_regs; idx++) {
8659 if (BEG(idx) == -1) continue;
8660 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8662 if (!NIL_P(limit) && lim <= ++i) break;
8664 if (match) rb_match_unbusy(match);
8666 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8667 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8670 return result ? result : str;
8673 VALUE
8674 rb_str_split(VALUE str, const char *sep0)
8676 VALUE sep;
8678 StringValue(str);
8679 sep = rb_str_new_cstr(sep0);
8680 return rb_str_split_m(1, &sep, str);
8683 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8685 static inline int
8686 enumerator_element(VALUE ary, VALUE e)
8688 if (ary) {
8689 rb_ary_push(ary, e);
8690 return 0;
8692 else {
8693 rb_yield(e);
8694 return 1;
8698 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8700 static const char *
8701 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8703 const char *prev = rb_enc_prev_char(p, e, e, enc);
8704 if (rb_enc_is_newline(prev, e, enc)) {
8705 e = prev;
8706 prev = rb_enc_prev_char(p, e, e, enc);
8707 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8708 e = prev;
8710 return e;
8713 static VALUE
8714 get_rs(void)
8716 VALUE rs = rb_rs;
8717 if (!NIL_P(rs) &&
8718 (!RB_TYPE_P(rs, T_STRING) ||
8719 RSTRING_LEN(rs) != 1 ||
8720 RSTRING_PTR(rs)[0] != '\n')) {
8721 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8723 return rs;
8726 #define rb_rs get_rs()
8728 static VALUE
8729 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8731 rb_encoding *enc;
8732 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8733 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8734 long pos, len, rslen;
8735 int rsnewline = 0;
8737 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8738 rs = rb_rs;
8739 if (!NIL_P(opts)) {
8740 static ID keywords[1];
8741 if (!keywords[0]) {
8742 keywords[0] = rb_intern_const("chomp");
8744 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8745 chomp = (chomp != Qundef && RTEST(chomp));
8748 if (NIL_P(rs)) {
8749 if (!ENUM_ELEM(ary, str)) {
8750 return ary;
8752 else {
8753 return orig;
8757 if (!RSTRING_LEN(str)) goto end;
8758 str = rb_str_new_frozen(str);
8759 ptr = subptr = RSTRING_PTR(str);
8760 pend = RSTRING_END(str);
8761 len = RSTRING_LEN(str);
8762 StringValue(rs);
8763 rslen = RSTRING_LEN(rs);
8765 if (rs == rb_default_rs)
8766 enc = rb_enc_get(str);
8767 else
8768 enc = rb_enc_check(str, rs);
8770 if (rslen == 0) {
8771 /* paragraph mode */
8772 int n;
8773 const char *eol = NULL;
8774 subend = subptr;
8775 while (subend < pend) {
8776 do {
8777 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8778 n = 0;
8779 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8780 if (rb_enc_is_newline(subend + n, pend, enc)) {
8781 if (eol == subend) break;
8782 subend += rslen;
8783 if (subptr) eol = subend;
8785 else {
8786 if (!subptr) subptr = subend;
8787 subend += rslen;
8789 rslen = 0;
8790 } while (subend < pend);
8791 if (!subptr) break;
8792 line = rb_str_subseq(str, subptr - ptr,
8793 subend - subptr + (chomp ? 0 : rslen));
8794 if (ENUM_ELEM(ary, line)) {
8795 str_mod_check(str, ptr, len);
8797 subptr = eol = NULL;
8799 goto end;
8801 else {
8802 rsptr = RSTRING_PTR(rs);
8803 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8804 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8805 rsnewline = 1;
8809 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8810 rs = rb_str_new(rsptr, rslen);
8811 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8812 rsptr = RSTRING_PTR(rs);
8813 rslen = RSTRING_LEN(rs);
8816 while (subptr < pend) {
8817 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8818 if (pos < 0) break;
8819 hit = subptr + pos;
8820 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8821 if (hit != adjusted) {
8822 subptr = adjusted;
8823 continue;
8825 subend = hit += rslen;
8826 if (chomp) {
8827 if (rsnewline) {
8828 subend = chomp_newline(subptr, subend, enc);
8830 else {
8831 subend -= rslen;
8834 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8835 if (ENUM_ELEM(ary, line)) {
8836 str_mod_check(str, ptr, len);
8838 subptr = hit;
8841 if (subptr != pend) {
8842 if (chomp) {
8843 if (rsnewline) {
8844 pend = chomp_newline(subptr, pend, enc);
8846 else if (pend - subptr >= rslen &&
8847 memcmp(pend - rslen, rsptr, rslen) == 0) {
8848 pend -= rslen;
8851 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8852 ENUM_ELEM(ary, line);
8853 RB_GC_GUARD(str);
8856 end:
8857 if (ary)
8858 return ary;
8859 else
8860 return orig;
8864 * call-seq:
8865 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8866 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8868 * Splits <i>str</i> using the supplied parameter as the record
8869 * separator (<code>$/</code> by default), passing each substring in
8870 * turn to the supplied block. If a zero-length record separator is
8871 * supplied, the string is split into paragraphs delimited by
8872 * multiple successive newlines.
8874 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8875 * line.
8877 * If no block is given, an enumerator is returned instead.
8879 * "hello\nworld".each_line {|s| p s}
8880 * # prints:
8881 * # "hello\n"
8882 * # "world"
8884 * "hello\nworld".each_line('l') {|s| p s}
8885 * # prints:
8886 * # "hel"
8887 * # "l"
8888 * # "o\nworl"
8889 * # "d"
8891 * "hello\n\n\nworld".each_line('') {|s| p s}
8892 * # prints
8893 * # "hello\n\n"
8894 * # "world"
8896 * "hello\nworld".each_line(chomp: true) {|s| p s}
8897 * # prints:
8898 * # "hello"
8899 * # "world"
8901 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8902 * # prints:
8903 * # "he"
8904 * # ""
8905 * # "o\nwor"
8906 * # "d"
8910 static VALUE
8911 rb_str_each_line(int argc, VALUE *argv, VALUE str)
8913 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
8914 return rb_str_enumerate_lines(argc, argv, str, 0);
8918 * call-seq:
8919 * str.lines(separator=$/, chomp: false) -> an_array
8921 * Returns an array of lines in <i>str</i> split using the supplied
8922 * record separator (<code>$/</code> by default). This is a
8923 * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8925 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8926 * line.
8928 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8929 * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8930 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8932 * If a block is given, which is a deprecated form, works the same as
8933 * <code>each_line</code>.
8936 static VALUE
8937 rb_str_lines(int argc, VALUE *argv, VALUE str)
8939 VALUE ary = WANTARRAY("lines", 0);
8940 return rb_str_enumerate_lines(argc, argv, str, ary);
8943 static VALUE
8944 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8946 return LONG2FIX(RSTRING_LEN(str));
8949 static VALUE
8950 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8952 long i;
8954 for (i=0; i<RSTRING_LEN(str); i++) {
8955 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8957 if (ary)
8958 return ary;
8959 else
8960 return str;
8964 * call-seq:
8965 * str.each_byte {|integer| block } -> str
8966 * str.each_byte -> an_enumerator
8968 * Passes each byte in <i>str</i> to the given block, or returns an
8969 * enumerator if no block is given.
8971 * "hello".each_byte {|c| print c, ' ' }
8973 * <em>produces:</em>
8975 * 104 101 108 108 111
8978 static VALUE
8979 rb_str_each_byte(VALUE str)
8981 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8982 return rb_str_enumerate_bytes(str, 0);
8986 * call-seq:
8987 * str.bytes -> an_array
8989 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8990 * <code>str.each_byte.to_a</code>.
8992 * If a block is given, which is a deprecated form, works the same as
8993 * <code>each_byte</code>.
8996 static VALUE
8997 rb_str_bytes(VALUE str)
8999 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9000 return rb_str_enumerate_bytes(str, ary);
9003 static VALUE
9004 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9006 return rb_str_length(str);
9009 static VALUE
9010 rb_str_enumerate_chars(VALUE str, VALUE ary)
9012 VALUE orig = str;
9013 long i, len, n;
9014 const char *ptr;
9015 rb_encoding *enc;
9017 str = rb_str_new_frozen(str);
9018 ptr = RSTRING_PTR(str);
9019 len = RSTRING_LEN(str);
9020 enc = rb_enc_get(str);
9022 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9023 for (i = 0; i < len; i += n) {
9024 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9025 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9028 else {
9029 for (i = 0; i < len; i += n) {
9030 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9031 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9034 RB_GC_GUARD(str);
9035 if (ary)
9036 return ary;
9037 else
9038 return orig;
9042 * call-seq:
9043 * str.each_char {|cstr| block } -> str
9044 * str.each_char -> an_enumerator
9046 * Passes each character in <i>str</i> to the given block, or returns
9047 * an enumerator if no block is given.
9049 * "hello".each_char {|c| print c, ' ' }
9051 * <em>produces:</em>
9053 * h e l l o
9056 static VALUE
9057 rb_str_each_char(VALUE str)
9059 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9060 return rb_str_enumerate_chars(str, 0);
9064 * call-seq:
9065 * str.chars -> an_array
9067 * Returns an array of characters in <i>str</i>. This is a shorthand
9068 * for <code>str.each_char.to_a</code>.
9070 * If a block is given, which is a deprecated form, works the same as
9071 * <code>each_char</code>.
9074 static VALUE
9075 rb_str_chars(VALUE str)
9077 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9078 return rb_str_enumerate_chars(str, ary);
9081 static VALUE
9082 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9084 VALUE orig = str;
9085 int n;
9086 unsigned int c;
9087 const char *ptr, *end;
9088 rb_encoding *enc;
9090 if (single_byte_optimizable(str))
9091 return rb_str_enumerate_bytes(str, ary);
9093 str = rb_str_new_frozen(str);
9094 ptr = RSTRING_PTR(str);
9095 end = RSTRING_END(str);
9096 enc = STR_ENC_GET(str);
9098 while (ptr < end) {
9099 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9100 ENUM_ELEM(ary, UINT2NUM(c));
9101 ptr += n;
9103 RB_GC_GUARD(str);
9104 if (ary)
9105 return ary;
9106 else
9107 return orig;
9111 * call-seq:
9112 * str.each_codepoint {|integer| block } -> str
9113 * str.each_codepoint -> an_enumerator
9115 * Passes the Integer ordinal of each character in <i>str</i>
9116 * (also known as a <i>codepoint</i> when applied to Unicode strings) to the
9117 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9118 * values are directly derived from the binary representation
9119 * of each character.
9121 * If no block is given, an enumerator is returned instead.
9123 * "hello\u0639".each_codepoint {|c| print c, ' ' }
9125 * <em>produces:</em>
9127 * 104 101 108 108 111 1593
9130 static VALUE
9131 rb_str_each_codepoint(VALUE str)
9133 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9134 return rb_str_enumerate_codepoints(str, 0);
9138 * call-seq:
9139 * str.codepoints -> an_array
9141 * Returns an array of the Integer ordinals of the
9142 * characters in <i>str</i>. This is a shorthand for
9143 * <code>str.each_codepoint.to_a</code>.
9145 * If a block is given, which is a deprecated form, works the same as
9146 * <code>each_codepoint</code>.
9149 static VALUE
9150 rb_str_codepoints(VALUE str)
9152 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9153 return rb_str_enumerate_codepoints(str, ary);
9156 static regex_t *
9157 get_reg_grapheme_cluster(rb_encoding *enc)
9159 int encidx = rb_enc_to_index(enc);
9160 regex_t *reg_grapheme_cluster = NULL;
9161 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9163 /* synchronize */
9164 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9165 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9167 if (!reg_grapheme_cluster) {
9168 const OnigUChar source_ascii[] = "\\X";
9169 OnigErrorInfo einfo;
9170 const OnigUChar *source = source_ascii;
9171 size_t source_len = sizeof(source_ascii) - 1;
9172 switch (encidx) {
9173 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9174 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9175 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9176 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9177 #define CASE_UTF(e) \
9178 case ENCINDEX_UTF_##e: { \
9179 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9180 source = source_UTF_##e; \
9181 source_len = sizeof(source_UTF_##e); \
9182 break; \
9184 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9185 #undef CASE_UTF
9186 #undef CHARS_16BE
9187 #undef CHARS_16LE
9188 #undef CHARS_32BE
9189 #undef CHARS_32LE
9191 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9192 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9193 if (r) {
9194 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9195 onig_error_code_to_str(message, r, &einfo);
9196 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9198 if (encidx == rb_utf8_encindex()) {
9199 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9202 return reg_grapheme_cluster;
9205 static VALUE
9206 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9208 size_t grapheme_cluster_count = 0;
9209 regex_t *reg_grapheme_cluster = NULL;
9210 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9211 const char *ptr, *end;
9213 if (!rb_enc_unicode_p(enc)) {
9214 return rb_str_length(str);
9217 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9218 ptr = RSTRING_PTR(str);
9219 end = RSTRING_END(str);
9221 while (ptr < end) {
9222 OnigPosition len = onig_match(reg_grapheme_cluster,
9223 (const OnigUChar *)ptr, (const OnigUChar *)end,
9224 (const OnigUChar *)ptr, NULL, 0);
9225 if (len <= 0) break;
9226 grapheme_cluster_count++;
9227 ptr += len;
9230 return SIZET2NUM(grapheme_cluster_count);
9233 static VALUE
9234 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9236 VALUE orig = str;
9237 regex_t *reg_grapheme_cluster = NULL;
9238 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9239 const char *ptr0, *ptr, *end;
9241 if (!rb_enc_unicode_p(enc)) {
9242 return rb_str_enumerate_chars(str, ary);
9245 if (!ary) str = rb_str_new_frozen(str);
9246 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9247 ptr0 = ptr = RSTRING_PTR(str);
9248 end = RSTRING_END(str);
9250 while (ptr < end) {
9251 OnigPosition len = onig_match(reg_grapheme_cluster,
9252 (const OnigUChar *)ptr, (const OnigUChar *)end,
9253 (const OnigUChar *)ptr, NULL, 0);
9254 if (len <= 0) break;
9255 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9256 ptr += len;
9258 RB_GC_GUARD(str);
9259 if (ary)
9260 return ary;
9261 else
9262 return orig;
9266 * call-seq:
9267 * str.each_grapheme_cluster {|cstr| block } -> str
9268 * str.each_grapheme_cluster -> an_enumerator
9270 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9271 * an enumerator if no block is given.
9272 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9273 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9275 * "a\u0300".each_char.to_a.size #=> 2
9276 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9280 static VALUE
9281 rb_str_each_grapheme_cluster(VALUE str)
9283 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9284 return rb_str_enumerate_grapheme_clusters(str, 0);
9288 * call-seq:
9289 * str.grapheme_clusters -> an_array
9291 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9292 * for <code>str.each_grapheme_cluster.to_a</code>.
9294 * If a block is given, which is a deprecated form, works the same as
9295 * <code>each_grapheme_cluster</code>.
9298 static VALUE
9299 rb_str_grapheme_clusters(VALUE str)
9301 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9302 return rb_str_enumerate_grapheme_clusters(str, ary);
9305 static long
9306 chopped_length(VALUE str)
9308 rb_encoding *enc = STR_ENC_GET(str);
9309 const char *p, *p2, *beg, *end;
9311 beg = RSTRING_PTR(str);
9312 end = beg + RSTRING_LEN(str);
9313 if (beg >= end) return 0;
9314 p = rb_enc_prev_char(beg, end, end, enc);
9315 if (!p) return 0;
9316 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9317 p2 = rb_enc_prev_char(beg, p, end, enc);
9318 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9320 return p - beg;
9324 * call-seq:
9325 * str.chop! -> str or nil
9327 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9328 * <code>nil</code> if <i>str</i> is the empty string. See also
9329 * String#chomp!.
9332 static VALUE
9333 rb_str_chop_bang(VALUE str)
9335 str_modify_keep_cr(str);
9336 if (RSTRING_LEN(str) > 0) {
9337 long len;
9338 len = chopped_length(str);
9339 STR_SET_LEN(str, len);
9340 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9341 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9342 ENC_CODERANGE_CLEAR(str);
9344 return str;
9346 return Qnil;
9351 * call-seq:
9352 * str.chop -> new_str
9354 * Returns a new String with the last character removed. If the
9355 * string ends with <code>\r\n</code>, both characters are
9356 * removed. Applying <code>chop</code> to an empty string returns an
9357 * empty string. String#chomp is often a safer alternative, as it
9358 * leaves the string unchanged if it doesn't end in a record
9359 * separator.
9361 * "string\r\n".chop #=> "string"
9362 * "string\n\r".chop #=> "string\n"
9363 * "string\n".chop #=> "string"
9364 * "string".chop #=> "strin"
9365 * "x".chop.chop #=> ""
9368 static VALUE
9369 rb_str_chop(VALUE str)
9371 return rb_str_subseq(str, 0, chopped_length(str));
9374 static long
9375 smart_chomp(VALUE str, const char *e, const char *p)
9377 rb_encoding *enc = rb_enc_get(str);
9378 if (rb_enc_mbminlen(enc) > 1) {
9379 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9380 if (rb_enc_is_newline(pp, e, enc)) {
9381 e = pp;
9383 pp = e - rb_enc_mbminlen(enc);
9384 if (pp >= p) {
9385 pp = rb_enc_left_char_head(p, pp, e, enc);
9386 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9387 e = pp;
9391 else {
9392 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9393 case '\n':
9394 if (--e > p && *(e-1) == '\r') {
9395 --e;
9397 break;
9398 case '\r':
9399 --e;
9400 break;
9403 return e - p;
9406 static long
9407 chompped_length(VALUE str, VALUE rs)
9409 rb_encoding *enc;
9410 int newline;
9411 char *pp, *e, *rsptr;
9412 long rslen;
9413 char *const p = RSTRING_PTR(str);
9414 long len = RSTRING_LEN(str);
9416 if (len == 0) return 0;
9417 e = p + len;
9418 if (rs == rb_default_rs) {
9419 return smart_chomp(str, e, p);
9422 enc = rb_enc_get(str);
9423 RSTRING_GETMEM(rs, rsptr, rslen);
9424 if (rslen == 0) {
9425 if (rb_enc_mbminlen(enc) > 1) {
9426 while (e > p) {
9427 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9428 if (!rb_enc_is_newline(pp, e, enc)) break;
9429 e = pp;
9430 pp -= rb_enc_mbminlen(enc);
9431 if (pp >= p) {
9432 pp = rb_enc_left_char_head(p, pp, e, enc);
9433 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9434 e = pp;
9439 else {
9440 while (e > p && *(e-1) == '\n') {
9441 --e;
9442 if (e > p && *(e-1) == '\r')
9443 --e;
9446 return e - p;
9448 if (rslen > len) return len;
9450 enc = rb_enc_get(rs);
9451 newline = rsptr[rslen-1];
9452 if (rslen == rb_enc_mbminlen(enc)) {
9453 if (rslen == 1) {
9454 if (newline == '\n')
9455 return smart_chomp(str, e, p);
9457 else {
9458 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9459 return smart_chomp(str, e, p);
9463 enc = rb_enc_check(str, rs);
9464 if (is_broken_string(rs)) {
9465 return len;
9467 pp = e - rslen;
9468 if (p[len-1] == newline &&
9469 (rslen <= 1 ||
9470 memcmp(rsptr, pp, rslen) == 0)) {
9471 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9472 return len - rslen;
9473 RB_GC_GUARD(rs);
9475 return len;
9479 * Returns the separator for arguments of rb_str_chomp.
9481 * @return returns rb_rs ($/) as default, the default value of rb_rs ($/) is "\n".
9483 static VALUE
9484 chomp_rs(int argc, const VALUE *argv)
9486 rb_check_arity(argc, 0, 1);
9487 if (argc > 0) {
9488 VALUE rs = argv[0];
9489 if (!NIL_P(rs)) StringValue(rs);
9490 return rs;
9492 else {
9493 return rb_rs;
9497 VALUE
9498 rb_str_chomp_string(VALUE str, VALUE rs)
9500 long olen = RSTRING_LEN(str);
9501 long len = chompped_length(str, rs);
9502 if (len >= olen) return Qnil;
9503 str_modify_keep_cr(str);
9504 STR_SET_LEN(str, len);
9505 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9506 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9507 ENC_CODERANGE_CLEAR(str);
9509 return str;
9513 * call-seq:
9514 * str.chomp!(separator=$/) -> str or nil
9516 * Modifies <i>str</i> in place as described for String#chomp,
9517 * returning <i>str</i>, or <code>nil</code> if no modifications were
9518 * made.
9521 static VALUE
9522 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9524 VALUE rs;
9525 str_modifiable(str);
9526 if (RSTRING_LEN(str) == 0) return Qnil;
9527 rs = chomp_rs(argc, argv);
9528 if (NIL_P(rs)) return Qnil;
9529 return rb_str_chomp_string(str, rs);
9534 * call-seq:
9535 * str.chomp(separator=$/) -> new_str
9537 * Returns a new String with the given record separator removed
9538 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9539 * changed from the default Ruby record separator, then <code>chomp</code> also
9540 * removes carriage return characters (that is, it will remove <code>\n</code>,
9541 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9542 * it will remove all trailing newlines from the string.
9544 * "hello".chomp #=> "hello"
9545 * "hello\n".chomp #=> "hello"
9546 * "hello\r\n".chomp #=> "hello"
9547 * "hello\n\r".chomp #=> "hello\n"
9548 * "hello\r".chomp #=> "hello"
9549 * "hello \n there".chomp #=> "hello \n there"
9550 * "hello".chomp("llo") #=> "he"
9551 * "hello\r\n\r\n".chomp('') #=> "hello"
9552 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9555 static VALUE
9556 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9558 VALUE rs = chomp_rs(argc, argv);
9559 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9560 return rb_str_subseq(str, 0, chompped_length(str, rs));
9563 static long
9564 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9566 const char *const start = s;
9568 if (!s || s >= e) return 0;
9570 /* remove spaces at head */
9571 if (single_byte_optimizable(str)) {
9572 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9574 else {
9575 while (s < e) {
9576 int n;
9577 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9579 if (cc && !rb_isspace(cc)) break;
9580 s += n;
9583 return s - start;
9587 * call-seq:
9588 * str.lstrip! -> self or nil
9590 * Removes leading whitespace from the receiver.
9591 * Returns the altered receiver, or +nil+ if no change was made.
9592 * See also String#rstrip! and String#strip!.
9594 * Refer to String#strip for the definition of whitespace.
9596 * " hello ".lstrip! #=> "hello "
9597 * "hello ".lstrip! #=> nil
9598 * "hello".lstrip! #=> nil
9601 static VALUE
9602 rb_str_lstrip_bang(VALUE str)
9604 rb_encoding *enc;
9605 char *start, *s;
9606 long olen, loffset;
9608 str_modify_keep_cr(str);
9609 enc = STR_ENC_GET(str);
9610 RSTRING_GETMEM(str, start, olen);
9611 loffset = lstrip_offset(str, start, start+olen, enc);
9612 if (loffset > 0) {
9613 long len = olen-loffset;
9614 s = start + loffset;
9615 memmove(start, s, len);
9616 STR_SET_LEN(str, len);
9617 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9618 return str;
9620 return Qnil;
9625 * call-seq:
9626 * str.lstrip -> new_str
9628 * Returns a copy of the receiver with leading whitespace removed.
9629 * See also String#rstrip and String#strip.
9631 * Refer to String#strip for the definition of whitespace.
9633 * " hello ".lstrip #=> "hello "
9634 * "hello".lstrip #=> "hello"
9637 static VALUE
9638 rb_str_lstrip(VALUE str)
9640 char *start;
9641 long len, loffset;
9642 RSTRING_GETMEM(str, start, len);
9643 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9644 if (loffset <= 0) return str_duplicate(rb_cString, str);
9645 return rb_str_subseq(str, loffset, len - loffset);
9648 static long
9649 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9651 const char *t;
9653 rb_str_check_dummy_enc(enc);
9654 if (!s || s >= e) return 0;
9655 t = e;
9657 /* remove trailing spaces or '\0's */
9658 if (single_byte_optimizable(str)) {
9659 unsigned char c;
9660 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9662 else {
9663 char *tp;
9665 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9666 unsigned int c = rb_enc_codepoint(tp, e, enc);
9667 if (c && !rb_isspace(c)) break;
9668 t = tp;
9671 return e - t;
9675 * call-seq:
9676 * str.rstrip! -> self or nil
9678 * Removes trailing whitespace from the receiver.
9679 * Returns the altered receiver, or +nil+ if no change was made.
9680 * See also String#lstrip! and String#strip!.
9682 * Refer to String#strip for the definition of whitespace.
9684 * " hello ".rstrip! #=> " hello"
9685 * " hello".rstrip! #=> nil
9686 * "hello".rstrip! #=> nil
9689 static VALUE
9690 rb_str_rstrip_bang(VALUE str)
9692 rb_encoding *enc;
9693 char *start;
9694 long olen, roffset;
9696 str_modify_keep_cr(str);
9697 enc = STR_ENC_GET(str);
9698 RSTRING_GETMEM(str, start, olen);
9699 roffset = rstrip_offset(str, start, start+olen, enc);
9700 if (roffset > 0) {
9701 long len = olen - roffset;
9703 STR_SET_LEN(str, len);
9704 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9705 return str;
9707 return Qnil;
9712 * call-seq:
9713 * str.rstrip -> new_str
9715 * Returns a copy of the receiver with trailing whitespace removed.
9716 * See also String#lstrip and String#strip.
9718 * Refer to String#strip for the definition of whitespace.
9720 * " hello ".rstrip #=> " hello"
9721 * "hello".rstrip #=> "hello"
9724 static VALUE
9725 rb_str_rstrip(VALUE str)
9727 rb_encoding *enc;
9728 char *start;
9729 long olen, roffset;
9731 enc = STR_ENC_GET(str);
9732 RSTRING_GETMEM(str, start, olen);
9733 roffset = rstrip_offset(str, start, start+olen, enc);
9735 if (roffset <= 0) return str_duplicate(rb_cString, str);
9736 return rb_str_subseq(str, 0, olen-roffset);
9741 * call-seq:
9742 * str.strip! -> self or nil
9744 * Removes leading and trailing whitespace from the receiver.
9745 * Returns the altered receiver, or +nil+ if there was no change.
9747 * Refer to String#strip for the definition of whitespace.
9749 * " hello ".strip! #=> "hello"
9750 * "hello".strip! #=> nil
9753 static VALUE
9754 rb_str_strip_bang(VALUE str)
9756 char *start;
9757 long olen, loffset, roffset;
9758 rb_encoding *enc;
9760 str_modify_keep_cr(str);
9761 enc = STR_ENC_GET(str);
9762 RSTRING_GETMEM(str, start, olen);
9763 loffset = lstrip_offset(str, start, start+olen, enc);
9764 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9766 if (loffset > 0 || roffset > 0) {
9767 long len = olen-roffset;
9768 if (loffset > 0) {
9769 len -= loffset;
9770 memmove(start, start + loffset, len);
9772 STR_SET_LEN(str, len);
9773 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9774 return str;
9776 return Qnil;
9781 * call-seq:
9782 * str.strip -> new_str
9784 * Returns a copy of the receiver with leading and trailing whitespace removed.
9786 * Whitespace is defined as any of the following characters:
9787 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9789 * " hello ".strip #=> "hello"
9790 * "\tgoodbye\r\n".strip #=> "goodbye"
9791 * "\x00\t\n\v\f\r ".strip #=> ""
9792 * "hello".strip #=> "hello"
9795 static VALUE
9796 rb_str_strip(VALUE str)
9798 char *start;
9799 long olen, loffset, roffset;
9800 rb_encoding *enc = STR_ENC_GET(str);
9802 RSTRING_GETMEM(str, start, olen);
9803 loffset = lstrip_offset(str, start, start+olen, enc);
9804 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9806 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9807 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9810 static VALUE
9811 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9813 VALUE result, match;
9814 struct re_registers *regs;
9815 int i;
9816 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9817 if (pos >= 0) {
9818 if (BUILTIN_TYPE(pat) == T_STRING) {
9819 regs = NULL;
9820 end = pos + RSTRING_LEN(pat);
9822 else {
9823 match = rb_backref_get();
9824 regs = RMATCH_REGS(match);
9825 pos = BEG(0);
9826 end = END(0);
9828 if (pos == end) {
9829 rb_encoding *enc = STR_ENC_GET(str);
9831 * Always consume at least one character of the input string
9833 if (RSTRING_LEN(str) > end)
9834 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9835 RSTRING_END(str), enc);
9836 else
9837 *start = end + 1;
9839 else {
9840 *start = end;
9842 if (!regs || regs->num_regs == 1) {
9843 result = rb_str_subseq(str, pos, end - pos);
9844 return result;
9846 result = rb_ary_new2(regs->num_regs);
9847 for (i=1; i < regs->num_regs; i++) {
9848 VALUE s = Qnil;
9849 if (BEG(i) >= 0) {
9850 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9852 rb_ary_push(result, s);
9855 return result;
9857 return Qnil;
9862 * call-seq:
9863 * str.scan(pattern) -> array
9864 * str.scan(pattern) {|match, ...| block } -> str
9866 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9867 * Regexp or a String). For each match, a result is
9868 * generated and either added to the result array or passed to the block. If
9869 * the pattern contains no groups, each individual result consists of the
9870 * matched string, <code>$&</code>. If the pattern contains groups, each
9871 * individual result is itself an array containing one entry per group.
9873 * a = "cruel world"
9874 * a.scan(/\w+/) #=> ["cruel", "world"]
9875 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9876 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9877 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9879 * And the block form:
9881 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9882 * print "\n"
9883 * a.scan(/(.)(.)/) {|x,y| print y, x }
9884 * print "\n"
9886 * <em>produces:</em>
9888 * <<cruel>> <<world>>
9889 * rceu lowlr
9892 static VALUE
9893 rb_str_scan(VALUE str, VALUE pat)
9895 VALUE result;
9896 long start = 0;
9897 long last = -1, prev = 0;
9898 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9900 pat = get_pat_quoted(pat, 1);
9901 mustnot_broken(str);
9902 if (!rb_block_given_p()) {
9903 VALUE ary = rb_ary_new();
9905 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9906 last = prev;
9907 prev = start;
9908 rb_ary_push(ary, result);
9910 if (last >= 0) rb_pat_search(pat, str, last, 1);
9911 else rb_backref_set(Qnil);
9912 return ary;
9915 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9916 last = prev;
9917 prev = start;
9918 rb_yield(result);
9919 str_mod_check(str, p, len);
9921 if (last >= 0) rb_pat_search(pat, str, last, 1);
9922 return str;
9927 * call-seq:
9928 * str.hex -> integer
9930 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9931 * (with an optional sign and an optional <code>0x</code>) and returns the
9932 * corresponding number. Zero is returned on error.
9934 * "0x0a".hex #=> 10
9935 * "-1234".hex #=> -4660
9936 * "0".hex #=> 0
9937 * "wombat".hex #=> 0
9940 static VALUE
9941 rb_str_hex(VALUE str)
9943 return rb_str_to_inum(str, 16, FALSE);
9948 * call-seq:
9949 * str.oct -> integer
9951 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9952 * optional sign) and returns the corresponding number. Returns 0 if the
9953 * conversion fails.
9955 * "123".oct #=> 83
9956 * "-377".oct #=> -255
9957 * "bad".oct #=> 0
9958 * "0377bad".oct #=> 255
9960 * If +str+ starts with <code>0</code>, radix indicators are honored.
9961 * See Kernel#Integer.
9964 static VALUE
9965 rb_str_oct(VALUE str)
9967 return rb_str_to_inum(str, -8, FALSE);
9970 #ifndef HAVE_CRYPT_R
9971 # include "ruby/thread_native.h"
9972 # include "ruby/atomic.h"
9974 static struct {
9975 rb_atomic_t initialized;
9976 rb_nativethread_lock_t lock;
9977 } crypt_mutex;
9979 static void
9980 crypt_mutex_destroy(void)
9982 RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
9983 rb_nativethread_lock_destroy(&crypt_mutex.lock);
9984 crypt_mutex.initialized = 0;
9987 static void
9988 crypt_mutex_initialize(void)
9990 rb_atomic_t i;
9991 while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
9992 switch (i) {
9993 case 0:
9994 rb_nativethread_lock_initialize(&crypt_mutex.lock);
9995 atexit(crypt_mutex_destroy);
9996 RUBY_ASSERT(crypt_mutex.initialized == 2);
9997 RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
9998 break;
9999 case 1:
10000 break;
10001 default:
10002 rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
10005 #endif
10008 * call-seq:
10009 * str.crypt(salt_str) -> new_str
10011 * Returns the string generated by calling <code>crypt(3)</code>
10012 * standard library function with <code>str</code> and
10013 * <code>salt_str</code>, in this order, as its arguments. Please do
10014 * not use this method any longer. It is legacy; provided only for
10015 * backward compatibility with ruby scripts in earlier days. It is
10016 * bad to use in contemporary programs for several reasons:
10018 * * Behaviour of C's <code>crypt(3)</code> depends on the OS on which
10019 * it is run. The generated string lacks data portability.
10021 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10022 * (i.e. silently ends up in unexpected results).
10024 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10025 * thread safe.
10027 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10028 * very very weak. According to its manpage, Linux's traditional
10029 * <code>crypt(3)</code> output has only 2**56 variations; too
10030 * easy to brute force today. And this is the default behaviour.
10032 * * In order to make things robust some OSes implement so-called
10033 * "modular" usage. To go through, you have to do a complex
10034 * build-up of the <code>salt_str</code> parameter, by hand.
10035 * Failure in generation of a proper salt string tends not to
10036 * yield any errors; typos in parameters are normally not
10037 * detectable.
10039 * * For instance, in the following example, the second invocation
10040 * of String#crypt is wrong; it has a typo in "round=" (lacks
10041 * "s"). However the call does not fail and something unexpected
10042 * is generated.
10044 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10045 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10047 * * Even in the "modular" mode, some hash functions are considered
10048 * archaic and no longer recommended at all; for instance module
10049 * <code>$1$</code> is officially abandoned by its author: see
10050 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10051 * instance module <code>$3$</code> is considered completely
10052 * broken: see the manpage of FreeBSD.
10054 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
10055 * written above, <code>crypt(3)</code> on Mac OS never fails.
10056 * This means even if you build up a proper salt string it
10057 * generates a traditional DES hash anyway, and there is no way
10058 * for you to be aware of it.
10060 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10062 * If for some reason you cannot migrate to other secure contemporary
10063 * password hashing algorithms, install the string-crypt gem and
10064 * <code>require 'string/crypt'</code> to continue using it.
10067 static VALUE
10068 rb_str_crypt(VALUE str, VALUE salt)
10070 #ifdef HAVE_CRYPT_R
10071 VALUE databuf;
10072 struct crypt_data *data;
10073 # define CRYPT_END() ALLOCV_END(databuf)
10074 #else
10075 extern char *crypt(const char *, const char *);
10076 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10077 #endif
10078 VALUE result;
10079 const char *s, *saltp;
10080 char *res;
10081 #ifdef BROKEN_CRYPT
10082 char salt_8bit_clean[3];
10083 #endif
10085 StringValue(salt);
10086 mustnot_wchar(str);
10087 mustnot_wchar(salt);
10088 s = StringValueCStr(str);
10089 saltp = RSTRING_PTR(salt);
10090 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10091 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10094 #ifdef BROKEN_CRYPT
10095 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10096 salt_8bit_clean[0] = saltp[0] & 0x7f;
10097 salt_8bit_clean[1] = saltp[1] & 0x7f;
10098 salt_8bit_clean[2] = '\0';
10099 saltp = salt_8bit_clean;
10101 #endif
10102 #ifdef HAVE_CRYPT_R
10103 data = ALLOCV(databuf, sizeof(struct crypt_data));
10104 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10105 data->initialized = 0;
10106 # endif
10107 res = crypt_r(s, saltp, data);
10108 #else
10109 crypt_mutex_initialize();
10110 rb_nativethread_lock_lock(&crypt_mutex.lock);
10111 res = crypt(s, saltp);
10112 #endif
10113 if (!res) {
10114 int err = errno;
10115 CRYPT_END();
10116 rb_syserr_fail(err, "crypt");
10118 result = rb_str_new_cstr(res);
10119 CRYPT_END();
10120 return result;
10125 * call-seq:
10126 * str.ord -> integer
10128 * Returns the Integer ordinal of a one-character string.
10130 * "a".ord #=> 97
10133 static VALUE
10134 rb_str_ord(VALUE s)
10136 unsigned int c;
10138 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10139 return UINT2NUM(c);
10142 * call-seq:
10143 * str.sum(n=16) -> integer
10145 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10146 * where <em>n</em> is the optional Integer parameter, defaulting
10147 * to 16. The result is simply the sum of the binary value of each byte in
10148 * <i>str</i> modulo <code>2**n</code> (that is, masked with <code>2**n - 1</code>). This is not a particularly good
10149 * checksum.
10152 static VALUE
10153 rb_str_sum(int argc, VALUE *argv, VALUE str)
10155 int bits = 16;
10156 char *ptr, *p, *pend;
10157 long len;
10158 VALUE sum = INT2FIX(0);
10159 unsigned long sum0 = 0;
10161 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10162 bits = 0;
10164 ptr = p = RSTRING_PTR(str);
10165 len = RSTRING_LEN(str);
10166 pend = p + len;
10168 while (p < pend) {
10169 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10170 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10171 str_mod_check(str, ptr, len);
10172 sum0 = 0;
10174 sum0 += (unsigned char)*p;
10175 p++;
10178 if (bits == 0) {
10179 if (sum0) {
10180 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10183 else {
10184 if (sum == INT2FIX(0)) {
10185 if (bits < (int)sizeof(long)*CHAR_BIT) {
10186 sum0 &= (((unsigned long)1)<<bits)-1;
10188 sum = LONG2FIX(sum0);
10190 else {
10191 VALUE mod;
10193 if (sum0) {
10194 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10197 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10198 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10199 sum = rb_funcall(sum, '&', 1, mod);
10202 return sum;
10205 static VALUE
10206 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10208 rb_encoding *enc;
10209 VALUE w;
10210 long width, len, flen = 1, fclen = 1;
10211 VALUE res;
10212 char *p;
10213 const char *f = " ";
10214 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10215 VALUE pad;
10216 int singlebyte = 1, cr;
10217 int termlen;
10219 rb_scan_args(argc, argv, "11", &w, &pad);
10220 enc = STR_ENC_GET(str);
10221 termlen = rb_enc_mbminlen(enc);
10222 width = NUM2LONG(w);
10223 if (argc == 2) {
10224 StringValue(pad);
10225 enc = rb_enc_check(str, pad);
10226 f = RSTRING_PTR(pad);
10227 flen = RSTRING_LEN(pad);
10228 fclen = str_strlen(pad, enc); /* rb_enc_check */
10229 singlebyte = single_byte_optimizable(pad);
10230 if (flen == 0 || fclen == 0) {
10231 rb_raise(rb_eArgError, "zero width padding");
10234 len = str_strlen(str, enc); /* rb_enc_check */
10235 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10236 n = width - len;
10237 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10238 rlen = n - llen;
10239 cr = ENC_CODERANGE(str);
10240 if (flen > 1) {
10241 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10242 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10244 size = RSTRING_LEN(str);
10245 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10246 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10247 (len += llen2 + rlen2) >= LONG_MAX - size) {
10248 rb_raise(rb_eArgError, "argument too big");
10250 len += size;
10251 res = str_new0(rb_cString, 0, len, termlen);
10252 p = RSTRING_PTR(res);
10253 if (flen <= 1) {
10254 memset(p, *f, llen);
10255 p += llen;
10257 else {
10258 while (llen >= fclen) {
10259 memcpy(p,f,flen);
10260 p += flen;
10261 llen -= fclen;
10263 if (llen > 0) {
10264 memcpy(p, f, llen2);
10265 p += llen2;
10268 memcpy(p, RSTRING_PTR(str), size);
10269 p += size;
10270 if (flen <= 1) {
10271 memset(p, *f, rlen);
10272 p += rlen;
10274 else {
10275 while (rlen >= fclen) {
10276 memcpy(p,f,flen);
10277 p += flen;
10278 rlen -= fclen;
10280 if (rlen > 0) {
10281 memcpy(p, f, rlen2);
10282 p += rlen2;
10285 TERM_FILL(p, termlen);
10286 STR_SET_LEN(res, p-RSTRING_PTR(res));
10287 rb_enc_associate(res, enc);
10288 if (argc == 2)
10289 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10290 if (cr != ENC_CODERANGE_BROKEN)
10291 ENC_CODERANGE_SET(res, cr);
10293 RB_GC_GUARD(pad);
10294 return res;
10299 * call-seq:
10300 * str.ljust(integer, padstr=' ') -> new_str
10302 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10303 * String of length <i>integer</i> with <i>str</i> left justified
10304 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10306 * "hello".ljust(4) #=> "hello"
10307 * "hello".ljust(20) #=> "hello "
10308 * "hello".ljust(20, '1234') #=> "hello123412341234123"
10311 static VALUE
10312 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10314 return rb_str_justify(argc, argv, str, 'l');
10319 * call-seq:
10320 * str.rjust(integer, padstr=' ') -> new_str
10322 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10323 * String of length <i>integer</i> with <i>str</i> right justified
10324 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10326 * "hello".rjust(4) #=> "hello"
10327 * "hello".rjust(20) #=> " hello"
10328 * "hello".rjust(20, '1234') #=> "123412341234123hello"
10331 static VALUE
10332 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10334 return rb_str_justify(argc, argv, str, 'r');
10339 * call-seq:
10340 * str.center(width, padstr=' ') -> new_str
10342 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10343 * returns a new String of length +width+ with +str+ centered and padded with
10344 * +padstr+; otherwise, returns +str+.
10346 * "hello".center(4) #=> "hello"
10347 * "hello".center(20) #=> " hello "
10348 * "hello".center(20, '123') #=> "1231231hello12312312"
10351 static VALUE
10352 rb_str_center(int argc, VALUE *argv, VALUE str)
10354 return rb_str_justify(argc, argv, str, 'c');
10358 * call-seq:
10359 * str.partition(sep) -> [head, sep, tail]
10360 * str.partition(regexp) -> [head, match, tail]
10362 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10363 * and returns the part before it, the match, and the part
10364 * after it.
10365 * If it is not found, returns <i>str</i> and two empty strings.
10367 * "hello".partition("l") #=> ["he", "l", "lo"]
10368 * "hello".partition("x") #=> ["hello", "", ""]
10369 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
10372 static VALUE
10373 rb_str_partition(VALUE str, VALUE sep)
10375 long pos;
10377 sep = get_pat_quoted(sep, 0);
10378 if (RB_TYPE_P(sep, T_REGEXP)) {
10379 if (rb_reg_search(sep, str, 0, 0) < 0) {
10380 goto failed;
10382 VALUE match = rb_backref_get();
10383 struct re_registers *regs = RMATCH_REGS(match);
10385 pos = BEG(0);
10386 sep = rb_str_subseq(str, pos, END(0) - pos);
10388 else {
10389 pos = rb_str_index(str, sep, 0);
10390 if (pos < 0) goto failed;
10392 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10393 sep,
10394 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10395 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10397 failed:
10398 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10402 * call-seq:
10403 * str.rpartition(sep) -> [head, sep, tail]
10404 * str.rpartition(regexp) -> [head, match, tail]
10406 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10407 * of the string, and returns the part before it, the match, and the part
10408 * after it.
10409 * If it is not found, returns two empty strings and <i>str</i>.
10411 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10412 * "hello".rpartition("x") #=> ["", "", "hello"]
10413 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10415 * The match from the end means starting at the possible last position, not
10416 * the last of longest matches.
10418 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10420 * To partition at the last longest match, combine it with a
10421 * negative lookbehind.
10423 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10425 * Or use String#partition with a negative lookahead.
10427 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
10430 static VALUE
10431 rb_str_rpartition(VALUE str, VALUE sep)
10433 long pos = RSTRING_LEN(str);
10435 sep = get_pat_quoted(sep, 0);
10436 if (RB_TYPE_P(sep, T_REGEXP)) {
10437 if (rb_reg_search(sep, str, pos, 1) < 0) {
10438 goto failed;
10440 VALUE match = rb_backref_get();
10441 struct re_registers *regs = RMATCH_REGS(match);
10443 pos = BEG(0);
10444 sep = rb_str_subseq(str, pos, END(0) - pos);
10446 else {
10447 pos = rb_str_sublen(str, pos);
10448 pos = rb_str_rindex(str, sep, pos);
10449 if (pos < 0) {
10450 goto failed;
10452 pos = rb_str_offset(str, pos);
10455 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10456 sep,
10457 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10458 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10459 failed:
10460 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10464 * call-seq:
10465 * str.start_with?([prefixes]+) -> true or false
10467 * Returns true if +str+ starts with one of the +prefixes+ given.
10468 * Each of the +prefixes+ should be a String or a Regexp.
10470 * "hello".start_with?("hell") #=> true
10471 * "hello".start_with?(/H/i) #=> true
10473 * # returns true if one of the prefixes matches.
10474 * "hello".start_with?("heaven", "hell") #=> true
10475 * "hello".start_with?("heaven", "paradise") #=> false
10478 static VALUE
10479 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10481 int i;
10483 for (i=0; i<argc; i++) {
10484 VALUE tmp = argv[i];
10485 if (RB_TYPE_P(tmp, T_REGEXP)) {
10486 if (rb_reg_start_with_p(tmp, str))
10487 return Qtrue;
10489 else {
10490 StringValue(tmp);
10491 rb_enc_check(str, tmp);
10492 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10493 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10494 return Qtrue;
10497 return Qfalse;
10501 * call-seq:
10502 * str.end_with?([suffixes]+) -> true or false
10504 * Returns true if +str+ ends with one of the +suffixes+ given.
10506 * "hello".end_with?("ello") #=> true
10508 * # returns true if one of the +suffixes+ matches.
10509 * "hello".end_with?("heaven", "ello") #=> true
10510 * "hello".end_with?("heaven", "paradise") #=> false
10513 static VALUE
10514 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10516 int i;
10517 char *p, *s, *e;
10518 rb_encoding *enc;
10520 for (i=0; i<argc; i++) {
10521 VALUE tmp = argv[i];
10522 long slen, tlen;
10523 StringValue(tmp);
10524 enc = rb_enc_check(str, tmp);
10525 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10526 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10527 p = RSTRING_PTR(str);
10528 e = p + slen;
10529 s = e - tlen;
10530 if (rb_enc_left_char_head(p, s, e, enc) != s)
10531 continue;
10532 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10533 return Qtrue;
10535 return Qfalse;
10539 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10540 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10542 * @param str the target
10543 * @param prefix the prefix
10544 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10545 * @retval Positive-Integer otherwise
10547 static long
10548 deleted_prefix_length(VALUE str, VALUE prefix)
10550 char *strptr, *prefixptr;
10551 long olen, prefixlen;
10553 StringValue(prefix);
10554 if (is_broken_string(prefix)) return 0;
10555 rb_enc_check(str, prefix);
10557 /* return 0 if not start with prefix */
10558 prefixlen = RSTRING_LEN(prefix);
10559 if (prefixlen <= 0) return 0;
10560 olen = RSTRING_LEN(str);
10561 if (olen < prefixlen) return 0;
10562 strptr = RSTRING_PTR(str);
10563 prefixptr = RSTRING_PTR(prefix);
10564 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10566 return prefixlen;
10570 * call-seq:
10571 * str.delete_prefix!(prefix) -> self or nil
10573 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10574 * <code>nil</code> if no change was made.
10576 * "hello".delete_prefix!("hel") #=> "lo"
10577 * "hello".delete_prefix!("llo") #=> nil
10580 static VALUE
10581 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10583 long prefixlen;
10584 str_modify_keep_cr(str);
10586 prefixlen = deleted_prefix_length(str, prefix);
10587 if (prefixlen <= 0) return Qnil;
10589 return rb_str_drop_bytes(str, prefixlen);
10593 * call-seq:
10594 * str.delete_prefix(prefix) -> new_str
10596 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10598 * "hello".delete_prefix("hel") #=> "lo"
10599 * "hello".delete_prefix("llo") #=> "hello"
10602 static VALUE
10603 rb_str_delete_prefix(VALUE str, VALUE prefix)
10605 long prefixlen;
10607 prefixlen = deleted_prefix_length(str, prefix);
10608 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10610 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10614 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10615 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10617 * @param str the target
10618 * @param suffix the suffix
10619 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10620 * @retval Positive-Integer otherwise
10622 static long
10623 deleted_suffix_length(VALUE str, VALUE suffix)
10625 char *strptr, *suffixptr, *s;
10626 long olen, suffixlen;
10627 rb_encoding *enc;
10629 StringValue(suffix);
10630 if (is_broken_string(suffix)) return 0;
10631 enc = rb_enc_check(str, suffix);
10633 /* return 0 if not start with suffix */
10634 suffixlen = RSTRING_LEN(suffix);
10635 if (suffixlen <= 0) return 0;
10636 olen = RSTRING_LEN(str);
10637 if (olen < suffixlen) return 0;
10638 strptr = RSTRING_PTR(str);
10639 suffixptr = RSTRING_PTR(suffix);
10640 s = strptr + olen - suffixlen;
10641 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10642 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10644 return suffixlen;
10648 * call-seq:
10649 * str.delete_suffix!(suffix) -> self or nil
10651 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10652 * <code>nil</code> if no change was made.
10654 * "hello".delete_suffix!("llo") #=> "he"
10655 * "hello".delete_suffix!("hel") #=> nil
10658 static VALUE
10659 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10661 long olen, suffixlen, len;
10662 str_modifiable(str);
10664 suffixlen = deleted_suffix_length(str, suffix);
10665 if (suffixlen <= 0) return Qnil;
10667 olen = RSTRING_LEN(str);
10668 str_modify_keep_cr(str);
10669 len = olen - suffixlen;
10670 STR_SET_LEN(str, len);
10671 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10672 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10673 ENC_CODERANGE_CLEAR(str);
10675 return str;
10679 * call-seq:
10680 * str.delete_suffix(suffix) -> new_str
10682 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10684 * "hello".delete_suffix("llo") #=> "he"
10685 * "hello".delete_suffix("hel") #=> "hello"
10688 static VALUE
10689 rb_str_delete_suffix(VALUE str, VALUE suffix)
10691 long suffixlen;
10693 suffixlen = deleted_suffix_length(str, suffix);
10694 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10696 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10699 void
10700 rb_str_setter(VALUE val, ID id, VALUE *var)
10702 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10703 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10705 *var = val;
10708 static void
10709 rb_fs_setter(VALUE val, ID id, VALUE *var)
10711 val = rb_fs_check(val);
10712 if (!val) {
10713 rb_raise(rb_eTypeError,
10714 "value of %"PRIsVALUE" must be String or Regexp",
10715 rb_id2str(id));
10717 if (!NIL_P(val)) {
10718 rb_warn_deprecated("`$;'", NULL);
10720 *var = val;
10725 * call-seq:
10726 * str.force_encoding(encoding) -> str
10728 * Changes the encoding to +encoding+ and returns self.
10731 static VALUE
10732 rb_str_force_encoding(VALUE str, VALUE enc)
10734 str_modifiable(str);
10735 rb_enc_associate(str, rb_to_encoding(enc));
10736 ENC_CODERANGE_CLEAR(str);
10737 return str;
10741 * call-seq:
10742 * str.b -> str
10744 * Returns a copied string whose encoding is ASCII-8BIT.
10747 static VALUE
10748 rb_str_b(VALUE str)
10750 VALUE str2;
10751 if (FL_TEST(str, STR_NOEMBED)) {
10752 str2 = str_alloc_heap(rb_cString);
10754 else {
10755 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10757 str_replace_shared_without_enc(str2, str);
10758 ENC_CODERANGE_CLEAR(str2);
10759 return str2;
10763 * call-seq:
10764 * str.valid_encoding? -> true or false
10766 * Returns true for a string which is encoded correctly.
10768 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10769 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10770 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10773 static VALUE
10774 rb_str_valid_encoding_p(VALUE str)
10776 int cr = rb_enc_str_coderange(str);
10778 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10782 * call-seq:
10783 * str.ascii_only? -> true or false
10785 * Returns true for a string which has only ASCII characters.
10787 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10788 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10791 static VALUE
10792 rb_str_is_ascii_only_p(VALUE str)
10794 int cr = rb_enc_str_coderange(str);
10796 return RBOOL(cr == ENC_CODERANGE_7BIT);
10799 VALUE
10800 rb_str_ellipsize(VALUE str, long len)
10802 static const char ellipsis[] = "...";
10803 const long ellipsislen = sizeof(ellipsis) - 1;
10804 rb_encoding *const enc = rb_enc_get(str);
10805 const long blen = RSTRING_LEN(str);
10806 const char *const p = RSTRING_PTR(str), *e = p + blen;
10807 VALUE estr, ret = 0;
10809 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10810 if (len * rb_enc_mbminlen(enc) >= blen ||
10811 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10812 ret = str;
10814 else if (len <= ellipsislen ||
10815 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10816 if (rb_enc_asciicompat(enc)) {
10817 ret = rb_str_new(ellipsis, len);
10818 rb_enc_associate(ret, enc);
10820 else {
10821 estr = rb_usascii_str_new(ellipsis, len);
10822 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10825 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10826 rb_str_cat(ret, ellipsis, ellipsislen);
10828 else {
10829 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10830 rb_enc_from_encoding(enc), 0, Qnil);
10831 rb_str_append(ret, estr);
10833 return ret;
10836 static VALUE
10837 str_compat_and_valid(VALUE str, rb_encoding *enc)
10839 int cr;
10840 str = StringValue(str);
10841 cr = rb_enc_str_coderange(str);
10842 if (cr == ENC_CODERANGE_BROKEN) {
10843 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10845 else {
10846 rb_encoding *e = STR_ENC_GET(str);
10847 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10848 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10849 rb_enc_name(enc), rb_enc_name(e));
10852 return str;
10855 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10857 VALUE
10858 rb_str_scrub(VALUE str, VALUE repl)
10860 rb_encoding *enc = STR_ENC_GET(str);
10861 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10864 VALUE
10865 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10867 int cr = ENC_CODERANGE_UNKNOWN;
10868 if (enc == STR_ENC_GET(str)) {
10869 /* cached coderange makes sense only when enc equals the
10870 * actual encoding of str */
10871 cr = ENC_CODERANGE(str);
10873 return enc_str_scrub(enc, str, repl, cr);
/*
 * Core of the scrub family.
 *
 * Walks str as text in encoding enc and builds a new string in which each
 * run of invalid bytes is replaced by repl, by the value of the given block,
 * or by a built-in default replacement.  cr is the caller's idea of str's
 * coderange.
 *
 * Returns Qnil when nothing has to be replaced (cr is already clean, enc is
 * a dummy encoding, or the scan reaches the end without finding invalid
 * bytes); otherwise returns the newly built string.
 * Raises ArgumentError when both a block and a non-nil repl are given.
 */
10876 static VALUE
10877 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10879 int encidx;
10880 VALUE buf = Qnil;
10881 const char *rep, *p, *e, *p1, *sp;
/* replen: -1 = not decided yet, 0 = "ask the block", otherwise byte length of rep. */
10882 long replen = -1;
10883 long slen;
10885 if (rb_block_given_p()) {
10886 if (!NIL_P(repl))
10887 rb_raise(rb_eArgError, "both of block and replacement given");
10888 replen = 0;
/* Nothing to do when the supplied coderange is already clean. */
10891 if (ENC_CODERANGE_CLEAN_P(cr))
10892 return Qnil;
10894 if (!NIL_P(repl)) {
10895 repl = str_compat_and_valid(repl, enc);
10898 if (rb_enc_dummy_p(enc)) {
10899 return Qnil;
10901 encidx = rb_enc_to_index(enc);
/* Points rep/replen at a static copy of the literal (without its NUL). */
10903 #define DEFAULT_REPLACE_CHAR(str) do { \
10904 static const char replace[sizeof(str)-1] = str; \
10905 rep = replace; replen = (int)sizeof(replace); \
10906 } while (0)
/* sp/slen remember the original buffer so str_mod_check() can detect
 * modification of str by the block; p scans, p1 marks the start of the
 * bytes not yet copied to buf. */
10908 slen = RSTRING_LEN(str);
10909 p = RSTRING_PTR(str);
10910 e = RSTRING_END(str);
10911 p1 = p;
10912 sp = p;
10914 if (rb_enc_asciicompat(enc)) {
/* rep7bit_p: whether appending rep keeps the result 7-bit. */
10915 int rep7bit_p;
10916 if (!replen) {
10917 rep = NULL;
10918 rep7bit_p = FALSE;
10920 else if (!NIL_P(repl)) {
10921 rep = RSTRING_PTR(repl);
10922 replen = RSTRING_LEN(repl);
10923 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10925 else if (encidx == rb_utf8_encindex()) {
10926 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10927 rep7bit_p = FALSE;
10929 else {
10930 DEFAULT_REPLACE_CHAR("?");
10931 rep7bit_p = TRUE;
/* The coderange of the result is recomputed from scratch while scanning. */
10933 cr = ENC_CODERANGE_7BIT;
/* Jump straight to the first non-ASCII byte, if any. */
10935 p = search_nonascii(p, e);
10936 if (!p) {
10937 p = e;
10939 while (p < e) {
10940 int ret = rb_enc_precise_mbclen(p, e, enc);
10941 if (MBCLEN_NEEDMORE_P(ret)) {
10942 break;
10944 else if (MBCLEN_CHARFOUND_P(ret)) {
10945 cr = ENC_CODERANGE_VALID;
10946 p += MBCLEN_CHARFOUND_LEN(ret);
10948 else if (MBCLEN_INVALID_P(ret)) {
10950 * p1~p: valid ascii/multibyte chars
10951 * p ~e: invalid bytes + unknown bytes
10953 long clen = rb_enc_mbmaxlen(enc);
10954 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10955 if (p > p1) {
10956 rb_str_buf_cat(buf, p1, p - p1);
/* Decide how many bytes (clen) form this invalid run: probe shrinking
 * windows until one is reported as "needs more" rather than invalid. */
10959 if (e - p < clen) clen = e - p;
10960 if (clen <= 2) {
10961 clen = 1;
10963 else {
10964 const char *q = p;
10965 clen--;
10966 for (; clen > 1; clen--) {
10967 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10968 if (MBCLEN_NEEDMORE_P(ret)) break;
10969 if (MBCLEN_INVALID_P(ret)) continue;
10970 UNREACHABLE;
10973 if (rep) {
10974 rb_str_buf_cat(buf, rep, replen);
10975 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10977 else {
/* Block form: yield the invalid bytes, then verify str was not modified
 * and that the block's value is usable with enc. */
10978 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10979 str_mod_check(str, sp, slen);
10980 repl = str_compat_and_valid(repl, enc);
10981 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10982 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10983 cr = ENC_CODERANGE_VALID;
10985 p += clen;
10986 p1 = p;
10987 p = search_nonascii(p, e);
10988 if (!p) {
10989 p = e;
10990 break;
10993 else {
10994 UNREACHABLE;
/* No replacement happened: if the scan consumed everything, cache the
 * computed coderange on str and report "nothing to scrub". */
10997 if (NIL_P(buf)) {
10998 if (p == e) {
10999 ENC_CODERANGE_SET(str, cr);
11000 return Qnil;
11002 buf = rb_str_buf_new(RSTRING_LEN(str));
11004 if (p1 < p) {
11005 rb_str_buf_cat(buf, p1, p - p1);
/* Bytes left after the loop broke out on "needs more": replace them as one run. */
11007 if (p < e) {
11008 if (rep) {
11009 rb_str_buf_cat(buf, rep, replen);
11010 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11012 else {
11013 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11014 str_mod_check(str, sp, slen);
11015 repl = str_compat_and_valid(repl, enc);
11016 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11017 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11018 cr = ENC_CODERANGE_VALID;
11022 else {
11023 /* ASCII incompatible */
/* Same algorithm as above, but stepping in units of mbminlen bytes and
 * without the non-ASCII fast skip or 7-bit tracking. */
11024 long mbminlen = rb_enc_mbminlen(enc);
11025 if (!replen) {
11026 rep = NULL;
11028 else if (!NIL_P(repl)) {
11029 rep = RSTRING_PTR(repl);
11030 replen = RSTRING_LEN(repl);
11032 else if (encidx == ENCINDEX_UTF_16BE) {
11033 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11035 else if (encidx == ENCINDEX_UTF_16LE) {
11036 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11038 else if (encidx == ENCINDEX_UTF_32BE) {
11039 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11041 else if (encidx == ENCINDEX_UTF_32LE) {
11042 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11044 else {
11045 DEFAULT_REPLACE_CHAR("?");
11048 while (p < e) {
11049 int ret = rb_enc_precise_mbclen(p, e, enc);
11050 if (MBCLEN_NEEDMORE_P(ret)) {
11051 break;
11053 else if (MBCLEN_CHARFOUND_P(ret)) {
11054 p += MBCLEN_CHARFOUND_LEN(ret);
11056 else if (MBCLEN_INVALID_P(ret)) {
11057 const char *q = p;
11058 long clen = rb_enc_mbmaxlen(enc);
11059 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11060 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11062 if (e - p < clen) clen = e - p;
11063 if (clen <= mbminlen * 2) {
11064 clen = mbminlen;
11066 else {
11067 clen -= mbminlen;
11068 for (; clen > mbminlen; clen-=mbminlen) {
11069 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11070 if (MBCLEN_NEEDMORE_P(ret)) break;
11071 if (MBCLEN_INVALID_P(ret)) continue;
11072 UNREACHABLE;
11075 if (rep) {
11076 rb_str_buf_cat(buf, rep, replen);
11078 else {
11079 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11080 str_mod_check(str, sp, slen);
11081 repl = str_compat_and_valid(repl, enc);
11082 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11084 p += clen;
11085 p1 = p;
11087 else {
11088 UNREACHABLE;
11091 if (NIL_P(buf)) {
11092 if (p == e) {
11093 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
11094 return Qnil;
11096 buf = rb_str_buf_new(RSTRING_LEN(str));
11098 if (p1 < p) {
11099 rb_str_buf_cat(buf, p1, p - p1);
11101 if (p < e) {
11102 if (rep) {
11103 rb_str_buf_cat(buf, rep, replen);
11105 else {
11106 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11107 str_mod_check(str, sp, slen);
11108 repl = str_compat_and_valid(repl, enc);
11109 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11112 cr = ENC_CODERANGE_VALID;
/* Tag the result with enc and the coderange computed above. */
11114 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11115 return buf;
11119 * call-seq:
11120 * str.scrub -> new_str
11121 * str.scrub(repl) -> new_str
11122 * str.scrub{|bytes|} -> new_str
11124 * If the string contains an invalid byte sequence, replaces the invalid bytes
11125 * with the given replacement character; otherwise returns a copy of the string.
11126 * If a block is given, replaces the invalid bytes with the value returned by the block.
11128 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11129 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11130 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11132 static VALUE
11133 str_scrub(int argc, VALUE *argv, VALUE str)
11135 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11136 VALUE new = rb_str_scrub(str, repl);
11137 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11141 * call-seq:
11142 * str.scrub! -> str
11143 * str.scrub!(repl) -> str
11144 * str.scrub!{|bytes|} -> str
11146 * If the string contains an invalid byte sequence, replaces the invalid bytes
11147 * with the given replacement character in place; in either case returns self.
11148 * If a block is given, replaces the invalid bytes with the value returned by the block.
11150 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11151 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11152 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11154 static VALUE
11155 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11157 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11158 VALUE new = rb_str_scrub(str, repl);
11159 if (!NIL_P(new)) rb_str_replace(str, new);
11160 return str;
11163 static ID id_normalize;
11164 static ID id_normalized_p;
11165 static VALUE mUnicodeNormalize;
11167 static VALUE
11168 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11170 static int UnicodeNormalizeRequired = 0;
11171 VALUE argv2[2];
11173 if (!UnicodeNormalizeRequired) {
11174 rb_require("unicode_normalize/normalize.rb");
11175 UnicodeNormalizeRequired = 1;
11177 argv2[0] = str;
11178 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11179 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11183 * call-seq:
11184 * str.unicode_normalize(form=:nfc)
11186 * Unicode Normalization---Returns a normalized form of +str+,
11187 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11188 * The normalization form used is determined by +form+, which can
11189 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11190 * The default is +:nfc+.
11192 * If the string is not in a Unicode Encoding, then an Exception is raised.
11193 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11194 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11195 * Anything other than UTF-8 is implemented by converting to UTF-8,
11196 * which makes it slower than UTF-8.
11198 * "a\u0300".unicode_normalize #=> "\u00E0"
11199 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11200 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11201 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11202 * #=> Encoding::CompatibilityError raised
11204 static VALUE
11205 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11207 return unicode_normalize_common(argc, argv, str, id_normalize);
11211 * call-seq:
11212 * str.unicode_normalize!(form=:nfc)
11214 * Destructive version of String#unicode_normalize, doing Unicode
11215 * normalization in place.
11217 static VALUE
11218 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11220 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11223 /* call-seq:
11224 * str.unicode_normalized?(form=:nfc)
11226 * Checks whether +str+ is in Unicode normalization form +form+,
11227 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11228 * The default is +:nfc+.
11230 * If the string is not in a Unicode Encoding, then an Exception is raised.
11231 * For details, see String#unicode_normalize.
11233 * "a\u0300".unicode_normalized? #=> false
11234 * "a\u0300".unicode_normalized?(:nfd) #=> true
11235 * "\u00E0".unicode_normalized? #=> true
11236 * "\u00E0".unicode_normalized?(:nfd) #=> false
11237 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11238 * #=> Encoding::CompatibilityError raised
11240 static VALUE
11241 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11243 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11246 /**********************************************************************
11247 * Document-class: Symbol
11249 * Symbol objects represent named identifiers inside the Ruby interpreter.
11251 * You can create a \Symbol object explicitly with:
11253 * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11255 * The same Symbol object will be
11256 * created for a given name or string for the duration of a program's
11257 * execution, regardless of the context or meaning of that name. Thus
11258 * if <code>Fred</code> is a constant in one context, a method in
11259 * another, and a class in a third, the Symbol <code>:Fred</code>
11260 * will be the same object in all three contexts.
11262 * module One
11263 * class Fred
11264 * end
11265 * $f1 = :Fred
11266 * end
11267 * module Two
11268 * Fred = 1
11269 * $f2 = :Fred
11270 * end
11271 * def Fred()
11272 * end
11273 * $f3 = :Fred
11274 * $f1.object_id #=> 2514190
11275 * $f2.object_id #=> 2514190
11276 * $f3.object_id #=> 2514190
11278 * Constant, method, and variable names are returned as symbols:
11280 * module One
11281 * Two = 2
11282 * def three; 3 end
11283 * @four = 4
11284 * @@five = 5
11285 * $six = 6
11286 * end
11287 * seven = 7
11289 * One.constants
11290 * # => [:Two]
11291 * One.instance_methods(true)
11292 * # => [:three]
11293 * One.instance_variables
11294 * # => [:@four]
11295 * One.class_variables
11296 * # => [:@@five]
11297 * global_variables.grep(/six/)
11298 * # => [:$six]
11299 * local_variables
11300 * # => [:seven]
11302 * Symbol objects are different from String objects in that
11303 * Symbol objects represent identifiers, while String objects
11304 * represent text or data.
11306 * == What's Here
11308 * First, what's elsewhere. \Class \Symbol:
11310 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11311 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11313 * Here, class \Symbol provides methods that are useful for:
11315 * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11316 * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11317 * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11319 * === Methods for Querying
11321 * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11322 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11323 * in symbol that matches a given Regexp
11324 * or other object; returns +nil+ if no match is found.
11325 * - #[], #slice :: Returns a substring of symbol
11326 * determined by a given index, start/length, or range, or string.
11327 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11328 * - #encoding:: Returns the Encoding object that represents the encoding
11329 * of symbol.
11330 * - #end_with?:: Returns +true+ if symbol ends with
11331 * any of the given strings.
11332 * - #match:: Returns a MatchData object if symbol
11333 * matches a given Regexp; +nil+ otherwise.
11334 * - #match?:: Returns +true+ if symbol
11335 * matches a given Regexp; +false+ otherwise.
11336 * - #length, #size:: Returns the number of characters in symbol.
11337 * - #start_with?:: Returns +true+ if symbol starts with
11338 * any of the given strings.
11340 * === Methods for Comparing
11342 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11343 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11344 * has the same content and encoding.
11345 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11346 * symbol is smaller than, equal to, or larger than symbol.
11347 * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11348 * after Unicode case folding; +false+ otherwise.
11350 * === Methods for Converting
11352 * - #capitalize:: Returns symbol with the first character upcased
11353 * and all other characters downcased.
11354 * - #downcase:: Returns symbol with all characters downcased.
11355 * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11356 * - #name:: Returns the frozen string corresponding to symbol.
11357 * - #succ, #next:: Returns the symbol that is the successor to symbol.
11358 * - #swapcase:: Returns symbol with all upcase characters downcased
11359 * and all downcase characters upcased.
11360 * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11361 * - #to_s, #id2name:: Returns the string corresponding to +self+.
11362 * - #to_sym, #intern:: Returns +self+.
11363 * - #upcase:: Returns symbol with all characters upcased.
11369 * call-seq:
11370 * sym == obj -> true or false
11372 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11373 * symbol, returns <code>true</code>.
11376 #define sym_equal rb_obj_equal /* sym_equal is only another name for rb_obj_equal; no Symbol-specific comparison is defined here */
11378 static int
11379 sym_printable(const char *s, const char *send, rb_encoding *enc)
11381 while (s < send) {
11382 int n;
11383 int c = rb_enc_precise_mbclen(s, send, enc);
11385 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11386 n = MBCLEN_CHARFOUND_LEN(c);
11387 c = rb_enc_mbc_to_codepoint(s, send, enc);
11388 if (!rb_enc_isprint(c, enc)) return FALSE;
11389 s += n;
11391 return TRUE;
11395 rb_str_symname_p(VALUE sym)
11397 rb_encoding *enc;
11398 const char *ptr;
11399 long len;
11400 rb_encoding *resenc = rb_default_internal_encoding();
11402 if (resenc == NULL) resenc = rb_default_external_encoding();
11403 enc = STR_ENC_GET(sym);
11404 ptr = RSTRING_PTR(sym);
11405 len = RSTRING_LEN(sym);
11406 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11407 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11408 return FALSE;
11410 return TRUE;
11413 VALUE
11414 rb_str_quote_unprintable(VALUE str)
11416 rb_encoding *enc;
11417 const char *ptr;
11418 long len;
11419 rb_encoding *resenc;
11421 Check_Type(str, T_STRING);
11422 resenc = rb_default_internal_encoding();
11423 if (resenc == NULL) resenc = rb_default_external_encoding();
11424 enc = STR_ENC_GET(str);
11425 ptr = RSTRING_PTR(str);
11426 len = RSTRING_LEN(str);
11427 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11428 !sym_printable(ptr, ptr + len, enc)) {
11429 return rb_str_escape(str);
11431 return str;
11434 MJIT_FUNC_EXPORTED VALUE
11435 rb_id_quote_unprintable(ID id)
11437 VALUE str = rb_id2str(id);
11438 if (!rb_str_symname_p(str)) {
11439 return rb_str_escape(str);
11441 return str;
11445 * call-seq:
11446 * sym.inspect -> string
11448 * Returns the representation of <i>sym</i> as a symbol literal.
11450 * :fred.inspect #=> ":fred"
11453 static VALUE
11454 sym_inspect(VALUE sym)
11456 VALUE str = rb_sym2str(sym);
11457 const char *ptr;
11458 long len;
11459 char *dest;
11461 if (!rb_str_symname_p(str)) {
11462 str = rb_str_inspect(str);
11463 len = RSTRING_LEN(str);
11464 rb_str_resize(str, len + 1);
11465 dest = RSTRING_PTR(str);
11466 memmove(dest + 1, dest, len);
11468 else {
11469 rb_encoding *enc = STR_ENC_GET(str);
11470 RSTRING_GETMEM(str, ptr, len);
11471 str = rb_enc_str_new(0, len + 1, enc);
11472 dest = RSTRING_PTR(str);
11473 memcpy(dest + 1, ptr, len);
11475 dest[0] = ':';
11476 return str;
11479 #if 0 /* for RDoc */
11481 * call-seq:
11482 * sym.name -> string
11484 * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11485 * returned string is frozen.
11487 * :fred.name #=> "fred"
11488 * :fred.name.frozen? #=> true
11489 * :fred.to_s #=> "fred"
11490 * :fred.to_s.frozen? #=> false
11492 VALUE
11493 rb_sym2str(VALUE sym)
11497 #endif
11501 * call-seq:
11502 * sym.id2name -> string
11503 * sym.to_s -> string
11505 * Returns the name or string corresponding to <i>sym</i>.
11507 * :fred.id2name #=> "fred"
11508 * :ginger.to_s #=> "ginger"
11510 * Note that this string is not frozen (unlike the symbol itself).
11511 * To get a frozen string, use #name.
11515 VALUE
11516 rb_sym_to_s(VALUE sym)
11518 return str_new_shared(rb_cString, rb_sym2str(sym));
11523 * call-seq:
11524 * sym.to_sym -> sym
11525 * sym.intern -> sym
11527 * In general, <code>to_sym</code> returns the Symbol corresponding
11528 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11529 * in this case.
11532 static VALUE
11533 sym_to_sym(VALUE sym)
11535 return sym;
11538 MJIT_FUNC_EXPORTED VALUE
11539 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11541 VALUE obj;
11543 if (argc < 1) {
11544 rb_raise(rb_eArgError, "no receiver given");
11546 obj = argv[0];
11547 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11550 #if 0
11552 * call-seq:
11553 * sym.to_proc
11555 * Returns a _Proc_ object which responds to the method named by _sym_.
11557 * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
11560 VALUE
11561 rb_sym_to_proc(VALUE sym)
11564 #endif
11567 * call-seq:
11569 * sym.succ
11571 * Same as <code>sym.to_s.succ.intern</code>.
11574 static VALUE
11575 sym_succ(VALUE sym)
11577 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11581 * call-seq:
11583 * symbol <=> other_symbol -> -1, 0, +1, or nil
11585 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11586 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11587 * less than, equal to, or greater than +other_symbol+.
11589 * +nil+ is returned if the two values are incomparable.
11591 * See String#<=> for more information.
11594 static VALUE
11595 sym_cmp(VALUE sym, VALUE other)
11597 if (!SYMBOL_P(other)) {
11598 return Qnil;
11600 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11604 * call-seq:
11605 * casecmp(other_symbol) -> -1, 0, 1, or nil
11607 * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11609 * :aBcDeF.casecmp(:abcde) # => 1
11610 * :aBcDeF.casecmp(:abcdef) # => 0
11611 * :aBcDeF.casecmp(:abcdefg) # => -1
11612 * :abcdef.casecmp(:ABCDEF) # => 0
11614 * Returns +nil+ if the two symbols have incompatible encodings,
11615 * or if +other_symbol+ is not a symbol:
11617 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11618 * other_sym = :"\u{c4 d6 dc}"
11619 * sym.casecmp(other_sym) # => nil
11620 * :foo.casecmp(2) # => nil
11622 * Currently, case-insensitivity only works on characters A-Z/a-z,
11623 * not all of Unicode. This is different from Symbol#casecmp?.
11625 * Related: Symbol#casecmp?.
11629 static VALUE
11630 sym_casecmp(VALUE sym, VALUE other)
11632 if (!SYMBOL_P(other)) {
11633 return Qnil;
11635 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11639 * call-seq:
11640 * casecmp?(other_symbol) -> true, false, or nil
11642 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11643 * Unicode case folding, +false+ if they are not equal:
11645 * :aBcDeF.casecmp?(:abcde) # => false
11646 * :aBcDeF.casecmp?(:abcdef) # => true
11647 * :aBcDeF.casecmp?(:abcdefg) # => false
11648 * :abcdef.casecmp?(:ABCDEF) # => true
11649 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11651 * Returns +nil+ if the two symbols have incompatible encodings,
11652 * or if +other_symbol+ is not a symbol:
11654 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11655 * other_sym = :"\u{c4 d6 dc}"
11656 * sym.casecmp?(other_sym) # => nil
11657 * :foo.casecmp?(2) # => nil
11659 * See {Case Mapping}[doc/case_mapping_rdoc.html].
11661 * Related: Symbol#casecmp.
11665 static VALUE
11666 sym_casecmp_p(VALUE sym, VALUE other)
11668 if (!SYMBOL_P(other)) {
11669 return Qnil;
11671 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11675 * call-seq:
11676 * sym =~ obj -> integer or nil
11678 * Returns <code>sym.to_s =~ obj</code>.
11681 static VALUE
11682 sym_match(VALUE sym, VALUE other)
11684 return rb_str_match(rb_sym2str(sym), other);
11688 * call-seq:
11689 * sym.match(pattern) -> matchdata or nil
11690 * sym.match(pattern, pos) -> matchdata or nil
11692 * Returns <code>sym.to_s.match</code>.
11695 static VALUE
11696 sym_match_m(int argc, VALUE *argv, VALUE sym)
11698 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11702 * call-seq:
11703 * sym.match?(pattern) -> true or false
11704 * sym.match?(pattern, pos) -> true or false
11706 * Returns <code>sym.to_s.match?</code>.
11709 static VALUE
11710 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11712 return rb_str_match_m_p(argc, argv, sym);
11716 * call-seq:
11717 * sym[idx] -> char
11718 * sym[b, n] -> string
11719 * sym.slice(idx) -> char
11720 * sym.slice(b, n) -> string
11722 * Returns <code>sym.to_s[]</code>.
11725 static VALUE
11726 sym_aref(int argc, VALUE *argv, VALUE sym)
11728 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11732 * call-seq:
11733 * sym.length -> integer
11734 * sym.size -> integer
11736 * Same as <code>sym.to_s.length</code>.
11739 static VALUE
11740 sym_length(VALUE sym)
11742 return rb_str_length(rb_sym2str(sym));
11746 * call-seq:
11747 * sym.empty? -> true or false
11749 * Returns whether _sym_ is :"" or not.
11752 static VALUE
11753 sym_empty(VALUE sym)
11755 return rb_str_empty(rb_sym2str(sym));
11759 * call-seq:
11760 * upcase(*options) -> symbol
11762 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11764 * See String#upcase.
11768 static VALUE
11769 sym_upcase(int argc, VALUE *argv, VALUE sym)
11771 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11775 * call-seq:
11776 * downcase(*options) -> symbol
11778 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11780 * See String#downcase.
11782 * Related: Symbol#upcase.
11786 static VALUE
11787 sym_downcase(int argc, VALUE *argv, VALUE sym)
11789 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11793 * call-seq:
11794 * capitalize(*options) -> symbol
11796 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11798 * See String#capitalize.
11802 static VALUE
11803 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11805 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11809 * call-seq:
11810 * swapcase(*options) -> symbol
11812 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11814 * See String#swapcase.
11818 static VALUE
11819 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11821 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11825 * call-seq:
11826 * sym.start_with?([prefixes]+) -> true or false
11828 * Returns true if +sym+ starts with one of the +prefixes+ given.
11829 * Each of the +prefixes+ should be a String or a Regexp.
11831 * :hello.start_with?("hell") #=> true
11832 * :hello.start_with?(/H/i) #=> true
11834 * # returns true if one of the prefixes matches.
11835 * :hello.start_with?("heaven", "hell") #=> true
11836 * :hello.start_with?("heaven", "paradise") #=> false
11839 static VALUE
11840 sym_start_with(int argc, VALUE *argv, VALUE sym)
11842 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11846 * call-seq:
11847 * sym.end_with?([suffixes]+) -> true or false
11849 * Returns true if +sym+ ends with one of the +suffixes+ given.
11851 * :hello.end_with?("ello") #=> true
11853 * # returns true if one of the +suffixes+ matches.
11854 * :hello.end_with?("heaven", "ello") #=> true
11855 * :hello.end_with?("heaven", "paradise") #=> false
11858 static VALUE
11859 sym_end_with(int argc, VALUE *argv, VALUE sym)
11861 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11865 * call-seq:
11866 * sym.encoding -> encoding
11868 * Returns the Encoding object that represents the encoding of _sym_.
11871 static VALUE
11872 sym_encoding(VALUE sym)
11874 return rb_obj_encoding(rb_sym2str(sym));
11877 static VALUE
11878 string_for_symbol(VALUE name)
11880 if (!RB_TYPE_P(name, T_STRING)) {
11881 VALUE tmp = rb_check_string_type(name);
11882 if (NIL_P(tmp)) {
11883 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11884 name);
11886 name = tmp;
11888 return name;
11892 rb_to_id(VALUE name)
11894 if (SYMBOL_P(name)) {
11895 return SYM2ID(name);
11897 name = string_for_symbol(name);
11898 return rb_intern_str(name);
11901 VALUE
11902 rb_to_symbol(VALUE name)
11904 if (SYMBOL_P(name)) {
11905 return name;
11907 name = string_for_symbol(name);
11908 return rb_str_intern(name);
11912 * call-seq:
11913 * Symbol.all_symbols => array
11915 * Returns an array of all the symbols currently in Ruby's symbol
11916 * table.
11918 * Symbol.all_symbols.size #=> 903
11919 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11920 * :chown, :EOFError, :$;, :String,
11921 * :LOCK_SH, :"setuid?", :$<,
11922 * :default_proc, :compact, :extend,
11923 * :Tms, :getwd, :$=, :ThreadGroup,
11924 * :wait2, :$>]
11927 static VALUE
11928 sym_all_symbols(VALUE _)
11930 return rb_sym_all_symbols();
11933 VALUE
11934 rb_str_to_interned_str(VALUE str)
11936 return rb_fstring(str);
11939 VALUE
11940 rb_interned_str(const char *ptr, long len)
11942 struct RString fake_str;
11943 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11946 VALUE
11947 rb_interned_str_cstr(const char *ptr)
11949 return rb_interned_str(ptr, strlen(ptr));
11952 VALUE
11953 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11955 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11956 rb_enc_autoload(enc);
11959 struct RString fake_str;
11960 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11963 VALUE
11964 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
11966 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11970 * A \String object has an arbitrary sequence of bytes,
11971 * typically representing text or binary data.
11972 * A \String object may be created using String::new or as literals.
11974 * String objects differ from Symbol objects in that Symbol objects are
11975 * designed to be used as identifiers, instead of text or data.
11977 * You can create a \String object explicitly with:
11979 * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11980 * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11982 * You can convert certain objects to Strings with:
11984 * - \Method {String}[Kernel.html#method-i-String].
11986 * Some \String methods modify +self+.
11987 * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11988 * and returns +self+;
11989 * often a similarly named method (without the <tt>!</tt>)
11990 * returns a new string.
11992 * In general, if there exist both bang and non-bang versions of a method,
11993 * the bang version mutates +self+ and the non-bang version does not.
11994 * However, a method without a bang can also mutate, such as String#replace.
11996 * == Substitution Methods
11998 * These methods perform substitutions:
12000 * - String#sub: One substitution (or none); returns a new string.
12001 * - String#sub!: One substitution (or none); returns +self+.
12002 * - String#gsub: Zero or more substitutions; returns a new string.
12003 * - String#gsub!: Zero or more substitutions; returns +self+.
12005 * Each of these methods takes:
12007 * - A first argument, +pattern+ (string or regexp),
12008 * that specifies the substring(s) to be replaced.
12010 * - Either of these:
12012 * - A second argument, +replacement+ (string or hash),
12013 * that determines the replacing string.
12014 * - A block that will determine the replacing string.
12016 * The examples in this section mostly use methods String#sub and String#gsub;
12017 * the principles illustrated apply to all four substitution methods.
12019 * <b>Argument +pattern+</b>
12021 * Argument +pattern+ is commonly a regular expression:
12023 * s = 'hello'
12024 * s.sub(/[aeiou]/, '*') # => "h*llo"
12025 * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12026 * s.gsub(/[aeiou]/, '') # => "hll"
12027 * s.sub(/ell/, 'al') # => "halo"
12028 * s.gsub(/xyzzy/, '*') # => "hello"
12029 * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12031 * When +pattern+ is a string, all its characters are treated
12032 * as ordinary characters (not as regexp special characters):
12034 * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12036 * <b>\String +replacement+</b>
12038 * If +replacement+ is a string, that string will determine
12039 * the replacing string that is to be substituted for the matched text.
12041 * Each of the examples above uses a simple string as the replacing string.
12043 * \String +replacement+ may contain back-references to the pattern's captures:
12045 * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12046 * - <tt>\k<name></tt> refers to the named capture +name+.
12048 * See rdoc-ref:regexp.rdoc for details.
12050 * Note that within the string +replacement+, a character combination
12051 * such as <tt>$&</tt> is treated as ordinary text, and not as
12052 * a special match variable.
12053 * However, you may refer to some special match variables using these
12054 * combinations:
12056 * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12057 * which contains the complete matched text.
12058 * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12059 * which contains string after match.
12060 * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12061 * which contains string before match.
12062 * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12063 * which contains last capture group.
12065 * See rdoc-ref:regexp.rdoc for details.
12067 * Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12069 * Note also that a string literal consumes backslashes.
12070 * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12072 * A back-reference is typically preceded by an additional backslash.
12073 * For example, if you want to write a back-reference <tt>\&</tt> in
12074 * +replacement+ with a double-quoted string literal, you need to write
12075 * <tt>"..\\\\&.."</tt>.
12077 * If you want to write a non-back-reference string <tt>\&</tt> in
12078 * +replacement+, you need first to escape the backslash to prevent
12079 * this method from interpreting it as a back-reference, and then you
12080 * need to escape the backslashes again to prevent a string literal from
12081 * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12083 * You may want to use the block form to avoid a lot of backslashes.
12085 * <b>\Hash +replacement+</b>
12087 * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12088 * the replacing string is the value for that key:
12090 * h = {'foo' => 'bar', 'baz' => 'bat'}
12091 * 'food'.sub('foo', h) # => "bard"
12093 * Note that a symbol key does not match:
12095 * h = {foo: 'bar', baz: 'bat'}
12096 * 'food'.sub('foo', h) # => "d"
12098 * <b>Block</b>
12100 * In the block form, the current match string is passed to the block;
12101 * the block's return value becomes the replacing string:
12103 * s = '@'
12104 * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12106 * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12107 * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12110 * == What's Here
12112 * First, what's elsewhere. \Class \String:
12114 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12115 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12117 * Here, class \String provides methods that are useful for:
12119 * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12120 * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12121 * - {Querying}[#class-String-label-Methods+for+Querying]
12122 * - {Comparing}[#class-String-label-Methods+for+Comparing]
12123 * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12124 * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12125 * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12126 * - {Iterating}[#class-String-label-Methods+for+Iterating]
12128 * === Methods for Creating a \String
12130 * - ::new:: Returns a new string.
12131 * - ::try_convert:: Returns a new string created from a given object.
12133 * === Methods for a Frozen/Unfrozen String
12135 * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12136 * +self+, if not frozen; +self.dup+ otherwise.
12137 * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12138 * +self+, if already frozen; +self.freeze+ otherwise.
12139 * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12141 * === Methods for Querying
12143 * _Counts_
12145 * - #length, #size:: Returns the count of characters (not bytes).
12146 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12147 * - #bytesize:: Returns the count of bytes.
12148 * - #count:: Returns the count of substrings matching given strings.
12150 * _Substrings_
12152 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12153 * returns +nil+ if no match is found.
12154 * - #index:: Returns the index of the _first_ occurrence of a given substring;
12155 * returns +nil+ if none found.
12156 * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12157 * returns +nil+ if none found.
12158 * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12159 * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12160 * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12161 * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12162 * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12164 * _Encodings_
12166 * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12167 * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12168 * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12169 * for its encoding.
12170 * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12172 * _Other_
12174 * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12175 * - #hash:: Returns the integer hash code.
12177 * === Methods for Comparing
12179 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12180 * - #eql?:: Returns +true+ if the content is the same as the given other string.
12181 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given other string is smaller than, equal to, or larger than +self+.
12182 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
12183 * other string is smaller than, equal to, or larger than +self+.
12184 * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12185 * +false+ otherwise.
12187 * === Methods for Modifying a \String
12189 * Each of these methods modifies +self+.
12191 * _Insertion_
12193 * - #insert:: Returns +self+ with a given string inserted at a given offset.
12194 * - #<<:: Returns +self+ concatenated with a given string or integer.
12196 * _Substitution_
12198 * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12199 * returns +self+ if any changes, +nil+ otherwise.
12200 * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12201 * returns +self+ if any changes, +nil+ otherwise.
12202 * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12203 * - #replace:: Returns +self+ with its entire content replaced by a given string.
12204 * - #reverse!:: Returns +self+ with its characters in reverse order.
12205 * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12206 * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12207 * returns +self+ if any changes, +nil+ otherwise.
12208 * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12209 * removing duplicates from the substrings that were modified;
12210 * returns +self+ if any changes, +nil+ otherwise.
12212 * _Casing_
12214 * - #capitalize!:: Upcases the initial character and downcases all others;
12215 * returns +self+ if any changes, +nil+ otherwise.
12216 * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12217 * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12218 * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12219 * returns +self+ if any changes, +nil+ otherwise.
12221 * _Encoding_
12223 * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12224 * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12225 * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12226 * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12228 * _Deletion_
12230 * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12231 * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12232 * - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12233 * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12234 * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12235 * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12236 * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12237 * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12238 * - #chop!:: Removes trailing whitespace if found, otherwise removes the last character;
12239 * returns +self+ if any changes, +nil+ otherwise.
12241 * === Methods for Converting to New \String
12243 * Each of these methods returns a new \String based on +self+,
12244 * often just a modified copy of +self+.
12246 * _Extension_
12248 * - #*:: Returns the concatenation of multiple copies of +self+.
12249 * - #+:: Returns the concatenation of +self+ and a given other string.
12250 * - #center:: Returns a copy of +self+ centered between pad substring.
12251 * - #concat:: Returns the concatenation of +self+ with given other strings.
12252 * - #prepend:: Returns the concatenation of a given other string with +self+.
12253 * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12254 * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12256 * _Encoding_
12258 * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12259 * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12260 * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12261 * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12263 * _Substitution_
12265 * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by \xHH notation
12266 * and all special characters escaped.
12267 * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12268 * and all escaped characters unescaped.
12269 * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12270 * replaced with a given replacement string.
12271 * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12272 * replaced with a given replacement string.
12273 * - #succ, #next:: Returns the string that is the successor to +self+.
12274 * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12275 * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12276 * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12277 * removing duplicates from the substrings that were modified.
12278 * - #%:: Returns the string resulting from formatting a given object into +self+.
12280 * _Casing_
12282 * - #capitalize:: Returns a copy of +self+ with the first character upcased
12283 * and all other characters downcased.
12284 * - #downcase:: Returns a copy of +self+ with all characters downcased.
12285 * - #upcase:: Returns a copy of +self+ with all characters upcased.
12286 * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12287 * and all downcase characters upcased.
12289 * _Deletion_
12291 * - #delete:: Returns a copy of +self+ with characters removed.
12292 * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12293 * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12294 * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12295 * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12296 * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12297 * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12298 * - #chop:: Returns a copy of +self+ with trailing whitespace or the last character removed.
12299 * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12300 * - #[], #slice:: Returns a substring determined by a given index, start/length, range, or string.
12301 * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12302 * - #chr:: Returns the first character.
12304 * _Duplication_
12306 * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12307 * otherwise, returns +self+.
12309 * === Methods for Converting to Non-\String
12311 * Each of these methods converts the contents of +self+ to a non-\String.
12313 * <em>Characters, Bytes, and Clusters</em>
12315 * - #bytes:: Returns an array of the bytes in +self+.
12316 * - #chars:: Returns an array of the characters in +self+.
12317 * - #codepoints:: Returns an array of the integer ordinals in +self+.
12318 * - #getbyte:: Returns an integer byte as determined by a given index.
12319 * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12321 * _Splitting_
12323 * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12324 * - #partition:: Returns a 3-element array determined by the first substring that matches
12325 * a given substring or regexp.
12326 * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12327 * a given substring or regexp.
12328 * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12329 * or, if a block given, passes those substrings to the block.
12331 * _Matching_
12333 * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12334 * if a block given, passes each matching substring to the block.
12335 * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12336 * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12338 * _Numerics_
12340 * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12341 * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12342 * - #ord:: Returns the integer ordinal of the first character in +self+.
12343 * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12344 * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12346 * <em>Strings and Symbols</em>
12348 * - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12349 * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12351 * === Methods for Iterating
12353 * - #each_byte:: Calls the given block with each successive byte in +self+.
12354 * - #each_char:: Calls the given block with each successive character in +self+.
12355 * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12356 * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12357 * - #each_line:: Calls the given block with each successive line in +self+,
12358 * as determined by a given record separator.
12359 * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12362 void
12363 Init_String(void)
12365 rb_cString = rb_define_class("String", rb_cObject);
12366 assert(rb_vm_fstring_table());
12367 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12368 rb_include_module(rb_cString, rb_mComparable);
12369 rb_define_alloc_func(rb_cString, empty_str_alloc);
12370 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12371 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12372 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12373 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12374 rb_define_method(rb_cString, "==", rb_str_equal, 1);
12375 rb_define_method(rb_cString, "===", rb_str_equal, 1);
12376 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12377 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12378 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12379 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12380 rb_define_method(rb_cString, "+", rb_str_plus, 1);
12381 rb_define_method(rb_cString, "*", rb_str_times, 1);
12382 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12383 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12384 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12385 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12386 rb_define_method(rb_cString, "length", rb_str_length, 0);
12387 rb_define_method(rb_cString, "size", rb_str_length, 0);
12388 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12389 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12390 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12391 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12392 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12393 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12394 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12395 rb_define_method(rb_cString, "next", rb_str_succ, 0);
12396 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12397 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12398 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12399 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12400 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12401 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12402 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12403 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12404 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12405 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12406 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12407 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12408 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12409 rb_define_method(rb_cString, "+@", str_uplus, 0);
12410 rb_define_method(rb_cString, "-@", str_uminus, 0);
12412 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12413 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12414 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12415 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12416 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12417 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12418 rb_define_method(rb_cString, "undump", str_undump, 0);
12420 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12421 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12422 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12423 sym_fold = ID2SYM(rb_intern_const("fold"));
12425 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12426 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12427 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12428 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12430 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12431 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12432 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12433 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12435 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12436 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12437 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12438 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12439 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12440 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12441 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12442 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12443 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12444 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12445 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12446 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12447 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12448 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12449 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12450 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12451 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12453 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12454 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12455 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12457 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12459 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12460 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12461 rb_define_method(rb_cString, "center", rb_str_center, -1);
12463 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12464 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12465 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12466 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12467 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12468 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12469 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12470 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12471 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12473 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12474 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12475 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12476 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12477 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12478 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12479 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12480 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12481 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12483 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12484 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12485 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12486 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12487 rb_define_method(rb_cString, "count", rb_str_count, -1);
12489 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12490 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12491 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12492 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12494 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12495 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12496 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12497 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12498 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12500 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12502 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12503 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12505 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12506 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12508 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12509 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12510 rb_define_method(rb_cString, "b", rb_str_b, 0);
12511 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12512 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12514 /* define UnicodeNormalize module here so that we don't have to look it up */
12515 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12516 id_normalize = rb_intern_const("normalize");
12517 id_normalized_p = rb_intern_const("normalized?");
12519 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12520 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12521 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12523 rb_fs = Qnil;
12524 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12525 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12526 rb_gc_register_address(&rb_fs);
12528 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12529 rb_include_module(rb_cSymbol, rb_mComparable);
12530 rb_undef_alloc_func(rb_cSymbol);
12531 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12532 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12534 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12535 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12536 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12537 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12538 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12539 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0);
12540 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12541 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12542 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12543 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12544 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12546 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12547 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12548 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12549 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12551 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12552 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12553 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12554 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12555 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12556 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12557 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12559 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12560 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12561 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12562 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12564 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12565 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12567 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);