[ruby/win32ole] Undefine allocator of WIN32OLE_VARIABLE to get rid of warning
[ruby-80x24.org.git] / string.c
blobcdb8658d7f4aca632d9197d54884fb3b4081a890
1 /**********************************************************************
3 string.c -
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "gc.h"
27 #include "id.h"
28 #include "internal.h"
29 #include "internal/array.h"
30 #include "internal/compar.h"
31 #include "internal/compilers.h"
32 #include "internal/encoding.h"
33 #include "internal/error.h"
34 #include "internal/gc.h"
35 #include "internal/numeric.h"
36 #include "internal/object.h"
37 #include "internal/proc.h"
38 #include "internal/re.h"
39 #include "internal/sanitizers.h"
40 #include "internal/string.h"
41 #include "internal/transcode.h"
42 #include "probes.h"
43 #include "ruby/encoding.h"
44 #include "ruby/re.h"
45 #include "ruby/util.h"
46 #include "ruby_assert.h"
47 #include "vm_sync.h"
49 #if defined HAVE_CRYPT_R
50 # if defined HAVE_CRYPT_H
51 # include <crypt.h>
52 # endif
53 #elif !defined HAVE_CRYPT
54 # include "missing/crypt.h"
55 # define HAVE_CRYPT_R 1
56 #endif
/* Shorthand for the `no`-th entry of the beg[] / end[] arrays of a variable
 * named `regs`, which must be in scope wherever these macros are expanded. */
58 #define BEG(no) (regs->beg[(no)])
59 #define END(no) (regs->end[(no)])
61 #undef rb_str_new
62 #undef rb_usascii_str_new
63 #undef rb_utf8_str_new
64 #undef rb_enc_str_new
65 #undef rb_str_new_cstr
66 #undef rb_usascii_str_new_cstr
67 #undef rb_utf8_str_new_cstr
68 #undef rb_enc_str_new_cstr
69 #undef rb_external_str_new_cstr
70 #undef rb_locale_str_new_cstr
71 #undef rb_str_dup_frozen
72 #undef rb_str_buf_new_cstr
73 #undef rb_str_buf_cat
74 #undef rb_str_buf_cat2
75 #undef rb_str_cat2
76 #undef rb_str_cat_cstr
77 #undef rb_fstring_cstr
79 VALUE rb_cString;
80 VALUE rb_cSymbol;
/* FLAGS of RString
 *
 * 1:     RSTRING_NOEMBED
 * 2:     STR_SHARED (== ELTS_SHARED)
 * 2-6:   RSTRING_EMBED_LEN (5 bits == 32)
 * 5:     STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
 *                         other strings that rely on this string's buffer)
 * 6:     STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
 *                      early, specific to rb_str_tmp_frozen_{acquire,release})
 * 7:     STR_TMPLOCK (set when a pointer to the buffer is passed to a syscall
 *                     such as read(2). Any modification and realloc is prohibited)
 *
 * 8-9:   ENC_CODERANGE (2 bits)
 * 10-16: ENCODING (7 bits == 128)
 * 17:    RSTRING_FSTR
 * 18:    STR_NOFREE (do not free this string's buffer when a String is freed.
 *                    used for a string object based on a C string literal)
 * 19:    STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
 *                     object header is temporarily allocated on the C stack)
 */
/* NOTE(review): presumably an upper bound on the byte length of one
 * character in any supported encoding -- confirm against its users. */
103 #define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits, mapped onto the generic FL_USERn bits.
 * Their meaning is listed in the "FLAGS of RString" comment. */
104 #define STR_SHARED_ROOT FL_USER5
105 #define STR_BORROWED FL_USER6
106 #define STR_TMPLOCK FL_USER7
107 #define STR_NOFREE FL_USER18
108 #define STR_FAKESTR FL_USER19
/* Flag `str` as not embedded: set STR_NOEMBED, then either clear the
 * STR_SHARED / STR_SHARED_ROOT / STR_BORROWED bits (when USE_RVARGC is
 * non-zero) or reset the embedded length to 0 (otherwise). */
110 #define STR_SET_NOEMBED(str) do {\
111 FL_SET((str), STR_NOEMBED);\
112 if (USE_RVARGC) {\
113 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
115 else {\
116 STR_SET_EMBED_LEN((str), 0);\
118 } while (0)
/* Flag `str` as embedded by clearing both STR_NOEMBED and STR_NOFREE. */
119 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* Record `n` as the length of an embedded string.
 * With USE_RVARGC the length lives in as.embed.len and must be strictly
 * smaller than the embed capacity (asserted); otherwise it is packed into
 * the RSTRING_EMBED_LEN bit field of the object flags. */
120 #if USE_RVARGC
121 # define STR_SET_EMBED_LEN(str, n) do { \
122 assert(str_embed_capa(str) > (n));\
123 RSTRING(str)->as.embed.len = (n);\
124 } while (0)
125 #else
126 # define STR_SET_EMBED_LEN(str, n) do { \
127 long tmp_n = (n);\
128 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
129 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
130 } while (0)
131 #endif
/* Store `n` as the length of `str`, in the embedded or the heap
 * representation depending on STR_EMBED_P(str). */
133 #define STR_SET_LEN(str, n) do { \
134 if (STR_EMBED_P(str)) {\
135 STR_SET_EMBED_LEN((str), (n));\
137 else {\
138 RSTRING(str)->as.heap.len = (n);\
140 } while (0)
/* Decrement the length of `str` by one, for either representation. */
142 #define STR_DEC_LEN(str) do {\
143 if (STR_EMBED_P(str)) {\
144 long n = RSTRING_LEN(str);\
145 n--;\
146 STR_SET_EMBED_LEN((str), n);\
148 else {\
149 RSTRING(str)->as.heap.len--;\
151 } while (0)
/* Terminator width of `str`: the minimum character length of its encoding. */
153 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* Write a terminator of `termlen` zero bytes at `ptr`.  The first byte is
 * always stored directly; memset() is only used for wider terminators. */
154 #define TERM_FILL(ptr, termlen) do {\
155 char *const term_fill_ptr = (ptr);\
156 const int term_fill_len = (termlen);\
157 *term_fill_ptr = '\0';\
158 if (UNLIKELY(term_fill_len > 1))\
159 memset(term_fill_ptr, 0, term_fill_len);\
160 } while (0)
/*
 * Resize the buffer of `str` to hold `capacity` bytes plus a terminator
 * whose width is derived from the string's encoding.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embed area is large enough;
 *   otherwise the contents are copied into a freshly allocated heap buffer
 *   and the string is switched to the heap representation.
 * - Heap string: must not be shared (asserted); the buffer is reallocated.
 *
 * Every macro argument is parenthesized at each use so that expression
 * arguments (e.g. `len + 1`) expand with the intended precedence.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
/* Make `str` share the buffer of `shared_str`, unless `str` is a fake
 * string (STR_FAKESTR), in which case nothing is done.
 * Asserts that str's pointer lies within shared_str's buffer, stores the
 * reference through RB_OBJ_WRITE, sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on `shared_str`, and additionally sets STR_BORROWED on
 * `shared_str` when its class is 0. */
186 #define STR_SET_SHARED(str, shared_str) do { \
187 if (!FL_TEST(str, STR_FAKESTR)) { \
188 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
189 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
190 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
191 FL_SET((str), STR_SHARED); \
192 FL_SET((shared_str), STR_SHARED_ROOT); \
193 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
194 FL_SET_RAW((shared_str), STR_BORROWED); \
196 } while (0)
/* Heap buffer pointer of `str` (only meaningful for non-embedded strings). */
198 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: recorded capacity plus terminator width. */
199 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
200 /* TODO: include the terminator size in capa. */
/* Encoding of `str` as resolved by get_encoding(). */
202 #define STR_ENC_GET(str) get_encoding(str)
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING
 * disabled (the default) only a substring that reaches `end` qualifies;
 * when enabled, every substring does. */
204 #if !defined SHARABLE_MIDDLE_SUBSTRING
205 # define SHARABLE_MIDDLE_SUBSTRING 0
206 #endif
207 #if !SHARABLE_MIDDLE_SUBSTRING
208 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
209 #else
210 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
211 #endif
214 static inline long
215 str_embed_capa(VALUE str)
217 #if USE_RVARGC
218 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
219 #else
220 return RSTRING_EMBED_LEN_MAX + 1;
221 #endif
224 static inline size_t
225 str_embed_size(long capa)
227 return offsetof(struct RString, as.embed.ary) + capa;
230 static inline bool
231 STR_EMBEDDABLE_P(long len, long termlen)
233 #if USE_RVARGC
234 return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
235 #else
236 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
237 #endif
240 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
241 static VALUE str_new_frozen(VALUE klass, VALUE orig);
242 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
243 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
244 static VALUE str_new(VALUE klass, const char *ptr, long len);
245 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
246 static inline void str_modifiable(VALUE str);
247 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
249 static inline void
250 str_make_independent(VALUE str)
252 long len = RSTRING_LEN(str);
253 int termlen = TERM_LEN(str);
254 str_make_independent_expand((str), len, 0L, termlen);
257 static inline int str_dependent_p(VALUE str);
259 void
260 rb_str_make_independent(VALUE str)
262 if (str_dependent_p(str)) {
263 str_make_independent(str);
/*
 * Debugging aid: report on stderr that `func` is about to return a NULL
 * string pointer.
 *
 * The message ends with a newline so it is not glued to whatever is
 * written to stderr next (typically the crash report).
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately. "
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
277 /* symbols for [up|down|swap]case/capitalize options */
278 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
280 static rb_encoding *
281 get_actual_encoding(const int encidx, VALUE str)
283 const unsigned char *q;
285 switch (encidx) {
286 case ENCINDEX_UTF_16:
287 if (RSTRING_LEN(str) < 2) break;
288 q = (const unsigned char *)RSTRING_PTR(str);
289 if (q[0] == 0xFE && q[1] == 0xFF) {
290 return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
292 if (q[0] == 0xFF && q[1] == 0xFE) {
293 return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
295 return rb_ascii8bit_encoding();
296 case ENCINDEX_UTF_32:
297 if (RSTRING_LEN(str) < 4) break;
298 q = (const unsigned char *)RSTRING_PTR(str);
299 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
300 return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
302 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
303 return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
305 return rb_ascii8bit_encoding();
307 return rb_enc_from_index(encidx);
310 static rb_encoding *
311 get_encoding(VALUE str)
313 return get_actual_encoding(ENCODING_GET(str), str);
316 static void
317 mustnot_broken(VALUE str)
319 if (is_broken_string(str)) {
320 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
324 static void
325 mustnot_wchar(VALUE str)
327 rb_encoding *enc = STR_ENC_GET(str);
328 if (rb_enc_mbminlen(enc) > 1) {
329 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
333 static int fstring_cmp(VALUE a, VALUE b);
335 static VALUE register_fstring(VALUE str, bool copy);
/* st_table hash type for the fstring table: entries are compared with
 * fstring_cmp() and hashed with rb_str_hash(). */
337 const struct st_hash_type rb_fstring_hash_type = {
338 fstring_cmp,
339 rb_str_hash,
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * rb_cString. */
342 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
/* State handed to fstr_update_callback() through st_update(). */
344 struct fstr_update_arg {
345 VALUE fstr; /* out: the registered string, or Qundef to request a retry */
346 bool copy; /* whether a fake string's bytes are copied rather than referenced */
349 static int
350 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
353 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
354 VALUE str = (VALUE)*key;
356 if (existing) {
357 /* because of lazy sweep, str may be unmarked already and swept
358 * at next time */
360 if (rb_objspace_garbage_object_p(str)) {
361 arg->fstr = Qundef;
362 return ST_DELETE;
365 arg->fstr = str;
366 return ST_STOP;
368 else {
369 if (FL_TEST_RAW(str, STR_FAKESTR)) {
370 if (arg->copy) {
371 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
372 rb_enc_copy(new_str, str);
373 str = new_str;
375 else {
376 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
377 RSTRING(str)->as.heap.len,
378 ENCODING_GET(str));
380 OBJ_FREEZE_RAW(str);
382 else {
383 if (!OBJ_FROZEN(str))
384 str = str_new_frozen(rb_cString, str);
385 if (STR_SHARED_P(str)) { /* str should not be shared */
386 /* shared substring */
387 str_make_independent(str);
388 assert(OBJ_FROZEN(str));
390 if (!BARE_STRING_P(str)) {
391 str = str_new_frozen(rb_cString, str);
394 RBASIC(str)->flags |= RSTRING_FSTR;
396 *key = *value = arg->fstr = str;
397 return ST_CONTINUE;
401 RUBY_FUNC_EXPORTED
402 VALUE
403 rb_fstring(VALUE str)
405 VALUE fstr;
406 int bare;
408 Check_Type(str, T_STRING);
410 if (FL_TEST(str, RSTRING_FSTR))
411 return str;
413 bare = BARE_STRING_P(str);
414 if (!bare) {
415 if (STR_EMBED_P(str)) {
416 OBJ_FREEZE_RAW(str);
417 return str;
419 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
420 assert(OBJ_FROZEN(str));
421 return str;
425 if (!OBJ_FROZEN(str))
426 rb_str_resize(str, RSTRING_LEN(str));
428 fstr = register_fstring(str, FALSE);
430 if (!bare) {
431 str_replace_shared_without_enc(str, fstr);
432 OBJ_FREEZE_RAW(str);
433 return str;
435 return fstr;
438 static VALUE
439 register_fstring(VALUE str, bool copy)
441 struct fstr_update_arg args;
442 args.copy = copy;
444 RB_VM_LOCK_ENTER();
446 st_table *frozen_strings = rb_vm_fstring_table();
447 do {
448 args.fstr = str;
449 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
450 } while (args.fstr == Qundef);
452 RB_VM_LOCK_LEAVE();
454 assert(OBJ_FROZEN(args.fstr));
455 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
456 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
457 assert(RBASIC_CLASS(args.fstr) == rb_cString);
458 return args.fstr;
461 static VALUE
462 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
464 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
465 /* SHARED to be allocated by the callback */
467 if (!name) {
468 RUBY_ASSERT_ALWAYS(len == 0);
469 name = "";
472 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
474 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
475 fake_str->as.heap.len = len;
476 fake_str->as.heap.ptr = (char *)name;
477 fake_str->as.heap.aux.capa = len;
478 return (VALUE)fake_str;
482 * set up a fake string which refers a static string literal.
484 VALUE
485 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
487 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
491 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
492 * shared string which refers a static string literal. `ptr` must
493 * point a constant string.
495 MJIT_FUNC_EXPORTED VALUE
496 rb_fstring_new(const char *ptr, long len)
498 struct RString fake_str;
499 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
502 VALUE
503 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
505 struct RString fake_str;
506 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
509 VALUE
510 rb_fstring_cstr(const char *ptr)
512 return rb_fstring_new(ptr, strlen(ptr));
515 static int
516 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
518 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
519 return ST_CONTINUE;
522 static int
523 fstring_cmp(VALUE a, VALUE b)
525 long alen, blen;
526 const char *aptr, *bptr;
527 RSTRING_GETMEM(a, aptr, alen);
528 RSTRING_GETMEM(b, bptr, blen);
529 return (alen != blen ||
530 ENCODING_GET(a) != ENCODING_GET(b) ||
531 memcmp(aptr, bptr, alen) != 0);
534 static inline int
535 single_byte_optimizable(VALUE str)
537 rb_encoding *enc;
539 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
540 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
541 return 1;
543 enc = STR_ENC_GET(str);
544 if (rb_enc_mbmaxlen(enc) == 1)
545 return 1;
547 /* Conservative. Possibly single byte.
548 * "\xa1" in Shift_JIS for example. */
549 return 0;
552 VALUE rb_fs;
/*
 * Find the first byte in [p, e) whose high bit (0x80) is set.
 * Returns a pointer to that byte, or NULL when every byte is below 0x80.
 *
 * The bulk of the range is examined one uintptr_t at a time against
 * NONASCII_MASK; the bytes before the first word boundary (when unaligned
 * word access is not enabled) and the bytes after the last whole word are
 * examined individually by the two fall-through switches.
 */
554 static inline const char *
555 search_nonascii(const char *p, const char *e)
557 const uintptr_t *s, *t;
/* NONASCII_MASK: 0x80 replicated into every byte of a uintptr_t. */
559 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
560 # if SIZEOF_UINTPTR_T == 8
561 # define NONASCII_MASK UINT64_C(0x8080808080808080)
562 # elif SIZEOF_UINTPTR_T == 4
563 # define NONASCII_MASK UINT32_C(0x80808080)
564 # else
565 # error "don't know what to do."
566 # endif
567 #else
568 # if SIZEOF_UINTPTR_T == 8
569 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
570 # elif SIZEOF_UINTPTR_T == 4
571 # define NONASCII_MASK 0x80808080UL /* or...? */
572 # else
573 # error "don't know what to do."
574 # endif
575 #endif
577 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
578 #if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, testing the skipped bytes; each
 * case deliberately falls through to the next one. */
579 if ((uintptr_t)p % SIZEOF_VOIDP) {
580 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
581 p += l;
582 switch (l) {
583 default: UNREACHABLE;
584 #if SIZEOF_VOIDP > 4
585 case 7: if (p[-7]&0x80) return p-7;
586 case 6: if (p[-6]&0x80) return p-6;
587 case 5: if (p[-5]&0x80) return p-5;
588 case 4: if (p[-4]&0x80) return p-4;
589 #endif
590 case 3: if (p[-3]&0x80) return p-3;
591 case 2: if (p[-2]&0x80) return p-2;
592 case 1: if (p[-1]&0x80) return p-1;
593 case 0: break;
596 #endif
597 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
598 #define aligned_ptr(value) \
599 __builtin_assume_aligned((value), sizeof(uintptr_t))
600 #else
601 #define aligned_ptr(value) (uintptr_t *)(value)
602 #endif
603 s = aligned_ptr(p);
604 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
605 #undef aligned_ptr
/* Word-at-a-time scan.  On a hit, the bit position reported by
 * nlz_intptr/ntz_intptr (presumably leading/trailing zero counts --
 * confirm) is divided by 8 to get the byte offset within the word. */
606 for (;s < t; s++) {
607 if (*s & NONASCII_MASK) {
608 #ifdef WORDS_BIGENDIAN
609 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
610 #else
611 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
612 #endif
615 p = (const char *)s;
/* Remaining tail (fewer bytes than a word): test byte by byte, again
 * relying on case fall-through. */
618 switch (e - p) {
619 default: UNREACHABLE;
620 #if SIZEOF_VOIDP > 4
621 case 7: if (e[-7]&0x80) return e-7;
622 case 6: if (e[-6]&0x80) return e-6;
623 case 5: if (e[-5]&0x80) return e-5;
624 case 4: if (e[-4]&0x80) return e-4;
625 #endif
626 case 3: if (e[-3]&0x80) return e-3;
627 case 2: if (e[-2]&0x80) return e-2;
628 case 1: if (e[-1]&0x80) return e-1;
629 case 0: return NULL;
633 static int
634 coderange_scan(const char *p, long len, rb_encoding *enc)
636 const char *e = p + len;
638 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
639 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
640 p = search_nonascii(p, e);
641 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
644 if (rb_enc_asciicompat(enc)) {
645 p = search_nonascii(p, e);
646 if (!p) return ENC_CODERANGE_7BIT;
647 for (;;) {
648 int ret = rb_enc_precise_mbclen(p, e, enc);
649 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
650 p += MBCLEN_CHARFOUND_LEN(ret);
651 if (p == e) break;
652 p = search_nonascii(p, e);
653 if (!p) break;
656 else {
657 while (p < e) {
658 int ret = rb_enc_precise_mbclen(p, e, enc);
659 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
660 p += MBCLEN_CHARFOUND_LEN(ret);
663 return ENC_CODERANGE_VALID;
666 long
667 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
669 const char *p = s;
671 if (*cr == ENC_CODERANGE_BROKEN)
672 return e - s;
674 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
675 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
676 if (*cr == ENC_CODERANGE_VALID) return e - s;
677 p = search_nonascii(p, e);
678 *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
679 return e - s;
681 else if (rb_enc_asciicompat(enc)) {
682 p = search_nonascii(p, e);
683 if (!p) {
684 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
685 return e - s;
687 for (;;) {
688 int ret = rb_enc_precise_mbclen(p, e, enc);
689 if (!MBCLEN_CHARFOUND_P(ret)) {
690 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
691 return p - s;
693 p += MBCLEN_CHARFOUND_LEN(ret);
694 if (p == e) break;
695 p = search_nonascii(p, e);
696 if (!p) break;
699 else {
700 while (p < e) {
701 int ret = rb_enc_precise_mbclen(p, e, enc);
702 if (!MBCLEN_CHARFOUND_P(ret)) {
703 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
704 return p - s;
706 p += MBCLEN_CHARFOUND_LEN(ret);
709 *cr = ENC_CODERANGE_VALID;
710 return e - s;
713 static inline void
714 str_enc_copy(VALUE str1, VALUE str2)
716 rb_enc_set_index(str1, ENCODING_GET(str2));
719 static void
720 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
722 /* this function is designed for copying encoding and coderange
723 * from src to new string "dest" which is made from the part of src.
725 str_enc_copy(dest, src);
726 if (RSTRING_LEN(dest) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
728 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
729 else
730 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
731 return;
733 switch (ENC_CODERANGE(src)) {
734 case ENC_CODERANGE_7BIT:
735 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
736 break;
737 case ENC_CODERANGE_VALID:
738 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
739 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
740 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
741 else
742 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
743 break;
744 default:
745 break;
749 static void
750 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
752 str_enc_copy(dest, src);
753 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
756 static int
757 enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
759 if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
760 rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
761 return ENC_CODERANGE_BROKEN;
763 else {
764 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
769 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
771 return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
775 rb_enc_str_coderange(VALUE str)
777 int cr = ENC_CODERANGE(str);
779 if (cr == ENC_CODERANGE_UNKNOWN) {
780 int encidx = ENCODING_GET(str);
781 rb_encoding *enc = rb_enc_from_index(encidx);
782 cr = enc_coderange_scan(str, enc, encidx);
783 ENC_CODERANGE_SET(str, cr);
785 return cr;
789 rb_enc_str_asciionly_p(VALUE str)
791 rb_encoding *enc = STR_ENC_GET(str);
793 if (!rb_enc_asciicompat(enc))
794 return FALSE;
795 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
796 return TRUE;
797 return FALSE;
800 static inline void
801 str_mod_check(VALUE s, const char *p, long len)
803 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
804 rb_raise(rb_eRuntimeError, "string modified");
808 static size_t
809 str_capacity(VALUE str, const int termlen)
811 if (STR_EMBED_P(str)) {
812 #if USE_RVARGC
813 return str_embed_capa(str) - termlen;
814 #else
815 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
816 #endif
818 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
819 return RSTRING(str)->as.heap.len;
821 else {
822 return RSTRING(str)->as.heap.aux.capa;
826 size_t
827 rb_str_capacity(VALUE str)
829 return str_capacity(str, TERM_LEN(str));
832 static inline void
833 must_not_null(const char *ptr)
835 if (!ptr) {
836 rb_raise(rb_eArgError, "NULL pointer given");
840 static inline VALUE
841 str_alloc(VALUE klass, size_t size)
843 assert(size > 0);
844 RVARGC_NEWOBJ_OF(str, struct RString, klass,
845 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
846 return (VALUE)str;
849 static inline VALUE
850 str_alloc_embed(VALUE klass, size_t capa)
852 size_t size = str_embed_size(capa);
853 assert(rb_gc_size_allocatable_p(size));
854 #if !USE_RVARGC
855 assert(size <= sizeof(struct RString));
856 #endif
857 return str_alloc(klass, size);
860 static inline VALUE
861 str_alloc_heap(VALUE klass)
863 return str_alloc(klass, sizeof(struct RString));
866 static inline VALUE
867 empty_str_alloc(VALUE klass)
869 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
870 VALUE str = str_alloc_embed(klass, 0);
871 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
872 return str;
875 static VALUE
876 str_new0(VALUE klass, const char *ptr, long len, int termlen)
878 VALUE str;
880 if (len < 0) {
881 rb_raise(rb_eArgError, "negative string size (or size too big)");
884 RUBY_DTRACE_CREATE_HOOK(STRING, len);
886 if (STR_EMBEDDABLE_P(len, termlen)) {
887 str = str_alloc_embed(klass, len + termlen);
888 if (len == 0) {
889 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
892 else {
893 str = str_alloc_heap(klass);
894 RSTRING(str)->as.heap.aux.capa = len;
895 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
896 * integer overflow. If we can STATIC_ASSERT that, the following
897 * mul_add_mul can be reverted to a simple ALLOC_N. */
898 RSTRING(str)->as.heap.ptr =
899 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
900 STR_SET_NOEMBED(str);
902 if (ptr) {
903 memcpy(RSTRING_PTR(str), ptr, len);
905 STR_SET_LEN(str, len);
906 TERM_FILL(RSTRING_PTR(str) + len, termlen);
907 return str;
910 static VALUE
911 str_new(VALUE klass, const char *ptr, long len)
913 return str_new0(klass, ptr, len, 1);
916 VALUE
917 rb_str_new(const char *ptr, long len)
919 return str_new(rb_cString, ptr, len);
922 VALUE
923 rb_usascii_str_new(const char *ptr, long len)
925 VALUE str = rb_str_new(ptr, len);
926 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
927 return str;
930 VALUE
931 rb_utf8_str_new(const char *ptr, long len)
933 VALUE str = str_new(rb_cString, ptr, len);
934 rb_enc_associate_index(str, rb_utf8_encindex());
935 return str;
938 VALUE
939 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
941 VALUE str;
943 if (!enc) return rb_str_new(ptr, len);
945 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
946 rb_enc_associate(str, enc);
947 return str;
950 VALUE
951 rb_str_new_cstr(const char *ptr)
953 must_not_null(ptr);
954 /* rb_str_new_cstr() can take pointer from non-malloc-generated
955 * memory regions, and that cannot be detected by the MSAN. Just
956 * trust the programmer that the argument passed here is a sane C
957 * string. */
958 __msan_unpoison_string(ptr);
959 return rb_str_new(ptr, strlen(ptr));
962 VALUE
963 rb_usascii_str_new_cstr(const char *ptr)
965 VALUE str = rb_str_new_cstr(ptr);
966 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
967 return str;
970 VALUE
971 rb_utf8_str_new_cstr(const char *ptr)
973 VALUE str = rb_str_new_cstr(ptr);
974 rb_enc_associate_index(str, rb_utf8_encindex());
975 return str;
978 VALUE
979 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
981 must_not_null(ptr);
982 if (rb_enc_mbminlen(enc) != 1) {
983 rb_raise(rb_eArgError, "wchar encoding given");
985 return rb_enc_str_new(ptr, strlen(ptr), enc);
988 static VALUE
989 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
991 VALUE str;
993 if (len < 0) {
994 rb_raise(rb_eArgError, "negative string size (or size too big)");
997 if (!ptr) {
998 rb_encoding *enc = rb_enc_get_from_index(encindex);
999 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1001 else {
1002 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1003 str = str_alloc_heap(klass);
1004 RSTRING(str)->as.heap.len = len;
1005 RSTRING(str)->as.heap.ptr = (char *)ptr;
1006 RSTRING(str)->as.heap.aux.capa = len;
1007 STR_SET_NOEMBED(str);
1008 RBASIC(str)->flags |= STR_NOFREE;
1010 rb_enc_associate_index(str, encindex);
1011 return str;
1014 VALUE
1015 rb_str_new_static(const char *ptr, long len)
1017 return str_new_static(rb_cString, ptr, len, 0);
1020 VALUE
1021 rb_usascii_str_new_static(const char *ptr, long len)
1023 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1026 VALUE
1027 rb_utf8_str_new_static(const char *ptr, long len)
1029 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1032 VALUE
1033 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1035 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1038 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1039 rb_encoding *from, rb_encoding *to,
1040 int ecflags, VALUE ecopts);
1042 static inline bool
1043 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1045 int encidx = rb_enc_to_index(enc);
1046 if (rb_enc_get_index(str) == encidx)
1047 return is_ascii_string(str);
1048 return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1051 VALUE
1052 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1054 long len;
1055 const char *ptr;
1056 VALUE newstr;
1058 if (!to) return str;
1059 if (!from) from = rb_enc_get(str);
1060 if (from == to) return str;
1061 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1062 to == rb_ascii8bit_encoding()) {
1063 if (STR_ENC_GET(str) != to) {
1064 str = rb_str_dup(str);
1065 rb_enc_associate(str, to);
1067 return str;
1070 RSTRING_GETMEM(str, ptr, len);
1071 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1072 from, to, ecflags, ecopts);
1073 if (NIL_P(newstr)) {
1074 /* some error, return original */
1075 return str;
1077 return newstr;
1080 VALUE
1081 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1082 rb_encoding *from, int ecflags, VALUE ecopts)
1084 long olen;
1086 olen = RSTRING_LEN(newstr);
1087 if (ofs < -olen || olen < ofs)
1088 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1089 if (ofs < 0) ofs += olen;
1090 if (!from) {
1091 STR_SET_LEN(newstr, ofs);
1092 return rb_str_cat(newstr, ptr, len);
1095 rb_str_modify(newstr);
1096 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1097 rb_enc_get(newstr),
1098 ecflags, ecopts);
1101 VALUE
1102 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1104 STR_SET_LEN(str, 0);
1105 rb_enc_associate(str, enc);
1106 rb_str_cat(str, ptr, len);
1107 return str;
/*
 * Convert `len` bytes at `ptr` from encoding `from` to encoding `to`,
 * writing the output into `newstr` starting at byte offset `ofs`.
 *
 * Returns `newstr` (with its length set to the end of the converted
 * output and `to` associated) when the converter reports econv_finished.
 * Returns Qnil when the converter cannot be opened or the conversion ends
 * with any other result.
 */
1110 static VALUE
1111 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1112 rb_encoding *from, rb_encoding *to,
1113 int ecflags, VALUE ecopts)
1115 rb_econv_t *ec;
1116 rb_econv_result_t ret;
1117 long olen;
1118 VALUE econv_wrapper;
1119 const unsigned char *start, *sp;
1120 unsigned char *dest, *dp;
1121 size_t converted_output = (size_t)ofs;
1123 olen = rb_str_capacity(newstr);
/* The converter is parked in a class-less Encoding::Converter object while
 * it is in use.  NOTE(review): presumably so that it gets released by the
 * GC if an exception unwinds past this function -- confirm. */
1125 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1126 RBASIC_CLEAR_CLASS(econv_wrapper);
1127 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1128 if (!ec) return Qnil;
1129 DATA_PTR(econv_wrapper) = ec;
1131 sp = (unsigned char*)ptr;
1132 start = sp;
/* Each round re-reads newstr's buffer pointer (it may have been resized),
 * resumes output at converted_output, and converts until either the input
 * is consumed or the destination fills up. */
1133 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1134 (dp = dest + converted_output),
1135 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1136 ret == econv_destination_buffer_full) {
1137 /* destination buffer short */
1138 size_t converted_input = sp - start;
1139 size_t rest = len - converted_input;
1140 converted_output = dp - dest;
1141 rb_str_set_len(newstr, converted_output);
/* Estimate the output still needed from the output/input ratio seen so
 * far; fall back to doubling when no ratio is available or the product
 * could overflow. */
1142 if (converted_input && converted_output &&
1143 rest < (LONG_MAX / converted_output)) {
1144 rest = (rest * converted_output) / converted_input;
1146 else {
1147 rest = olen;
/* Always grow by at least two bytes. */
1149 olen += rest < 2 ? 2 : rest;
1150 rb_str_resize(newstr, olen);
/* Detach the converter from the wrapper before closing it explicitly. */
1152 DATA_PTR(econv_wrapper) = 0;
1153 rb_econv_close(ec);
1154 switch (ret) {
1155 case econv_finished:
1156 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1157 rb_str_set_len(newstr, len);
1158 rb_enc_associate(newstr, to);
1159 return newstr;
1161 default:
1162 return Qnil;
1166 VALUE
1167 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1169 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1172 VALUE
1173 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1175 rb_encoding *ienc;
1176 VALUE str;
1177 const int eidx = rb_enc_to_index(eenc);
1179 if (!ptr) {
1180 return rb_enc_str_new(ptr, len, eenc);
1183 /* ASCII-8BIT case, no conversion */
1184 if ((eidx == rb_ascii8bit_encindex()) ||
1185 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1186 return rb_str_new(ptr, len);
1188 /* no default_internal or same encoding, no conversion */
1189 ienc = rb_default_internal_encoding();
1190 if (!ienc || eenc == ienc) {
1191 return rb_enc_str_new(ptr, len, eenc);
1193 /* ASCII compatible, and ASCII only string, no conversion in
1194 * default_internal */
1195 if ((eidx == rb_ascii8bit_encindex()) ||
1196 (eidx == rb_usascii_encindex()) ||
1197 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1198 return rb_enc_str_new(ptr, len, ienc);
1200 /* convert from the given encoding to default_internal */
1201 str = rb_enc_str_new(NULL, 0, ienc);
1202 /* when the conversion failed for some reason, just ignore the
1203 * default_internal and result in the given encoding as-is. */
1204 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1205 rb_str_initialize(str, ptr, len, eenc);
1207 return str;
1210 VALUE
1211 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1213 int eidx = rb_enc_to_index(eenc);
1214 if (eidx == rb_usascii_encindex() &&
1215 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1216 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1217 return str;
1219 rb_enc_associate_index(str, eidx);
1220 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1223 VALUE
1224 rb_external_str_new(const char *ptr, long len)
1226 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1229 VALUE
1230 rb_external_str_new_cstr(const char *ptr)
1232 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1235 VALUE
1236 rb_locale_str_new(const char *ptr, long len)
1238 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1241 VALUE
1242 rb_locale_str_new_cstr(const char *ptr)
1244 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1247 VALUE
1248 rb_filesystem_str_new(const char *ptr, long len)
1250 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1253 VALUE
1254 rb_filesystem_str_new_cstr(const char *ptr)
1256 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1259 VALUE
1260 rb_str_export(VALUE str)
1262 return rb_str_export_to_enc(str, rb_default_external_encoding());
1265 VALUE
1266 rb_str_export_locale(VALUE str)
1268 return rb_str_export_to_enc(str, rb_locale_encoding());
1271 VALUE
1272 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1274 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1277 static VALUE
1278 str_replace_shared_without_enc(VALUE str2, VALUE str)
1280 const int termlen = TERM_LEN(str);
1281 char *ptr;
1282 long len;
1284 RSTRING_GETMEM(str, ptr, len);
1285 if (str_embed_capa(str2) >= len + termlen) {
1286 char *ptr2 = RSTRING(str2)->as.embed.ary;
1287 STR_SET_EMBED(str2);
1288 memcpy(ptr2, RSTRING_PTR(str), len);
1289 STR_SET_EMBED_LEN(str2, len);
1290 TERM_FILL(ptr2+len, termlen);
1292 else {
1293 VALUE root;
1294 if (STR_SHARED_P(str)) {
1295 root = RSTRING(str)->as.heap.aux.shared;
1296 RSTRING_GETMEM(str, ptr, len);
1298 else {
1299 root = rb_str_new_frozen(str);
1300 RSTRING_GETMEM(root, ptr, len);
1302 assert(OBJ_FROZEN(root));
1303 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1304 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1305 rb_fatal("about to free a possible shared root");
1307 char *ptr2 = STR_HEAP_PTR(str2);
1308 if (ptr2 != ptr) {
1309 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1312 FL_SET(str2, STR_NOEMBED);
1313 RSTRING(str2)->as.heap.len = len;
1314 RSTRING(str2)->as.heap.ptr = ptr;
1315 STR_SET_SHARED(str2, root);
1317 return str2;
1320 static VALUE
1321 str_replace_shared(VALUE str2, VALUE str)
1323 str_replace_shared_without_enc(str2, str);
1324 rb_enc_cr_str_exact_copy(str2, str);
1325 return str2;
1328 static VALUE
1329 str_new_shared(VALUE klass, VALUE str)
1331 return str_replace_shared(str_alloc_heap(klass), str);
1334 VALUE
1335 rb_str_new_shared(VALUE str)
1337 return str_new_shared(rb_obj_class(str), str);
1340 VALUE
1341 rb_str_new_frozen(VALUE orig)
1343 if (OBJ_FROZEN(orig)) return orig;
1344 return str_new_frozen(rb_obj_class(orig), orig);
1347 static VALUE
1348 rb_str_new_frozen_String(VALUE orig)
1350 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1351 return str_new_frozen(rb_cString, orig);
1354 VALUE
1355 rb_str_tmp_frozen_acquire(VALUE orig)
1357 if (OBJ_FROZEN_RAW(orig)) return orig;
1358 return str_new_frozen_buffer(0, orig, FALSE);
1361 void
1362 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1364 if (RBASIC_CLASS(tmp) != 0)
1365 return;
1367 if (STR_EMBED_P(tmp)) {
1368 assert(OBJ_FROZEN_RAW(tmp));
1370 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1371 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1372 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1374 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1375 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1376 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1378 /* Unshare orig since the root (tmp) only has this one child. */
1379 FL_UNSET_RAW(orig, STR_SHARED);
1380 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1381 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1382 assert(OBJ_FROZEN_RAW(tmp));
1384 /* Make tmp embedded and empty so it is safe for sweeping. */
1385 STR_SET_EMBED(tmp);
1386 STR_SET_EMBED_LEN(tmp, 0);
1391 static VALUE
1392 str_new_frozen(VALUE klass, VALUE orig)
1394 return str_new_frozen_buffer(klass, orig, TRUE);
1397 static VALUE
1398 heap_str_make_shared(VALUE klass, VALUE orig)
1400 assert(!STR_EMBED_P(orig));
1401 assert(!STR_SHARED_P(orig));
1403 VALUE str = str_alloc_heap(klass);
1404 STR_SET_NOEMBED(str);
1405 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1406 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1407 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1408 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1409 RBASIC(orig)->flags &= ~STR_NOFREE;
1410 STR_SET_SHARED(orig, str);
1411 if (klass == 0)
1412 FL_UNSET_RAW(str, STR_BORROWED);
1413 return str;
1416 static VALUE
1417 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1419 VALUE str;
1421 long len = RSTRING_LEN(orig);
1423 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, 1)) {
1424 str = str_new(klass, RSTRING_PTR(orig), len);
1425 assert(STR_EMBED_P(str));
1427 else {
1428 if (FL_TEST_RAW(orig, STR_SHARED)) {
1429 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1430 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1431 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1432 assert(ofs >= 0);
1433 assert(rest >= 0);
1434 assert(ofs + rest <= RSTRING_LEN(shared));
1435 #if !USE_RVARGC
1436 assert(!STR_EMBED_P(shared));
1437 #endif
1438 assert(OBJ_FROZEN(shared));
1440 if ((ofs > 0) || (rest > 0) ||
1441 (klass != RBASIC(shared)->klass) ||
1442 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1443 str = str_new_shared(klass, shared);
1444 assert(!STR_EMBED_P(str));
1445 RSTRING(str)->as.heap.ptr += ofs;
1446 RSTRING(str)->as.heap.len -= ofs + rest;
1448 else {
1449 if (RBASIC_CLASS(shared) == 0)
1450 FL_SET_RAW(shared, STR_BORROWED);
1451 return shared;
1454 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1455 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1456 STR_SET_EMBED(str);
1457 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1458 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1459 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1461 else {
1462 str = heap_str_make_shared(klass, orig);
1466 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1467 OBJ_FREEZE(str);
1468 return str;
1471 VALUE
1472 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1474 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1477 static VALUE
1478 str_new_empty_String(VALUE str)
1480 VALUE v = rb_str_new(0, 0);
1481 rb_enc_copy(v, str);
1482 return v;
1485 #define STR_BUF_MIN_SIZE 63
1486 #if !USE_RVARGC
1487 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1488 #endif
1490 VALUE
1491 rb_str_buf_new(long capa)
1493 if (STR_EMBEDDABLE_P(capa, 1)) {
1494 return str_alloc_embed(rb_cString, capa + 1);
1497 VALUE str = str_alloc_heap(rb_cString);
1499 #if !USE_RVARGC
1500 if (capa < STR_BUF_MIN_SIZE) {
1501 capa = STR_BUF_MIN_SIZE;
1503 #endif
1504 FL_SET(str, STR_NOEMBED);
1505 RSTRING(str)->as.heap.aux.capa = capa;
1506 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1507 RSTRING(str)->as.heap.ptr[0] = '\0';
1509 return str;
1512 VALUE
1513 rb_str_buf_new_cstr(const char *ptr)
1515 VALUE str;
1516 long len = strlen(ptr);
1518 str = rb_str_buf_new(len);
1519 rb_str_buf_cat(str, ptr, len);
1521 return str;
1524 VALUE
1525 rb_str_tmp_new(long len)
1527 return str_new(0, 0, len);
1530 void
1531 rb_str_free(VALUE str)
1533 if (FL_TEST(str, RSTRING_FSTR)) {
1534 st_data_t fstr = (st_data_t)str;
1536 RB_VM_LOCK_ENTER();
1538 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1539 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1541 RB_VM_LOCK_LEAVE();
1544 if (STR_EMBED_P(str)) {
1545 RB_DEBUG_COUNTER_INC(obj_str_embed);
1547 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1548 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1549 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1551 else {
1552 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1553 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1557 RUBY_FUNC_EXPORTED size_t
1558 rb_str_memsize(VALUE str)
1560 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1561 return STR_HEAP_SIZE(str);
1563 else {
1564 return 0;
1568 VALUE
1569 rb_str_to_str(VALUE str)
1571 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1574 static inline void str_discard(VALUE str);
1575 static void str_shared_replace(VALUE str, VALUE str2);
1577 void
1578 rb_str_shared_replace(VALUE str, VALUE str2)
1580 if (str != str2) str_shared_replace(str, str2);
1583 static void
1584 str_shared_replace(VALUE str, VALUE str2)
1586 rb_encoding *enc;
1587 int cr;
1588 int termlen;
1590 RUBY_ASSERT(str2 != str);
1591 enc = STR_ENC_GET(str2);
1592 cr = ENC_CODERANGE(str2);
1593 str_discard(str);
1594 termlen = rb_enc_mbminlen(enc);
1596 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1597 STR_SET_EMBED(str);
1598 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1599 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1600 rb_enc_associate(str, enc);
1601 ENC_CODERANGE_SET(str, cr);
1603 else {
1604 #if USE_RVARGC
1605 if (STR_EMBED_P(str2)) {
1606 assert(!FL_TEST(str2, STR_SHARED));
1607 long len = RSTRING(str2)->as.embed.len;
1608 assert(len + termlen <= str_embed_capa(str2));
1610 char *new_ptr = ALLOC_N(char, len + termlen);
1611 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1612 RSTRING(str2)->as.heap.ptr = new_ptr;
1613 RSTRING(str2)->as.heap.len = len;
1614 RSTRING(str2)->as.heap.aux.capa = len;
1615 STR_SET_NOEMBED(str2);
1617 #endif
1619 STR_SET_NOEMBED(str);
1620 FL_UNSET(str, STR_SHARED);
1621 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1622 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1624 if (FL_TEST(str2, STR_SHARED)) {
1625 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1626 STR_SET_SHARED(str, shared);
1628 else {
1629 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1632 /* abandon str2 */
1633 STR_SET_EMBED(str2);
1634 RSTRING_PTR(str2)[0] = 0;
1635 STR_SET_EMBED_LEN(str2, 0);
1636 rb_enc_associate(str, enc);
1637 ENC_CODERANGE_SET(str, cr);
1641 VALUE
1642 rb_obj_as_string(VALUE obj)
1644 VALUE str;
1646 if (RB_TYPE_P(obj, T_STRING)) {
1647 return obj;
1649 str = rb_funcall(obj, idTo_s, 0);
1650 return rb_obj_as_string_result(str, obj);
1653 MJIT_FUNC_EXPORTED VALUE
1654 rb_obj_as_string_result(VALUE str, VALUE obj)
1656 if (!RB_TYPE_P(str, T_STRING))
1657 return rb_any_to_s(obj);
1658 return str;
1661 static VALUE
1662 str_replace(VALUE str, VALUE str2)
1664 long len;
1666 len = RSTRING_LEN(str2);
1667 if (STR_SHARED_P(str2)) {
1668 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1669 assert(OBJ_FROZEN(shared));
1670 STR_SET_NOEMBED(str);
1671 RSTRING(str)->as.heap.len = len;
1672 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1673 STR_SET_SHARED(str, shared);
1674 rb_enc_cr_str_exact_copy(str, str2);
1676 else {
1677 str_replace_shared(str, str2);
1680 return str;
1683 static inline VALUE
1684 ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1686 assert(size > 0);
1687 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1688 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
1689 return (VALUE)str;
1692 static inline VALUE
1693 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1695 size_t size = str_embed_size(capa);
1696 assert(rb_gc_size_allocatable_p(size));
1697 #if !USE_RVARGC
1698 assert(size <= sizeof(struct RString));
1699 #endif
1700 return ec_str_alloc(ec, klass, size);
1703 static inline VALUE
1704 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1706 return ec_str_alloc(ec, klass, sizeof(struct RString));
1709 static inline VALUE
1710 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1712 const VALUE flag_mask =
1713 #if !USE_RVARGC
1714 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1715 #endif
1716 ENC_CODERANGE_MASK | ENCODING_MASK |
1717 FL_FREEZE
1719 VALUE flags = FL_TEST_RAW(str, flag_mask);
1720 int encidx = 0;
1721 if (STR_EMBED_P(str)) {
1722 long len = RSTRING_EMBED_LEN(str);
1724 assert(str_embed_capa(dup) >= len + 1);
1725 STR_SET_EMBED_LEN(dup, len);
1726 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1728 else {
1729 VALUE root = str;
1730 if (FL_TEST_RAW(str, STR_SHARED)) {
1731 root = RSTRING(str)->as.heap.aux.shared;
1733 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1734 root = str = str_new_frozen(klass, str);
1735 flags = FL_TEST_RAW(str, flag_mask);
1737 assert(!STR_SHARED_P(root));
1738 assert(RB_OBJ_FROZEN_RAW(root));
1739 #if USE_RVARGC
1740 if (1) {
1741 #else
1742 if (STR_EMBED_P(root)) {
1743 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1744 char, RSTRING_EMBED_LEN_MAX + 1);
1746 else {
1747 #endif
1748 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1749 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1750 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1751 flags |= RSTRING_NOEMBED | STR_SHARED;
1755 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1756 encidx = rb_enc_get_index(str);
1757 flags &= ~ENCODING_MASK;
1759 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1760 if (encidx) rb_enc_associate_index(dup, encidx);
1761 return dup;
1764 static inline VALUE
1765 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1767 VALUE dup;
1768 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1769 dup = ec_str_alloc_heap(ec, klass);
1771 else {
1772 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1775 return str_duplicate_setup(klass, str, dup);
1778 static inline VALUE
1779 str_duplicate(VALUE klass, VALUE str)
1781 VALUE dup;
1782 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1783 dup = str_alloc_heap(klass);
1785 else {
1786 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1789 return str_duplicate_setup(klass, str, dup);
1792 VALUE
1793 rb_str_dup(VALUE str)
1795 return str_duplicate(rb_obj_class(str), str);
1798 VALUE
1799 rb_str_resurrect(VALUE str)
1801 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1802 return str_duplicate(rb_cString, str);
1805 VALUE
1806 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1808 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1809 return ec_str_duplicate(ec, rb_cString, str);
1813 * call-seq:
1814 * String.new(string = '') -> new_string
1815 * String.new(string = '', encoding: encoding) -> new_string
1816 * String.new(string = '', capacity: size) -> new_string
1818 * Returns a new \String that is a copy of +string+.
1820 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1821 * s = String.new
1822 * s # => ""
1823 * s.encoding # => #<Encoding:ASCII-8BIT>
1825 * With the single \String argument +string+, returns a copy of +string+
1826 * with the same encoding as +string+:
1827 * s = String.new("Que veut dire \u{e7}a?")
1828 * s # => "Que veut dire \u{e7}a?"
1829 * s.encoding # => #<Encoding:UTF-8>
1831 * Literal strings like <tt>""</tt> or here-documents always use
1832 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1834 * With keyword +encoding+, returns a copy of +str+
1835 * with the specified encoding:
1836 * s = String.new(encoding: 'ASCII')
1837 * s.encoding # => #<Encoding:US-ASCII>
1838 * s = String.new('foo', encoding: 'ASCII')
1839 * s.encoding # => #<Encoding:US-ASCII>
1841 * Note that these are equivalent:
1842 * s0 = String.new('foo', encoding: 'ASCII')
1843 * s1 = 'foo'.force_encoding('ASCII')
1844 * s0.encoding == s1.encoding # => true
1846 * With keyword +capacity+, returns a copy of +str+;
1847 * the given +capacity+ may set the size of the internal buffer,
1848 * which may affect performance:
1849 * String.new(capacity: 1) # => ""
1850 * String.new(capacity: 4096) # => ""
1852 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1854 * String.new('hello', encoding: 'UTF-8', capacity: 25)
1858 static VALUE
1859 rb_str_init(int argc, VALUE *argv, VALUE str)
1861 static ID keyword_ids[2];
1862 VALUE orig, opt, venc, vcapa;
1863 VALUE kwargs[2];
1864 rb_encoding *enc = 0;
1865 int n;
1867 if (!keyword_ids[0]) {
1868 keyword_ids[0] = rb_id_encoding();
1869 CONST_ID(keyword_ids[1], "capacity");
1872 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1873 if (!NIL_P(opt)) {
1874 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1875 venc = kwargs[0];
1876 vcapa = kwargs[1];
1877 if (venc != Qundef && !NIL_P(venc)) {
1878 enc = rb_to_encoding(venc);
1880 if (vcapa != Qundef && !NIL_P(vcapa)) {
1881 long capa = NUM2LONG(vcapa);
1882 long len = 0;
1883 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1885 if (capa < STR_BUF_MIN_SIZE) {
1886 capa = STR_BUF_MIN_SIZE;
1888 if (n == 1) {
1889 StringValue(orig);
1890 len = RSTRING_LEN(orig);
1891 if (capa < len) {
1892 capa = len;
1894 if (orig == str) n = 0;
1896 str_modifiable(str);
1897 if (STR_EMBED_P(str)) { /* make noembed always */
1898 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1899 #if USE_RVARGC
1900 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1901 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1902 #else
1903 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1904 #endif
1905 RSTRING(str)->as.heap.ptr = new_ptr;
1907 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1908 const size_t size = (size_t)capa + termlen;
1909 const char *const old_ptr = RSTRING_PTR(str);
1910 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1911 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1912 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1913 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1914 RSTRING(str)->as.heap.ptr = new_ptr;
1916 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1917 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1918 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1920 RSTRING(str)->as.heap.len = len;
1921 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1922 if (n == 1) {
1923 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1924 rb_enc_cr_str_exact_copy(str, orig);
1926 FL_SET(str, STR_NOEMBED);
1927 RSTRING(str)->as.heap.aux.capa = capa;
1929 else if (n == 1) {
1930 rb_str_replace(str, orig);
1932 if (enc) {
1933 rb_enc_associate(str, enc);
1934 ENC_CODERANGE_CLEAR(str);
1937 else if (n == 1) {
1938 rb_str_replace(str, orig);
1940 return str;
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 leading byte has the bit pattern 0xxxxxxx or 11xxxxxx
 * (see https://en.wikipedia.org/wiki/UTF-8), so for a single byte the
 * test can be written as:
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represents whether this byte is leading or not.
 *
 * This function applies that logic to every byte of the word at +s+ at
 * once and returns how many leading bytes the word contains.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;
    uintptr_t flags;

    /* After this, bit0 of each byte says whether that byte is a lead byte. */
    flags = (word >> 6) | (~word >> 7);
    flags &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(flags);
#else
    flags += (flags >> 8);
    flags += (flags >> 16);
# if SIZEOF_VOIDP == 8
    flags += (flags >> 32);
# endif
    return (flags & 0xF);
#endif
}
#endif
1983 static inline long
1984 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1986 long c;
1987 const char *q;
1989 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1990 long diff = (long)(e - p);
1991 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1993 #ifdef NONASCII_MASK
1994 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1995 uintptr_t len = 0;
1996 if ((int)sizeof(uintptr_t) * 2 < e - p) {
1997 const uintptr_t *s, *t;
1998 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
1999 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2000 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2001 while (p < (const char *)s) {
2002 if (is_utf8_lead_byte(*p)) len++;
2003 p++;
2005 while (s < t) {
2006 len += count_utf8_lead_bytes_with_word(s);
2007 s++;
2009 p = (const char *)s;
2011 while (p < e) {
2012 if (is_utf8_lead_byte(*p)) len++;
2013 p++;
2015 return (long)len;
2017 #endif
2018 else if (rb_enc_asciicompat(enc)) {
2019 c = 0;
2020 if (ENC_CODERANGE_CLEAN_P(cr)) {
2021 while (p < e) {
2022 if (ISASCII(*p)) {
2023 q = search_nonascii(p, e);
2024 if (!q)
2025 return c + (e - p);
2026 c += q - p;
2027 p = q;
2029 p += rb_enc_fast_mbclen(p, e, enc);
2030 c++;
2033 else {
2034 while (p < e) {
2035 if (ISASCII(*p)) {
2036 q = search_nonascii(p, e);
2037 if (!q)
2038 return c + (e - p);
2039 c += q - p;
2040 p = q;
2042 p += rb_enc_mbclen(p, e, enc);
2043 c++;
2046 return c;
2049 for (c=0; p<e; c++) {
2050 p += rb_enc_mbclen(p, e, enc);
2052 return c;
2055 long
2056 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2058 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2061 /* To get strlen with cr
2062 * Note that given cr is not used.
2064 long
2065 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2067 long c;
2068 const char *q;
2069 int ret;
2071 *cr = 0;
2072 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2073 long diff = (long)(e - p);
2074 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2076 else if (rb_enc_asciicompat(enc)) {
2077 c = 0;
2078 while (p < e) {
2079 if (ISASCII(*p)) {
2080 q = search_nonascii(p, e);
2081 if (!q) {
2082 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2083 return c + (e - p);
2085 c += q - p;
2086 p = q;
2088 ret = rb_enc_precise_mbclen(p, e, enc);
2089 if (MBCLEN_CHARFOUND_P(ret)) {
2090 *cr |= ENC_CODERANGE_VALID;
2091 p += MBCLEN_CHARFOUND_LEN(ret);
2093 else {
2094 *cr = ENC_CODERANGE_BROKEN;
2095 p++;
2097 c++;
2099 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2100 return c;
2103 for (c=0; p<e; c++) {
2104 ret = rb_enc_precise_mbclen(p, e, enc);
2105 if (MBCLEN_CHARFOUND_P(ret)) {
2106 *cr |= ENC_CODERANGE_VALID;
2107 p += MBCLEN_CHARFOUND_LEN(ret);
2109 else {
2110 *cr = ENC_CODERANGE_BROKEN;
2111 if (p + rb_enc_mbminlen(enc) <= e)
2112 p += rb_enc_mbminlen(enc);
2113 else
2114 p = e;
2117 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2118 return c;
2121 /* enc must be str's enc or rb_enc_check(str, str2) */
2122 static long
2123 str_strlen(VALUE str, rb_encoding *enc)
2125 const char *p, *e;
2126 int cr;
2128 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2129 if (!enc) enc = STR_ENC_GET(str);
2130 p = RSTRING_PTR(str);
2131 e = RSTRING_END(str);
2132 cr = ENC_CODERANGE(str);
2134 if (cr == ENC_CODERANGE_UNKNOWN) {
2135 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2136 if (cr) ENC_CODERANGE_SET(str, cr);
2137 return n;
2139 else {
2140 return enc_strlen(p, e, enc, cr);
2144 long
2145 rb_str_strlen(VALUE str)
2147 return str_strlen(str, NULL);
2151 * call-seq:
2152 * length -> integer
2154 * Returns the count of characters (not bytes) in +self+:
2156 * "\x80\u3042".length # => 2
2157 * "hello".length # => 5
2159 * String#size is an alias for String#length.
2161 * Related: String#bytesize.
2164 VALUE
2165 rb_str_length(VALUE str)
2167 return LONG2NUM(str_strlen(str, NULL));
2171 * call-seq:
2172 * bytesize -> integer
2174 * Returns the count of bytes in +self+:
2176 * "\x80\u3042".bytesize # => 4
2177 * "hello".bytesize # => 5
2179 * Related: String#length.
2182 static VALUE
2183 rb_str_bytesize(VALUE str)
2185 return LONG2NUM(RSTRING_LEN(str));
2189 * call-seq:
2190 * empty? -> true or false
2192 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2194 * "hello".empty? # => false
2195 * " ".empty? # => false
2196 * "".empty? # => true
2200 static VALUE
2201 rb_str_empty(VALUE str)
2203 return RBOOL(RSTRING_LEN(str) == 0);
2207 * call-seq:
2208 * string + other_string -> new_string
2210 * Returns a new \String containing +other_string+ concatenated to +self+:
2212 * "Hello from " + self.to_s # => "Hello from main"
2216 VALUE
2217 rb_str_plus(VALUE str1, VALUE str2)
2219 VALUE str3;
2220 rb_encoding *enc;
2221 char *ptr1, *ptr2, *ptr3;
2222 long len1, len2;
2223 int termlen;
2225 StringValue(str2);
2226 enc = rb_enc_check_str(str1, str2);
2227 RSTRING_GETMEM(str1, ptr1, len1);
2228 RSTRING_GETMEM(str2, ptr2, len2);
2229 termlen = rb_enc_mbminlen(enc);
2230 if (len1 > LONG_MAX - len2) {
2231 rb_raise(rb_eArgError, "string size too big");
2233 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2234 ptr3 = RSTRING_PTR(str3);
2235 memcpy(ptr3, ptr1, len1);
2236 memcpy(ptr3+len1, ptr2, len2);
2237 TERM_FILL(&ptr3[len1+len2], termlen);
2239 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2240 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2241 RB_GC_GUARD(str1);
2242 RB_GC_GUARD(str2);
2243 return str3;
2246 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2247 MJIT_FUNC_EXPORTED VALUE
2248 rb_str_opt_plus(VALUE str1, VALUE str2)
2250 assert(RBASIC_CLASS(str1) == rb_cString);
2251 assert(RBASIC_CLASS(str2) == rb_cString);
2252 long len1, len2;
2253 MAYBE_UNUSED(char) *ptr1, *ptr2;
2254 RSTRING_GETMEM(str1, ptr1, len1);
2255 RSTRING_GETMEM(str2, ptr2, len2);
2256 int enc1 = rb_enc_get_index(str1);
2257 int enc2 = rb_enc_get_index(str2);
2259 if (enc1 < 0) {
2260 return Qundef;
2262 else if (enc2 < 0) {
2263 return Qundef;
2265 else if (enc1 != enc2) {
2266 return Qundef;
2268 else if (len1 > LONG_MAX - len2) {
2269 return Qundef;
2271 else {
2272 return rb_str_plus(str1, str2);
2278 * call-seq:
2279 * string * integer -> new_string
2281 * Returns a new \String containing +integer+ copies of +self+:
2283 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2284 * "Ho! " * 0 # => ""
2288 VALUE
2289 rb_str_times(VALUE str, VALUE times)
2291 VALUE str2;
2292 long n, len;
2293 char *ptr2;
2294 int termlen;
2296 if (times == INT2FIX(1)) {
2297 return str_duplicate(rb_cString, str);
2299 if (times == INT2FIX(0)) {
2300 str2 = str_alloc_embed(rb_cString, 0);
2301 rb_enc_copy(str2, str);
2302 return str2;
2304 len = NUM2LONG(times);
2305 if (len < 0) {
2306 rb_raise(rb_eArgError, "negative argument");
2308 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2309 if (STR_EMBEDDABLE_P(len, 1)) {
2310 str2 = str_alloc_embed(rb_cString, len + 1);
2311 memset(RSTRING_PTR(str2), 0, len + 1);
2313 else {
2314 str2 = str_alloc_heap(rb_cString);
2315 RSTRING(str2)->as.heap.aux.capa = len;
2316 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2317 STR_SET_NOEMBED(str2);
2319 STR_SET_LEN(str2, len);
2320 rb_enc_copy(str2, str);
2321 return str2;
2323 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2324 rb_raise(rb_eArgError, "argument too big");
2327 len *= RSTRING_LEN(str);
2328 termlen = TERM_LEN(str);
2329 str2 = str_new0(rb_cString, 0, len, termlen);
2330 ptr2 = RSTRING_PTR(str2);
2331 if (len) {
2332 n = RSTRING_LEN(str);
2333 memcpy(ptr2, RSTRING_PTR(str), n);
2334 while (n <= len/2) {
2335 memcpy(ptr2 + n, ptr2, n);
2336 n *= 2;
2338 memcpy(ptr2 + n, ptr2, len-n);
2340 STR_SET_LEN(str2, len);
2341 TERM_FILL(&ptr2[len], termlen);
2342 rb_enc_cr_str_copy_for_substr(str2, str);
2344 return str2;
2348 * call-seq:
2349 * string % object -> new_string
2351 * Returns the result of formatting +object+ into the format specification +self+
2352 * (see Kernel#sprintf for formatting details):
2354 * "%05d" % 123 # => "00123"
2356 * If +self+ contains multiple substitutions, +object+ must be
2357 * an \Array or \Hash containing the values to be substituted:
2359 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2360 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2361 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2365 static VALUE
2366 rb_str_format_m(VALUE str, VALUE arg)
2368 VALUE tmp = rb_check_array_type(arg);
2370 if (!NIL_P(tmp)) {
2371 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2373 return rb_str_format(1, &arg, str);
2376 static inline void
2377 rb_check_lockedtmp(VALUE str)
2379 if (FL_TEST(str, STR_TMPLOCK)) {
2380 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2384 static inline void
2385 str_modifiable(VALUE str)
2387 rb_check_lockedtmp(str);
2388 rb_check_frozen(str);
2391 static inline int
2392 str_dependent_p(VALUE str)
2394 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2395 return 0;
2397 else {
2398 return 1;
2402 static inline int
2403 str_independent(VALUE str)
2405 str_modifiable(str);
2406 return !str_dependent_p(str);
2409 static void
2410 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2412 char *ptr;
2413 char *oldptr;
2414 long capa = len + expand;
2416 if (len > capa) len = capa;
2418 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2419 ptr = RSTRING(str)->as.heap.ptr;
2420 STR_SET_EMBED(str);
2421 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2422 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2423 STR_SET_EMBED_LEN(str, len);
2424 return;
2427 ptr = ALLOC_N(char, (size_t)capa + termlen);
2428 oldptr = RSTRING_PTR(str);
2429 if (oldptr) {
2430 memcpy(ptr, oldptr, len);
2432 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2433 xfree(oldptr);
2435 STR_SET_NOEMBED(str);
2436 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2437 TERM_FILL(ptr + len, termlen);
2438 RSTRING(str)->as.heap.ptr = ptr;
2439 RSTRING(str)->as.heap.len = len;
2440 RSTRING(str)->as.heap.aux.capa = capa;
2443 void
2444 rb_str_modify(VALUE str)
2446 if (!str_independent(str))
2447 str_make_independent(str);
2448 ENC_CODERANGE_CLEAR(str);
2451 void
2452 rb_str_modify_expand(VALUE str, long expand)
2454 int termlen = TERM_LEN(str);
2455 long len = RSTRING_LEN(str);
2457 if (expand < 0) {
2458 rb_raise(rb_eArgError, "negative expanding string size");
2460 if (expand >= LONG_MAX - len) {
2461 rb_raise(rb_eArgError, "string size too big");
2464 if (!str_independent(str)) {
2465 str_make_independent_expand(str, len, expand, termlen);
2467 else if (expand > 0) {
2468 RESIZE_CAPA_TERM(str, len + expand, termlen);
2470 ENC_CODERANGE_CLEAR(str);
2473 /* As rb_str_modify(), but don't clear coderange */
2474 static void
2475 str_modify_keep_cr(VALUE str)
2477 if (!str_independent(str))
2478 str_make_independent(str);
2479 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2480 /* Force re-scan later */
2481 ENC_CODERANGE_CLEAR(str);
2484 static inline void
2485 str_discard(VALUE str)
2487 str_modifiable(str);
2488 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2489 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2490 RSTRING(str)->as.heap.ptr = 0;
2491 RSTRING(str)->as.heap.len = 0;
2495 void
2496 rb_must_asciicompat(VALUE str)
2498 rb_encoding *enc = rb_enc_get(str);
2499 if (!rb_enc_asciicompat(enc)) {
2500 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2504 VALUE
2505 rb_string_value(volatile VALUE *ptr)
2507 VALUE s = *ptr;
2508 if (!RB_TYPE_P(s, T_STRING)) {
2509 s = rb_str_to_str(s);
2510 *ptr = s;
2512 return s;
2515 char *
2516 rb_string_value_ptr(volatile VALUE *ptr)
2518 VALUE str = rb_string_value(ptr);
2519 return RSTRING_PTR(str);
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero, 0 as soon as a
 * non-zero byte is found.  For n <= 0 no byte is read and 1 is returned.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) {
            return 0;
        }
    }
    return 1;
}
2531 static const char *
2532 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2534 const char *e = s + len;
2536 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2537 if (zero_filled(s, minlen)) return s;
2539 return 0;
2542 static char *
2543 str_fill_term(VALUE str, char *s, long len, int termlen)
2545 /* This function assumes that (capa + termlen) bytes of memory
2546 * is allocated, like many other functions in this file.
2548 if (str_dependent_p(str)) {
2549 if (!zero_filled(s + len, termlen))
2550 str_make_independent_expand(str, len, 0L, termlen);
2552 else {
2553 TERM_FILL(s + len, termlen);
2554 return s;
2556 return RSTRING_PTR(str);
2559 void
2560 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2562 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2563 long len = RSTRING_LEN(str);
2565 assert(capa >= len);
2566 if (capa - len < termlen) {
2567 rb_check_lockedtmp(str);
2568 str_make_independent_expand(str, len, 0L, termlen);
2570 else if (str_dependent_p(str)) {
2571 if (termlen > oldtermlen)
2572 str_make_independent_expand(str, len, 0L, termlen);
2574 else {
2575 if (!STR_EMBED_P(str)) {
2576 /* modify capa instead of realloc */
2577 assert(!FL_TEST((str), STR_SHARED));
2578 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2580 if (termlen > oldtermlen) {
2581 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2585 return;
2588 static char *
2589 str_null_check(VALUE str, int *w)
2591 char *s = RSTRING_PTR(str);
2592 long len = RSTRING_LEN(str);
2593 rb_encoding *enc = rb_enc_get(str);
2594 const int minlen = rb_enc_mbminlen(enc);
2596 if (minlen > 1) {
2597 *w = 1;
2598 if (str_null_char(s, len, minlen, enc)) {
2599 return NULL;
2601 return str_fill_term(str, s, len, minlen);
2603 *w = 0;
2604 if (!s || memchr(s, 0, len)) {
2605 return NULL;
2607 if (s[len]) {
2608 s = str_fill_term(str, s, len, minlen);
2610 return s;
2613 char *
2614 rb_str_to_cstr(VALUE str)
2616 int w;
2617 return str_null_check(str, &w);
2620 char *
2621 rb_string_value_cstr(volatile VALUE *ptr)
2623 VALUE str = rb_string_value(ptr);
2624 int w;
2625 char *s = str_null_check(str, &w);
2626 if (!s) {
2627 if (w) {
2628 rb_raise(rb_eArgError, "string contains null char");
2630 rb_raise(rb_eArgError, "string contains null byte");
2632 return s;
2635 char *
2636 rb_str_fill_terminator(VALUE str, const int newminlen)
2638 char *s = RSTRING_PTR(str);
2639 long len = RSTRING_LEN(str);
2640 return str_fill_term(str, s, len, newminlen);
2643 VALUE
2644 rb_check_string_type(VALUE str)
2646 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2647 return str;
2651 * call-seq:
2652 * String.try_convert(object) -> object, new_string, or nil
2654 * If +object+ is a \String object, returns +object+.
2656 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2657 * calls <tt>object.to_str</tt> and returns the result.
2659 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2661 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2663 static VALUE
2664 rb_str_s_try_convert(VALUE dummy, VALUE str)
2666 return rb_check_string_type(str);
/*
 * Walks forward from p by the character count read from *nthp, in encoding
 * enc, and returns the resulting position, which is never past e.
 * A count is written back through nthp before returning: the ASCII-compatible
 * branch stores how many characters were still outstanding, the two
 * fixed-width branches store the count unchanged, and the generic loop may
 * leave it at -1 because of its post-decrement.
 */
2669 static char*
2670 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2672 long nth = *nthp;
/* at most one byte per character: the character count is a byte count */
2673 if (rb_enc_mbmaxlen(enc) == 1) {
2674 p += nth;
/* fixed-width encoding: every character occupies mbmaxlen bytes */
2676 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2677 p += nth * rb_enc_mbmaxlen(enc);
/* variable-width but ASCII compatible: ASCII runs are skipped in bulk */
2679 else if (rb_enc_asciicompat(enc)) {
2680 const char *p2, *e2;
2681 int n;
2683 while (p < e && 0 < nth) {
2684 e2 = p + nth;
/* fewer than nth bytes remain, so fewer than nth characters remain */
2685 if (e < e2) {
2686 *nthp = nth;
2687 return (char *)e;
2689 if (ISASCII(*p)) {
/* presumably yields the first non-ASCII byte in [p, e2), or NULL if none */
2690 p2 = search_nonascii(p, e2);
2691 if (!p2) {
2692 nth -= e2 - p;
2693 *nthp = nth;
2694 return (char *)e2;
2696 nth -= p2 - p;
2697 p = p2;
/* step over one character at p */
2699 n = rb_enc_mbclen(p, e, enc);
2700 p += n;
2701 nth--;
2703 *nthp = nth;
2704 if (nth != 0) {
2705 return (char *)e;
2707 return (char *)p;
/* generic encoding: advance one character per iteration */
2709 else {
2710 while (p < e && nth--) {
2711 p += rb_enc_mbclen(p, e, enc);
/* clamp positions computed by plain pointer arithmetic */
2714 if (p > e) p = e;
2715 *nthp = nth;
2716 return (char*)p;
2719 char*
2720 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2722 return str_nth_len(p, e, &nth, enc);
2725 static char*
2726 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2728 if (singlebyte)
2729 p += nth;
2730 else {
2731 p = str_nth_len(p, e, &nth, enc);
2733 if (!p) return 0;
2734 if (p > e) p = e;
2735 return (char *)p;
2738 /* char offset to byte offset */
2739 static long
2740 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2742 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2743 if (!pp) return e - p;
2744 return pp - p;
2747 long
2748 rb_str_offset(VALUE str, long pos)
2750 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2751 STR_ENC_GET(str), single_byte_optimizable(str));
2754 #ifdef NONASCII_MASK
/*
 * Walks forward from p, counting bytes accepted by is_utf8_lead_byte(), and
 * stops at e or at the lead byte reached when the count from *nthp is used
 * up.  The count still outstanding is written back through nthp.
 * NOTE(review): assumes [p, e) holds valid UTF-8 -- confirm against callers.
 */
2755 static char *
2756 str_utf8_nth(const char *p, const char *e, long *nthp)
2758 long nth = *nthp;
/* word-at-a-time path, taken only when both the range and the count are
 * larger than two pointer-sized words */
2759 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2760 const uintptr_t *s, *t;
2761 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one */
2762 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2763 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* handle the unaligned head byte by byte */
2764 while (p < (const char *)s) {
2765 if (is_utf8_lead_byte(*p)) nth--;
2766 p++;
/* presumably subtracts the number of lead bytes found in each whole word;
 * continues only while at least a word's worth of characters is outstanding */
2768 do {
2769 nth -= count_utf8_lead_bytes_with_word(s);
2770 s++;
2771 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2772 p = (char *)s;
/* finish (or do everything) byte by byte */
2774 while (p < e) {
2775 if (is_utf8_lead_byte(*p)) {
2776 if (nth == 0) break;
2777 nth--;
2779 p++;
2781 *nthp = nth;
2782 return (char *)p;
/* Byte distance from p to the position str_utf8_nth() returns for nth. */
2785 static long
2786 str_utf8_offset(const char *p, const char *e, long nth)
2788 const char *pp = str_utf8_nth(p, e, &nth);
2789 return pp - p;
2791 #endif
2793 /* byte offset to char offset */
2794 long
2795 rb_str_sublen(VALUE str, long pos)
2797 if (single_byte_optimizable(str) || pos < 0)
2798 return pos;
2799 else {
2800 char *p = RSTRING_PTR(str);
2801 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2805 VALUE
2806 rb_str_subseq(VALUE str, long beg, long len)
2808 VALUE str2;
2810 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2811 SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2812 long olen;
2813 str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2814 RSTRING(str2)->as.heap.ptr += beg;
2815 olen = RSTRING(str2)->as.heap.len;
2816 if (olen > len) RSTRING(str2)->as.heap.len = len;
2818 else {
2819 str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2820 RB_GC_GUARD(str);
2823 rb_enc_cr_str_copy_for_substr(str2, str);
2825 return str2;
/*
 * Resolves the character range (beg, *lenp) of str into a byte range.
 * Returns a pointer to the start of the range, or 0 when the range is
 * rejected (negative length, or beg outside the string).  On success the
 * length to use is written back through lenp; the branches below convert it
 * from a character count into a byte count.
 */
2828 char *
2829 rb_str_subpos(VALUE str, long beg, long *lenp)
2831 long len = *lenp;
2832 long slen = -1L;
2833 long blen = RSTRING_LEN(str);
2834 rb_encoding *enc = STR_ENC_GET(str);
2835 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2837 if (len < 0) return 0;
/* an empty string can only yield an empty range */
2838 if (!blen) {
2839 len = 0;
/* single-byte fast path: character indexes are byte indexes */
2841 if (single_byte_optimizable(str)) {
2842 if (beg > blen) return 0;
2843 if (beg < 0) {
2844 beg += blen;
2845 if (beg < 0) return 0;
2847 if (len > blen - beg)
2848 len = blen - beg;
2849 if (len < 0) return 0;
2850 p = s + beg;
2851 goto end;
/* negative beg counts characters from the end of the string */
2853 if (beg < 0) {
2854 if (len > -beg) len = -beg;
/* when the start is near the end (worst-case byte span under 1/8 of the
 * string), walk backwards from e instead of counting every character */
2855 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2856 beg = -beg;
2857 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2858 p = e;
2859 if (!p) return 0;
2860 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2861 if (!p) return 0;
2862 len = e - p;
2863 goto end;
/* otherwise count the characters and turn beg into a non-negative index */
2865 else {
2866 slen = str_strlen(str, enc);
2867 beg += slen;
2868 if (beg < 0) return 0;
2869 p = s + beg;
2870 if (len == 0) goto end;
/* a character index larger than the byte length cannot be inside the string */
2873 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2874 return 0;
/* empty range: only beg needs validating */
2876 if (len == 0) {
2877 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2878 p = s + beg;
2880 #ifdef NONASCII_MASK
/* fast path for UTF-8 strings whose code range is known to be valid */
2881 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2882 enc == rb_utf8_encoding()) {
2883 p = str_utf8_nth(s, e, &beg);
2884 if (beg > 0) return 0;
2885 len = str_utf8_offset(p, e, len);
2887 #endif
/* fixed-width encoding: scale indexes by the character size */
2888 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2889 int char_sz = rb_enc_mbmaxlen(enc);
2891 p = s + beg * char_sz;
2892 if (p > e) {
2893 return 0;
2895 else if (len * char_sz > e - p)
2896 len = e - p;
2897 else
2898 len *= char_sz;
/* generic encoding: walk to beg; reaching e means the range is empty,
 * or out of range when characters were still outstanding */
2900 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2901 if (beg > 0) return 0;
2902 len = 0;
2904 else {
2905 len = str_offset(p, e, len, enc, 0);
2907 end:
2908 *lenp = len;
2909 RB_GC_GUARD(str);
2910 return p;
2913 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2915 VALUE
2916 rb_str_substr(VALUE str, long beg, long len)
2918 return str_substr(str, beg, len, TRUE);
2921 static VALUE
2922 str_substr(VALUE str, long beg, long len, int empty)
2924 VALUE str2;
2925 char *p = rb_str_subpos(str, beg, &len);
2927 if (!p) return Qnil;
2928 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2929 SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2930 long ofs = p - RSTRING_PTR(str);
2931 str2 = rb_str_new_frozen(str);
2932 str2 = str_new_shared(rb_cString, str2);
2933 RSTRING(str2)->as.heap.ptr += ofs;
2934 RSTRING(str2)->as.heap.len = len;
2935 ENC_CODERANGE_CLEAR(str2);
2937 else {
2938 if (!len && !empty) return Qnil;
2939 str2 = rb_str_new(p, len);
2940 RB_GC_GUARD(str);
2942 rb_enc_cr_str_copy_for_substr(str2, str);
2944 return str2;
2947 VALUE
2948 rb_str_freeze(VALUE str)
2950 if (OBJ_FROZEN(str)) return str;
2951 rb_str_resize(str, RSTRING_LEN(str));
2952 return rb_obj_freeze(str);
2957 * call-seq:
2958 * +string -> new_string or self
2960 * Returns +self+ if +self+ is not frozen.
2962 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2964 static VALUE
2965 str_uplus(VALUE str)
2967 if (OBJ_FROZEN(str)) {
2968 return rb_str_dup(str);
2970 else {
2971 return str;
2976 * call-seq:
2977 * -string -> frozen_string
2979 * Returns a frozen, possibly pre-existing copy of the string.
2981 * The returned \String will be deduplicated as long as it does not have
2982 * any instance variables set on it.
2984 static VALUE
2985 str_uminus(VALUE str)
2987 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2988 str = rb_str_dup(str);
2990 return rb_fstring(str);
2993 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2994 #define rb_str_dup_frozen rb_str_new_frozen
2996 VALUE
2997 rb_str_locktmp(VALUE str)
2999 if (FL_TEST(str, STR_TMPLOCK)) {
3000 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3002 FL_SET(str, STR_TMPLOCK);
3003 return str;
3006 VALUE
3007 rb_str_unlocktmp(VALUE str)
3009 if (!FL_TEST(str, STR_TMPLOCK)) {
3010 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3012 FL_UNSET(str, STR_TMPLOCK);
3013 return str;
3016 RUBY_FUNC_EXPORTED VALUE
3017 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3019 rb_str_locktmp(str);
3020 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3023 void
3024 rb_str_set_len(VALUE str, long len)
3026 long capa;
3027 const int termlen = TERM_LEN(str);
3029 str_modifiable(str);
3030 if (STR_SHARED_P(str)) {
3031 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3033 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3034 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3036 STR_SET_LEN(str, len);
3037 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
/*
 * Changes the byte length of str to len and returns str.
 * Raises ArgumentError when len is negative.  The cached code range is
 * cleared.  Depending on the current representation the string is kept
 * embedded, moved back into the embedded area, made independent, or its
 * heap buffer is reallocated.
 */
3040 VALUE
3041 rb_str_resize(VALUE str, long len)
3043 long slen;
3044 int independent;
3046 if (len < 0) {
3047 rb_raise(rb_eArgError, "negative string size (or size too big)");
3050 independent = str_independent(str);
3051 ENC_CODERANGE_CLEAR(str);
3052 slen = RSTRING_LEN(str);
3055 long capa;
3056 const int termlen = TERM_LEN(str);
/* currently embedded */
3057 if (STR_EMBED_P(str)) {
3058 if (len == slen) return str;
/* the new length plus terminator still fits the embedded area */
3059 if (str_embed_capa(str) >= len + termlen) {
3060 STR_SET_EMBED_LEN(str, len);
3061 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3062 return str;
3064 str_make_independent_expand(str, slen, len - slen, termlen);
/* heap string whose new length fits the embedded area: copy the bytes back
 * and free the old buffer only if this string owned it */
3066 else if (str_embed_capa(str) >= len + termlen) {
3067 char *ptr = STR_HEAP_PTR(str);
3068 STR_SET_EMBED(str);
3069 if (slen > len) slen = len;
3070 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3071 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3072 STR_SET_EMBED_LEN(str, len);
3073 if (independent) ruby_xfree(ptr);
3074 return str;
/* heap string that is not independent: take a private copy sized for len */
3076 else if (!independent) {
3077 if (len == slen) return str;
3078 str_make_independent_expand(str, slen, len - slen, termlen);
/* reallocate when the buffer is too small, or when the unused capacity
 * exceeds len (for len < 1024) or 1024 bytes */
3080 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3081 (capa - len) > (len < 1024 ? len : 1024)) {
3082 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3083 (size_t)len + termlen, STR_HEAP_SIZE(str));
3084 RSTRING(str)->as.heap.aux.capa = len;
3086 else if (len == slen) return str;
3087 RSTRING(str)->as.heap.len = len;
3088 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3090 return str;
3093 static VALUE
3094 str_buf_cat(VALUE str, const char *ptr, long len)
3096 long capa, total, olen, off = -1;
3097 char *sptr;
3098 const int termlen = TERM_LEN(str);
3099 #if !USE_RVARGC
3100 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3101 #endif
3103 RSTRING_GETMEM(str, sptr, olen);
3104 if (ptr >= sptr && ptr <= sptr + olen) {
3105 off = ptr - sptr;
3107 rb_str_modify(str);
3108 if (len == 0) return 0;
3109 if (STR_EMBED_P(str)) {
3110 capa = str_embed_capa(str) - termlen;
3111 sptr = RSTRING(str)->as.embed.ary;
3112 olen = RSTRING_EMBED_LEN(str);
3114 else {
3115 capa = RSTRING(str)->as.heap.aux.capa;
3116 sptr = RSTRING(str)->as.heap.ptr;
3117 olen = RSTRING(str)->as.heap.len;
3119 if (olen > LONG_MAX - len) {
3120 rb_raise(rb_eArgError, "string sizes too big");
3122 total = olen + len;
3123 if (capa < total) {
3124 if (total >= LONG_MAX / 2) {
3125 capa = total;
3127 while (total > capa) {
3128 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3130 RESIZE_CAPA_TERM(str, capa, termlen);
3131 sptr = RSTRING_PTR(str);
3133 if (off != -1) {
3134 ptr = sptr + off;
3136 memcpy(sptr + olen, ptr, len);
3137 STR_SET_LEN(str, total);
3138 TERM_FILL(sptr + total, termlen); /* sentinel */
3140 return str;
3143 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3145 VALUE
3146 rb_str_cat(VALUE str, const char *ptr, long len)
3148 if (len == 0) return str;
3149 if (len < 0) {
3150 rb_raise(rb_eArgError, "negative string size (or size too big)");
3152 return str_buf_cat(str, ptr, len);
3155 VALUE
3156 rb_str_cat_cstr(VALUE str, const char *ptr)
3158 must_not_null(ptr);
3159 return rb_str_buf_cat(str, ptr, strlen(ptr));
3162 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3163 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3164 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
/*
 * Appends the len bytes at ptr, whose encoding index is ptr_encindex and
 * whose code range is ptr_cr, to str, then sets the encoding index and code
 * range of str to the combination worked out below.  Returns str.
 * When ptr_cr_ret is non-NULL it receives the code range of ptr, which may
 * have been determined here by scanning.
 * Raises Encoding::CompatibilityError (rb_eEncCompatError) when the two
 * encodings cannot be combined, and ArgumentError for a negative len.
 */
3166 static VALUE
3167 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3168 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3170 int str_encindex = ENCODING_GET(str);
3171 int res_encindex;
3172 int str_cr, res_cr;
3173 rb_encoding *str_enc, *ptr_enc;
/* an empty receiver is treated as 7bit */
3175 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
/* same encoding: scan ptr only when the receiver's code range is known */
3177 if (str_encindex == ptr_encindex) {
3178 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3179 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3182 else {
3183 str_enc = rb_enc_from_index(str_encindex);
3184 ptr_enc = rb_enc_from_index(ptr_encindex);
/* differing encodings where either side is not ASCII compatible: only an
 * empty operand on one side can be accepted */
3185 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3186 if (len == 0)
3187 return str;
3188 if (RSTRING_LEN(str) == 0) {
3189 rb_str_buf_cat(str, ptr, len);
3190 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3191 return str;
3193 goto incompatible;
3195 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3196 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3198 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3199 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3200 str_cr = rb_enc_str_coderange(str);
3204 if (ptr_cr_ret)
3205 *ptr_cr_ret = ptr_cr;
/* differing encodings are acceptable only if at least one side is 7bit */
3207 if (str_encindex != ptr_encindex &&
3208 str_cr != ENC_CODERANGE_7BIT &&
3209 ptr_cr != ENC_CODERANGE_7BIT) {
3210 str_enc = rb_enc_from_index(str_encindex);
3211 ptr_enc = rb_enc_from_index(ptr_encindex);
3212 goto incompatible;
/* pick the resulting encoding index and code range */
3215 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3216 res_encindex = str_encindex;
3217 res_cr = ENC_CODERANGE_UNKNOWN;
3219 else if (str_cr == ENC_CODERANGE_7BIT) {
3220 if (ptr_cr == ENC_CODERANGE_7BIT) {
3221 res_encindex = str_encindex;
3222 res_cr = ENC_CODERANGE_7BIT;
/* 7bit receiver takes over the encoding and code range of ptr */
3224 else {
3225 res_encindex = ptr_encindex;
3226 res_cr = ptr_cr;
3229 else if (str_cr == ENC_CODERANGE_VALID) {
3230 res_encindex = str_encindex;
3231 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3232 res_cr = str_cr;
3233 else
3234 res_cr = ptr_cr;
3236 else { /* str_cr == ENC_CODERANGE_BROKEN */
3237 res_encindex = str_encindex;
3238 res_cr = str_cr;
/* appending bytes to a broken string makes the code range unknown again */
3239 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3242 if (len < 0) {
3243 rb_raise(rb_eArgError, "negative string size (or size too big)");
3245 str_buf_cat(str, ptr, len);
3246 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3247 return str;
3249 incompatible:
3250 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3251 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3252 UNREACHABLE_RETURN(Qundef);
3255 VALUE
3256 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3258 return rb_enc_cr_str_buf_cat(str, ptr, len,
3259 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3262 VALUE
3263 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3265 /* ptr must reference NUL terminated ASCII string. */
3266 int encindex = ENCODING_GET(str);
3267 rb_encoding *enc = rb_enc_from_index(encindex);
3268 if (rb_enc_asciicompat(enc)) {
3269 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3270 encindex, ENC_CODERANGE_7BIT, 0);
3272 else {
3273 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3274 while (*ptr) {
3275 unsigned int c = (unsigned char)*ptr;
3276 int len = rb_enc_codelen(c, enc);
3277 rb_enc_mbcput(c, buf, enc);
3278 rb_enc_cr_str_buf_cat(str, buf, len,
3279 encindex, ENC_CODERANGE_VALID, 0);
3280 ptr++;
3282 return str;
3286 VALUE
3287 rb_str_buf_append(VALUE str, VALUE str2)
3289 int str2_cr;
3291 str2_cr = ENC_CODERANGE(str2);
3293 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3294 ENCODING_GET(str2), str2_cr, &str2_cr);
3296 ENC_CODERANGE_SET(str2, str2_cr);
3298 return str;
3301 VALUE
3302 rb_str_append(VALUE str, VALUE str2)
3304 StringValue(str2);
3305 return rb_str_buf_append(str, str2);
3308 #define MIN_PRE_ALLOC_SIZE 48
3310 MJIT_FUNC_EXPORTED VALUE
3311 rb_str_concat_literals(size_t num, const VALUE *strary)
3313 VALUE str;
3314 size_t i, s;
3315 long len = 1;
3317 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3318 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3320 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3321 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3322 str = rb_str_resurrect(strary[0]);
3323 s = 1;
3325 else {
3326 str = rb_str_buf_new(len);
3327 rb_enc_copy(str, strary[0]);
3328 s = 0;
3331 for (i = s; i < num; ++i) {
3332 const VALUE v = strary[i];
3333 int encidx = ENCODING_GET(v);
3335 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3336 encidx, ENC_CODERANGE(v), NULL);
3337 if (encidx != ENCINDEX_US_ASCII) {
3338 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3339 rb_enc_set_index(str, encidx);
3342 return str;
3346 * call-seq:
3347 * concat(*objects) -> string
3349 * Concatenates each object in +objects+ to +self+ and returns +self+:
3351 * s = 'foo'
3352 * s.concat('bar', 'baz') # => "foobarbaz"
3353 * s # => "foobarbaz"
3355 * For each given object +object+ that is an \Integer,
3356 * the value is considered a codepoint and converted to a character before concatenation:
3358 * s = 'foo'
3359 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3361 * Related: String#<<, which takes a single argument.
3363 static VALUE
3364 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3366 str_modifiable(str);
3368 if (argc == 1) {
3369 return rb_str_concat(str, argv[0]);
3371 else if (argc > 1) {
3372 int i;
3373 VALUE arg_str = rb_str_tmp_new(0);
3374 rb_enc_copy(arg_str, str);
3375 for (i = 0; i < argc; i++) {
3376 rb_str_concat(arg_str, argv[i]);
3378 rb_str_buf_append(str, arg_str);
3381 return str;
3385 * call-seq:
3386 * string << object -> string
3388 * Concatenates +object+ to +self+ and returns +self+:
3390 * s = 'foo'
3391 * s << 'bar' # => "foobar"
3392 * s # => "foobar"
3394 * If +object+ is an \Integer,
3395 * the value is considered a codepoint and converted to a character before concatenation:
3397 * s = 'foo'
3398 * s << 33 # => "foo!"
3400 * Related: String#concat, which takes multiple arguments.
/*
 * Appends str2 to str1 and returns str1.
 * An Integer str2 is taken as a codepoint and appended as one character in
 * the encoding of str1; any other object is handed to rb_str_append().
 * Raises RangeError for integers that do not fit an unsigned int or that are
 * not valid codepoints in the encoding of str1.
 */
3402 VALUE
3403 rb_str_concat(VALUE str1, VALUE str2)
3405 unsigned int code;
3406 rb_encoding *enc = STR_ENC_GET(str1);
3407 int encidx;
3409 if (RB_INTEGER_TYPE_P(str2)) {
/* a zero result means the integer was stored in code */
3410 if (rb_num_to_uint(str2, &code) == 0) {
3412 else if (FIXNUM_P(str2)) {
3413 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3415 else {
3416 rb_raise(rb_eRangeError, "bignum out of char range");
3419 else {
3420 return rb_str_append(str1, str2);
3423 encidx = rb_enc_to_index(enc);
/* ENCINDEX_ASCII or ENCINDEX_US_ASCII receiver: the codepoint is appended
 * as one raw byte */
3424 if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3425 /* US-ASCII automatically extended to ASCII-8BIT */
3426 char buf[1];
3427 buf[0] = (char)code;
3428 if (code > 0xFF) {
3429 rb_raise(rb_eRangeError, "%u out of char range", code);
3431 rb_str_cat(str1, buf, 1);
3432 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3433 rb_enc_associate_index(str1, ENCINDEX_ASCII);
3434 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3437 else {
3438 long pos = RSTRING_LEN(str1);
3439 int cr = ENC_CODERANGE(str1);
3440 int len;
3441 char *buf;
/* rb_enc_codelen() reports invalid codepoints through these values */
3443 switch (len = rb_enc_codelen(code, enc)) {
3444 case ONIGERR_INVALID_CODE_POINT_VALUE:
3445 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3446 break;
3447 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3448 case 0:
3449 rb_raise(rb_eRangeError, "%u out of char range", code);
3450 break;
/* encode into a scratch buffer and check that it decodes as exactly one
 * character of len bytes before touching str1 */
3452 buf = ALLOCA_N(char, len + 1);
3453 rb_enc_mbcput(code, buf, enc);
3454 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3455 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3457 rb_str_resize(str1, pos+len);
3458 memcpy(RSTRING_PTR(str1) + pos, buf, len);
/* the code range was read before the resize; a non-ASCII codepoint turns
 * a 7bit string into a valid one */
3459 if (cr == ENC_CODERANGE_7BIT && code > 127)
3460 cr = ENC_CODERANGE_VALID;
3461 ENC_CODERANGE_SET(str1, cr);
3463 return str1;
3467 * call-seq:
3468 * prepend(*other_strings) -> string
3470 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3472 * s = 'foo'
3473 * s.prepend('bar', 'baz') # => "barbazfoo"
3474 * s # => "barbazfoo"
3476 * Related: String#concat.
3479 static VALUE
3480 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3482 str_modifiable(str);
3484 if (argc == 1) {
3485 rb_str_update(str, 0L, 0L, argv[0]);
3487 else if (argc > 1) {
3488 int i;
3489 VALUE arg_str = rb_str_tmp_new(0);
3490 rb_enc_copy(arg_str, str);
3491 for (i = 0; i < argc; i++) {
3492 rb_str_append(arg_str, argv[i]);
3494 rb_str_update(str, 0L, 0L, arg_str);
3497 return str;
3500 st_index_t
3501 rb_str_hash(VALUE str)
3503 int e = ENCODING_GET(str);
3504 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3505 e = 0;
3507 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3511 rb_str_hash_cmp(VALUE str1, VALUE str2)
3513 long len1, len2;
3514 const char *ptr1, *ptr2;
3515 RSTRING_GETMEM(str1, ptr1, len1);
3516 RSTRING_GETMEM(str2, ptr2, len2);
3517 return (len1 != len2 ||
3518 !rb_str_comparable(str1, str2) ||
3519 memcmp(ptr1, ptr2, len1) != 0);
3523 * call-seq:
3524 * hash -> integer
3526 * Returns the integer hash value for +self+.
3527 * The value is based on the length, content and encoding of +self+.
3529 * Related: Object#hash.
3532 static VALUE
3533 rb_str_hash_m(VALUE str)
3535 st_index_t hval = rb_str_hash(str);
3536 return ST2FIX(hval);
3539 #define lesser(a,b) (((a)>(b))?(b):(a))
3542 rb_str_comparable(VALUE str1, VALUE str2)
3544 int idx1, idx2;
3545 int rc1, rc2;
3547 if (RSTRING_LEN(str1) == 0) return TRUE;
3548 if (RSTRING_LEN(str2) == 0) return TRUE;
3549 idx1 = ENCODING_GET(str1);
3550 idx2 = ENCODING_GET(str2);
3551 if (idx1 == idx2) return TRUE;
3552 rc1 = rb_enc_str_coderange(str1);
3553 rc2 = rb_enc_str_coderange(str2);
3554 if (rc1 == ENC_CODERANGE_7BIT) {
3555 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3556 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3557 return TRUE;
3559 if (rc2 == ENC_CODERANGE_7BIT) {
3560 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3561 return TRUE;
3563 return FALSE;
3567 rb_str_cmp(VALUE str1, VALUE str2)
3569 long len1, len2;
3570 const char *ptr1, *ptr2;
3571 int retval;
3573 if (str1 == str2) return 0;
3574 RSTRING_GETMEM(str1, ptr1, len1);
3575 RSTRING_GETMEM(str2, ptr2, len2);
3576 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3577 if (len1 == len2) {
3578 if (!rb_str_comparable(str1, str2)) {
3579 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3580 return 1;
3581 return -1;
3583 return 0;
3585 if (len1 > len2) return 1;
3586 return -1;
3588 if (retval > 0) return 1;
3589 return -1;
3593 * call-seq:
3594 * string == object -> true or false
3595 * string === object -> true or false
3597 * Returns +true+ if +object+ has the same length and content
3598 * as +self+; +false+ otherwise:
3600 * s = 'foo'
3601 * s == 'foo' # => true
3602 * s == 'food' # => false
3603 * s == 'FOO' # => false
3605 * Returns +false+ if the two strings' encodings are not compatible:
3606 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3608 * If +object+ is not an instance of \String but responds to +to_str+, then the
3609 * two strings are compared using <code>object.==</code>.
3612 VALUE
3613 rb_str_equal(VALUE str1, VALUE str2)
3615 if (str1 == str2) return Qtrue;
3616 if (!RB_TYPE_P(str2, T_STRING)) {
3617 if (!rb_respond_to(str2, idTo_str)) {
3618 return Qfalse;
3620 return rb_equal(str2, str1);
3622 return rb_str_eql_internal(str1, str2);
3626 * call-seq:
3627 * eql?(object) -> true or false
3629 * Returns +true+ if +object+ has the same length and content
3630 * as +self+; +false+ otherwise:
3632 * s = 'foo'
3633 * s.eql?('foo') # => true
3634 * s.eql?('food') # => false
3635 * s.eql?('FOO') # => false
3637 * Returns +false+ if the two strings' encodings are not compatible:
3639 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3643 MJIT_FUNC_EXPORTED VALUE
3644 rb_str_eql(VALUE str1, VALUE str2)
3646 if (str1 == str2) return Qtrue;
3647 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3648 return rb_str_eql_internal(str1, str2);
3652 * call-seq:
3653 * string <=> other_string -> -1, 0, 1, or nil
3655 * Compares +self+ and +other_string+, returning:
3657 * - -1 if +other_string+ is larger.
3658 * - 0 if the two are equal.
3659 * - 1 if +other_string+ is smaller.
3660 * - +nil+ if the two are incomparable.
3662 * Examples:
3664 * 'foo' <=> 'foo' # => 0
3665 * 'foo' <=> 'food' # => -1
3666 * 'food' <=> 'foo' # => 1
3667 * 'FOO' <=> 'foo' # => -1
3668 * 'foo' <=> 'FOO' # => 1
3669 * 'foo' <=> 1 # => nil
3673 static VALUE
3674 rb_str_cmp_m(VALUE str1, VALUE str2)
3676 int result;
3677 VALUE s = rb_check_string_type(str2);
3678 if (NIL_P(s)) {
3679 return rb_invcmp(str1, str2);
3681 result = rb_str_cmp(str1, s);
3682 return INT2FIX(result);
3685 static VALUE str_casecmp(VALUE str1, VALUE str2);
3686 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3689 * call-seq:
3690 * casecmp(other_string) -> -1, 0, 1, or nil
3692 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3694 * - -1 if <tt>other_string.downcase</tt> is larger.
3695 * - 0 if the two are equal.
3696 * - 1 if <tt>other_string.downcase</tt> is smaller.
3697 * - +nil+ if the two are incomparable.
3699 * Examples:
3701 * 'foo'.casecmp('foo') # => 0
3702 * 'foo'.casecmp('food') # => -1
3703 * 'food'.casecmp('foo') # => 1
3704 * 'FOO'.casecmp('foo') # => 0
3705 * 'foo'.casecmp('FOO') # => 0
3706 * 'foo'.casecmp(1) # => nil
3708 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3710 * Related: String#casecmp?.
3714 static VALUE
3715 rb_str_casecmp(VALUE str1, VALUE str2)
3717 VALUE s = rb_check_string_type(str2);
3718 if (NIL_P(s)) {
3719 return Qnil;
3721 return str_casecmp(str1, s);
/*
 * Compares str1 and str2 with ASCII letters lower-cased via TOLOWER().
 * Returns INT2FIX(-1), INT2FIX(0) or INT2FIX(1), or Qnil when
 * rb_enc_compatible() finds no common encoding.
 */
3724 static VALUE
3725 str_casecmp(VALUE str1, VALUE str2)
3727 long len;
3728 rb_encoding *enc;
3729 const char *p1, *p1end, *p2, *p2end;
3731 enc = rb_enc_compatible(str1, str2);
3732 if (!enc) {
3733 return Qnil;
3736 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3737 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
/* both strings are single-byte optimizable: compare byte by byte */
3738 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3739 while (p1 < p1end && p2 < p2end) {
3740 if (*p1 != *p2) {
3741 unsigned int c1 = TOLOWER(*p1 & 0xff);
3742 unsigned int c2 = TOLOWER(*p2 & 0xff);
3743 if (c1 != c2)
3744 return INT2FIX(c1 < c2 ? -1 : 1);
3746 p1++;
3747 p2++;
/* otherwise compare character by character */
3750 else {
3751 while (p1 < p1end && p2 < p2end) {
3752 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3753 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
/* non-negative results from rb_enc_ascget() are compared after lower-casing */
3755 if (0 <= c1 && 0 <= c2) {
3756 c1 = TOLOWER(c1);
3757 c2 = TOLOWER(c2);
3758 if (c1 != c2)
3759 return INT2FIX(c1 < c2 ? -1 : 1);
/* otherwise compare the raw bytes of the two characters, shorter first */
3761 else {
3762 int r;
3763 l1 = rb_enc_mbclen(p1, p1end, enc);
3764 l2 = rb_enc_mbclen(p2, p2end, enc);
3765 len = l1 < l2 ? l1 : l2;
3766 r = memcmp(p1, p2, len);
3767 if (r != 0)
3768 return INT2FIX(r < 0 ? -1 : 1);
3769 if (l1 != l2)
3770 return INT2FIX(l1 < l2 ? -1 : 1);
3772 p1 += l1;
3773 p2 += l2;
/* one string is a prefix of the other: the byte lengths decide */
3776 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3777 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3778 return INT2FIX(-1);
3782 * call-seq:
3783 * casecmp?(other_string) -> true, false, or nil
3785 * Returns +true+ if +self+ and +other_string+ are equal after
3786 * Unicode case folding, otherwise +false+:
3788 * 'foo'.casecmp?('foo') # => true
3789 * 'foo'.casecmp?('food') # => false
3790 * 'food'.casecmp?('foo') # => false
3791 * 'FOO'.casecmp?('foo') # => true
3792 * 'foo'.casecmp?('FOO') # => true
3794 * Returns +nil+ if the two values are incomparable:
3796 * 'foo'.casecmp?(1) # => nil
3798 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3800 * Related: String#casecmp.
3804 static VALUE
3805 rb_str_casecmp_p(VALUE str1, VALUE str2)
3807 VALUE s = rb_check_string_type(str2);
3808 if (NIL_P(s)) {
3809 return Qnil;
3811 return str_casecmp_p(str1, s);
3814 static VALUE
3815 str_casecmp_p(VALUE str1, VALUE str2)
3817 rb_encoding *enc;
3818 VALUE folded_str1, folded_str2;
3819 VALUE fold_opt = sym_fold;
3821 enc = rb_enc_compatible(str1, str2);
3822 if (!enc) {
3823 return Qnil;
3826 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3827 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3829 return rb_str_eql(folded_str1, folded_str2);
/*
 * Searches for the sub_len bytes at sub_ptr inside the str_len - offset
 * bytes starting at str_ptr.  Returns the position of the match plus the
 * accumulated offset, or a negative value when there is no match.
 * A byte match is accepted only when rb_enc_right_char_head() maps the match
 * position to itself; otherwise the search resumes from the position that
 * function returned.
 */
3832 static long
3833 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3834 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3836 const char *search_start = str_ptr;
3837 long pos, search_len = str_len - offset;
3839 for (;;) {
3840 const char *t;
3841 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3842 if (pos < 0) return pos;
3843 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3844 if (t == search_start + pos) break;
/* rejected match: shrink the window and continue from t */
3845 search_len -= t - search_start;
3846 if (search_len <= 0) return -1;
3847 offset += t - search_start;
3848 search_start = t;
3850 return pos + offset;
3853 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
/*
 * Returns the byte position of the first occurrence of sub in str at or
 * after offset, or -1 when there is none.
 * offset is a byte offset when in_byte is non-zero and a character offset
 * otherwise; a negative offset counts from the end of str.
 * rb_enc_check() is applied to the pair first; a broken sub never matches.
 */
3855 static long
3856 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3858 const char *str_ptr, *str_ptr_end, *sub_ptr;
3859 long str_len, sub_len;
3860 rb_encoding *enc;
3862 enc = rb_enc_check(str, sub);
3863 if (is_broken_string(sub)) return -1;
3865 str_ptr = RSTRING_PTR(str);
3866 str_ptr_end = RSTRING_END(str);
3867 str_len = RSTRING_LEN(str);
3868 sub_ptr = RSTRING_PTR(sub);
3869 sub_len = RSTRING_LEN(sub);
3871 if (str_len < sub_len) return -1;
3873 if (offset != 0) {
3874 long str_len_char, sub_len_char;
3875 int single_byte = single_byte_optimizable(str);
/* lengths in the same unit as offset: bytes or characters */
3876 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3877 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3878 if (offset < 0) {
3879 offset += str_len_char;
3880 if (offset < 0) return -1;
/* not enough room left after offset for sub to fit */
3882 if (str_len_char - offset < sub_len_char) return -1;
/* from here on offset is a byte offset */
3883 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3884 str_ptr += offset;
3886 if (sub_len == 0) return offset;
3888 /* need proceed one character at a time */
3889 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3894 * call-seq:
3895 * index(substring, offset = 0) -> integer or nil
3896 * index(regexp, offset = 0) -> integer or nil
3898 * Returns the \Integer index of the first occurrence of the given +substring+,
3899 * or +nil+ if none found:
3901 * 'foo'.index('f') # => 0
3902 * 'foo'.index('o') # => 1
3903 * 'foo'.index('oo') # => 1
3904 * 'foo'.index('ooo') # => nil
3906 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3907 * or +nil+ if none found:
3909 * 'foo'.index(/f/) # => 0
3910 * 'foo'.index(/o/) # => 1
3911 * 'foo'.index(/oo/) # => 1
3912 * 'foo'.index(/ooo/) # => nil
3914 * \Integer argument +offset+, if given, specifies the position in the
3915 * string to begin the search:
3917 * 'foo'.index('o', 1) # => 1
3918 * 'foo'.index('o', 2) # => 2
3919 * 'foo'.index('o', 3) # => nil
3921 * If +offset+ is negative, counts backward from the end of +self+:
3923 * 'foo'.index('o', -1) # => 2
3924 * 'foo'.index('o', -2) # => 1
3925 * 'foo'.index('o', -3) # => 1
3926 * 'foo'.index('o', -4) # => nil
3928 * Related: String#rindex.
3931 static VALUE
3932 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3934 VALUE sub;
3935 VALUE initpos;
3936 long pos;
3938 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3939 pos = NUM2LONG(initpos);
3941 else {
3942 pos = 0;
3944 if (pos < 0) {
3945 pos += str_strlen(str, NULL);
3946 if (pos < 0) {
3947 if (RB_TYPE_P(sub, T_REGEXP)) {
3948 rb_backref_set(Qnil);
3950 return Qnil;
3954 if (RB_TYPE_P(sub, T_REGEXP)) {
3955 if (pos > str_strlen(str, NULL))
3956 return Qnil;
3957 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3958 rb_enc_check(str, sub), single_byte_optimizable(str));
3960 if (rb_reg_search(sub, str, pos, 0) < 0) {
3961 return Qnil;
3963 else {
3964 VALUE match = rb_backref_get();
3965 struct re_registers *regs = RMATCH_REGS(match);
3966 pos = rb_str_sublen(str, BEG(0));
3967 return LONG2NUM(pos);
3970 else {
3971 StringValue(sub);
3972 pos = rb_str_index(str, sub, pos);
3973 pos = rb_str_sublen(str, pos);
3976 if (pos == -1) return Qnil;
3977 return LONG2NUM(pos);
3980 #ifdef HAVE_MEMRCHR
3981 static long
3982 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3984 char *hit, *adjusted;
3985 int c;
3986 long slen, searchlen;
3987 char *sbeg, *e, *t;
3989 slen = RSTRING_LEN(sub);
3990 if (slen == 0) return pos;
3991 sbeg = RSTRING_PTR(str);
3992 e = RSTRING_END(str);
3993 t = RSTRING_PTR(sub);
3994 c = *t & 0xff;
3995 searchlen = s - sbeg + 1;
3997 do {
3998 hit = memrchr(sbeg, c, searchlen);
3999 if (!hit) break;
4000 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4001 if (hit != adjusted) {
4002 searchlen = adjusted - sbeg;
4003 continue;
4005 if (memcmp(hit, t, slen) == 0)
4006 return rb_str_sublen(str, hit - sbeg);
4007 searchlen = adjusted - sbeg;
4008 } while (searchlen > 0);
4010 return -1;
4012 #else
4013 static long
4014 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
4016 long slen;
4017 char *sbeg, *e, *t;
4019 sbeg = RSTRING_PTR(str);
4020 e = RSTRING_END(str);
4021 t = RSTRING_PTR(sub);
4022 slen = RSTRING_LEN(sub);
4024 while (s) {
4025 if (memcmp(s, t, slen) == 0) {
4026 return pos;
4028 if (pos == 0) break;
4029 pos--;
4030 s = rb_enc_prev_char(sbeg, s, e, enc);
4033 return -1;
4035 #endif
4037 static long
4038 rb_str_rindex(VALUE str, VALUE sub, long pos)
4040 long len, slen;
4041 char *sbeg, *s;
4042 rb_encoding *enc;
4043 int singlebyte;
4045 enc = rb_enc_check(str, sub);
4046 if (is_broken_string(sub)) return -1;
4047 singlebyte = single_byte_optimizable(str);
4048 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4049 slen = str_strlen(sub, enc); /* rb_enc_check */
4051 /* substring longer than string */
4052 if (len < slen) return -1;
4053 if (len - pos < slen) pos = len - slen;
4054 if (len == 0) return pos;
4056 sbeg = RSTRING_PTR(str);
4058 if (pos == 0) {
4059 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4060 return 0;
4061 else
4062 return -1;
4065 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4066 return str_rindex(str, sub, s, pos, enc);
4070 * call-seq:
4071 * rindex(substring, offset = self.length) -> integer or nil
4072 * rindex(regexp, offset = self.length) -> integer or nil
4074 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4075 * or +nil+ if none found:
4077 * 'foo'.rindex('f') # => 0
4078 * 'foo'.rindex('o') # => 2
4079 * 'foo'.rindex('oo') # => 1
4080 * 'foo'.rindex('ooo') # => nil
4082 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4083 * or +nil+ if none found:
4085 * 'foo'.rindex(/f/) # => 0
4086 * 'foo'.rindex(/o/) # => 2
4087 * 'foo'.rindex(/oo/) # => 1
4088 * 'foo'.rindex(/ooo/) # => nil
4090 * The _last_ match means starting at the possible last position, not
4091 * the last of longest matches.
4093 * 'foo'.rindex(/o+/) # => 2
4094 * $~ #=> #<MatchData "o">
4096 * To get the last longest match, needs to combine with negative
4097 * lookbehind.
4099 * 'foo'.rindex(/(?<!o)o+/) # => 1
4100 * $~ #=> #<MatchData "oo">
4102 * Or String#index with negative lookforward.
4104 * 'foo'.index(/o+(?!.*o)/) # => 1
4105 * $~ #=> #<MatchData "oo">
4107 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4108 * string to _end_ the search:
4110 * 'foo'.rindex('o', 0) # => nil
4111 * 'foo'.rindex('o', 1) # => 1
4112 * 'foo'.rindex('o', 2) # => 2
4113 * 'foo'.rindex('o', 3) # => 2
4115 * If +offset+ is a negative \Integer, the maximum starting position in the
4116 * string to _end_ the search is the sum of the string's length and +offset+:
4118 * 'foo'.rindex('o', -1) # => 2
4119 * 'foo'.rindex('o', -2) # => 1
4120 * 'foo'.rindex('o', -3) # => nil
4121 * 'foo'.rindex('o', -4) # => nil
4123 * Related: String#index.
4126 static VALUE
4127 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4129 VALUE sub;
4130 VALUE vpos;
4131 rb_encoding *enc = STR_ENC_GET(str);
4132 long pos, len = str_strlen(str, enc); /* str's enc */
4134 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4135 pos = NUM2LONG(vpos);
4136 if (pos < 0) {
4137 pos += len;
4138 if (pos < 0) {
4139 if (RB_TYPE_P(sub, T_REGEXP)) {
4140 rb_backref_set(Qnil);
4142 return Qnil;
4145 if (pos > len) pos = len;
4147 else {
4148 pos = len;
4151 if (RB_TYPE_P(sub, T_REGEXP)) {
4152 /* enc = rb_get_check(str, sub); */
4153 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4154 enc, single_byte_optimizable(str));
4156 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4157 VALUE match = rb_backref_get();
4158 struct re_registers *regs = RMATCH_REGS(match);
4159 pos = rb_str_sublen(str, BEG(0));
4160 return LONG2NUM(pos);
4163 else {
4164 StringValue(sub);
4165 pos = rb_str_rindex(str, sub, pos);
4166 if (pos >= 0) return LONG2NUM(pos);
4168 return Qnil;
4172 * call-seq:
4173 * string =~ regexp -> integer or nil
4174 * string =~ object -> integer or nil
4176 * Returns the \Integer index of the first substring that matches
4177 * the given +regexp+, or +nil+ if no match found:
4179 * 'foo' =~ /f/ # => 0
4180 * 'foo' =~ /o/ # => 1
4181 * 'foo' =~ /x/ # => nil
4183 * Note: also updates
4184 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4186 * If the given +object+ is not a \Regexp, returns the value
4187 * returned by <tt>object =~ self</tt>.
4189 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4190 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4192 * number= nil
4193 * "no. 9" =~ /(?<number>\d+)/
4194 * number # => nil (not assigned)
4195 * /(?<number>\d+)/ =~ "no. 9"
4196 * number #=> "9"
4200 static VALUE
4201 rb_str_match(VALUE x, VALUE y)
4203 switch (OBJ_BUILTIN_TYPE(y)) {
4204 case T_STRING:
4205 rb_raise(rb_eTypeError, "type mismatch: String given");
4207 case T_REGEXP:
4208 return rb_reg_match(y, x);
4210 default:
4211 return rb_funcall(y, idEqTilde, 1, x);
4216 static VALUE get_pat(VALUE);
4220 * call-seq:
4221 * match(pattern, offset = 0) -> matchdata or nil
4222 * match(pattern, offset = 0) {|matchdata| ... } -> object
4224 * Returns a \Matchdata object (or +nil+) based on +self+ and the given +pattern+.
4226 * Note: also updates
4227 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4229 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4230 * regexp = Regexp.new(pattern)
4231 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4232 * (see Regexp#match):
4233 * matchdata = <tt>regexp.match(self)
4235 * With no block given, returns the computed +matchdata+:
4237 * 'foo'.match('f') # => #<MatchData "f">
4238 * 'foo'.match('o') # => #<MatchData "o">
4239 * 'foo'.match('x') # => nil
4241 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4243 * 'foo'.match('f', 1) # => nil
4244 * 'foo'.match('o', 1) # => #<MatchData "o">
4246 * With a block given, calls the block with the computed +matchdata+
4247 * and returns the block's return value:
4249 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4250 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4251 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4255 static VALUE
4256 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4258 VALUE re, result;
4259 if (argc < 1)
4260 rb_check_arity(argc, 1, 2);
4261 re = argv[0];
4262 argv[0] = str;
4263 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4264 if (!NIL_P(result) && rb_block_given_p()) {
4265 return rb_yield(result);
4267 return result;
4271 * call-seq:
4272 * match?(pattern, offset = 0) -> true or false
4274 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4276 * Note: does not update
4277 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4279 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4280 * regexp = Regexp.new(pattern)
4282 * Returns +true+ if <tt>self+.match(regexp)</tt> returns a \Matchdata object,
4283 * +false+ otherwise:
4285 * 'foo'.match?(/o/) # => true
4286 * 'foo'.match?('o') # => true
4287 * 'foo'.match?(/x/) # => false
4289 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4290 * 'foo'.match?('f', 1) # => false
4291 * 'foo'.match?('o', 1) # => true
4295 static VALUE
4296 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4298 VALUE re;
4299 rb_check_arity(argc, 1, 2);
4300 re = get_pat(argv[0]);
4301 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
/* Result of stepping one encoded character to its successor/predecessor. */
enum neighbor_char {
    /* the bytes are not a valid character, or no neighbor could be formed */
    NEIGHBOR_NOT_CHAR,
    /* the neighbor was written in place */
    NEIGHBOR_FOUND,
    /* stepping went past the end of the representable range */
    NEIGHBOR_WRAPPED
};
4310 static enum neighbor_char
4311 enc_succ_char(char *p, long len, rb_encoding *enc)
4313 long i;
4314 int l;
4316 if (rb_enc_mbminlen(enc) > 1) {
4317 /* wchar, trivial case */
4318 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4319 if (!MBCLEN_CHARFOUND_P(r)) {
4320 return NEIGHBOR_NOT_CHAR;
4322 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4323 l = rb_enc_code_to_mbclen(c, enc);
4324 if (!l) return NEIGHBOR_NOT_CHAR;
4325 if (l != len) return NEIGHBOR_WRAPPED;
4326 rb_enc_mbcput(c, p, enc);
4327 r = rb_enc_precise_mbclen(p, p + len, enc);
4328 if (!MBCLEN_CHARFOUND_P(r)) {
4329 return NEIGHBOR_NOT_CHAR;
4331 return NEIGHBOR_FOUND;
4333 while (1) {
4334 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4335 p[i] = '\0';
4336 if (i < 0)
4337 return NEIGHBOR_WRAPPED;
4338 ++((unsigned char*)p)[i];
4339 l = rb_enc_precise_mbclen(p, p+len, enc);
4340 if (MBCLEN_CHARFOUND_P(l)) {
4341 l = MBCLEN_CHARFOUND_LEN(l);
4342 if (l == len) {
4343 return NEIGHBOR_FOUND;
4345 else {
4346 memset(p+l, 0xff, len-l);
4349 if (MBCLEN_INVALID_P(l) && i < len-1) {
4350 long len2;
4351 int l2;
4352 for (len2 = len-1; 0 < len2; len2--) {
4353 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4354 if (!MBCLEN_INVALID_P(l2))
4355 break;
4357 memset(p+len2+1, 0xff, len-(len2+1));
4362 static enum neighbor_char
4363 enc_pred_char(char *p, long len, rb_encoding *enc)
4365 long i;
4366 int l;
4367 if (rb_enc_mbminlen(enc) > 1) {
4368 /* wchar, trivial case */
4369 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4370 if (!MBCLEN_CHARFOUND_P(r)) {
4371 return NEIGHBOR_NOT_CHAR;
4373 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4374 if (!c) return NEIGHBOR_NOT_CHAR;
4375 --c;
4376 l = rb_enc_code_to_mbclen(c, enc);
4377 if (!l) return NEIGHBOR_NOT_CHAR;
4378 if (l != len) return NEIGHBOR_WRAPPED;
4379 rb_enc_mbcput(c, p, enc);
4380 r = rb_enc_precise_mbclen(p, p + len, enc);
4381 if (!MBCLEN_CHARFOUND_P(r)) {
4382 return NEIGHBOR_NOT_CHAR;
4384 return NEIGHBOR_FOUND;
4386 while (1) {
4387 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4388 p[i] = '\xff';
4389 if (i < 0)
4390 return NEIGHBOR_WRAPPED;
4391 --((unsigned char*)p)[i];
4392 l = rb_enc_precise_mbclen(p, p+len, enc);
4393 if (MBCLEN_CHARFOUND_P(l)) {
4394 l = MBCLEN_CHARFOUND_LEN(l);
4395 if (l == len) {
4396 return NEIGHBOR_FOUND;
4398 else {
4399 memset(p+l, 0, len-l);
4402 if (MBCLEN_INVALID_P(l) && i < len-1) {
4403 long len2;
4404 int l2;
4405 for (len2 = len-1; 0 < len2; len2--) {
4406 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4407 if (!MBCLEN_INVALID_P(l2))
4408 break;
4410 memset(p+len2+1, 0, len-(len2+1));
4416 overwrite +p+ by succeeding letter in +enc+ and returns
4417 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4418 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4419 assuming each ranges are successive, and mbclen
4420 never change in each ranges.
4421 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4422 character.
4424 static enum neighbor_char
4425 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4427 enum neighbor_char ret;
4428 unsigned int c;
4429 int ctype;
4430 int range;
4431 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4433 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4434 int try;
4435 const int max_gaps = 1;
4437 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4438 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4439 ctype = ONIGENC_CTYPE_DIGIT;
4440 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4441 ctype = ONIGENC_CTYPE_ALPHA;
4442 else
4443 return NEIGHBOR_NOT_CHAR;
4445 MEMCPY(save, p, char, len);
4446 for (try = 0; try <= max_gaps; ++try) {
4447 ret = enc_succ_char(p, len, enc);
4448 if (ret == NEIGHBOR_FOUND) {
4449 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4450 if (rb_enc_isctype(c, ctype, enc))
4451 return NEIGHBOR_FOUND;
4454 MEMCPY(p, save, char, len);
4455 range = 1;
4456 while (1) {
4457 MEMCPY(save, p, char, len);
4458 ret = enc_pred_char(p, len, enc);
4459 if (ret == NEIGHBOR_FOUND) {
4460 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4461 if (!rb_enc_isctype(c, ctype, enc)) {
4462 MEMCPY(p, save, char, len);
4463 break;
4466 else {
4467 MEMCPY(p, save, char, len);
4468 break;
4470 range++;
4472 if (range == 1) {
4473 return NEIGHBOR_NOT_CHAR;
4476 if (ctype != ONIGENC_CTYPE_DIGIT) {
4477 MEMCPY(carry, p, char, len);
4478 return NEIGHBOR_WRAPPED;
4481 MEMCPY(carry, p, char, len);
4482 enc_succ_char(carry, len, enc);
4483 return NEIGHBOR_WRAPPED;
4487 static VALUE str_succ(VALUE str);
4490 * call-seq:
4491 * succ -> new_str
4493 * Returns the successor to +self+. The successor is calculated by
4494 * incrementing characters.
4496 * The first character to be incremented is the rightmost alphanumeric:
4497 * or, if no alphanumerics, the rightmost character:
4499 * 'THX1138'.succ # => "THX1139"
4500 * '<<koala>>'.succ # => "<<koalb>>"
4501 * '***'.succ # => '**+'
4503 * The successor to a digit is another digit, "carrying" to the next-left
4504 * character for a "rollover" from 9 to 0, and prepending another digit
4505 * if necessary:
4507 * '00'.succ # => "01"
4508 * '09'.succ # => "10"
4509 * '99'.succ # => "100"
4511 * The successor to a letter is another letter of the same case,
4512 * carrying to the next-left character for a rollover,
4513 * and prepending another same-case letter if necessary:
4515 * 'aa'.succ # => "ab"
4516 * 'az'.succ # => "ba"
4517 * 'zz'.succ # => "aaa"
4518 * 'AA'.succ # => "AB"
4519 * 'AZ'.succ # => "BA"
4520 * 'ZZ'.succ # => "AAA"
4522 * The successor to a non-alphanumeric character is the next character
4523 * in the underlying character set's collating sequence,
4524 * carrying to the next-left character for a rollover,
4525 * and prepending another character if necessary:
4527 * s = 0.chr * 3
4528 * s # => "\x00\x00\x00"
4529 * s.succ # => "\x00\x00\x01"
4530 * s = 255.chr * 3
4531 * s # => "\xFF\xFF\xFF"
4532 * s.succ # => "\x01\x00\x00\x00"
4534 * Carrying can occur between and among mixtures of alphanumeric characters:
4536 * s = 'zz99zz99'
4537 * s.succ # => "aaa00aa00"
4538 * s = '99zz99zz'
4539 * s.succ # => "100aa00aa"
4541 * The successor to an empty \String is a new empty \String:
4543 * ''.succ # => ""
4545 * String#next is an alias for String#succ.
4548 VALUE
4549 rb_str_succ(VALUE orig)
4551 VALUE str;
4552 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4553 rb_enc_cr_str_copy_for_substr(str, orig);
4554 return str_succ(str);
4557 static VALUE
4558 str_succ(VALUE str)
4560 rb_encoding *enc;
4561 char *sbeg, *s, *e, *last_alnum = 0;
4562 int found_alnum = 0;
4563 long l, slen;
4564 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4565 long carry_pos = 0, carry_len = 1;
4566 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4568 slen = RSTRING_LEN(str);
4569 if (slen == 0) return str;
4571 enc = STR_ENC_GET(str);
4572 sbeg = RSTRING_PTR(str);
4573 s = e = sbeg + slen;
4575 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4576 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4577 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4578 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4579 break;
4582 l = rb_enc_precise_mbclen(s, e, enc);
4583 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4584 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4585 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4586 switch (neighbor) {
4587 case NEIGHBOR_NOT_CHAR:
4588 continue;
4589 case NEIGHBOR_FOUND:
4590 return str;
4591 case NEIGHBOR_WRAPPED:
4592 last_alnum = s;
4593 break;
4595 found_alnum = 1;
4596 carry_pos = s - sbeg;
4597 carry_len = l;
4599 if (!found_alnum) { /* str contains no alnum */
4600 s = e;
4601 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4602 enum neighbor_char neighbor;
4603 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4604 l = rb_enc_precise_mbclen(s, e, enc);
4605 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4606 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4607 MEMCPY(tmp, s, char, l);
4608 neighbor = enc_succ_char(tmp, l, enc);
4609 switch (neighbor) {
4610 case NEIGHBOR_FOUND:
4611 MEMCPY(s, tmp, char, l);
4612 return str;
4613 break;
4614 case NEIGHBOR_WRAPPED:
4615 MEMCPY(s, tmp, char, l);
4616 break;
4617 case NEIGHBOR_NOT_CHAR:
4618 break;
4620 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4621 /* wrapped to \0...\0. search next valid char. */
4622 enc_succ_char(s, l, enc);
4624 if (!rb_enc_asciicompat(enc)) {
4625 MEMCPY(carry, s, char, l);
4626 carry_len = l;
4628 carry_pos = s - sbeg;
4630 ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
4632 RESIZE_CAPA(str, slen + carry_len);
4633 sbeg = RSTRING_PTR(str);
4634 s = sbeg + carry_pos;
4635 memmove(s + carry_len, s, slen - carry_pos);
4636 memmove(s, carry, carry_len);
4637 slen += carry_len;
4638 STR_SET_LEN(str, slen);
4639 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4640 rb_enc_str_coderange(str);
4641 return str;
4646 * call-seq:
4647 * succ! -> self
4649 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4651 * String#next! is an alias for String#succ!.
4654 static VALUE
4655 rb_str_succ_bang(VALUE str)
4657 rb_str_modify(str);
4658 str_succ(str);
4659 return str;
/*
 * Returns 1 when each of the first len bytes of s satisfies ISDIGIT
 * (trivially 1 when len <= 0), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4672 static int
4673 str_upto_i(VALUE str, VALUE arg)
4675 rb_yield(str);
4676 return 0;
4680 * call-seq:
4681 * upto(other_string, exclusive = false) {|string| ... } -> self
4682 * upto(other_string, exclusive = false) -> new_enumerator
4684 * With a block given, calls the block with each \String value
4685 * returned by successive calls to String#succ;
4686 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4687 * the sequence terminates when value +other_string+ is reached;
4688 * returns +self+:
4690 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4691 * Output:
4693 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4695 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4697 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4699 * Output:
4701 * a8 a9 b0 b1 b2 b3 b4 b5
4703 * If +other_string+ would not be reached, does not call the block:
4705 * '25'.upto('5') {|s| fail s }
4706 * 'aa'.upto('a') {|s| fail s }
4708 * With no block given, returns a new \Enumerator:
4710 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4714 static VALUE
4715 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4717 VALUE end, exclusive;
4719 rb_scan_args(argc, argv, "11", &end, &exclusive);
4720 RETURN_ENUMERATOR(beg, argc, argv);
4721 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4724 VALUE
4725 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4727 VALUE current, after_end;
4728 ID succ;
4729 int n, ascii;
4730 rb_encoding *enc;
4732 CONST_ID(succ, "succ");
4733 StringValue(end);
4734 enc = rb_enc_check(beg, end);
4735 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4736 /* single character */
4737 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4738 char c = RSTRING_PTR(beg)[0];
4739 char e = RSTRING_PTR(end)[0];
4741 if (c > e || (excl && c == e)) return beg;
4742 for (;;) {
4743 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4744 if (!excl && c == e) break;
4745 c++;
4746 if (excl && c == e) break;
4748 return beg;
4750 /* both edges are all digits */
4751 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4752 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4753 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4754 VALUE b, e;
4755 int width;
4757 width = RSTRING_LENINT(beg);
4758 b = rb_str_to_inum(beg, 10, FALSE);
4759 e = rb_str_to_inum(end, 10, FALSE);
4760 if (FIXNUM_P(b) && FIXNUM_P(e)) {
4761 long bi = FIX2LONG(b);
4762 long ei = FIX2LONG(e);
4763 rb_encoding *usascii = rb_usascii_encoding();
4765 while (bi <= ei) {
4766 if (excl && bi == ei) break;
4767 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4768 bi++;
4771 else {
4772 ID op = excl ? '<' : idLE;
4773 VALUE args[2], fmt = rb_fstring_lit("%.*d");
4775 args[0] = INT2FIX(width);
4776 while (rb_funcall(b, op, 1, e)) {
4777 args[1] = b;
4778 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4779 b = rb_funcallv(b, succ, 0, 0);
4782 return beg;
4784 /* normal case */
4785 n = rb_str_cmp(beg, end);
4786 if (n > 0 || (excl && n == 0)) return beg;
4788 after_end = rb_funcallv(end, succ, 0, 0);
4789 current = str_duplicate(rb_cString, beg);
4790 while (!rb_str_equal(current, after_end)) {
4791 VALUE next = Qnil;
4792 if (excl || !rb_str_equal(current, end))
4793 next = rb_funcallv(current, succ, 0, 0);
4794 if ((*each)(current, arg)) break;
4795 if (NIL_P(next)) break;
4796 current = next;
4797 StringValue(current);
4798 if (excl && rb_str_equal(current, end)) break;
4799 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4800 break;
4803 return beg;
4806 VALUE
4807 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4809 VALUE current;
4810 ID succ;
4812 CONST_ID(succ, "succ");
4813 /* both edges are all digits */
4814 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4815 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4816 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4817 int width = RSTRING_LENINT(beg);
4818 b = rb_str_to_inum(beg, 10, FALSE);
4819 if (FIXNUM_P(b)) {
4820 long bi = FIX2LONG(b);
4821 rb_encoding *usascii = rb_usascii_encoding();
4823 while (FIXABLE(bi)) {
4824 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4825 bi++;
4827 b = LONG2NUM(bi);
4829 args[0] = INT2FIX(width);
4830 while (1) {
4831 args[1] = b;
4832 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4833 b = rb_funcallv(b, succ, 0, 0);
4836 /* normal case */
4837 current = str_duplicate(rb_cString, beg);
4838 while (1) {
4839 VALUE next = rb_funcallv(current, succ, 0, 0);
4840 if ((*each)(current, arg)) break;
4841 current = next;
4842 StringValue(current);
4843 if (RSTRING_LEN(current) == 0)
4844 break;
4847 return beg;
4850 static int
4851 include_range_i(VALUE str, VALUE arg)
4853 VALUE *argp = (VALUE *)arg;
4854 if (!rb_equal(str, *argp)) return 0;
4855 *argp = Qnil;
4856 return 1;
4859 VALUE
4860 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4862 beg = rb_str_new_frozen(beg);
4863 StringValue(end);
4864 end = rb_str_new_frozen(end);
4865 if (NIL_P(val)) return Qfalse;
4866 val = rb_check_string_type(val);
4867 if (NIL_P(val)) return Qfalse;
4868 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4869 rb_enc_asciicompat(STR_ENC_GET(end)) &&
4870 rb_enc_asciicompat(STR_ENC_GET(val))) {
4871 const char *bp = RSTRING_PTR(beg);
4872 const char *ep = RSTRING_PTR(end);
4873 const char *vp = RSTRING_PTR(val);
4874 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4875 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4876 return Qfalse;
4877 else {
4878 char b = *bp;
4879 char e = *ep;
4880 char v = *vp;
4882 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4883 if (b <= v && v < e) return Qtrue;
4884 return RBOOL(!RTEST(exclusive) && v == e);
4888 #if 0
4889 /* both edges are all digits */
4890 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4891 all_digits_p(bp, RSTRING_LEN(beg)) &&
4892 all_digits_p(ep, RSTRING_LEN(end))) {
4893 /* TODO */
4895 #endif
4897 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4899 return RBOOL(NIL_P(val));
4902 static VALUE
4903 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4905 if (rb_reg_search(re, str, 0, 0) >= 0) {
4906 VALUE match = rb_backref_get();
4907 int nth = rb_reg_backref_number(match, backref);
4908 return rb_reg_nth_match(nth, match);
4910 return Qnil;
4913 static VALUE
4914 rb_str_aref(VALUE str, VALUE indx)
4916 long idx;
4918 if (FIXNUM_P(indx)) {
4919 idx = FIX2LONG(indx);
4921 else if (RB_TYPE_P(indx, T_REGEXP)) {
4922 return rb_str_subpat(str, indx, INT2FIX(0));
4924 else if (RB_TYPE_P(indx, T_STRING)) {
4925 if (rb_str_index(str, indx, 0) != -1)
4926 return str_duplicate(rb_cString, indx);
4927 return Qnil;
4929 else {
4930 /* check if indx is Range */
4931 long beg, len = str_strlen(str, NULL);
4932 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4933 case Qfalse:
4934 break;
4935 case Qnil:
4936 return Qnil;
4937 default:
4938 return rb_str_substr(str, beg, len);
4940 idx = NUM2LONG(indx);
4943 return str_substr(str, idx, 1, FALSE);
4948 * call-seq:
4949 * string[index] -> new_string or nil
4950 * string[start, length] -> new_string or nil
4951 * string[range] -> new_string or nil
4952 * string[regexp, capture = 0] -> new_string or nil
4953 * string[substring] -> new_string or nil
4955 * Returns the substring of +self+ specified by the arguments.
4957 * When the single \Integer argument +index+ is given,
4958 * returns the 1-character substring found in +self+ at offset +index+:
4960 * 'bar'[2] # => "r"
4962 * Counts backward from the end of +self+ if +index+ is negative:
4964 * 'foo'[-3] # => "f"
4966 * Returns +nil+ if +index+ is out of range:
4968 * 'foo'[3] # => nil
4969 * 'foo'[-4] # => nil
4971 * When the two \Integer arguments +start+ and +length+ are given,
4972 * returns the substring of the given +length+ found in +self+ at offset +start+:
4974 * 'foo'[0, 2] # => "fo"
4975 * 'foo'[0, 0] # => ""
4977 * Counts backward from the end of +self+ if +start+ is negative:
4979 * 'foo'[-2, 2] # => "oo"
4981 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4983 * 'foo'[3, 2] # => ""
4985 * Returns +nil+ if +start+ is out of range:
4987 * 'foo'[4, 2] # => nil
4988 * 'foo'[-4, 2] # => nil
4990 * Returns the trailing substring of +self+ if +length+ is large:
4992 * 'foo'[1, 50] # => "oo"
4994 * Returns +nil+ if +length+ is negative:
4996 * 'foo'[0, -1] # => nil
4998 * When the single \Range argument +range+ is given,
4999 * derives +start+ and +length+ values from the given +range+,
5000 * and returns values as above:
5002 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5003 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5005 * When the \Regexp argument +regexp+ is given,
5006 * and the +capture+ argument is <tt>0</tt>,
5007 * returns the first matching substring found in +self+,
5008 * or +nil+ if none found:
5010 * 'foo'[/o/] # => "o"
5011 * 'foo'[/x/] # => nil
5012 * s = 'hello there'
5013 * s[/[aeiou](.)\1/] # => "ell"
5014 * s[/[aeiou](.)\1/, 0] # => "ell"
5016 * If argument +capture+ is given and not <tt>0</tt>,
5017 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5018 * the method call returns only the specified capture
5019 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5021 * s = 'hello there'
5022 * s[/[aeiou](.)\1/, 1] # => "l"
5023 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5024 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5026 * If an invalid capture group index is given, +nil+ is returned. If an invalid
5027 * capture group name is given, +IndexError+ is raised.
5029 * When the single \String argument +substring+ is given,
5030 * returns the substring from +self+ if found, otherwise +nil+:
5032 * 'foo'['oo'] # => "oo"
5033 * 'foo'['xx'] # => nil
5035 * String#slice is an alias for String#[].
5038 static VALUE
5039 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5041 if (argc == 2) {
5042 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5043 return rb_str_subpat(str, argv[0], argv[1]);
5045 else {
5046 long beg = NUM2LONG(argv[0]);
5047 long len = NUM2LONG(argv[1]);
5048 return rb_str_substr(str, beg, len);
5051 rb_check_arity(argc, 1, 2);
5052 return rb_str_aref(str, argv[0]);
5055 VALUE
5056 rb_str_drop_bytes(VALUE str, long len)
5058 char *ptr = RSTRING_PTR(str);
5059 long olen = RSTRING_LEN(str), nlen;
5061 str_modifiable(str);
5062 if (len > olen) len = olen;
5063 nlen = olen - len;
5064 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5065 char *oldptr = ptr;
5066 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5067 STR_SET_EMBED(str);
5068 STR_SET_EMBED_LEN(str, nlen);
5069 ptr = RSTRING(str)->as.embed.ary;
5070 memmove(ptr, oldptr + len, nlen);
5071 if (fl == STR_NOEMBED) xfree(oldptr);
5073 else {
5074 if (!STR_SHARED_P(str)) {
5075 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5076 rb_enc_cr_str_exact_copy(shared, str);
5077 OBJ_FREEZE(shared);
5079 ptr = RSTRING(str)->as.heap.ptr += len;
5080 RSTRING(str)->as.heap.len = nlen;
5082 ptr[nlen] = 0;
5083 ENC_CODERANGE_CLEAR(str);
5084 return str;
5087 static void
5088 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5090 char *sptr;
5091 long slen, vlen = RSTRING_LEN(val);
5092 int cr;
5094 if (beg == 0 && vlen == 0) {
5095 rb_str_drop_bytes(str, len);
5096 return;
5099 str_modify_keep_cr(str);
5100 RSTRING_GETMEM(str, sptr, slen);
5101 if (len < vlen) {
5102 /* expand string */
5103 RESIZE_CAPA(str, slen + vlen - len);
5104 sptr = RSTRING_PTR(str);
5107 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5108 cr = rb_enc_str_coderange(val);
5109 else
5110 cr = ENC_CODERANGE_UNKNOWN;
5112 if (vlen != len) {
5113 memmove(sptr + beg + vlen,
5114 sptr + beg + len,
5115 slen - (beg + len));
5117 if (vlen < beg && len < 0) {
5118 MEMZERO(sptr + slen, char, -len);
5120 if (vlen > 0) {
5121 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5123 slen += vlen - len;
5124 STR_SET_LEN(str, slen);
5125 TERM_FILL(&sptr[slen], TERM_LEN(str));
5126 ENC_CODERANGE_SET(str, cr);
5129 void
5130 rb_str_update(VALUE str, long beg, long len, VALUE val)
5132 long slen;
5133 char *p, *e;
5134 rb_encoding *enc;
5135 int singlebyte = single_byte_optimizable(str);
5136 int cr;
5138 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5140 StringValue(val);
5141 enc = rb_enc_check(str, val);
5142 slen = str_strlen(str, enc); /* rb_enc_check */
5144 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5145 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5147 if (beg < 0) {
5148 beg += slen;
5150 assert(beg >= 0);
5151 assert(beg <= slen);
5152 if (len > slen - beg) {
5153 len = slen - beg;
5155 str_modify_keep_cr(str);
5156 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5157 if (!p) p = RSTRING_END(str);
5158 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5159 if (!e) e = RSTRING_END(str);
5160 /* error check */
5161 beg = p - RSTRING_PTR(str); /* physical position */
5162 len = e - p; /* physical length */
5163 rb_str_splice_0(str, beg, len, val);
5164 rb_enc_associate(str, enc);
5165 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5166 if (cr != ENC_CODERANGE_BROKEN)
5167 ENC_CODERANGE_SET(str, cr);
5170 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5172 static void
5173 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5175 int nth;
5176 VALUE match;
5177 long start, end, len;
5178 rb_encoding *enc;
5179 struct re_registers *regs;
5181 if (rb_reg_search(re, str, 0, 0) < 0) {
5182 rb_raise(rb_eIndexError, "regexp not matched");
5184 match = rb_backref_get();
5185 nth = rb_reg_backref_number(match, backref);
5186 regs = RMATCH_REGS(match);
5187 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5188 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5190 if (nth < 0) {
5191 nth += regs->num_regs;
5194 start = BEG(nth);
5195 if (start == -1) {
5196 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5198 end = END(nth);
5199 len = end - start;
5200 StringValue(val);
5201 enc = rb_enc_check_str(str, val);
5202 rb_str_splice_0(str, start, len, val);
5203 rb_enc_associate(str, enc);
5206 static VALUE
5207 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5209 long idx, beg;
5211 switch (TYPE(indx)) {
5212 case T_REGEXP:
5213 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5214 return val;
5216 case T_STRING:
5217 beg = rb_str_index(str, indx, 0);
5218 if (beg < 0) {
5219 rb_raise(rb_eIndexError, "string not matched");
5221 beg = rb_str_sublen(str, beg);
5222 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5223 return val;
5225 default:
5226 /* check if indx is Range */
5228 long beg, len;
5229 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5230 rb_str_splice(str, beg, len, val);
5231 return val;
5234 /* FALLTHROUGH */
5236 case T_FIXNUM:
5237 idx = NUM2LONG(indx);
5238 rb_str_splice(str, idx, 1, val);
5239 return val;
5244 * call-seq:
5245 * str[integer] = new_str
5246 * str[integer, integer] = new_str
5247 * str[range] = aString
5248 * str[regexp] = new_str
5249 * str[regexp, integer] = new_str
5250 * str[regexp, name] = new_str
5251 * str[other_str] = new_str
5253 * Element Assignment---Replaces some or all of the content of
5254 * <i>str</i>. The portion of the string affected is determined using
5255 * the same criteria as String#[]. If the replacement string is not
5256 * the same length as the text it is replacing, the string will be
5257 * adjusted accordingly. If the regular expression or string is used
5258 * as the index doesn't match a position in the string, IndexError is
5259 * raised. If the regular expression form is used, the optional
5260 * second Integer allows you to specify which portion of the match to
5261 * replace (effectively using the MatchData indexing rules. The forms
5262 * that take an Integer will raise an IndexError if the value is out
5263 * of range; the Range form will raise a RangeError, and the Regexp
5264 * and String will raise an IndexError on negative match.
5267 static VALUE
5268 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5270 if (argc == 3) {
5271 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5272 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5274 else {
5275 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5277 return argv[2];
5279 rb_check_arity(argc, 2, 3);
5280 return rb_str_aset(str, argv[0], argv[1]);
5284 * call-seq:
5285 * insert(index, other_string) -> self
5287 * Inserts the given +other_string+ into +self+; returns +self+.
5289 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5291 * 'foo'.insert(1, 'bar') # => "fbaroo"
5293 * If the \Integer +index+ is negative, counts backward from the end of +self+
5294 * and inserts +other_string+ at offset <tt>index+1</tt>
5295 * (that is, _after_ <tt>self[index]</tt>):
5297 * 'foo'.insert(-2, 'bar') # => "fobaro"
5301 static VALUE
5302 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5304 long pos = NUM2LONG(idx);
5306 if (pos == -1) {
5307 return rb_str_append(str, str2);
5309 else if (pos < 0) {
5310 pos++;
5312 rb_str_splice(str, pos, 0, str2);
5313 return str;
5318 * call-seq:
5319 * slice!(index) -> new_string or nil
5320 * slice!(start, length) -> new_string or nil
5321 * slice!(range) -> new_string or nil
5322 * slice!(regexp, capture = 0) -> new_string or nil
5323 * slice!(substring) -> new_string or nil
5325 * Removes the substring of +self+ specified by the arguments;
5326 * returns the removed substring.
5328 * See String#[] for details about the arguments that specify the substring.
5330 * A few examples:
5332 * string = "This is a string"
5333 * string.slice!(2) #=> "i"
5334 * string.slice!(3..6) #=> " is "
5335 * string.slice!(/s.*t/) #=> "sa st"
5336 * string.slice!("r") #=> "r"
5337 * string #=> "Thing"
5341 static VALUE
5342 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5344 VALUE result = Qnil;
5345 VALUE indx;
5346 long beg, len = 1;
5347 char *p;
5349 rb_check_arity(argc, 1, 2);
5350 str_modify_keep_cr(str);
5351 indx = argv[0];
5352 if (RB_TYPE_P(indx, T_REGEXP)) {
5353 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5354 VALUE match = rb_backref_get();
5355 struct re_registers *regs = RMATCH_REGS(match);
5356 int nth = 0;
5357 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5358 if ((nth += regs->num_regs) <= 0) return Qnil;
5360 else if (nth >= regs->num_regs) return Qnil;
5361 beg = BEG(nth);
5362 len = END(nth) - beg;
5363 goto subseq;
5365 else if (argc == 2) {
5366 beg = NUM2LONG(indx);
5367 len = NUM2LONG(argv[1]);
5368 goto num_index;
5370 else if (FIXNUM_P(indx)) {
5371 beg = FIX2LONG(indx);
5372 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5373 if (!len) return Qnil;
5374 beg = p - RSTRING_PTR(str);
5375 goto subseq;
5377 else if (RB_TYPE_P(indx, T_STRING)) {
5378 beg = rb_str_index(str, indx, 0);
5379 if (beg == -1) return Qnil;
5380 len = RSTRING_LEN(indx);
5381 result = str_duplicate(rb_cString, indx);
5382 goto squash;
5384 else {
5385 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5386 case Qnil:
5387 return Qnil;
5388 case Qfalse:
5389 beg = NUM2LONG(indx);
5390 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5391 if (!len) return Qnil;
5392 beg = p - RSTRING_PTR(str);
5393 goto subseq;
5394 default:
5395 goto num_index;
5399 num_index:
5400 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5401 beg = p - RSTRING_PTR(str);
5403 subseq:
5404 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5405 rb_enc_cr_str_copy_for_substr(result, str);
5407 squash:
5408 if (len > 0) {
5409 if (beg == 0) {
5410 rb_str_drop_bytes(str, len);
5412 else {
5413 char *sptr = RSTRING_PTR(str);
5414 long slen = RSTRING_LEN(str);
5415 if (beg + len > slen) /* pathological check */
5416 len = slen - beg;
5417 memmove(sptr + beg,
5418 sptr + beg + len,
5419 slen - (beg + len));
5420 slen -= len;
5421 STR_SET_LEN(str, slen);
5422 TERM_FILL(&sptr[slen], TERM_LEN(str));
5425 return result;
5428 static VALUE
5429 get_pat(VALUE pat)
5431 VALUE val;
5433 switch (OBJ_BUILTIN_TYPE(pat)) {
5434 case T_REGEXP:
5435 return pat;
5437 case T_STRING:
5438 break;
5440 default:
5441 val = rb_check_string_type(pat);
5442 if (NIL_P(val)) {
5443 Check_Type(pat, T_REGEXP);
5445 pat = val;
5448 return rb_reg_regcomp(pat);
5451 static VALUE
5452 get_pat_quoted(VALUE pat, int check)
5454 VALUE val;
5456 switch (OBJ_BUILTIN_TYPE(pat)) {
5457 case T_REGEXP:
5458 return pat;
5460 case T_STRING:
5461 break;
5463 default:
5464 val = rb_check_string_type(pat);
5465 if (NIL_P(val)) {
5466 Check_Type(pat, T_REGEXP);
5468 pat = val;
5470 if (check && is_broken_string(pat)) {
5471 rb_exc_raise(rb_reg_check_preprocess(pat));
5473 return pat;
5476 static long
5477 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5479 if (BUILTIN_TYPE(pat) == T_STRING) {
5480 pos = rb_strseq_index(str, pat, pos, 1);
5481 if (set_backref_str) {
5482 if (pos >= 0) {
5483 str = rb_str_new_frozen_String(str);
5484 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5486 else {
5487 rb_backref_set(Qnil);
5490 return pos;
5492 else {
5493 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5499 * call-seq:
5500 * sub!(pattern, replacement) -> self or nil
5501 * sub!(pattern) {|match| ... } -> self or nil
5503 * Returns +self+ with only the first occurrence
5504 * (not all occurrences) of the given +pattern+ replaced.
5506 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5508 * Related: String#sub, String#gsub, String#gsub!.
5512 static VALUE
5513 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5515 VALUE pat, repl, hash = Qnil;
5516 int iter = 0;
5517 long plen;
5518 int min_arity = rb_block_given_p() ? 1 : 2;
5519 long beg;
5521 rb_check_arity(argc, min_arity, 2);
5522 if (argc == 1) {
5523 iter = 1;
5525 else {
5526 repl = argv[1];
5527 hash = rb_check_hash_type(argv[1]);
5528 if (NIL_P(hash)) {
5529 StringValue(repl);
5533 pat = get_pat_quoted(argv[0], 1);
5535 str_modifiable(str);
5536 beg = rb_pat_search(pat, str, 0, 1);
5537 if (beg >= 0) {
5538 rb_encoding *enc;
5539 int cr = ENC_CODERANGE(str);
5540 long beg0, end0;
5541 VALUE match, match0 = Qnil;
5542 struct re_registers *regs;
5543 char *p, *rp;
5544 long len, rlen;
5546 match = rb_backref_get();
5547 regs = RMATCH_REGS(match);
5548 if (RB_TYPE_P(pat, T_STRING)) {
5549 beg0 = beg;
5550 end0 = beg0 + RSTRING_LEN(pat);
5551 match0 = pat;
5553 else {
5554 beg0 = BEG(0);
5555 end0 = END(0);
5556 if (iter) match0 = rb_reg_nth_match(0, match);
5559 if (iter || !NIL_P(hash)) {
5560 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5562 if (iter) {
5563 repl = rb_obj_as_string(rb_yield(match0));
5565 else {
5566 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5567 repl = rb_obj_as_string(repl);
5569 str_mod_check(str, p, len);
5570 rb_check_frozen(str);
5572 else {
5573 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5576 enc = rb_enc_compatible(str, repl);
5577 if (!enc) {
5578 rb_encoding *str_enc = STR_ENC_GET(str);
5579 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5580 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5581 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5582 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5583 rb_enc_name(str_enc),
5584 rb_enc_name(STR_ENC_GET(repl)));
5586 enc = STR_ENC_GET(repl);
5588 rb_str_modify(str);
5589 rb_enc_associate(str, enc);
5590 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5591 int cr2 = ENC_CODERANGE(repl);
5592 if (cr2 == ENC_CODERANGE_BROKEN ||
5593 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5594 cr = ENC_CODERANGE_UNKNOWN;
5595 else
5596 cr = cr2;
5598 plen = end0 - beg0;
5599 rlen = RSTRING_LEN(repl);
5600 len = RSTRING_LEN(str);
5601 if (rlen > plen) {
5602 RESIZE_CAPA(str, len + rlen - plen);
5604 p = RSTRING_PTR(str);
5605 if (rlen != plen) {
5606 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5608 rp = RSTRING_PTR(repl);
5609 memmove(p + beg0, rp, rlen);
5610 len += rlen - plen;
5611 STR_SET_LEN(str, len);
5612 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5613 ENC_CODERANGE_SET(str, cr);
5615 return str;
5617 return Qnil;
5622 * call-seq:
5623 * sub(pattern, replacement) -> new_string
5624 * sub(pattern) {|match| ... } -> new_string
5626 * Returns a copy of +self+ with only the first occurrence
5627 * (not all occurrences) of the given +pattern+ replaced.
5629 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5631 * Related: String#sub!, String#gsub, String#gsub!.
5635 static VALUE
5636 rb_str_sub(int argc, VALUE *argv, VALUE str)
5638 str = str_duplicate(rb_cString, str);
5639 rb_str_sub_bang(argc, argv, str);
5640 return str;
5643 static VALUE
5644 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5646 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5647 struct re_registers *regs;
5648 long beg, beg0, end0;
5649 long offset, blen, slen, len, last;
5650 enum {STR, ITER, MAP} mode = STR;
5651 char *sp, *cp;
5652 int need_backref = -1;
5653 rb_encoding *str_enc;
5655 switch (argc) {
5656 case 1:
5657 RETURN_ENUMERATOR(str, argc, argv);
5658 mode = ITER;
5659 break;
5660 case 2:
5661 repl = argv[1];
5662 hash = rb_check_hash_type(argv[1]);
5663 if (NIL_P(hash)) {
5664 StringValue(repl);
5666 else {
5667 mode = MAP;
5669 break;
5670 default:
5671 rb_error_arity(argc, 1, 2);
5674 pat = get_pat_quoted(argv[0], 1);
5675 beg = rb_pat_search(pat, str, 0, need_backref);
5676 if (beg < 0) {
5677 if (bang) return Qnil; /* no match, no substitution */
5678 return str_duplicate(rb_cString, str);
5681 offset = 0;
5682 blen = RSTRING_LEN(str) + 30; /* len + margin */
5683 dest = rb_str_buf_new(blen);
5684 sp = RSTRING_PTR(str);
5685 slen = RSTRING_LEN(str);
5686 cp = sp;
5687 str_enc = STR_ENC_GET(str);
5688 rb_enc_associate(dest, str_enc);
5689 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5691 do {
5692 match = rb_backref_get();
5693 regs = RMATCH_REGS(match);
5694 if (RB_TYPE_P(pat, T_STRING)) {
5695 beg0 = beg;
5696 end0 = beg0 + RSTRING_LEN(pat);
5697 match0 = pat;
5699 else {
5700 beg0 = BEG(0);
5701 end0 = END(0);
5702 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5705 if (mode) {
5706 if (mode == ITER) {
5707 val = rb_obj_as_string(rb_yield(match0));
5709 else {
5710 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5711 val = rb_obj_as_string(val);
5713 str_mod_check(str, sp, slen);
5714 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5715 rb_raise(rb_eRuntimeError, "block should not cheat");
5718 else if (need_backref) {
5719 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5720 if (need_backref < 0) {
5721 need_backref = val != repl;
5724 else {
5725 val = repl;
5728 len = beg0 - offset; /* copy pre-match substr */
5729 if (len) {
5730 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5733 rb_str_buf_append(dest, val);
5735 last = offset;
5736 offset = end0;
5737 if (beg0 == end0) {
5739 * Always consume at least one character of the input string
5740 * in order to prevent infinite loops.
5742 if (RSTRING_LEN(str) <= end0) break;
5743 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5744 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5745 offset = end0 + len;
5747 cp = RSTRING_PTR(str) + offset;
5748 if (offset > RSTRING_LEN(str)) break;
5749 beg = rb_pat_search(pat, str, offset, need_backref);
5750 } while (beg >= 0);
5751 if (RSTRING_LEN(str) > offset) {
5752 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5754 rb_pat_search(pat, str, last, 1);
5755 if (bang) {
5756 str_shared_replace(str, dest);
5758 else {
5759 str = dest;
5762 return str;
5767 * call-seq:
5768 * gsub!(pattern, replacement) -> self or nil
5769 * gsub!(pattern) {|match| ... } -> self or nil
5770 * gsub!(pattern) -> an_enumerator
5772 * Performs the specified substring replacement(s) on +self+;
5773 * returns +self+ if any replacement occurred, +nil+ otherwise.
5775 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5777 * Returns an Enumerator if no +replacement+ and no block given.
5779 * Related: String#sub, String#gsub, String#sub!.
5783 static VALUE
5784 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5786 str_modify_keep_cr(str);
5787 return str_gsub(argc, argv, str, 1);
5792 * call-seq:
5793 * gsub(pattern, replacement) -> new_string
5794 * gsub(pattern) {|match| ... } -> new_string
5795 * gsub(pattern) -> enumerator
5797 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5799 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5801 * Returns an Enumerator if no +replacement+ and no block given.
5803 * Related: String#sub, String#sub!, String#gsub!.
5807 static VALUE
5808 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5810 return str_gsub(argc, argv, str, 0);
5815 * call-seq:
5816 * replace(other_string) -> self
5818 * Replaces the contents of +self+ with the contents of +other_string+:
5820 * s = 'foo' # => "foo"
5821 * s.replace('bar') # => "bar"
5825 VALUE
5826 rb_str_replace(VALUE str, VALUE str2)
5828 str_modifiable(str);
5829 if (str == str2) return str;
5831 StringValue(str2);
5832 str_discard(str);
5833 return str_replace(str, str2);
5837 * call-seq:
5838 * clear -> self
5840 * Removes the contents of +self+:
5842 * s = 'foo' # => "foo"
5843 * s.clear # => ""
5847 static VALUE
5848 rb_str_clear(VALUE str)
5850 str_discard(str);
5851 STR_SET_EMBED(str);
5852 STR_SET_EMBED_LEN(str, 0);
5853 RSTRING_PTR(str)[0] = 0;
5854 if (rb_enc_asciicompat(STR_ENC_GET(str)))
5855 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5856 else
5857 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5858 return str;
5862 * call-seq:
5863 * chr -> string
5865 * Returns a string containing the first character of +self+:
5867 * s = 'foo' # => "foo"
5868 * s.chr # => "f"
5872 static VALUE
5873 rb_str_chr(VALUE str)
5875 return rb_str_substr(str, 0, 1);
5879 * call-seq:
5880 * getbyte(index) -> integer
5882 * Returns the byte at zero-based +index+ as an integer:
5884 * s = 'abcde' # => "abcde"
5885 * s.getbyte(0) # => 97
5886 * s.getbyte(1) # => 98
5888 * Related: String#setbyte.
5890 static VALUE
5891 rb_str_getbyte(VALUE str, VALUE index)
5893 long pos = NUM2LONG(index);
5895 if (pos < 0)
5896 pos += RSTRING_LEN(str);
5897 if (pos < 0 || RSTRING_LEN(str) <= pos)
5898 return Qnil;
5900 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5904 * call-seq:
5905 * setbyte(index, integer) -> integer
5907 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5909 * s = 'abcde' # => "abcde"
5910 * s.setbyte(0, 98) # => 98
5911 * s # => "bbcde"
5913 * Related: String#getbyte.
5915 static VALUE
5916 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5918 long pos = NUM2LONG(index);
5919 long len = RSTRING_LEN(str);
5920 char *ptr, *head, *left = 0;
5921 rb_encoding *enc;
5922 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5924 if (pos < -len || len <= pos)
5925 rb_raise(rb_eIndexError, "index %ld out of string", pos);
5926 if (pos < 0)
5927 pos += len;
5929 VALUE v = rb_to_int(value);
5930 VALUE w = rb_int_and(v, INT2FIX(0xff));
5931 char byte = (char)(NUM2INT(w) & 0xFF);
5933 if (!str_independent(str))
5934 str_make_independent(str);
5935 enc = STR_ENC_GET(str);
5936 head = RSTRING_PTR(str);
5937 ptr = &head[pos];
5938 if (!STR_EMBED_P(str)) {
5939 cr = ENC_CODERANGE(str);
5940 switch (cr) {
5941 case ENC_CODERANGE_7BIT:
5942 left = ptr;
5943 *ptr = byte;
5944 if (ISASCII(byte)) goto end;
5945 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5946 if (!MBCLEN_CHARFOUND_P(nlen))
5947 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5948 else
5949 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5950 goto end;
5951 case ENC_CODERANGE_VALID:
5952 left = rb_enc_left_char_head(head, ptr, head+len, enc);
5953 width = rb_enc_precise_mbclen(left, head+len, enc);
5954 *ptr = byte;
5955 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5956 if (!MBCLEN_CHARFOUND_P(nlen))
5957 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5958 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5959 ENC_CODERANGE_CLEAR(str);
5960 goto end;
5963 ENC_CODERANGE_CLEAR(str);
5964 *ptr = byte;
5966 end:
5967 return value;
5970 static VALUE
5971 str_byte_substr(VALUE str, long beg, long len, int empty)
5973 char *p, *s = RSTRING_PTR(str);
5974 long n = RSTRING_LEN(str);
5975 VALUE str2;
5977 if (beg > n || len < 0) return Qnil;
5978 if (beg < 0) {
5979 beg += n;
5980 if (beg < 0) return Qnil;
5982 if (len > n - beg)
5983 len = n - beg;
5984 if (len <= 0) {
5985 if (!empty) return Qnil;
5986 len = 0;
5987 p = 0;
5989 else
5990 p = s + beg;
5992 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5993 str2 = rb_str_new_frozen(str);
5994 str2 = str_new_shared(rb_cString, str2);
5995 RSTRING(str2)->as.heap.ptr += beg;
5996 RSTRING(str2)->as.heap.len = len;
5998 else {
5999 str2 = rb_str_new(p, len);
6002 str_enc_copy(str2, str);
6004 if (RSTRING_LEN(str2) == 0) {
6005 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6006 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6007 else
6008 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6010 else {
6011 switch (ENC_CODERANGE(str)) {
6012 case ENC_CODERANGE_7BIT:
6013 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6014 break;
6015 default:
6016 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6017 break;
6021 return str2;
6024 static VALUE
6025 str_byte_aref(VALUE str, VALUE indx)
6027 long idx;
6028 if (FIXNUM_P(indx)) {
6029 idx = FIX2LONG(indx);
6031 else {
6032 /* check if indx is Range */
6033 long beg, len = RSTRING_LEN(str);
6035 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6036 case Qfalse:
6037 break;
6038 case Qnil:
6039 return Qnil;
6040 default:
6041 return str_byte_substr(str, beg, len, TRUE);
6044 idx = NUM2LONG(indx);
6046 return str_byte_substr(str, idx, 1, FALSE);
6050 * call-seq:
6051 * byteslice(index, length = 1) -> string or nil
6052 * byteslice(range) -> string or nil
6054 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6056 * With integer arguments +index+ and +length+ given,
6057 * returns the substring beginning at the given +index+
6058 * of the given +length+ (if possible),
6059 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6061 * s = '0123456789' # => "0123456789"
6062 * s.byteslice(2) # => "2"
6063 * s.byteslice(200) # => nil
6064 * s.byteslice(4, 3) # => "456"
6065 * s.byteslice(4, 30) # => "456789"
6066 * s.byteslice(4, -1) # => nil
6067 * s.byteslice(40, 2) # => nil
6069 * In either case above, counts backwards from the end of +self+
6070 * if +index+ is negative:
6072 * s = '0123456789' # => "0123456789"
6073 * s.byteslice(-4) # => "6"
6074 * s.byteslice(-4, 3) # => "678"
6076 * With Range argument +range+ given, returns
6077 * <tt>byteslice(range.begin, range.size)</tt>:
6079 * s = '0123456789' # => "0123456789"
6080 * s.byteslice(4..6) # => "456"
6081 * s.byteslice(-6..-4) # => "456"
6082 * s.byteslice(5..2) # => "" # range.size is zero.
6083 * s.byteslice(40..42) # => nil
6085 * In all cases, a returned string has the same encoding as +self+:
6087 * s.encoding # => #<Encoding:UTF-8>
6088 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6092 static VALUE
6093 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6095 if (argc == 2) {
6096 long beg = NUM2LONG(argv[0]);
6097 long end = NUM2LONG(argv[1]);
6098 return str_byte_substr(str, beg, end, TRUE);
6100 rb_check_arity(argc, 1, 2);
6101 return str_byte_aref(str, argv[0]);
6105 * call-seq:
6106 * reverse -> string
6108 * Returns a new string with the characters from +self+ in reverse order.
6110 * 'stressed'.reverse # => "desserts"
6114 static VALUE
6115 rb_str_reverse(VALUE str)
6117 rb_encoding *enc;
6118 VALUE rev;
6119 char *s, *e, *p;
6120 int cr;
6122 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6123 enc = STR_ENC_GET(str);
6124 rev = rb_str_new(0, RSTRING_LEN(str));
6125 s = RSTRING_PTR(str); e = RSTRING_END(str);
6126 p = RSTRING_END(rev);
6127 cr = ENC_CODERANGE(str);
6129 if (RSTRING_LEN(str) > 1) {
6130 if (single_byte_optimizable(str)) {
6131 while (s < e) {
6132 *--p = *s++;
6135 else if (cr == ENC_CODERANGE_VALID) {
6136 while (s < e) {
6137 int clen = rb_enc_fast_mbclen(s, e, enc);
6139 p -= clen;
6140 memcpy(p, s, clen);
6141 s += clen;
6144 else {
6145 cr = rb_enc_asciicompat(enc) ?
6146 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6147 while (s < e) {
6148 int clen = rb_enc_mbclen(s, e, enc);
6150 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6151 p -= clen;
6152 memcpy(p, s, clen);
6153 s += clen;
6157 STR_SET_LEN(rev, RSTRING_LEN(str));
6158 str_enc_copy(rev, str);
6159 ENC_CODERANGE_SET(rev, cr);
6161 return rev;
6166 * call-seq:
6167 * reverse! -> self
6169 * Returns +self+ with its characters reversed:
6171 * s = 'stressed'
6172 * s.reverse! # => "desserts"
6173 * s # => "desserts"
6177 static VALUE
6178 rb_str_reverse_bang(VALUE str)
6180 if (RSTRING_LEN(str) > 1) {
6181 if (single_byte_optimizable(str)) {
6182 char *s, *e, c;
6184 str_modify_keep_cr(str);
6185 s = RSTRING_PTR(str);
6186 e = RSTRING_END(str) - 1;
6187 while (s < e) {
6188 c = *s;
6189 *s++ = *e;
6190 *e-- = c;
6193 else {
6194 str_shared_replace(str, rb_str_reverse(str));
6197 else {
6198 str_modify_keep_cr(str);
6200 return str;
6205 * call-seq:
6206 * include? other_string -> true or false
6208 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6210 * s = 'foo'
6211 * s.include?('f') # => true
6212 * s.include?('fo') # => true
6213 * s.include?('food') # => false
6217 static VALUE
6218 rb_str_include(VALUE str, VALUE arg)
6220 long i;
6222 StringValue(arg);
6223 i = rb_str_index(str, arg, 0);
6225 return RBOOL(i != -1);
6230 * call-seq:
6231 * to_i(base = 10) -> integer
6233 * Returns the result of interpreting leading characters in +self+
6234 * as an integer in the given +base+ (which must be in (2..36)):
6236 * '123456'.to_i # => 123456
6237 * '123def'.to_i(16) # => 1195503
6239 * Characters past a leading valid number (in the given +base+) are ignored:
6241 * '12.345'.to_i # => 12
6242 * '12345'.to_i(2) # => 1
6244 * Returns zero if there is no leading valid number:
6246 * 'abcdef'.to_i # => 0
6247 * '2'.to_i(2) # => 0
6251 static VALUE
6252 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6254 int base = 10;
6256 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6257 rb_raise(rb_eArgError, "invalid radix %d", base);
6259 return rb_str_to_inum(str, base, FALSE);
6264 * call-seq:
6265 * to_f -> float
6267 * Returns the result of interpreting leading characters in +self+ as a Float:
6269 * '3.14159'.to_f # => 3.14159
6270 '1.234e-2'.to_f # => 0.01234
6272 * Characters past a leading valid number (in the given +base+) are ignored:
6274 * '3.14 (pi to two places)'.to_f # => 3.14
6276 * Returns zero if there is no leading valid number:
6278 * 'abcdef'.to_f # => 0.0
6282 static VALUE
6283 rb_str_to_f(VALUE str)
6285 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6290 * call-seq:
6291 * to_s -> self or string
6293 * Returns +self+ if +self+ is a \String,
6294 * or +self+ converted to a \String if +self+ is a subclass of \String.
6296 * String#to_str is an alias for String#to_s.
6300 static VALUE
6301 rb_str_to_s(VALUE str)
6303 if (rb_obj_class(str) != rb_cString) {
6304 return str_duplicate(rb_cString, str);
6306 return str;
#if 0
/* Disabled helper: encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
6321 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6324 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6326 char buf[CHAR_ESC_LEN + 1];
6327 int l;
6329 #if SIZEOF_INT > 4
6330 c &= 0xffffffff;
6331 #endif
6332 if (unicode_p) {
6333 if (c < 0x7F && ISPRINT(c)) {
6334 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6336 else if (c < 0x10000) {
6337 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6339 else {
6340 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6343 else {
6344 if (c < 0x100) {
6345 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6347 else {
6348 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6351 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6352 rb_str_buf_cat(result, buf, l);
6353 return l;
/*
 * Returns the backslash escape sequence (as a static string) for the
 * control character +c+, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
    }
    return NULL;
}
6374 VALUE
6375 rb_str_escape(VALUE str)
6377 int encidx = ENCODING_GET(str);
6378 rb_encoding *enc = rb_enc_from_index(encidx);
6379 const char *p = RSTRING_PTR(str);
6380 const char *pend = RSTRING_END(str);
6381 const char *prev = p;
6382 char buf[CHAR_ESC_LEN + 1];
6383 VALUE result = rb_str_buf_new(0);
6384 int unicode_p = rb_enc_unicode_p(enc);
6385 int asciicompat = rb_enc_asciicompat(enc);
6387 while (p < pend) {
6388 unsigned int c;
6389 const char *cc;
6390 int n = rb_enc_precise_mbclen(p, pend, enc);
6391 if (!MBCLEN_CHARFOUND_P(n)) {
6392 if (p > prev) str_buf_cat(result, prev, p - prev);
6393 n = rb_enc_mbminlen(enc);
6394 if (pend < p + n)
6395 n = (int)(pend - p);
6396 while (n--) {
6397 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6398 str_buf_cat(result, buf, strlen(buf));
6399 prev = ++p;
6401 continue;
6403 n = MBCLEN_CHARFOUND_LEN(n);
6404 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6405 p += n;
6406 cc = ruby_escaped_char(c);
6407 if (cc) {
6408 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6409 str_buf_cat(result, cc, strlen(cc));
6410 prev = p;
6412 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6414 else {
6415 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6416 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6417 prev = p;
6420 if (p > prev) str_buf_cat(result, prev, p - prev);
6421 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6423 return result;
6427 * call-seq:
6428 * inspect -> string
6430 * Returns a printable version of +self+, enclosed in double-quotes,
6431 * and with special characters escaped:
6433 * s = "foo\tbar\tbaz\n"
6434 * # => "foo\tbar\tbaz\n"
6435 * s.inspect
6436 * # => "\"foo\\tbar\\tbaz\\n\""
6440 VALUE
6441 rb_str_inspect(VALUE str)
6443 int encidx = ENCODING_GET(str);
6444 rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
6445 const char *p, *pend, *prev;
6446 char buf[CHAR_ESC_LEN + 1];
6447 VALUE result = rb_str_buf_new(0);
6448 rb_encoding *resenc = rb_default_internal_encoding();
6449 int unicode_p = rb_enc_unicode_p(enc);
6450 int asciicompat = rb_enc_asciicompat(enc);
6452 if (resenc == NULL) resenc = rb_default_external_encoding();
6453 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6454 rb_enc_associate(result, resenc);
6455 str_buf_cat2(result, "\"");
6457 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6458 prev = p;
6459 actenc = get_actual_encoding(encidx, str);
6460 if (actenc != enc) {
6461 enc = actenc;
6462 if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
6464 while (p < pend) {
6465 unsigned int c, cc;
6466 int n;
6468 n = rb_enc_precise_mbclen(p, pend, enc);
6469 if (!MBCLEN_CHARFOUND_P(n)) {
6470 if (p > prev) str_buf_cat(result, prev, p - prev);
6471 n = rb_enc_mbminlen(enc);
6472 if (pend < p + n)
6473 n = (int)(pend - p);
6474 while (n--) {
6475 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6476 str_buf_cat(result, buf, strlen(buf));
6477 prev = ++p;
6479 continue;
6481 n = MBCLEN_CHARFOUND_LEN(n);
6482 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6483 p += n;
6484 if ((asciicompat || unicode_p) &&
6485 (c == '"'|| c == '\\' ||
6486 (c == '#' &&
6487 p < pend &&
6488 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6489 (cc = rb_enc_codepoint(p,pend,enc),
6490 (cc == '$' || cc == '@' || cc == '{'))))) {
6491 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6492 str_buf_cat2(result, "\\");
6493 if (asciicompat || enc == resenc) {
6494 prev = p - n;
6495 continue;
6498 switch (c) {
6499 case '\n': cc = 'n'; break;
6500 case '\r': cc = 'r'; break;
6501 case '\t': cc = 't'; break;
6502 case '\f': cc = 'f'; break;
6503 case '\013': cc = 'v'; break;
6504 case '\010': cc = 'b'; break;
6505 case '\007': cc = 'a'; break;
6506 case 033: cc = 'e'; break;
6507 default: cc = 0; break;
6509 if (cc) {
6510 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6511 buf[0] = '\\';
6512 buf[1] = (char)cc;
6513 str_buf_cat(result, buf, 2);
6514 prev = p;
6515 continue;
6517 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6518 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6519 continue;
6521 else {
6522 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6523 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6524 prev = p;
6525 continue;
6528 if (p > prev) str_buf_cat(result, prev, p - prev);
6529 str_buf_cat2(result, "\"");
6531 return result;
6534 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6537 * call-seq:
6538 * dump -> string
6540 * Returns a printable version of +self+, enclosed in double-quotes,
6541 * with special characters escaped, and with non-printing characters
6542 * replaced by hexadecimal notation:
6544 * "hello \n ''".dump # => "\"hello \\n ''\""
6545 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6547 * Related: String#undump (inverse of String#dump).
6551 VALUE
6552 rb_str_dump(VALUE str)
6554 int encidx = rb_enc_get_index(str);
6555 rb_encoding *enc = rb_enc_from_index(encidx);
6556 long len;
6557 const char *p, *pend;
6558 char *q, *qend;
6559 VALUE result;
6560 int u8 = (encidx == rb_utf8_encindex());
6561 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6563 len = 2; /* "" */
6564 if (!rb_enc_asciicompat(enc)) {
6565 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6566 len += strlen(enc->name);
6569 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6570 while (p < pend) {
6571 int clen;
6572 unsigned char c = *p++;
6574 switch (c) {
6575 case '"': case '\\':
6576 case '\n': case '\r':
6577 case '\t': case '\f':
6578 case '\013': case '\010': case '\007': case '\033':
6579 clen = 2;
6580 break;
6582 case '#':
6583 clen = IS_EVSTR(p, pend) ? 2 : 1;
6584 break;
6586 default:
6587 if (ISPRINT(c)) {
6588 clen = 1;
6590 else {
6591 if (u8 && c > 0x7F) { /* \u notation */
6592 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6593 if (MBCLEN_CHARFOUND_P(n)) {
6594 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6595 if (cc <= 0xFFFF)
6596 clen = 6; /* \uXXXX */
6597 else if (cc <= 0xFFFFF)
6598 clen = 9; /* \u{XXXXX} */
6599 else
6600 clen = 10; /* \u{XXXXXX} */
6601 p += MBCLEN_CHARFOUND_LEN(n)-1;
6602 break;
6605 clen = 4; /* \xNN */
6607 break;
6610 if (clen > LONG_MAX - len) {
6611 rb_raise(rb_eRuntimeError, "string size too big");
6613 len += clen;
6616 result = rb_str_new(0, len);
6617 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6618 q = RSTRING_PTR(result); qend = q + len + 1;
6620 *q++ = '"';
6621 while (p < pend) {
6622 unsigned char c = *p++;
6624 if (c == '"' || c == '\\') {
6625 *q++ = '\\';
6626 *q++ = c;
6628 else if (c == '#') {
6629 if (IS_EVSTR(p, pend)) *q++ = '\\';
6630 *q++ = '#';
6632 else if (c == '\n') {
6633 *q++ = '\\';
6634 *q++ = 'n';
6636 else if (c == '\r') {
6637 *q++ = '\\';
6638 *q++ = 'r';
6640 else if (c == '\t') {
6641 *q++ = '\\';
6642 *q++ = 't';
6644 else if (c == '\f') {
6645 *q++ = '\\';
6646 *q++ = 'f';
6648 else if (c == '\013') {
6649 *q++ = '\\';
6650 *q++ = 'v';
6652 else if (c == '\010') {
6653 *q++ = '\\';
6654 *q++ = 'b';
6656 else if (c == '\007') {
6657 *q++ = '\\';
6658 *q++ = 'a';
6660 else if (c == '\033') {
6661 *q++ = '\\';
6662 *q++ = 'e';
6664 else if (ISPRINT(c)) {
6665 *q++ = c;
6667 else {
6668 *q++ = '\\';
6669 if (u8) {
6670 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6671 if (MBCLEN_CHARFOUND_P(n)) {
6672 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6673 p += n;
6674 if (cc <= 0xFFFF)
6675 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6676 else
6677 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6678 q += strlen(q);
6679 continue;
6682 snprintf(q, qend-q, "x%02X", c);
6683 q += 3;
6686 *q++ = '"';
6687 *q = '\0';
6688 if (!rb_enc_asciicompat(enc)) {
6689 snprintf(q, qend-q, nonascii_suffix, enc->name);
6690 encidx = rb_ascii8bit_encindex();
6692 /* result from dump is ASCII */
6693 rb_enc_associate_index(result, encidx);
6694 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
6695 return result;
/*
 * Translates the letter that follows a backslash in a dumped string
 * ('n', 'r', 't', 'f', 'v', 'b', 'a' or 'e') into the control character it
 * denotes.  Any other argument reaches UNREACHABLE_RETURN(-1); the visible
 * caller only passes the eight letters listed here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\13';
      case 'b': return '\010';
      case 'a': return '\007';
      case 'e': return 033;
    }
    UNREACHABLE_RETURN(-1);
}
6722 static void
6723 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6725 const char *s = *ss;
6726 unsigned int c;
6727 int codelen;
6728 size_t hexlen;
6729 unsigned char buf[6];
6730 static rb_encoding *enc_utf8 = NULL;
6732 switch (*s) {
6733 case '\\':
6734 case '"':
6735 case '#':
6736 rb_str_cat(undumped, s, 1); /* cat itself */
6737 s++;
6738 break;
6739 case 'n':
6740 case 'r':
6741 case 't':
6742 case 'f':
6743 case 'v':
6744 case 'b':
6745 case 'a':
6746 case 'e':
6747 *buf = unescape_ascii(*s);
6748 rb_str_cat(undumped, (char *)buf, 1);
6749 s++;
6750 break;
6751 case 'u':
6752 if (*binary) {
6753 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6755 *utf8 = true;
6756 if (++s >= s_end) {
6757 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6759 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6760 if (*penc != enc_utf8) {
6761 *penc = enc_utf8;
6762 rb_enc_associate(undumped, enc_utf8);
6764 if (*s == '{') { /* handle \u{...} form */
6765 s++;
6766 for (;;) {
6767 if (s >= s_end) {
6768 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6770 if (*s == '}') {
6771 s++;
6772 break;
6774 if (ISSPACE(*s)) {
6775 s++;
6776 continue;
6778 c = scan_hex(s, s_end-s, &hexlen);
6779 if (hexlen == 0 || hexlen > 6) {
6780 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6782 if (c > 0x10ffff) {
6783 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6785 if (0xd800 <= c && c <= 0xdfff) {
6786 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6788 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6789 rb_str_cat(undumped, (char *)buf, codelen);
6790 s += hexlen;
6793 else { /* handle \uXXXX form */
6794 c = scan_hex(s, 4, &hexlen);
6795 if (hexlen != 4) {
6796 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6798 if (0xd800 <= c && c <= 0xdfff) {
6799 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6801 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6802 rb_str_cat(undumped, (char *)buf, codelen);
6803 s += hexlen;
6805 break;
6806 case 'x':
6807 if (*utf8) {
6808 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6810 *binary = true;
6811 if (++s >= s_end) {
6812 rb_raise(rb_eRuntimeError, "invalid hex escape");
6814 *buf = scan_hex(s, 2, &hexlen);
6815 if (hexlen != 2) {
6816 rb_raise(rb_eRuntimeError, "invalid hex escape");
6818 rb_str_cat(undumped, (char *)buf, 1);
6819 s += hexlen;
6820 break;
6821 default:
6822 rb_str_cat(undumped, s-1, 2);
6823 s++;
6826 *ss = s;
6829 static VALUE rb_str_is_ascii_only_p(VALUE str);
6832 * call-seq:
6833 * undump -> string
6835 * Returns an unescaped version of +self+:
6837 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6838 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6839 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6840 * s_undumped == s_orig # => true
6842 * Related: String#dump (inverse of String#undump).
6846 static VALUE
6847 str_undump(VALUE str)
6849 const char *s = RSTRING_PTR(str);
6850 const char *s_end = RSTRING_END(str);
6851 rb_encoding *enc = rb_enc_get(str);
6852 VALUE undumped = rb_enc_str_new(s, 0L, enc);
6853 bool utf8 = false;
6854 bool binary = false;
6855 int w;
6857 rb_must_asciicompat(str);
6858 if (rb_str_is_ascii_only_p(str) == Qfalse) {
6859 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6861 if (!str_null_check(str, &w)) {
6862 rb_raise(rb_eRuntimeError, "string contains null byte");
6864 if (RSTRING_LEN(str) < 2) goto invalid_format;
6865 if (*s != '"') goto invalid_format;
6867 /* strip '"' at the start */
6868 s++;
6870 for (;;) {
6871 if (s >= s_end) {
6872 rb_raise(rb_eRuntimeError, "unterminated dumped string");
6875 if (*s == '"') {
6876 /* epilogue */
6877 s++;
6878 if (s == s_end) {
6879 /* ascii compatible dumped string */
6880 break;
6882 else {
6883 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6884 static const char dup_suffix[] = ".dup";
6885 const char *encname;
6886 int encidx;
6887 ptrdiff_t size;
6889 /* check separately for strings dumped by older versions */
6890 size = sizeof(dup_suffix) - 1;
6891 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6893 size = sizeof(force_encoding_suffix) - 1;
6894 if (s_end - s <= size) goto invalid_format;
6895 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6896 s += size;
6898 if (utf8) {
6899 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6902 encname = s;
6903 s = memchr(s, '"', s_end-s);
6904 size = s - encname;
6905 if (!s) goto invalid_format;
6906 if (s_end - s != 2) goto invalid_format;
6907 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6909 encidx = rb_enc_find_index2(encname, (long)size);
6910 if (encidx < 0) {
6911 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6913 rb_enc_associate_index(undumped, encidx);
6915 break;
6918 if (*s == '\\') {
6919 s++;
6920 if (s >= s_end) {
6921 rb_raise(rb_eRuntimeError, "invalid escape");
6923 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6925 else {
6926 rb_str_cat(undumped, s++, 1);
6930 return undumped;
6931 invalid_format:
6932 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6935 static void
6936 rb_str_check_dummy_enc(rb_encoding *enc)
6938 if (rb_enc_dummy_p(enc)) {
6939 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6940 rb_enc_name(enc));
6944 static rb_encoding *
6945 str_true_enc(VALUE str)
6947 rb_encoding *enc = STR_ENC_GET(str);
6948 rb_str_check_dummy_enc(enc);
6949 return enc;
6952 static OnigCaseFoldType
6953 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6955 if (argc==0)
6956 return flags;
6957 if (argc>2)
6958 rb_raise(rb_eArgError, "too many options");
6959 if (argv[0]==sym_turkic) {
6960 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6961 if (argc==2) {
6962 if (argv[1]==sym_lithuanian)
6963 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6964 else
6965 rb_raise(rb_eArgError, "invalid second option");
6968 else if (argv[0]==sym_lithuanian) {
6969 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6970 if (argc==2) {
6971 if (argv[1]==sym_turkic)
6972 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6973 else
6974 rb_raise(rb_eArgError, "invalid second option");
6977 else if (argc>1)
6978 rb_raise(rb_eArgError, "too many options");
6979 else if (argv[0]==sym_ascii)
6980 flags |= ONIGENC_CASE_ASCII_ONLY;
6981 else if (argv[0]==sym_fold) {
6982 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6983 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6984 else
6985 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6987 else
6988 rb_raise(rb_eArgError, "invalid option");
6989 return flags;
6992 static inline bool
6993 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
6995 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
6996 return true;
6997 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7000 /* 16 should be long enough to absorb any kind of single character length increase */
7001 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7002 #ifndef CASEMAP_DEBUG
7003 # define CASEMAP_DEBUG 0
7004 #endif
7006 struct mapping_buffer;
7007 typedef struct mapping_buffer {
7008 size_t capa;
7009 size_t used;
7010 struct mapping_buffer *next;
7011 OnigUChar space[FLEX_ARY_LEN];
7012 } mapping_buffer;
7014 static void
7015 mapping_buffer_free(void *p)
7017 mapping_buffer *previous_buffer;
7018 mapping_buffer *current_buffer = p;
7019 while (current_buffer) {
7020 previous_buffer = current_buffer;
7021 current_buffer = current_buffer->next;
7022 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7026 static const rb_data_type_t mapping_buffer_type = {
7027 "mapping_buffer",
7028 {0, mapping_buffer_free,}
7031 static VALUE
7032 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7034 VALUE target;
7036 const OnigUChar *source_current, *source_end;
7037 int target_length = 0;
7038 VALUE buffer_anchor;
7039 mapping_buffer *current_buffer = 0;
7040 mapping_buffer **pre_buffer;
7041 size_t buffer_count = 0;
7042 int buffer_length_or_invalid;
7044 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7046 source_current = (OnigUChar*)RSTRING_PTR(source);
7047 source_end = (OnigUChar*)RSTRING_END(source);
7049 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7050 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7051 while (source_current < source_end) {
7052 /* increase multiplier using buffer count to converge quickly */
7053 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7054 if (CASEMAP_DEBUG) {
7055 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7057 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7058 *pre_buffer = current_buffer;
7059 pre_buffer = &current_buffer->next;
7060 current_buffer->next = NULL;
7061 current_buffer->capa = capa;
7062 buffer_length_or_invalid = enc->case_map(flags,
7063 &source_current, source_end,
7064 current_buffer->space,
7065 current_buffer->space+current_buffer->capa,
7066 enc);
7067 if (buffer_length_or_invalid < 0) {
7068 current_buffer = DATA_PTR(buffer_anchor);
7069 DATA_PTR(buffer_anchor) = 0;
7070 mapping_buffer_free(current_buffer);
7071 rb_raise(rb_eArgError, "input string invalid");
7073 target_length += current_buffer->used = buffer_length_or_invalid;
7075 if (CASEMAP_DEBUG) {
7076 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7079 if (buffer_count==1) {
7080 target = rb_str_new((const char*)current_buffer->space, target_length);
7082 else {
7083 char *target_current;
7085 target = rb_str_new(0, target_length);
7086 target_current = RSTRING_PTR(target);
7087 current_buffer = DATA_PTR(buffer_anchor);
7088 while (current_buffer) {
7089 memcpy(target_current, current_buffer->space, current_buffer->used);
7090 target_current += current_buffer->used;
7091 current_buffer = current_buffer->next;
7094 current_buffer = DATA_PTR(buffer_anchor);
7095 DATA_PTR(buffer_anchor) = 0;
7096 mapping_buffer_free(current_buffer);
7098 /* TODO: check about string terminator character */
7099 str_enc_copy(target, source);
7100 /*ENC_CODERANGE_SET(mapped, cr);*/
7102 return target;
7105 static VALUE
7106 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7108 const OnigUChar *source_current, *source_end;
7109 OnigUChar *target_current, *target_end;
7110 long old_length = RSTRING_LEN(source);
7111 int length_or_invalid;
7113 if (old_length == 0) return Qnil;
7115 source_current = (OnigUChar*)RSTRING_PTR(source);
7116 source_end = (OnigUChar*)RSTRING_END(source);
7117 if (source == target) {
7118 target_current = (OnigUChar*)source_current;
7119 target_end = (OnigUChar*)source_end;
7121 else {
7122 target_current = (OnigUChar*)RSTRING_PTR(target);
7123 target_end = (OnigUChar*)RSTRING_END(target);
7126 length_or_invalid = onigenc_ascii_only_case_map(flags,
7127 &source_current, source_end,
7128 target_current, target_end, enc);
7129 if (length_or_invalid < 0)
7130 rb_raise(rb_eArgError, "input string invalid");
7131 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7132 fprintf(stderr, "problem with rb_str_ascii_casemap"
7133 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7134 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7135 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7138 str_enc_copy(target, source);
7140 return target;
7143 static bool
7144 upcase_single(VALUE str)
7146 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7147 bool modified = false;
7149 while (s < send) {
7150 unsigned int c = *(unsigned char*)s;
7152 if ('a' <= c && c <= 'z') {
7153 *s = 'A' + (c - 'a');
7154 modified = true;
7156 s++;
7158 return modified;
7162 * call-seq:
7163 * upcase!(*options) -> self or nil
7165 * Upcases the characters in +self+;
7166 * returns +self+ if any changes were made, +nil+ otherwise:
7168 * s = 'Hello World!' # => "Hello World!"
7169 * s.upcase! # => "HELLO WORLD!"
7170 * s # => "HELLO WORLD!"
7171 * s.upcase! # => nil
7173 * The casing may be affected by the given +options+;
7174 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7176 * Related: String#upcase, String#downcase, String#downcase!.
7180 static VALUE
7181 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7183 rb_encoding *enc;
7184 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7186 flags = check_case_options(argc, argv, flags);
7187 str_modify_keep_cr(str);
7188 enc = str_true_enc(str);
7189 if (case_option_single_p(flags, enc, str)) {
7190 if (upcase_single(str))
7191 flags |= ONIGENC_CASE_MODIFIED;
7193 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7194 rb_str_ascii_casemap(str, str, &flags, enc);
7195 else
7196 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7198 if (ONIGENC_CASE_MODIFIED&flags) return str;
7199 return Qnil;
7204 * call-seq:
7205 * upcase(*options) -> string
7207 * Returns a string containing the upcased characters in +self+:
7209 * s = 'Hello World!' # => "Hello World!"
7210 * s.upcase # => "HELLO WORLD!"
7212 * The casing may be affected by the given +options+;
7213 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7215 * Related: String#upcase!, String#downcase, String#downcase!.
7219 static VALUE
7220 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7222 rb_encoding *enc;
7223 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7224 VALUE ret;
7226 flags = check_case_options(argc, argv, flags);
7227 enc = str_true_enc(str);
7228 if (case_option_single_p(flags, enc, str)) {
7229 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7230 str_enc_copy(ret, str);
7231 upcase_single(ret);
7233 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7234 ret = rb_str_new(0, RSTRING_LEN(str));
7235 rb_str_ascii_casemap(str, ret, &flags, enc);
7237 else {
7238 ret = rb_str_casemap(str, &flags, enc);
7241 return ret;
7244 static bool
7245 downcase_single(VALUE str)
7247 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7248 bool modified = false;
7250 while (s < send) {
7251 unsigned int c = *(unsigned char*)s;
7253 if ('A' <= c && c <= 'Z') {
7254 *s = 'a' + (c - 'A');
7255 modified = true;
7257 s++;
7260 return modified;
7264 * call-seq:
7265 * downcase!(*options) -> self or nil
7267 * Downcases the characters in +self+;
7268 * returns +self+ if any changes were made, +nil+ otherwise:
7270 * s = 'Hello World!' # => "Hello World!"
7271 * s.downcase! # => "hello world!"
7272 * s # => "hello world!"
7273 * s.downcase! # => nil
7275 * The casing may be affected by the given +options+;
7276 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7278 * Related: String#downcase, String#upcase, String#upcase!.
7282 static VALUE
7283 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7285 rb_encoding *enc;
7286 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7288 flags = check_case_options(argc, argv, flags);
7289 str_modify_keep_cr(str);
7290 enc = str_true_enc(str);
7291 if (case_option_single_p(flags, enc, str)) {
7292 if (downcase_single(str))
7293 flags |= ONIGENC_CASE_MODIFIED;
7295 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7296 rb_str_ascii_casemap(str, str, &flags, enc);
7297 else
7298 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7300 if (ONIGENC_CASE_MODIFIED&flags) return str;
7301 return Qnil;
7306 * call-seq:
7307 * downcase(*options) -> string
7309 * Returns a string containing the downcased characters in +self+:
7311 * s = 'Hello World!' # => "Hello World!"
7312 * s.downcase # => "hello world!"
7314 * The casing may be affected by the given +options+;
7315 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7317 * Related: String#downcase!, String#upcase, String#upcase!.
7321 static VALUE
7322 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7324 rb_encoding *enc;
7325 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7326 VALUE ret;
7328 flags = check_case_options(argc, argv, flags);
7329 enc = str_true_enc(str);
7330 if (case_option_single_p(flags, enc, str)) {
7331 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7332 str_enc_copy(ret, str);
7333 downcase_single(ret);
7335 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7336 ret = rb_str_new(0, RSTRING_LEN(str));
7337 rb_str_ascii_casemap(str, ret, &flags, enc);
7339 else {
7340 ret = rb_str_casemap(str, &flags, enc);
7343 return ret;
7348 * call-seq:
7349 * capitalize!(*options) -> self or nil
7351 * Upcases the first character in +self+;
7352 * downcases the remaining characters;
7353 * returns +self+ if any changes were made, +nil+ otherwise:
7355 * s = 'hello World!' # => "hello World!"
7356 * s.capitalize! # => "Hello world!"
7357 * s # => "Hello world!"
7358 * s.capitalize! # => nil
7360 * The casing may be affected by the given +options+;
7361 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7363 * Related: String#capitalize.
7367 static VALUE
7368 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7370 rb_encoding *enc;
7371 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7373 flags = check_case_options(argc, argv, flags);
7374 str_modify_keep_cr(str);
7375 enc = str_true_enc(str);
7376 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7377 if (flags&ONIGENC_CASE_ASCII_ONLY)
7378 rb_str_ascii_casemap(str, str, &flags, enc);
7379 else
7380 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7382 if (ONIGENC_CASE_MODIFIED&flags) return str;
7383 return Qnil;
7388 * call-seq:
7389 * capitalize(*options) -> string
7391 * Returns a string containing the characters in +self+;
7392 * the first character is upcased;
7393 * the remaining characters are downcased:
7395 * s = 'hello World!' # => "hello World!"
7396 * s.capitalize # => "Hello world!"
7398 * The casing may be affected by the given +options+;
7399 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7401 * Related: String#capitalize!.
7405 static VALUE
7406 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7408 rb_encoding *enc;
7409 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7410 VALUE ret;
7412 flags = check_case_options(argc, argv, flags);
7413 enc = str_true_enc(str);
7414 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7415 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7416 ret = rb_str_new(0, RSTRING_LEN(str));
7417 rb_str_ascii_casemap(str, ret, &flags, enc);
7419 else {
7420 ret = rb_str_casemap(str, &flags, enc);
7422 return ret;
7427 * call-seq:
7428 * swapcase!(*options) -> self or nil
7430 * Upcases each lowercase character in +self+;
7431 * downcases each uppercase character;
7432 * returns +self+ if any changes were made, +nil+ otherwise:
7434 * s = 'Hello World!' # => "Hello World!"
7435 * s.swapcase! # => "hELLO wORLD!"
7436 * s # => "hELLO wORLD!"
7437 * ''.swapcase! # => nil
7439 * The casing may be affected by the given +options+;
7440 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7442 * Related: String#swapcase.
7446 static VALUE
7447 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7449 rb_encoding *enc;
7450 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7452 flags = check_case_options(argc, argv, flags);
7453 str_modify_keep_cr(str);
7454 enc = str_true_enc(str);
7455 if (flags&ONIGENC_CASE_ASCII_ONLY)
7456 rb_str_ascii_casemap(str, str, &flags, enc);
7457 else
7458 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7460 if (ONIGENC_CASE_MODIFIED&flags) return str;
7461 return Qnil;
7466 * call-seq:
7467 * swapcase(*options) -> string
7469 * Returns a string containing the characters in +self+, with cases reversed;
7470 * each uppercase character is downcased;
7471 * each lowercase character is upcased:
7473 * s = 'Hello World!' # => "Hello World!"
7474 * s.swapcase # => "hELLO wORLD!"
7476 * The casing may be affected by the given +options+;
7477 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7479 * Related: String#swapcase!.
7483 static VALUE
7484 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7486 rb_encoding *enc;
7487 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7488 VALUE ret;
7490 flags = check_case_options(argc, argv, flags);
7491 enc = str_true_enc(str);
7492 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7493 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7494 ret = rb_str_new(0, RSTRING_LEN(str));
7495 rb_str_ascii_casemap(str, ret, &flags, enc);
7497 else {
7498 ret = rb_str_casemap(str, &flags, enc);
7500 return ret;
typedef unsigned char *USTR;

/*
 * Cursor over a transliteration specification, advanced by trnext().
 */
struct tr {
    int gen;                /* nonzero while a "a-z" style range is being expanded */
    unsigned int now, max;  /* last code point produced; upper end of the range */
    char *p, *pend;         /* read position in the specification and its end */
};
7511 static unsigned int
7512 trnext(struct tr *t, rb_encoding *enc)
7514 int n;
7516 for (;;) {
7517 nextpart:
7518 if (!t->gen) {
7519 if (t->p == t->pend) return -1;
7520 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7521 t->p += n;
7523 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7524 t->p += n;
7525 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7526 t->p += n;
7527 if (t->p < t->pend) {
7528 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7529 t->p += n;
7530 if (t->now > c) {
7531 if (t->now < 0x80 && c < 0x80) {
7532 rb_raise(rb_eArgError,
7533 "invalid range \"%c-%c\" in string transliteration",
7534 t->now, c);
7536 else {
7537 rb_raise(rb_eArgError, "invalid range in string transliteration");
7539 continue; /* not reached */
7541 t->gen = 1;
7542 t->max = c;
7545 return t->now;
7547 else {
7548 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7549 if (t->now == t->max) {
7550 t->gen = 0;
7551 goto nextpart;
7554 if (t->now < t->max) {
7555 return t->now;
7557 else {
7558 t->gen = 0;
7559 return t->max;
7565 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7567 static VALUE
7568 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7570 const unsigned int errc = -1;
7571 unsigned int trans[256];
7572 rb_encoding *enc, *e1, *e2;
7573 struct tr trsrc, trrepl;
7574 int cflag = 0;
7575 unsigned int c, c0, last = 0;
7576 int modify = 0, i, l;
7577 unsigned char *s, *send;
7578 VALUE hash = 0;
7579 int singlebyte = single_byte_optimizable(str);
7580 int termlen;
7581 int cr;
7583 #define CHECK_IF_ASCII(c) \
7584 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7585 (cr = ENC_CODERANGE_VALID) : 0)
7587 StringValue(src);
7588 StringValue(repl);
7589 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7590 if (RSTRING_LEN(repl) == 0) {
7591 return rb_str_delete_bang(1, &src, str);
7594 cr = ENC_CODERANGE(str);
7595 e1 = rb_enc_check(str, src);
7596 e2 = rb_enc_check(str, repl);
7597 if (e1 == e2) {
7598 enc = e1;
7600 else {
7601 enc = rb_enc_check(src, repl);
7603 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7604 if (RSTRING_LEN(src) > 1 &&
7605 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7606 trsrc.p + l < trsrc.pend) {
7607 cflag = 1;
7608 trsrc.p += l;
7610 trrepl.p = RSTRING_PTR(repl);
7611 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7612 trsrc.gen = trrepl.gen = 0;
7613 trsrc.now = trrepl.now = 0;
7614 trsrc.max = trrepl.max = 0;
7616 if (cflag) {
7617 for (i=0; i<256; i++) {
7618 trans[i] = 1;
7620 while ((c = trnext(&trsrc, enc)) != errc) {
7621 if (c < 256) {
7622 trans[c] = errc;
7624 else {
7625 if (!hash) hash = rb_hash_new();
7626 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7629 while ((c = trnext(&trrepl, enc)) != errc)
7630 /* retrieve last replacer */;
7631 last = trrepl.now;
7632 for (i=0; i<256; i++) {
7633 if (trans[i] != errc) {
7634 trans[i] = last;
7638 else {
7639 unsigned int r;
7641 for (i=0; i<256; i++) {
7642 trans[i] = errc;
7644 while ((c = trnext(&trsrc, enc)) != errc) {
7645 r = trnext(&trrepl, enc);
7646 if (r == errc) r = trrepl.now;
7647 if (c < 256) {
7648 trans[c] = r;
7649 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7651 else {
7652 if (!hash) hash = rb_hash_new();
7653 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7658 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7659 cr = ENC_CODERANGE_7BIT;
7660 str_modify_keep_cr(str);
7661 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7662 termlen = rb_enc_mbminlen(enc);
7663 if (sflag) {
7664 int clen, tlen;
7665 long offset, max = RSTRING_LEN(str);
7666 unsigned int save = -1;
7667 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7669 while (s < send) {
7670 int may_modify = 0;
7672 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7673 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7675 s += clen;
7676 if (c < 256) {
7677 c = trans[c];
7679 else if (hash) {
7680 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7681 if (NIL_P(tmp)) {
7682 if (cflag) c = last;
7683 else c = errc;
7685 else if (cflag) c = errc;
7686 else c = NUM2INT(tmp);
7688 else {
7689 c = errc;
7691 if (c != (unsigned int)-1) {
7692 if (save == c) {
7693 CHECK_IF_ASCII(c);
7694 continue;
7696 save = c;
7697 tlen = rb_enc_codelen(c, enc);
7698 modify = 1;
7700 else {
7701 save = -1;
7702 c = c0;
7703 if (enc != e1) may_modify = 1;
7705 if ((offset = t - buf) + tlen > max) {
7706 size_t MAYBE_UNUSED(old) = max + termlen;
7707 max = offset + tlen + (send - s);
7708 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7709 t = buf + offset;
7711 rb_enc_mbcput(c, t, enc);
7712 if (may_modify && memcmp(s, t, tlen) != 0) {
7713 modify = 1;
7715 CHECK_IF_ASCII(c);
7716 t += tlen;
7718 if (!STR_EMBED_P(str)) {
7719 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7721 TERM_FILL((char *)t, termlen);
7722 RSTRING(str)->as.heap.ptr = (char *)buf;
7723 RSTRING(str)->as.heap.len = t - buf;
7724 STR_SET_NOEMBED(str);
7725 RSTRING(str)->as.heap.aux.capa = max;
7727 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7728 while (s < send) {
7729 c = (unsigned char)*s;
7730 if (trans[c] != errc) {
7731 if (!cflag) {
7732 c = trans[c];
7733 *s = c;
7734 modify = 1;
7736 else {
7737 *s = last;
7738 modify = 1;
7741 CHECK_IF_ASCII(c);
7742 s++;
7745 else {
7746 int clen, tlen;
7747 long offset, max = (long)((send - s) * 1.2);
7748 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7750 while (s < send) {
7751 int may_modify = 0;
7752 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7753 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7755 if (c < 256) {
7756 c = trans[c];
7758 else if (hash) {
7759 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7760 if (NIL_P(tmp)) {
7761 if (cflag) c = last;
7762 else c = errc;
7764 else if (cflag) c = errc;
7765 else c = NUM2INT(tmp);
7767 else {
7768 c = cflag ? last : errc;
7770 if (c != errc) {
7771 tlen = rb_enc_codelen(c, enc);
7772 modify = 1;
7774 else {
7775 c = c0;
7776 if (enc != e1) may_modify = 1;
7778 if ((offset = t - buf) + tlen > max) {
7779 size_t MAYBE_UNUSED(old) = max + termlen;
7780 max = offset + tlen + (long)((send - s) * 1.2);
7781 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7782 t = buf + offset;
7784 if (s != t) {
7785 rb_enc_mbcput(c, t, enc);
7786 if (may_modify && memcmp(s, t, tlen) != 0) {
7787 modify = 1;
7790 CHECK_IF_ASCII(c);
7791 s += clen;
7792 t += tlen;
7794 if (!STR_EMBED_P(str)) {
7795 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7797 TERM_FILL((char *)t, termlen);
7798 RSTRING(str)->as.heap.ptr = (char *)buf;
7799 RSTRING(str)->as.heap.len = t - buf;
7800 STR_SET_NOEMBED(str);
7801 RSTRING(str)->as.heap.aux.capa = max;
7804 if (modify) {
7805 if (cr != ENC_CODERANGE_BROKEN)
7806 ENC_CODERANGE_SET(str, cr);
7807 rb_enc_associate(str, enc);
7808 return str;
7810 return Qnil;
7815 * call-seq:
7816 * str.tr!(from_str, to_str) -> str or nil
7818 * Translates <i>str</i> in place, using the same rules as
7819 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7820 * were made.
7823 static VALUE
7824 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7826 return tr_trans(str, src, repl, 0);
7831 * call-seq:
7832 * str.tr(from_str, to_str) => new_str
7834 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7835 * corresponding characters in +to_str+. If +to_str+ is shorter than
7836 * +from_str+, it is padded with its last character in order to maintain the
7837 * correspondence.
7839 * "hello".tr('el', 'ip') #=> "hippo"
7840 * "hello".tr('aeiou', '*') #=> "h*ll*"
7841 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7843 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7844 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7845 * all characters except those listed.
7847 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7848 * "hello".tr('^aeiou', '*') #=> "*e**o"
7850 * The backslash character <code>\\</code> can be used to escape
7851 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7852 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7854 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7855 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7857 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7858 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7859 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7861 * "X['\\b']".tr("X\\", "") #=> "['b']"
7862 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7865 static VALUE
7866 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7868 str = str_duplicate(rb_cString, str);
7869 tr_trans(str, src, repl, 0);
7870 return str;
7873 #define TR_TABLE_MAX (UCHAR_MAX+1)
7874 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7875 static void
7876 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7877 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7879 const unsigned int errc = -1;
7880 char buf[TR_TABLE_MAX];
7881 struct tr tr;
7882 unsigned int c;
7883 VALUE table = 0, ptable = 0;
7884 int i, l, cflag = 0;
7886 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7887 tr.gen = tr.now = tr.max = 0;
7889 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7890 cflag = 1;
7891 tr.p += l;
7893 if (first) {
7894 for (i=0; i<TR_TABLE_MAX; i++) {
7895 stable[i] = 1;
7897 stable[TR_TABLE_MAX] = cflag;
7899 else if (stable[TR_TABLE_MAX] && !cflag) {
7900 stable[TR_TABLE_MAX] = 0;
7902 for (i=0; i<TR_TABLE_MAX; i++) {
7903 buf[i] = cflag;
7906 while ((c = trnext(&tr, enc)) != errc) {
7907 if (c < TR_TABLE_MAX) {
7908 buf[(unsigned char)c] = !cflag;
7910 else {
7911 VALUE key = UINT2NUM(c);
7913 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
7914 if (cflag) {
7915 ptable = *ctablep;
7916 table = ptable ? ptable : rb_hash_new();
7917 *ctablep = table;
7919 else {
7920 table = rb_hash_new();
7921 ptable = *tablep;
7922 *tablep = table;
7925 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7926 rb_hash_aset(table, key, Qtrue);
7930 for (i=0; i<TR_TABLE_MAX; i++) {
7931 stable[i] = stable[i] && buf[i];
7933 if (!table && !cflag) {
7934 *tablep = 0;
7939 static int
7940 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7942 if (c < TR_TABLE_MAX) {
7943 return table[c] != 0;
7945 else {
7946 VALUE v = UINT2NUM(c);
7948 if (del) {
7949 if (!NIL_P(rb_hash_lookup(del, v)) &&
7950 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7951 return TRUE;
7954 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7955 return FALSE;
7957 return table[TR_TABLE_MAX] ? TRUE : FALSE;
7962 * call-seq:
7963 * str.delete!([other_str]+) -> str or nil
7965 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7966 * <code>nil</code> if <i>str</i> was not modified.
7969 static VALUE
7970 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7972 char squeez[TR_TABLE_SIZE];
7973 rb_encoding *enc = 0;
7974 char *s, *send, *t;
7975 VALUE del = 0, nodel = 0;
7976 int modify = 0;
7977 int i, ascompat, cr;
7979 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7980 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7981 for (i=0; i<argc; i++) {
7982 VALUE s = argv[i];
7984 StringValue(s);
7985 enc = rb_enc_check(str, s);
7986 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7989 str_modify_keep_cr(str);
7990 ascompat = rb_enc_asciicompat(enc);
7991 s = t = RSTRING_PTR(str);
7992 send = RSTRING_END(str);
7993 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7994 while (s < send) {
7995 unsigned int c;
7996 int clen;
7998 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7999 if (squeez[c]) {
8000 modify = 1;
8002 else {
8003 if (t != s) *t = c;
8004 t++;
8006 s++;
8008 else {
8009 c = rb_enc_codepoint_len(s, send, &clen, enc);
8011 if (tr_find(c, squeez, del, nodel)) {
8012 modify = 1;
8014 else {
8015 if (t != s) rb_enc_mbcput(c, t, enc);
8016 t += clen;
8017 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8019 s += clen;
8022 TERM_FILL(t, TERM_LEN(str));
8023 STR_SET_LEN(str, t - RSTRING_PTR(str));
8024 ENC_CODERANGE_SET(str, cr);
8026 if (modify) return str;
8027 return Qnil;
8032 * call-seq:
8033 * str.delete([other_str]+) -> new_str
8035 * Returns a copy of <i>str</i> with all characters in the intersection of its
8036 * arguments deleted. Uses the same rules for building the set of characters as
8037 * String#count.
8039 * "hello".delete "l","lo" #=> "heo"
8040 * "hello".delete "lo" #=> "he"
8041 * "hello".delete "aeiou", "^e" #=> "hell"
8042 * "hello".delete "ej-m" #=> "ho"
8045 static VALUE
8046 rb_str_delete(int argc, VALUE *argv, VALUE str)
8048 str = str_duplicate(rb_cString, str);
8049 rb_str_delete_bang(argc, argv, str);
8050 return str;
8055 * call-seq:
8056 * str.squeeze!([other_str]*) -> str or nil
8058 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8059 * <code>nil</code> if no changes were made.
8062 static VALUE
8063 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8065 char squeez[TR_TABLE_SIZE];
8066 rb_encoding *enc = 0;
8067 VALUE del = 0, nodel = 0;
8068 unsigned char *s, *send, *t;
8069 int i, modify = 0;
8070 int ascompat, singlebyte = single_byte_optimizable(str);
8071 unsigned int save;
8073 if (argc == 0) {
8074 enc = STR_ENC_GET(str);
8076 else {
8077 for (i=0; i<argc; i++) {
8078 VALUE s = argv[i];
8080 StringValue(s);
8081 enc = rb_enc_check(str, s);
8082 if (singlebyte && !single_byte_optimizable(s))
8083 singlebyte = 0;
8084 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8088 str_modify_keep_cr(str);
8089 s = t = (unsigned char *)RSTRING_PTR(str);
8090 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8091 send = (unsigned char *)RSTRING_END(str);
8092 save = -1;
8093 ascompat = rb_enc_asciicompat(enc);
8095 if (singlebyte) {
8096 while (s < send) {
8097 unsigned int c = *s++;
8098 if (c != save || (argc > 0 && !squeez[c])) {
8099 *t++ = save = c;
8103 else {
8104 while (s < send) {
8105 unsigned int c;
8106 int clen;
8108 if (ascompat && (c = *s) < 0x80) {
8109 if (c != save || (argc > 0 && !squeez[c])) {
8110 *t++ = save = c;
8112 s++;
8114 else {
8115 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8117 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8118 if (t != s) rb_enc_mbcput(c, t, enc);
8119 save = c;
8120 t += clen;
8122 s += clen;
8127 TERM_FILL((char *)t, TERM_LEN(str));
8128 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8129 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8130 modify = 1;
8133 if (modify) return str;
8134 return Qnil;
8139 * call-seq:
8140 * str.squeeze([other_str]*) -> new_str
8142 * Builds a set of characters from the <i>other_str</i> parameter(s)
8143 * using the procedure described for String#count. Returns a new
8144 * string where runs of the same character that occur in this set are
8145 * replaced by a single character. If no arguments are given, all
8146 * runs of identical characters are replaced by a single character.
8148 * "yellow moon".squeeze #=> "yelow mon"
8149 * " now is the".squeeze(" ") #=> " now is the"
8150 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8153 static VALUE
8154 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8156 str = str_duplicate(rb_cString, str);
8157 rb_str_squeeze_bang(argc, argv, str);
8158 return str;
8163 * call-seq:
8164 * str.tr_s!(from_str, to_str) -> str or nil
8166 * Performs String#tr_s processing on <i>str</i> in place,
8167 * returning <i>str</i>, or <code>nil</code> if no changes were made.
8170 static VALUE
8171 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8173 return tr_trans(str, src, repl, 1);
8178 * call-seq:
8179 * str.tr_s(from_str, to_str) -> new_str
8181 * Processes a copy of <i>str</i> as described under String#tr, then
8182 * removes duplicate characters in regions that were affected by the
8183 * translation.
8185 * "hello".tr_s('l', 'r') #=> "hero"
8186 * "hello".tr_s('el', '*') #=> "h*o"
8187 * "hello".tr_s('el', 'hx') #=> "hhxo"
8190 static VALUE
8191 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8193 str = str_duplicate(rb_cString, str);
8194 tr_trans(str, src, repl, 1);
8195 return str;
8200 * call-seq:
8201 * str.count([other_str]+) -> integer
8203 * Each +other_str+ parameter defines a set of characters to count. The
8204 * intersection of these sets defines the characters to count in +str+. Any
8205 * +other_str+ that starts with a caret <code>^</code> is negated. The
8206 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8207 * backslash character <code>\\</code> can be used to escape <code>^</code> or
8208 * <code>-</code> and is otherwise ignored unless it appears at the end of a
8209 * sequence or the end of a +other_str+.
8211 * a = "hello world"
8212 * a.count "lo" #=> 5
8213 * a.count "lo", "o" #=> 2
8214 * a.count "hello", "^l" #=> 4
8215 * a.count "ej-m" #=> 4
8217 * "hello^world".count "\\^aeiou" #=> 4
8218 * "hello-world".count "a\\-eo" #=> 4
8220 * c = "hello world\\r\\n"
8221 * c.count "\\" #=> 2
8222 * c.count "\\A" #=> 0
8223 * c.count "X-\\w" #=> 3
8226 static VALUE
8227 rb_str_count(int argc, VALUE *argv, VALUE str)
8229 char table[TR_TABLE_SIZE];
8230 rb_encoding *enc = 0;
8231 VALUE del = 0, nodel = 0, tstr;
8232 char *s, *send;
8233 int i;
8234 int ascompat;
8235 size_t n = 0;
8237 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8239 tstr = argv[0];
8240 StringValue(tstr);
8241 enc = rb_enc_check(str, tstr);
8242 if (argc == 1) {
8243 const char *ptstr;
8244 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8245 (ptstr = RSTRING_PTR(tstr),
8246 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8247 !is_broken_string(str)) {
8248 int clen;
8249 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8251 s = RSTRING_PTR(str);
8252 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8253 send = RSTRING_END(str);
8254 while (s < send) {
8255 if (*(unsigned char*)s++ == c) n++;
8257 return SIZET2NUM(n);
8261 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8262 for (i=1; i<argc; i++) {
8263 tstr = argv[i];
8264 StringValue(tstr);
8265 enc = rb_enc_check(str, tstr);
8266 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8269 s = RSTRING_PTR(str);
8270 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8271 send = RSTRING_END(str);
8272 ascompat = rb_enc_asciicompat(enc);
8273 while (s < send) {
8274 unsigned int c;
8276 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8277 if (table[c]) {
8278 n++;
8280 s++;
8282 else {
8283 int clen;
8284 c = rb_enc_codepoint_len(s, send, &clen, enc);
8285 if (tr_find(c, table, del, nodel)) {
8286 n++;
8288 s += clen;
8292 return SIZET2NUM(n);
8295 static VALUE
8296 rb_fs_check(VALUE val)
8298 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8299 val = rb_check_string_type(val);
8300 if (NIL_P(val)) return 0;
8302 return val;
/*
 * Byte-indexed lookup table: 1 for the six bytes treated as whitespace
 * (0x09-0x0d and 0x20), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8326 static long
8327 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8329 if (empty_count >= 0 && len == 0) {
8330 return empty_count + 1;
8332 if (empty_count > 0) {
8333 /* make different substrings */
8334 if (result) {
8335 do {
8336 rb_ary_push(result, str_new_empty_String(str));
8337 } while (--empty_count > 0);
8339 else {
8340 do {
8341 rb_yield(str_new_empty_String(str));
8342 } while (--empty_count > 0);
8345 str = rb_str_subseq(str, beg, len);
8346 if (result) {
8347 rb_ary_push(result, str);
8349 else {
8350 rb_yield(str);
8352 return empty_count;
/* Strategy chosen by String#split for a given pattern. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
8359 static split_type_t
8360 literal_split_pattern(VALUE spat, split_type_t default_type)
8362 rb_encoding *enc = STR_ENC_GET(spat);
8363 const char *ptr;
8364 long len;
8365 RSTRING_GETMEM(spat, ptr, len);
8366 if (len == 0) {
8367 /* Special case - split into chars */
8368 return SPLIT_TYPE_CHARS;
8370 else if (rb_enc_asciicompat(enc)) {
8371 if (len == 1 && ptr[0] == ' ') {
8372 return SPLIT_TYPE_AWK;
8375 else {
8376 int l;
8377 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8378 return SPLIT_TYPE_AWK;
8381 return default_type;
8385 * call-seq:
8386 * str.split(pattern=nil, [limit]) -> an_array
8387 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8389 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8390 * of these substrings.
8392 * If <i>pattern</i> is a String, then its contents are used as
8393 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8394 * space, <i>str</i> is split on whitespace, with leading and trailing
8395 * whitespace and runs of contiguous whitespace characters ignored.
8397 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8398 * pattern matches. Whenever the pattern matches a zero-length string,
8399 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8400 * groups, the respective matches will be returned in the array as well.
8402 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8403 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8404 * split on whitespace as if ' ' were specified.
8406 * If the <i>limit</i> parameter is omitted, trailing null fields are
8407 * suppressed. If <i>limit</i> is a positive number, at most that number
8408 * of split substrings will be returned (captured groups will be returned
8409 * as well, but are not counted towards the limit).
8410 * If <i>limit</i> is <code>1</code>, the entire
8411 * string is returned as the only entry in an array. If negative, there is no
8412 * limit to the number of fields returned, and trailing null fields are not
8413 * suppressed.
8415 * When the input +str+ is empty an empty Array is returned as the string is
8416 * considered to have no fields to split.
8418 * " now's the time ".split #=> ["now's", "the", "time"]
8419 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
8420 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8421 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8422 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8423 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8424 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8426 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8427 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8428 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8429 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8431 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8433 * "".split(',', -1) #=> []
8435 * If a block is given, invoke the block with each split substring.
8439 static VALUE
8440 rb_str_split_m(int argc, VALUE *argv, VALUE str)
8442 rb_encoding *enc;
8443 VALUE spat;
8444 VALUE limit;
8445 split_type_t split_type;
8446 long beg, end, i = 0, empty_count = -1;
8447 int lim = 0;
8448 VALUE result, tmp;
8450 result = rb_block_given_p() ? Qfalse : Qnil;
8451 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8452 lim = NUM2INT(limit);
8453 if (lim <= 0) limit = Qnil;
8454 else if (lim == 1) {
8455 if (RSTRING_LEN(str) == 0)
8456 return result ? rb_ary_new2(0) : str;
8457 tmp = str_duplicate(rb_cString, str);
8458 if (!result) {
8459 rb_yield(tmp);
8460 return str;
8462 return rb_ary_new3(1, tmp);
8464 i = 1;
8466 if (NIL_P(limit) && !lim) empty_count = 0;
8468 enc = STR_ENC_GET(str);
8469 split_type = SPLIT_TYPE_REGEXP;
8470 if (!NIL_P(spat)) {
8471 spat = get_pat_quoted(spat, 0);
8473 else if (NIL_P(spat = rb_fs)) {
8474 split_type = SPLIT_TYPE_AWK;
8476 else if (!(spat = rb_fs_check(spat))) {
8477 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8479 else {
8480 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8482 if (split_type != SPLIT_TYPE_AWK) {
8483 switch (BUILTIN_TYPE(spat)) {
8484 case T_REGEXP:
8485 rb_reg_options(spat); /* check if uninitialized */
8486 tmp = RREGEXP_SRC(spat);
8487 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8488 if (split_type == SPLIT_TYPE_AWK) {
8489 spat = tmp;
8490 split_type = SPLIT_TYPE_STRING;
8492 break;
8494 case T_STRING:
8495 mustnot_broken(spat);
8496 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8497 break;
8499 default:
8500 UNREACHABLE_RETURN(Qnil);
8504 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8506 if (result) result = rb_ary_new();
8507 beg = 0;
8508 char *ptr = RSTRING_PTR(str);
8509 char *eptr = RSTRING_END(str);
8510 if (split_type == SPLIT_TYPE_AWK) {
8511 char *bptr = ptr;
8512 int skip = 1;
8513 unsigned int c;
8515 end = beg;
8516 if (is_ascii_string(str)) {
8517 while (ptr < eptr) {
8518 c = (unsigned char)*ptr++;
8519 if (skip) {
8520 if (ascii_isspace(c)) {
8521 beg = ptr - bptr;
8523 else {
8524 end = ptr - bptr;
8525 skip = 0;
8526 if (!NIL_P(limit) && lim <= i) break;
8529 else if (ascii_isspace(c)) {
8530 SPLIT_STR(beg, end-beg);
8531 skip = 1;
8532 beg = ptr - bptr;
8533 if (!NIL_P(limit)) ++i;
8535 else {
8536 end = ptr - bptr;
8540 else {
8541 while (ptr < eptr) {
8542 int n;
8544 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8545 ptr += n;
8546 if (skip) {
8547 if (rb_isspace(c)) {
8548 beg = ptr - bptr;
8550 else {
8551 end = ptr - bptr;
8552 skip = 0;
8553 if (!NIL_P(limit) && lim <= i) break;
8556 else if (rb_isspace(c)) {
8557 SPLIT_STR(beg, end-beg);
8558 skip = 1;
8559 beg = ptr - bptr;
8560 if (!NIL_P(limit)) ++i;
8562 else {
8563 end = ptr - bptr;
8568 else if (split_type == SPLIT_TYPE_STRING) {
8569 char *str_start = ptr;
8570 char *substr_start = ptr;
8571 char *sptr = RSTRING_PTR(spat);
8572 long slen = RSTRING_LEN(spat);
8574 mustnot_broken(str);
8575 enc = rb_enc_check(str, spat);
8576 while (ptr < eptr &&
8577 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8578 /* Check we are at the start of a char */
8579 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8580 if (t != ptr + end) {
8581 ptr = t;
8582 continue;
8584 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8585 ptr += end + slen;
8586 substr_start = ptr;
8587 if (!NIL_P(limit) && lim <= ++i) break;
8589 beg = ptr - str_start;
8591 else if (split_type == SPLIT_TYPE_CHARS) {
8592 char *str_start = ptr;
8593 int n;
8595 mustnot_broken(str);
8596 enc = rb_enc_get(str);
8597 while (ptr < eptr &&
8598 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8599 SPLIT_STR(ptr - str_start, n);
8600 ptr += n;
8601 if (!NIL_P(limit) && lim <= ++i) break;
8603 beg = ptr - str_start;
8605 else {
8606 long len = RSTRING_LEN(str);
8607 long start = beg;
8608 long idx;
8609 int last_null = 0;
8610 struct re_registers *regs;
8611 VALUE match = 0;
8613 for (; rb_reg_search(spat, str, start, 0) >= 0;
8614 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8615 match = rb_backref_get();
8616 if (!result) rb_match_busy(match);
8617 regs = RMATCH_REGS(match);
8618 end = BEG(0);
8619 if (start == end && BEG(0) == END(0)) {
8620 if (!ptr) {
8621 SPLIT_STR(0, 0);
8622 break;
8624 else if (last_null == 1) {
8625 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8626 beg = start;
8628 else {
8629 if (start == len)
8630 start++;
8631 else
8632 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8633 last_null = 1;
8634 continue;
8637 else {
8638 SPLIT_STR(beg, end-beg);
8639 beg = start = END(0);
8641 last_null = 0;
8643 for (idx=1; idx < regs->num_regs; idx++) {
8644 if (BEG(idx) == -1) continue;
8645 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8647 if (!NIL_P(limit) && lim <= ++i) break;
8649 if (match) rb_match_unbusy(match);
8651 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8652 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8655 return result ? result : str;
8658 VALUE
8659 rb_str_split(VALUE str, const char *sep0)
8661 VALUE sep;
8663 StringValue(str);
8664 sep = rb_str_new_cstr(sep0);
8665 return rb_str_split_m(1, &sep, str);
/* Evaluates to 0 when a block is given, otherwise to a new array with capacity +size+ (+m+ is unused). */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8670 static inline int
8671 enumerator_element(VALUE ary, VALUE e)
8673 if (ary) {
8674 rb_ary_push(ary, e);
8675 return 0;
8677 else {
8678 rb_yield(e);
8679 return 1;
8683 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8685 static const char *
8686 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8688 const char *prev = rb_enc_prev_char(p, e, e, enc);
8689 if (rb_enc_is_newline(prev, e, enc)) {
8690 e = prev;
8691 prev = rb_enc_prev_char(p, e, e, enc);
8692 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8693 e = prev;
8695 return e;
8698 static VALUE
8699 get_rs(void)
8701 VALUE rs = rb_rs;
8702 if (!NIL_P(rs) &&
8703 (!RB_TYPE_P(rs, T_STRING) ||
8704 RSTRING_LEN(rs) != 1 ||
8705 RSTRING_PTR(rs)[0] != '\n')) {
8706 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8708 return rs;
8711 #define rb_rs get_rs()
8713 static VALUE
8714 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8716 rb_encoding *enc;
8717 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8718 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8719 long pos, len, rslen;
8720 int rsnewline = 0;
8722 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8723 rs = rb_rs;
8724 if (!NIL_P(opts)) {
8725 static ID keywords[1];
8726 if (!keywords[0]) {
8727 keywords[0] = rb_intern_const("chomp");
8729 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8730 chomp = (chomp != Qundef && RTEST(chomp));
8733 if (NIL_P(rs)) {
8734 if (!ENUM_ELEM(ary, str)) {
8735 return ary;
8737 else {
8738 return orig;
8742 if (!RSTRING_LEN(str)) goto end;
8743 str = rb_str_new_frozen(str);
8744 ptr = subptr = RSTRING_PTR(str);
8745 pend = RSTRING_END(str);
8746 len = RSTRING_LEN(str);
8747 StringValue(rs);
8748 rslen = RSTRING_LEN(rs);
8750 if (rs == rb_default_rs)
8751 enc = rb_enc_get(str);
8752 else
8753 enc = rb_enc_check(str, rs);
8755 if (rslen == 0) {
8756 /* paragraph mode */
8757 int n;
8758 const char *eol = NULL;
8759 subend = subptr;
8760 while (subend < pend) {
8761 do {
8762 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8763 n = 0;
8764 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8765 if (rb_enc_is_newline(subend + n, pend, enc)) {
8766 if (eol == subend) break;
8767 subend += rslen;
8768 if (subptr) eol = subend;
8770 else {
8771 if (!subptr) subptr = subend;
8772 subend += rslen;
8774 rslen = 0;
8775 } while (subend < pend);
8776 if (!subptr) break;
8777 line = rb_str_subseq(str, subptr - ptr,
8778 subend - subptr + (chomp ? 0 : rslen));
8779 if (ENUM_ELEM(ary, line)) {
8780 str_mod_check(str, ptr, len);
8782 subptr = eol = NULL;
8784 goto end;
8786 else {
8787 rsptr = RSTRING_PTR(rs);
8788 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8789 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8790 rsnewline = 1;
8794 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8795 rs = rb_str_new(rsptr, rslen);
8796 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8797 rsptr = RSTRING_PTR(rs);
8798 rslen = RSTRING_LEN(rs);
8801 while (subptr < pend) {
8802 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8803 if (pos < 0) break;
8804 hit = subptr + pos;
8805 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8806 if (hit != adjusted) {
8807 subptr = adjusted;
8808 continue;
8810 subend = hit += rslen;
8811 if (chomp) {
8812 if (rsnewline) {
8813 subend = chomp_newline(subptr, subend, enc);
8815 else {
8816 subend -= rslen;
8819 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8820 if (ENUM_ELEM(ary, line)) {
8821 str_mod_check(str, ptr, len);
8823 subptr = hit;
8826 if (subptr != pend) {
8827 if (chomp) {
8828 if (rsnewline) {
8829 pend = chomp_newline(subptr, pend, enc);
8831 else if (pend - subptr >= rslen &&
8832 memcmp(pend - rslen, rsptr, rslen) == 0) {
8833 pend -= rslen;
8836 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8837 ENUM_ELEM(ary, line);
8838 RB_GC_GUARD(str);
8841 end:
8842 if (ary)
8843 return ary;
8844 else
8845 return orig;
8849 * call-seq:
8850 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8851 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8853 * Splits <i>str</i> using the supplied parameter as the record
8854 * separator (<code>$/</code> by default), passing each substring in
8855 * turn to the supplied block. If a zero-length record separator is
8856 * supplied, the string is split into paragraphs delimited by
8857 * multiple successive newlines.
8859 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8860 * line.
8862 * If no block is given, an enumerator is returned instead.
8864 * "hello\nworld".each_line {|s| p s}
8865 * # prints:
8866 * # "hello\n"
8867 * # "world"
8869 * "hello\nworld".each_line('l') {|s| p s}
8870 * # prints:
8871 * # "hel"
8872 * # "l"
8873 * # "o\nworl"
8874 * # "d"
8876 * "hello\n\n\nworld".each_line('') {|s| p s}
8877 * # prints
8878 * # "hello\n\n"
8879 * # "world"
8881 * "hello\nworld".each_line(chomp: true) {|s| p s}
8882 * # prints:
8883 * # "hello"
8884 * # "world"
8886 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8887 * # prints:
8888 * # "he"
8889 * # ""
8890 * # "o\nwor"
8891 * # "d"
8895 static VALUE
8896 rb_str_each_line(int argc, VALUE *argv, VALUE str)
8898 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
8899 return rb_str_enumerate_lines(argc, argv, str, 0);
8903 * call-seq:
8904 * str.lines(separator=$/, chomp: false) -> an_array
8906 * Returns an array of lines in <i>str</i> split using the supplied
8907 * record separator (<code>$/</code> by default). This is a
8908 * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8910 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8911 * line.
8913 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8914 * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8915 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8917 * If a block is given, which is a deprecated form, works the same as
8918 * <code>each_line</code>.
8921 static VALUE
8922 rb_str_lines(int argc, VALUE *argv, VALUE str)
8924 VALUE ary = WANTARRAY("lines", 0);
8925 return rb_str_enumerate_lines(argc, argv, str, ary);
8928 static VALUE
8929 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8931 return LONG2FIX(RSTRING_LEN(str));
8934 static VALUE
8935 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8937 long i;
8939 for (i=0; i<RSTRING_LEN(str); i++) {
8940 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8942 if (ary)
8943 return ary;
8944 else
8945 return str;
8949 * call-seq:
8950 * str.each_byte {|integer| block } -> str
8951 * str.each_byte -> an_enumerator
8953 * Passes each byte in <i>str</i> to the given block, or returns an
8954 * enumerator if no block is given.
8956 * "hello".each_byte {|c| print c, ' ' }
8958 * <em>produces:</em>
8960 * 104 101 108 108 111
8963 static VALUE
8964 rb_str_each_byte(VALUE str)
8966 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8967 return rb_str_enumerate_bytes(str, 0);
8971 * call-seq:
8972 * str.bytes -> an_array
8974 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8975 * <code>str.each_byte.to_a</code>.
8977 * If a block is given, which is a deprecated form, works the same as
8978 * <code>each_byte</code>.
8981 static VALUE
8982 rb_str_bytes(VALUE str)
8984 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8985 return rb_str_enumerate_bytes(str, ary);
8988 static VALUE
8989 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
8991 return rb_str_length(str);
8994 static VALUE
8995 rb_str_enumerate_chars(VALUE str, VALUE ary)
8997 VALUE orig = str;
8998 long i, len, n;
8999 const char *ptr;
9000 rb_encoding *enc;
9002 str = rb_str_new_frozen(str);
9003 ptr = RSTRING_PTR(str);
9004 len = RSTRING_LEN(str);
9005 enc = rb_enc_get(str);
9007 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9008 for (i = 0; i < len; i += n) {
9009 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9010 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9013 else {
9014 for (i = 0; i < len; i += n) {
9015 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9016 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9019 RB_GC_GUARD(str);
9020 if (ary)
9021 return ary;
9022 else
9023 return orig;
9027 * call-seq:
9028 * str.each_char {|cstr| block } -> str
9029 * str.each_char -> an_enumerator
9031 * Passes each character in <i>str</i> to the given block, or returns
9032 * an enumerator if no block is given.
9034 * "hello".each_char {|c| print c, ' ' }
9036 * <em>produces:</em>
9038 * h e l l o
9041 static VALUE
9042 rb_str_each_char(VALUE str)
9044 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9045 return rb_str_enumerate_chars(str, 0);
9049 * call-seq:
9050 * str.chars -> an_array
9052 * Returns an array of characters in <i>str</i>. This is a shorthand
9053 * for <code>str.each_char.to_a</code>.
9055 * If a block is given, which is a deprecated form, works the same as
9056 * <code>each_char</code>.
9059 static VALUE
9060 rb_str_chars(VALUE str)
9062 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9063 return rb_str_enumerate_chars(str, ary);
9066 static VALUE
9067 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9069 VALUE orig = str;
9070 int n;
9071 unsigned int c;
9072 const char *ptr, *end;
9073 rb_encoding *enc;
9075 if (single_byte_optimizable(str))
9076 return rb_str_enumerate_bytes(str, ary);
9078 str = rb_str_new_frozen(str);
9079 ptr = RSTRING_PTR(str);
9080 end = RSTRING_END(str);
9081 enc = STR_ENC_GET(str);
9083 while (ptr < end) {
9084 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9085 ENUM_ELEM(ary, UINT2NUM(c));
9086 ptr += n;
9088 RB_GC_GUARD(str);
9089 if (ary)
9090 return ary;
9091 else
9092 return orig;
9096 * call-seq:
9097 * str.each_codepoint {|integer| block } -> str
9098 * str.each_codepoint -> an_enumerator
9100 * Passes the Integer ordinal of each character in <i>str</i>
9101 * (also known as a <i>codepoint</i> when applied to Unicode strings) to the
9102 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9103 * values are directly derived from the binary representation
9104 * of each character.
9106 * If no block is given, an enumerator is returned instead.
9108 * "hello\u0639".each_codepoint {|c| print c, ' ' }
9110 * <em>produces:</em>
9112 * 104 101 108 108 111 1593
9115 static VALUE
9116 rb_str_each_codepoint(VALUE str)
9118 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9119 return rb_str_enumerate_codepoints(str, 0);
9123 * call-seq:
9124 * str.codepoints -> an_array
9126 * Returns an array of the Integer ordinals of the
9127 * characters in <i>str</i>. This is a shorthand for
9128 * <code>str.each_codepoint.to_a</code>.
9130 * If a block is given, which is a deprecated form, works the same as
9131 * <code>each_codepoint</code>.
9134 static VALUE
9135 rb_str_codepoints(VALUE str)
9137 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9138 return rb_str_enumerate_codepoints(str, ary);
9141 static regex_t *
9142 get_reg_grapheme_cluster(rb_encoding *enc)
9144 int encidx = rb_enc_to_index(enc);
9145 regex_t *reg_grapheme_cluster = NULL;
9146 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9148 /* synchronize */
9149 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9150 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9152 if (!reg_grapheme_cluster) {
9153 const OnigUChar source_ascii[] = "\\X";
9154 OnigErrorInfo einfo;
9155 const OnigUChar *source = source_ascii;
9156 size_t source_len = sizeof(source_ascii) - 1;
9157 switch (encidx) {
9158 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9159 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9160 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9161 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9162 #define CASE_UTF(e) \
9163 case ENCINDEX_UTF_##e: { \
9164 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9165 source = source_UTF_##e; \
9166 source_len = sizeof(source_UTF_##e); \
9167 break; \
9169 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9170 #undef CASE_UTF
9171 #undef CHARS_16BE
9172 #undef CHARS_16LE
9173 #undef CHARS_32BE
9174 #undef CHARS_32LE
9176 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9177 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9178 if (r) {
9179 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9180 onig_error_code_to_str(message, r, &einfo);
9181 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9183 if (encidx == rb_utf8_encindex()) {
9184 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9187 return reg_grapheme_cluster;
9190 static VALUE
9191 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9193 size_t grapheme_cluster_count = 0;
9194 regex_t *reg_grapheme_cluster = NULL;
9195 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9196 const char *ptr, *end;
9198 if (!rb_enc_unicode_p(enc)) {
9199 return rb_str_length(str);
9202 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9203 ptr = RSTRING_PTR(str);
9204 end = RSTRING_END(str);
9206 while (ptr < end) {
9207 OnigPosition len = onig_match(reg_grapheme_cluster,
9208 (const OnigUChar *)ptr, (const OnigUChar *)end,
9209 (const OnigUChar *)ptr, NULL, 0);
9210 if (len <= 0) break;
9211 grapheme_cluster_count++;
9212 ptr += len;
9215 return SIZET2NUM(grapheme_cluster_count);
9218 static VALUE
9219 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9221 VALUE orig = str;
9222 regex_t *reg_grapheme_cluster = NULL;
9223 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9224 const char *ptr0, *ptr, *end;
9226 if (!rb_enc_unicode_p(enc)) {
9227 return rb_str_enumerate_chars(str, ary);
9230 if (!ary) str = rb_str_new_frozen(str);
9231 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9232 ptr0 = ptr = RSTRING_PTR(str);
9233 end = RSTRING_END(str);
9235 while (ptr < end) {
9236 OnigPosition len = onig_match(reg_grapheme_cluster,
9237 (const OnigUChar *)ptr, (const OnigUChar *)end,
9238 (const OnigUChar *)ptr, NULL, 0);
9239 if (len <= 0) break;
9240 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9241 ptr += len;
9243 RB_GC_GUARD(str);
9244 if (ary)
9245 return ary;
9246 else
9247 return orig;
9251 * call-seq:
9252 * str.each_grapheme_cluster {|cstr| block } -> str
9253 * str.each_grapheme_cluster -> an_enumerator
9255 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9256 * an enumerator if no block is given.
9257 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9258 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9260 * "a\u0300".each_char.to_a.size #=> 2
9261 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9265 static VALUE
9266 rb_str_each_grapheme_cluster(VALUE str)
9268 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9269 return rb_str_enumerate_grapheme_clusters(str, 0);
9273 * call-seq:
9274 * str.grapheme_clusters -> an_array
9276 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9277 * for <code>str.each_grapheme_cluster.to_a</code>.
9279 * If a block is given, which is a deprecated form, works the same as
9280 * <code>each_grapheme_cluster</code>.
9283 static VALUE
9284 rb_str_grapheme_clusters(VALUE str)
9286 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9287 return rb_str_enumerate_grapheme_clusters(str, ary);
9290 static long
9291 chopped_length(VALUE str)
9293 rb_encoding *enc = STR_ENC_GET(str);
9294 const char *p, *p2, *beg, *end;
9296 beg = RSTRING_PTR(str);
9297 end = beg + RSTRING_LEN(str);
9298 if (beg >= end) return 0;
9299 p = rb_enc_prev_char(beg, end, end, enc);
9300 if (!p) return 0;
9301 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9302 p2 = rb_enc_prev_char(beg, p, end, enc);
9303 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9305 return p - beg;
9309 * call-seq:
9310 * str.chop! -> str or nil
9312 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9313 * <code>nil</code> if <i>str</i> is the empty string. See also
9314 * String#chomp!.
9317 static VALUE
9318 rb_str_chop_bang(VALUE str)
9320 str_modify_keep_cr(str);
9321 if (RSTRING_LEN(str) > 0) {
9322 long len;
9323 len = chopped_length(str);
9324 STR_SET_LEN(str, len);
9325 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9326 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9327 ENC_CODERANGE_CLEAR(str);
9329 return str;
9331 return Qnil;
9336 * call-seq:
9337 * str.chop -> new_str
9339 * Returns a new String with the last character removed. If the
9340 * string ends with <code>\r\n</code>, both characters are
9341 * removed. Applying <code>chop</code> to an empty string returns an
9342 * empty string. String#chomp is often a safer alternative, as it
9343 * leaves the string unchanged if it doesn't end in a record
9344 * separator.
9346 * "string\r\n".chop #=> "string"
9347 * "string\n\r".chop #=> "string\n"
9348 * "string\n".chop #=> "string"
9349 * "string".chop #=> "strin"
9350 * "x".chop.chop #=> ""
9353 static VALUE
9354 rb_str_chop(VALUE str)
9356 return rb_str_subseq(str, 0, chopped_length(str));
9359 static long
9360 smart_chomp(VALUE str, const char *e, const char *p)
9362 rb_encoding *enc = rb_enc_get(str);
9363 if (rb_enc_mbminlen(enc) > 1) {
9364 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9365 if (rb_enc_is_newline(pp, e, enc)) {
9366 e = pp;
9368 pp = e - rb_enc_mbminlen(enc);
9369 if (pp >= p) {
9370 pp = rb_enc_left_char_head(p, pp, e, enc);
9371 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9372 e = pp;
9376 else {
9377 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9378 case '\n':
9379 if (--e > p && *(e-1) == '\r') {
9380 --e;
9382 break;
9383 case '\r':
9384 --e;
9385 break;
9388 return e - p;
9391 static long
9392 chompped_length(VALUE str, VALUE rs)
9394 rb_encoding *enc;
9395 int newline;
9396 char *pp, *e, *rsptr;
9397 long rslen;
9398 char *const p = RSTRING_PTR(str);
9399 long len = RSTRING_LEN(str);
9401 if (len == 0) return 0;
9402 e = p + len;
9403 if (rs == rb_default_rs) {
9404 return smart_chomp(str, e, p);
9407 enc = rb_enc_get(str);
9408 RSTRING_GETMEM(rs, rsptr, rslen);
9409 if (rslen == 0) {
9410 if (rb_enc_mbminlen(enc) > 1) {
9411 while (e > p) {
9412 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9413 if (!rb_enc_is_newline(pp, e, enc)) break;
9414 e = pp;
9415 pp -= rb_enc_mbminlen(enc);
9416 if (pp >= p) {
9417 pp = rb_enc_left_char_head(p, pp, e, enc);
9418 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9419 e = pp;
9424 else {
9425 while (e > p && *(e-1) == '\n') {
9426 --e;
9427 if (e > p && *(e-1) == '\r')
9428 --e;
9431 return e - p;
9433 if (rslen > len) return len;
9435 enc = rb_enc_get(rs);
9436 newline = rsptr[rslen-1];
9437 if (rslen == rb_enc_mbminlen(enc)) {
9438 if (rslen == 1) {
9439 if (newline == '\n')
9440 return smart_chomp(str, e, p);
9442 else {
9443 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9444 return smart_chomp(str, e, p);
9448 enc = rb_enc_check(str, rs);
9449 if (is_broken_string(rs)) {
9450 return len;
9452 pp = e - rslen;
9453 if (p[len-1] == newline &&
9454 (rslen <= 1 ||
9455 memcmp(rsptr, pp, rslen) == 0)) {
9456 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9457 return len - rslen;
9458 RB_GC_GUARD(rs);
9460 return len;
9464 * Returns the separator for arguments of rb_str_chomp.
9466 * @return returns rb_ps ($/) as default, the default value of rb_ps ($/) is "\n".
9468 static VALUE
9469 chomp_rs(int argc, const VALUE *argv)
9471 rb_check_arity(argc, 0, 1);
9472 if (argc > 0) {
9473 VALUE rs = argv[0];
9474 if (!NIL_P(rs)) StringValue(rs);
9475 return rs;
9477 else {
9478 return rb_rs;
9482 VALUE
9483 rb_str_chomp_string(VALUE str, VALUE rs)
9485 long olen = RSTRING_LEN(str);
9486 long len = chompped_length(str, rs);
9487 if (len >= olen) return Qnil;
9488 str_modify_keep_cr(str);
9489 STR_SET_LEN(str, len);
9490 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9491 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9492 ENC_CODERANGE_CLEAR(str);
9494 return str;
9498 * call-seq:
9499 * str.chomp!(separator=$/) -> str or nil
9501 * Modifies <i>str</i> in place as described for String#chomp,
9502 * returning <i>str</i>, or <code>nil</code> if no modifications were
9503 * made.
9506 static VALUE
9507 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9509 VALUE rs;
9510 str_modifiable(str);
9511 if (RSTRING_LEN(str) == 0) return Qnil;
9512 rs = chomp_rs(argc, argv);
9513 if (NIL_P(rs)) return Qnil;
9514 return rb_str_chomp_string(str, rs);
9519 * call-seq:
9520 * str.chomp(separator=$/) -> new_str
9522 * Returns a new String with the given record separator removed
9523 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9524 * changed from the default Ruby record separator, then <code>chomp</code> also
9525 * removes carriage return characters (that is, it will remove <code>\n</code>,
9526 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9527 * it will remove all trailing newlines from the string.
9529 * "hello".chomp #=> "hello"
9530 * "hello\n".chomp #=> "hello"
9531 * "hello\r\n".chomp #=> "hello"
9532 * "hello\n\r".chomp #=> "hello\n"
9533 * "hello\r".chomp #=> "hello"
9534 * "hello \n there".chomp #=> "hello \n there"
9535 * "hello".chomp("llo") #=> "he"
9536 * "hello\r\n\r\n".chomp('') #=> "hello"
9537 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9540 static VALUE
9541 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9543 VALUE rs = chomp_rs(argc, argv);
9544 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9545 return rb_str_subseq(str, 0, chompped_length(str, rs));
9548 static long
9549 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9551 const char *const start = s;
9553 if (!s || s >= e) return 0;
9555 /* remove spaces at head */
9556 if (single_byte_optimizable(str)) {
9557 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9559 else {
9560 while (s < e) {
9561 int n;
9562 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9564 if (cc && !rb_isspace(cc)) break;
9565 s += n;
9568 return s - start;
9572 * call-seq:
9573 * str.lstrip! -> self or nil
9575 * Removes leading whitespace from the receiver.
9576 * Returns the altered receiver, or +nil+ if no change was made.
9577 * See also String#rstrip! and String#strip!.
9579 * Refer to String#strip for the definition of whitespace.
9581 * " hello ".lstrip! #=> "hello "
9582 * "hello ".lstrip! #=> nil
9583 * "hello".lstrip! #=> nil
9586 static VALUE
9587 rb_str_lstrip_bang(VALUE str)
9589 rb_encoding *enc;
9590 char *start, *s;
9591 long olen, loffset;
9593 str_modify_keep_cr(str);
9594 enc = STR_ENC_GET(str);
9595 RSTRING_GETMEM(str, start, olen);
9596 loffset = lstrip_offset(str, start, start+olen, enc);
9597 if (loffset > 0) {
9598 long len = olen-loffset;
9599 s = start + loffset;
9600 memmove(start, s, len);
9601 STR_SET_LEN(str, len);
9602 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9603 return str;
9605 return Qnil;
9610 * call-seq:
9611 * str.lstrip -> new_str
9613 * Returns a copy of the receiver with leading whitespace removed.
9614 * See also String#rstrip and String#strip.
9616 * Refer to String#strip for the definition of whitespace.
9618 * " hello ".lstrip #=> "hello "
9619 * "hello".lstrip #=> "hello"
9622 static VALUE
9623 rb_str_lstrip(VALUE str)
9625 char *start;
9626 long len, loffset;
9627 RSTRING_GETMEM(str, start, len);
9628 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9629 if (loffset <= 0) return str_duplicate(rb_cString, str);
9630 return rb_str_subseq(str, loffset, len - loffset);
9633 static long
9634 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9636 const char *t;
9638 rb_str_check_dummy_enc(enc);
9639 if (!s || s >= e) return 0;
9640 t = e;
9642 /* remove trailing spaces or '\0's */
9643 if (single_byte_optimizable(str)) {
9644 unsigned char c;
9645 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9647 else {
9648 char *tp;
9650 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9651 unsigned int c = rb_enc_codepoint(tp, e, enc);
9652 if (c && !rb_isspace(c)) break;
9653 t = tp;
9656 return e - t;
9660 * call-seq:
9661 * str.rstrip! -> self or nil
9663 * Removes trailing whitespace from the receiver.
9664 * Returns the altered receiver, or +nil+ if no change was made.
9665 * See also String#lstrip! and String#strip!.
9667 * Refer to String#strip for the definition of whitespace.
9669 * " hello ".rstrip! #=> " hello"
9670 * " hello".rstrip! #=> nil
9671 * "hello".rstrip! #=> nil
9674 static VALUE
9675 rb_str_rstrip_bang(VALUE str)
9677 rb_encoding *enc;
9678 char *start;
9679 long olen, roffset;
9681 str_modify_keep_cr(str);
9682 enc = STR_ENC_GET(str);
9683 RSTRING_GETMEM(str, start, olen);
9684 roffset = rstrip_offset(str, start, start+olen, enc);
9685 if (roffset > 0) {
9686 long len = olen - roffset;
9688 STR_SET_LEN(str, len);
9689 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9690 return str;
9692 return Qnil;
9697 * call-seq:
9698 * str.rstrip -> new_str
9700 * Returns a copy of the receiver with trailing whitespace removed.
9701 * See also String#lstrip and String#strip.
9703 * Refer to String#strip for the definition of whitespace.
9705 * " hello ".rstrip #=> " hello"
9706 * "hello".rstrip #=> "hello"
9709 static VALUE
9710 rb_str_rstrip(VALUE str)
9712 rb_encoding *enc;
9713 char *start;
9714 long olen, roffset;
9716 enc = STR_ENC_GET(str);
9717 RSTRING_GETMEM(str, start, olen);
9718 roffset = rstrip_offset(str, start, start+olen, enc);
9720 if (roffset <= 0) return str_duplicate(rb_cString, str);
9721 return rb_str_subseq(str, 0, olen-roffset);
9726 * call-seq:
9727 * str.strip! -> self or nil
9729 * Removes leading and trailing whitespace from the receiver.
9730 * Returns the altered receiver, or +nil+ if there was no change.
9732 * Refer to String#strip for the definition of whitespace.
9734 * " hello ".strip! #=> "hello"
9735 * "hello".strip! #=> nil
9738 static VALUE
9739 rb_str_strip_bang(VALUE str)
9741 char *start;
9742 long olen, loffset, roffset;
9743 rb_encoding *enc;
9745 str_modify_keep_cr(str);
9746 enc = STR_ENC_GET(str);
9747 RSTRING_GETMEM(str, start, olen);
9748 loffset = lstrip_offset(str, start, start+olen, enc);
9749 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9751 if (loffset > 0 || roffset > 0) {
9752 long len = olen-roffset;
9753 if (loffset > 0) {
9754 len -= loffset;
9755 memmove(start, start + loffset, len);
9757 STR_SET_LEN(str, len);
9758 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9759 return str;
9761 return Qnil;
9766 * call-seq:
9767 * str.strip -> new_str
9769 * Returns a copy of the receiver with leading and trailing whitespace removed.
9771 * Whitespace is defined as any of the following characters:
9772 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9774 * " hello ".strip #=> "hello"
9775 * "\tgoodbye\r\n".strip #=> "goodbye"
9776 * "\x00\t\n\v\f\r ".strip #=> ""
9777 * "hello".strip #=> "hello"
9780 static VALUE
9781 rb_str_strip(VALUE str)
9783 char *start;
9784 long olen, loffset, roffset;
9785 rb_encoding *enc = STR_ENC_GET(str);
9787 RSTRING_GETMEM(str, start, olen);
9788 loffset = lstrip_offset(str, start, start+olen, enc);
9789 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9791 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9792 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9795 static VALUE
9796 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9798 VALUE result, match;
9799 struct re_registers *regs;
9800 int i;
9801 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9802 if (pos >= 0) {
9803 if (BUILTIN_TYPE(pat) == T_STRING) {
9804 regs = NULL;
9805 end = pos + RSTRING_LEN(pat);
9807 else {
9808 match = rb_backref_get();
9809 regs = RMATCH_REGS(match);
9810 pos = BEG(0);
9811 end = END(0);
9813 if (pos == end) {
9814 rb_encoding *enc = STR_ENC_GET(str);
9816 * Always consume at least one character of the input string
9818 if (RSTRING_LEN(str) > end)
9819 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9820 RSTRING_END(str), enc);
9821 else
9822 *start = end + 1;
9824 else {
9825 *start = end;
9827 if (!regs || regs->num_regs == 1) {
9828 result = rb_str_subseq(str, pos, end - pos);
9829 return result;
9831 result = rb_ary_new2(regs->num_regs);
9832 for (i=1; i < regs->num_regs; i++) {
9833 VALUE s = Qnil;
9834 if (BEG(i) >= 0) {
9835 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9837 rb_ary_push(result, s);
9840 return result;
9842 return Qnil;
9847 * call-seq:
9848 * str.scan(pattern) -> array
9849 * str.scan(pattern) {|match, ...| block } -> str
9851 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9852 * Regexp or a String). For each match, a result is
9853 * generated and either added to the result array or passed to the block. If
9854 * the pattern contains no groups, each individual result consists of the
9855 * matched string, <code>$&</code>. If the pattern contains groups, each
9856 * individual result is itself an array containing one entry per group.
9858 * a = "cruel world"
9859 * a.scan(/\w+/) #=> ["cruel", "world"]
9860 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9861 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9862 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9864 * And the block form:
9866 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9867 * print "\n"
9868 * a.scan(/(.)(.)/) {|x,y| print y, x }
9869 * print "\n"
9871 * <em>produces:</em>
9873 * <<cruel>> <<world>>
9874 * rceu lowlr
9877 static VALUE
9878 rb_str_scan(VALUE str, VALUE pat)
9880 VALUE result;
9881 long start = 0;
9882 long last = -1, prev = 0;
9883 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9885 pat = get_pat_quoted(pat, 1);
9886 mustnot_broken(str);
9887 if (!rb_block_given_p()) {
9888 VALUE ary = rb_ary_new();
9890 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9891 last = prev;
9892 prev = start;
9893 rb_ary_push(ary, result);
9895 if (last >= 0) rb_pat_search(pat, str, last, 1);
9896 else rb_backref_set(Qnil);
9897 return ary;
9900 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9901 last = prev;
9902 prev = start;
9903 rb_yield(result);
9904 str_mod_check(str, p, len);
9906 if (last >= 0) rb_pat_search(pat, str, last, 1);
9907 return str;
9912 * call-seq:
9913 * str.hex -> integer
9915 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9916 * (with an optional sign and an optional <code>0x</code>) and returns the
9917 * corresponding number. Zero is returned on error.
9919 * "0x0a".hex #=> 10
9920 * "-1234".hex #=> -4660
9921 * "0".hex #=> 0
9922 * "wombat".hex #=> 0
9925 static VALUE
9926 rb_str_hex(VALUE str)
9928 return rb_str_to_inum(str, 16, FALSE);
9933 * call-seq:
9934 * str.oct -> integer
9936 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9937 * optional sign) and returns the corresponding number. Returns 0 if the
9938 * conversion fails.
9940 * "123".oct #=> 83
9941 * "-377".oct #=> -255
9942 * "bad".oct #=> 0
9943 * "0377bad".oct #=> 255
9945 * If +str+ starts with <code>0</code>, radix indicators are honored.
9946 * See Kernel#Integer.
9949 static VALUE
9950 rb_str_oct(VALUE str)
9952 return rb_str_to_inum(str, -8, FALSE);
9955 #ifndef HAVE_CRYPT_R
9956 # include "ruby/thread_native.h"
9957 # include "ruby/atomic.h"
9959 static struct {
9960 rb_atomic_t initialized;
9961 rb_nativethread_lock_t lock;
9962 } crypt_mutex;
9964 static void
9965 crypt_mutex_destroy(void)
9967 RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
9968 rb_nativethread_lock_destroy(&crypt_mutex.lock);
9969 crypt_mutex.initialized = 0;
9972 static void
9973 crypt_mutex_initialize(void)
9975 rb_atomic_t i;
9976 while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
9977 switch (i) {
9978 case 0:
9979 rb_nativethread_lock_initialize(&crypt_mutex.lock);
9980 atexit(crypt_mutex_destroy);
9981 RUBY_ASSERT(crypt_mutex.initialized == 2);
9982 RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
9983 break;
9984 case 1:
9985 break;
9986 default:
9987 rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
9990 #endif
9993 * call-seq:
9994 * str.crypt(salt_str) -> new_str
9996 * Returns the string generated by calling <code>crypt(3)</code>
9997 * standard library function with <code>str</code> and
9998 * <code>salt_str</code>, in this order, as its arguments. Please do
9999 * not use this method any longer. It is legacy; provided only for
10000 * backward compatibility with ruby scripts in earlier days. It is
10001 * bad to use in contemporary programs for several reasons:
10003 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10004 * run. The generated string lacks data portability.
10006 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10007 * (i.e. silently ends up in unexpected results).
10009 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10010 * thread safe.
10012 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10013 * very very weak. According to its manpage, Linux's traditional
10014 * <code>crypt(3)</code> output has only 2**56 variations; too
10015 * easy to brute force today. And this is the default behaviour.
10017 * * In order to make things robust some OSes implement so-called
10018 * "modular" usage. To go through, you have to do a complex
10019 * build-up of the <code>salt_str</code> parameter, by hand.
10020 * Failure in generation of a proper salt string tends not to
10021 * yield any errors; typos in parameters are normally not
10022 * detectable.
10024 * * For instance, in the following example, the second invocation
10025 * of String#crypt is wrong; it has a typo in "round=" (lacks
10026 * "s"). However the call does not fail and something unexpected
10027 * is generated.
10029 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10030 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10032 * * Even in the "modular" mode, some hash functions are considered
10033 * archaic and no longer recommended at all; for instance module
10034 * <code>$1$</code> is officially abandoned by its author: see
10035 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10036 * instance module <code>$3$</code> is considered completely
10037 * broken: see the manpage of FreeBSD.
10039 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10040 * written above, <code>crypt(3)</code> on Mac OS never fails.
10041 * This means even if you build up a proper salt string it
10042 * generates a traditional DES hash anyways, and there is no way
10043 * for you to be aware of.
10045 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10047 * If for some reason you cannot migrate to other secure contemporary
10048 * password hashing algorithms, install the string-crypt gem and
10049 * <code>require 'string/crypt'</code> to continue using it.
10052 static VALUE
10053 rb_str_crypt(VALUE str, VALUE salt)
10055 #ifdef HAVE_CRYPT_R
10056 VALUE databuf;
10057 struct crypt_data *data;
10058 # define CRYPT_END() ALLOCV_END(databuf)
10059 #else
10060 extern char *crypt(const char *, const char *);
10061 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10062 #endif
10063 VALUE result;
10064 const char *s, *saltp;
10065 char *res;
10066 #ifdef BROKEN_CRYPT
10067 char salt_8bit_clean[3];
10068 #endif
10070 StringValue(salt);
10071 mustnot_wchar(str);
10072 mustnot_wchar(salt);
10073 s = StringValueCStr(str);
10074 saltp = RSTRING_PTR(salt);
10075 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10076 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10079 #ifdef BROKEN_CRYPT
10080 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10081 salt_8bit_clean[0] = saltp[0] & 0x7f;
10082 salt_8bit_clean[1] = saltp[1] & 0x7f;
10083 salt_8bit_clean[2] = '\0';
10084 saltp = salt_8bit_clean;
10086 #endif
10087 #ifdef HAVE_CRYPT_R
10088 data = ALLOCV(databuf, sizeof(struct crypt_data));
10089 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10090 data->initialized = 0;
10091 # endif
10092 res = crypt_r(s, saltp, data);
10093 #else
10094 crypt_mutex_initialize();
10095 rb_nativethread_lock_lock(&crypt_mutex.lock);
10096 res = crypt(s, saltp);
10097 #endif
10098 if (!res) {
10099 int err = errno;
10100 CRYPT_END();
10101 rb_syserr_fail(err, "crypt");
10103 result = rb_str_new_cstr(res);
10104 CRYPT_END();
10105 return result;
10110 * call-seq:
10111 * str.ord -> integer
10113 * Returns the Integer ordinal of a one-character string.
10115 * "a".ord #=> 97
10118 static VALUE
10119 rb_str_ord(VALUE s)
10121 unsigned int c;
10123 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10124 return UINT2NUM(c);
10127 * call-seq:
10128 * str.sum(n=16) -> integer
10130 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10131 * where <em>n</em> is the optional Integer parameter, defaulting
10132 * to 16. The result is simply the sum of the binary value of each byte in
10133 * <i>str</i> modulo <code>2**n</code> (i.e. masked with <code>2**n - 1</code>); if <em>n</em>
10134 * is zero or negative the sum is not reduced. This is not a particularly good checksum.
10137 static VALUE
10138 rb_str_sum(int argc, VALUE *argv, VALUE str)
10140 int bits = 16;
10141 char *ptr, *p, *pend;
10142 long len;
10143 VALUE sum = INT2FIX(0);
10144 unsigned long sum0 = 0;
10146 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10147 bits = 0;
10149 ptr = p = RSTRING_PTR(str);
10150 len = RSTRING_LEN(str);
10151 pend = p + len;
10153 while (p < pend) {
10154 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10155 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10156 str_mod_check(str, ptr, len);
10157 sum0 = 0;
10159 sum0 += (unsigned char)*p;
10160 p++;
10163 if (bits == 0) {
10164 if (sum0) {
10165 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10168 else {
10169 if (sum == INT2FIX(0)) {
10170 if (bits < (int)sizeof(long)*CHAR_BIT) {
10171 sum0 &= (((unsigned long)1)<<bits)-1;
10173 sum = LONG2FIX(sum0);
10175 else {
10176 VALUE mod;
10178 if (sum0) {
10179 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10182 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10183 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10184 sum = rb_funcall(sum, '&', 1, mod);
10187 return sum;
10190 static VALUE
10191 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10193 rb_encoding *enc;
10194 VALUE w;
10195 long width, len, flen = 1, fclen = 1;
10196 VALUE res;
10197 char *p;
10198 const char *f = " ";
10199 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10200 VALUE pad;
10201 int singlebyte = 1, cr;
10202 int termlen;
10204 rb_scan_args(argc, argv, "11", &w, &pad);
10205 enc = STR_ENC_GET(str);
10206 termlen = rb_enc_mbminlen(enc);
10207 width = NUM2LONG(w);
10208 if (argc == 2) {
10209 StringValue(pad);
10210 enc = rb_enc_check(str, pad);
10211 f = RSTRING_PTR(pad);
10212 flen = RSTRING_LEN(pad);
10213 fclen = str_strlen(pad, enc); /* rb_enc_check */
10214 singlebyte = single_byte_optimizable(pad);
10215 if (flen == 0 || fclen == 0) {
10216 rb_raise(rb_eArgError, "zero width padding");
10219 len = str_strlen(str, enc); /* rb_enc_check */
10220 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10221 n = width - len;
10222 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10223 rlen = n - llen;
10224 cr = ENC_CODERANGE(str);
10225 if (flen > 1) {
10226 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10227 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10229 size = RSTRING_LEN(str);
10230 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10231 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10232 (len += llen2 + rlen2) >= LONG_MAX - size) {
10233 rb_raise(rb_eArgError, "argument too big");
10235 len += size;
10236 res = str_new0(rb_cString, 0, len, termlen);
10237 p = RSTRING_PTR(res);
10238 if (flen <= 1) {
10239 memset(p, *f, llen);
10240 p += llen;
10242 else {
10243 while (llen >= fclen) {
10244 memcpy(p,f,flen);
10245 p += flen;
10246 llen -= fclen;
10248 if (llen > 0) {
10249 memcpy(p, f, llen2);
10250 p += llen2;
10253 memcpy(p, RSTRING_PTR(str), size);
10254 p += size;
10255 if (flen <= 1) {
10256 memset(p, *f, rlen);
10257 p += rlen;
10259 else {
10260 while (rlen >= fclen) {
10261 memcpy(p,f,flen);
10262 p += flen;
10263 rlen -= fclen;
10265 if (rlen > 0) {
10266 memcpy(p, f, rlen2);
10267 p += rlen2;
10270 TERM_FILL(p, termlen);
10271 STR_SET_LEN(res, p-RSTRING_PTR(res));
10272 rb_enc_associate(res, enc);
10273 if (argc == 2)
10274 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10275 if (cr != ENC_CODERANGE_BROKEN)
10276 ENC_CODERANGE_SET(res, cr);
10278 RB_GC_GUARD(pad);
10279 return res;
10284 * call-seq:
10285 * str.ljust(integer, padstr=' ') -> new_str
10287 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10288 * String of length <i>integer</i> with <i>str</i> left justified
10289 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10291 * "hello".ljust(4) #=> "hello"
10292 * "hello".ljust(20) #=> "hello               "
10293 * "hello".ljust(20, '1234') #=> "hello123412341234123"
10296 static VALUE
10297 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10299 return rb_str_justify(argc, argv, str, 'l');
10304 * call-seq:
10305 * str.rjust(integer, padstr=' ') -> new_str
10307 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10308 * String of length <i>integer</i> with <i>str</i> right justified
10309 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10311 * "hello".rjust(4) #=> "hello"
10312 * "hello".rjust(20) #=> "               hello"
10313 * "hello".rjust(20, '1234') #=> "123412341234123hello"
10316 static VALUE
10317 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10319 return rb_str_justify(argc, argv, str, 'r');
10324 * call-seq:
10325 * str.center(width, padstr=' ') -> new_str
10327 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10328 * returns a new String of length +width+ with +str+ centered and padded with
10329 * +padstr+; otherwise, returns +str+.
10331 * "hello".center(4) #=> "hello"
10332 * "hello".center(20) #=> "       hello        "
10333 * "hello".center(20, '123') #=> "1231231hello12312312"
10336 static VALUE
10337 rb_str_center(int argc, VALUE *argv, VALUE str)
10339 return rb_str_justify(argc, argv, str, 'c');
10343 * call-seq:
10344 * str.partition(sep) -> [head, sep, tail]
10345 * str.partition(regexp) -> [head, match, tail]
10347 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10348 * and returns the part before it, the match, and the part
10349 * after it.
10350 * If it is not found, returns <i>str</i> and two empty strings.
10352 * "hello".partition("l") #=> ["he", "l", "lo"]
10353 * "hello".partition("x") #=> ["hello", "", ""]
10354 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
10357 static VALUE
10358 rb_str_partition(VALUE str, VALUE sep)
10360 long pos;
10362 sep = get_pat_quoted(sep, 0);
10363 if (RB_TYPE_P(sep, T_REGEXP)) {
10364 if (rb_reg_search(sep, str, 0, 0) < 0) {
10365 goto failed;
10367 VALUE match = rb_backref_get();
10368 struct re_registers *regs = RMATCH_REGS(match);
10370 pos = BEG(0);
10371 sep = rb_str_subseq(str, pos, END(0) - pos);
10373 else {
10374 pos = rb_str_index(str, sep, 0);
10375 if (pos < 0) goto failed;
10377 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10378 sep,
10379 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10380 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10382 failed:
10383 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10387 * call-seq:
10388 * str.rpartition(sep) -> [head, sep, tail]
10389 * str.rpartition(regexp) -> [head, match, tail]
10391 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10392 * of the string, and returns the part before it, the match, and the part
10393 * after it.
10394 * If it is not found, returns two empty strings and <i>str</i>.
10396 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10397 * "hello".rpartition("x") #=> ["", "", "hello"]
10398 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10400 * The match from the end means starting at the possible last position, not
10401 * the last of longest matches.
10403 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10405 * To partition at the last longest match, combine the pattern with a
10406 * negative lookbehind.
10408 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10410 * Or use String#partition with a negative lookahead.
10412 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
10415 static VALUE
10416 rb_str_rpartition(VALUE str, VALUE sep)
10418 long pos = RSTRING_LEN(str);
10420 sep = get_pat_quoted(sep, 0);
10421 if (RB_TYPE_P(sep, T_REGEXP)) {
10422 if (rb_reg_search(sep, str, pos, 1) < 0) {
10423 goto failed;
10425 VALUE match = rb_backref_get();
10426 struct re_registers *regs = RMATCH_REGS(match);
10428 pos = BEG(0);
10429 sep = rb_str_subseq(str, pos, END(0) - pos);
10431 else {
10432 pos = rb_str_sublen(str, pos);
10433 pos = rb_str_rindex(str, sep, pos);
10434 if (pos < 0) {
10435 goto failed;
10437 pos = rb_str_offset(str, pos);
10440 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10441 sep,
10442 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10443 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10444 failed:
10445 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10449 * call-seq:
10450 * str.start_with?([prefixes]+) -> true or false
10452 * Returns true if +str+ starts with one of the +prefixes+ given.
10453 * Each of the +prefixes+ should be a String or a Regexp.
10455 * "hello".start_with?("hell") #=> true
10456 * "hello".start_with?(/H/i) #=> true
10458 * # returns true if one of the prefixes matches.
10459 * "hello".start_with?("heaven", "hell") #=> true
10460 * "hello".start_with?("heaven", "paradise") #=> false
10463 static VALUE
10464 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10466 int i;
10468 for (i=0; i<argc; i++) {
10469 VALUE tmp = argv[i];
10470 if (RB_TYPE_P(tmp, T_REGEXP)) {
10471 if (rb_reg_start_with_p(tmp, str))
10472 return Qtrue;
10474 else {
10475 StringValue(tmp);
10476 rb_enc_check(str, tmp);
10477 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10478 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10479 return Qtrue;
10482 return Qfalse;
10486 * call-seq:
10487 * str.end_with?([suffixes]+) -> true or false
10489 * Returns true if +str+ ends with one of the +suffixes+ given.
10491 * "hello".end_with?("ello") #=> true
10493 * # returns true if one of the +suffixes+ matches.
10494 * "hello".end_with?("heaven", "ello") #=> true
10495 * "hello".end_with?("heaven", "paradise") #=> false
10498 static VALUE
10499 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10501 int i;
10502 char *p, *s, *e;
10503 rb_encoding *enc;
10505 for (i=0; i<argc; i++) {
10506 VALUE tmp = argv[i];
10507 long slen, tlen;
10508 StringValue(tmp);
10509 enc = rb_enc_check(str, tmp);
10510 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10511 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10512 p = RSTRING_PTR(str);
10513 e = p + slen;
10514 s = e - tlen;
10515 if (rb_enc_left_char_head(p, s, e, enc) != s)
10516 continue;
10517 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10518 return Qtrue;
10520 return Qfalse;
10524 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10525 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10527 * @param str the target
10528 * @param prefix the prefix
10529 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10530 * @retval Positive-Integer otherwise
10532 static long
10533 deleted_prefix_length(VALUE str, VALUE prefix)
10535 char *strptr, *prefixptr;
10536 long olen, prefixlen;
10538 StringValue(prefix);
10539 if (is_broken_string(prefix)) return 0;
10540 rb_enc_check(str, prefix);
10542 /* return 0 if not start with prefix */
10543 prefixlen = RSTRING_LEN(prefix);
10544 if (prefixlen <= 0) return 0;
10545 olen = RSTRING_LEN(str);
10546 if (olen < prefixlen) return 0;
10547 strptr = RSTRING_PTR(str);
10548 prefixptr = RSTRING_PTR(prefix);
10549 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10551 return prefixlen;
10555 * call-seq:
10556 * str.delete_prefix!(prefix) -> self or nil
10558 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10559 * <code>nil</code> if no change was made.
10561 * "hello".delete_prefix!("hel") #=> "lo"
10562 * "hello".delete_prefix!("llo") #=> nil
10565 static VALUE
10566 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10568 long prefixlen;
10569 str_modify_keep_cr(str);
10571 prefixlen = deleted_prefix_length(str, prefix);
10572 if (prefixlen <= 0) return Qnil;
10574 return rb_str_drop_bytes(str, prefixlen);
10578 * call-seq:
10579 * str.delete_prefix(prefix) -> new_str
10581 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10583 * "hello".delete_prefix("hel") #=> "lo"
10584 * "hello".delete_prefix("llo") #=> "hello"
10587 static VALUE
10588 rb_str_delete_prefix(VALUE str, VALUE prefix)
10590 long prefixlen;
10592 prefixlen = deleted_prefix_length(str, prefix);
10593 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10595 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10599 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10600 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10602 * @param str the target
10603 * @param suffix the suffix
10604 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10605 * @retval Positive-Integer otherwise
10607 static long
10608 deleted_suffix_length(VALUE str, VALUE suffix)
10610 char *strptr, *suffixptr, *s;
10611 long olen, suffixlen;
10612 rb_encoding *enc;
10614 StringValue(suffix);
10615 if (is_broken_string(suffix)) return 0;
10616 enc = rb_enc_check(str, suffix);
10618 /* return 0 if not start with suffix */
10619 suffixlen = RSTRING_LEN(suffix);
10620 if (suffixlen <= 0) return 0;
10621 olen = RSTRING_LEN(str);
10622 if (olen < suffixlen) return 0;
10623 strptr = RSTRING_PTR(str);
10624 suffixptr = RSTRING_PTR(suffix);
10625 s = strptr + olen - suffixlen;
10626 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10627 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10629 return suffixlen;
10633 * call-seq:
10634 * str.delete_suffix!(suffix) -> self or nil
10636 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10637 * <code>nil</code> if no change was made.
10639 * "hello".delete_suffix!("llo") #=> "he"
10640 * "hello".delete_suffix!("hel") #=> nil
10643 static VALUE
10644 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10646 long olen, suffixlen, len;
10647 str_modifiable(str);
10649 suffixlen = deleted_suffix_length(str, suffix);
10650 if (suffixlen <= 0) return Qnil;
10652 olen = RSTRING_LEN(str);
10653 str_modify_keep_cr(str);
10654 len = olen - suffixlen;
10655 STR_SET_LEN(str, len);
10656 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10657 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10658 ENC_CODERANGE_CLEAR(str);
10660 return str;
10664 * call-seq:
10665 * str.delete_suffix(suffix) -> new_str
10667 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10669 * "hello".delete_suffix("llo") #=> "he"
10670 * "hello".delete_suffix("hel") #=> "hello"
10673 static VALUE
10674 rb_str_delete_suffix(VALUE str, VALUE suffix)
10676 long suffixlen;
10678 suffixlen = deleted_suffix_length(str, suffix);
10679 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10681 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10684 void
10685 rb_str_setter(VALUE val, ID id, VALUE *var)
10687 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10688 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10690 *var = val;
10693 static void
10694 rb_fs_setter(VALUE val, ID id, VALUE *var)
10696 val = rb_fs_check(val);
10697 if (!val) {
10698 rb_raise(rb_eTypeError,
10699 "value of %"PRIsVALUE" must be String or Regexp",
10700 rb_id2str(id));
10702 if (!NIL_P(val)) {
10703 rb_warn_deprecated("`$;'", NULL);
10705 *var = val;
10710 * call-seq:
10711 * str.force_encoding(encoding) -> str
10713 * Changes the encoding to +encoding+ and returns self.
10716 static VALUE
10717 rb_str_force_encoding(VALUE str, VALUE enc)
10719 str_modifiable(str);
10720 rb_enc_associate(str, rb_to_encoding(enc));
10721 ENC_CODERANGE_CLEAR(str);
10722 return str;
10726 * call-seq:
10727 * str.b -> str
10729 * Returns a copied string whose encoding is ASCII-8BIT.
10732 static VALUE
10733 rb_str_b(VALUE str)
10735 VALUE str2;
10736 if (FL_TEST(str, STR_NOEMBED)) {
10737 str2 = str_alloc_heap(rb_cString);
10739 else {
10740 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10742 str_replace_shared_without_enc(str2, str);
10743 ENC_CODERANGE_CLEAR(str2);
10744 return str2;
10748 * call-seq:
10749 * str.valid_encoding? -> true or false
10751 * Returns true for a string which is encoded correctly.
10753 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10754 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10755 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10758 static VALUE
10759 rb_str_valid_encoding_p(VALUE str)
10761 int cr = rb_enc_str_coderange(str);
10763 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10767 * call-seq:
10768 * str.ascii_only? -> true or false
10770 * Returns true for a string which has only ASCII characters.
10772 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10773 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10776 static VALUE
10777 rb_str_is_ascii_only_p(VALUE str)
10779 int cr = rb_enc_str_coderange(str);
10781 return RBOOL(cr == ENC_CODERANGE_7BIT);
10784 VALUE
10785 rb_str_ellipsize(VALUE str, long len)
10787 static const char ellipsis[] = "...";
10788 const long ellipsislen = sizeof(ellipsis) - 1;
10789 rb_encoding *const enc = rb_enc_get(str);
10790 const long blen = RSTRING_LEN(str);
10791 const char *const p = RSTRING_PTR(str), *e = p + blen;
10792 VALUE estr, ret = 0;
10794 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10795 if (len * rb_enc_mbminlen(enc) >= blen ||
10796 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10797 ret = str;
10799 else if (len <= ellipsislen ||
10800 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10801 if (rb_enc_asciicompat(enc)) {
10802 ret = rb_str_new(ellipsis, len);
10803 rb_enc_associate(ret, enc);
10805 else {
10806 estr = rb_usascii_str_new(ellipsis, len);
10807 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10810 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10811 rb_str_cat(ret, ellipsis, ellipsislen);
10813 else {
10814 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10815 rb_enc_from_encoding(enc), 0, Qnil);
10816 rb_str_append(ret, estr);
10818 return ret;
10821 static VALUE
10822 str_compat_and_valid(VALUE str, rb_encoding *enc)
10824 int cr;
10825 str = StringValue(str);
10826 cr = rb_enc_str_coderange(str);
10827 if (cr == ENC_CODERANGE_BROKEN) {
10828 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10830 else {
10831 rb_encoding *e = STR_ENC_GET(str);
10832 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10833 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10834 rb_enc_name(enc), rb_enc_name(e));
10837 return str;
/* Forward declaration: the two wrappers that follow call enc_str_scrub() before its definition. */
10840 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10842 VALUE
10843 rb_str_scrub(VALUE str, VALUE repl)
10845 rb_encoding *enc = STR_ENC_GET(str);
10846 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10849 VALUE
10850 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10852 int cr = ENC_CODERANGE_UNKNOWN;
10853 if (enc == STR_ENC_GET(str)) {
10854 /* cached coderange makes sense only when enc equals the
10855 * actual encoding of str */
10856 cr = ENC_CODERANGE(str);
10858 return enc_str_scrub(enc, str, repl, cr);
/*
 * Shared worker behind rb_str_scrub() and rb_enc_str_scrub().
 *
 * Walks the bytes of str as text in enc and copies them into a new buffer,
 * putting a replacement in place of every invalid byte sequence.  The
 * replacement is repl when given, the value the block returns for the
 * offending bytes when a block is given, and a built-in default otherwise.
 *
 * enc  - encoding used to interpret str
 * str  - string to scan; the only change made to it is setting its
 *        coderange when the scan finds nothing to replace
 * repl - replacement string, or nil
 * cr   - coderange already known for str under enc
 *
 * Returns Qnil when there is nothing to replace (cr is already clean, enc
 * is a dummy encoding, or the scan finds no invalid bytes); otherwise the
 * new string, tagged with enc and the coderange computed during the scan.
 * Raises ArgumentError when both a block and repl are given.
 */
10861 static VALUE
10862 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10864 int encidx;
/* buf stays nil until the first replacement is needed */
10865 VALUE buf = Qnil;
/* p: scan position, e: end of str, p1: start of the not-yet-copied run, sp: original start */
10866 const char *rep, *p, *e, *p1, *sp;
10867 long replen = -1;
10868 long slen;
10870 if (rb_block_given_p()) {
10871 if (!NIL_P(repl))
10872 rb_raise(rb_eArgError, "both of block and replacement given");
/* replen == 0 is the marker, tested below, for "take replacements from the block" */
10873 replen = 0;
10876 if (ENC_CODERANGE_CLEAN_P(cr))
10877 return Qnil;
10879 if (!NIL_P(repl)) {
10880 repl = str_compat_and_valid(repl, enc);
10883 if (rb_enc_dummy_p(enc)) {
10884 return Qnil;
10886 encidx = rb_enc_to_index(enc);
/* Points rep/replen at a static copy of the literal (without its terminating NUL). */
10888 #define DEFAULT_REPLACE_CHAR(str) do { \
10889 static const char replace[sizeof(str)-1] = str; \
10890 rep = replace; replen = (int)sizeof(replace); \
10891 } while (0)
10893 slen = RSTRING_LEN(str);
10894 p = RSTRING_PTR(str);
10895 e = RSTRING_END(str);
10896 p1 = p;
10897 sp = p;
/* ASCII-compatible encodings: ASCII runs are skipped with search_nonascii() */
10899 if (rb_enc_asciicompat(enc)) {
/* whether the fixed replacement is pure 7-bit, i.e. cannot raise the result's coderange */
10900 int rep7bit_p;
10901 if (!replen) {
10902 rep = NULL;
10903 rep7bit_p = FALSE;
10905 else if (!NIL_P(repl)) {
10906 rep = RSTRING_PTR(repl);
10907 replen = RSTRING_LEN(repl);
10908 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10910 else if (encidx == rb_utf8_encindex()) {
10911 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10912 rep7bit_p = FALSE;
10914 else {
10915 DEFAULT_REPLACE_CHAR("?");
10916 rep7bit_p = TRUE;
/* start from 7BIT; raised to VALID once a multibyte character or non-7-bit replacement is seen */
10918 cr = ENC_CODERANGE_7BIT;
10920 p = search_nonascii(p, e);
10921 if (!p) {
10922 p = e;
10924 while (p < e) {
10925 int ret = rb_enc_precise_mbclen(p, e, enc);
/* a character cut off by the end of str: leave the loop, the tail is handled after it */
10926 if (MBCLEN_NEEDMORE_P(ret)) {
10927 break;
10929 else if (MBCLEN_CHARFOUND_P(ret)) {
10930 cr = ENC_CODERANGE_VALID;
10931 p += MBCLEN_CHARFOUND_LEN(ret);
10933 else if (MBCLEN_INVALID_P(ret)) {
10935 * p1~p: valid ascii/multibyte chars
10936 * p ~e: invalid bytes + unknown bytes
10938 long clen = rb_enc_mbmaxlen(enc);
10939 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
/* copy the valid run collected since the previous replacement */
10940 if (p > p1) {
10941 rb_str_buf_cat(buf, p1, p - p1);
/* work out clen, the number of bytes replaced as one invalid sequence */
10944 if (e - p < clen) clen = e - p;
10945 if (clen <= 2) {
10946 clen = 1;
10948 else {
10949 const char *q = p;
10950 clen--;
10951 for (; clen > 1; clen--) {
10952 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10953 if (MBCLEN_NEEDMORE_P(ret)) break;
10954 if (MBCLEN_INVALID_P(ret)) continue;
10955 UNREACHABLE;
10958 if (rep) {
10959 rb_str_buf_cat(buf, rep, replen);
10960 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10962 else {
/* no fixed replacement: hand the invalid bytes to the block and validate what it returns */
10963 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10964 str_mod_check(str, sp, slen);
10965 repl = str_compat_and_valid(repl, enc);
10966 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10967 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10968 cr = ENC_CODERANGE_VALID;
10970 p += clen;
10971 p1 = p;
10972 p = search_nonascii(p, e);
10973 if (!p) {
10974 p = e;
10975 break;
10978 else {
10979 UNREACHABLE;
10982 if (NIL_P(buf)) {
/* nothing was replaced and the whole string was consumed: record the coderange and report "no change" */
10983 if (p == e) {
10984 ENC_CODERANGE_SET(str, cr);
10985 return Qnil;
10987 buf = rb_str_buf_new(RSTRING_LEN(str));
10989 if (p1 < p) {
10990 rb_str_buf_cat(buf, p1, p - p1);
/* bytes left after the loop (it stopped on NEEDMORE) are replaced as one unit */
10992 if (p < e) {
10993 if (rep) {
10994 rb_str_buf_cat(buf, rep, replen);
10995 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10997 else {
10998 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10999 str_mod_check(str, sp, slen);
11000 repl = str_compat_and_valid(repl, enc);
11001 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11002 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11003 cr = ENC_CODERANGE_VALID;
11007 else {
11008 /* ASCII incompatible */
/* same scan as above, but stepping in units of mbminlen bytes and without the ASCII fast path */
11009 long mbminlen = rb_enc_mbminlen(enc);
11010 if (!replen) {
11011 rep = NULL;
11013 else if (!NIL_P(repl)) {
11014 rep = RSTRING_PTR(repl);
11015 replen = RSTRING_LEN(repl);
11017 else if (encidx == ENCINDEX_UTF_16BE) {
11018 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11020 else if (encidx == ENCINDEX_UTF_16LE) {
11021 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11023 else if (encidx == ENCINDEX_UTF_32BE) {
11024 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11026 else if (encidx == ENCINDEX_UTF_32LE) {
11027 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11029 else {
11030 DEFAULT_REPLACE_CHAR("?");
11033 while (p < e) {
11034 int ret = rb_enc_precise_mbclen(p, e, enc);
11035 if (MBCLEN_NEEDMORE_P(ret)) {
11036 break;
11038 else if (MBCLEN_CHARFOUND_P(ret)) {
11039 p += MBCLEN_CHARFOUND_LEN(ret);
11041 else if (MBCLEN_INVALID_P(ret)) {
11042 const char *q = p;
11043 long clen = rb_enc_mbmaxlen(enc);
11044 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11045 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
/* work out clen, the number of bytes replaced as one invalid sequence */
11047 if (e - p < clen) clen = e - p;
11048 if (clen <= mbminlen * 2) {
11049 clen = mbminlen;
11051 else {
11052 clen -= mbminlen;
11053 for (; clen > mbminlen; clen-=mbminlen) {
11054 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11055 if (MBCLEN_NEEDMORE_P(ret)) break;
11056 if (MBCLEN_INVALID_P(ret)) continue;
11057 UNREACHABLE;
11060 if (rep) {
11061 rb_str_buf_cat(buf, rep, replen);
11063 else {
11064 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11065 str_mod_check(str, sp, slen);
11066 repl = str_compat_and_valid(repl, enc);
11067 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11069 p += clen;
11070 p1 = p;
11072 else {
11073 UNREACHABLE;
11076 if (NIL_P(buf)) {
11077 if (p == e) {
11078 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
11079 return Qnil;
11081 buf = rb_str_buf_new(RSTRING_LEN(str));
11083 if (p1 < p) {
11084 rb_str_buf_cat(buf, p1, p - p1);
/* bytes left after the loop (it stopped on NEEDMORE) are replaced as one unit */
11086 if (p < e) {
11087 if (rep) {
11088 rb_str_buf_cat(buf, rep, replen);
11090 else {
11091 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11092 str_mod_check(str, sp, slen);
11093 repl = str_compat_and_valid(repl, enc);
11094 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11097 cr = ENC_CODERANGE_VALID;
/* tag the rebuilt string with enc and the coderange established above */
11099 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11100 return buf;
11104 * call-seq:
11105 * str.scrub -> new_str
11106 * str.scrub(repl) -> new_str
11107 * str.scrub{|bytes|} -> new_str
11109 * If the string contains invalid byte sequences, returns a copy in which they are
11110 * replaced with the given replacement character; otherwise returns a copy of self.
11111 * If block is given, replace invalid bytes with returned value of the block.
11113 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11114 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11115 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11117 static VALUE
11118 str_scrub(int argc, VALUE *argv, VALUE str)
11120 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11121 VALUE new = rb_str_scrub(str, repl);
11122 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11126 * call-seq:
11127 * str.scrub! -> str
11128 * str.scrub!(repl) -> str
11129 * str.scrub!{|bytes|} -> str
11131 * If the string contains invalid byte sequences, replaces them in place with the given
11132 * replacement character; otherwise leaves the string unchanged. Always returns self.
11133 * If block is given, replace invalid bytes with returned value of the block.
11135 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11136 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11137 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11139 static VALUE
11140 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11142 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11143 VALUE new = rb_str_scrub(str, repl);
11144 if (!NIL_P(new)) rb_str_replace(str, new);
11145 return str;
/*
 * Method IDs and receiver used by the unicode_normalize* methods below:
 * unicode_normalize_common() calls mUnicodeNormalize with one of the two IDs.
 * NOTE(review): they are not assigned anywhere in this part of the file;
 * presumably that happens in the init code -- confirm.
 */
11148 static ID id_normalize;
11149 static ID id_normalized_p;
11150 static VALUE mUnicodeNormalize;
11152 static VALUE
11153 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11155 static int UnicodeNormalizeRequired = 0;
11156 VALUE argv2[2];
11158 if (!UnicodeNormalizeRequired) {
11159 rb_require("unicode_normalize/normalize.rb");
11160 UnicodeNormalizeRequired = 1;
11162 argv2[0] = str;
11163 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11164 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11168 * call-seq:
11169 * str.unicode_normalize(form=:nfc)
11171 * Unicode Normalization---Returns a normalized form of +str+,
11172 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11173 * The normalization form used is determined by +form+, which can
11174 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11175 * The default is +:nfc+.
11177 * If the string is not in a Unicode Encoding, then an Exception is raised.
11178 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11179 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11180 * Anything other than UTF-8 is implemented by converting to UTF-8,
11181 * which makes it slower than UTF-8.
11183 * "a\u0300".unicode_normalize #=> "\u00E0"
11184 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11185 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11186 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11187 * #=> Encoding::CompatibilityError raised
11189 static VALUE
11190 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11192 return unicode_normalize_common(argc, argv, str, id_normalize);
11196 * call-seq:
11197 * str.unicode_normalize!(form=:nfc)
11199 * Destructive version of String#unicode_normalize, doing Unicode
11200 * normalization in place.
11202 static VALUE
11203 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11205 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11208 /* call-seq:
11209 * str.unicode_normalized?(form=:nfc)
11211 * Checks whether +str+ is in Unicode normalization form +form+,
11212 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11213 * The default is +:nfc+.
11215 * If the string is not in a Unicode Encoding, then an Exception is raised.
11216 * For details, see String#unicode_normalize.
11218 * "a\u0300".unicode_normalized? #=> false
11219 * "a\u0300".unicode_normalized?(:nfd) #=> true
11220 * "\u00E0".unicode_normalized? #=> true
11221 * "\u00E0".unicode_normalized?(:nfd) #=> false
11222 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11223 * #=> Encoding::CompatibilityError raised
11225 static VALUE
11226 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11228 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11231 /**********************************************************************
11232 * Document-class: Symbol
11234 * Symbol objects represent named identifiers inside the Ruby interpreter.
11236 * You can create a \Symbol object explicitly with:
11238 * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11240 * The same Symbol object will be
11241 * created for a given name or string for the duration of a program's
11242 * execution, regardless of the context or meaning of that name. Thus
11243 * if <code>Fred</code> is a constant in one context, a method in
11244 * another, and a class in a third, the Symbol <code>:Fred</code>
11245 * will be the same object in all three contexts.
11247 * module One
11248 * class Fred
11249 * end
11250 * $f1 = :Fred
11251 * end
11252 * module Two
11253 * Fred = 1
11254 * $f2 = :Fred
11255 * end
11256 * def Fred()
11257 * end
11258 * $f3 = :Fred
11259 * $f1.object_id #=> 2514190
11260 * $f2.object_id #=> 2514190
11261 * $f3.object_id #=> 2514190
11263 * Constant, method, and variable names are returned as symbols:
11265 * module One
11266 * Two = 2
11267 * def three; 3 end
11268 * @four = 4
11269 * @@five = 5
11270 * $six = 6
11271 * end
11272 * seven = 7
11274 * One.constants
11275 * # => [:Two]
11276 * One.instance_methods(true)
11277 * # => [:three]
11278 * One.instance_variables
11279 * # => [:@four]
11280 * One.class_variables
11281 * # => [:@@five]
11282 * global_variables.grep(/six/)
11283 * # => [:$six]
11284 * local_variables
11285 * # => [:seven]
11287 * Symbol objects are different from String objects in that
11288 * Symbol objects represent identifiers, while String objects
11289 * represent text or data.
11291 * == What's Here
11293 * First, what's elsewhere. \Class \Symbol:
11295 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11296 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11298 * Here, class \Symbol provides methods that are useful for:
11300 * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11301 * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11302 * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11304 * === Methods for Querying
11306 * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11307 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11308 * in symbol that matches a given Regexp
11309 * or other object; returns +nil+ if no match is found.
11310 * - #[], #slice :: Returns a substring of symbol
11311 * determined by a given index, start/length, or range, or string.
11312 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11313 * - #encoding:: Returns the Encoding object that represents the encoding
11314 * of symbol.
11315 * - #end_with?:: Returns +true+ if symbol ends with
11316 * any of the given strings.
11317 * - #match:: Returns a MatchData object if symbol
11318 * matches a given Regexp; +nil+ otherwise.
11319 * - #match?:: Returns +true+ if symbol
11320 * matches a given Regexp; +false+ otherwise.
11321 * - #length, #size:: Returns the number of characters in symbol.
11322 * - #start_with?:: Returns +true+ if symbol starts with
11323 * any of the given strings.
11325 * === Methods for Comparing
11327 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11328 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11329 * has the same content and encoding.
11330 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11331 * symbol is smaller than, equal to, or larger than symbol.
11332 * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11333 * after Unicode case folding; +false+ otherwise.
11335 * === Methods for Converting
11337 * - #capitalize:: Returns symbol with the first character upcased
11338 * and all other characters downcased.
11339 * - #downcase:: Returns symbol with all characters downcased.
11340 * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11341 * - #name:: Returns the frozen string corresponding to symbol.
11342 * - #succ, #next:: Returns the symbol that is the successor to symbol.
11343 * - #swapcase:: Returns symbol with all upcase characters downcased
11344 * and all downcase characters upcased.
11345 * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11346 * - #to_s, #id2name:: Returns the string corresponding to +self+.
11347 * - #to_sym, #intern:: Returns +self+.
11348 * - #upcase:: Returns symbol with all characters upcased.
/*
 * call-seq:
 *   sym == obj   -> true or false
 *
 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
 * symbol, returns <code>true</code>.
 */

#define sym_equal rb_obj_equal /* alias only: the comparison itself lives in rb_obj_equal */
11363 static int
11364 sym_printable(const char *s, const char *send, rb_encoding *enc)
11366 while (s < send) {
11367 int n;
11368 int c = rb_enc_precise_mbclen(s, send, enc);
11370 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11371 n = MBCLEN_CHARFOUND_LEN(c);
11372 c = rb_enc_mbc_to_codepoint(s, send, enc);
11373 if (!rb_enc_isprint(c, enc)) return FALSE;
11374 s += n;
11376 return TRUE;
11380 rb_str_symname_p(VALUE sym)
11382 rb_encoding *enc;
11383 const char *ptr;
11384 long len;
11385 rb_encoding *resenc = rb_default_internal_encoding();
11387 if (resenc == NULL) resenc = rb_default_external_encoding();
11388 enc = STR_ENC_GET(sym);
11389 ptr = RSTRING_PTR(sym);
11390 len = RSTRING_LEN(sym);
11391 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11392 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11393 return FALSE;
11395 return TRUE;
11398 VALUE
11399 rb_str_quote_unprintable(VALUE str)
11401 rb_encoding *enc;
11402 const char *ptr;
11403 long len;
11404 rb_encoding *resenc;
11406 Check_Type(str, T_STRING);
11407 resenc = rb_default_internal_encoding();
11408 if (resenc == NULL) resenc = rb_default_external_encoding();
11409 enc = STR_ENC_GET(str);
11410 ptr = RSTRING_PTR(str);
11411 len = RSTRING_LEN(str);
11412 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11413 !sym_printable(ptr, ptr + len, enc)) {
11414 return rb_str_escape(str);
11416 return str;
11419 MJIT_FUNC_EXPORTED VALUE
11420 rb_id_quote_unprintable(ID id)
11422 VALUE str = rb_id2str(id);
11423 if (!rb_str_symname_p(str)) {
11424 return rb_str_escape(str);
11426 return str;
11430 * call-seq:
11431 * sym.inspect -> string
11433 * Returns the representation of <i>sym</i> as a symbol literal.
11435 * :fred.inspect #=> ":fred"
11438 static VALUE
11439 sym_inspect(VALUE sym)
11441 VALUE str = rb_sym2str(sym);
11442 const char *ptr;
11443 long len;
11444 char *dest;
11446 if (!rb_str_symname_p(str)) {
11447 str = rb_str_inspect(str);
11448 len = RSTRING_LEN(str);
11449 rb_str_resize(str, len + 1);
11450 dest = RSTRING_PTR(str);
11451 memmove(dest + 1, dest, len);
11453 else {
11454 rb_encoding *enc = STR_ENC_GET(str);
11455 RSTRING_GETMEM(str, ptr, len);
11456 str = rb_enc_str_new(0, len + 1, enc);
11457 dest = RSTRING_PTR(str);
11458 memcpy(dest + 1, ptr, len);
11460 dest[0] = ':';
11461 return str;
#if 0 /* for RDoc */
/*
 *  call-seq:
 *     sym.name    -> string
 *
 *  Returns the name or string corresponding to <i>sym</i>.  Unlike #to_s, the
 *  returned string is frozen.
 *
 *     :fred.name         #=> "fred"
 *     :fred.name.frozen? #=> true
 *     :fred.to_s         #=> "fred"
 *     :fred.to_s.frozen? #=> false
 */
VALUE
rb_sym2str(VALUE sym)
{
    /* Never compiled (inside #if 0): this stub only carries the comment above. */
}

#endif
11486 * call-seq:
11487 * sym.id2name -> string
11488 * sym.to_s -> string
11490 * Returns the name or string corresponding to <i>sym</i>.
11492 * :fred.id2name #=> "fred"
11493 * :ginger.to_s #=> "ginger"
11495 * Note that this string is not frozen (unlike the symbol itself).
11496 * To get a frozen string, use #name.
11500 VALUE
11501 rb_sym_to_s(VALUE sym)
11503 return str_new_shared(rb_cString, rb_sym2str(sym));
/*
 * call-seq:
 *   sym.to_sym   -> sym
 *   sym.intern   -> sym
 *
 * In general, <code>to_sym</code> returns the Symbol corresponding
 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
 * in this case.
 */

static VALUE
sym_to_sym(VALUE sym)
{
    /* Identity: the receiver is returned untouched. */
    return sym;
}
11523 MJIT_FUNC_EXPORTED VALUE
11524 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11526 VALUE obj;
11528 if (argc < 1) {
11529 rb_raise(rb_eArgError, "no receiver given");
11531 obj = argv[0];
11532 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
#if 0
/*
 * call-seq:
 *   sym.to_proc
 *
 * Returns a _Proc_ object which responds to the method named by _sym_.
 *
 *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
 */

VALUE
rb_sym_to_proc(VALUE sym)
{
    /* Never compiled (inside #if 0): this stub only carries the comment above. */
}
#endif
11552 * call-seq:
11554 * sym.succ
11556 * Same as <code>sym.to_s.succ.intern</code>.
11559 static VALUE
11560 sym_succ(VALUE sym)
11562 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11566 * call-seq:
11568 * symbol <=> other_symbol -> -1, 0, +1, or nil
11570 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11571 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11572 * less than, equal to, or greater than +other_symbol+.
11574 * +nil+ is returned if the two values are incomparable.
11576 * See String#<=> for more information.
11579 static VALUE
11580 sym_cmp(VALUE sym, VALUE other)
11582 if (!SYMBOL_P(other)) {
11583 return Qnil;
11585 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11589 * call-seq:
11590 * casecmp(other_symbol) -> -1, 0, 1, or nil
11592 * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11594 * :aBcDeF.casecmp(:abcde) # => 1
11595 * :aBcDeF.casecmp(:abcdef) # => 0
11596 * :aBcDeF.casecmp(:abcdefg) # => -1
11597 * :abcdef.casecmp(:ABCDEF) # => 0
11599 * Returns +nil+ if the two symbols have incompatible encodings,
11600 * or if +other_symbol+ is not a symbol:
11602 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11603 * other_sym = :"\u{c4 d6 dc}"
11604 * sym.casecmp(other_sym) # => nil
11605 * :foo.casecmp(2) # => nil
11607 * Currently, case-insensitivity only works on characters A-Z/a-z,
11608 * not all of Unicode. This is different from Symbol#casecmp?.
11610 * Related: Symbol#casecmp?.
11614 static VALUE
11615 sym_casecmp(VALUE sym, VALUE other)
11617 if (!SYMBOL_P(other)) {
11618 return Qnil;
11620 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11624 * call-seq:
11625 * casecmp?(other_symbol) -> true, false, or nil
11627 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11628 * Unicode case folding, +false+ if they are not equal:
11630 * :aBcDeF.casecmp?(:abcde) # => false
11631 * :aBcDeF.casecmp?(:abcdef) # => true
11632 * :aBcDeF.casecmp?(:abcdefg) # => false
11633 * :abcdef.casecmp?(:ABCDEF) # => true
11634 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11636 * Returns +nil+ if the two symbols have incompatible encodings,
11637 * or if +other_symbol+ is not a symbol:
11639 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11640 * other_sym = :"\u{c4 d6 dc}"
11641 * sym.casecmp?(other_sym) # => nil
11642 * :foo.casecmp?(2) # => nil
11644 * See {Case Mapping}[doc/case_mapping_rdoc.html].
11646 * Related: Symbol#casecmp.
11650 static VALUE
11651 sym_casecmp_p(VALUE sym, VALUE other)
11653 if (!SYMBOL_P(other)) {
11654 return Qnil;
11656 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11660 * call-seq:
11661 * sym =~ obj -> integer or nil
11663 * Returns <code>sym.to_s =~ obj</code>.
11666 static VALUE
11667 sym_match(VALUE sym, VALUE other)
11669 return rb_str_match(rb_sym2str(sym), other);
11673 * call-seq:
11674 * sym.match(pattern) -> matchdata or nil
11675 * sym.match(pattern, pos) -> matchdata or nil
11677 * Returns <code>sym.to_s.match</code>.
11680 static VALUE
11681 sym_match_m(int argc, VALUE *argv, VALUE sym)
11683 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11687 * call-seq:
11688 * sym.match?(pattern) -> true or false
11689 * sym.match?(pattern, pos) -> true or false
11691 * Returns <code>sym.to_s.match?</code>.
11694 static VALUE
11695 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11697 return rb_str_match_m_p(argc, argv, sym);
11701 * call-seq:
11702 * sym[idx] -> char
11703 * sym[b, n] -> string
11704 * sym.slice(idx) -> char
11705 * sym.slice(b, n) -> string
11707 * Returns <code>sym.to_s[]</code>.
11710 static VALUE
11711 sym_aref(int argc, VALUE *argv, VALUE sym)
11713 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11717 * call-seq:
11718 * sym.length -> integer
11719 * sym.size -> integer
11721 * Same as <code>sym.to_s.length</code>.
11724 static VALUE
11725 sym_length(VALUE sym)
11727 return rb_str_length(rb_sym2str(sym));
11731 * call-seq:
11732 * sym.empty? -> true or false
11734 * Returns whether _sym_ is :"" or not.
11737 static VALUE
11738 sym_empty(VALUE sym)
11740 return rb_str_empty(rb_sym2str(sym));
11744 * call-seq:
11745 * upcase(*options) -> symbol
11747 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11749 * See String#upcase.
11753 static VALUE
11754 sym_upcase(int argc, VALUE *argv, VALUE sym)
11756 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11760 * call-seq:
11761 * downcase(*options) -> symbol
11763 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11765 * See String#downcase.
11767 * Related: Symbol#upcase.
11771 static VALUE
11772 sym_downcase(int argc, VALUE *argv, VALUE sym)
11774 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11778 * call-seq:
11779 * capitalize(*options) -> symbol
11781 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11783 * See String#capitalize.
11787 static VALUE
11788 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11790 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11794 * call-seq:
11795 * swapcase(*options) -> symbol
11797 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11799 * See String#swapcase.
11803 static VALUE
11804 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11806 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11810 * call-seq:
11811 * sym.start_with?([prefixes]+) -> true or false
11813 * Returns true if +sym+ starts with one of the +prefixes+ given.
11814 * Each of the +prefixes+ should be a String or a Regexp.
11816 * :hello.start_with?("hell") #=> true
11817 * :hello.start_with?(/H/i) #=> true
11819 * # returns true if one of the prefixes matches.
11820 * :hello.start_with?("heaven", "hell") #=> true
11821 * :hello.start_with?("heaven", "paradise") #=> false
11824 static VALUE
11825 sym_start_with(int argc, VALUE *argv, VALUE sym)
11827 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11831 * call-seq:
11832 * sym.end_with?([suffixes]+) -> true or false
11834 * Returns true if +sym+ ends with one of the +suffixes+ given.
11836 * :hello.end_with?("ello") #=> true
11838 * # returns true if one of the +suffixes+ matches.
11839 * :hello.end_with?("heaven", "ello") #=> true
11840 * :hello.end_with?("heaven", "paradise") #=> false
11843 static VALUE
11844 sym_end_with(int argc, VALUE *argv, VALUE sym)
11846 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11850 * call-seq:
11851 * sym.encoding -> encoding
11853 * Returns the Encoding object that represents the encoding of _sym_.
11856 static VALUE
11857 sym_encoding(VALUE sym)
11859 return rb_obj_encoding(rb_sym2str(sym));
11862 static VALUE
11863 string_for_symbol(VALUE name)
11865 if (!RB_TYPE_P(name, T_STRING)) {
11866 VALUE tmp = rb_check_string_type(name);
11867 if (NIL_P(tmp)) {
11868 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11869 name);
11871 name = tmp;
11873 return name;
11877 rb_to_id(VALUE name)
11879 if (SYMBOL_P(name)) {
11880 return SYM2ID(name);
11882 name = string_for_symbol(name);
11883 return rb_intern_str(name);
11886 VALUE
11887 rb_to_symbol(VALUE name)
11889 if (SYMBOL_P(name)) {
11890 return name;
11892 name = string_for_symbol(name);
11893 return rb_str_intern(name);
11897 * call-seq:
11898 * Symbol.all_symbols => array
11900 * Returns an array of all the symbols currently in Ruby's symbol
11901 * table.
11903 * Symbol.all_symbols.size #=> 903
11904 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11905 * :chown, :EOFError, :$;, :String,
11906 * :LOCK_SH, :"setuid?", :$<,
11907 * :default_proc, :compact, :extend,
11908 * :Tms, :getwd, :$=, :ThreadGroup,
11909 * :wait2, :$>]
11912 static VALUE
11913 sym_all_symbols(VALUE _)
11915 return rb_sym_all_symbols();
11918 VALUE
11919 rb_str_to_interned_str(VALUE str)
11921 return rb_fstring(str);
11924 VALUE
11925 rb_interned_str(const char *ptr, long len)
11927 struct RString fake_str;
11928 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11931 VALUE
11932 rb_interned_str_cstr(const char *ptr)
11934 return rb_interned_str(ptr, strlen(ptr));
11937 VALUE
11938 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11940 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11941 rb_enc_autoload(enc);
11944 struct RString fake_str;
11945 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11948 VALUE
11949 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
11951 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11955 * A \String object has an arbitrary sequence of bytes,
11956 * typically representing text or binary data.
11957 * A \String object may be created using String::new or as literals.
11959 * String objects differ from Symbol objects in that Symbol objects are
11960 * designed to be used as identifiers, instead of text or data.
11962 * You can create a \String object explicitly with:
11964 * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11965 * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11967 * You can convert certain objects to Strings with:
11969 * - \Method {String}[Kernel.html#method-i-String].
11971 * Some \String methods modify +self+.
11972 * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11973 * and returns +self+;
11974 * often a similarly named method (without the <tt>!</tt>)
11975 * returns a new string.
11977 * In general, if there exist both bang and non-bang versions of a method,
11978 * the bang version mutates +self+ and the non-bang version does not.
11979 * However, a method without a bang can also mutate, such as String#replace.
11981 * == Substitution Methods
11983 * These methods perform substitutions:
11985 * - String#sub: One substitution (or none); returns a new string.
11986 * - String#sub!: One substitution (or none); returns +self+.
11987 * - String#gsub: Zero or more substitutions; returns a new string.
11988 * - String#gsub!: Zero or more substitutions; returns +self+.
11990 * Each of these methods takes:
11992 * - A first argument, +pattern+ (string or regexp),
11993 * that specifies the substring(s) to be replaced.
11995 * - Either of these:
11997 * - A second argument, +replacement+ (string or hash),
11998 * that determines the replacing string.
11999 * - A block that will determine the replacing string.
12001 * The examples in this section mostly use methods String#sub and String#gsub;
12002 * the principles illustrated apply to all four substitution methods.
12004 * <b>Argument +pattern+</b>
12006 * Argument +pattern+ is commonly a regular expression:
12008 * s = 'hello'
12009 * s.sub(/[aeiou]/, '*') # => "h*llo"
12010 * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12011 * s.gsub(/[aeiou]/, '') # => "hll"
12012 * s.sub(/ell/, 'al') # => "halo"
12013 * s.gsub(/xyzzy/, '*') # => "hello"
12014 * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12016 * When +pattern+ is a string, all its characters are treated
12017 * as ordinary characters (not as regexp special characters):
12019 * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12021 * <b>\String +replacement+</b>
12023 * If +replacement+ is a string, that string will determine
12024 * the replacing string that is to be substituted for the matched text.
12026 * Each of the examples above uses a simple string as the replacing string.
12028 * \String +replacement+ may contain back-references to the pattern's captures:
12030 * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12031 * - <tt>\k<name></tt> refers to the named capture +name+.
12033 * See rdoc-ref:regexp.rdoc for details.
12035 * Note that within the string +replacement+, a character combination
12036 * such as <tt>$&</tt> is treated as ordinary text, and not as
12037 * a special match variable.
12038 * However, you may refer to some special match variables using these
12039 * combinations:
12041 * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12042 * which contains the complete matched text.
12043 * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12044 * which contains string after match.
12045 * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12046 * which contains string before match.
12047 * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12048 * which contains last capture group.
12050 * See rdoc-ref:regexp.rdoc for details.
12052 * Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12054 * Note also that a string literal consumes backslashes.
12055 * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12057 * A back-reference is typically preceded by an additional backslash.
12058 * For example, if you want to write a back-reference <tt>\&</tt> in
12059 * +replacement+ with a double-quoted string literal, you need to write
12060 * <tt>"..\\\\&.."</tt>.
12062 * If you want to write a non-back-reference string <tt>\&</tt> in
12063 * +replacement+, you need first to escape the backslash to prevent
12064 * this method from interpreting it as a back-reference, and then you
12065 * need to escape the backslashes again to prevent a string literal from
12066 * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12068 * You may want to use the block form to avoid a lot of backslashes.
12070 * <b>\Hash +replacement+</b>
12072 * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12073 * the replacing string is the value for that key:
12075 * h = {'foo' => 'bar', 'baz' => 'bat'}
12076 * 'food'.sub('foo', h) # => "bard"
12078 * Note that a symbol key does not match:
12080 * h = {foo: 'bar', baz: 'bat'}
12081 * 'food'.sub('foo', h) # => "d"
12083 * <b>Block</b>
12085 * In the block form, the current match string is passed to the block;
12086 * the block's return value becomes the replacing string:
12088 * s = '@'
12089 * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12091 * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12092 * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12095 * == What's Here
12097 * First, what's elsewhere. \Class \String:
12099 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12100 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12102 * Here, class \String provides methods that are useful for:
12104 * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12105 * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12106 * - {Querying}[#class-String-label-Methods+for+Querying]
12107 * - {Comparing}[#class-String-label-Methods+for+Comparing]
12108 * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12109 * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12110 * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12111 * - {Iterating}[#class-String-label-Methods+for+Iterating]
12113 * === Methods for Creating a \String
12115 * - ::new:: Returns a new string.
12116 * - ::try_convert:: Returns a new string created from a given object.
12118 * === Methods for a Frozen/Unfrozen String
12120 * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12121 * +self+, if not frozen; +self.dup+ otherwise.
12122 * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12123 * +self+, if already frozen; +self.freeze+ otherwise.
12124 * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12126 * === Methods for Querying
12128 * _Counts_
12130 * - #length, #size:: Returns the count of characters (not bytes).
12131 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12132 * - #bytesize:: Returns the count of bytes.
12133 * - #count:: Returns the count of substrings matching given strings.
12135 * _Substrings_
12137 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12138 * returns +nil+ if no match is found.
12139 * - #index:: Returns the index of the _first_ occurrence of a given substring;
12140 * returns +nil+ if none found.
12141 * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12142 * returns +nil+ if none found.
12143 * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12144 * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12145 * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12146 * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12147 * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12149 * _Encodings_
12151 * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12152 * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12153 * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12154 * for its encoding.
12155 * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12157 * _Other_
12159 * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12160 * - #hash:: Returns the integer hash code.
12162 * === Methods for Comparing
12164 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12165 * - #eql?:: Returns +true+ if the content is the same as the given other string.
12166 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12167 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as
12168 * +self+ is smaller than, equal to, or larger than a given other string.
12169 * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12170 * +false+ otherwise.
12172 * === Methods for Modifying a \String
12174 * Each of these methods modifies +self+.
12176 * _Insertion_
12178 * - #insert:: Returns +self+ with a given string inserted at a given offset.
12179 * - #<<:: Returns +self+ concatenated with a given string or integer.
12181 * _Substitution_
12183 * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12184 * returns +self+ if any changes, +nil+ otherwise.
12185 * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12186 * returns +self+ if any changes, +nil+ otherwise.
12187 * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12188 * - #replace:: Returns +self+ with its entire content replaced by a given string.
12189 * - #reverse!:: Returns +self+ with its characters in reverse order.
12190 * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12191 * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12192 * returns +self+ if any changes, +nil+ otherwise.
12193 * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12194 * removing duplicates from the substrings that were modified;
12195 * returns +self+ if any changes, +nil+ otherwise.
12197 * _Casing_
12199 * - #capitalize!:: Upcases the initial character and downcases all others;
12200 * returns +self+ if any changes, +nil+ otherwise.
12201 * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12202 * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12203 * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12204 * returns +self+ if any changes, +nil+ otherwise.
12206 * _Encoding_
12208 * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12209 * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12210 * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12211 * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12213 * _Deletion_
12215 * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12216 * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12217 * - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12218 * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12219 * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12220 * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12221 * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12222 * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12223 * - #chop!:: Removes trailing newline characters if found; otherwise removes the last character;
12224 * returns +self+ if any changes, +nil+ otherwise.
12226 * === Methods for Converting to New \String
12228 * Each of these methods returns a new \String based on +self+,
12229 * often just a modified copy of +self+.
12231 * _Extension_
12233 * - #*:: Returns the concatenation of multiple copies of +self+.
12234 * - #+:: Returns the concatenation of +self+ and a given other string.
12235 * - #center:: Returns a copy of +self+ centered between pad substring.
12236 * - #concat:: Returns the concatenation of +self+ with given other strings.
12237 * - #prepend:: Returns the concatenation of a given other string with +self+.
12238 * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12239 * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12241 * _Encoding_
12243 * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12244 * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12245 * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12246 * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12248 * _Substitution_
12250 * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by \xHH notation
12251 * and all special characters escaped.
12252 * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12253 * and all escaped characters unescaped.
12254 * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12255 * replaced with a given replacement string.
12256 * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12257 * replaced with a given replacement string.
12258 * - #succ, #next:: Returns the string that is the successor to +self+.
12259 * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12260 * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12261 * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12262 * removing duplicates from the substrings that were modified.
12263 * - #%:: Returns the string resulting from formatting a given object into +self+.
12265 * _Casing_
12267 * - #capitalize:: Returns a copy of +self+ with the first character upcased
12268 * and all other characters downcased.
12269 * - #downcase:: Returns a copy of +self+ with all characters downcased.
12270 * - #upcase:: Returns a copy of +self+ with all characters upcased.
12271 * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12272 * and all downcase characters upcased.
12274 * _Deletion_
12276 * - #delete:: Returns a copy of +self+ with characters removed.
12277 * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12278 * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12279 * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12280 * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12281 * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12282 * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12283 * - #chop:: Returns a copy of +self+ with trailing newline characters or the last character removed.
12284 * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12285 * - #[], #slice:: Returns a substring determined by a given index, start/length, or range, or string.
12286 * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12287 * - #chr:: Returns the first character.
12289 * _Duplication_
12291 * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12292 * otherwise, returns +self+.
12294 * === Methods for Converting to Non-\String
12296 * Each of these methods converts the contents of +self+ to a non-\String.
12298 * <em>Characters, Bytes, and Clusters</em>
12300 * - #bytes:: Returns an array of the bytes in +self+.
12301 * - #chars:: Returns an array of the characters in +self+.
12302 * - #codepoints:: Returns an array of the integer ordinals in +self+.
12303 * - #getbyte:: Returns an integer byte as determined by a given index.
12304 * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12306 * _Splitting_
12308 * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12309 * - #partition:: Returns a 3-element array determined by the first substring that matches
12310 * a given substring or regexp.
12311 * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12312 * a given substring or regexp.
12313 * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12314 * or, if a block given, passes those substrings to the block.
12316 * _Matching_
12318 * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12319 * if a block given, passes each matching substring to the block.
12320 * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12321 * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12323 * _Numerics_
12325 * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12326 * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12327 * - #ord:: Returns the integer ordinal of the first character in +self+.
12328 * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12329 * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12331 * <em>Strings and Symbols</em>
12333 * - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12334 * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12336 * === Methods for Iterating
12338 * - #each_byte:: Calls the given block with each successive byte in +self+.
12339 * - #each_char:: Calls the given block with each successive character in +self+.
12340 * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12341 * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12342 * - #each_line:: Calls the given block with each successive line in +self+,
12343 * as determined by a given record separator.
12344 * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12347 void
12348 Init_String(void)
12350 rb_cString = rb_define_class("String", rb_cObject);
12351 assert(rb_vm_fstring_table());
12352 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12353 rb_include_module(rb_cString, rb_mComparable);
12354 rb_define_alloc_func(rb_cString, empty_str_alloc);
12355 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12356 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12357 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12358 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12359 rb_define_method(rb_cString, "==", rb_str_equal, 1);
12360 rb_define_method(rb_cString, "===", rb_str_equal, 1);
12361 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12362 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12363 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12364 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12365 rb_define_method(rb_cString, "+", rb_str_plus, 1);
12366 rb_define_method(rb_cString, "*", rb_str_times, 1);
12367 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12368 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12369 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12370 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12371 rb_define_method(rb_cString, "length", rb_str_length, 0);
12372 rb_define_method(rb_cString, "size", rb_str_length, 0);
12373 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12374 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12375 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12376 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12377 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12378 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12379 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12380 rb_define_method(rb_cString, "next", rb_str_succ, 0);
12381 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12382 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12383 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12384 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12385 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12386 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12387 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12388 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12389 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12390 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12391 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12392 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12393 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12394 rb_define_method(rb_cString, "+@", str_uplus, 0);
12395 rb_define_method(rb_cString, "-@", str_uminus, 0);
12397 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12398 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12399 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12400 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12401 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12402 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12403 rb_define_method(rb_cString, "undump", str_undump, 0);
12405 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12406 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12407 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12408 sym_fold = ID2SYM(rb_intern_const("fold"));
12410 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12411 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12412 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12413 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12415 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12416 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12417 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12418 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12420 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12421 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12422 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12423 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12424 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12425 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12426 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12427 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12428 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12429 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12430 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12431 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12432 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12433 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12434 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12435 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12436 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12438 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12439 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12440 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12442 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12444 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12445 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12446 rb_define_method(rb_cString, "center", rb_str_center, -1);
12448 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12449 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12450 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12451 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12452 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12453 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12454 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12455 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12456 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12458 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12459 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12460 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12461 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12462 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12463 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12464 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12465 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12466 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12468 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12469 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12470 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12471 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12472 rb_define_method(rb_cString, "count", rb_str_count, -1);
12474 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12475 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12476 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12477 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12479 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12480 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12481 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12482 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12483 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12485 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12487 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12488 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12490 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12491 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12493 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12494 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12495 rb_define_method(rb_cString, "b", rb_str_b, 0);
12496 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12497 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12499 /* define UnicodeNormalize module here so that we don't have to look it up */
12500 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12501 id_normalize = rb_intern_const("normalize");
12502 id_normalized_p = rb_intern_const("normalized?");
12504 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12505 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12506 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12508 rb_fs = Qnil;
12509 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12510 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12511 rb_gc_register_address(&rb_fs);
12513 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12514 rb_include_module(rb_cSymbol, rb_mComparable);
12515 rb_undef_alloc_func(rb_cSymbol);
12516 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12517 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12519 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12520 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12521 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12522 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12523 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12524 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0);
12525 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12526 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12527 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12528 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12529 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12531 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12532 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12533 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12534 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12536 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12537 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12538 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12539 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12540 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12541 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12542 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12544 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12545 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12546 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12547 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12549 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12550 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12552 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);