[rubygems/rubygems] Use a constant empty tar header to avoid extra allocations
[ruby.git] / string.c
blob9f7c163a812f50ea74c8885607d1f0f70f85479c
1 /**********************************************************************
3 string.c -
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "id.h"
27 #include "internal.h"
28 #include "internal/array.h"
29 #include "internal/compar.h"
30 #include "internal/compilers.h"
31 #include "internal/encoding.h"
32 #include "internal/error.h"
33 #include "internal/gc.h"
34 #include "internal/numeric.h"
35 #include "internal/object.h"
36 #include "internal/proc.h"
37 #include "internal/re.h"
38 #include "internal/sanitizers.h"
39 #include "internal/string.h"
40 #include "internal/transcode.h"
41 #include "probes.h"
42 #include "ruby/encoding.h"
43 #include "ruby/re.h"
44 #include "ruby/util.h"
45 #include "ruby_assert.h"
46 #include "vm_sync.h"
48 #if defined HAVE_CRYPT_R
49 # if defined HAVE_CRYPT_H
50 # include <crypt.h>
51 # endif
52 #elif !defined HAVE_CRYPT
53 # include "missing/crypt.h"
54 # define HAVE_CRYPT_R 1
55 #endif
57 #define BEG(no) (regs->beg[(no)])
58 #define END(no) (regs->end[(no)])
60 #undef rb_str_new
61 #undef rb_usascii_str_new
62 #undef rb_utf8_str_new
63 #undef rb_enc_str_new
64 #undef rb_str_new_cstr
65 #undef rb_usascii_str_new_cstr
66 #undef rb_utf8_str_new_cstr
67 #undef rb_enc_str_new_cstr
68 #undef rb_external_str_new_cstr
69 #undef rb_locale_str_new_cstr
70 #undef rb_str_dup_frozen
71 #undef rb_str_buf_new_cstr
72 #undef rb_str_buf_cat
73 #undef rb_str_buf_cat2
74 #undef rb_str_cat2
75 #undef rb_str_cat_cstr
76 #undef rb_fstring_cstr
78 VALUE rb_cString;
79 VALUE rb_cSymbol;
81 /* Flags of RString
83 * 1: RSTRING_NOEMBED
84 * The string is not embedded. When a string is embedded, the contents
85 * follow the header. When a string is not embedded, the contents are
86 * in a separately allocated buffer.
87 * 2: STR_SHARED (equal to ELTS_SHARED)
88 * The string is shared. The buffer this string points to is owned by
89 * another string (the shared root).
90 * 3: STR_CHILLED (will be frozen in a future version)
91 * The string appears frozen but can be mutated with a warning.
92 * 5: STR_SHARED_ROOT
93 * Other strings may point to the contents of this string. When this
94 * flag is set, STR_SHARED must not be set.
95 * 6: STR_BORROWED
96 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
97 * to be unshared by rb_str_tmp_frozen_release.
98 * 7: STR_TMPLOCK
99 * The pointer to the buffer is passed to a system call such as
100 * read(2). Any modification and realloc is prohibited.
101 * 8-9: ENC_CODERANGE
102 * Stores the coderange of the string.
103 * 10-16: ENCODING
104 * Stores the encoding of the string.
105 * 17: RSTRING_FSTR
106 * The string is a fstring. The string is deduplicated in the fstring
107 * table.
108 * 18: STR_NOFREE
109 * Do not free this string's buffer when the string is reclaimed
110 * by the garbage collector. Used for when the string buffer is a C
111 * string literal.
112 * 19: STR_FAKESTR
113 * The string is not allocated or managed by the garbage collector.
114 * Typically, the string object header (struct RString) is temporarily
115 * allocated on C stack.
/* File-local constants. The FL_USERn aliases below match the bit numbers
 * listed in the "Flags of RString" table above. */
118 #define RUBY_MAX_CHAR_LEN 16
119 #define STR_SHARED_ROOT FL_USER5
120 #define STR_BORROWED FL_USER6
121 #define STR_TMPLOCK FL_USER7
122 #define STR_NOFREE FL_USER18
123 #define STR_FAKESTR FL_USER19
/* Flag str as not embedded and clear its STR_SHARED, STR_SHARED_ROOT and
 * STR_BORROWED flags. */
125 #define STR_SET_NOEMBED(str) do {\
126 FL_SET((str), STR_NOEMBED);\
127 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
128 } while (0)
/* Clear the STR_NOEMBED, STR_SHARED and STR_NOFREE flags of str. */
129 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
/* Store n as the length of str. */
131 #define STR_SET_LEN(str, n) do { \
132 RSTRING(str)->len = (n); \
133 } while (0)
135 static inline bool
136 str_enc_fastpath(VALUE str)
138 // The overwhelming majority of strings are in one of these 3 encodings.
139 switch (ENCODING_GET_INLINED(str)) {
140 case ENCINDEX_ASCII_8BIT:
141 case ENCINDEX_UTF_8:
142 case ENCINDEX_US_ASCII:
143 return true;
144 default:
145 return false;
/* Terminator width in bytes for str: 1 when str_enc_fastpath() holds,
 * otherwise the minimum character length of the string's encoding. */
149 #define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Zero termlen bytes starting at ptr. The first byte is always stored
 * directly; memset is used only when termlen is greater than 1. */
150 #define TERM_FILL(ptr, termlen) do {\
151 char *const term_fill_ptr = (ptr);\
152 const int term_fill_len = (termlen);\
153 *term_fill_ptr = '\0';\
154 if (UNLIKELY(term_fill_len > 1))\
155 memset(term_fill_ptr, 0, term_fill_len);\
156 } while (0)
/* Resize the buffer of str to hold `capacity` bytes plus the terminator
 * implied by the string's encoding (see TERM_LEN). */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of str to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise a heap buffer is allocated, the current bytes are
 *   copied over and the string is switched to non-embedded mode.
 * - Heap string: the buffer is reallocated in place; the string must not
 *   be shared.
 *
 * Every use of a macro parameter is parenthesized so that compound
 * argument expressions (e.g. a conditional expression) keep their meaning. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
/* Make str refer to shared_str as its shared root: record shared_str in
 * str's aux.shared slot, set STR_SHARED on str and STR_SHARED_ROOT on
 * shared_str. Nothing is done when str carries STR_FAKESTR. A shared_str
 * whose class is 0 is additionally flagged STR_BORROWED. */
182 #define STR_SET_SHARED(str, shared_str) do { \
183 if (!FL_TEST(str, STR_FAKESTR)) { \
184 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
185 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
186 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
187 FL_SET((str), STR_SHARED); \
188 FL_SET((shared_str), STR_SHARED_ROOT); \
189 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
190 FL_SET_RAW((shared_str), STR_BORROWED); \
192 } while (0)
/* Heap buffer pointer of a non-embedded string. */
194 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator. */
195 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
196 /* TODO: include the terminator size in capa. */
/* Encoding object of str (see get_encoding). */
198 #define STR_ENC_GET(str) get_encoding(str)
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0. With 0, SHARABLE_SUBSTRING_P
 * holds only when beg + len == end; otherwise it is always 1. */
200 #if !defined SHARABLE_MIDDLE_SUBSTRING
201 # define SHARABLE_MIDDLE_SUBSTRING 0
202 #endif
203 #if !SHARABLE_MIDDLE_SUBSTRING
204 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
205 #else
206 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
207 #endif
210 static inline long
211 str_embed_capa(VALUE str)
213 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
216 bool
217 rb_str_reembeddable_p(VALUE str)
219 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
222 static inline size_t
223 rb_str_embed_size(long capa)
225 return offsetof(struct RString, as.embed.ary) + capa;
228 size_t
229 rb_str_size_as_embedded(VALUE str)
231 size_t real_size;
232 if (STR_EMBED_P(str)) {
233 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
235 /* if the string is not currently embedded, but it can be embedded, how
236 * much space would it require */
237 else if (rb_str_reembeddable_p(str)) {
238 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
240 else {
241 real_size = sizeof(struct RString);
243 return real_size;
/* True when an embedded string of `len` bytes plus a `termlen`-byte
 * terminator has a size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
252 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
253 static VALUE str_new_frozen(VALUE klass, VALUE orig);
254 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
255 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
256 static VALUE str_new(VALUE klass, const char *ptr, long len);
257 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
258 static inline void str_modifiable(VALUE str);
259 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
261 static inline void
262 str_make_independent(VALUE str)
264 long len = RSTRING_LEN(str);
265 int termlen = TERM_LEN(str);
266 str_make_independent_expand((str), len, 0L, termlen);
269 static inline int str_dependent_p(VALUE str);
271 void
272 rb_str_make_independent(VALUE str)
274 if (str_dependent_p(str)) {
275 str_make_independent(str);
279 void
280 rb_str_make_embedded(VALUE str)
282 RUBY_ASSERT(rb_str_reembeddable_p(str));
283 RUBY_ASSERT(!STR_EMBED_P(str));
285 char *buf = RSTRING(str)->as.heap.ptr;
286 long len = RSTRING(str)->len;
288 STR_SET_EMBED(str);
289 STR_SET_LEN(str, len);
291 if (len > 0) {
292 memcpy(RSTRING_PTR(str), buf, len);
293 ruby_xfree(buf);
296 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
/* Print a diagnostic to stderr saying that the function named `func` is
 * about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    static const char warning_fmt[] =
        "%s is returning NULL!! "
        "SIGSEGV is highly expected to follow immediately.\n"
        "If you could reproduce, attach your debugger here, "
        "and look at the passed string.\n";

    fprintf(stderr, warning_fmt, func);
}
309 /* symbols for [up|down|swap]case/capitalize options */
310 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
312 static rb_encoding *
313 get_encoding(VALUE str)
315 return rb_enc_from_index(ENCODING_GET(str));
318 static void
319 mustnot_broken(VALUE str)
321 if (is_broken_string(str)) {
322 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
326 static void
327 mustnot_wchar(VALUE str)
329 rb_encoding *enc = STR_ENC_GET(str);
330 if (rb_enc_mbminlen(enc) > 1) {
331 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
335 static int fstring_cmp(VALUE a, VALUE b);
337 static VALUE register_fstring(VALUE str, bool copy);
339 const struct st_hash_type rb_fstring_hash_type = {
340 fstring_cmp,
341 rb_str_hash,
/* True when str has no FL_EXIVAR flag and its class is exactly rb_cString. */
344 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
/* Argument block handed to fstr_update_callback through st_update. */
346 struct fstr_update_arg {
/* Result slot: the callback stores the registered string, or Qundef when
 * the existing table entry was garbage and has been deleted. */
347 VALUE fstr;
/* For a fake string key: copy its bytes into a new string when true,
 * otherwise create a static string referring to the same bytes. */
348 bool copy;
351 static int
352 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
355 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
356 VALUE str = (VALUE)*key;
358 if (existing) {
359 /* because of lazy sweep, str may be unmarked already and swept
360 * at next time */
362 if (rb_objspace_garbage_object_p(str)) {
363 arg->fstr = Qundef;
364 return ST_DELETE;
367 arg->fstr = str;
368 return ST_STOP;
370 else {
371 if (FL_TEST_RAW(str, STR_FAKESTR)) {
372 if (arg->copy) {
373 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
374 rb_enc_copy(new_str, str);
375 str = new_str;
377 else {
378 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
379 RSTRING(str)->len,
380 ENCODING_GET(str));
382 OBJ_FREEZE(str);
384 else {
385 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
386 str = str_new_frozen(rb_cString, str);
388 if (STR_SHARED_P(str)) { /* str should not be shared */
389 /* shared substring */
390 str_make_independent(str);
391 RUBY_ASSERT(OBJ_FROZEN(str));
393 if (!BARE_STRING_P(str)) {
394 str = str_new_frozen(rb_cString, str);
397 RBASIC(str)->flags |= RSTRING_FSTR;
399 *key = *value = arg->fstr = str;
400 return ST_CONTINUE;
404 VALUE
405 rb_fstring(VALUE str)
407 VALUE fstr;
408 int bare;
410 Check_Type(str, T_STRING);
412 if (FL_TEST(str, RSTRING_FSTR))
413 return str;
415 bare = BARE_STRING_P(str);
416 if (!bare) {
417 if (STR_EMBED_P(str)) {
418 OBJ_FREEZE(str);
419 return str;
422 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 RUBY_ASSERT(OBJ_FROZEN(str));
424 return str;
428 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
429 rb_str_resize(str, RSTRING_LEN(str));
431 fstr = register_fstring(str, FALSE);
433 if (!bare) {
434 str_replace_shared_without_enc(str, fstr);
435 OBJ_FREEZE(str);
436 return str;
438 return fstr;
441 static VALUE
442 register_fstring(VALUE str, bool copy)
444 struct fstr_update_arg args;
445 args.copy = copy;
447 RB_VM_LOCK_ENTER();
449 st_table *frozen_strings = rb_vm_fstring_table();
450 do {
451 args.fstr = str;
452 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
453 } while (UNDEF_P(args.fstr));
455 RB_VM_LOCK_LEAVE();
457 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
458 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
459 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
460 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
462 return args.fstr;
465 static VALUE
466 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
468 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
469 /* SHARED to be allocated by the callback */
471 if (!name) {
472 RUBY_ASSERT_ALWAYS(len == 0);
473 name = "";
476 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
478 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
479 fake_str->len = len;
480 fake_str->as.heap.ptr = (char *)name;
481 fake_str->as.heap.aux.capa = len;
482 return (VALUE)fake_str;
486 * set up a fake string which refers a static string literal.
488 VALUE
489 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
491 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
495 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
496 * shared string which refers a static string literal. `ptr` must
497 * point a constant string.
499 VALUE
500 rb_fstring_new(const char *ptr, long len)
502 struct RString fake_str;
503 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
506 VALUE
507 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
509 struct RString fake_str;
510 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
513 VALUE
514 rb_fstring_cstr(const char *ptr)
516 return rb_fstring_new(ptr, strlen(ptr));
519 static int
520 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
522 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
523 return ST_CONTINUE;
526 static int
527 fstring_cmp(VALUE a, VALUE b)
529 long alen, blen;
530 const char *aptr, *bptr;
531 RSTRING_GETMEM(a, aptr, alen);
532 RSTRING_GETMEM(b, bptr, blen);
533 return (alen != blen ||
534 ENCODING_GET(a) != ENCODING_GET(b) ||
535 memcmp(aptr, bptr, alen) != 0);
538 static inline int
539 single_byte_optimizable(VALUE str)
541 rb_encoding *enc;
543 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
544 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
545 return 1;
547 enc = STR_ENC_GET(str);
548 if (rb_enc_mbmaxlen(enc) == 1)
549 return 1;
551 /* Conservative. Possibly single byte.
552 * "\xa1" in Shift_JIS for example. */
553 return 0;
556 VALUE rb_fs;
/* Scan the bytes in [p, e) for one with its high bit (0x80) set.
 * Returns a pointer to the first such byte, or NULL when there is none.
 *
 * The scan has up to three phases: a byte-wise prefix up to a word boundary
 * (only when unaligned word access is not available), a word-at-a-time loop
 * testing NONASCII_MASK, and a byte-wise tail of fewer than SIZEOF_VOIDP
 * bytes. */
558 static inline const char *
559 search_nonascii(const char *p, const char *e)
561 const uintptr_t *s, *t;
/* NONASCII_MASK: a uintptr_t-wide constant with 0x80 in every byte. */
563 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
564 # if SIZEOF_UINTPTR_T == 8
565 # define NONASCII_MASK UINT64_C(0x8080808080808080)
566 # elif SIZEOF_UINTPTR_T == 4
567 # define NONASCII_MASK UINT32_C(0x80808080)
568 # else
569 # error "don't know what to do."
570 # endif
571 #else
572 # if SIZEOF_UINTPTR_T == 8
573 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
574 # elif SIZEOF_UINTPTR_T == 4
575 # define NONASCII_MASK 0x80808080UL /* or...? */
576 # else
577 # error "don't know what to do."
578 # endif
579 #endif
/* Word-wise scanning is attempted only when unaligned access is allowed or
 * at least one pointer-sized run of bytes is available. */
581 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
582 #if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, then check the l skipped bytes one
 * by one; the cases fall through from the earliest byte to the latest. */
583 if ((uintptr_t)p % SIZEOF_VOIDP) {
584 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
585 p += l;
586 switch (l) {
587 default: UNREACHABLE;
588 #if SIZEOF_VOIDP > 4
589 case 7: if (p[-7]&0x80) return p-7;
590 case 6: if (p[-6]&0x80) return p-6;
591 case 5: if (p[-5]&0x80) return p-5;
592 case 4: if (p[-4]&0x80) return p-4;
593 #endif
594 case 3: if (p[-3]&0x80) return p-3;
595 case 2: if (p[-2]&0x80) return p-2;
596 case 1: if (p[-1]&0x80) return p-1;
597 case 0: break;
600 #endif
601 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
602 #define aligned_ptr(value) \
603 __builtin_assume_aligned((value), sizeof(uintptr_t))
604 #else
605 #define aligned_ptr(value) (uintptr_t *)(value)
606 #endif
607 s = aligned_ptr(p);
/* t bounds the loop so that every word read ends at or before e. */
608 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
609 #undef aligned_ptr
610 for (;s < t; s++) {
611 if (*s & NONASCII_MASK) {
/* NOTE(review): nlz_intptr / ntz_intptr are presumably leading / trailing
 * zero-bit counts; dividing by 8 (>>3) gives the byte offset in the word. */
612 #ifdef WORDS_BIGENDIAN
613 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
614 #else
615 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
616 #endif
619 p = (const char *)s;
/* Byte-wise tail: check the remaining e - p bytes, earliest first. */
622 switch (e - p) {
623 default: UNREACHABLE;
624 #if SIZEOF_VOIDP > 4
625 case 7: if (e[-7]&0x80) return e-7;
626 case 6: if (e[-6]&0x80) return e-6;
627 case 5: if (e[-5]&0x80) return e-5;
628 case 4: if (e[-4]&0x80) return e-4;
629 #endif
630 case 3: if (e[-3]&0x80) return e-3;
631 case 2: if (e[-2]&0x80) return e-2;
632 case 1: if (e[-1]&0x80) return e-1;
633 case 0: return NULL;
637 static int
638 coderange_scan(const char *p, long len, rb_encoding *enc)
640 const char *e = p + len;
642 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
643 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
644 p = search_nonascii(p, e);
645 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
648 if (rb_enc_asciicompat(enc)) {
649 p = search_nonascii(p, e);
650 if (!p) return ENC_CODERANGE_7BIT;
651 for (;;) {
652 int ret = rb_enc_precise_mbclen(p, e, enc);
653 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
654 p += MBCLEN_CHARFOUND_LEN(ret);
655 if (p == e) break;
656 p = search_nonascii(p, e);
657 if (!p) break;
660 else {
661 while (p < e) {
662 int ret = rb_enc_precise_mbclen(p, e, enc);
663 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
664 p += MBCLEN_CHARFOUND_LEN(ret);
667 return ENC_CODERANGE_VALID;
670 long
671 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
673 const char *p = s;
675 if (*cr == ENC_CODERANGE_BROKEN)
676 return e - s;
678 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
679 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
680 if (*cr == ENC_CODERANGE_VALID) return e - s;
681 p = search_nonascii(p, e);
682 *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
683 return e - s;
685 else if (rb_enc_asciicompat(enc)) {
686 p = search_nonascii(p, e);
687 if (!p) {
688 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
689 return e - s;
691 for (;;) {
692 int ret = rb_enc_precise_mbclen(p, e, enc);
693 if (!MBCLEN_CHARFOUND_P(ret)) {
694 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
695 return p - s;
697 p += MBCLEN_CHARFOUND_LEN(ret);
698 if (p == e) break;
699 p = search_nonascii(p, e);
700 if (!p) break;
703 else {
704 while (p < e) {
705 int ret = rb_enc_precise_mbclen(p, e, enc);
706 if (!MBCLEN_CHARFOUND_P(ret)) {
707 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
708 return p - s;
710 p += MBCLEN_CHARFOUND_LEN(ret);
713 *cr = ENC_CODERANGE_VALID;
714 return e - s;
717 static inline void
718 str_enc_copy(VALUE str1, VALUE str2)
720 rb_enc_set_index(str1, ENCODING_GET(str2));
723 /* Like str_enc_copy, but does not check frozen status of str1.
724 * You should use this only if you're certain that str1 is not frozen. */
725 static inline void
726 str_enc_copy_direct(VALUE str1, VALUE str2)
728 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
729 if (inlined_encoding == ENCODING_INLINE_MAX) {
730 rb_enc_set_index(str1, rb_enc_get_index(str2));
732 else {
733 ENCODING_SET_INLINED(str1, inlined_encoding);
737 static void
738 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
740 /* this function is designed for copying encoding and coderange
741 * from src to new string "dest" which is made from the part of src.
743 str_enc_copy(dest, src);
744 if (RSTRING_LEN(dest) == 0) {
745 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
746 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
747 else
748 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
749 return;
751 switch (ENC_CODERANGE(src)) {
752 case ENC_CODERANGE_7BIT:
753 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
754 break;
755 case ENC_CODERANGE_VALID:
756 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
757 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
758 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
759 else
760 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
761 break;
762 default:
763 break;
767 static void
768 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
770 str_enc_copy(dest, src);
771 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
774 static int
775 enc_coderange_scan(VALUE str, rb_encoding *enc)
777 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
781 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
783 return enc_coderange_scan(str, enc);
787 rb_enc_str_coderange(VALUE str)
789 int cr = ENC_CODERANGE(str);
791 if (cr == ENC_CODERANGE_UNKNOWN) {
792 cr = enc_coderange_scan(str, get_encoding(str));
793 ENC_CODERANGE_SET(str, cr);
795 return cr;
799 rb_enc_str_asciionly_p(VALUE str)
801 rb_encoding *enc = STR_ENC_GET(str);
803 if (!rb_enc_asciicompat(enc))
804 return FALSE;
805 else if (is_ascii_string(str))
806 return TRUE;
807 return FALSE;
810 static inline void
811 str_mod_check(VALUE s, const char *p, long len)
813 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
814 rb_raise(rb_eRuntimeError, "string modified");
818 static size_t
819 str_capacity(VALUE str, const int termlen)
821 if (STR_EMBED_P(str)) {
822 return str_embed_capa(str) - termlen;
824 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
825 return RSTRING(str)->len;
827 else {
828 return RSTRING(str)->as.heap.aux.capa;
832 size_t
833 rb_str_capacity(VALUE str)
835 return str_capacity(str, TERM_LEN(str));
838 static inline void
839 must_not_null(const char *ptr)
841 if (!ptr) {
842 rb_raise(rb_eArgError, "NULL pointer given");
846 static inline VALUE
847 str_alloc_embed(VALUE klass, size_t capa)
849 size_t size = rb_str_embed_size(capa);
850 RUBY_ASSERT(size > 0);
851 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
853 NEWOBJ_OF(str, struct RString, klass,
854 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size, 0);
856 return (VALUE)str;
859 static inline VALUE
860 str_alloc_heap(VALUE klass)
862 NEWOBJ_OF(str, struct RString, klass,
863 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
865 return (VALUE)str;
868 static inline VALUE
869 empty_str_alloc(VALUE klass)
871 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
872 VALUE str = str_alloc_embed(klass, 0);
873 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
874 return str;
877 static VALUE
878 str_new0(VALUE klass, const char *ptr, long len, int termlen)
880 VALUE str;
882 if (len < 0) {
883 rb_raise(rb_eArgError, "negative string size (or size too big)");
886 RUBY_DTRACE_CREATE_HOOK(STRING, len);
888 if (STR_EMBEDDABLE_P(len, termlen)) {
889 str = str_alloc_embed(klass, len + termlen);
890 if (len == 0) {
891 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
894 else {
895 str = str_alloc_heap(klass);
896 RSTRING(str)->as.heap.aux.capa = len;
897 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
898 * integer overflow. If we can STATIC_ASSERT that, the following
899 * mul_add_mul can be reverted to a simple ALLOC_N. */
900 RSTRING(str)->as.heap.ptr =
901 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
903 if (ptr) {
904 memcpy(RSTRING_PTR(str), ptr, len);
906 STR_SET_LEN(str, len);
907 TERM_FILL(RSTRING_PTR(str) + len, termlen);
908 return str;
911 static VALUE
912 str_new(VALUE klass, const char *ptr, long len)
914 return str_new0(klass, ptr, len, 1);
917 VALUE
918 rb_str_new(const char *ptr, long len)
920 return str_new(rb_cString, ptr, len);
923 VALUE
924 rb_usascii_str_new(const char *ptr, long len)
926 VALUE str = rb_str_new(ptr, len);
927 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
928 return str;
931 VALUE
932 rb_utf8_str_new(const char *ptr, long len)
934 VALUE str = str_new(rb_cString, ptr, len);
935 rb_enc_associate_index(str, rb_utf8_encindex());
936 return str;
939 VALUE
940 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
942 VALUE str;
944 if (!enc) return rb_str_new(ptr, len);
946 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
947 rb_enc_associate(str, enc);
948 return str;
951 VALUE
952 rb_str_new_cstr(const char *ptr)
954 must_not_null(ptr);
955 /* rb_str_new_cstr() can take pointer from non-malloc-generated
956 * memory regions, and that cannot be detected by the MSAN. Just
957 * trust the programmer that the argument passed here is a sane C
958 * string. */
959 __msan_unpoison_string(ptr);
960 return rb_str_new(ptr, strlen(ptr));
963 VALUE
964 rb_usascii_str_new_cstr(const char *ptr)
966 VALUE str = rb_str_new_cstr(ptr);
967 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
968 return str;
971 VALUE
972 rb_utf8_str_new_cstr(const char *ptr)
974 VALUE str = rb_str_new_cstr(ptr);
975 rb_enc_associate_index(str, rb_utf8_encindex());
976 return str;
979 VALUE
980 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
982 must_not_null(ptr);
983 if (rb_enc_mbminlen(enc) != 1) {
984 rb_raise(rb_eArgError, "wchar encoding given");
986 return rb_enc_str_new(ptr, strlen(ptr), enc);
989 static VALUE
990 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
992 VALUE str;
994 if (len < 0) {
995 rb_raise(rb_eArgError, "negative string size (or size too big)");
998 if (!ptr) {
999 rb_encoding *enc = rb_enc_get_from_index(encindex);
1000 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1002 else {
1003 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1004 str = str_alloc_heap(klass);
1005 RSTRING(str)->len = len;
1006 RSTRING(str)->as.heap.ptr = (char *)ptr;
1007 RSTRING(str)->as.heap.aux.capa = len;
1008 RBASIC(str)->flags |= STR_NOFREE;
1010 rb_enc_associate_index(str, encindex);
1011 return str;
1014 VALUE
1015 rb_str_new_static(const char *ptr, long len)
1017 return str_new_static(rb_cString, ptr, len, 0);
1020 VALUE
1021 rb_usascii_str_new_static(const char *ptr, long len)
1023 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1026 VALUE
1027 rb_utf8_str_new_static(const char *ptr, long len)
1029 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1032 VALUE
1033 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1035 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1038 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1039 rb_encoding *from, rb_encoding *to,
1040 int ecflags, VALUE ecopts);
1042 static inline bool
1043 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1045 int encidx = rb_enc_to_index(enc);
1046 if (rb_enc_get_index(str) == encidx)
1047 return is_ascii_string(str);
1048 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1051 VALUE
1052 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1054 long len;
1055 const char *ptr;
1056 VALUE newstr;
1058 if (!to) return str;
1059 if (!from) from = rb_enc_get(str);
1060 if (from == to) return str;
1061 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1062 rb_is_ascii8bit_enc(to)) {
1063 if (STR_ENC_GET(str) != to) {
1064 str = rb_str_dup(str);
1065 rb_enc_associate(str, to);
1067 return str;
1070 RSTRING_GETMEM(str, ptr, len);
1071 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1072 from, to, ecflags, ecopts);
1073 if (NIL_P(newstr)) {
1074 /* some error, return original */
1075 return str;
1077 return newstr;
1080 VALUE
1081 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1082 rb_encoding *from, int ecflags, VALUE ecopts)
1084 long olen;
1086 olen = RSTRING_LEN(newstr);
1087 if (ofs < -olen || olen < ofs)
1088 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1089 if (ofs < 0) ofs += olen;
1090 if (!from) {
1091 STR_SET_LEN(newstr, ofs);
1092 return rb_str_cat(newstr, ptr, len);
1095 rb_str_modify(newstr);
1096 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1097 rb_enc_get(newstr),
1098 ecflags, ecopts);
1101 VALUE
1102 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1104 STR_SET_LEN(str, 0);
1105 rb_enc_associate(str, enc);
1106 rb_str_cat(str, ptr, len);
1107 return str;
1110 static VALUE
1111 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1112 rb_encoding *from, rb_encoding *to,
1113 int ecflags, VALUE ecopts)
1115 rb_econv_t *ec;
1116 rb_econv_result_t ret;
1117 long olen;
1118 VALUE econv_wrapper;
1119 const unsigned char *start, *sp;
1120 unsigned char *dest, *dp;
1121 size_t converted_output = (size_t)ofs;
1123 olen = rb_str_capacity(newstr);
1125 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1126 RBASIC_CLEAR_CLASS(econv_wrapper);
1127 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1128 if (!ec) return Qnil;
1129 DATA_PTR(econv_wrapper) = ec;
1131 sp = (unsigned char*)ptr;
1132 start = sp;
1133 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1134 (dp = dest + converted_output),
1135 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1136 ret == econv_destination_buffer_full) {
1137 /* destination buffer short */
1138 size_t converted_input = sp - start;
1139 size_t rest = len - converted_input;
1140 converted_output = dp - dest;
1141 rb_str_set_len(newstr, converted_output);
1142 if (converted_input && converted_output &&
1143 rest < (LONG_MAX / converted_output)) {
1144 rest = (rest * converted_output) / converted_input;
1146 else {
1147 rest = olen;
1149 olen += rest < 2 ? 2 : rest;
1150 rb_str_resize(newstr, olen);
1152 DATA_PTR(econv_wrapper) = 0;
1153 RB_GC_GUARD(econv_wrapper);
1154 rb_econv_close(ec);
1155 switch (ret) {
1156 case econv_finished:
1157 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1158 rb_str_set_len(newstr, len);
1159 rb_enc_associate(newstr, to);
1160 return newstr;
1162 default:
1163 return Qnil;
1167 VALUE
1168 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1170 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1173 VALUE
1174 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1176 rb_encoding *ienc;
1177 VALUE str;
1178 const int eidx = rb_enc_to_index(eenc);
1180 if (!ptr) {
1181 return rb_enc_str_new(ptr, len, eenc);
1184 /* ASCII-8BIT case, no conversion */
1185 if ((eidx == rb_ascii8bit_encindex()) ||
1186 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1187 return rb_str_new(ptr, len);
1189 /* no default_internal or same encoding, no conversion */
1190 ienc = rb_default_internal_encoding();
1191 if (!ienc || eenc == ienc) {
1192 return rb_enc_str_new(ptr, len, eenc);
1194 /* ASCII compatible, and ASCII only string, no conversion in
1195 * default_internal */
1196 if ((eidx == rb_ascii8bit_encindex()) ||
1197 (eidx == rb_usascii_encindex()) ||
1198 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1199 return rb_enc_str_new(ptr, len, ienc);
1201 /* convert from the given encoding to default_internal */
1202 str = rb_enc_str_new(NULL, 0, ienc);
1203 /* when the conversion failed for some reason, just ignore the
1204 * default_internal and result in the given encoding as-is. */
1205 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1206 rb_str_initialize(str, ptr, len, eenc);
1208 return str;
1211 VALUE
1212 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1214 int eidx = rb_enc_to_index(eenc);
1215 if (eidx == rb_usascii_encindex() &&
1216 !is_ascii_string(str)) {
1217 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1218 return str;
1220 rb_enc_associate_index(str, eidx);
1221 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1224 VALUE
1225 rb_external_str_new(const char *ptr, long len)
1227 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1230 VALUE
1231 rb_external_str_new_cstr(const char *ptr)
1233 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1236 VALUE
1237 rb_locale_str_new(const char *ptr, long len)
1239 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1242 VALUE
1243 rb_locale_str_new_cstr(const char *ptr)
1245 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1248 VALUE
1249 rb_filesystem_str_new(const char *ptr, long len)
1251 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1254 VALUE
1255 rb_filesystem_str_new_cstr(const char *ptr)
1257 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1260 VALUE
1261 rb_str_export(VALUE str)
1263 return rb_str_export_to_enc(str, rb_default_external_encoding());
1266 VALUE
1267 rb_str_export_locale(VALUE str)
1269 return rb_str_export_to_enc(str, rb_locale_encoding());
1272 VALUE
1273 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1275 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1278 static VALUE
1279 str_replace_shared_without_enc(VALUE str2, VALUE str)
1281 const int termlen = TERM_LEN(str);
1282 char *ptr;
1283 long len;
1285 RSTRING_GETMEM(str, ptr, len);
1286 if (str_embed_capa(str2) >= len + termlen) {
1287 char *ptr2 = RSTRING(str2)->as.embed.ary;
1288 STR_SET_EMBED(str2);
1289 memcpy(ptr2, RSTRING_PTR(str), len);
1290 TERM_FILL(ptr2+len, termlen);
1292 else {
1293 VALUE root;
1294 if (STR_SHARED_P(str)) {
1295 root = RSTRING(str)->as.heap.aux.shared;
1296 RSTRING_GETMEM(str, ptr, len);
1298 else {
1299 root = rb_str_new_frozen(str);
1300 RSTRING_GETMEM(root, ptr, len);
1302 RUBY_ASSERT(OBJ_FROZEN(root));
1304 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1305 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1306 rb_fatal("about to free a possible shared root");
1308 char *ptr2 = STR_HEAP_PTR(str2);
1309 if (ptr2 != ptr) {
1310 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1313 FL_SET(str2, STR_NOEMBED);
1314 RSTRING(str2)->as.heap.ptr = ptr;
1315 STR_SET_SHARED(str2, root);
1318 STR_SET_LEN(str2, len);
1320 return str2;
1323 static VALUE
1324 str_replace_shared(VALUE str2, VALUE str)
1326 str_replace_shared_without_enc(str2, str);
1327 rb_enc_cr_str_exact_copy(str2, str);
1328 return str2;
1331 static VALUE
1332 str_new_shared(VALUE klass, VALUE str)
1334 return str_replace_shared(str_alloc_heap(klass), str);
1337 VALUE
1338 rb_str_new_shared(VALUE str)
1340 return str_new_shared(rb_obj_class(str), str);
1343 VALUE
1344 rb_str_new_frozen(VALUE orig)
1346 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1347 return str_new_frozen(rb_obj_class(orig), orig);
1350 static VALUE
1351 rb_str_new_frozen_String(VALUE orig)
1353 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1354 return str_new_frozen(rb_cString, orig);
1357 VALUE
1358 rb_str_tmp_frozen_acquire(VALUE orig)
1360 if (OBJ_FROZEN_RAW(orig)) return orig;
1361 return str_new_frozen_buffer(0, orig, FALSE);
1364 VALUE
1365 rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1367 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1368 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1370 VALUE str = str_alloc_heap(0);
1371 OBJ_FREEZE(str);
1372 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1373 FL_SET(str, STR_SHARED_ROOT);
1375 size_t capa = str_capacity(orig, TERM_LEN(orig));
1377 /* If the string is embedded then we want to create a copy that is heap
1378 * allocated. If the string is shared then the shared root must be
1379 * embedded, so we want to create a copy. If the string is a shared root
1380 * then it must be embedded, so we want to create a copy. */
1381 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1382 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1383 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1385 else {
1386 /* orig must be heap allocated and not shared, so we can safely transfer
1387 * the pointer to str. */
1388 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1389 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1390 RBASIC(orig)->flags &= ~STR_NOFREE;
1391 STR_SET_SHARED(orig, str);
1394 RSTRING(str)->len = RSTRING(orig)->len;
1395 RSTRING(str)->as.heap.aux.capa = capa;
1397 return str;
1400 void
1401 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1403 if (RBASIC_CLASS(tmp) != 0)
1404 return;
1406 if (STR_EMBED_P(tmp)) {
1407 RUBY_ASSERT(OBJ_FROZEN_RAW(tmp));
1409 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1410 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1411 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1413 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1414 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1415 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1417 /* Unshare orig since the root (tmp) only has this one child. */
1418 FL_UNSET_RAW(orig, STR_SHARED);
1419 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1420 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1421 RUBY_ASSERT(OBJ_FROZEN_RAW(tmp));
1423 /* Make tmp embedded and empty so it is safe for sweeping. */
1424 STR_SET_EMBED(tmp);
1425 STR_SET_LEN(tmp, 0);
1430 static VALUE
1431 str_new_frozen(VALUE klass, VALUE orig)
1433 return str_new_frozen_buffer(klass, orig, TRUE);
1436 static VALUE
1437 heap_str_make_shared(VALUE klass, VALUE orig)
1439 RUBY_ASSERT(!STR_EMBED_P(orig));
1440 RUBY_ASSERT(!STR_SHARED_P(orig));
1442 VALUE str = str_alloc_heap(klass);
1443 STR_SET_LEN(str, RSTRING_LEN(orig));
1444 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1445 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1446 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1447 RBASIC(orig)->flags &= ~STR_NOFREE;
1448 STR_SET_SHARED(orig, str);
1449 if (klass == 0)
1450 FL_UNSET_RAW(str, STR_BORROWED);
1451 return str;
1454 static VALUE
1455 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1457 VALUE str;
1459 long len = RSTRING_LEN(orig);
1460 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1462 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1463 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1464 RUBY_ASSERT(STR_EMBED_P(str));
1466 else {
1467 if (FL_TEST_RAW(orig, STR_SHARED)) {
1468 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1469 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1470 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1471 RUBY_ASSERT(ofs >= 0);
1472 RUBY_ASSERT(rest >= 0);
1473 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1474 RUBY_ASSERT(OBJ_FROZEN(shared));
1476 if ((ofs > 0) || (rest > 0) ||
1477 (klass != RBASIC(shared)->klass) ||
1478 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1479 str = str_new_shared(klass, shared);
1480 RUBY_ASSERT(!STR_EMBED_P(str));
1481 RSTRING(str)->as.heap.ptr += ofs;
1482 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1484 else {
1485 if (RBASIC_CLASS(shared) == 0)
1486 FL_SET_RAW(shared, STR_BORROWED);
1487 return shared;
1490 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1491 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1492 STR_SET_EMBED(str);
1493 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1494 STR_SET_LEN(str, RSTRING_LEN(orig));
1495 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1497 else {
1498 str = heap_str_make_shared(klass, orig);
1502 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1503 OBJ_FREEZE(str);
1504 return str;
1507 VALUE
1508 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1510 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1513 static VALUE
1514 str_new_empty_String(VALUE str)
1516 VALUE v = rb_str_new(0, 0);
1517 rb_enc_copy(v, str);
1518 return v;
/* Lower bound applied to an explicitly requested capacity in rb_str_init(). */
#define STR_BUF_MIN_SIZE 63
1523 VALUE
1524 rb_str_buf_new(long capa)
1526 if (STR_EMBEDDABLE_P(capa, 1)) {
1527 return str_alloc_embed(rb_cString, capa + 1);
1530 VALUE str = str_alloc_heap(rb_cString);
1532 RSTRING(str)->as.heap.aux.capa = capa;
1533 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1534 RSTRING(str)->as.heap.ptr[0] = '\0';
1536 return str;
1539 VALUE
1540 rb_str_buf_new_cstr(const char *ptr)
1542 VALUE str;
1543 long len = strlen(ptr);
1545 str = rb_str_buf_new(len);
1546 rb_str_buf_cat(str, ptr, len);
1548 return str;
1551 VALUE
1552 rb_str_tmp_new(long len)
1554 return str_new(0, 0, len);
1557 void
1558 rb_str_free(VALUE str)
1560 if (FL_TEST(str, RSTRING_FSTR)) {
1561 st_data_t fstr = (st_data_t)str;
1563 RB_VM_LOCK_ENTER();
1565 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1566 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1568 RB_VM_LOCK_LEAVE();
1571 if (STR_EMBED_P(str)) {
1572 RB_DEBUG_COUNTER_INC(obj_str_embed);
1574 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1575 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1576 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1578 else {
1579 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1580 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1584 size_t
1585 rb_str_memsize(VALUE str)
1587 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1588 return STR_HEAP_SIZE(str);
1590 else {
1591 return 0;
1595 VALUE
1596 rb_str_to_str(VALUE str)
1598 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1601 static inline void str_discard(VALUE str);
1602 static void str_shared_replace(VALUE str, VALUE str2);
1604 void
1605 rb_str_shared_replace(VALUE str, VALUE str2)
1607 if (str != str2) str_shared_replace(str, str2);
1610 static void
1611 str_shared_replace(VALUE str, VALUE str2)
1613 rb_encoding *enc;
1614 int cr;
1615 int termlen;
1617 RUBY_ASSERT(str2 != str);
1618 enc = STR_ENC_GET(str2);
1619 cr = ENC_CODERANGE(str2);
1620 str_discard(str);
1621 termlen = rb_enc_mbminlen(enc);
1623 STR_SET_LEN(str, RSTRING_LEN(str2));
1625 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1626 STR_SET_EMBED(str);
1627 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1628 rb_enc_associate(str, enc);
1629 ENC_CODERANGE_SET(str, cr);
1631 else {
1632 if (STR_EMBED_P(str2)) {
1633 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1634 long len = RSTRING_LEN(str2);
1635 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1637 char *new_ptr = ALLOC_N(char, len + termlen);
1638 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1639 RSTRING(str2)->as.heap.ptr = new_ptr;
1640 STR_SET_LEN(str2, len);
1641 RSTRING(str2)->as.heap.aux.capa = len;
1642 STR_SET_NOEMBED(str2);
1645 STR_SET_NOEMBED(str);
1646 FL_UNSET(str, STR_SHARED);
1647 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1649 if (FL_TEST(str2, STR_SHARED)) {
1650 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1651 STR_SET_SHARED(str, shared);
1653 else {
1654 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1657 /* abandon str2 */
1658 STR_SET_EMBED(str2);
1659 RSTRING_PTR(str2)[0] = 0;
1660 STR_SET_LEN(str2, 0);
1661 rb_enc_associate(str, enc);
1662 ENC_CODERANGE_SET(str, cr);
1666 VALUE
1667 rb_obj_as_string(VALUE obj)
1669 VALUE str;
1671 if (RB_TYPE_P(obj, T_STRING)) {
1672 return obj;
1674 str = rb_funcall(obj, idTo_s, 0);
1675 return rb_obj_as_string_result(str, obj);
1678 VALUE
1679 rb_obj_as_string_result(VALUE str, VALUE obj)
1681 if (!RB_TYPE_P(str, T_STRING))
1682 return rb_any_to_s(obj);
1683 return str;
1686 static VALUE
1687 str_replace(VALUE str, VALUE str2)
1689 long len;
1691 len = RSTRING_LEN(str2);
1692 if (STR_SHARED_P(str2)) {
1693 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1694 RUBY_ASSERT(OBJ_FROZEN(shared));
1695 STR_SET_NOEMBED(str);
1696 STR_SET_LEN(str, len);
1697 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1698 STR_SET_SHARED(str, shared);
1699 rb_enc_cr_str_exact_copy(str, str2);
1701 else {
1702 str_replace_shared(str, str2);
1705 return str;
1708 static inline VALUE
1709 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1711 size_t size = rb_str_embed_size(capa);
1712 RUBY_ASSERT(size > 0);
1713 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1715 NEWOBJ_OF(str, struct RString, klass,
1716 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size, ec);
1718 return (VALUE)str;
1721 static inline VALUE
1722 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1724 NEWOBJ_OF(str, struct RString, klass,
1725 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1727 return (VALUE)str;
1730 static inline VALUE
1731 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1733 const VALUE flag_mask =
1734 ENC_CODERANGE_MASK | ENCODING_MASK |
1735 FL_FREEZE
1737 VALUE flags = FL_TEST_RAW(str, flag_mask);
1738 int encidx = 0;
1739 if (STR_EMBED_P(str)) {
1740 long len = RSTRING_LEN(str);
1742 RUBY_ASSERT(STR_EMBED_P(dup));
1743 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1744 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1746 else {
1747 VALUE root = str;
1748 if (FL_TEST_RAW(str, STR_SHARED)) {
1749 root = RSTRING(str)->as.heap.aux.shared;
1751 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1752 root = str = str_new_frozen(klass, str);
1753 flags = FL_TEST_RAW(str, flag_mask);
1755 RUBY_ASSERT(!STR_SHARED_P(root));
1756 RUBY_ASSERT(RB_OBJ_FROZEN_RAW(root));
1758 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1759 FL_SET(root, STR_SHARED_ROOT);
1760 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1761 flags |= RSTRING_NOEMBED | STR_SHARED;
1764 STR_SET_LEN(dup, RSTRING_LEN(str));
1766 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1767 encidx = rb_enc_get_index(str);
1768 flags &= ~ENCODING_MASK;
1770 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1771 if (encidx) rb_enc_associate_index(dup, encidx);
1772 return dup;
1775 static inline VALUE
1776 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1778 VALUE dup;
1779 if (STR_EMBED_P(str)) {
1780 dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1782 else {
1783 dup = ec_str_alloc_heap(ec, klass);
1786 return str_duplicate_setup(klass, str, dup);
1789 static inline VALUE
1790 str_duplicate(VALUE klass, VALUE str)
1792 VALUE dup;
1793 if (STR_EMBED_P(str)) {
1794 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1796 else {
1797 dup = str_alloc_heap(klass);
1800 return str_duplicate_setup(klass, str, dup);
1803 VALUE
1804 rb_str_dup(VALUE str)
1806 return str_duplicate(rb_obj_class(str), str);
1809 /* :nodoc: */
1810 VALUE
1811 rb_str_dup_m(VALUE str)
1813 if (LIKELY(BARE_STRING_P(str))) {
1814 return str_duplicate(rb_obj_class(str), str);
1816 else {
1817 return rb_obj_dup(str);
1821 VALUE
1822 rb_str_resurrect(VALUE str)
1824 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1825 return str_duplicate(rb_cString, str);
1828 VALUE
1829 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1831 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1832 VALUE new_str = ec_str_duplicate(ec, rb_cString, str);
1833 if (chilled) {
1834 STR_CHILL_RAW(new_str);
1836 return new_str;
1839 bool
1840 rb_str_chilled_p(VALUE str)
1842 return CHILLED_STRING_P(str);
1847 * call-seq:
1848 * String.new(string = '', **opts) -> new_string
1850 * :include: doc/string/new.rdoc
1854 static VALUE
1855 rb_str_init(int argc, VALUE *argv, VALUE str)
1857 static ID keyword_ids[2];
1858 VALUE orig, opt, venc, vcapa;
1859 VALUE kwargs[2];
1860 rb_encoding *enc = 0;
1861 int n;
1863 if (!keyword_ids[0]) {
1864 keyword_ids[0] = rb_id_encoding();
1865 CONST_ID(keyword_ids[1], "capacity");
1868 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1869 if (!NIL_P(opt)) {
1870 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1871 venc = kwargs[0];
1872 vcapa = kwargs[1];
1873 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1874 enc = rb_to_encoding(venc);
1876 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1877 long capa = NUM2LONG(vcapa);
1878 long len = 0;
1879 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1881 if (capa < STR_BUF_MIN_SIZE) {
1882 capa = STR_BUF_MIN_SIZE;
1884 if (n == 1) {
1885 StringValue(orig);
1886 len = RSTRING_LEN(orig);
1887 if (capa < len) {
1888 capa = len;
1890 if (orig == str) n = 0;
1892 str_modifiable(str);
1893 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1894 /* make noembed always */
1895 const size_t size = (size_t)capa + termlen;
1896 const char *const old_ptr = RSTRING_PTR(str);
1897 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
1898 char *new_ptr = ALLOC_N(char, size);
1899 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
1900 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1901 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1902 RSTRING(str)->as.heap.ptr = new_ptr;
1904 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1905 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1906 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1908 STR_SET_LEN(str, len);
1909 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1910 if (n == 1) {
1911 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1912 rb_enc_cr_str_exact_copy(str, orig);
1914 FL_SET(str, STR_NOEMBED);
1915 RSTRING(str)->as.heap.aux.capa = capa;
1917 else if (n == 1) {
1918 rb_str_replace(str, orig);
1920 if (enc) {
1921 rb_enc_associate(str, enc);
1922 ENC_CODERANGE_CLEAR(str);
1925 else if (n == 1) {
1926 rb_str_replace(str, orig);
1928 return str;
1931 /* :nodoc: */
1932 static VALUE
1933 rb_str_s_new(int argc, VALUE *argv, VALUE klass)
1935 if (klass != rb_cString) {
1936 return rb_class_new_instance_pass_kw(argc, argv, klass);
1939 static ID keyword_ids[2];
1940 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
1941 VALUE kwargs[2];
1942 rb_encoding *enc = NULL;
1944 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1945 if (NIL_P(opt)) {
1946 return rb_class_new_instance_pass_kw(argc, argv, klass);
1949 keyword_ids[0] = rb_id_encoding();
1950 CONST_ID(keyword_ids[1], "capacity");
1951 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1952 encoding = kwargs[0];
1953 capacity = kwargs[1];
1955 int termlen = 1;
1957 if (n == 1) {
1958 orig = StringValue(orig);
1960 else {
1961 orig = Qnil;
1964 if (UNDEF_P(encoding)) {
1965 if (!NIL_P(orig)) {
1966 encoding = rb_obj_encoding(orig);
1970 if (!UNDEF_P(encoding)) {
1971 enc = rb_to_encoding(encoding);
1972 termlen = rb_enc_mbminlen(enc);
1975 // If capacity is nil, we're basically just duping `orig`.
1976 if (UNDEF_P(capacity)) {
1977 if (NIL_P(orig)) {
1978 VALUE empty_str = str_new(klass, "", 0);
1979 if (enc) {
1980 rb_enc_associate(empty_str, enc);
1982 return empty_str;
1984 VALUE copy = str_duplicate(klass, orig);
1985 rb_enc_associate(copy, enc);
1986 ENC_CODERANGE_CLEAR(copy);
1987 return copy;
1990 long capa = 0;
1991 capa = NUM2LONG(capacity);
1992 if (capa < 0) {
1993 capa = 0;
1996 if (!NIL_P(orig)) {
1997 long orig_capa = rb_str_capacity(orig);
1998 if (orig_capa > capa) {
1999 capa = orig_capa;
2003 long fake_len = capa - termlen;
2004 if (fake_len < 0) {
2005 fake_len = 0;
2008 VALUE str = str_new0(klass, NULL, fake_len, termlen);
2009 STR_SET_LEN(str, 0);
2010 TERM_FILL(RSTRING_PTR(str), termlen);
2012 if (enc) {
2013 rb_enc_associate(str, enc);
2016 if (!NIL_P(orig)) {
2017 rb_str_buf_append(str, orig);
2020 return str;
#ifdef NONASCII_MASK
/* True for any byte that is not a UTF-8 continuation byte (10xxxxxx). */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t d = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    d = (d>>6) | (~d>>7);
    d &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(d);
#else
    /* Fold the per-byte 0/1 flags down into the lowest byte. */
    d += (d>>8);
    d += (d>>16);
# if SIZEOF_VOIDP == 8
    d += (d>>32);
# endif
    return (d&0xF);
#endif
}
#endif
2063 static inline long
2064 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2066 long c;
2067 const char *q;
2069 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2070 long diff = (long)(e - p);
2071 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2073 #ifdef NONASCII_MASK
2074 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2075 uintptr_t len = 0;
2076 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2077 const uintptr_t *s, *t;
2078 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2079 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2080 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2081 while (p < (const char *)s) {
2082 if (is_utf8_lead_byte(*p)) len++;
2083 p++;
2085 while (s < t) {
2086 len += count_utf8_lead_bytes_with_word(s);
2087 s++;
2089 p = (const char *)s;
2091 while (p < e) {
2092 if (is_utf8_lead_byte(*p)) len++;
2093 p++;
2095 return (long)len;
2097 #endif
2098 else if (rb_enc_asciicompat(enc)) {
2099 c = 0;
2100 if (ENC_CODERANGE_CLEAN_P(cr)) {
2101 while (p < e) {
2102 if (ISASCII(*p)) {
2103 q = search_nonascii(p, e);
2104 if (!q)
2105 return c + (e - p);
2106 c += q - p;
2107 p = q;
2109 p += rb_enc_fast_mbclen(p, e, enc);
2110 c++;
2113 else {
2114 while (p < e) {
2115 if (ISASCII(*p)) {
2116 q = search_nonascii(p, e);
2117 if (!q)
2118 return c + (e - p);
2119 c += q - p;
2120 p = q;
2122 p += rb_enc_mbclen(p, e, enc);
2123 c++;
2126 return c;
2129 for (c=0; p<e; c++) {
2130 p += rb_enc_mbclen(p, e, enc);
2132 return c;
2135 long
2136 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2138 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2141 /* To get strlen with cr
2142 * Note that given cr is not used.
2144 long
2145 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2147 long c;
2148 const char *q;
2149 int ret;
2151 *cr = 0;
2152 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2153 long diff = (long)(e - p);
2154 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2156 else if (rb_enc_asciicompat(enc)) {
2157 c = 0;
2158 while (p < e) {
2159 if (ISASCII(*p)) {
2160 q = search_nonascii(p, e);
2161 if (!q) {
2162 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2163 return c + (e - p);
2165 c += q - p;
2166 p = q;
2168 ret = rb_enc_precise_mbclen(p, e, enc);
2169 if (MBCLEN_CHARFOUND_P(ret)) {
2170 *cr |= ENC_CODERANGE_VALID;
2171 p += MBCLEN_CHARFOUND_LEN(ret);
2173 else {
2174 *cr = ENC_CODERANGE_BROKEN;
2175 p++;
2177 c++;
2179 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2180 return c;
2183 for (c=0; p<e; c++) {
2184 ret = rb_enc_precise_mbclen(p, e, enc);
2185 if (MBCLEN_CHARFOUND_P(ret)) {
2186 *cr |= ENC_CODERANGE_VALID;
2187 p += MBCLEN_CHARFOUND_LEN(ret);
2189 else {
2190 *cr = ENC_CODERANGE_BROKEN;
2191 if (p + rb_enc_mbminlen(enc) <= e)
2192 p += rb_enc_mbminlen(enc);
2193 else
2194 p = e;
2197 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2198 return c;
2201 /* enc must be str's enc or rb_enc_check(str, str2) */
2202 static long
2203 str_strlen(VALUE str, rb_encoding *enc)
2205 const char *p, *e;
2206 int cr;
2208 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2209 if (!enc) enc = STR_ENC_GET(str);
2210 p = RSTRING_PTR(str);
2211 e = RSTRING_END(str);
2212 cr = ENC_CODERANGE(str);
2214 if (cr == ENC_CODERANGE_UNKNOWN) {
2215 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2216 if (cr) ENC_CODERANGE_SET(str, cr);
2217 return n;
2219 else {
2220 return enc_strlen(p, e, enc, cr);
2224 long
2225 rb_str_strlen(VALUE str)
2227 return str_strlen(str, NULL);
2231 * call-seq:
2232 * length -> integer
2234 * :include: doc/string/length.rdoc
2238 VALUE
2239 rb_str_length(VALUE str)
2241 return LONG2NUM(str_strlen(str, NULL));
2245 * call-seq:
2246 * bytesize -> integer
2248 * :include: doc/string/bytesize.rdoc
2252 VALUE
2253 rb_str_bytesize(VALUE str)
2255 return LONG2NUM(RSTRING_LEN(str));
2259 * call-seq:
2260 * empty? -> true or false
2262 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2264 * "hello".empty? # => false
2265 * " ".empty? # => false
2266 * "".empty? # => true
2270 static VALUE
2271 rb_str_empty(VALUE str)
2273 return RBOOL(RSTRING_LEN(str) == 0);
2277 * call-seq:
2278 * string + other_string -> new_string
2280 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2282 * "Hello from " + self.to_s # => "Hello from main"
2286 VALUE
2287 rb_str_plus(VALUE str1, VALUE str2)
2289 VALUE str3;
2290 rb_encoding *enc;
2291 char *ptr1, *ptr2, *ptr3;
2292 long len1, len2;
2293 int termlen;
2295 StringValue(str2);
2296 enc = rb_enc_check_str(str1, str2);
2297 RSTRING_GETMEM(str1, ptr1, len1);
2298 RSTRING_GETMEM(str2, ptr2, len2);
2299 termlen = rb_enc_mbminlen(enc);
2300 if (len1 > LONG_MAX - len2) {
2301 rb_raise(rb_eArgError, "string size too big");
2303 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2304 ptr3 = RSTRING_PTR(str3);
2305 memcpy(ptr3, ptr1, len1);
2306 memcpy(ptr3+len1, ptr2, len2);
2307 TERM_FILL(&ptr3[len1+len2], termlen);
2309 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2310 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2311 RB_GC_GUARD(str1);
2312 RB_GC_GUARD(str2);
2313 return str3;
2316 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2317 VALUE
2318 rb_str_opt_plus(VALUE str1, VALUE str2)
2320 RUBY_ASSERT(RBASIC_CLASS(str1) == rb_cString);
2321 RUBY_ASSERT(RBASIC_CLASS(str2) == rb_cString);
2322 long len1, len2;
2323 MAYBE_UNUSED(char) *ptr1, *ptr2;
2324 RSTRING_GETMEM(str1, ptr1, len1);
2325 RSTRING_GETMEM(str2, ptr2, len2);
2326 int enc1 = rb_enc_get_index(str1);
2327 int enc2 = rb_enc_get_index(str2);
2329 if (enc1 < 0) {
2330 return Qundef;
2332 else if (enc2 < 0) {
2333 return Qundef;
2335 else if (enc1 != enc2) {
2336 return Qundef;
2338 else if (len1 > LONG_MAX - len2) {
2339 return Qundef;
2341 else {
2342 return rb_str_plus(str1, str2);
2348 * call-seq:
2349 * string * integer -> new_string
2351 * Returns a new +String+ containing +integer+ copies of +self+:
2353 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2354 * "Ho! " * 0 # => ""
2358 VALUE
2359 rb_str_times(VALUE str, VALUE times)
2361 VALUE str2;
2362 long n, len;
2363 char *ptr2;
2364 int termlen;
2366 if (times == INT2FIX(1)) {
2367 return str_duplicate(rb_cString, str);
2369 if (times == INT2FIX(0)) {
2370 str2 = str_alloc_embed(rb_cString, 0);
2371 rb_enc_copy(str2, str);
2372 return str2;
2374 len = NUM2LONG(times);
2375 if (len < 0) {
2376 rb_raise(rb_eArgError, "negative argument");
2378 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2379 if (STR_EMBEDDABLE_P(len, 1)) {
2380 str2 = str_alloc_embed(rb_cString, len + 1);
2381 memset(RSTRING_PTR(str2), 0, len + 1);
2383 else {
2384 str2 = str_alloc_heap(rb_cString);
2385 RSTRING(str2)->as.heap.aux.capa = len;
2386 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2388 STR_SET_LEN(str2, len);
2389 rb_enc_copy(str2, str);
2390 return str2;
2392 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2393 rb_raise(rb_eArgError, "argument too big");
2396 len *= RSTRING_LEN(str);
2397 termlen = TERM_LEN(str);
2398 str2 = str_new0(rb_cString, 0, len, termlen);
2399 ptr2 = RSTRING_PTR(str2);
2400 if (len) {
2401 n = RSTRING_LEN(str);
2402 memcpy(ptr2, RSTRING_PTR(str), n);
2403 while (n <= len/2) {
2404 memcpy(ptr2 + n, ptr2, n);
2405 n *= 2;
2407 memcpy(ptr2 + n, ptr2, len-n);
2409 STR_SET_LEN(str2, len);
2410 TERM_FILL(&ptr2[len], termlen);
2411 rb_enc_cr_str_copy_for_substr(str2, str);
2413 return str2;
2417 * call-seq:
2418 * string % object -> new_string
2420 * Returns the result of formatting +object+ into the format specification +self+
2421 * (see Kernel#sprintf for formatting details):
2423 * "%05d" % 123 # => "00123"
2425 * If +self+ contains multiple substitutions, +object+ must be
2426 * an Array or Hash containing the values to be substituted:
2428 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2429 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2430 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2434 static VALUE
2435 rb_str_format_m(VALUE str, VALUE arg)
2437 VALUE tmp = rb_check_array_type(arg);
2439 if (!NIL_P(tmp)) {
2440 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2442 return rb_str_format(1, &arg, str);
2445 static inline void
2446 rb_check_lockedtmp(VALUE str)
2448 if (FL_TEST(str, STR_TMPLOCK)) {
2449 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2453 static inline void
2454 str_modifiable(VALUE str)
2456 rb_check_lockedtmp(str);
2457 rb_check_frozen(str);
2460 static inline int
2461 str_dependent_p(VALUE str)
2463 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2464 return 0;
2466 else {
2467 return 1;
2471 static inline int
2472 str_independent(VALUE str)
2474 str_modifiable(str);
2475 return !str_dependent_p(str);
2478 static void
2479 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2481 char *ptr;
2482 char *oldptr;
2483 long capa = len + expand;
2485 if (len > capa) len = capa;
2487 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2488 ptr = RSTRING(str)->as.heap.ptr;
2489 STR_SET_EMBED(str);
2490 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2491 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2492 STR_SET_LEN(str, len);
2493 return;
2496 ptr = ALLOC_N(char, (size_t)capa + termlen);
2497 oldptr = RSTRING_PTR(str);
2498 if (oldptr) {
2499 memcpy(ptr, oldptr, len);
2501 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2502 xfree(oldptr);
2504 STR_SET_NOEMBED(str);
2505 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2506 TERM_FILL(ptr + len, termlen);
2507 RSTRING(str)->as.heap.ptr = ptr;
2508 STR_SET_LEN(str, len);
2509 RSTRING(str)->as.heap.aux.capa = capa;
2512 void
2513 rb_str_modify(VALUE str)
2515 if (!str_independent(str))
2516 str_make_independent(str);
2517 ENC_CODERANGE_CLEAR(str);
2520 void
2521 rb_str_modify_expand(VALUE str, long expand)
2523 int termlen = TERM_LEN(str);
2524 long len = RSTRING_LEN(str);
2526 if (expand < 0) {
2527 rb_raise(rb_eArgError, "negative expanding string size");
2529 if (expand >= LONG_MAX - len) {
2530 rb_raise(rb_eArgError, "string size too big");
2533 if (!str_independent(str)) {
2534 str_make_independent_expand(str, len, expand, termlen);
2536 else if (expand > 0) {
2537 RESIZE_CAPA_TERM(str, len + expand, termlen);
2539 ENC_CODERANGE_CLEAR(str);
2542 /* As rb_str_modify(), but don't clear coderange */
2543 static void
2544 str_modify_keep_cr(VALUE str)
2546 if (!str_independent(str))
2547 str_make_independent(str);
2548 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2549 /* Force re-scan later */
2550 ENC_CODERANGE_CLEAR(str);
2553 static inline void
2554 str_discard(VALUE str)
2556 str_modifiable(str);
2557 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2558 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2559 RSTRING(str)->as.heap.ptr = 0;
2560 STR_SET_LEN(str, 0);
2564 void
2565 rb_must_asciicompat(VALUE str)
2567 rb_encoding *enc = rb_enc_get(str);
2568 if (!enc) {
2569 rb_raise(rb_eTypeError, "not encoding capable object");
2571 if (!rb_enc_asciicompat(enc)) {
2572 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2576 VALUE
2577 rb_string_value(volatile VALUE *ptr)
2579 VALUE s = *ptr;
2580 if (!RB_TYPE_P(s, T_STRING)) {
2581 s = rb_str_to_str(s);
2582 *ptr = s;
2584 return s;
2587 char *
2588 rb_string_value_ptr(volatile VALUE *ptr)
2590 VALUE str = rb_string_value(ptr);
2591 return RSTRING_PTR(str);
/*
 * Return 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    for (; n > 0; --n) {
        if (*s++) return 0;
    }
    return 1;
}
2603 static const char *
2604 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2606 const char *e = s + len;
2608 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2609 if (zero_filled(s, minlen)) return s;
2611 return 0;
2614 static char *
2615 str_fill_term(VALUE str, char *s, long len, int termlen)
2617 /* This function assumes that (capa + termlen) bytes of memory
2618 * is allocated, like many other functions in this file.
2620 if (str_dependent_p(str)) {
2621 if (!zero_filled(s + len, termlen))
2622 str_make_independent_expand(str, len, 0L, termlen);
2624 else {
2625 TERM_FILL(s + len, termlen);
2626 return s;
2628 return RSTRING_PTR(str);
2631 void
2632 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2634 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2635 long len = RSTRING_LEN(str);
2637 RUBY_ASSERT(capa >= len);
2638 if (capa - len < termlen) {
2639 rb_check_lockedtmp(str);
2640 str_make_independent_expand(str, len, 0L, termlen);
2642 else if (str_dependent_p(str)) {
2643 if (termlen > oldtermlen)
2644 str_make_independent_expand(str, len, 0L, termlen);
2646 else {
2647 if (!STR_EMBED_P(str)) {
2648 /* modify capa instead of realloc */
2649 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2650 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2652 if (termlen > oldtermlen) {
2653 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2657 return;
2660 static char *
2661 str_null_check(VALUE str, int *w)
2663 char *s = RSTRING_PTR(str);
2664 long len = RSTRING_LEN(str);
2665 rb_encoding *enc = rb_enc_get(str);
2666 const int minlen = rb_enc_mbminlen(enc);
2668 if (minlen > 1) {
2669 *w = 1;
2670 if (str_null_char(s, len, minlen, enc)) {
2671 return NULL;
2673 return str_fill_term(str, s, len, minlen);
2675 *w = 0;
2676 if (!s || memchr(s, 0, len)) {
2677 return NULL;
2679 if (s[len]) {
2680 s = str_fill_term(str, s, len, minlen);
2682 return s;
2685 char *
2686 rb_str_to_cstr(VALUE str)
2688 int w;
2689 return str_null_check(str, &w);
2692 char *
2693 rb_string_value_cstr(volatile VALUE *ptr)
2695 VALUE str = rb_string_value(ptr);
2696 int w;
2697 char *s = str_null_check(str, &w);
2698 if (!s) {
2699 if (w) {
2700 rb_raise(rb_eArgError, "string contains null char");
2702 rb_raise(rb_eArgError, "string contains null byte");
2704 return s;
2707 char *
2708 rb_str_fill_terminator(VALUE str, const int newminlen)
2710 char *s = RSTRING_PTR(str);
2711 long len = RSTRING_LEN(str);
2712 return str_fill_term(str, s, len, newminlen);
2715 VALUE
2716 rb_check_string_type(VALUE str)
2718 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2719 return str;
2723 * call-seq:
2724 * String.try_convert(object) -> object, new_string, or nil
2726 * If +object+ is a +String+ object, returns +object+.
2728 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2729 * calls <tt>object.to_str</tt> and returns the result.
2731 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2733 * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2735 static VALUE
2736 rb_str_s_try_convert(VALUE dummy, VALUE str)
2738 return rb_check_string_type(str);
2741 static char*
2742 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2744 long nth = *nthp;
2745 if (rb_enc_mbmaxlen(enc) == 1) {
2746 p += nth;
2748 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2749 p += nth * rb_enc_mbmaxlen(enc);
2751 else if (rb_enc_asciicompat(enc)) {
2752 const char *p2, *e2;
2753 int n;
2755 while (p < e && 0 < nth) {
2756 e2 = p + nth;
2757 if (e < e2) {
2758 *nthp = nth;
2759 return (char *)e;
2761 if (ISASCII(*p)) {
2762 p2 = search_nonascii(p, e2);
2763 if (!p2) {
2764 nth -= e2 - p;
2765 *nthp = nth;
2766 return (char *)e2;
2768 nth -= p2 - p;
2769 p = p2;
2771 n = rb_enc_mbclen(p, e, enc);
2772 p += n;
2773 nth--;
2775 *nthp = nth;
2776 if (nth != 0) {
2777 return (char *)e;
2779 return (char *)p;
2781 else {
2782 while (p < e && nth--) {
2783 p += rb_enc_mbclen(p, e, enc);
2786 if (p > e) p = e;
2787 *nthp = nth;
2788 return (char*)p;
2791 char*
2792 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2794 return str_nth_len(p, e, &nth, enc);
2797 static char*
2798 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2800 if (singlebyte)
2801 p += nth;
2802 else {
2803 p = str_nth_len(p, e, &nth, enc);
2805 if (!p) return 0;
2806 if (p > e) p = e;
2807 return (char *)p;
2810 /* char offset to byte offset */
2811 static long
2812 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2814 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2815 if (!pp) return e - p;
2816 return pp - p;
2819 long
2820 rb_str_offset(VALUE str, long pos)
2822 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2823 STR_ENC_GET(str), single_byte_optimizable(str));
2826 #ifdef NONASCII_MASK
2827 static char *
2828 str_utf8_nth(const char *p, const char *e, long *nthp)
2830 long nth = *nthp;
2831 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2832 const uintptr_t *s, *t;
2833 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2834 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2835 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2836 while (p < (const char *)s) {
2837 if (is_utf8_lead_byte(*p)) nth--;
2838 p++;
2840 do {
2841 nth -= count_utf8_lead_bytes_with_word(s);
2842 s++;
2843 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2844 p = (char *)s;
2846 while (p < e) {
2847 if (is_utf8_lead_byte(*p)) {
2848 if (nth == 0) break;
2849 nth--;
2851 p++;
2853 *nthp = nth;
2854 return (char *)p;
/*
 * Byte distance from +p+ to the position str_utf8_nth() reports for
 * +nth+ characters (bounded by +e+).
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *pp = str_utf8_nth(p, e, &nth);
    return pp - p;
}
2863 #endif
2865 /* byte offset to char offset */
2866 long
2867 rb_str_sublen(VALUE str, long pos)
2869 if (single_byte_optimizable(str) || pos < 0)
2870 return pos;
2871 else {
2872 char *p = RSTRING_PTR(str);
2873 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2877 static VALUE
2878 str_subseq(VALUE str, long beg, long len)
2880 VALUE str2;
2882 RUBY_ASSERT(beg >= 0);
2883 RUBY_ASSERT(len >= 0);
2884 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
2886 const int termlen = TERM_LEN(str);
2887 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2888 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2889 RB_GC_GUARD(str);
2890 return str2;
2893 str2 = str_alloc_heap(rb_cString);
2894 if (str_embed_capa(str2) >= len + termlen) {
2895 char *ptr2 = RSTRING(str2)->as.embed.ary;
2896 STR_SET_EMBED(str2);
2897 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
2898 TERM_FILL(ptr2+len, termlen);
2900 STR_SET_LEN(str2, len);
2901 RB_GC_GUARD(str);
2903 else {
2904 str_replace_shared(str2, str);
2905 RUBY_ASSERT(!STR_EMBED_P(str2));
2906 ENC_CODERANGE_CLEAR(str2);
2907 RSTRING(str2)->as.heap.ptr += beg;
2908 if (RSTRING_LEN(str2) > len) {
2909 STR_SET_LEN(str2, len);
2913 return str2;
2916 VALUE
2917 rb_str_subseq(VALUE str, long beg, long len)
2919 VALUE str2 = str_subseq(str, beg, len);
2920 rb_enc_cr_str_copy_for_substr(str2, str);
2921 return str2;
2924 char *
2925 rb_str_subpos(VALUE str, long beg, long *lenp)
2927 long len = *lenp;
2928 long slen = -1L;
2929 long blen = RSTRING_LEN(str);
2930 rb_encoding *enc = STR_ENC_GET(str);
2931 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2933 if (len < 0) return 0;
2934 if (!blen) {
2935 len = 0;
2937 if (single_byte_optimizable(str)) {
2938 if (beg > blen) return 0;
2939 if (beg < 0) {
2940 beg += blen;
2941 if (beg < 0) return 0;
2943 if (len > blen - beg)
2944 len = blen - beg;
2945 if (len < 0) return 0;
2946 p = s + beg;
2947 goto end;
2949 if (beg < 0) {
2950 if (len > -beg) len = -beg;
2951 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2952 beg = -beg;
2953 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2954 p = e;
2955 if (!p) return 0;
2956 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2957 if (!p) return 0;
2958 len = e - p;
2959 goto end;
2961 else {
2962 slen = str_strlen(str, enc);
2963 beg += slen;
2964 if (beg < 0) return 0;
2965 p = s + beg;
2966 if (len == 0) goto end;
2969 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2970 return 0;
2972 if (len == 0) {
2973 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2974 p = s + beg;
2976 #ifdef NONASCII_MASK
2977 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2978 enc == rb_utf8_encoding()) {
2979 p = str_utf8_nth(s, e, &beg);
2980 if (beg > 0) return 0;
2981 len = str_utf8_offset(p, e, len);
2983 #endif
2984 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2985 int char_sz = rb_enc_mbmaxlen(enc);
2987 p = s + beg * char_sz;
2988 if (p > e) {
2989 return 0;
2991 else if (len * char_sz > e - p)
2992 len = e - p;
2993 else
2994 len *= char_sz;
2996 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2997 if (beg > 0) return 0;
2998 len = 0;
3000 else {
3001 len = str_offset(p, e, len, enc, 0);
3003 end:
3004 *lenp = len;
3005 RB_GC_GUARD(str);
3006 return p;
3009 static VALUE str_substr(VALUE str, long beg, long len, int empty);
3011 VALUE
3012 rb_str_substr(VALUE str, long beg, long len)
3014 return str_substr(str, beg, len, TRUE);
3017 static VALUE
3018 str_substr(VALUE str, long beg, long len, int empty)
3020 char *p = rb_str_subpos(str, beg, &len);
3022 if (!p) return Qnil;
3023 if (!len && !empty) return Qnil;
3025 beg = p - RSTRING_PTR(str);
3027 VALUE str2 = str_subseq(str, beg, len);
3028 rb_enc_cr_str_copy_for_substr(str2, str);
3029 return str2;
3032 /* :nodoc: */
3033 VALUE
3034 rb_str_freeze(VALUE str)
3036 if (CHILLED_STRING_P(str)) {
3037 FL_UNSET_RAW(str, STR_CHILLED);
3040 if (OBJ_FROZEN(str)) return str;
3041 rb_str_resize(str, RSTRING_LEN(str));
3042 return rb_obj_freeze(str);
3046 * call-seq:
3047 * +string -> new_string or self
3049 * Returns +self+ if +self+ is not frozen.
3051 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3053 static VALUE
3054 str_uplus(VALUE str)
3056 if (OBJ_FROZEN(str)) {
3057 return rb_str_dup(str);
3059 else {
3060 return str;
3065 * call-seq:
3066 * -string -> frozen_string
3067 * dedup -> frozen_string
3069 * Returns a frozen, possibly pre-existing copy of the string.
3071 * The returned +String+ will be deduplicated as long as it does not have
3072 * any instance variables set on it and is not a String subclass.
3074 * Note that <tt>-string</tt> variant is more convenient for defining
3075 * constants:
3077 * FILENAME = -'config/database.yml'
3079 * while +dedup+ is better suitable for using the method in chains
3080 * of calculations:
3082 * @url_list.concat(urls.map(&:dedup))
3085 static VALUE
3086 str_uminus(VALUE str)
3088 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3089 str = rb_str_dup(str);
3091 return rb_fstring(str);
3094 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3095 #define rb_str_dup_frozen rb_str_new_frozen
3097 VALUE
3098 rb_str_locktmp(VALUE str)
3100 if (FL_TEST(str, STR_TMPLOCK)) {
3101 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3103 FL_SET(str, STR_TMPLOCK);
3104 return str;
3107 VALUE
3108 rb_str_unlocktmp(VALUE str)
3110 if (!FL_TEST(str, STR_TMPLOCK)) {
3111 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3113 FL_UNSET(str, STR_TMPLOCK);
3114 return str;
3117 VALUE
3118 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3120 rb_str_locktmp(str);
3121 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3124 void
3125 rb_str_set_len(VALUE str, long len)
3127 long capa;
3128 const int termlen = TERM_LEN(str);
3130 str_modifiable(str);
3131 if (STR_SHARED_P(str)) {
3132 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3134 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3135 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3138 int cr = ENC_CODERANGE(str);
3139 if (cr == ENC_CODERANGE_UNKNOWN) {
3140 /* Leave unknown. */
3142 else if (len > RSTRING_LEN(str)) {
3143 if (ENC_CODERANGE_CLEAN_P(cr)) {
3144 /* Update the coderange regarding the extended part. */
3145 const char *const prev_end = RSTRING_END(str);
3146 const char *const new_end = RSTRING_PTR(str) + len;
3147 rb_encoding *enc = rb_enc_get(str);
3148 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3149 ENC_CODERANGE_SET(str, cr);
3151 else if (cr == ENC_CODERANGE_BROKEN) {
3152 /* May be valid now, by appended part. */
3153 ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
3156 else if (len < RSTRING_LEN(str)) {
3157 if (cr != ENC_CODERANGE_7BIT) {
3158 /* ASCII-only string is keeping after truncated. Valid
3159 * and broken may be invalid or valid, leave unknown. */
3160 ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
3164 STR_SET_LEN(str, len);
3165 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3168 VALUE
3169 rb_str_resize(VALUE str, long len)
3171 if (len < 0) {
3172 rb_raise(rb_eArgError, "negative string size (or size too big)");
3175 int independent = str_independent(str);
3176 long slen = RSTRING_LEN(str);
3178 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3179 ENC_CODERANGE_CLEAR(str);
3183 long capa;
3184 const int termlen = TERM_LEN(str);
3185 if (STR_EMBED_P(str)) {
3186 if (len == slen) return str;
3187 if (str_embed_capa(str) >= len + termlen) {
3188 STR_SET_LEN(str, len);
3189 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3190 return str;
3192 str_make_independent_expand(str, slen, len - slen, termlen);
3194 else if (str_embed_capa(str) >= len + termlen) {
3195 char *ptr = STR_HEAP_PTR(str);
3196 STR_SET_EMBED(str);
3197 if (slen > len) slen = len;
3198 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3199 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3200 STR_SET_LEN(str, len);
3201 if (independent) ruby_xfree(ptr);
3202 return str;
3204 else if (!independent) {
3205 if (len == slen) return str;
3206 str_make_independent_expand(str, slen, len - slen, termlen);
3208 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3209 (capa - len) > (len < 1024 ? len : 1024)) {
3210 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3211 (size_t)len + termlen, STR_HEAP_SIZE(str));
3212 RSTRING(str)->as.heap.aux.capa = len;
3214 else if (len == slen) return str;
3215 STR_SET_LEN(str, len);
3216 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3218 return str;
3221 static VALUE
3222 str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3224 if (keep_cr) {
3225 str_modify_keep_cr(str);
3227 else {
3228 rb_str_modify(str);
3230 if (len == 0) return 0;
3232 long total, olen, off = -1;
3233 char *sptr;
3234 const int termlen = TERM_LEN(str);
3236 RSTRING_GETMEM(str, sptr, olen);
3237 if (ptr >= sptr && ptr <= sptr + olen) {
3238 off = ptr - sptr;
3241 long capa = str_capacity(str, termlen);
3243 if (olen > LONG_MAX - len) {
3244 rb_raise(rb_eArgError, "string sizes too big");
3246 total = olen + len;
3247 if (capa < total) {
3248 if (total >= LONG_MAX / 2) {
3249 capa = total;
3251 while (total > capa) {
3252 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3254 RESIZE_CAPA_TERM(str, capa, termlen);
3255 sptr = RSTRING_PTR(str);
3257 if (off != -1) {
3258 ptr = sptr + off;
3260 memcpy(sptr + olen, ptr, len);
3261 STR_SET_LEN(str, total);
3262 TERM_FILL(sptr + total, termlen); /* sentinel */
3264 return str;
/* Append helpers that always clear the code range (keep_cr == false).
 * str_buf_cat2 takes a string literal and computes its length at compile time. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3270 VALUE
3271 rb_str_cat(VALUE str, const char *ptr, long len)
3273 if (len == 0) return str;
3274 if (len < 0) {
3275 rb_raise(rb_eArgError, "negative string size (or size too big)");
3277 return str_buf_cat(str, ptr, len);
3280 VALUE
3281 rb_str_cat_cstr(VALUE str, const char *ptr)
3283 must_not_null(ptr);
3284 return rb_str_buf_cat(str, ptr, strlen(ptr));
3287 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3288 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3289 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3291 static VALUE
3292 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3293 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3295 int str_encindex = ENCODING_GET(str);
3296 int res_encindex;
3297 int str_cr, res_cr;
3298 rb_encoding *str_enc, *ptr_enc;
3300 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3302 if (str_encindex == ptr_encindex) {
3303 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3304 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3307 else {
3308 str_enc = rb_enc_from_index(str_encindex);
3309 ptr_enc = rb_enc_from_index(ptr_encindex);
3310 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3311 if (len == 0)
3312 return str;
3313 if (RSTRING_LEN(str) == 0) {
3314 rb_str_buf_cat(str, ptr, len);
3315 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3316 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3317 return str;
3319 goto incompatible;
3321 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3322 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3324 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3325 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3326 str_cr = rb_enc_str_coderange(str);
3330 if (ptr_cr_ret)
3331 *ptr_cr_ret = ptr_cr;
3333 if (str_encindex != ptr_encindex &&
3334 str_cr != ENC_CODERANGE_7BIT &&
3335 ptr_cr != ENC_CODERANGE_7BIT) {
3336 str_enc = rb_enc_from_index(str_encindex);
3337 ptr_enc = rb_enc_from_index(ptr_encindex);
3338 goto incompatible;
3341 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3342 res_encindex = str_encindex;
3343 res_cr = ENC_CODERANGE_UNKNOWN;
3345 else if (str_cr == ENC_CODERANGE_7BIT) {
3346 if (ptr_cr == ENC_CODERANGE_7BIT) {
3347 res_encindex = str_encindex;
3348 res_cr = ENC_CODERANGE_7BIT;
3350 else {
3351 res_encindex = ptr_encindex;
3352 res_cr = ptr_cr;
3355 else if (str_cr == ENC_CODERANGE_VALID) {
3356 res_encindex = str_encindex;
3357 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3358 res_cr = str_cr;
3359 else
3360 res_cr = ptr_cr;
3362 else { /* str_cr == ENC_CODERANGE_BROKEN */
3363 res_encindex = str_encindex;
3364 res_cr = str_cr;
3365 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3368 if (len < 0) {
3369 rb_raise(rb_eArgError, "negative string size (or size too big)");
3371 str_buf_cat(str, ptr, len);
3372 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3373 return str;
3375 incompatible:
3376 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3377 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3378 UNREACHABLE_RETURN(Qundef);
3381 VALUE
3382 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3384 return rb_enc_cr_str_buf_cat(str, ptr, len,
3385 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3388 VALUE
3389 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3391 /* ptr must reference NUL terminated ASCII string. */
3392 int encindex = ENCODING_GET(str);
3393 rb_encoding *enc = rb_enc_from_index(encindex);
3394 if (rb_enc_asciicompat(enc)) {
3395 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3396 encindex, ENC_CODERANGE_7BIT, 0);
3398 else {
3399 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3400 while (*ptr) {
3401 unsigned int c = (unsigned char)*ptr;
3402 int len = rb_enc_codelen(c, enc);
3403 rb_enc_mbcput(c, buf, enc);
3404 rb_enc_cr_str_buf_cat(str, buf, len,
3405 encindex, ENC_CODERANGE_VALID, 0);
3406 ptr++;
3408 return str;
3412 VALUE
3413 rb_str_buf_append(VALUE str, VALUE str2)
3415 int str2_cr = rb_enc_str_coderange(str2);
3417 if (str_enc_fastpath(str)) {
3418 switch (str2_cr) {
3419 case ENC_CODERANGE_7BIT:
3420 // If RHS is 7bit we can do simple concatenation
3421 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3422 RB_GC_GUARD(str2);
3423 return str;
3424 case ENC_CODERANGE_VALID:
3425 // If RHS is valid, we can do simple concatenation if encodings are the same
3426 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3427 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3428 int str_cr = ENC_CODERANGE(str);
3429 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3430 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3432 RB_GC_GUARD(str2);
3433 return str;
3438 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3439 ENCODING_GET(str2), str2_cr, &str2_cr);
3441 ENC_CODERANGE_SET(str2, str2_cr);
3443 return str;
3446 VALUE
3447 rb_str_append(VALUE str, VALUE str2)
3449 StringValue(str2);
3450 return rb_str_buf_append(str, str2);
3453 VALUE
3454 rb_str_concat_literals(size_t num, const VALUE *strary)
3456 VALUE str;
3457 size_t i, s = 0;
3458 unsigned long len = 1;
3460 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3461 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3463 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3464 str = rb_str_buf_new(len);
3465 str_enc_copy_direct(str, strary[0]);
3467 for (i = s; i < num; ++i) {
3468 const VALUE v = strary[i];
3469 int encidx = ENCODING_GET(v);
3471 rb_str_buf_append(str, v);
3472 if (encidx != ENCINDEX_US_ASCII) {
3473 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3474 rb_enc_set_index(str, encidx);
3477 return str;
3481 * call-seq:
3482 * concat(*objects) -> string
3484 * Concatenates each object in +objects+ to +self+ and returns +self+:
3486 * s = 'foo'
3487 * s.concat('bar', 'baz') # => "foobarbaz"
3488 * s # => "foobarbaz"
3490 * For each given object +object+ that is an Integer,
3491 * the value is considered a codepoint and converted to a character before concatenation:
3493 * s = 'foo'
3494 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3496 * Related: String#<<, which takes a single argument.
3498 static VALUE
3499 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3501 str_modifiable(str);
3503 if (argc == 1) {
3504 return rb_str_concat(str, argv[0]);
3506 else if (argc > 1) {
3507 int i;
3508 VALUE arg_str = rb_str_tmp_new(0);
3509 rb_enc_copy(arg_str, str);
3510 for (i = 0; i < argc; i++) {
3511 rb_str_concat(arg_str, argv[i]);
3513 rb_str_buf_append(str, arg_str);
3516 return str;
3520 * call-seq:
3521 * string << object -> string
3523 * Concatenates +object+ to +self+ and returns +self+:
3525 * s = 'foo'
3526 * s << 'bar' # => "foobar"
3527 * s # => "foobar"
3529 * If +object+ is an Integer,
3530 * the value is considered a codepoint and converted to a character before concatenation:
3532 * s = 'foo'
3533 * s << 33 # => "foo!"
3535 * Related: String#concat, which takes multiple arguments.
3537 VALUE
3538 rb_str_concat(VALUE str1, VALUE str2)
3540 unsigned int code;
3541 rb_encoding *enc = STR_ENC_GET(str1);
3542 int encidx;
3544 if (RB_INTEGER_TYPE_P(str2)) {
3545 if (rb_num_to_uint(str2, &code) == 0) {
3547 else if (FIXNUM_P(str2)) {
3548 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3550 else {
3551 rb_raise(rb_eRangeError, "bignum out of char range");
3554 else {
3555 return rb_str_append(str1, str2);
3558 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3559 if (encidx >= 0) {
3560 char buf[1];
3561 buf[0] = (char)code;
3562 rb_str_cat(str1, buf, 1);
3563 if (encidx != rb_enc_to_index(enc)) {
3564 rb_enc_associate_index(str1, encidx);
3565 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3568 else {
3569 long pos = RSTRING_LEN(str1);
3570 int cr = ENC_CODERANGE(str1);
3571 int len;
3572 char *buf;
3574 switch (len = rb_enc_codelen(code, enc)) {
3575 case ONIGERR_INVALID_CODE_POINT_VALUE:
3576 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3577 break;
3578 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3579 case 0:
3580 rb_raise(rb_eRangeError, "%u out of char range", code);
3581 break;
3583 buf = ALLOCA_N(char, len + 1);
3584 rb_enc_mbcput(code, buf, enc);
3585 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3586 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3588 rb_str_resize(str1, pos+len);
3589 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3590 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3591 cr = ENC_CODERANGE_VALID;
3593 else if (cr == ENC_CODERANGE_BROKEN) {
3594 cr = ENC_CODERANGE_UNKNOWN;
3596 ENC_CODERANGE_SET(str1, cr);
3598 return str1;
3602 rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3604 int encidx = rb_enc_to_index(enc);
3606 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3607 /* US-ASCII automatically extended to ASCII-8BIT */
3608 if (code > 0xFF) {
3609 rb_raise(rb_eRangeError, "%u out of char range", code);
3611 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3612 return ENCINDEX_ASCII_8BIT;
3614 return encidx;
3616 else {
3617 return -1;
3622 * call-seq:
3623 * prepend(*other_strings) -> string
3625 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3627 * s = 'foo'
3628 * s.prepend('bar', 'baz') # => "barbazfoo"
3629 * s # => "barbazfoo"
3631 * Related: String#concat.
3634 static VALUE
3635 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3637 str_modifiable(str);
3639 if (argc == 1) {
3640 rb_str_update(str, 0L, 0L, argv[0]);
3642 else if (argc > 1) {
3643 int i;
3644 VALUE arg_str = rb_str_tmp_new(0);
3645 rb_enc_copy(arg_str, str);
3646 for (i = 0; i < argc; i++) {
3647 rb_str_append(arg_str, argv[i]);
3649 rb_str_update(str, 0L, 0L, arg_str);
3652 return str;
3655 st_index_t
3656 rb_str_hash(VALUE str)
3658 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
3659 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
3660 if (e && !is_ascii_string(str)) {
3661 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
3663 return h;
3667 rb_str_hash_cmp(VALUE str1, VALUE str2)
3669 long len1, len2;
3670 const char *ptr1, *ptr2;
3671 RSTRING_GETMEM(str1, ptr1, len1);
3672 RSTRING_GETMEM(str2, ptr2, len2);
3673 return (len1 != len2 ||
3674 !rb_str_comparable(str1, str2) ||
3675 memcmp(ptr1, ptr2, len1) != 0);
3679 * call-seq:
3680 * hash -> integer
3682 * Returns the integer hash value for +self+.
3683 * The value is based on the length, content and encoding of +self+.
3685 * Related: Object#hash.
3688 static VALUE
3689 rb_str_hash_m(VALUE str)
3691 st_index_t hval = rb_str_hash(str);
3692 return ST2FIX(hval);
/* Smaller of two values. Each argument may be evaluated twice, so pass
 * only side-effect-free expressions. */
#define lesser(a,b) (((a)>(b))?(b):(a))
3698 rb_str_comparable(VALUE str1, VALUE str2)
3700 int idx1, idx2;
3701 int rc1, rc2;
3703 if (RSTRING_LEN(str1) == 0) return TRUE;
3704 if (RSTRING_LEN(str2) == 0) return TRUE;
3705 idx1 = ENCODING_GET(str1);
3706 idx2 = ENCODING_GET(str2);
3707 if (idx1 == idx2) return TRUE;
3708 rc1 = rb_enc_str_coderange(str1);
3709 rc2 = rb_enc_str_coderange(str2);
3710 if (rc1 == ENC_CODERANGE_7BIT) {
3711 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3712 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3713 return TRUE;
3715 if (rc2 == ENC_CODERANGE_7BIT) {
3716 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3717 return TRUE;
3719 return FALSE;
3723 rb_str_cmp(VALUE str1, VALUE str2)
3725 long len1, len2;
3726 const char *ptr1, *ptr2;
3727 int retval;
3729 if (str1 == str2) return 0;
3730 RSTRING_GETMEM(str1, ptr1, len1);
3731 RSTRING_GETMEM(str2, ptr2, len2);
3732 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3733 if (len1 == len2) {
3734 if (!rb_str_comparable(str1, str2)) {
3735 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3736 return 1;
3737 return -1;
3739 return 0;
3741 if (len1 > len2) return 1;
3742 return -1;
3744 if (retval > 0) return 1;
3745 return -1;
3749 * call-seq:
3750 * string == object -> true or false
3751 * string === object -> true or false
3753 * Returns +true+ if +object+ has the same length and content;
3754 * as +self+; +false+ otherwise:
3756 * s = 'foo'
3757 * s == 'foo' # => true
3758 * s == 'food' # => false
3759 * s == 'FOO' # => false
3761 * Returns +false+ if the two strings' encodings are not compatible:
3762 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3764 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
3765 * two strings are compared using <code>object.==</code>.
3768 VALUE
3769 rb_str_equal(VALUE str1, VALUE str2)
3771 if (str1 == str2) return Qtrue;
3772 if (!RB_TYPE_P(str2, T_STRING)) {
3773 if (!rb_respond_to(str2, idTo_str)) {
3774 return Qfalse;
3776 return rb_equal(str2, str1);
3778 return rb_str_eql_internal(str1, str2);
3782 * call-seq:
3783 * eql?(object) -> true or false
3785 * Returns +true+ if +object+ has the same length and content;
3786 * as +self+; +false+ otherwise:
3788 * s = 'foo'
3789 * s.eql?('foo') # => true
3790 * s.eql?('food') # => false
3791 * s.eql?('FOO') # => false
3793 * Returns +false+ if the two strings' encodings are not compatible:
3795 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3799 VALUE
3800 rb_str_eql(VALUE str1, VALUE str2)
3802 if (str1 == str2) return Qtrue;
3803 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3804 return rb_str_eql_internal(str1, str2);
3808 * call-seq:
3809 * string <=> other_string -> -1, 0, 1, or nil
3811 * Compares +self+ and +other_string+, returning:
3813 * - -1 if +other_string+ is larger.
3814 * - 0 if the two are equal.
3815 * - 1 if +other_string+ is smaller.
3816 * - +nil+ if the two are incomparable.
3818 * Examples:
3820 * 'foo' <=> 'foo' # => 0
3821 * 'foo' <=> 'food' # => -1
3822 * 'food' <=> 'foo' # => 1
3823 * 'FOO' <=> 'foo' # => -1
3824 * 'foo' <=> 'FOO' # => 1
3825 * 'foo' <=> 1 # => nil
3829 static VALUE
3830 rb_str_cmp_m(VALUE str1, VALUE str2)
3832 int result;
3833 VALUE s = rb_check_string_type(str2);
3834 if (NIL_P(s)) {
3835 return rb_invcmp(str1, str2);
3837 result = rb_str_cmp(str1, s);
3838 return INT2FIX(result);
3841 static VALUE str_casecmp(VALUE str1, VALUE str2);
3842 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3845 * call-seq:
3846 * casecmp(other_string) -> -1, 0, 1, or nil
3848 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3850 * - -1 if <tt>other_string.downcase</tt> is larger.
3851 * - 0 if the two are equal.
3852 * - 1 if <tt>other_string.downcase</tt> is smaller.
3853 * - +nil+ if the two are incomparable.
3855 * Examples:
3857 * 'foo'.casecmp('foo') # => 0
3858 * 'foo'.casecmp('food') # => -1
3859 * 'food'.casecmp('foo') # => 1
3860 * 'FOO'.casecmp('foo') # => 0
3861 * 'foo'.casecmp('FOO') # => 0
3862 * 'foo'.casecmp(1) # => nil
3864 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3866 * Related: String#casecmp?.
3870 static VALUE
3871 rb_str_casecmp(VALUE str1, VALUE str2)
3873 VALUE s = rb_check_string_type(str2);
3874 if (NIL_P(s)) {
3875 return Qnil;
3877 return str_casecmp(str1, s);
3880 static VALUE
3881 str_casecmp(VALUE str1, VALUE str2)
3883 long len;
3884 rb_encoding *enc;
3885 const char *p1, *p1end, *p2, *p2end;
3887 enc = rb_enc_compatible(str1, str2);
3888 if (!enc) {
3889 return Qnil;
3892 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3893 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3894 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3895 while (p1 < p1end && p2 < p2end) {
3896 if (*p1 != *p2) {
3897 unsigned int c1 = TOLOWER(*p1 & 0xff);
3898 unsigned int c2 = TOLOWER(*p2 & 0xff);
3899 if (c1 != c2)
3900 return INT2FIX(c1 < c2 ? -1 : 1);
3902 p1++;
3903 p2++;
3906 else {
3907 while (p1 < p1end && p2 < p2end) {
3908 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3909 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3911 if (0 <= c1 && 0 <= c2) {
3912 c1 = TOLOWER(c1);
3913 c2 = TOLOWER(c2);
3914 if (c1 != c2)
3915 return INT2FIX(c1 < c2 ? -1 : 1);
3917 else {
3918 int r;
3919 l1 = rb_enc_mbclen(p1, p1end, enc);
3920 l2 = rb_enc_mbclen(p2, p2end, enc);
3921 len = l1 < l2 ? l1 : l2;
3922 r = memcmp(p1, p2, len);
3923 if (r != 0)
3924 return INT2FIX(r < 0 ? -1 : 1);
3925 if (l1 != l2)
3926 return INT2FIX(l1 < l2 ? -1 : 1);
3928 p1 += l1;
3929 p2 += l2;
3932 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3933 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3934 return INT2FIX(-1);
3938 * call-seq:
3939 * casecmp?(other_string) -> true, false, or nil
3941 * Returns +true+ if +self+ and +other_string+ are equal after
3942 * Unicode case folding, otherwise +false+:
3944 * 'foo'.casecmp?('foo') # => true
3945 * 'foo'.casecmp?('food') # => false
3946 * 'food'.casecmp?('foo') # => false
3947 * 'FOO'.casecmp?('foo') # => true
3948 * 'foo'.casecmp?('FOO') # => true
3950 * Returns +nil+ if the two values are incomparable:
3952 * 'foo'.casecmp?(1) # => nil
3954 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3956 * Related: String#casecmp.
3960 static VALUE
3961 rb_str_casecmp_p(VALUE str1, VALUE str2)
3963 VALUE s = rb_check_string_type(str2);
3964 if (NIL_P(s)) {
3965 return Qnil;
3967 return str_casecmp_p(str1, s);
3970 static VALUE
3971 str_casecmp_p(VALUE str1, VALUE str2)
3973 rb_encoding *enc;
3974 VALUE folded_str1, folded_str2;
3975 VALUE fold_opt = sym_fold;
3977 enc = rb_enc_compatible(str1, str2);
3978 if (!enc) {
3979 return Qnil;
3982 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3983 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3985 return rb_str_eql(folded_str1, folded_str2);
3988 static long
3989 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3990 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3992 const char *search_start = str_ptr;
3993 long pos, search_len = str_len - offset;
3995 for (;;) {
3996 const char *t;
3997 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3998 if (pos < 0) return pos;
3999 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4000 if (t == search_start + pos) break;
4001 search_len -= t - search_start;
4002 if (search_len <= 0) return -1;
4003 offset += t - search_start;
4004 search_start = t;
4006 return pos + offset;
/* Convenience wrappers around rb_strseq_index(): rb_str_index passes
 * in_byte = 0 (offset is converted from characters to bytes there), while
 * rb_str_byteindex passes in_byte = 1 (offset is used as a byte offset). */
4009 /* found index in byte */
4010 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4011 #define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4013 static long
4014 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4016 const char *str_ptr, *str_ptr_end, *sub_ptr;
4017 long str_len, sub_len;
4018 rb_encoding *enc;
4020 enc = rb_enc_check(str, sub);
4021 if (is_broken_string(sub)) return -1;
4023 str_ptr = RSTRING_PTR(str);
4024 str_ptr_end = RSTRING_END(str);
4025 str_len = RSTRING_LEN(str);
4026 sub_ptr = RSTRING_PTR(sub);
4027 sub_len = RSTRING_LEN(sub);
4029 if (str_len < sub_len) return -1;
4031 if (offset != 0) {
4032 long str_len_char, sub_len_char;
4033 int single_byte = single_byte_optimizable(str);
4034 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4035 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4036 if (offset < 0) {
4037 offset += str_len_char;
4038 if (offset < 0) return -1;
4040 if (str_len_char - offset < sub_len_char) return -1;
4041 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4042 str_ptr += offset;
4044 if (sub_len == 0) return offset;
4046 /* need proceed one character at a time */
4047 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4052 * call-seq:
4053 * index(substring, offset = 0) -> integer or nil
4054 * index(regexp, offset = 0) -> integer or nil
4056 * :include: doc/string/index.rdoc
4060 static VALUE
4061 rb_str_index_m(int argc, VALUE *argv, VALUE str)
4063 VALUE sub;
4064 VALUE initpos;
4065 rb_encoding *enc = STR_ENC_GET(str);
4066 long pos;
4068 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4069 long slen = str_strlen(str, enc); /* str's enc */
4070 pos = NUM2LONG(initpos);
4071 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4072 if (RB_TYPE_P(sub, T_REGEXP)) {
4073 rb_backref_set(Qnil);
4075 return Qnil;
4078 else {
4079 pos = 0;
4082 if (RB_TYPE_P(sub, T_REGEXP)) {
4083 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4084 enc, single_byte_optimizable(str));
4086 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4087 VALUE match = rb_backref_get();
4088 struct re_registers *regs = RMATCH_REGS(match);
4089 pos = rb_str_sublen(str, BEG(0));
4090 return LONG2NUM(pos);
4093 else {
4094 StringValue(sub);
4095 pos = rb_str_index(str, sub, pos);
4096 if (pos >= 0) {
4097 pos = rb_str_sublen(str, pos);
4098 return LONG2NUM(pos);
4101 return Qnil;
4104 /* Ensure that the given pos is a valid character boundary.
4105 * Note that in this function, "character" means a code point
4106 * (Unicode scalar value), not a grapheme cluster.
4108 static void
4109 str_ensure_byte_pos(VALUE str, long pos)
4111 const char *s = RSTRING_PTR(str);
4112 const char *e = RSTRING_END(str);
4113 const char *p = s + pos;
4114 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4115 rb_raise(rb_eIndexError,
4116 "offset %ld does not land on character boundary", pos);
4121 * call-seq:
4122 * byteindex(substring, offset = 0) -> integer or nil
4123 * byteindex(regexp, offset = 0) -> integer or nil
4125 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4126 * or +nil+ if none found:
4128 * 'foo'.byteindex('f') # => 0
4129 * 'foo'.byteindex('o') # => 1
4130 * 'foo'.byteindex('oo') # => 1
4131 * 'foo'.byteindex('ooo') # => nil
4133 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4134 * or +nil+ if none found:
4136 * 'foo'.byteindex(/f/) # => 0
4137 * 'foo'.byteindex(/o/) # => 1
4138 * 'foo'.byteindex(/oo/) # => 1
4139 * 'foo'.byteindex(/ooo/) # => nil
4141 * Integer argument +offset+, if given, specifies the byte-based position in the
4142 * string to begin the search:
4144 * 'foo'.byteindex('o', 1) # => 1
4145 * 'foo'.byteindex('o', 2) # => 2
4146 * 'foo'.byteindex('o', 3) # => nil
4148 * If +offset+ is negative, counts backward from the end of +self+:
4150 * 'foo'.byteindex('o', -1) # => 2
4151 * 'foo'.byteindex('o', -2) # => 1
4152 * 'foo'.byteindex('o', -3) # => 1
4153 * 'foo'.byteindex('o', -4) # => nil
4155 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4156 * raised.
4158 * Related: String#index, String#byterindex.
4161 static VALUE
4162 rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4164 VALUE sub;
4165 VALUE initpos;
4166 long pos;
4168 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4169 long slen = RSTRING_LEN(str);
4170 pos = NUM2LONG(initpos);
4171 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4172 if (RB_TYPE_P(sub, T_REGEXP)) {
4173 rb_backref_set(Qnil);
4175 return Qnil;
4178 else {
4179 pos = 0;
4182 str_ensure_byte_pos(str, pos);
4184 if (RB_TYPE_P(sub, T_REGEXP)) {
4185 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4186 VALUE match = rb_backref_get();
4187 struct re_registers *regs = RMATCH_REGS(match);
4188 pos = BEG(0);
4189 return LONG2NUM(pos);
4192 else {
4193 StringValue(sub);
4194 pos = rb_str_byteindex(str, sub, pos);
4195 if (pos >= 0) return LONG2NUM(pos);
4197 return Qnil;
4200 #ifdef HAVE_MEMRCHR
4201 static long
4202 str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4204 char *hit, *adjusted;
4205 int c;
4206 long slen, searchlen;
4207 char *sbeg, *e, *t;
4209 sbeg = RSTRING_PTR(str);
4210 slen = RSTRING_LEN(sub);
4211 if (slen == 0) return s - sbeg;
4212 e = RSTRING_END(str);
4213 t = RSTRING_PTR(sub);
4214 c = *t & 0xff;
4215 searchlen = s - sbeg + 1;
4217 do {
4218 hit = memrchr(sbeg, c, searchlen);
4219 if (!hit) break;
4220 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4221 if (hit != adjusted) {
4222 searchlen = adjusted - sbeg;
4223 continue;
4225 if (memcmp(hit, t, slen) == 0)
4226 return hit - sbeg;
4227 searchlen = adjusted - sbeg;
4228 } while (searchlen > 0);
4230 return -1;
4232 #else
4233 static long
4234 str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4236 long slen;
4237 char *sbeg, *e, *t;
4239 sbeg = RSTRING_PTR(str);
4240 e = RSTRING_END(str);
4241 t = RSTRING_PTR(sub);
4242 slen = RSTRING_LEN(sub);
4244 while (s) {
4245 if (memcmp(s, t, slen) == 0) {
4246 return s - sbeg;
4248 if (s <= sbeg) break;
4249 s = rb_enc_prev_char(sbeg, s, e, enc);
4252 return -1;
4254 #endif
4256 /* found index in byte */
4257 static long
4258 rb_str_rindex(VALUE str, VALUE sub, long pos)
4260 long len, slen;
4261 char *sbeg, *s;
4262 rb_encoding *enc;
4263 int singlebyte;
4265 enc = rb_enc_check(str, sub);
4266 if (is_broken_string(sub)) return -1;
4267 singlebyte = single_byte_optimizable(str);
4268 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4269 slen = str_strlen(sub, enc); /* rb_enc_check */
4271 /* substring longer than string */
4272 if (len < slen) return -1;
4273 if (len - pos < slen) pos = len - slen;
4274 if (len == 0) return pos;
4276 sbeg = RSTRING_PTR(str);
4278 if (pos == 0) {
4279 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4280 return 0;
4281 else
4282 return -1;
4285 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4286 return str_rindex(str, sub, s, enc);
4290 * call-seq:
4291 * rindex(substring, offset = self.length) -> integer or nil
4292 * rindex(regexp, offset = self.length) -> integer or nil
4294 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4295 * or +nil+ if none found:
4297 * 'foo'.rindex('f') # => 0
4298 * 'foo'.rindex('o') # => 2
4299 * 'foo'.rindex('oo') # => 1
4300 * 'foo'.rindex('ooo') # => nil
4302 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4303 * or +nil+ if none found:
4305 * 'foo'.rindex(/f/) # => 0
4306 * 'foo'.rindex(/o/) # => 2
4307 * 'foo'.rindex(/oo/) # => 1
4308 * 'foo'.rindex(/ooo/) # => nil
4310 * The _last_ match means starting at the possible last position, not
4311 * the last of longest matches.
4313 * 'foo'.rindex(/o+/) # => 2
4314 * $~ #=> #<MatchData "o">
4316 * To get the last longest match, combine the pattern with a negative
4317 * lookbehind:
4319 * 'foo'.rindex(/(?<!o)o+/) # => 1
4320 * $~ #=> #<MatchData "oo">
4322 * Or use String#index with a negative lookahead:
4324 * 'foo'.index(/o+(?!.*o)/) # => 1
4325 * $~ #=> #<MatchData "oo">
4327 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4328 * string to _end_ the search:
4330 * 'foo'.rindex('o', 0) # => nil
4331 * 'foo'.rindex('o', 1) # => 1
4332 * 'foo'.rindex('o', 2) # => 2
4333 * 'foo'.rindex('o', 3) # => 2
4335 * If +offset+ is a negative Integer, the maximum starting position in the
4336 * string to _end_ the search is the sum of the string's length and +offset+:
4338 * 'foo'.rindex('o', -1) # => 2
4339 * 'foo'.rindex('o', -2) # => 1
4340 * 'foo'.rindex('o', -3) # => nil
4341 * 'foo'.rindex('o', -4) # => nil
4343 * Related: String#index.
4346 static VALUE
4347 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4349 VALUE sub;
4350 VALUE initpos;
4351 rb_encoding *enc = STR_ENC_GET(str);
4352 long pos, len = str_strlen(str, enc); /* str's enc */
4354 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4355 pos = NUM2LONG(initpos);
4356 if (pos < 0 && (pos += len) < 0) {
4357 if (RB_TYPE_P(sub, T_REGEXP)) {
4358 rb_backref_set(Qnil);
4360 return Qnil;
4362 if (pos > len) pos = len;
4364 else {
4365 pos = len;
4368 if (RB_TYPE_P(sub, T_REGEXP)) {
4369 /* enc = rb_enc_check(str, sub); */
4370 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4371 enc, single_byte_optimizable(str));
4373 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4374 VALUE match = rb_backref_get();
4375 struct re_registers *regs = RMATCH_REGS(match);
4376 pos = rb_str_sublen(str, BEG(0));
4377 return LONG2NUM(pos);
4380 else {
4381 StringValue(sub);
4382 pos = rb_str_rindex(str, sub, pos);
4383 if (pos >= 0) {
4384 pos = rb_str_sublen(str, pos);
4385 return LONG2NUM(pos);
4388 return Qnil;
4391 static long
4392 rb_str_byterindex(VALUE str, VALUE sub, long pos)
4394 long len, slen;
4395 char *sbeg, *s;
4396 rb_encoding *enc;
4398 enc = rb_enc_check(str, sub);
4399 if (is_broken_string(sub)) return -1;
4400 len = RSTRING_LEN(str);
4401 slen = RSTRING_LEN(sub);
4403 /* substring longer than string */
4404 if (len < slen) return -1;
4405 if (len - pos < slen) pos = len - slen;
4406 if (len == 0) return pos;
4408 sbeg = RSTRING_PTR(str);
4410 if (pos == 0) {
4411 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4412 return 0;
4413 else
4414 return -1;
4417 s = sbeg + pos;
4418 return str_rindex(str, sub, s, enc);
4423 * call-seq:
4424 * byterindex(substring, offset = self.bytesize) -> integer or nil
4425 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4427 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4428 * or +nil+ if none found:
4430 * 'foo'.byterindex('f') # => 0
4431 * 'foo'.byterindex('o') # => 2
4432 * 'foo'.byterindex('oo') # => 1
4433 * 'foo'.byterindex('ooo') # => nil
4435 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4436 * or +nil+ if none found:
4438 * 'foo'.byterindex(/f/) # => 0
4439 * 'foo'.byterindex(/o/) # => 2
4440 * 'foo'.byterindex(/oo/) # => 1
4441 * 'foo'.byterindex(/ooo/) # => nil
4443 * The _last_ match means starting at the possible last position, not
4444 * the last of longest matches.
4446 * 'foo'.byterindex(/o+/) # => 2
4447 * $~ #=> #<MatchData "o">
4449 * To get the last longest match, combine the pattern with a negative
4450 * lookbehind:
4452 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4453 * $~ #=> #<MatchData "oo">
4455 * Or use String#byteindex with a negative lookahead:
4457 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4458 * $~ #=> #<MatchData "oo">
4460 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4461 * string to _end_ the search:
4463 * 'foo'.byterindex('o', 0) # => nil
4464 * 'foo'.byterindex('o', 1) # => 1
4465 * 'foo'.byterindex('o', 2) # => 2
4466 * 'foo'.byterindex('o', 3) # => 2
4468 * If +offset+ is a negative Integer, the maximum starting position in the
4469 * string to _end_ the search is the sum of the string's length and +offset+:
4471 * 'foo'.byterindex('o', -1) # => 2
4472 * 'foo'.byterindex('o', -2) # => 1
4473 * 'foo'.byterindex('o', -3) # => nil
4474 * 'foo'.byterindex('o', -4) # => nil
4476 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4477 * raised.
4479 * Related: String#byteindex.
4482 static VALUE
4483 rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4485 VALUE sub;
4486 VALUE initpos;
4487 long pos, len = RSTRING_LEN(str);
4489 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4490 pos = NUM2LONG(initpos);
4491 if (pos < 0 && (pos += len) < 0) {
4492 if (RB_TYPE_P(sub, T_REGEXP)) {
4493 rb_backref_set(Qnil);
4495 return Qnil;
4497 if (pos > len) pos = len;
4499 else {
4500 pos = len;
4503 str_ensure_byte_pos(str, pos);
4505 if (RB_TYPE_P(sub, T_REGEXP)) {
4506 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4507 VALUE match = rb_backref_get();
4508 struct re_registers *regs = RMATCH_REGS(match);
4509 pos = BEG(0);
4510 return LONG2NUM(pos);
4513 else {
4514 StringValue(sub);
4515 pos = rb_str_byterindex(str, sub, pos);
4516 if (pos >= 0) return LONG2NUM(pos);
4518 return Qnil;
4522 * call-seq:
4523 * string =~ regexp -> integer or nil
4524 * string =~ object -> integer or nil
4526 * Returns the Integer index of the first substring that matches
4527 * the given +regexp+, or +nil+ if no match found:
4529 * 'foo' =~ /f/ # => 0
4530 * 'foo' =~ /o/ # => 1
4531 * 'foo' =~ /x/ # => nil
4533 * Note: also updates Regexp@Global+Variables.
4535 * If the given +object+ is not a Regexp, returns the value
4536 * returned by <tt>object =~ self</tt>.
4538 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4539 * (see Regexp#=~):
4541 * number = nil
4542 * "no. 9" =~ /(?<number>\d+)/
4543 * number # => nil (not assigned)
4544 * /(?<number>\d+)/ =~ "no. 9"
4545 * number #=> "9"
4549 static VALUE
4550 rb_str_match(VALUE x, VALUE y)
4552 switch (OBJ_BUILTIN_TYPE(y)) {
4553 case T_STRING:
4554 rb_raise(rb_eTypeError, "type mismatch: String given");
4556 case T_REGEXP:
4557 return rb_reg_match(y, x);
4559 default:
4560 return rb_funcall(y, idEqTilde, 1, x);
/* Forward declaration; rb_str_match_m and rb_str_match_m_p below pass their
 * pattern argument through get_pat() before matching. */
4565 static VALUE get_pat(VALUE);
4569 * call-seq:
4570 * match(pattern, offset = 0) -> matchdata or nil
4571 * match(pattern, offset = 0) {|matchdata| ... } -> object
4573 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4575 * Note: also updates Regexp@Global+Variables.
4577 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4578 * regexp = Regexp.new(pattern)
4579 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4580 * (see Regexp#match):
4581 * matchdata = <tt>regexp.match(self)</tt>
4583 * With no block given, returns the computed +matchdata+:
4585 * 'foo'.match('f') # => #<MatchData "f">
4586 * 'foo'.match('o') # => #<MatchData "o">
4587 * 'foo'.match('x') # => nil
4589 * If Integer argument +offset+ is given, the search begins at index +offset+:
4591 * 'foo'.match('f', 1) # => nil
4592 * 'foo'.match('o', 1) # => #<MatchData "o">
4594 * With a block given, calls the block with the computed +matchdata+
4595 * and returns the block's return value:
4597 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4598 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4599 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4603 static VALUE
4604 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4606 VALUE re, result;
4607 if (argc < 1)
4608 rb_check_arity(argc, 1, 2);
4609 re = argv[0];
4610 argv[0] = str;
4611 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4612 if (!NIL_P(result) && rb_block_given_p()) {
4613 return rb_yield(result);
4615 return result;
4619 * call-seq:
4620 * match?(pattern, offset = 0) -> true or false
4622 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4624 * Note: does not update Regexp@Global+Variables.
4626 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4627 * regexp = Regexp.new(pattern)
4629 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
4630 * +false+ otherwise:
4632 * 'foo'.match?(/o/) # => true
4633 * 'foo'.match?('o') # => true
4634 * 'foo'.match?(/x/) # => false
4636 * If Integer argument +offset+ is given, the search begins at index +offset+:
4637 * 'foo'.match?('f', 1) # => false
4638 * 'foo'.match?('o', 1) # => true
4642 static VALUE
4643 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4645 VALUE re;
4646 rb_check_arity(argc, 1, 2);
4647 re = get_pat(argv[0]);
4648 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
/* Outcome of stepping one encoded character to its successor or predecessor
 * (returned by enc_succ_char, enc_pred_char and enc_succ_alnum_char). */
4651 enum neighbor_char {
/* the bytes are not a valid character, or no neighbor could be produced */
4652 NEIGHBOR_NOT_CHAR,
/* the buffer now holds a neighbor of the same byte length */
4653 NEIGHBOR_FOUND,
/* no same-length neighbor exists: the step wrapped around / carries out */
4654 NEIGHBOR_WRAPPED
4657 static enum neighbor_char
4658 enc_succ_char(char *p, long len, rb_encoding *enc)
4660 long i;
4661 int l;
4663 if (rb_enc_mbminlen(enc) > 1) {
4664 /* wchar, trivial case */
4665 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4666 if (!MBCLEN_CHARFOUND_P(r)) {
4667 return NEIGHBOR_NOT_CHAR;
4669 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4670 l = rb_enc_code_to_mbclen(c, enc);
4671 if (!l) return NEIGHBOR_NOT_CHAR;
4672 if (l != len) return NEIGHBOR_WRAPPED;
4673 rb_enc_mbcput(c, p, enc);
4674 r = rb_enc_precise_mbclen(p, p + len, enc);
4675 if (!MBCLEN_CHARFOUND_P(r)) {
4676 return NEIGHBOR_NOT_CHAR;
4678 return NEIGHBOR_FOUND;
4680 while (1) {
4681 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4682 p[i] = '\0';
4683 if (i < 0)
4684 return NEIGHBOR_WRAPPED;
4685 ++((unsigned char*)p)[i];
4686 l = rb_enc_precise_mbclen(p, p+len, enc);
4687 if (MBCLEN_CHARFOUND_P(l)) {
4688 l = MBCLEN_CHARFOUND_LEN(l);
4689 if (l == len) {
4690 return NEIGHBOR_FOUND;
4692 else {
4693 memset(p+l, 0xff, len-l);
4696 if (MBCLEN_INVALID_P(l) && i < len-1) {
4697 long len2;
4698 int l2;
4699 for (len2 = len-1; 0 < len2; len2--) {
4700 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4701 if (!MBCLEN_INVALID_P(l2))
4702 break;
4704 memset(p+len2+1, 0xff, len-(len2+1));
4709 static enum neighbor_char
4710 enc_pred_char(char *p, long len, rb_encoding *enc)
4712 long i;
4713 int l;
4714 if (rb_enc_mbminlen(enc) > 1) {
4715 /* wchar, trivial case */
4716 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4717 if (!MBCLEN_CHARFOUND_P(r)) {
4718 return NEIGHBOR_NOT_CHAR;
4720 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4721 if (!c) return NEIGHBOR_NOT_CHAR;
4722 --c;
4723 l = rb_enc_code_to_mbclen(c, enc);
4724 if (!l) return NEIGHBOR_NOT_CHAR;
4725 if (l != len) return NEIGHBOR_WRAPPED;
4726 rb_enc_mbcput(c, p, enc);
4727 r = rb_enc_precise_mbclen(p, p + len, enc);
4728 if (!MBCLEN_CHARFOUND_P(r)) {
4729 return NEIGHBOR_NOT_CHAR;
4731 return NEIGHBOR_FOUND;
4733 while (1) {
4734 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4735 p[i] = '\xff';
4736 if (i < 0)
4737 return NEIGHBOR_WRAPPED;
4738 --((unsigned char*)p)[i];
4739 l = rb_enc_precise_mbclen(p, p+len, enc);
4740 if (MBCLEN_CHARFOUND_P(l)) {
4741 l = MBCLEN_CHARFOUND_LEN(l);
4742 if (l == len) {
4743 return NEIGHBOR_FOUND;
4745 else {
4746 memset(p+l, 0, len-l);
4749 if (MBCLEN_INVALID_P(l) && i < len-1) {
4750 long len2;
4751 int l2;
4752 for (len2 = len-1; 0 < len2; len2--) {
4753 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4754 if (!MBCLEN_INVALID_P(l2))
4755 break;
4757 memset(p+len2+1, 0, len-(len2+1));
4763 overwrite +p+ by succeeding letter in +enc+ and returns
4764 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4765 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4766 assuming each ranges are successive, and mbclen
4767 never change in each ranges.
4768 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4769 character.
4771 static enum neighbor_char
4772 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4774 enum neighbor_char ret;
4775 unsigned int c;
4776 int ctype;
4777 int range;
4778 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4780 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4781 int try;
4782 const int max_gaps = 1;
4784 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4785 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4786 ctype = ONIGENC_CTYPE_DIGIT;
4787 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4788 ctype = ONIGENC_CTYPE_ALPHA;
4789 else
4790 return NEIGHBOR_NOT_CHAR;
4792 MEMCPY(save, p, char, len);
4793 for (try = 0; try <= max_gaps; ++try) {
4794 ret = enc_succ_char(p, len, enc);
4795 if (ret == NEIGHBOR_FOUND) {
4796 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4797 if (rb_enc_isctype(c, ctype, enc))
4798 return NEIGHBOR_FOUND;
4801 MEMCPY(p, save, char, len);
4802 range = 1;
4803 while (1) {
4804 MEMCPY(save, p, char, len);
4805 ret = enc_pred_char(p, len, enc);
4806 if (ret == NEIGHBOR_FOUND) {
4807 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4808 if (!rb_enc_isctype(c, ctype, enc)) {
4809 MEMCPY(p, save, char, len);
4810 break;
4813 else {
4814 MEMCPY(p, save, char, len);
4815 break;
4817 range++;
4819 if (range == 1) {
4820 return NEIGHBOR_NOT_CHAR;
4823 if (ctype != ONIGENC_CTYPE_DIGIT) {
4824 MEMCPY(carry, p, char, len);
4825 return NEIGHBOR_WRAPPED;
4828 MEMCPY(carry, p, char, len);
4829 enc_succ_char(carry, len, enc);
4830 return NEIGHBOR_WRAPPED;
/* Forward declaration: str_succ() increments its argument in place and is
 * shared by rb_str_succ (on a copy) and rb_str_succ_bang (on self). */
4834 static VALUE str_succ(VALUE str);
4837 * call-seq:
4838 * succ -> new_str
4840 * Returns the successor to +self+. The successor is calculated by
4841 * incrementing characters.
4843 * The first character to be incremented is the rightmost alphanumeric
4844 * or, if there are no alphanumerics, the rightmost character:
4846 * 'THX1138'.succ # => "THX1139"
4847 * '<<koala>>'.succ # => "<<koalb>>"
4848 * '***'.succ # => '**+'
4850 * The successor to a digit is another digit, "carrying" to the next-left
4851 * character for a "rollover" from 9 to 0, and prepending another digit
4852 * if necessary:
4854 * '00'.succ # => "01"
4855 * '09'.succ # => "10"
4856 * '99'.succ # => "100"
4858 * The successor to a letter is another letter of the same case,
4859 * carrying to the next-left character for a rollover,
4860 * and prepending another same-case letter if necessary:
4862 * 'aa'.succ # => "ab"
4863 * 'az'.succ # => "ba"
4864 * 'zz'.succ # => "aaa"
4865 * 'AA'.succ # => "AB"
4866 * 'AZ'.succ # => "BA"
4867 * 'ZZ'.succ # => "AAA"
4869 * The successor to a non-alphanumeric character is the next character
4870 * in the underlying character set's collating sequence,
4871 * carrying to the next-left character for a rollover,
4872 * and prepending another character if necessary:
4874 * s = 0.chr * 3
4875 * s # => "\x00\x00\x00"
4876 * s.succ # => "\x00\x00\x01"
4877 * s = 255.chr * 3
4878 * s # => "\xFF\xFF\xFF"
4879 * s.succ # => "\x01\x00\x00\x00"
4881 * Carrying can occur between and among mixtures of alphanumeric characters:
4883 * s = 'zz99zz99'
4884 * s.succ # => "aaa00aa00"
4885 * s = '99zz99zz'
4886 * s.succ # => "100aa00aa"
4888 * The successor to an empty +String+ is a new empty +String+:
4890 * ''.succ # => ""
4894 VALUE
4895 rb_str_succ(VALUE orig)
4897 VALUE str;
4898 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4899 rb_enc_cr_str_copy_for_substr(str, orig);
4900 return str_succ(str);
4903 static VALUE
4904 str_succ(VALUE str)
4906 rb_encoding *enc;
4907 char *sbeg, *s, *e, *last_alnum = 0;
4908 int found_alnum = 0;
4909 long l, slen;
4910 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4911 long carry_pos = 0, carry_len = 1;
4912 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4914 slen = RSTRING_LEN(str);
4915 if (slen == 0) return str;
4917 enc = STR_ENC_GET(str);
4918 sbeg = RSTRING_PTR(str);
4919 s = e = sbeg + slen;
4921 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4922 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4923 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4924 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4925 break;
4928 l = rb_enc_precise_mbclen(s, e, enc);
4929 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4930 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4931 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4932 switch (neighbor) {
4933 case NEIGHBOR_NOT_CHAR:
4934 continue;
4935 case NEIGHBOR_FOUND:
4936 return str;
4937 case NEIGHBOR_WRAPPED:
4938 last_alnum = s;
4939 break;
4941 found_alnum = 1;
4942 carry_pos = s - sbeg;
4943 carry_len = l;
4945 if (!found_alnum) { /* str contains no alnum */
4946 s = e;
4947 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4948 enum neighbor_char neighbor;
4949 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4950 l = rb_enc_precise_mbclen(s, e, enc);
4951 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4952 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4953 MEMCPY(tmp, s, char, l);
4954 neighbor = enc_succ_char(tmp, l, enc);
4955 switch (neighbor) {
4956 case NEIGHBOR_FOUND:
4957 MEMCPY(s, tmp, char, l);
4958 return str;
4959 break;
4960 case NEIGHBOR_WRAPPED:
4961 MEMCPY(s, tmp, char, l);
4962 break;
4963 case NEIGHBOR_NOT_CHAR:
4964 break;
4966 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4967 /* wrapped to \0...\0. search next valid char. */
4968 enc_succ_char(s, l, enc);
4970 if (!rb_enc_asciicompat(enc)) {
4971 MEMCPY(carry, s, char, l);
4972 carry_len = l;
4974 carry_pos = s - sbeg;
4976 ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
4978 RESIZE_CAPA(str, slen + carry_len);
4979 sbeg = RSTRING_PTR(str);
4980 s = sbeg + carry_pos;
4981 memmove(s + carry_len, s, slen - carry_pos);
4982 memmove(s, carry, carry_len);
4983 slen += carry_len;
4984 STR_SET_LEN(str, slen);
4985 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4986 rb_enc_str_coderange(str);
4987 return str;
4992 * call-seq:
4993 * succ! -> self
4995 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4998 static VALUE
4999 rb_str_succ_bang(VALUE str)
5001 rb_str_modify(str);
5002 str_succ(str);
5003 return str;
/*
 * Returns 1 when each of the first +len+ bytes of +s+ satisfies ISDIGIT
 * (also when +len+ is zero or negative), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5016 static int
5017 str_upto_i(VALUE str, VALUE arg)
5019 rb_yield(str);
5020 return 0;
5024 * call-seq:
5025 * upto(other_string, exclusive = false) {|string| ... } -> self
5026 * upto(other_string, exclusive = false) -> new_enumerator
5028 * With a block given, calls the block with each +String+ value
5029 * returned by successive calls to String#succ;
5030 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5031 * the sequence terminates when value +other_string+ is reached;
5032 * returns +self+:
5034 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5035 * Output:
5037 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5039 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5041 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5043 * Output:
5045 * a8 a9 b0 b1 b2 b3 b4 b5
5047 * If +other_string+ would not be reached, does not call the block:
5049 * '25'.upto('5') {|s| fail s }
5050 * 'aa'.upto('a') {|s| fail s }
5052 * With no block given, returns a new Enumerator:
5054 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5058 static VALUE
5059 rb_str_upto(int argc, VALUE *argv, VALUE beg)
5061 VALUE end, exclusive;
5063 rb_scan_args(argc, argv, "11", &end, &exclusive);
5064 RETURN_ENUMERATOR(beg, argc, argv);
5065 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
/*
 * Calls each(str, arg) for every string generated from beg up to end and
 * returns beg.  When excl is non-zero, end itself is not passed to each.
 * A non-zero return value from each stops the iteration.
 *
 * Three strategies are tried in order: two one-byte ASCII strings, two
 * all-digit strings, and the general walk using the "succ" method.
 */
5068 VALUE
5069 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5071 VALUE current, after_end;
5072 ID succ;
5073 int n, ascii;
5074 rb_encoding *enc;
5076 CONST_ID(succ, "succ");
/* end is converted to a String in place before anything is read from it */
5077 StringValue(end);
5078 enc = rb_enc_check(beg, end);
5079 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5080 /* single character */
5081 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5082 char c = RSTRING_PTR(beg)[0];
5083 char e = RSTRING_PTR(end)[0];
/* empty range: nothing is yielded */
5085 if (c > e || (excl && c == e)) return beg;
5086 for (;;) {
5087 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
/* inclusive range: stop after e was yielded; exclusive: stop before e */
5088 if (!excl && c == e) break;
5089 c++;
5090 if (excl && c == e) break;
5092 return beg;
5094 /* both edges are all digits */
5095 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5096 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5097 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5098 VALUE b, e;
5099 int width;
/* the length of beg is used as the precision of the generated numbers */
5101 width = RSTRING_LENINT(beg);
5102 b = rb_str_to_inum(beg, 10, FALSE);
5103 e = rb_str_to_inum(end, 10, FALSE);
/* both values are Fixnums: count with plain C longs */
5104 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5105 long bi = FIX2LONG(b);
5106 long ei = FIX2LONG(e);
5107 rb_encoding *usascii = rb_usascii_encoding();
5109 while (bi <= ei) {
5110 if (excl && bi == ei) break;
5111 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5112 bi++;
/* otherwise compare and increment through method calls on the Integer objects */
5115 else {
5116 ID op = excl ? '<' : idLE;
5117 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5119 args[0] = INT2FIX(width);
5120 while (rb_funcall(b, op, 1, e)) {
5121 args[1] = b;
5122 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5123 b = rb_funcallv(b, succ, 0, 0);
5126 return beg;
5128 /* normal case */
5129 n = rb_str_cmp(beg, end);
5130 if (n > 0 || (excl && n == 0)) return beg;
/* the loop below also ends when the current string reaches end.succ */
5132 after_end = rb_funcallv(end, succ, 0, 0);
5133 current = str_duplicate(rb_cString, beg);
5134 while (!rb_str_equal(current, after_end)) {
5135 VALUE next = Qnil;
/* the successor is computed before each is called; it stays nil once an
 * inclusive range has reached end, which ends the loop after this call */
5136 if (excl || !rb_str_equal(current, end))
5137 next = rb_funcallv(current, succ, 0, 0);
5138 if ((*each)(current, arg)) break;
5139 if (NIL_P(next)) break;
5140 current = next;
5141 StringValue(current);
5142 if (excl && rb_str_equal(current, end)) break;
/* give up when the successor is longer than end or empty */
5143 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5144 break;
5147 return beg;
5150 VALUE
5151 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5153 VALUE current;
5154 ID succ;
5156 CONST_ID(succ, "succ");
5157 /* both edges are all digits */
5158 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5159 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5160 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5161 int width = RSTRING_LENINT(beg);
5162 b = rb_str_to_inum(beg, 10, FALSE);
5163 if (FIXNUM_P(b)) {
5164 long bi = FIX2LONG(b);
5165 rb_encoding *usascii = rb_usascii_encoding();
5167 while (FIXABLE(bi)) {
5168 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5169 bi++;
5171 b = LONG2NUM(bi);
5173 args[0] = INT2FIX(width);
5174 while (1) {
5175 args[1] = b;
5176 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5177 b = rb_funcallv(b, succ, 0, 0);
5180 /* normal case */
5181 current = str_duplicate(rb_cString, beg);
5182 while (1) {
5183 VALUE next = rb_funcallv(current, succ, 0, 0);
5184 if ((*each)(current, arg)) break;
5185 current = next;
5186 StringValue(current);
5187 if (RSTRING_LEN(current) == 0)
5188 break;
5191 return beg;
5194 static int
5195 include_range_i(VALUE str, VALUE arg)
5197 VALUE *argp = (VALUE *)arg;
5198 if (!rb_equal(str, *argp)) return 0;
5199 *argp = Qnil;
5200 return 1;
5203 VALUE
5204 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5206 beg = rb_str_new_frozen(beg);
5207 StringValue(end);
5208 end = rb_str_new_frozen(end);
5209 if (NIL_P(val)) return Qfalse;
5210 val = rb_check_string_type(val);
5211 if (NIL_P(val)) return Qfalse;
5212 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5213 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5214 rb_enc_asciicompat(STR_ENC_GET(val))) {
5215 const char *bp = RSTRING_PTR(beg);
5216 const char *ep = RSTRING_PTR(end);
5217 const char *vp = RSTRING_PTR(val);
5218 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5219 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5220 return Qfalse;
5221 else {
5222 char b = *bp;
5223 char e = *ep;
5224 char v = *vp;
5226 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5227 if (b <= v && v < e) return Qtrue;
5228 return RBOOL(!RTEST(exclusive) && v == e);
5232 #if 0
5233 /* both edges are all digits */
5234 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5235 all_digits_p(bp, RSTRING_LEN(beg)) &&
5236 all_digits_p(ep, RSTRING_LEN(end))) {
5237 /* TODO */
5239 #endif
5241 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5243 return RBOOL(NIL_P(val));
5246 static VALUE
5247 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5249 if (rb_reg_search(re, str, 0, 0) >= 0) {
5250 VALUE match = rb_backref_get();
5251 int nth = rb_reg_backref_number(match, backref);
5252 return rb_reg_nth_match(nth, match);
5254 return Qnil;
5257 static VALUE
5258 rb_str_aref(VALUE str, VALUE indx)
5260 long idx;
5262 if (FIXNUM_P(indx)) {
5263 idx = FIX2LONG(indx);
5265 else if (RB_TYPE_P(indx, T_REGEXP)) {
5266 return rb_str_subpat(str, indx, INT2FIX(0));
5268 else if (RB_TYPE_P(indx, T_STRING)) {
5269 if (rb_str_index(str, indx, 0) != -1)
5270 return str_duplicate(rb_cString, indx);
5271 return Qnil;
5273 else {
5274 /* check if indx is Range */
5275 long beg, len = str_strlen(str, NULL);
5276 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5277 case Qfalse:
5278 break;
5279 case Qnil:
5280 return Qnil;
5281 default:
5282 return rb_str_substr(str, beg, len);
5284 idx = NUM2LONG(indx);
5287 return str_substr(str, idx, 1, FALSE);
5292 * call-seq:
5293 * string[index] -> new_string or nil
5294 * string[start, length] -> new_string or nil
5295 * string[range] -> new_string or nil
5296 * string[regexp, capture = 0] -> new_string or nil
5297 * string[substring] -> new_string or nil
5299 * Returns the substring of +self+ specified by the arguments.
5300 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5305 static VALUE
5306 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5308 if (argc == 2) {
5309 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5310 return rb_str_subpat(str, argv[0], argv[1]);
5312 else {
5313 long beg = NUM2LONG(argv[0]);
5314 long len = NUM2LONG(argv[1]);
5315 return rb_str_substr(str, beg, len);
5318 rb_check_arity(argc, 1, 2);
5319 return rb_str_aref(str, argv[0]);
/*
 * Removes the first len bytes of str in place and returns str.
 * len is clamped to the current byte length, and the code range of str
 * is cleared afterwards.
 */
5322 VALUE
5323 rb_str_drop_bytes(VALUE str, long len)
5325 char *ptr = RSTRING_PTR(str);
5326 long olen = RSTRING_LEN(str), nlen;
5328 str_modifiable(str);
5329 if (len > olen) len = olen;
5330 nlen = olen - len;
/* the remaining bytes plus terminator fit in the embedded buffer: move them there */
5331 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5332 char *oldptr = ptr;
/* storage flags are captured before STR_SET_EMBED is applied */
5333 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5334 STR_SET_EMBED(str);
5335 ptr = RSTRING(str)->as.embed.ary;
5336 memmove(ptr, oldptr + len, nlen);
/* the old buffer is freed only when NOEMBED was the sole flag set (not shared, not nofree) */
5337 if (fl == STR_NOEMBED) xfree(oldptr);
5339 else {
/* otherwise make the buffer shared (unless it already is) and advance the
 * heap pointer past the dropped bytes instead of copying */
5340 if (!STR_SHARED_P(str)) {
5341 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5342 rb_enc_cr_str_exact_copy(shared, str);
5343 OBJ_FREEZE(shared);
5345 ptr = RSTRING(str)->as.heap.ptr += len;
5347 STR_SET_LEN(str, nlen);
/* the terminator is rewritten only when SHARABLE_MIDDLE_SUBSTRING is false */
5349 if (!SHARABLE_MIDDLE_SUBSTRING) {
5350 TERM_FILL(ptr + nlen, TERM_LEN(str));
5352 ENC_CODERANGE_CLEAR(str);
5353 return str;
/*
 * Replaces the len bytes of str starting at byte offset beg with the
 * vlen bytes of val starting at byte offset vbeg.  All offsets and
 * lengths are in bytes; nothing is range-checked here.
 */
5356 static void
5357 rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5359 char *sptr;
5360 long slen;
5361 int cr;
/* removing a prefix without inserting anything is delegated to rb_str_drop_bytes */
5363 if (beg == 0 && vlen == 0) {
5364 rb_str_drop_bytes(str, len);
5365 return;
5368 str_modify_keep_cr(str);
5369 RSTRING_GETMEM(str, sptr, slen);
5370 if (len < vlen) {
5371 /* expand string */
5372 RESIZE_CAPA(str, slen + vlen - len);
/* the buffer pointer is re-read after the resize */
5373 sptr = RSTRING_PTR(str);
/* a 7bit receiver takes the code range of val; any other state becomes unknown */
5376 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5377 cr = rb_enc_str_coderange(val);
5378 else
5379 cr = ENC_CODERANGE_UNKNOWN;
/* move the tail after the replaced region to its new position */
5381 if (vlen != len) {
5382 memmove(sptr + beg + vlen,
5383 sptr + beg + len,
5384 slen - (beg + len));
/* NOTE(review): requires a negative len; the callers visible in this part of
 * the file reject or never produce one - confirm whether this is reachable */
5386 if (vlen < beg && len < 0) {
5387 MEMZERO(sptr + slen, char, -len);
/* copy the replacement bytes into the gap */
5389 if (vlen > 0) {
5390 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5392 slen += vlen - len;
5393 STR_SET_LEN(str, slen);
5394 TERM_FILL(&sptr[slen], TERM_LEN(str));
5395 ENC_CODERANGE_SET(str, cr);
5398 static inline void
5399 rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5401 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5404 void
5405 rb_str_update(VALUE str, long beg, long len, VALUE val)
5407 long slen;
5408 char *p, *e;
5409 rb_encoding *enc;
5410 int singlebyte = single_byte_optimizable(str);
5411 int cr;
5413 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5415 StringValue(val);
5416 enc = rb_enc_check(str, val);
5417 slen = str_strlen(str, enc); /* rb_enc_check */
5419 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5420 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5422 if (beg < 0) {
5423 beg += slen;
5425 RUBY_ASSERT(beg >= 0);
5426 RUBY_ASSERT(beg <= slen);
5428 if (len > slen - beg) {
5429 len = slen - beg;
5431 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5432 if (!p) p = RSTRING_END(str);
5433 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5434 if (!e) e = RSTRING_END(str);
5435 /* error check */
5436 beg = p - RSTRING_PTR(str); /* physical position */
5437 len = e - p; /* physical length */
5438 rb_str_update_0(str, beg, len, val);
5439 rb_enc_associate(str, enc);
5440 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5441 if (cr != ENC_CODERANGE_BROKEN)
5442 ENC_CODERANGE_SET(str, cr);
5445 static void
5446 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5448 int nth;
5449 VALUE match;
5450 long start, end, len;
5451 rb_encoding *enc;
5452 struct re_registers *regs;
5454 if (rb_reg_search(re, str, 0, 0) < 0) {
5455 rb_raise(rb_eIndexError, "regexp not matched");
5457 match = rb_backref_get();
5458 nth = rb_reg_backref_number(match, backref);
5459 regs = RMATCH_REGS(match);
5460 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5461 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5463 if (nth < 0) {
5464 nth += regs->num_regs;
5467 start = BEG(nth);
5468 if (start == -1) {
5469 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5471 end = END(nth);
5472 len = end - start;
5473 StringValue(val);
5474 enc = rb_enc_check_str(str, val);
5475 rb_str_update_0(str, start, len, val);
5476 rb_enc_associate(str, enc);
5479 static VALUE
5480 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5482 long idx, beg;
5484 switch (TYPE(indx)) {
5485 case T_REGEXP:
5486 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5487 return val;
5489 case T_STRING:
5490 beg = rb_str_index(str, indx, 0);
5491 if (beg < 0) {
5492 rb_raise(rb_eIndexError, "string not matched");
5494 beg = rb_str_sublen(str, beg);
5495 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5496 return val;
5498 default:
5499 /* check if indx is Range */
5501 long beg, len;
5502 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5503 rb_str_update(str, beg, len, val);
5504 return val;
5507 /* FALLTHROUGH */
5509 case T_FIXNUM:
5510 idx = NUM2LONG(indx);
5511 rb_str_update(str, idx, 1, val);
5512 return val;
5517 * call-seq:
5518 * string[index] = new_string
5519 * string[start, length] = new_string
5520 * string[range] = new_string
5521 * string[regexp, capture = 0] = new_string
5522 * string[substring] = new_string
5524 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5525 * See {String Slices}[rdoc-ref:String@String+Slices].
5527 * A few examples:
5529 * s = 'foo'
5530 * s[2] = 'rtune' # => "rtune"
5531 * s # => "fortune"
5532 * s[1, 5] = 'init' # => "init"
5533 * s # => "finite"
5534 * s[3..4] = 'al' # => "al"
5535 * s # => "finale"
5536 * s[/e$/] = 'ly' # => "ly"
5537 * s # => "finally"
5538 * s['lly'] = 'ncial' # => "ncial"
5539 * s # => "financial"
5543 static VALUE
5544 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5546 if (argc == 3) {
5547 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5548 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5550 else {
5551 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5553 return argv[2];
5555 rb_check_arity(argc, 2, 3);
5556 return rb_str_aset(str, argv[0], argv[1]);
5560 * call-seq:
5561 * insert(index, other_string) -> self
5563 * Inserts the given +other_string+ into +self+; returns +self+.
5565 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5567 * 'foo'.insert(1, 'bar') # => "fbaroo"
5569 * If the Integer +index+ is negative, counts backward from the end of +self+
5570 * and inserts +other_string+ at offset <tt>index+1</tt>
5571 * (that is, _after_ <tt>self[index]</tt>):
5573 * 'foo'.insert(-2, 'bar') # => "fobaro"
5577 static VALUE
5578 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5580 long pos = NUM2LONG(idx);
5582 if (pos == -1) {
5583 return rb_str_append(str, str2);
5585 else if (pos < 0) {
5586 pos++;
5588 rb_str_update(str, pos, 0, str2);
5589 return str;
5594 * call-seq:
5595 * slice!(index) -> new_string or nil
5596 * slice!(start, length) -> new_string or nil
5597 * slice!(range) -> new_string or nil
5598 * slice!(regexp, capture = 0) -> new_string or nil
5599 * slice!(substring) -> new_string or nil
5601 * Removes and returns the substring of +self+ specified by the arguments.
5602 * See {String Slices}[rdoc-ref:String@String+Slices].
5604 * A few examples:
5606 * string = "This is a string"
5607 * string.slice!(2) #=> "i"
5608 * string.slice!(3..6) #=> " is "
5609 * string.slice!(/s.*t/) #=> "sa st"
5610 * string.slice!("r") #=> "r"
5611 * string #=> "Thing"
5615 static VALUE
5616 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
/*
 * Works in three stages, entered through the labels below:
 *   num_index - beg/len as given by the caller are resolved by rb_str_subpos
 *   subseq    - beg/len are byte offsets into str; the result is copied out
 *   squash    - the selected bytes are removed from str
 * Returns nil as soon as the arguments select nothing.
 */
5618 VALUE result = Qnil;
5619 VALUE indx;
5620 long beg, len = 1;
5621 char *p;
5623 rb_check_arity(argc, 1, 2);
5624 str_modify_keep_cr(str);
5625 indx = argv[0];
/* Regexp (optionally with a capture): offsets come from the match registers */
5626 if (RB_TYPE_P(indx, T_REGEXP)) {
5627 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5628 VALUE match = rb_backref_get();
5629 struct re_registers *regs = RMATCH_REGS(match);
5630 int nth = 0;
/* a negative capture number counts back from num_regs; out of range gives nil */
5631 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5632 if ((nth += regs->num_regs) <= 0) return Qnil;
5634 else if (nth >= regs->num_regs) return Qnil;
5635 beg = BEG(nth);
5636 len = END(nth) - beg;
5637 goto subseq;
/* (start, length) */
5639 else if (argc == 2) {
5640 beg = NUM2LONG(indx);
5641 len = NUM2LONG(argv[1]);
5642 goto num_index;
/* single Fixnum index: len is still 1 here; a zero resulting length gives nil */
5644 else if (FIXNUM_P(indx)) {
5645 beg = FIX2LONG(indx);
5646 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5647 if (!len) return Qnil;
5648 beg = p - RSTRING_PTR(str);
5649 goto subseq;
/* substring: the result is a copy of the argument, so subseq is skipped */
5651 else if (RB_TYPE_P(indx, T_STRING)) {
5652 beg = rb_str_index(str, indx, 0);
5653 if (beg == -1) return Qnil;
5654 len = RSTRING_LEN(indx);
5655 result = str_duplicate(rb_cString, indx);
5656 goto squash;
/* Range, or something convertible to a long (handled like the Fixnum case) */
5658 else {
5659 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5660 case Qnil:
5661 return Qnil;
5662 case Qfalse:
5663 beg = NUM2LONG(indx);
5664 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5665 if (!len) return Qnil;
5666 beg = p - RSTRING_PTR(str);
5667 goto subseq;
5668 default:
5669 goto num_index;
5673 num_index:
5674 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5675 beg = p - RSTRING_PTR(str);
5677 subseq:
5678 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5679 rb_enc_cr_str_copy_for_substr(result, str);
5681 squash:
5682 if (len > 0) {
/* removing a prefix is delegated to rb_str_drop_bytes */
5683 if (beg == 0) {
5684 rb_str_drop_bytes(str, len);
/* otherwise move the tail down over the removed bytes and shorten str */
5686 else {
5687 char *sptr = RSTRING_PTR(str);
5688 long slen = RSTRING_LEN(str);
5689 if (beg + len > slen) /* pathological check */
5690 len = slen - beg;
5691 memmove(sptr + beg,
5692 sptr + beg + len,
5693 slen - (beg + len));
5694 slen -= len;
5695 STR_SET_LEN(str, slen);
5696 TERM_FILL(&sptr[slen], TERM_LEN(str));
5699 return result;
5702 static VALUE
5703 get_pat(VALUE pat)
5705 VALUE val;
5707 switch (OBJ_BUILTIN_TYPE(pat)) {
5708 case T_REGEXP:
5709 return pat;
5711 case T_STRING:
5712 break;
5714 default:
5715 val = rb_check_string_type(pat);
5716 if (NIL_P(val)) {
5717 Check_Type(pat, T_REGEXP);
5719 pat = val;
5722 return rb_reg_regcomp(pat);
5725 static VALUE
5726 get_pat_quoted(VALUE pat, int check)
5728 VALUE val;
5730 switch (OBJ_BUILTIN_TYPE(pat)) {
5731 case T_REGEXP:
5732 return pat;
5734 case T_STRING:
5735 break;
5737 default:
5738 val = rb_check_string_type(pat);
5739 if (NIL_P(val)) {
5740 Check_Type(pat, T_REGEXP);
5742 pat = val;
5744 if (check && is_broken_string(pat)) {
5745 rb_exc_raise(rb_reg_check_preprocess(pat));
5747 return pat;
5750 static long
5751 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5753 if (BUILTIN_TYPE(pat) == T_STRING) {
5754 pos = rb_str_byteindex(str, pat, pos);
5755 if (set_backref_str) {
5756 if (pos >= 0) {
5757 str = rb_str_new_frozen_String(str);
5758 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5760 else {
5761 rb_backref_set(Qnil);
5764 return pos;
5766 else {
5767 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5773 * call-seq:
5774 * sub!(pattern, replacement) -> self or nil
5775 * sub!(pattern) {|match| ... } -> self or nil
5777 * Returns +self+ with only the first occurrence
5778 * (not all occurrences) of the given +pattern+ replaced.
5780 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5782 * Related: String#sub, String#gsub, String#gsub!.
5786 static VALUE
5787 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
/*
 * Replaces the first match of argv[0] in str in place.  The replacement
 * comes from the block (one argument), from a Hash, or from a String
 * (two arguments).  Returns str when a match was replaced, nil otherwise.
 */
5789 VALUE pat, repl, hash = Qnil;
5790 int iter = 0;
5791 long plen;
/* the replacement argument may be omitted only when a block is given */
5792 int min_arity = rb_block_given_p() ? 1 : 2;
5793 long beg;
5795 rb_check_arity(argc, min_arity, 2);
5796 if (argc == 1) {
5797 iter = 1;
5799 else {
/* a Hash replacement is looked up per match; anything else must be a String */
5800 repl = argv[1];
5801 hash = rb_check_hash_type(argv[1]);
5802 if (NIL_P(hash)) {
5803 StringValue(repl);
5807 pat = get_pat_quoted(argv[0], 1);
5809 str_modifiable(str);
5810 beg = rb_pat_search(pat, str, 0, 1);
5811 if (beg >= 0) {
5812 rb_encoding *enc;
5813 int cr = ENC_CODERANGE(str);
5814 long beg0, end0;
5815 VALUE match, match0 = Qnil;
5816 struct re_registers *regs;
5817 char *p, *rp;
5818 long len, rlen;
5820 match = rb_backref_get();
5821 regs = RMATCH_REGS(match);
/* String pattern: the matched text is the pattern itself, found at beg */
5822 if (RB_TYPE_P(pat, T_STRING)) {
5823 beg0 = beg;
5824 end0 = beg0 + RSTRING_LEN(pat);
5825 match0 = pat;
/* Regexp pattern: the span of the whole match comes from the registers */
5827 else {
5828 beg0 = BEG(0);
5829 end0 = END(0);
5830 if (iter) match0 = rb_reg_nth_match(0, match);
/* block or Hash: the buffer pointer and length are saved first so that
 * str_mod_check can compare them after the user-supplied code has run */
5833 if (iter || !NIL_P(hash)) {
5834 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5836 if (iter) {
5837 repl = rb_obj_as_string(rb_yield(match0));
5839 else {
5840 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5841 repl = rb_obj_as_string(repl);
5843 str_mod_check(str, p, len);
5844 rb_check_frozen(str);
/* String replacement: processed by rb_reg_regsub (no regexp is passed for a String pattern) */
5846 else {
5847 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
/* incompatible encodings are accepted only when the text before and after
 * the match is 7bit; the result then takes the encoding of repl */
5850 enc = rb_enc_compatible(str, repl);
5851 if (!enc) {
5852 rb_encoding *str_enc = STR_ENC_GET(str);
5853 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5854 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5855 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5856 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5857 rb_enc_inspect_name(str_enc),
5858 rb_enc_inspect_name(STR_ENC_GET(repl)));
5860 enc = STR_ENC_GET(repl);
5862 rb_str_modify(str);
5863 rb_enc_associate(str, enc);
/* combine the code ranges only when that of str was 7bit or valid */
5864 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5865 int cr2 = ENC_CODERANGE(repl);
5866 if (cr2 == ENC_CODERANGE_BROKEN ||
5867 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5868 cr = ENC_CODERANGE_UNKNOWN;
5869 else
5870 cr = cr2;
/* splice: grow if needed, move the tail, then copy the replacement in */
5872 plen = end0 - beg0;
5873 rlen = RSTRING_LEN(repl);
5874 len = RSTRING_LEN(str);
5875 if (rlen > plen) {
5876 RESIZE_CAPA(str, len + rlen - plen);
5878 p = RSTRING_PTR(str);
5879 if (rlen != plen) {
5880 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5882 rp = RSTRING_PTR(repl);
5883 memmove(p + beg0, rp, rlen);
5884 len += rlen - plen;
5885 STR_SET_LEN(str, len);
5886 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5887 ENC_CODERANGE_SET(str, cr);
5889 RB_GC_GUARD(match);
5891 return str;
5893 return Qnil;
5898 * call-seq:
5899 * sub(pattern, replacement) -> new_string
5900 * sub(pattern) {|match| ... } -> new_string
5902 * Returns a copy of +self+ with only the first occurrence
5903 * (not all occurrences) of the given +pattern+ replaced.
5905 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5907 * Related: String#sub!, String#gsub, String#gsub!.
5911 static VALUE
5912 rb_str_sub(int argc, VALUE *argv, VALUE str)
5914 str = str_duplicate(rb_cString, str);
5915 rb_str_sub_bang(argc, argv, str);
5916 return str;
/*
 * Shared implementation of gsub and gsub!.  Builds a new string (dest) in
 * which every match of argv[0] in str is replaced, taking the replacement
 * from a block (ITER), a Hash (MAP) or a String (STR).
 *
 * With bang non-zero the contents of str are replaced by dest and str is
 * returned, or nil when nothing matched.  With bang zero dest is returned,
 * or a duplicate of str when nothing matched.
 */
5919 static VALUE
5920 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5922 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
5923 long beg, beg0, end0;
5924 long offset, blen, slen, len, last;
5925 enum {STR, ITER, MAP} mode = STR;
5926 char *sp, *cp;
/* -1 = not decided yet; becomes 0 or 1 after the first rb_reg_regsub call below */
5927 int need_backref = -1;
5928 rb_encoding *str_enc;
5930 switch (argc) {
5931 case 1:
5932 RETURN_ENUMERATOR(str, argc, argv);
5933 mode = ITER;
5934 break;
5935 case 2:
5936 repl = argv[1];
5937 hash = rb_check_hash_type(argv[1]);
5938 if (NIL_P(hash)) {
5939 StringValue(repl);
5941 else {
5942 mode = MAP;
5944 break;
5945 default:
5946 rb_error_arity(argc, 1, 2);
5949 pat = get_pat_quoted(argv[0], 1);
5950 beg = rb_pat_search(pat, str, 0, need_backref);
5951 if (beg < 0) {
5952 if (bang) return Qnil; /* no match, no substitution */
5953 return str_duplicate(rb_cString, str);
/* offset = byte position in str up to which the input has been consumed;
 * cp points at that position; sp/slen snapshot the buffer for str_mod_check */
5956 offset = 0;
5957 blen = RSTRING_LEN(str) + 30; /* len + margin */
5958 dest = rb_str_buf_new(blen);
5959 sp = RSTRING_PTR(str);
5960 slen = RSTRING_LEN(str);
5961 cp = sp;
5962 str_enc = STR_ENC_GET(str);
5963 rb_enc_associate(dest, str_enc);
5964 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5966 do {
5967 VALUE match = rb_backref_get();
5968 struct re_registers *regs = RMATCH_REGS(match);
/* String pattern: the matched text is the pattern itself, found at beg */
5969 if (RB_TYPE_P(pat, T_STRING)) {
5970 beg0 = beg;
5971 end0 = beg0 + RSTRING_LEN(pat);
5972 match0 = pat;
5974 else {
5975 beg0 = BEG(0);
5976 end0 = END(0);
5977 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
/* mode is non-zero for ITER and MAP (STR is the first enumerator) */
5980 if (mode) {
5981 if (mode == ITER) {
5982 val = rb_obj_as_string(rb_yield(match0));
5984 else {
5985 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5986 val = rb_obj_as_string(val);
/* user-supplied code ran above: check str against the snapshot */
5988 str_mod_check(str, sp, slen);
5989 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5990 rb_raise(rb_eRuntimeError, "block should not cheat");
5993 else if (need_backref) {
5994 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
/* first time through: if rb_reg_regsub handed back repl itself, later
 * iterations skip it and use repl directly */
5995 if (need_backref < 0) {
5996 need_backref = val != repl;
5999 else {
6000 val = repl;
6003 len = beg0 - offset; /* copy pre-match substr */
6004 if (len) {
6005 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6008 rb_str_buf_append(dest, val);
/* last remembers where this search started; it is used for the final search after the loop */
6010 last = offset;
6011 offset = end0;
6012 if (beg0 == end0) {
6014 * Always consume at least one character of the input string
6015 * in order to prevent infinite loops.
6017 if (RSTRING_LEN(str) <= end0) break;
6018 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6019 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6020 offset = end0 + len;
6022 cp = RSTRING_PTR(str) + offset;
6023 if (offset > RSTRING_LEN(str)) break;
6024 beg = rb_pat_search(pat, str, offset, need_backref);
6026 RB_GC_GUARD(match);
6027 } while (beg >= 0);
/* copy whatever follows the last match */
6028 if (RSTRING_LEN(str) > offset) {
6029 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
/* search once more from the start of the last successful search, this time
 * always asking rb_pat_search to set the backref */
6031 rb_pat_search(pat, str, last, 1);
6032 if (bang) {
6033 str_shared_replace(str, dest);
6035 else {
6036 str = dest;
6039 return str;
6044 * call-seq:
6045 * gsub!(pattern, replacement) -> self or nil
6046 * gsub!(pattern) {|match| ... } -> self or nil
6047 * gsub!(pattern) -> an_enumerator
6049 * Performs the specified substring replacement(s) on +self+;
6050 * returns +self+ if any replacement occurred, +nil+ otherwise.
6052 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6054 * Returns an Enumerator if no +replacement+ and no block given.
6056 * Related: String#sub, String#gsub, String#sub!.
6060 static VALUE
6061 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6063 str_modify_keep_cr(str);
6064 return str_gsub(argc, argv, str, 1);
6069 * call-seq:
6070 * gsub(pattern, replacement) -> new_string
6071 * gsub(pattern) {|match| ... } -> new_string
6072 * gsub(pattern) -> enumerator
6074 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6076 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6078 * Returns an Enumerator if no +replacement+ and no block given.
6080 * Related: String#sub, String#sub!, String#gsub!.
6084 static VALUE
6085 rb_str_gsub(int argc, VALUE *argv, VALUE str)
6087 return str_gsub(argc, argv, str, 0);
6092 * call-seq:
6093 * replace(other_string) -> self
6095 * Replaces the contents of +self+ with the contents of +other_string+:
6097 * s = 'foo' # => "foo"
6098 * s.replace('bar') # => "bar"
6102 VALUE
6103 rb_str_replace(VALUE str, VALUE str2)
6105 str_modifiable(str);
6106 if (str == str2) return str;
6108 StringValue(str2);
6109 str_discard(str);
6110 return str_replace(str, str2);
6114 * call-seq:
6115 * clear -> self
6117 * Removes the contents of +self+:
6119 * s = 'foo' # => "foo"
6120 * s.clear # => ""
6124 static VALUE
6125 rb_str_clear(VALUE str)
6127 str_discard(str);
6128 STR_SET_EMBED(str);
6129 STR_SET_LEN(str, 0);
6130 RSTRING_PTR(str)[0] = 0;
6131 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6132 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
6133 else
6134 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
6135 return str;
6139 * call-seq:
6140 * chr -> string
6142 * Returns a string containing the first character of +self+:
6144 * s = 'foo' # => "foo"
6145 * s.chr # => "f"
6149 static VALUE
6150 rb_str_chr(VALUE str)
6152 return rb_str_substr(str, 0, 1);
6156 * call-seq:
6157 * getbyte(index) -> integer or nil
6159 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6161 * s = 'abcde' # => "abcde"
6162 * s.getbyte(0) # => 97
6163 * s.getbyte(-1) # => 101
6164 * s.getbyte(5) # => nil
6166 * Related: String#setbyte.
6168 VALUE
6169 rb_str_getbyte(VALUE str, VALUE index)
6171 long pos = NUM2LONG(index);
6173 if (pos < 0)
6174 pos += RSTRING_LEN(str);
6175 if (pos < 0 || RSTRING_LEN(str) <= pos)
6176 return Qnil;
6178 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6182 * call-seq:
6183 * setbyte(index, integer) -> integer
6185 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6187 * s = 'abcde' # => "abcde"
6188 * s.setbyte(0, 98) # => 98
6189 * s # => "bbcde"
6191 * Related: String#getbyte.
6193 VALUE
6194 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
/*
 * Stores the low 8 bits of value at byte position index (negative values
 * count from the end) and returns value unchanged.  Raises IndexError
 * when index is outside the string.  For non-embedded strings whose code
 * range is 7bit or valid, the code range is updated from the character
 * touched by the write; in every other case it is simply cleared.
 */
6196 long pos = NUM2LONG(index);
6197 long len = RSTRING_LEN(str);
6198 char *ptr, *head, *left = 0;
6199 rb_encoding *enc;
6200 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6202 if (pos < -len || len <= pos)
6203 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6204 if (pos < 0)
6205 pos += len;
/* value is converted to an integer and masked to one byte */
6207 VALUE v = rb_to_int(value);
6208 VALUE w = rb_int_and(v, INT2FIX(0xff));
6209 char byte = (char)(NUM2INT(w) & 0xFF);
6211 if (!str_independent(str))
6212 str_make_independent(str);
6213 enc = STR_ENC_GET(str);
6214 head = RSTRING_PTR(str);
6215 ptr = &head[pos];
6216 if (!STR_EMBED_P(str)) {
6217 cr = ENC_CODERANGE(str);
6218 switch (cr) {
/* 7bit string: an ASCII byte keeps it 7bit; otherwise the character that
 * now starts at ptr decides between valid and broken */
6219 case ENC_CODERANGE_7BIT:
6220 left = ptr;
6221 *ptr = byte;
6222 if (ISASCII(byte)) goto end;
6223 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6224 if (!MBCLEN_CHARFOUND_P(nlen))
6225 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
6226 else
6227 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
6228 goto end;
/* valid string: measure the character containing ptr before and after the
 * write; broken if no character is found, cleared if its length changed or
 * the new byte is ASCII, otherwise left as it is */
6229 case ENC_CODERANGE_VALID:
6230 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6231 width = rb_enc_precise_mbclen(left, head+len, enc);
6232 *ptr = byte;
6233 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6234 if (!MBCLEN_CHARFOUND_P(nlen))
6235 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
6236 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6237 ENC_CODERANGE_CLEAR(str);
6238 goto end;
/* embedded strings and every other code range: clear it and store the byte */
6241 ENC_CODERANGE_CLEAR(str);
6242 *ptr = byte;
6244 end:
6245 return value;
6248 static VALUE
6249 str_byte_substr(VALUE str, long beg, long len, int empty)
6251 long n = RSTRING_LEN(str);
6253 if (beg > n || len < 0) return Qnil;
6254 if (beg < 0) {
6255 beg += n;
6256 if (beg < 0) return Qnil;
6258 if (len > n - beg)
6259 len = n - beg;
6260 if (len <= 0) {
6261 if (!empty) return Qnil;
6262 len = 0;
6265 VALUE str2 = str_subseq(str, beg, len);
6267 str_enc_copy_direct(str2, str);
6269 if (RSTRING_LEN(str2) == 0) {
6270 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6271 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6272 else
6273 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6275 else {
6276 switch (ENC_CODERANGE(str)) {
6277 case ENC_CODERANGE_7BIT:
6278 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6279 break;
6280 default:
6281 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6282 break;
6286 return str2;
6289 VALUE
6290 rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6292 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6295 static VALUE
6296 str_byte_aref(VALUE str, VALUE indx)
6298 long idx;
6299 if (FIXNUM_P(indx)) {
6300 idx = FIX2LONG(indx);
6302 else {
6303 /* check if indx is Range */
6304 long beg, len = RSTRING_LEN(str);
6306 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6307 case Qfalse:
6308 break;
6309 case Qnil:
6310 return Qnil;
6311 default:
6312 return str_byte_substr(str, beg, len, TRUE);
6315 idx = NUM2LONG(indx);
6317 return str_byte_substr(str, idx, 1, FALSE);
6321 * call-seq:
6322 * byteslice(index, length = 1) -> string or nil
6323 * byteslice(range) -> string or nil
6325 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6327 * With integer arguments +index+ and +length+ given,
6328 * returns the substring beginning at the given +index+
6329 * of the given +length+ (if possible),
6330 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6332 * s = '0123456789' # => "0123456789"
6333 * s.byteslice(2) # => "2"
6334 * s.byteslice(200) # => nil
6335 * s.byteslice(4, 3) # => "456"
6336 * s.byteslice(4, 30) # => "456789"
6337 * s.byteslice(4, -1) # => nil
6338 * s.byteslice(40, 2) # => nil
6340 * In either case above, counts backwards from the end of +self+
6341 * if +index+ is negative:
6343 * s = '0123456789' # => "0123456789"
6344 * s.byteslice(-4) # => "6"
6345 * s.byteslice(-4, 3) # => "678"
6347 * With Range argument +range+ given, returns
6348 * <tt>byteslice(range.begin, range.size)</tt>:
6350 * s = '0123456789' # => "0123456789"
6351 * s.byteslice(4..6) # => "456"
6352 * s.byteslice(-6..-4) # => "456"
6353 * s.byteslice(5..2) # => "" # range.size is zero.
6354 * s.byteslice(40..42) # => nil
6356 * In all cases, a returned string has the same encoding as +self+:
6358 * s.encoding # => #<Encoding:UTF-8>
6359 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6363 static VALUE
6364 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6366 if (argc == 2) {
6367 long beg = NUM2LONG(argv[0]);
6368 long len = NUM2LONG(argv[1]);
6369 return str_byte_substr(str, beg, len, TRUE);
6371 rb_check_arity(argc, 1, 2);
6372 return str_byte_aref(str, argv[0]);
6375 static void
6376 str_check_beg_len(VALUE str, long *beg, long *len)
6378 long end, slen = RSTRING_LEN(str);
6380 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6381 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6382 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6384 if (*beg < 0) {
6385 *beg += slen;
6387 RUBY_ASSERT(*beg >= 0);
6388 RUBY_ASSERT(*beg <= slen);
6390 if (*len > slen - *beg) {
6391 *len = slen - *beg;
6393 end = *beg + *len;
6394 str_ensure_byte_pos(str, *beg);
6395 str_ensure_byte_pos(str, end);
6399 * call-seq:
6400 * bytesplice(index, length, str) -> string
6401 * bytesplice(index, length, str, str_index, str_length) -> string
6402 * bytesplice(range, str) -> string
6403 * bytesplice(range, str, str_range) -> string
6405 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6406 * The portion of the string affected is determined using
6407 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6408 * If the replacement string is not the same length as the text it is replacing,
6409 * the string will be adjusted accordingly.
6411 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6413 * The forms that take an Integer will raise an IndexError if the value is out
6414 * of range; the Range forms will raise a RangeError.
6415 * If the beginning or ending offset does not land on a character (codepoint)
6416 * boundary, an IndexError will be raised.
6419 static VALUE
6420 rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6422 long beg, len, vbeg, vlen;
6423 VALUE val;
6424 rb_encoding *enc;
6425 int cr;
6427 rb_check_arity(argc, 2, 5);
6428 if (!(argc == 2 || argc == 3 || argc == 5)) {
6429 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6431 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6432 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6433 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6434 rb_builtin_class_name(argv[0]));
6436 val = argv[1];
6437 StringValue(val);
6438 if (argc == 2) {
6439 /* bytesplice(range, str) */
6440 vbeg = 0;
6441 vlen = RSTRING_LEN(val);
6443 else {
6444 /* bytesplice(range, str, str_range) */
6445 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6446 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6447 rb_builtin_class_name(argv[2]));
6451 else {
6452 beg = NUM2LONG(argv[0]);
6453 len = NUM2LONG(argv[1]);
6454 val = argv[2];
6455 StringValue(val);
6456 if (argc == 3) {
6457 /* bytesplice(index, length, str) */
6458 vbeg = 0;
6459 vlen = RSTRING_LEN(val);
6461 else {
6462 /* bytesplice(index, length, str, str_index, str_length) */
6463 vbeg = NUM2LONG(argv[3]);
6464 vlen = NUM2LONG(argv[4]);
6467 str_check_beg_len(str, &beg, &len);
6468 str_check_beg_len(val, &vbeg, &vlen);
6469 enc = rb_enc_check(str, val);
6470 str_modify_keep_cr(str);
6471 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6472 rb_enc_associate(str, enc);
6473 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
6474 if (cr != ENC_CODERANGE_BROKEN)
6475 ENC_CODERANGE_SET(str, cr);
6476 return str;
6480 * call-seq:
6481 * reverse -> string
6483 * Returns a new string with the characters from +self+ in reverse order.
6485 * 'stressed'.reverse # => "desserts"
6489 static VALUE
6490 rb_str_reverse(VALUE str)
6492 rb_encoding *enc;
6493 VALUE rev;
6494 char *s, *e, *p;
6495 int cr;
6497 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6498 enc = STR_ENC_GET(str);
6499 rev = rb_str_new(0, RSTRING_LEN(str));
6500 s = RSTRING_PTR(str); e = RSTRING_END(str);
6501 p = RSTRING_END(rev);
6502 cr = ENC_CODERANGE(str);
6504 if (RSTRING_LEN(str) > 1) {
6505 if (single_byte_optimizable(str)) {
6506 while (s < e) {
6507 *--p = *s++;
6510 else if (cr == ENC_CODERANGE_VALID) {
6511 while (s < e) {
6512 int clen = rb_enc_fast_mbclen(s, e, enc);
6514 p -= clen;
6515 memcpy(p, s, clen);
6516 s += clen;
6519 else {
6520 cr = rb_enc_asciicompat(enc) ?
6521 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6522 while (s < e) {
6523 int clen = rb_enc_mbclen(s, e, enc);
6525 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6526 p -= clen;
6527 memcpy(p, s, clen);
6528 s += clen;
6532 STR_SET_LEN(rev, RSTRING_LEN(str));
6533 str_enc_copy_direct(rev, str);
6534 ENC_CODERANGE_SET(rev, cr);
6536 return rev;
6541 * call-seq:
6542 * reverse! -> self
6544 * Returns +self+ with its characters reversed:
6546 * s = 'stressed'
6547 * s.reverse! # => "desserts"
6548 * s # => "desserts"
6552 static VALUE
6553 rb_str_reverse_bang(VALUE str)
6555 if (RSTRING_LEN(str) > 1) {
6556 if (single_byte_optimizable(str)) {
6557 char *s, *e, c;
6559 str_modify_keep_cr(str);
6560 s = RSTRING_PTR(str);
6561 e = RSTRING_END(str) - 1;
6562 while (s < e) {
6563 c = *s;
6564 *s++ = *e;
6565 *e-- = c;
6568 else {
6569 str_shared_replace(str, rb_str_reverse(str));
6572 else {
6573 str_modify_keep_cr(str);
6575 return str;
6580 * call-seq:
6581 * include?(other_string) -> true or false
6583 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6585 * s = 'foo'
6586 * s.include?('f') # => true
6587 * s.include?('fo') # => true
6588 * s.include?('food') # => false
6592 VALUE
6593 rb_str_include(VALUE str, VALUE arg)
6595 long i;
6597 StringValue(arg);
6598 i = rb_str_index(str, arg, 0);
6600 return RBOOL(i != -1);
6605 * call-seq:
6606 * to_i(base = 10) -> integer
6608 * Returns the result of interpreting leading characters in +self+
6609 * as an integer in the given +base+ (which must be in (0, 2..36)):
6611 * '123456'.to_i # => 123456
6612 * '123def'.to_i(16) # => 1195503
6614 * With +base+ zero, +self+ may contain leading characters
6615 * to specify the actual base:
6617 * '123def'.to_i(0) # => 123
6618 * '0123def'.to_i(0) # => 83
6619 * '0b123def'.to_i(0) # => 1
6620 * '0o123def'.to_i(0) # => 83
6621 * '0d123def'.to_i(0) # => 123
6622 * '0x123def'.to_i(0) # => 1195503
6624 * Characters past a leading valid number (in the given +base+) are ignored:
6626 * '12.345'.to_i # => 12
6627 * '12345'.to_i(2) # => 1
6629 * Returns zero if there is no leading valid number:
6631 * 'abcdef'.to_i # => 0
6632 * '2'.to_i(2) # => 0
6636 static VALUE
6637 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6639 int base = 10;
6641 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6642 rb_raise(rb_eArgError, "invalid radix %d", base);
6644 return rb_str_to_inum(str, base, FALSE);
6649 * call-seq:
6650 * to_f -> float
6652 * Returns the result of interpreting leading characters in +self+ as a Float:
6654 * '3.14159'.to_f # => 3.14159
6655 * '1.234e-2'.to_f # => 0.01234
6657 * Characters past a leading valid number are ignored:
6659 * '3.14 (pi to two places)'.to_f # => 3.14
6661 * Returns zero if there is no leading valid number:
6663 * 'abcdef'.to_f # => 0.0
6667 static VALUE
6668 rb_str_to_f(VALUE str)
6670 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6675 * call-seq:
6676 * to_s -> self or string
6678 * Returns +self+ if +self+ is a +String+,
6679 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
6682 static VALUE
6683 rb_str_to_s(VALUE str)
6685 if (rb_obj_class(str) != rb_cString) {
6686 return str_duplicate(rb_cString, str);
6688 return str;
#if 0
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    /* Disabled helper: encode code point +c+ in +enc+ and append it to +str+. */
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6703 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6706 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6708 char buf[CHAR_ESC_LEN + 1];
6709 int l;
6711 #if SIZEOF_INT > 4
6712 c &= 0xffffffff;
6713 #endif
6714 if (unicode_p) {
6715 if (c < 0x7F && ISPRINT(c)) {
6716 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6718 else if (c < 0x10000) {
6719 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6721 else {
6722 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6725 else {
6726 if (c < 0x100) {
6727 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6729 else {
6730 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6733 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6734 rb_str_buf_cat(result, buf, l);
6735 return l;
const char *
ruby_escaped_char(int c)
{
    /*
     * Maps a character code to its backslash escape spelling.
     * Returns NULL for any character that has no entry in the table.
     */
    static const struct {
        int code;
        const char *text;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].text;
        }
    }
    return NULL;
}
6756 VALUE
6757 rb_str_escape(VALUE str)
6759 int encidx = ENCODING_GET(str);
6760 rb_encoding *enc = rb_enc_from_index(encidx);
6761 const char *p = RSTRING_PTR(str);
6762 const char *pend = RSTRING_END(str);
6763 const char *prev = p;
6764 char buf[CHAR_ESC_LEN + 1];
6765 VALUE result = rb_str_buf_new(0);
6766 int unicode_p = rb_enc_unicode_p(enc);
6767 int asciicompat = rb_enc_asciicompat(enc);
6769 while (p < pend) {
6770 unsigned int c;
6771 const char *cc;
6772 int n = rb_enc_precise_mbclen(p, pend, enc);
6773 if (!MBCLEN_CHARFOUND_P(n)) {
6774 if (p > prev) str_buf_cat(result, prev, p - prev);
6775 n = rb_enc_mbminlen(enc);
6776 if (pend < p + n)
6777 n = (int)(pend - p);
6778 while (n--) {
6779 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6780 str_buf_cat(result, buf, strlen(buf));
6781 prev = ++p;
6783 continue;
6785 n = MBCLEN_CHARFOUND_LEN(n);
6786 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6787 p += n;
6788 cc = ruby_escaped_char(c);
6789 if (cc) {
6790 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6791 str_buf_cat(result, cc, strlen(cc));
6792 prev = p;
6794 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6796 else {
6797 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6798 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6799 prev = p;
6802 if (p > prev) str_buf_cat(result, prev, p - prev);
6803 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6805 return result;
6809 * call-seq:
6810 * inspect -> string
6812 * Returns a printable version of +self+, enclosed in double-quotes,
6813 * and with special characters escaped:
6815 * s = "foo\tbar\tbaz\n"
6816 * s.inspect
6817 * # => "\"foo\\tbar\\tbaz\\n\""
6821 VALUE
6822 rb_str_inspect(VALUE str)
/* Implementation notes:
 * - The result is associated with `resenc`: the default internal encoding,
 *   or the default external encoding when no internal one is set, or
 *   US-ASCII when that choice is not ASCII-compatible.
 * - `prev` marks the start of the run of source bytes that has been scanned
 *   but not yet appended to `result`; each escape first flushes that run.
 * - `n` is the byte length of the character just read, so `p - n` is where
 *   that character starts once `p` has been advanced past it. */
6824 int encidx = ENCODING_GET(str);
6825 rb_encoding *enc = rb_enc_from_index(encidx);
6826 const char *p, *pend, *prev;
6827 char buf[CHAR_ESC_LEN + 1];
6828 VALUE result = rb_str_buf_new(0);
6829 rb_encoding *resenc = rb_default_internal_encoding();
6830 int unicode_p = rb_enc_unicode_p(enc);
6831 int asciicompat = rb_enc_asciicompat(enc);
6833 if (resenc == NULL) resenc = rb_default_external_encoding();
6834 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6835 rb_enc_associate(result, resenc);
/* opening double quote */
6836 str_buf_cat2(result, "\"");
6838 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6839 prev = p;
6840 while (p < pend) {
6841 unsigned int c, cc;
6842 int n;
6844 n = rb_enc_precise_mbclen(p, pend, enc);
/* invalid byte sequence: flush the pending run and emit one \xNN per byte
 * (at most the encoding's minimum character length, clipped to the end) */
6845 if (!MBCLEN_CHARFOUND_P(n)) {
6846 if (p > prev) str_buf_cat(result, prev, p - prev);
6847 n = rb_enc_mbminlen(enc);
6848 if (pend < p + n)
6849 n = (int)(pend - p);
6850 while (n--) {
6851 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6852 str_buf_cat(result, buf, strlen(buf));
6853 prev = ++p;
6855 continue;
6857 n = MBCLEN_CHARFOUND_LEN(n);
6858 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6859 p += n;
/* a double quote, a backslash, or a '#' that is followed by '$', '@' or '{'
 * gets a backslash in front of it */
6860 if ((asciicompat || unicode_p) &&
6861 (c == '"'|| c == '\\' ||
6862 (c == '#' &&
6863 p < pend &&
6864 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6865 (cc = rb_enc_codepoint(p,pend,enc),
6866 (cc == '$' || cc == '@' || cc == '{'))))) {
6867 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6868 str_buf_cat2(result, "\\");
/* rewinding prev to the character itself makes it part of the next
 * verbatim run, so it is copied right after the backslash */
6869 if (asciicompat || enc == resenc) {
6870 prev = p - n;
6871 continue;
/* control characters with a short escape name; cc == 0 means "none" */
6874 switch (c) {
6875 case '\n': cc = 'n'; break;
6876 case '\r': cc = 'r'; break;
6877 case '\t': cc = 't'; break;
6878 case '\f': cc = 'f'; break;
6879 case '\013': cc = 'v'; break;
6880 case '\010': cc = 'b'; break;
6881 case '\007': cc = 'a'; break;
6882 case 033: cc = 'e'; break;
6883 default: cc = 0; break;
6885 if (cc) {
6886 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6887 buf[0] = '\\';
6888 buf[1] = (char)cc;
6889 str_buf_cat(result, buf, 2);
6890 prev = p;
6891 continue;
6893 /* The special casing of 0x85 (NEXT_LINE) here is because
6894 * Oniguruma historically treats it as printable, but it
6895 * doesn't match the print POSIX bracket class or character
6896 * property in regexps.
6898 * See Ruby Bug #16842 for details:
6899 * https://bugs.ruby-lang.org/issues/16842
6901 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6902 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6903 continue;
6905 else {
6906 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6907 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6908 prev = p;
6909 continue;
6912 if (p > prev) str_buf_cat(result, prev, p - prev);
6913 str_buf_cat2(result, "\"");
6915 return result;
/* True when p lies before e and points at '$', '@' or '{'.  rb_str_dump
 * applies it to the byte that follows a '#' to decide whether that '#'
 * needs a backslash. */
6918 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6921 * call-seq:
6922 * dump -> string
6924 * Returns a printable version of +self+, enclosed in double-quotes,
6925 * with special characters escaped, and with non-printing characters
6926 * replaced by hexadecimal notation:
6928 * "hello \n ''".dump # => "\"hello \\n ''\""
6929 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6931 * Related: String#undump (inverse of String#dump).
6935 VALUE
6936 rb_str_dump(VALUE str)
/* Implementation notes:
 * The function makes two passes over the bytes of `str`.  The first pass
 * only adds up how many bytes the dumped form needs (`len`); the second
 * pass writes the dumped form into a result string allocated with exactly
 * that length.  Both passes must classify every byte the same way.
 * `u8` is set when the string's encoding index is the UTF-8 one; only then
 * are non-ASCII characters written in \u notation instead of \xNN. */
6938 int encidx = rb_enc_get_index(str);
6939 rb_encoding *enc = rb_enc_from_index(encidx);
6940 long len;
6941 const char *p, *pend;
6942 char *q, *qend;
6943 VALUE result;
6944 int u8 = (encidx == rb_utf8_encindex());
6945 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
/* ---- pass 1: compute the output length ---- */
6947 len = 2; /* "" */
/* a non-ASCII-compatible encoding additionally needs the suffix with the
 * "%s" placeholder replaced by the encoding name */
6948 if (!rb_enc_asciicompat(enc)) {
6949 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6950 len += strlen(enc->name);
6953 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6954 while (p < pend) {
6955 int clen;
6956 unsigned char c = *p++;
6958 switch (c) {
6959 case '"': case '\\':
6960 case '\n': case '\r':
6961 case '\t': case '\f':
6962 case '\013': case '\010': case '\007': case '\033':
6963 clen = 2;
6964 break;
6966 case '#':
6967 clen = IS_EVSTR(p, pend) ? 2 : 1;
6968 break;
6970 default:
6971 if (ISPRINT(c)) {
6972 clen = 1;
6974 else {
6975 if (u8 && c > 0x7F) { /* \u notation */
/* p was already advanced past c, hence p-1 is the start of the character */
6976 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6977 if (MBCLEN_CHARFOUND_P(n)) {
6978 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6979 if (cc <= 0xFFFF)
6980 clen = 6; /* \uXXXX */
6981 else if (cc <= 0xFFFFF)
6982 clen = 9; /* \u{XXXXX} */
6983 else
6984 clen = 10; /* \u{XXXXXX} */
6985 p += MBCLEN_CHARFOUND_LEN(n)-1;
6986 break;
6989 clen = 4; /* \xNN */
6991 break;
/* guard the running total against overflowing a long */
6994 if (clen > LONG_MAX - len) {
6995 rb_raise(rb_eRuntimeError, "string size too big");
6997 len += clen;
/* ---- pass 2: write the dumped form ---- */
7000 result = rb_str_new(0, len);
7001 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
/* qend bounds the snprintf calls below; it lies one byte past the len bytes */
7002 q = RSTRING_PTR(result); qend = q + len + 1;
7004 *q++ = '"';
7005 while (p < pend) {
7006 unsigned char c = *p++;
7008 if (c == '"' || c == '\\') {
7009 *q++ = '\\';
7010 *q++ = c;
7012 else if (c == '#') {
7013 if (IS_EVSTR(p, pend)) *q++ = '\\';
7014 *q++ = '#';
7016 else if (c == '\n') {
7017 *q++ = '\\';
7018 *q++ = 'n';
7020 else if (c == '\r') {
7021 *q++ = '\\';
7022 *q++ = 'r';
7024 else if (c == '\t') {
7025 *q++ = '\\';
7026 *q++ = 't';
7028 else if (c == '\f') {
7029 *q++ = '\\';
7030 *q++ = 'f';
7032 else if (c == '\013') {
7033 *q++ = '\\';
7034 *q++ = 'v';
7036 else if (c == '\010') {
7037 *q++ = '\\';
7038 *q++ = 'b';
7040 else if (c == '\007') {
7041 *q++ = '\\';
7042 *q++ = 'a';
7044 else if (c == '\033') {
7045 *q++ = '\\';
7046 *q++ = 'e';
7048 else if (ISPRINT(c)) {
7049 *q++ = c;
7051 else {
7052 *q++ = '\\';
7053 if (u8) {
/* n is the character length minus one (the bytes still to skip), so the
 * CHARFOUND test below only succeeds for characters of two or more bytes */
7054 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7055 if (MBCLEN_CHARFOUND_P(n)) {
7056 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7057 p += n;
7058 if (cc <= 0xFFFF)
7059 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7060 else
7061 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7062 q += strlen(q);
7063 continue;
/* fallback: 'x' plus two hex digits, three bytes after the backslash */
7066 snprintf(q, qend-q, "x%02X", c);
7067 q += 3;
7070 *q++ = '"';
7071 *q = '\0';
/* for a non-ASCII-compatible source the suffix is appended and the result
 * is tagged with the ASCII-8BIT index instead of the source's index */
7072 if (!rb_enc_asciicompat(enc)) {
7073 snprintf(q, qend-q, nonascii_suffix, enc->name);
7074 encidx = rb_ascii8bit_encindex();
7076 /* result from dump is ASCII */
7077 rb_enc_associate_index(result, encidx);
7078 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
7079 return result;
static int
unescape_ascii(unsigned int c)
{
    /*
     * Translates the letter of a single-character backslash escape
     * (n, r, t, f, v, b, a, e) into the control character it denotes.
     * undump_after_backslash() only calls this for those eight letters,
     * so falling out of the loop is treated as unreachable.
     */
    static const char letters[] = {'n', 'r', 't', 'f', 'v', 'b', 'a', 'e'};
    static const char controls[] = {'\n', '\r', '\t', '\f', '\13', '\010', '\007', 033};
    size_t i;

    for (i = 0; i < sizeof(letters); i++) {
        if (c == (unsigned int)letters[i]) {
            return controls[i];
        }
    }
    UNREACHABLE_RETURN(-1);
}
7106 static void
7107 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
/* Decodes one escape sequence of a dumped string.
 *
 * On entry *ss points at the character right after a backslash; on return
 * *ss points past the whole escape.  The decoded bytes are appended to
 * `undumped`.
 *
 * `*utf8` is set once a \u escape has been seen and `*binary` once a \x
 * escape has been seen; seeing one kind after the other raises
 * RuntimeError.  The first \u escape also switches `*penc` and the
 * encoding of `undumped` to UTF-8. */
7109 const char *s = *ss;
7110 unsigned int c;
7111 int codelen;
7112 size_t hexlen;
7113 unsigned char buf[6];
/* looked up on first use and then kept for later calls */
7114 static rb_encoding *enc_utf8 = NULL;
7116 switch (*s) {
7117 case '\\':
7118 case '"':
7119 case '#':
7120 rb_str_cat(undumped, s, 1); /* cat itself */
7121 s++;
7122 break;
7123 case 'n':
7124 case 'r':
7125 case 't':
7126 case 'f':
7127 case 'v':
7128 case 'b':
7129 case 'a':
7130 case 'e':
7131 *buf = unescape_ascii(*s);
7132 rb_str_cat(undumped, (char *)buf, 1);
7133 s++;
7134 break;
7135 case 'u':
7136 if (*binary) {
7137 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7139 *utf8 = true;
7140 if (++s >= s_end) {
7141 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7143 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7144 if (*penc != enc_utf8) {
7145 *penc = enc_utf8;
7146 rb_enc_associate(undumped, enc_utf8);
7148 if (*s == '{') { /* handle \u{...} form */
7149 s++;
/* the braces may hold several code points separated by whitespace */
7150 for (;;) {
7151 if (s >= s_end) {
7152 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7154 if (*s == '}') {
7155 s++;
7156 break;
7158 if (ISSPACE(*s)) {
7159 s++;
7160 continue;
7162 c = scan_hex(s, s_end-s, &hexlen);
7163 if (hexlen == 0 || hexlen > 6) {
7164 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7166 if (c > 0x10ffff) {
7167 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
/* 0xd800..0xdfff (the surrogate range) is rejected */
7169 if (0xd800 <= c && c <= 0xdfff) {
7170 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7172 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7173 rb_str_cat(undumped, (char *)buf, codelen);
7174 s += hexlen;
7177 else { /* handle \uXXXX form */
/* exactly four hex digits are required */
7178 c = scan_hex(s, 4, &hexlen);
7179 if (hexlen != 4) {
7180 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7182 if (0xd800 <= c && c <= 0xdfff) {
7183 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7185 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7186 rb_str_cat(undumped, (char *)buf, codelen);
7187 s += hexlen;
7189 break;
7190 case 'x':
7191 if (*utf8) {
7192 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7194 *binary = true;
7195 if (++s >= s_end) {
7196 rb_raise(rb_eRuntimeError, "invalid hex escape");
/* exactly two hex digits are required */
7198 *buf = scan_hex(s, 2, &hexlen);
7199 if (hexlen != 2) {
7200 rb_raise(rb_eRuntimeError, "invalid hex escape");
7202 rb_str_cat(undumped, (char *)buf, 1);
7203 s += hexlen;
7204 break;
/* unknown escape: keep the backslash (at s-1) and the character as they are */
7205 default:
7206 rb_str_cat(undumped, s-1, 2);
7207 s++;
7210 *ss = s;
/* Forward declaration; str_undump() calls this function, which is defined
 * elsewhere in this file. */
7213 static VALUE rb_str_is_ascii_only_p(VALUE str);
7216 * call-seq:
7217 * undump -> string
7219 * Returns an unescaped version of +self+:
7221 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7222 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7223 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7224 * s_undumped == s_orig # => true
7226 * Related: String#dump (inverse of String#undump).
7230 static VALUE
7231 str_undump(VALUE str)
7233 const char *s = RSTRING_PTR(str);
7234 const char *s_end = RSTRING_END(str);
7235 rb_encoding *enc = rb_enc_get(str);
7236 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7237 bool utf8 = false;
7238 bool binary = false;
7239 int w;
7241 rb_must_asciicompat(str);
7242 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7243 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7245 if (!str_null_check(str, &w)) {
7246 rb_raise(rb_eRuntimeError, "string contains null byte");
7248 if (RSTRING_LEN(str) < 2) goto invalid_format;
7249 if (*s != '"') goto invalid_format;
7251 /* strip '"' at the start */
7252 s++;
7254 for (;;) {
7255 if (s >= s_end) {
7256 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7259 if (*s == '"') {
7260 /* epilogue */
7261 s++;
7262 if (s == s_end) {
7263 /* ascii compatible dumped string */
7264 break;
7266 else {
7267 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7268 static const char dup_suffix[] = ".dup";
7269 const char *encname;
7270 int encidx;
7271 ptrdiff_t size;
7273 /* check separately for strings dumped by older versions */
7274 size = sizeof(dup_suffix) - 1;
7275 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7277 size = sizeof(force_encoding_suffix) - 1;
7278 if (s_end - s <= size) goto invalid_format;
7279 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7280 s += size;
7282 if (utf8) {
7283 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7286 encname = s;
7287 s = memchr(s, '"', s_end-s);
7288 size = s - encname;
7289 if (!s) goto invalid_format;
7290 if (s_end - s != 2) goto invalid_format;
7291 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7293 encidx = rb_enc_find_index2(encname, (long)size);
7294 if (encidx < 0) {
7295 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7297 rb_enc_associate_index(undumped, encidx);
7299 break;
7302 if (*s == '\\') {
7303 s++;
7304 if (s >= s_end) {
7305 rb_raise(rb_eRuntimeError, "invalid escape");
7307 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7309 else {
7310 rb_str_cat(undumped, s++, 1);
7314 RB_GC_GUARD(str);
7316 return undumped;
7317 invalid_format:
7318 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7321 static void
7322 rb_str_check_dummy_enc(rb_encoding *enc)
7324 if (rb_enc_dummy_p(enc)) {
7325 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7326 rb_enc_name(enc));
7330 static rb_encoding *
7331 str_true_enc(VALUE str)
7333 rb_encoding *enc = STR_ENC_GET(str);
7334 rb_str_check_dummy_enc(enc);
7335 return enc;
7338 static OnigCaseFoldType
7339 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7341 if (argc==0)
7342 return flags;
7343 if (argc>2)
7344 rb_raise(rb_eArgError, "too many options");
7345 if (argv[0]==sym_turkic) {
7346 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7347 if (argc==2) {
7348 if (argv[1]==sym_lithuanian)
7349 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7350 else
7351 rb_raise(rb_eArgError, "invalid second option");
7354 else if (argv[0]==sym_lithuanian) {
7355 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7356 if (argc==2) {
7357 if (argv[1]==sym_turkic)
7358 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7359 else
7360 rb_raise(rb_eArgError, "invalid second option");
7363 else if (argc>1)
7364 rb_raise(rb_eArgError, "too many options");
7365 else if (argv[0]==sym_ascii)
7366 flags |= ONIGENC_CASE_ASCII_ONLY;
7367 else if (argv[0]==sym_fold) {
7368 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7369 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7370 else
7371 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7373 else
7374 rb_raise(rb_eArgError, "invalid option");
7375 return flags;
7378 static inline bool
7379 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7381 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7382 return true;
7383 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7386 /* Extra bytes added to every case-mapping buffer on top of the scaled source length. 16 should be long enough to absorb any kind of single character length increase; the value actually used below is 20. */
7387 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Define CASEMAP_DEBUG as non-zero to enable the fprintf(stderr, ...) tuning
 * and diagnostic output in the case-mapping functions; it defaults to 0. */
7388 #ifndef CASEMAP_DEBUG
7389 # define CASEMAP_DEBUG 0
7390 #endif
/* One node of the singly linked list of output buffers that rb_str_casemap
 * fills while mapping a string. */
7392 struct mapping_buffer;
7393 typedef struct mapping_buffer {
/* number of bytes available in space[] */
7394 size_t capa;
/* number of bytes of space[] actually written by the case mapper */
7395 size_t used;
/* next buffer in the chain, or NULL for the last one */
7396 struct mapping_buffer *next;
/* output bytes; the node is allocated with room for capa of them */
7397 OnigUChar space[FLEX_ARY_LEN];
7398 } mapping_buffer;
7400 static void
7401 mapping_buffer_free(void *p)
7403 mapping_buffer *previous_buffer;
7404 mapping_buffer *current_buffer = p;
7405 while (current_buffer) {
7406 previous_buffer = current_buffer;
7407 current_buffer = current_buffer->next;
7408 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7412 static const rb_data_type_t mapping_buffer_type = {
7413 "mapping_buffer",
7414 {0, mapping_buffer_free,},
7415 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7418 static VALUE
7419 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
/* Returns a new string holding the case-mapped form of `source`, using the
 * encoding's case_map callback.  `*flags` is passed to that callback by
 * pointer, so the callback can update it.
 *
 * The output is collected in a chain of mapping_buffer nodes: each round of
 * the loop allocates one more buffer and lets case_map continue from
 * `source_current`, until the whole source has been consumed.  The chain is
 * anchored in a TypedData object (`buffer_anchor`) whose free function is
 * mapping_buffer_free.
 * Raises ArgumentError when case_map reports a negative length. */
7421 VALUE target;
7423 const OnigUChar *source_current, *source_end;
7424 int target_length = 0;
7425 VALUE buffer_anchor;
7426 mapping_buffer *current_buffer = 0;
7427 mapping_buffer **pre_buffer;
7428 size_t buffer_count = 0;
7429 int buffer_length_or_invalid;
7431 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7433 source_current = (OnigUChar*)RSTRING_PTR(source);
7434 source_end = (OnigUChar*)RSTRING_END(source);
7436 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
/* pre_buffer always points at the link that the next new buffer must be
 * stored into: first the anchor's data pointer, then the last node's next */
7437 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7438 while (source_current < source_end) {
7439 /* increase multiplier using buffer count to converge quickly */
7440 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7441 if (CASEMAP_DEBUG) {
7442 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7444 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7445 *pre_buffer = current_buffer;
7446 pre_buffer = &current_buffer->next;
7447 current_buffer->next = NULL;
7448 current_buffer->capa = capa;
7449 buffer_length_or_invalid = enc->case_map(flags,
7450 &source_current, source_end,
7451 current_buffer->space,
7452 current_buffer->space+current_buffer->capa,
7453 enc);
/* on failure, detach the chain from the anchor and free it here, before raising */
7454 if (buffer_length_or_invalid < 0) {
7455 current_buffer = DATA_PTR(buffer_anchor);
7456 DATA_PTR(buffer_anchor) = 0;
7457 mapping_buffer_free(current_buffer);
7458 rb_raise(rb_eArgError, "input string invalid");
7460 target_length += current_buffer->used = buffer_length_or_invalid;
7462 if (CASEMAP_DEBUG) {
7463 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
/* a single buffer is copied into the result directly; otherwise the
 * buffers are concatenated in chain order */
7466 if (buffer_count==1) {
7467 target = rb_str_new((const char*)current_buffer->space, target_length);
7469 else {
7470 char *target_current;
7472 target = rb_str_new(0, target_length);
7473 target_current = RSTRING_PTR(target);
7474 current_buffer = DATA_PTR(buffer_anchor);
7475 while (current_buffer) {
7476 memcpy(target_current, current_buffer->space, current_buffer->used);
7477 target_current += current_buffer->used;
7478 current_buffer = current_buffer->next;
/* detach the chain from the anchor before freeing it, so the anchor no
 * longer refers to the released buffers */
7481 current_buffer = DATA_PTR(buffer_anchor);
7482 DATA_PTR(buffer_anchor) = 0;
7483 mapping_buffer_free(current_buffer);
7485 RB_GC_GUARD(buffer_anchor);
7487 /* TODO: check about string terminator character */
7488 str_enc_copy_direct(target, source);
7489 /*ENC_CODERANGE_SET(mapped, cr);*/
7491 return target;
7494 static VALUE
7495 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7497 const OnigUChar *source_current, *source_end;
7498 OnigUChar *target_current, *target_end;
7499 long old_length = RSTRING_LEN(source);
7500 int length_or_invalid;
7502 if (old_length == 0) return Qnil;
7504 source_current = (OnigUChar*)RSTRING_PTR(source);
7505 source_end = (OnigUChar*)RSTRING_END(source);
7506 if (source == target) {
7507 target_current = (OnigUChar*)source_current;
7508 target_end = (OnigUChar*)source_end;
7510 else {
7511 target_current = (OnigUChar*)RSTRING_PTR(target);
7512 target_end = (OnigUChar*)RSTRING_END(target);
7515 length_or_invalid = onigenc_ascii_only_case_map(flags,
7516 &source_current, source_end,
7517 target_current, target_end, enc);
7518 if (length_or_invalid < 0)
7519 rb_raise(rb_eArgError, "input string invalid");
7520 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7521 fprintf(stderr, "problem with rb_str_ascii_casemap"
7522 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7523 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7524 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7527 str_enc_copy(target, source);
7529 return target;
7532 static bool
7533 upcase_single(VALUE str)
7535 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7536 bool modified = false;
7538 while (s < send) {
7539 unsigned int c = *(unsigned char*)s;
7541 if ('a' <= c && c <= 'z') {
7542 *s = 'A' + (c - 'a');
7543 modified = true;
7545 s++;
7547 return modified;
7551 * call-seq:
7552 * upcase!(*options) -> self or nil
7554 * Upcases the characters in +self+;
7555 * returns +self+ if any changes were made, +nil+ otherwise:
7557 * s = 'Hello World!' # => "Hello World!"
7558 * s.upcase! # => "HELLO WORLD!"
7559 * s # => "HELLO WORLD!"
7560 * s.upcase! # => nil
7562 * The casing may be affected by the given +options+;
7563 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7565 * Related: String#upcase, String#downcase, String#downcase!.
7569 static VALUE
7570 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7572 rb_encoding *enc;
7573 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7575 flags = check_case_options(argc, argv, flags);
7576 str_modify_keep_cr(str);
7577 enc = str_true_enc(str);
7578 if (case_option_single_p(flags, enc, str)) {
7579 if (upcase_single(str))
7580 flags |= ONIGENC_CASE_MODIFIED;
7582 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7583 rb_str_ascii_casemap(str, str, &flags, enc);
7584 else
7585 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7587 if (ONIGENC_CASE_MODIFIED&flags) return str;
7588 return Qnil;
7593 * call-seq:
7594 * upcase(*options) -> string
7596 * Returns a string containing the upcased characters in +self+:
7598 * s = 'Hello World!' # => "Hello World!"
7599 * s.upcase # => "HELLO WORLD!"
7601 * The casing may be affected by the given +options+;
7602 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7604 * Related: String#upcase!, String#downcase, String#downcase!.
7608 static VALUE
7609 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7611 rb_encoding *enc;
7612 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7613 VALUE ret;
7615 flags = check_case_options(argc, argv, flags);
7616 enc = str_true_enc(str);
7617 if (case_option_single_p(flags, enc, str)) {
7618 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7619 str_enc_copy_direct(ret, str);
7620 upcase_single(ret);
7622 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7623 ret = rb_str_new(0, RSTRING_LEN(str));
7624 rb_str_ascii_casemap(str, ret, &flags, enc);
7626 else {
7627 ret = rb_str_casemap(str, &flags, enc);
7630 return ret;
7633 static bool
7634 downcase_single(VALUE str)
7636 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7637 bool modified = false;
7639 while (s < send) {
7640 unsigned int c = *(unsigned char*)s;
7642 if ('A' <= c && c <= 'Z') {
7643 *s = 'a' + (c - 'A');
7644 modified = true;
7646 s++;
7649 return modified;
7653 * call-seq:
7654 * downcase!(*options) -> self or nil
7656 * Downcases the characters in +self+;
7657 * returns +self+ if any changes were made, +nil+ otherwise:
7659 * s = 'Hello World!' # => "Hello World!"
7660 * s.downcase! # => "hello world!"
7661 * s # => "hello world!"
7662 * s.downcase! # => nil
7664 * The casing may be affected by the given +options+;
7665 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7667 * Related: String#downcase, String#upcase, String#upcase!.
7671 static VALUE
7672 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7674 rb_encoding *enc;
7675 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7677 flags = check_case_options(argc, argv, flags);
7678 str_modify_keep_cr(str);
7679 enc = str_true_enc(str);
7680 if (case_option_single_p(flags, enc, str)) {
7681 if (downcase_single(str))
7682 flags |= ONIGENC_CASE_MODIFIED;
7684 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7685 rb_str_ascii_casemap(str, str, &flags, enc);
7686 else
7687 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7689 if (ONIGENC_CASE_MODIFIED&flags) return str;
7690 return Qnil;
7695 * call-seq:
7696 * downcase(*options) -> string
7698 * Returns a string containing the downcased characters in +self+:
7700 * s = 'Hello World!' # => "Hello World!"
7701 * s.downcase # => "hello world!"
7703 * The casing may be affected by the given +options+;
7704 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7706 * Related: String#downcase!, String#upcase, String#upcase!.
7710 static VALUE
7711 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7713 rb_encoding *enc;
7714 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7715 VALUE ret;
7717 flags = check_case_options(argc, argv, flags);
7718 enc = str_true_enc(str);
7719 if (case_option_single_p(flags, enc, str)) {
7720 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7721 str_enc_copy_direct(ret, str);
7722 downcase_single(ret);
7724 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7725 ret = rb_str_new(0, RSTRING_LEN(str));
7726 rb_str_ascii_casemap(str, ret, &flags, enc);
7728 else {
7729 ret = rb_str_casemap(str, &flags, enc);
7732 return ret;
7737 * call-seq:
7738 * capitalize!(*options) -> self or nil
7740 * Upcases the first character in +self+;
7741 * downcases the remaining characters;
7742 * returns +self+ if any changes were made, +nil+ otherwise:
7744 * s = 'hello World!' # => "hello World!"
7745 * s.capitalize! # => "Hello world!"
7746 * s # => "Hello world!"
7747 * s.capitalize! # => nil
7749 * The casing may be affected by the given +options+;
7750 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7752 * Related: String#capitalize.
7756 static VALUE
7757 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7759 rb_encoding *enc;
7760 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7762 flags = check_case_options(argc, argv, flags);
7763 str_modify_keep_cr(str);
7764 enc = str_true_enc(str);
7765 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7766 if (flags&ONIGENC_CASE_ASCII_ONLY)
7767 rb_str_ascii_casemap(str, str, &flags, enc);
7768 else
7769 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7771 if (ONIGENC_CASE_MODIFIED&flags) return str;
7772 return Qnil;
7777 * call-seq:
7778 * capitalize(*options) -> string
7780 * Returns a string containing the characters in +self+;
7781 * the first character is upcased;
7782 * the remaining characters are downcased:
7784 * s = 'hello World!' # => "hello World!"
7785 * s.capitalize # => "Hello world!"
7787 * The casing may be affected by the given +options+;
7788 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7790 * Related: String#capitalize!.
7794 static VALUE
7795 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7797 rb_encoding *enc;
7798 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7799 VALUE ret;
7801 flags = check_case_options(argc, argv, flags);
7802 enc = str_true_enc(str);
7803 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7804 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7805 ret = rb_str_new(0, RSTRING_LEN(str));
7806 rb_str_ascii_casemap(str, ret, &flags, enc);
7808 else {
7809 ret = rb_str_casemap(str, &flags, enc);
7811 return ret;
7816 * call-seq:
7817 * swapcase!(*options) -> self or nil
7819 * Upcases each lowercase character in +self+;
7820 * downcases each uppercase character;
7821 * returns +self+ if any changes were made, +nil+ otherwise:
7823 * s = 'Hello World!' # => "Hello World!"
7824 * s.swapcase! # => "hELLO wORLD!"
7825 * s # => "hELLO wORLD!"
7826 * ''.swapcase! # => nil
7828 * The casing may be affected by the given +options+;
7829 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7831 * Related: String#swapcase.
7835 static VALUE
7836 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7838 rb_encoding *enc;
7839 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7841 flags = check_case_options(argc, argv, flags);
7842 str_modify_keep_cr(str);
7843 enc = str_true_enc(str);
7844 if (flags&ONIGENC_CASE_ASCII_ONLY)
7845 rb_str_ascii_casemap(str, str, &flags, enc);
7846 else
7847 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7849 if (ONIGENC_CASE_MODIFIED&flags) return str;
7850 return Qnil;
7855 * call-seq:
7856 * swapcase(*options) -> string
7858 * Returns a string containing the characters in +self+, with cases reversed;
7859 * each uppercase character is downcased;
7860 * each lowercase character is upcased:
7862 * s = 'Hello World!' # => "Hello World!"
7863 * s.swapcase # => "hELLO wORLD!"
7865 * The casing may be affected by the given +options+;
7866 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7868 * Related: String#swapcase!.
7872 static VALUE
7873 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7875 rb_encoding *enc;
7876 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7877 VALUE ret;
7879 flags = check_case_options(argc, argv, flags);
7880 enc = str_true_enc(str);
7881 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7882 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7883 ret = rb_str_new(0, RSTRING_LEN(str));
7884 rb_str_ascii_casemap(str, ret, &flags, enc);
7886 else {
7887 ret = rb_str_casemap(str, &flags, enc);
7889 return ret;
typedef unsigned char *USTR;

/* Cursor used by trnext() to walk a tr-style character selector. */
struct tr {
    int gen;            /* non-zero while a "a-z" style range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
7900 static unsigned int
7901 trnext(struct tr *t, rb_encoding *enc)
7903 int n;
7905 for (;;) {
7906 nextpart:
7907 if (!t->gen) {
7908 if (t->p == t->pend) return -1;
7909 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7910 t->p += n;
7912 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7913 t->p += n;
7914 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7915 t->p += n;
7916 if (t->p < t->pend) {
7917 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7918 t->p += n;
7919 if (t->now > c) {
7920 if (t->now < 0x80 && c < 0x80) {
7921 rb_raise(rb_eArgError,
7922 "invalid range \"%c-%c\" in string transliteration",
7923 t->now, c);
7925 else {
7926 rb_raise(rb_eArgError, "invalid range in string transliteration");
7928 continue; /* not reached */
7930 else if (t->now < c) {
7931 t->gen = 1;
7932 t->max = c;
7936 return t->now;
7938 else {
7939 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7940 if (t->now == t->max) {
7941 t->gen = 0;
7942 goto nextpart;
7945 if (t->now < t->max) {
7946 return t->now;
7948 else {
7949 t->gen = 0;
7950 return t->max;
7956 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7958 static VALUE
7959 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7961 const unsigned int errc = -1;
7962 unsigned int trans[256];
7963 rb_encoding *enc, *e1, *e2;
7964 struct tr trsrc, trrepl;
7965 int cflag = 0;
7966 unsigned int c, c0, last = 0;
7967 int modify = 0, i, l;
7968 unsigned char *s, *send;
7969 VALUE hash = 0;
7970 int singlebyte = single_byte_optimizable(str);
7971 int termlen;
7972 int cr;
7974 #define CHECK_IF_ASCII(c) \
7975 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7976 (cr = ENC_CODERANGE_VALID) : 0)
7978 StringValue(src);
7979 StringValue(repl);
7980 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7981 if (RSTRING_LEN(repl) == 0) {
7982 return rb_str_delete_bang(1, &src, str);
7985 cr = ENC_CODERANGE(str);
7986 e1 = rb_enc_check(str, src);
7987 e2 = rb_enc_check(str, repl);
7988 if (e1 == e2) {
7989 enc = e1;
7991 else {
7992 enc = rb_enc_check(src, repl);
7994 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7995 if (RSTRING_LEN(src) > 1 &&
7996 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7997 trsrc.p + l < trsrc.pend) {
7998 cflag = 1;
7999 trsrc.p += l;
8001 trrepl.p = RSTRING_PTR(repl);
8002 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8003 trsrc.gen = trrepl.gen = 0;
8004 trsrc.now = trrepl.now = 0;
8005 trsrc.max = trrepl.max = 0;
8007 if (cflag) {
8008 for (i=0; i<256; i++) {
8009 trans[i] = 1;
8011 while ((c = trnext(&trsrc, enc)) != errc) {
8012 if (c < 256) {
8013 trans[c] = errc;
8015 else {
8016 if (!hash) hash = rb_hash_new();
8017 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8020 while ((c = trnext(&trrepl, enc)) != errc)
8021 /* retrieve last replacer */;
8022 last = trrepl.now;
8023 for (i=0; i<256; i++) {
8024 if (trans[i] != errc) {
8025 trans[i] = last;
8029 else {
8030 unsigned int r;
8032 for (i=0; i<256; i++) {
8033 trans[i] = errc;
8035 while ((c = trnext(&trsrc, enc)) != errc) {
8036 r = trnext(&trrepl, enc);
8037 if (r == errc) r = trrepl.now;
8038 if (c < 256) {
8039 trans[c] = r;
8040 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8042 else {
8043 if (!hash) hash = rb_hash_new();
8044 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8049 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8050 cr = ENC_CODERANGE_7BIT;
8051 str_modify_keep_cr(str);
8052 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8053 termlen = rb_enc_mbminlen(enc);
8054 if (sflag) {
8055 int clen, tlen;
8056 long offset, max = RSTRING_LEN(str);
8057 unsigned int save = -1;
8058 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8060 while (s < send) {
8061 int may_modify = 0;
8063 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8064 if (!MBCLEN_CHARFOUND_P(r)) {
8065 xfree(buf);
8066 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8068 clen = MBCLEN_CHARFOUND_LEN(r);
8069 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8071 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8073 s += clen;
8074 if (c < 256) {
8075 c = trans[c];
8077 else if (hash) {
8078 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8079 if (NIL_P(tmp)) {
8080 if (cflag) c = last;
8081 else c = errc;
8083 else if (cflag) c = errc;
8084 else c = NUM2INT(tmp);
8086 else {
8087 c = errc;
8089 if (c != (unsigned int)-1) {
8090 if (save == c) {
8091 CHECK_IF_ASCII(c);
8092 continue;
8094 save = c;
8095 tlen = rb_enc_codelen(c, enc);
8096 modify = 1;
8098 else {
8099 save = -1;
8100 c = c0;
8101 if (enc != e1) may_modify = 1;
8103 if ((offset = t - buf) + tlen > max) {
8104 size_t MAYBE_UNUSED(old) = max + termlen;
8105 max = offset + tlen + (send - s);
8106 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8107 t = buf + offset;
8109 rb_enc_mbcput(c, t, enc);
8110 if (may_modify && memcmp(s, t, tlen) != 0) {
8111 modify = 1;
8113 CHECK_IF_ASCII(c);
8114 t += tlen;
8116 if (!STR_EMBED_P(str)) {
8117 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8119 TERM_FILL((char *)t, termlen);
8120 RSTRING(str)->as.heap.ptr = (char *)buf;
8121 STR_SET_LEN(str, t - buf);
8122 STR_SET_NOEMBED(str);
8123 RSTRING(str)->as.heap.aux.capa = max;
8125 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8126 while (s < send) {
8127 c = (unsigned char)*s;
8128 if (trans[c] != errc) {
8129 if (!cflag) {
8130 c = trans[c];
8131 *s = c;
8132 modify = 1;
8134 else {
8135 *s = last;
8136 modify = 1;
8139 CHECK_IF_ASCII(c);
8140 s++;
8143 else {
8144 int clen, tlen;
8145 long offset, max = (long)((send - s) * 1.2);
8146 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8148 while (s < send) {
8149 int may_modify = 0;
8151 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8152 if (!MBCLEN_CHARFOUND_P(r)) {
8153 xfree(buf);
8154 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8156 clen = MBCLEN_CHARFOUND_LEN(r);
8157 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8159 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8161 if (c < 256) {
8162 c = trans[c];
8164 else if (hash) {
8165 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8166 if (NIL_P(tmp)) {
8167 if (cflag) c = last;
8168 else c = errc;
8170 else if (cflag) c = errc;
8171 else c = NUM2INT(tmp);
8173 else {
8174 c = cflag ? last : errc;
8176 if (c != errc) {
8177 tlen = rb_enc_codelen(c, enc);
8178 modify = 1;
8180 else {
8181 c = c0;
8182 if (enc != e1) may_modify = 1;
8184 if ((offset = t - buf) + tlen > max) {
8185 size_t MAYBE_UNUSED(old) = max + termlen;
8186 max = offset + tlen + (long)((send - s) * 1.2);
8187 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8188 t = buf + offset;
8190 if (s != t) {
8191 rb_enc_mbcput(c, t, enc);
8192 if (may_modify && memcmp(s, t, tlen) != 0) {
8193 modify = 1;
8196 CHECK_IF_ASCII(c);
8197 s += clen;
8198 t += tlen;
8200 if (!STR_EMBED_P(str)) {
8201 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8203 TERM_FILL((char *)t, termlen);
8204 RSTRING(str)->as.heap.ptr = (char *)buf;
8205 STR_SET_LEN(str, t - buf);
8206 STR_SET_NOEMBED(str);
8207 RSTRING(str)->as.heap.aux.capa = max;
8210 if (modify) {
8211 if (cr != ENC_CODERANGE_BROKEN)
8212 ENC_CODERANGE_SET(str, cr);
8213 rb_enc_associate(str, enc);
8214 return str;
8216 return Qnil;
8221 * call-seq:
8222 * tr!(selector, replacements) -> self or nil
8224 * Like String#tr, but modifies +self+ in place.
8225 * Returns +self+ if any changes were made, +nil+ otherwise.
8229 static VALUE
8230 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8232 return tr_trans(str, src, repl, 0);
8237 * call-seq:
8238 * tr(selector, replacements) -> new_string
8240 * Returns a copy of +self+ with each character specified by string +selector+
8241 * translated to the corresponding character in string +replacements+.
8242 * The correspondence is _positional_:
8244 * - Each occurrence of the first character specified by +selector+
8245 * is translated to the first character in +replacements+.
8246 * - Each occurrence of the second character specified by +selector+
8247 * is translated to the second character in +replacements+.
8248 * - And so on.
8250 * Example:
8252 * 'hello'.tr('el', 'ip') #=> "hippo"
8254 * If +replacements+ is shorter than +selector+,
8255 * it is implicitly padded with its own last character:
8257 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8258 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8260 * Arguments +selector+ and +replacements+ must be valid character selectors
8261 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8262 * and may use any of its valid forms, including negation, ranges, and escaping:
8264 * # Negation.
8265 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8266 * # Ranges.
8267 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8268 * # Escapes.
8269 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8270 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8271 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8275 static VALUE
8276 rb_str_tr(VALUE str, VALUE src, VALUE repl)
8278 str = str_duplicate(rb_cString, str);
8279 tr_trans(str, src, repl, 0);
8280 return str;
8283 #define TR_TABLE_MAX (UCHAR_MAX+1)
8284 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8285 static void
8286 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8287 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8289 const unsigned int errc = -1;
8290 char buf[TR_TABLE_MAX];
8291 struct tr tr;
8292 unsigned int c;
8293 VALUE table = 0, ptable = 0;
8294 int i, l, cflag = 0;
8296 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8297 tr.gen = tr.now = tr.max = 0;
8299 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8300 cflag = 1;
8301 tr.p += l;
8303 if (first) {
8304 for (i=0; i<TR_TABLE_MAX; i++) {
8305 stable[i] = 1;
8307 stable[TR_TABLE_MAX] = cflag;
8309 else if (stable[TR_TABLE_MAX] && !cflag) {
8310 stable[TR_TABLE_MAX] = 0;
8312 for (i=0; i<TR_TABLE_MAX; i++) {
8313 buf[i] = cflag;
8316 while ((c = trnext(&tr, enc)) != errc) {
8317 if (c < TR_TABLE_MAX) {
8318 buf[(unsigned char)c] = !cflag;
8320 else {
8321 VALUE key = UINT2NUM(c);
8323 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8324 if (cflag) {
8325 ptable = *ctablep;
8326 table = ptable ? ptable : rb_hash_new();
8327 *ctablep = table;
8329 else {
8330 table = rb_hash_new();
8331 ptable = *tablep;
8332 *tablep = table;
8335 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8336 rb_hash_aset(table, key, Qtrue);
8340 for (i=0; i<TR_TABLE_MAX; i++) {
8341 stable[i] = stable[i] && buf[i];
8343 if (!table && !cflag) {
8344 *tablep = 0;
8349 static int
8350 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8352 if (c < TR_TABLE_MAX) {
8353 return table[c] != 0;
8355 else {
8356 VALUE v = UINT2NUM(c);
8358 if (del) {
8359 if (!NIL_P(rb_hash_lookup(del, v)) &&
8360 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8361 return TRUE;
8364 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8365 return FALSE;
8367 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8372 * call-seq:
8373 * delete!(*selectors) -> self or nil
8375 * Like String#delete, but modifies +self+ in place.
8376 * Returns +self+ if any changes were made, +nil+ otherwise.
8380 static VALUE
8381 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8383 char squeez[TR_TABLE_SIZE];
8384 rb_encoding *enc = 0;
8385 char *s, *send, *t;
8386 VALUE del = 0, nodel = 0;
8387 int modify = 0;
8388 int i, ascompat, cr;
8390 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8391 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8392 for (i=0; i<argc; i++) {
8393 VALUE s = argv[i];
8395 StringValue(s);
8396 enc = rb_enc_check(str, s);
8397 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8400 str_modify_keep_cr(str);
8401 ascompat = rb_enc_asciicompat(enc);
8402 s = t = RSTRING_PTR(str);
8403 send = RSTRING_END(str);
8404 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8405 while (s < send) {
8406 unsigned int c;
8407 int clen;
8409 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8410 if (squeez[c]) {
8411 modify = 1;
8413 else {
8414 if (t != s) *t = c;
8415 t++;
8417 s++;
8419 else {
8420 c = rb_enc_codepoint_len(s, send, &clen, enc);
8422 if (tr_find(c, squeez, del, nodel)) {
8423 modify = 1;
8425 else {
8426 if (t != s) rb_enc_mbcput(c, t, enc);
8427 t += clen;
8428 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8430 s += clen;
8433 TERM_FILL(t, TERM_LEN(str));
8434 STR_SET_LEN(str, t - RSTRING_PTR(str));
8435 ENC_CODERANGE_SET(str, cr);
8437 if (modify) return str;
8438 return Qnil;
8443 * call-seq:
8444 * delete(*selectors) -> new_string
8446 * Returns a copy of +self+ with characters specified by +selectors+ removed
8447 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8449 * "hello".delete "l","lo" #=> "heo"
8450 * "hello".delete "lo" #=> "he"
8451 * "hello".delete "aeiou", "^e" #=> "hell"
8452 * "hello".delete "ej-m" #=> "ho"
8456 static VALUE
8457 rb_str_delete(int argc, VALUE *argv, VALUE str)
8459 str = str_duplicate(rb_cString, str);
8460 rb_str_delete_bang(argc, argv, str);
8461 return str;
8466 * call-seq:
8467 * squeeze!(*selectors) -> self or nil
8469 * Like String#squeeze, but modifies +self+ in place.
8470 * Returns +self+ if any changes were made, +nil+ otherwise.
8473 static VALUE
8474 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8476 char squeez[TR_TABLE_SIZE];
8477 rb_encoding *enc = 0;
8478 VALUE del = 0, nodel = 0;
8479 unsigned char *s, *send, *t;
8480 int i, modify = 0;
8481 int ascompat, singlebyte = single_byte_optimizable(str);
8482 unsigned int save;
8484 if (argc == 0) {
8485 enc = STR_ENC_GET(str);
8487 else {
8488 for (i=0; i<argc; i++) {
8489 VALUE s = argv[i];
8491 StringValue(s);
8492 enc = rb_enc_check(str, s);
8493 if (singlebyte && !single_byte_optimizable(s))
8494 singlebyte = 0;
8495 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8499 str_modify_keep_cr(str);
8500 s = t = (unsigned char *)RSTRING_PTR(str);
8501 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8502 send = (unsigned char *)RSTRING_END(str);
8503 save = -1;
8504 ascompat = rb_enc_asciicompat(enc);
8506 if (singlebyte) {
8507 while (s < send) {
8508 unsigned int c = *s++;
8509 if (c != save || (argc > 0 && !squeez[c])) {
8510 *t++ = save = c;
8514 else {
8515 while (s < send) {
8516 unsigned int c;
8517 int clen;
8519 if (ascompat && (c = *s) < 0x80) {
8520 if (c != save || (argc > 0 && !squeez[c])) {
8521 *t++ = save = c;
8523 s++;
8525 else {
8526 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8528 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8529 if (t != s) rb_enc_mbcput(c, t, enc);
8530 save = c;
8531 t += clen;
8533 s += clen;
8538 TERM_FILL((char *)t, TERM_LEN(str));
8539 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8540 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8541 modify = 1;
8544 if (modify) return str;
8545 return Qnil;
8550 * call-seq:
8551 * squeeze(*selectors) -> new_string
8553 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8554 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8556 * "Squeezed" means that each multiple-character run of a selected character
8557 * is squeezed down to a single character;
8558 * with no arguments given, squeezes all characters:
8560 * "yellow moon".squeeze #=> "yelow mon"
8561 * " now is the".squeeze(" ") #=> " now is the"
8562 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8566 static VALUE
8567 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8569 str = str_duplicate(rb_cString, str);
8570 rb_str_squeeze_bang(argc, argv, str);
8571 return str;
8576 * call-seq:
8577 * tr_s!(selector, replacements) -> self or nil
8579 * Like String#tr_s, but modifies +self+ in place.
8580 * Returns +self+ if any changes were made, +nil+ otherwise.
8582 * Related: String#squeeze!.
8585 static VALUE
8586 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8588 return tr_trans(str, src, repl, 1);
8593 * call-seq:
8594 * tr_s(selector, replacements) -> string
8596 * Like String#tr, but also squeezes the modified portions of the translated string;
8597 * returns a new string (translated and squeezed).
8599 * 'hello'.tr_s('l', 'r') #=> "hero"
8600 * 'hello'.tr_s('el', '-') #=> "h-o"
8601 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8603 * Related: String#squeeze.
8607 static VALUE
8608 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8610 str = str_duplicate(rb_cString, str);
8611 tr_trans(str, src, repl, 1);
8612 return str;
8617 * call-seq:
8618 * count(*selectors) -> integer
8620 * Returns the total number of characters in +self+
8621 * that are specified by the given +selectors+
8622 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8624 * a = "hello world"
8625 * a.count "lo" #=> 5
8626 * a.count "lo", "o" #=> 2
8627 * a.count "hello", "^l" #=> 4
8628 * a.count "ej-m" #=> 4
8630 * "hello^world".count "\\^aeiou" #=> 4
8631 * "hello-world".count "a\\-eo" #=> 4
8633 * c = "hello world\\r\\n"
8634 * c.count "\\" #=> 2
8635 * c.count "\\A" #=> 0
8636 * c.count "X-\\w" #=> 3
8639 static VALUE
8640 rb_str_count(int argc, VALUE *argv, VALUE str)
8642 char table[TR_TABLE_SIZE];
8643 rb_encoding *enc = 0;
8644 VALUE del = 0, nodel = 0, tstr;
8645 char *s, *send;
8646 int i;
8647 int ascompat;
8648 size_t n = 0;
8650 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8652 tstr = argv[0];
8653 StringValue(tstr);
8654 enc = rb_enc_check(str, tstr);
8655 if (argc == 1) {
8656 const char *ptstr;
8657 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8658 (ptstr = RSTRING_PTR(tstr),
8659 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8660 !is_broken_string(str)) {
8661 int clen;
8662 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8664 s = RSTRING_PTR(str);
8665 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8666 send = RSTRING_END(str);
8667 while (s < send) {
8668 if (*(unsigned char*)s++ == c) n++;
8670 return SIZET2NUM(n);
8674 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8675 for (i=1; i<argc; i++) {
8676 tstr = argv[i];
8677 StringValue(tstr);
8678 enc = rb_enc_check(str, tstr);
8679 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8682 s = RSTRING_PTR(str);
8683 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8684 send = RSTRING_END(str);
8685 ascompat = rb_enc_asciicompat(enc);
8686 while (s < send) {
8687 unsigned int c;
8689 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8690 if (table[c]) {
8691 n++;
8693 s++;
8695 else {
8696 int clen;
8697 c = rb_enc_codepoint_len(s, send, &clen, enc);
8698 if (tr_find(c, table, del, nodel)) {
8699 n++;
8701 s += clen;
8705 return SIZET2NUM(n);
8708 static VALUE
8709 rb_fs_check(VALUE val)
8711 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8712 val = rb_check_string_type(val);
8713 if (NIL_P(val)) return 0;
8715 return val;
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (HT, LF, VT, FF, CR and space), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8739 static long
8740 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8742 if (empty_count >= 0 && len == 0) {
8743 return empty_count + 1;
8745 if (empty_count > 0) {
8746 /* make different substrings */
8747 if (result) {
8748 do {
8749 rb_ary_push(result, str_new_empty_String(str));
8750 } while (--empty_count > 0);
8752 else {
8753 do {
8754 rb_yield(str_new_empty_String(str));
8755 } while (--empty_count > 0);
8758 str = rb_str_subseq(str, beg, len);
8759 if (result) {
8760 rb_ary_push(result, str);
8762 else {
8763 rb_yield(str);
8765 return empty_count;
8768 typedef enum {
8769 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
8770 } split_type_t;
8772 static split_type_t
8773 literal_split_pattern(VALUE spat, split_type_t default_type)
8775 rb_encoding *enc = STR_ENC_GET(spat);
8776 const char *ptr;
8777 long len;
8778 RSTRING_GETMEM(spat, ptr, len);
8779 if (len == 0) {
8780 /* Special case - split into chars */
8781 return SPLIT_TYPE_CHARS;
8783 else if (rb_enc_asciicompat(enc)) {
8784 if (len == 1 && ptr[0] == ' ') {
8785 return SPLIT_TYPE_AWK;
8788 else {
8789 int l;
8790 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8791 return SPLIT_TYPE_AWK;
8794 return default_type;
8798 * call-seq:
8799 * split(field_sep = $;, limit = nil) -> array
8800 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8802 * :include: doc/string/split.rdoc
/* Implements String#split: argv[0] is the optional separator, argv[1] the optional limit. */
/* Fields are collected into a new array, or yielded one by one when a block is given */
/* (in which case the receiver itself is returned). */
8806 static VALUE
8807 rb_str_split_m(int argc, VALUE *argv, VALUE str)
8809 rb_encoding *enc;
8810 VALUE spat;
8811 VALUE limit;
8812 split_type_t split_type;
8813 long beg, end, i = 0, empty_count = -1;
8814 int lim = 0;
8815 VALUE result, tmp;
/* result doubles as the mode flag: tested with `result ? ... : ...` below, so */
/* presumably Qfalse is C-false (block mode) and Qnil C-true (array mode) -- TODO confirm. */
8817 result = rb_block_given_p() ? Qfalse : Qnil;
8818 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8819 lim = NUM2INT(limit);
/* A non-positive limit is treated as "no limit" for the counting logic. */
8820 if (lim <= 0) limit = Qnil;
8821 else if (lim == 1) {
/* limit == 1: the whole string is the single field (or no field when empty). */
8822 if (RSTRING_LEN(str) == 0)
8823 return result ? rb_ary_new2(0) : str;
8824 tmp = str_duplicate(rb_cString, str);
8825 if (!result) {
8826 rb_yield(tmp);
8827 return str;
8829 return rb_ary_new3(1, tmp);
8831 i = 1;
/* Without a limit argument (or with limit 0), empty fields are deferred through */
/* empty_count (see split_string); empties never followed by a non-empty field are not emitted. */
8833 if (NIL_P(limit) && !lim) empty_count = 0;
8835 enc = STR_ENC_GET(str);
8836 split_type = SPLIT_TYPE_REGEXP;
/* Resolve the separator: the explicit argument, else rb_fs ($; per the messages below), else awk mode. */
8837 if (!NIL_P(spat)) {
8838 spat = get_pat_quoted(spat, 0);
8840 else if (NIL_P(spat = rb_fs)) {
8841 split_type = SPLIT_TYPE_AWK;
8843 else if (!(spat = rb_fs_check(spat))) {
8844 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8846 else {
8847 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8849 if (split_type != SPLIT_TYPE_AWK) {
8850 switch (BUILTIN_TYPE(spat)) {
8851 case T_REGEXP:
8852 rb_reg_options(spat); /* check if uninitialized */
8853 tmp = RREGEXP_SRC(spat);
8854 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
/* A regexp whose source is a single space is split as the literal string " ", not awk-style. */
8855 if (split_type == SPLIT_TYPE_AWK) {
8856 spat = tmp;
8857 split_type = SPLIT_TYPE_STRING;
8859 break;
8861 case T_STRING:
8862 mustnot_broken(spat);
8863 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8864 break;
8866 default:
8867 UNREACHABLE_RETURN(Qnil);
/* Emits str[beg, len] through split_string and keeps empty_count up to date. */
8871 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8873 beg = 0;
8874 char *ptr = RSTRING_PTR(str);
8875 char *eptr = RSTRING_END(str);
8876 if (split_type == SPLIT_TYPE_AWK) {
/* awk mode: split on runs of whitespace; `skip` is set while consuming separator whitespace. */
8877 char *bptr = ptr;
8878 int skip = 1;
8879 unsigned int c;
8881 if (result) result = rb_ary_new();
8882 end = beg;
8883 if (is_ascii_string(str)) {
/* ASCII-only receiver: scan bytes using the isspacetable lookup. */
8884 while (ptr < eptr) {
8885 c = (unsigned char)*ptr++;
8886 if (skip) {
8887 if (ascii_isspace(c)) {
8888 beg = ptr - bptr;
8890 else {
8891 end = ptr - bptr;
8892 skip = 0;
/* Limit reached: stop here; the remainder is emitted by the final SPLIT_STR below. */
8893 if (!NIL_P(limit) && lim <= i) break;
8896 else if (ascii_isspace(c)) {
8897 SPLIT_STR(beg, end-beg);
8898 skip = 1;
8899 beg = ptr - bptr;
8900 if (!NIL_P(limit)) ++i;
8902 else {
8903 end = ptr - bptr;
8907 else {
/* General case: same state machine, decoding one code point at a time. */
8908 while (ptr < eptr) {
8909 int n;
8911 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8912 ptr += n;
8913 if (skip) {
8914 if (rb_isspace(c)) {
8915 beg = ptr - bptr;
8917 else {
8918 end = ptr - bptr;
8919 skip = 0;
8920 if (!NIL_P(limit) && lim <= i) break;
8923 else if (rb_isspace(c)) {
8924 SPLIT_STR(beg, end-beg);
8925 skip = 1;
8926 beg = ptr - bptr;
8927 if (!NIL_P(limit)) ++i;
8929 else {
8930 end = ptr - bptr;
8935 else if (split_type == SPLIT_TYPE_STRING) {
/* Literal string separator: repeated rb_memsearch from the current position. */
8936 char *str_start = ptr;
8937 char *substr_start = ptr;
8938 char *sptr = RSTRING_PTR(spat);
8939 long slen = RSTRING_LEN(spat);
8941 if (result) result = rb_ary_new();
8942 mustnot_broken(str);
8943 enc = rb_enc_check(str, spat);
8944 while (ptr < eptr &&
8945 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8946 /* Check we are at the start of a char */
8947 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8948 if (t != ptr + end) {
/* Hit is not on a character boundary: resume the search at the character head t. */
8949 ptr = t;
8950 continue;
8952 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8953 ptr += end + slen;
8954 substr_start = ptr;
8955 if (!NIL_P(limit) && lim <= ++i) break;
8957 beg = ptr - str_start;
8959 else if (split_type == SPLIT_TYPE_CHARS) {
/* Empty separator: every character becomes its own field. */
8960 char *str_start = ptr;
8961 int n;
8963 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
8964 mustnot_broken(str);
8965 enc = rb_enc_get(str);
8966 while (ptr < eptr &&
8967 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8968 SPLIT_STR(ptr - str_start, n);
8969 ptr += n;
8970 if (!NIL_P(limit) && lim <= ++i) break;
8972 beg = ptr - str_start;
8974 else {
/* Regexp separator: search repeatedly from `start`; capture groups are emitted as fields too. */
8975 if (result) result = rb_ary_new();
8976 long len = RSTRING_LEN(str);
8977 long start = beg;
8978 long idx;
8979 int last_null = 0;
8980 struct re_registers *regs;
8981 VALUE match = 0;
8983 for (; rb_reg_search(spat, str, start, 0) >= 0;
8984 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8985 match = rb_backref_get();
/* In block mode the match is marked busy while fields are being yielded. */
8986 if (!result) rb_match_busy(match);
8987 regs = RMATCH_REGS(match);
8988 end = BEG(0);
8989 if (start == end && BEG(0) == END(0)) {
/* Empty match at the search position. */
8990 if (!ptr) {
8991 SPLIT_STR(0, 0);
8992 break;
8994 else if (last_null == 1) {
/* Second empty match in a row: emit the single character at beg. */
8995 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8996 beg = start;
8998 else {
/* First empty match: retry the search one character further on. */
8999 if (start == len)
9000 start++;
9001 else
9002 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9003 last_null = 1;
9004 continue;
9007 else {
9008 SPLIT_STR(beg, end-beg);
9009 beg = start = END(0);
9011 last_null = 0;
/* Emit every capture group that participated in the match. */
9013 for (idx=1; idx < regs->num_regs; idx++) {
9014 if (BEG(idx) == -1) continue;
9015 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9017 if (!NIL_P(limit) && lim <= ++i) break;
9019 if (match) rb_match_unbusy(match);
/* Emit the remainder after the last separator (possibly empty when a limit or negative limit was given). */
9021 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9022 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9025 return result ? result : str;
9028 VALUE
9029 rb_str_split(VALUE str, const char *sep0)
9031 VALUE sep;
9033 StringValue(str);
9034 sep = rb_str_new_cstr(sep0);
9035 return rb_str_split_m(1, &sep, str);
9038 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9040 static inline int
9041 enumerator_element(VALUE ary, VALUE e)
9043 if (ary) {
9044 rb_ary_push(ary, e);
9045 return 0;
9047 else {
9048 rb_yield(e);
9049 return 1;
/* Shorthand for enumerator_element(): push onto +ary+ or yield +e+. */
#define ENUM_ELEM(ary, e) enumerator_element((ary), (e))
9055 static const char *
9056 chomp_newline(const char *p, const char *e, rb_encoding *enc)
9058 const char *prev = rb_enc_prev_char(p, e, e, enc);
9059 if (rb_enc_is_newline(prev, e, enc)) {
9060 e = prev;
9061 prev = rb_enc_prev_char(p, e, e, enc);
9062 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9063 e = prev;
9065 return e;
9068 static VALUE
9069 get_rs(void)
9071 VALUE rs = rb_rs;
9072 if (!NIL_P(rs) &&
9073 (!RB_TYPE_P(rs, T_STRING) ||
9074 RSTRING_LEN(rs) != 1 ||
9075 RSTRING_PTR(rs)[0] != '\n')) {
9076 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9078 return rs;
/* From here on, every use of `rb_rs` in this file expands to a call to get_rs(). */
9081 #define rb_rs get_rs()
/*
 * Shared implementation of rb_str_each_line and rb_str_lines.
 *
 * Each line of +str+ is handed to ENUM_ELEM: appended to +ary+ when +ary+
 * is non-zero, yielded otherwise.  Returns +ary+ when it is non-zero, else
 * the original receiver.
 *
 * argv may hold one optional separator plus a +chomp+ keyword.
 */
9083 static VALUE
9084 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9086 rb_encoding *enc;
9087 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9088 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9089 long pos, len, rslen;
9090 int rsnewline = 0;
/* rb_rs expands to get_rs() here; presumably a 0 result means no separator argument was passed */
9092 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9093 rs = rb_rs;
9094 if (!NIL_P(opts)) {
9095 static ID keywords[1];
9096 if (!keywords[0]) {
9097 keywords[0] = rb_intern_const("chomp");
9099 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
/* from here on +chomp+ is a plain C truth value, not a Ruby object */
9100 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
/* nil separator: the whole string is the single element */
9103 if (NIL_P(rs)) {
9104 if (!ENUM_ELEM(ary, str)) {
9105 return ary;
9107 else {
9108 return orig;
9112 if (!RSTRING_LEN(str)) goto end;
/* all scanning below is done on a frozen copy; +orig+ keeps the receiver for the return value */
9113 str = rb_str_new_frozen(str);
9114 ptr = subptr = RSTRING_PTR(str);
9115 pend = RSTRING_END(str);
9116 len = RSTRING_LEN(str);
9117 StringValue(rs);
9118 rslen = RSTRING_LEN(rs);
9120 if (rs == rb_default_rs)
9121 enc = rb_enc_get(str);
9122 else
9123 enc = rb_enc_check(str, rs);
9125 if (rslen == 0) {
9126 /* paragraph mode */
9127 int n;
9128 const char *eol = NULL;
9129 subend = subptr;
9130 while (subend < pend) {
9131 long chomp_rslen = 0;
9132 do {
/* n is forced to 0 unless the character at subend is '\r' */
9133 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9134 n = 0;
9135 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9136 if (rb_enc_is_newline(subend + n, pend, enc)) {
9137 if (eol == subend) break;
9138 subend += rslen;
9139 if (subptr) {
9140 eol = subend;
9141 chomp_rslen = -rslen;
9144 else {
/* subptr is NULL after an emitted paragraph; it is set again at the next non-newline character */
9145 if (!subptr) subptr = subend;
9146 subend += rslen;
9148 rslen = 0;
9149 } while (subend < pend);
9150 if (!subptr) break;
9151 if (rslen == 0) chomp_rslen = 0;
9152 line = rb_str_subseq(str, subptr - ptr,
9153 subend - subptr + (chomp ? chomp_rslen : rslen));
/* ENUM_ELEM returns non-zero only when it yielded; re-validate the string after the block ran */
9154 if (ENUM_ELEM(ary, line)) {
9155 str_mod_check(str, ptr, len);
9157 subptr = eol = NULL;
9159 goto end;
9161 else {
9162 rsptr = RSTRING_PTR(rs);
/* rsnewline: the separator is exactly one minimum-length character and that character is a newline */
9163 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9164 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9165 rsnewline = 1;
/* default separator on a non-ASCII-compatible string: re-encode the separator into the string's encoding */
9169 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9170 rs = rb_str_new(rsptr, rslen);
9171 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9172 rsptr = RSTRING_PTR(rs);
9173 rslen = RSTRING_LEN(rs);
/* main loop: find each occurrence of the separator and emit the text up to and including it */
9176 while (subptr < pend) {
9177 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9178 if (pos < 0) break;
9179 hit = subptr + pos;
/* if the hit is not where rb_enc_right_char_head lands, resume the search from the adjusted position */
9180 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9181 if (hit != adjusted) {
9182 subptr = adjusted;
9183 continue;
9185 subend = hit += rslen;
9186 if (chomp) {
9187 if (rsnewline) {
9188 subend = chomp_newline(subptr, subend, enc);
9190 else {
9191 subend -= rslen;
9194 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9195 if (ENUM_ELEM(ary, line)) {
9196 str_mod_check(str, ptr, len);
9198 subptr = hit;
/* trailing text after the last separator (if any) is emitted as the final line */
9201 if (subptr != pend) {
9202 if (chomp) {
9203 if (rsnewline) {
9204 pend = chomp_newline(subptr, pend, enc);
9206 else if (pend - subptr >= rslen &&
9207 memcmp(pend - rslen, rsptr, rslen) == 0) {
9208 pend -= rslen;
9211 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9212 ENUM_ELEM(ary, line);
9213 RB_GC_GUARD(str);
9216 end:
9217 if (ary)
9218 return ary;
9219 else
9220 return orig;
9224 * call-seq:
9225 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9226 * each_line(line_sep = $/, chomp: false) -> enumerator
9228 * :include: doc/string/each_line.rdoc
9232 static VALUE
9233 rb_str_each_line(int argc, VALUE *argv, VALUE str)
9235 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9236 return rb_str_enumerate_lines(argc, argv, str, 0);
9240 * call-seq:
9241 * lines(line_sep = $/, chomp: false) -> array_of_strings
9243 * Forms substrings ("lines") of +self+ according to the given arguments
9244 * (see String#each_line for details); returns the lines in an array.
9248 static VALUE
9249 rb_str_lines(int argc, VALUE *argv, VALUE str)
9251 VALUE ary = WANTARRAY("lines", 0);
9252 return rb_str_enumerate_lines(argc, argv, str, ary);
9255 static VALUE
9256 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9258 return LONG2FIX(RSTRING_LEN(str));
9261 static VALUE
9262 rb_str_enumerate_bytes(VALUE str, VALUE ary)
9264 long i;
9266 for (i=0; i<RSTRING_LEN(str); i++) {
9267 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9269 if (ary)
9270 return ary;
9271 else
9272 return str;
9276 * call-seq:
9277 * each_byte {|byte| ... } -> self
9278 * each_byte -> enumerator
9280 * :include: doc/string/each_byte.rdoc
9284 static VALUE
9285 rb_str_each_byte(VALUE str)
9287 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9288 return rb_str_enumerate_bytes(str, 0);
9292 * call-seq:
9293 * bytes -> array_of_bytes
9295 * :include: doc/string/bytes.rdoc
9299 static VALUE
9300 rb_str_bytes(VALUE str)
9302 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9303 return rb_str_enumerate_bytes(str, ary);
9306 static VALUE
9307 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9309 return rb_str_length(str);
9312 static VALUE
9313 rb_str_enumerate_chars(VALUE str, VALUE ary)
9315 VALUE orig = str;
9316 long i, len, n;
9317 const char *ptr;
9318 rb_encoding *enc;
9320 str = rb_str_new_frozen(str);
9321 ptr = RSTRING_PTR(str);
9322 len = RSTRING_LEN(str);
9323 enc = rb_enc_get(str);
9325 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9326 for (i = 0; i < len; i += n) {
9327 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9328 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9331 else {
9332 for (i = 0; i < len; i += n) {
9333 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9334 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9337 RB_GC_GUARD(str);
9338 if (ary)
9339 return ary;
9340 else
9341 return orig;
9345 * call-seq:
9346 * each_char {|c| ... } -> self
9347 * each_char -> enumerator
9349 * :include: doc/string/each_char.rdoc
9353 static VALUE
9354 rb_str_each_char(VALUE str)
9356 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9357 return rb_str_enumerate_chars(str, 0);
9361 * call-seq:
9362 * chars -> array_of_characters
9364 * :include: doc/string/chars.rdoc
9368 static VALUE
9369 rb_str_chars(VALUE str)
9371 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9372 return rb_str_enumerate_chars(str, ary);
9375 static VALUE
9376 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9378 VALUE orig = str;
9379 int n;
9380 unsigned int c;
9381 const char *ptr, *end;
9382 rb_encoding *enc;
9384 if (single_byte_optimizable(str))
9385 return rb_str_enumerate_bytes(str, ary);
9387 str = rb_str_new_frozen(str);
9388 ptr = RSTRING_PTR(str);
9389 end = RSTRING_END(str);
9390 enc = STR_ENC_GET(str);
9392 while (ptr < end) {
9393 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9394 ENUM_ELEM(ary, UINT2NUM(c));
9395 ptr += n;
9397 RB_GC_GUARD(str);
9398 if (ary)
9399 return ary;
9400 else
9401 return orig;
9405 * call-seq:
9406 * each_codepoint {|integer| ... } -> self
9407 * each_codepoint -> enumerator
9409 * :include: doc/string/each_codepoint.rdoc
9413 static VALUE
9414 rb_str_each_codepoint(VALUE str)
9416 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9417 return rb_str_enumerate_codepoints(str, 0);
9421 * call-seq:
9422 * codepoints -> array_of_integers
9424 * :include: doc/string/codepoints.rdoc
9428 static VALUE
9429 rb_str_codepoints(VALUE str)
9431 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9432 return rb_str_enumerate_codepoints(str, ary);
/*
 * Compiles and returns a new regex for the pattern \X in encoding +enc+.
 * A compile failure ends in rb_fatal.  The result is created by onig_new;
 * callers in this file release it with onig_free unless it is the cached one.
 */
9435 static regex_t *
9436 get_reg_grapheme_cluster(rb_encoding *enc)
9438 int encidx = rb_enc_to_index(enc);
/* default pattern bytes: backslash + 'X', without the terminating NUL */
9440 const OnigUChar source_ascii[] = "\\X";
9441 const OnigUChar *source = source_ascii;
9442 size_t source_len = sizeof(source_ascii) - 1;
/* for the UTF-16/UTF-32 BE/LE encoding indexes, spell the same two characters as 2- or 4-byte units */
9444 switch (encidx) {
9445 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9446 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9447 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9448 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
/* CASE_UTF(e) emits one `case` that points source/source_len at a static byte table for that encoding */
9449 #define CASE_UTF(e) \
9450 case ENCINDEX_UTF_##e: { \
9451 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9452 source = source_UTF_##e; \
9453 source_len = sizeof(source_UTF_##e); \
9454 break; \
9456 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9457 #undef CASE_UTF
9458 #undef CHARS_16BE
9459 #undef CHARS_16LE
9460 #undef CHARS_32BE
9461 #undef CHARS_32LE
9464 regex_t *reg_grapheme_cluster;
9465 OnigErrorInfo einfo;
9466 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9467 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
/* non-zero r is treated as an error code: translate it to text and abort */
9468 if (r) {
9469 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9470 onig_error_code_to_str(message, r, &einfo);
9471 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9474 return reg_grapheme_cluster;
9477 static regex_t *
9478 get_cached_reg_grapheme_cluster(rb_encoding *enc)
9480 int encidx = rb_enc_to_index(enc);
9481 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9483 if (encidx == rb_utf8_encindex()) {
9484 if (!reg_grapheme_cluster_utf8) {
9485 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9488 return reg_grapheme_cluster_utf8;
9491 return NULL;
9494 static VALUE
9495 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9497 size_t grapheme_cluster_count = 0;
9498 rb_encoding *enc = get_encoding(str);
9499 const char *ptr, *end;
9501 if (!rb_enc_unicode_p(enc)) {
9502 return rb_str_length(str);
9505 bool cached_reg_grapheme_cluster = true;
9506 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9507 if (!reg_grapheme_cluster) {
9508 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9509 cached_reg_grapheme_cluster = false;
9512 ptr = RSTRING_PTR(str);
9513 end = RSTRING_END(str);
9515 while (ptr < end) {
9516 OnigPosition len = onig_match(reg_grapheme_cluster,
9517 (const OnigUChar *)ptr, (const OnigUChar *)end,
9518 (const OnigUChar *)ptr, NULL, 0);
9519 if (len <= 0) break;
9520 grapheme_cluster_count++;
9521 ptr += len;
9524 if (!cached_reg_grapheme_cluster) {
9525 onig_free(reg_grapheme_cluster);
9528 return SIZET2NUM(grapheme_cluster_count);
9531 static VALUE
9532 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9534 VALUE orig = str;
9535 rb_encoding *enc = get_encoding(str);
9536 const char *ptr0, *ptr, *end;
9538 if (!rb_enc_unicode_p(enc)) {
9539 return rb_str_enumerate_chars(str, ary);
9542 if (!ary) str = rb_str_new_frozen(str);
9544 bool cached_reg_grapheme_cluster = true;
9545 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9546 if (!reg_grapheme_cluster) {
9547 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9548 cached_reg_grapheme_cluster = false;
9551 ptr0 = ptr = RSTRING_PTR(str);
9552 end = RSTRING_END(str);
9554 while (ptr < end) {
9555 OnigPosition len = onig_match(reg_grapheme_cluster,
9556 (const OnigUChar *)ptr, (const OnigUChar *)end,
9557 (const OnigUChar *)ptr, NULL, 0);
9558 if (len <= 0) break;
9559 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9560 ptr += len;
9563 if (!cached_reg_grapheme_cluster) {
9564 onig_free(reg_grapheme_cluster);
9567 RB_GC_GUARD(str);
9568 if (ary)
9569 return ary;
9570 else
9571 return orig;
9575 * call-seq:
9576 * each_grapheme_cluster {|gc| ... } -> self
9577 * each_grapheme_cluster -> enumerator
9579 * :include: doc/string/each_grapheme_cluster.rdoc
9583 static VALUE
9584 rb_str_each_grapheme_cluster(VALUE str)
9586 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9587 return rb_str_enumerate_grapheme_clusters(str, 0);
9591 * call-seq:
9592 * grapheme_clusters -> array_of_grapheme_clusters
9594 * :include: doc/string/grapheme_clusters.rdoc
9598 static VALUE
9599 rb_str_grapheme_clusters(VALUE str)
9601 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9602 return rb_str_enumerate_grapheme_clusters(str, ary);
9605 static long
9606 chopped_length(VALUE str)
9608 rb_encoding *enc = STR_ENC_GET(str);
9609 const char *p, *p2, *beg, *end;
9611 beg = RSTRING_PTR(str);
9612 end = beg + RSTRING_LEN(str);
9613 if (beg >= end) return 0;
9614 p = rb_enc_prev_char(beg, end, end, enc);
9615 if (!p) return 0;
9616 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9617 p2 = rb_enc_prev_char(beg, p, end, enc);
9618 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9620 return p - beg;
9624 * call-seq:
9625 * chop! -> self or nil
9627 * Like String#chop, but modifies +self+ in place;
9628 * returns +nil+ if +self+ is empty, +self+ otherwise.
9630 * Related: String#chomp!.
9633 static VALUE
9634 rb_str_chop_bang(VALUE str)
9636 str_modify_keep_cr(str);
9637 if (RSTRING_LEN(str) > 0) {
9638 long len;
9639 len = chopped_length(str);
9640 STR_SET_LEN(str, len);
9641 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9642 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9643 ENC_CODERANGE_CLEAR(str);
9645 return str;
9647 return Qnil;
9652 * call-seq:
9653 * chop -> new_string
9655 * :include: doc/string/chop.rdoc
9659 static VALUE
9660 rb_str_chop(VALUE str)
9662 return rb_str_subseq(str, 0, chopped_length(str));
9665 static long
9666 smart_chomp(VALUE str, const char *e, const char *p)
9668 rb_encoding *enc = rb_enc_get(str);
9669 if (rb_enc_mbminlen(enc) > 1) {
9670 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9671 if (rb_enc_is_newline(pp, e, enc)) {
9672 e = pp;
9674 pp = e - rb_enc_mbminlen(enc);
9675 if (pp >= p) {
9676 pp = rb_enc_left_char_head(p, pp, e, enc);
9677 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9678 e = pp;
9682 else {
9683 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9684 case '\n':
9685 if (--e > p && *(e-1) == '\r') {
9686 --e;
9688 break;
9689 case '\r':
9690 --e;
9691 break;
9694 return e - p;
/*
 * Returns the byte length +str+ would have after chomping the separator
 * +rs+ from its end.  +str+ itself is not modified here.
 */
9697 static long
9698 chompped_length(VALUE str, VALUE rs)
9700 rb_encoding *enc;
9701 int newline;
9702 char *pp, *e, *rsptr;
9703 long rslen;
9704 char *const p = RSTRING_PTR(str);
9705 long len = RSTRING_LEN(str);
9707 if (len == 0) return 0;
9708 e = p + len;
/* the default separator object takes the smart_chomp path directly */
9709 if (rs == rb_default_rs) {
9710 return smart_chomp(str, e, p);
9713 enc = rb_enc_get(str);
9714 RSTRING_GETMEM(rs, rsptr, rslen);
/* empty separator: strip every trailing newline, each optionally preceded by '\r' */
9715 if (rslen == 0) {
9716 if (rb_enc_mbminlen(enc) > 1) {
9717 while (e > p) {
9718 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9719 if (!rb_enc_is_newline(pp, e, enc)) break;
9720 e = pp;
9721 pp -= rb_enc_mbminlen(enc);
9722 if (pp >= p) {
9723 pp = rb_enc_left_char_head(p, pp, e, enc);
9724 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9725 e = pp;
9730 else {
9731 while (e > p && *(e-1) == '\n') {
9732 --e;
9733 if (e > p && *(e-1) == '\r')
9734 --e;
9737 return e - p;
/* a separator longer than the string cannot match */
9739 if (rslen > len) return len;
/* note: enc now refers to the separator's encoding */
9741 enc = rb_enc_get(rs);
/* last byte of the separator; compared against the last byte of str below */
9742 newline = rsptr[rslen-1];
/* a separator that is exactly one newline character is handled by smart_chomp */
9743 if (rslen == rb_enc_mbminlen(enc)) {
9744 if (rslen == 1) {
9745 if (newline == '\n')
9746 return smart_chomp(str, e, p);
9748 else {
9749 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9750 return smart_chomp(str, e, p);
/* NOTE(review): rb_enc_check presumably raises when the two encodings are incompatible - confirm */
9754 enc = rb_enc_check(str, rs);
9755 if (is_broken_string(rs)) {
9756 return len;
9758 pp = e - rslen;
/* cheap last-byte test first, then the full comparison; chomp only if the match starts on a character boundary */
9759 if (p[len-1] == newline &&
9760 (rslen <= 1 ||
9761 memcmp(rsptr, pp, rslen) == 0)) {
9762 if (at_char_boundary(p, pp, e, enc))
9763 return len - rslen;
9764 RB_GC_GUARD(rs);
9766 return len;
9770 * Returns the separator for arguments of rb_str_chomp.
9772 * @return the given separator, or rb_rs ($/) when none is given; the default value of rb_rs ($/) is "\n".
9774 static VALUE
9775 chomp_rs(int argc, const VALUE *argv)
9777 rb_check_arity(argc, 0, 1);
9778 if (argc > 0) {
9779 VALUE rs = argv[0];
9780 if (!NIL_P(rs)) StringValue(rs);
9781 return rs;
9783 else {
9784 return rb_rs;
9788 VALUE
9789 rb_str_chomp_string(VALUE str, VALUE rs)
9791 long olen = RSTRING_LEN(str);
9792 long len = chompped_length(str, rs);
9793 if (len >= olen) return Qnil;
9794 str_modify_keep_cr(str);
9795 STR_SET_LEN(str, len);
9796 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9797 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9798 ENC_CODERANGE_CLEAR(str);
9800 return str;
9804 * call-seq:
9805 * chomp!(line_sep = $/) -> self or nil
9807 * Like String#chomp, but modifies +self+ in place;
9808 * returns +nil+ if no modification made, +self+ otherwise.
9812 static VALUE
9813 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9815 VALUE rs;
9816 str_modifiable(str);
9817 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
9818 rs = chomp_rs(argc, argv);
9819 if (NIL_P(rs)) return Qnil;
9820 return rb_str_chomp_string(str, rs);
9825 * call-seq:
9826 * chomp(line_sep = $/) -> new_string
9828 * :include: doc/string/chomp.rdoc
9832 static VALUE
9833 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9835 VALUE rs = chomp_rs(argc, argv);
9836 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9837 return rb_str_subseq(str, 0, chompped_length(str, rs));
9840 static long
9841 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9843 const char *const start = s;
9845 if (!s || s >= e) return 0;
9847 /* remove spaces at head */
9848 if (single_byte_optimizable(str)) {
9849 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9851 else {
9852 while (s < e) {
9853 int n;
9854 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9856 if (cc && !rb_isspace(cc)) break;
9857 s += n;
9860 return s - start;
9864 * call-seq:
9865 * lstrip! -> self or nil
9867 * Like String#lstrip, except that any modifications are made in +self+;
9868 * returns +self+ if any modifications are made, +nil+ otherwise.
9870 * Related: String#rstrip!, String#strip!.
9873 static VALUE
9874 rb_str_lstrip_bang(VALUE str)
9876 rb_encoding *enc;
9877 char *start, *s;
9878 long olen, loffset;
9880 str_modify_keep_cr(str);
9881 enc = STR_ENC_GET(str);
9882 RSTRING_GETMEM(str, start, olen);
9883 loffset = lstrip_offset(str, start, start+olen, enc);
9884 if (loffset > 0) {
9885 long len = olen-loffset;
9886 s = start + loffset;
9887 memmove(start, s, len);
9888 STR_SET_LEN(str, len);
9889 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9890 return str;
9892 return Qnil;
9897 * call-seq:
9898 * lstrip -> new_string
9900 * Returns a copy of +self+ with leading whitespace removed;
9901 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9903 * whitespace = "\x00\t\n\v\f\r "
9904 * s = whitespace + 'abc' + whitespace
9905 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9906 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9908 * Related: String#rstrip, String#strip.
9911 static VALUE
9912 rb_str_lstrip(VALUE str)
9914 char *start;
9915 long len, loffset;
9916 RSTRING_GETMEM(str, start, len);
9917 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9918 if (loffset <= 0) return str_duplicate(rb_cString, str);
9919 return rb_str_subseq(str, loffset, len - loffset);
9922 static long
9923 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9925 const char *t;
9927 rb_str_check_dummy_enc(enc);
9928 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
9929 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9931 if (!s || s >= e) return 0;
9932 t = e;
9934 /* remove trailing spaces or '\0's */
9935 if (single_byte_optimizable(str)) {
9936 unsigned char c;
9937 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9939 else {
9940 char *tp;
9942 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9943 unsigned int c = rb_enc_codepoint(tp, e, enc);
9944 if (c && !rb_isspace(c)) break;
9945 t = tp;
9948 return e - t;
9952 * call-seq:
9953 * rstrip! -> self or nil
9955 * Like String#rstrip, except that any modifications are made in +self+;
9956 * returns +self+ if any modifications are made, +nil+ otherwise.
9958 * Related: String#lstrip!, String#strip!.
9961 static VALUE
9962 rb_str_rstrip_bang(VALUE str)
9964 rb_encoding *enc;
9965 char *start;
9966 long olen, roffset;
9968 str_modify_keep_cr(str);
9969 enc = STR_ENC_GET(str);
9970 RSTRING_GETMEM(str, start, olen);
9971 roffset = rstrip_offset(str, start, start+olen, enc);
9972 if (roffset > 0) {
9973 long len = olen - roffset;
9975 STR_SET_LEN(str, len);
9976 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9977 return str;
9979 return Qnil;
9984 * call-seq:
9985 * rstrip -> new_string
9987 * Returns a copy of the receiver with trailing whitespace removed;
9988 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9990 * whitespace = "\x00\t\n\v\f\r "
9991 * s = whitespace + 'abc' + whitespace
9992 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9993 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9995 * Related: String#lstrip, String#strip.
9998 static VALUE
9999 rb_str_rstrip(VALUE str)
10001 rb_encoding *enc;
10002 char *start;
10003 long olen, roffset;
10005 enc = STR_ENC_GET(str);
10006 RSTRING_GETMEM(str, start, olen);
10007 roffset = rstrip_offset(str, start, start+olen, enc);
10009 if (roffset <= 0) return str_duplicate(rb_cString, str);
10010 return rb_str_subseq(str, 0, olen-roffset);
10015 * call-seq:
10016 * strip! -> self or nil
10018 * Like String#strip, except that any modifications are made in +self+;
10019 * returns +self+ if any modification are made, +nil+ otherwise.
10021 * Related: String#lstrip!, String#strip!.
10024 static VALUE
10025 rb_str_strip_bang(VALUE str)
10027 char *start;
10028 long olen, loffset, roffset;
10029 rb_encoding *enc;
10031 str_modify_keep_cr(str);
10032 enc = STR_ENC_GET(str);
10033 RSTRING_GETMEM(str, start, olen);
10034 loffset = lstrip_offset(str, start, start+olen, enc);
10035 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10037 if (loffset > 0 || roffset > 0) {
10038 long len = olen-roffset;
10039 if (loffset > 0) {
10040 len -= loffset;
10041 memmove(start, start + loffset, len);
10043 STR_SET_LEN(str, len);
10044 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10045 return str;
10047 return Qnil;
10052 * call-seq:
10053 * strip -> new_string
10055 * Returns a copy of the receiver with leading and trailing whitespace removed;
10056 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10058 * whitespace = "\x00\t\n\v\f\r "
10059 * s = whitespace + 'abc' + whitespace
10060 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10061 * s.strip # => "abc"
10063 * Related: String#lstrip, String#rstrip.
10066 static VALUE
10067 rb_str_strip(VALUE str)
10069 char *start;
10070 long olen, loffset, roffset;
10071 rb_encoding *enc = STR_ENC_GET(str);
10073 RSTRING_GETMEM(str, start, olen);
10074 loffset = lstrip_offset(str, start, start+olen, enc);
10075 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10077 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10078 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10081 static VALUE
10082 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10084 VALUE result = Qnil;
10085 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10086 if (pos >= 0) {
10087 VALUE match;
10088 struct re_registers *regs;
10089 if (BUILTIN_TYPE(pat) == T_STRING) {
10090 regs = NULL;
10091 end = pos + RSTRING_LEN(pat);
10093 else {
10094 match = rb_backref_get();
10095 regs = RMATCH_REGS(match);
10096 pos = BEG(0);
10097 end = END(0);
10100 if (pos == end) {
10101 rb_encoding *enc = STR_ENC_GET(str);
10103 * Always consume at least one character of the input string
10105 if (RSTRING_LEN(str) > end)
10106 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10107 RSTRING_END(str), enc);
10108 else
10109 *start = end + 1;
10111 else {
10112 *start = end;
10115 if (!regs || regs->num_regs == 1) {
10116 result = rb_str_subseq(str, pos, end - pos);
10117 return result;
10119 else {
10120 result = rb_ary_new2(regs->num_regs);
10121 for (int i = 1; i < regs->num_regs; i++) {
10122 VALUE s = Qnil;
10123 if (BEG(i) >= 0) {
10124 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10127 rb_ary_push(result, s);
10131 RB_GC_GUARD(match);
10134 return result;
10139 * call-seq:
10140 * scan(string_or_regexp) -> array
10141 * scan(string_or_regexp) {|matches| ... } -> self
10143 * Matches a pattern against +self+; the pattern is:
10145 * - +string_or_regexp+ itself, if it is a Regexp.
10146 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10148 * Iterates through +self+, generating a collection of matching results:
10150 * - If the pattern contains no groups, each result is the
10151 * matched string, <code>$&</code>.
10152 * - If the pattern contains groups, each result is an array
10153 * containing one entry per group.
10155 * With no block given, returns an array of the results:
10157 * s = 'cruel world'
10158 * s.scan(/\w+/) # => ["cruel", "world"]
10159 * s.scan(/.../) # => ["cru", "el ", "wor"]
10160 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10161 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10163 * With a block given, calls the block with each result; returns +self+:
10165 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10166 * print "\n"
10167 * s.scan(/(.)(.)/) {|x,y| print y, x }
10168 * print "\n"
10170 * Output:
10172 * <<cruel>> <<world>>
10173 * rceu lowlr
10177 static VALUE
10178 rb_str_scan(VALUE str, VALUE pat)
10180 VALUE result;
10181 long start = 0;
10182 long last = -1, prev = 0;
10183 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10185 pat = get_pat_quoted(pat, 1);
10186 mustnot_broken(str);
10187 if (!rb_block_given_p()) {
10188 VALUE ary = rb_ary_new();
10190 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10191 last = prev;
10192 prev = start;
10193 rb_ary_push(ary, result);
10195 if (last >= 0) rb_pat_search(pat, str, last, 1);
10196 else rb_backref_set(Qnil);
10197 return ary;
10200 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10201 last = prev;
10202 prev = start;
10203 rb_yield(result);
10204 str_mod_check(str, p, len);
10206 if (last >= 0) rb_pat_search(pat, str, last, 1);
10207 return str;
10212 * call-seq:
10213 * hex -> integer
10215 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10216 * (with an optional sign and an optional <code>0x</code>) and returns the
10217 * corresponding number;
10218 * returns zero if there is no such leading substring:
10220 * '0x0a'.hex # => 10
10221 * '-1234'.hex # => -4660
10222 * '0'.hex # => 0
10223 * 'non-numeric'.hex # => 0
10225 * Related: String#oct.
10229 static VALUE
10230 rb_str_hex(VALUE str)
10232 return rb_str_to_inum(str, 16, FALSE);
10237 * call-seq:
10238 * oct -> integer
10240 * Interprets the leading substring of +self+ as a string of octal digits
10241 * (with an optional sign) and returns the corresponding number;
10242 * returns zero if there is no such leading substring:
10244 * '123'.oct # => 83
10245 * '-377'.oct # => -255
10246 * '0377non-numeric'.oct # => 255
10247 * 'non-numeric'.oct # => 0
10249 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10250 * see Kernel#Integer.
10252 * Related: String#hex.
10256 static VALUE
10257 rb_str_oct(VALUE str)
10259 return rb_str_to_inum(str, -8, FALSE);
/* Only compiled when HAVE_CRYPT_R is not defined, i.e. when rb_str_crypt calls crypt() rather than crypt_r(). */
10262 #ifndef HAVE_CRYPT_R
10263 # include "ruby/thread_native.h"
10264 # include "ruby/atomic.h"
/* Lock that rb_str_crypt takes around its crypt() call and releases via CRYPT_END(); statically initialised. */
10266 static struct {
10267 rb_nativethread_lock_t lock;
10268 } crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
/* Called by rb_str_crypt before the lock is taken; no statements are visible in its body here. */
10270 static void
10271 crypt_mutex_initialize(void)
10274 #endif
10277 * call-seq:
10278 * crypt(salt_str) -> new_string
10280 * Returns the string generated by calling <code>crypt(3)</code>
10281 * standard library function with <code>str</code> and
10282 * <code>salt_str</code>, in this order, as its arguments. Please do
10283 * not use this method any longer. It is legacy; provided only for
10284 * backward compatibility with ruby scripts in earlier days. It is
10285 * bad to use in contemporary programs for several reasons:
10287 * * Behaviour of C's <code>crypt(3)</code> depends on the OS on which it is
10288 * run. The generated string lacks data portability.
10290 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10291 * (i.e. silently ends up in unexpected results).
10293 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10294 * thread safe.
10296 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10297 * very very weak. According to its manpage, Linux's traditional
10298 * <code>crypt(3)</code> output has only 2**56 variations; too
10299 * easy to brute force today. And this is the default behaviour.
10301 * * In order to make things robust some OSes implement so-called
10302 * "modular" usage. To go through, you have to do a complex
10303 * build-up of the <code>salt_str</code> parameter, by hand.
10304 * Failure in generation of a proper salt string tends not to
10305 * yield any errors; typos in parameters are normally not
10306 * detectable.
10308 * * For instance, in the following example, the second invocation
10309 * of String#crypt is wrong; it has a typo in "round=" (lacks
10310 * "s"). However the call does not fail and something unexpected
10311 * is generated.
10313 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10314 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10316 * * Even in the "modular" mode, some hash functions are considered
10317 * archaic and no longer recommended at all; for instance module
10318 * <code>$1$</code> is officially abandoned by its author: see
10319 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10320 * instance module <code>$3$</code> is considered completely
10321 * broken: see the manpage of FreeBSD.
10323 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
10324 * written above, <code>crypt(3)</code> on Mac OS never fails.
10325 * This means even if you build up a proper salt string it
10326 * generates a traditional DES hash anyways, and there is no way
10327 * for you to be aware of.
10329 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10331 * If for some reason you cannot migrate to other secure contemporary
10332 * password hashing algorithms, install the string-crypt gem and
10333 * <code>require 'string/crypt'</code> to continue using it.
10336 static VALUE
10337 rb_str_crypt(VALUE str, VALUE salt)
/*
 * Two build variants share this body: with HAVE_CRYPT_R the work buffer for
 * crypt_r() comes from ALLOCV and CRYPT_END() releases it; without it,
 * crypt() is called under crypt_mutex and CRYPT_END() unlocks the mutex.
 */
10339 #ifdef HAVE_CRYPT_R
10340 VALUE databuf;
10341 struct crypt_data *data;
10342 # define CRYPT_END() ALLOCV_END(databuf)
10343 #else
10344 extern char *crypt(const char *, const char *);
10345 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10346 #endif
10347 VALUE result;
10348 const char *s, *saltp;
10349 char *res;
10350 #ifdef BROKEN_CRYPT
10351 char salt_8bit_clean[3];
10352 #endif
10354 StringValue(salt);
10355 mustnot_wchar(str);
10356 mustnot_wchar(salt);
10357 s = StringValueCStr(str);
10358 saltp = RSTRING_PTR(salt);
/* the salt needs at least two bytes and neither of the first two may be NUL */
10359 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10360 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10363 #ifdef BROKEN_CRYPT
/* when either of the first two salt bytes is non-ASCII, use a local copy of them with the top bit masked off */
10364 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10365 salt_8bit_clean[0] = saltp[0] & 0x7f;
10366 salt_8bit_clean[1] = saltp[1] & 0x7f;
10367 salt_8bit_clean[2] = '\0';
10368 saltp = salt_8bit_clean;
10370 #endif
10371 #ifdef HAVE_CRYPT_R
10372 data = ALLOCV(databuf, sizeof(struct crypt_data));
10373 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10374 data->initialized = 0;
10375 # endif
10376 res = crypt_r(s, saltp, data);
10377 #else
10378 crypt_mutex_initialize();
10379 rb_nativethread_lock_lock(&crypt_mutex.lock);
10380 res = crypt(s, saltp);
10381 #endif
/* on failure, errno is saved before CRYPT_END() runs and is then used to raise */
10382 if (!res) {
10383 int err = errno;
10384 CRYPT_END();
10385 rb_syserr_fail(err, "crypt");
/* the result is copied into a Ruby string before CRYPT_END() runs */
10387 result = rb_str_new_cstr(res);
10388 CRYPT_END();
10389 return result;
10394 * call-seq:
10395 * ord -> integer
10397 * :include: doc/string/ord.rdoc
10401 static VALUE
10402 rb_str_ord(VALUE s)
10404 unsigned int c;
10406 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10407 return UINT2NUM(c);
10410 * call-seq:
10411 * sum(n = 16) -> integer
10413 * :include: doc/string/sum.rdoc
10417 static VALUE
10418 rb_str_sum(int argc, VALUE *argv, VALUE str)
10420 int bits = 16;
10421 char *ptr, *p, *pend;
10422 long len;
10423 VALUE sum = INT2FIX(0);
10424 unsigned long sum0 = 0;
10426 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10427 bits = 0;
10429 ptr = p = RSTRING_PTR(str);
10430 len = RSTRING_LEN(str);
10431 pend = p + len;
10433 while (p < pend) {
10434 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10435 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10436 str_mod_check(str, ptr, len);
10437 sum0 = 0;
10439 sum0 += (unsigned char)*p;
10440 p++;
10443 if (bits == 0) {
10444 if (sum0) {
10445 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10448 else {
10449 if (sum == INT2FIX(0)) {
10450 if (bits < (int)sizeof(long)*CHAR_BIT) {
10451 sum0 &= (((unsigned long)1)<<bits)-1;
10453 sum = LONG2FIX(sum0);
10455 else {
10456 VALUE mod;
10458 if (sum0) {
10459 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10462 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10463 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10464 sum = rb_funcall(sum, '&', 1, mod);
10467 return sum;
10470 static VALUE
10471 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10473 rb_encoding *enc;
10474 VALUE w;
10475 long width, len, flen = 1, fclen = 1;
10476 VALUE res;
10477 char *p;
10478 const char *f = " ";
10479 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10480 VALUE pad;
10481 int singlebyte = 1, cr;
10482 int termlen;
10484 rb_scan_args(argc, argv, "11", &w, &pad);
10485 enc = STR_ENC_GET(str);
10486 termlen = rb_enc_mbminlen(enc);
10487 width = NUM2LONG(w);
10488 if (argc == 2) {
10489 StringValue(pad);
10490 enc = rb_enc_check(str, pad);
10491 f = RSTRING_PTR(pad);
10492 flen = RSTRING_LEN(pad);
10493 fclen = str_strlen(pad, enc); /* rb_enc_check */
10494 singlebyte = single_byte_optimizable(pad);
10495 if (flen == 0 || fclen == 0) {
10496 rb_raise(rb_eArgError, "zero width padding");
10499 len = str_strlen(str, enc); /* rb_enc_check */
10500 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10501 n = width - len;
10502 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10503 rlen = n - llen;
10504 cr = ENC_CODERANGE(str);
10505 if (flen > 1) {
10506 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10507 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10509 size = RSTRING_LEN(str);
10510 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10511 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10512 (len += llen2 + rlen2) >= LONG_MAX - size) {
10513 rb_raise(rb_eArgError, "argument too big");
10515 len += size;
10516 res = str_new0(rb_cString, 0, len, termlen);
10517 p = RSTRING_PTR(res);
10518 if (flen <= 1) {
10519 memset(p, *f, llen);
10520 p += llen;
10522 else {
10523 while (llen >= fclen) {
10524 memcpy(p,f,flen);
10525 p += flen;
10526 llen -= fclen;
10528 if (llen > 0) {
10529 memcpy(p, f, llen2);
10530 p += llen2;
10533 memcpy(p, RSTRING_PTR(str), size);
10534 p += size;
10535 if (flen <= 1) {
10536 memset(p, *f, rlen);
10537 p += rlen;
10539 else {
10540 while (rlen >= fclen) {
10541 memcpy(p,f,flen);
10542 p += flen;
10543 rlen -= fclen;
10545 if (rlen > 0) {
10546 memcpy(p, f, rlen2);
10547 p += rlen2;
10550 TERM_FILL(p, termlen);
10551 STR_SET_LEN(res, p-RSTRING_PTR(res));
10552 rb_enc_associate(res, enc);
10553 if (argc == 2)
10554 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10555 if (cr != ENC_CODERANGE_BROKEN)
10556 ENC_CODERANGE_SET(res, cr);
10558 RB_GC_GUARD(pad);
10559 return res;
10564 * call-seq:
10565 * ljust(size, pad_string = ' ') -> new_string
10567 * :include: doc/string/ljust.rdoc
10569 * Related: String#rjust, String#center.
10573 static VALUE
10574 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10576 return rb_str_justify(argc, argv, str, 'l');
10580 * call-seq:
10581 * rjust(size, pad_string = ' ') -> new_string
10583 * :include: doc/string/rjust.rdoc
10585 * Related: String#ljust, String#center.
10589 static VALUE
10590 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10592 return rb_str_justify(argc, argv, str, 'r');
10597 * call-seq:
10598 * center(size, pad_string = ' ') -> new_string
10600 * :include: doc/string/center.rdoc
10602 * Related: String#ljust, String#rjust.
10606 static VALUE
10607 rb_str_center(int argc, VALUE *argv, VALUE str)
10609 return rb_str_justify(argc, argv, str, 'c');
10613 * call-seq:
10614 * partition(string_or_regexp) -> [head, match, tail]
10616 * :include: doc/string/partition.rdoc
10620 static VALUE
10621 rb_str_partition(VALUE str, VALUE sep)
10623 long pos;
10625 sep = get_pat_quoted(sep, 0);
10626 if (RB_TYPE_P(sep, T_REGEXP)) {
10627 if (rb_reg_search(sep, str, 0, 0) < 0) {
10628 goto failed;
10630 VALUE match = rb_backref_get();
10631 struct re_registers *regs = RMATCH_REGS(match);
10633 pos = BEG(0);
10634 sep = rb_str_subseq(str, pos, END(0) - pos);
10636 else {
10637 pos = rb_str_index(str, sep, 0);
10638 if (pos < 0) goto failed;
10640 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10641 sep,
10642 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10643 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10645 failed:
10646 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10650 * call-seq:
10651 * rpartition(sep) -> [head, match, tail]
10653 * :include: doc/string/rpartition.rdoc
10657 static VALUE
10658 rb_str_rpartition(VALUE str, VALUE sep)
10660 long pos = RSTRING_LEN(str);
10662 sep = get_pat_quoted(sep, 0);
10663 if (RB_TYPE_P(sep, T_REGEXP)) {
10664 if (rb_reg_search(sep, str, pos, 1) < 0) {
10665 goto failed;
10667 VALUE match = rb_backref_get();
10668 struct re_registers *regs = RMATCH_REGS(match);
10670 pos = BEG(0);
10671 sep = rb_str_subseq(str, pos, END(0) - pos);
10673 else {
10674 pos = rb_str_sublen(str, pos);
10675 pos = rb_str_rindex(str, sep, pos);
10676 if (pos < 0) {
10677 goto failed;
10681 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10682 sep,
10683 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10684 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10685 failed:
10686 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10690 * call-seq:
10691 * start_with?(*string_or_regexp) -> true or false
10693 * :include: doc/string/start_with_p.rdoc
10697 static VALUE
10698 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10700 int i;
10702 for (i=0; i<argc; i++) {
10703 VALUE tmp = argv[i];
10704 if (RB_TYPE_P(tmp, T_REGEXP)) {
10705 if (rb_reg_start_with_p(tmp, str))
10706 return Qtrue;
10708 else {
10709 const char *p, *s, *e;
10710 long slen, tlen;
10711 rb_encoding *enc;
10713 StringValue(tmp);
10714 enc = rb_enc_check(str, tmp);
10715 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10716 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10717 p = RSTRING_PTR(str);
10718 e = p + slen;
10719 s = p + tlen;
10720 if (!at_char_right_boundary(p, s, e, enc))
10721 continue;
10722 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
10723 return Qtrue;
10726 return Qfalse;
10730 * call-seq:
10731 * end_with?(*strings) -> true or false
10733 * :include: doc/string/end_with_p.rdoc
10737 static VALUE
10738 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10740 int i;
10742 for (i=0; i<argc; i++) {
10743 VALUE tmp = argv[i];
10744 const char *p, *s, *e;
10745 long slen, tlen;
10746 rb_encoding *enc;
10748 StringValue(tmp);
10749 enc = rb_enc_check(str, tmp);
10750 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10751 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10752 p = RSTRING_PTR(str);
10753 e = p + slen;
10754 s = e - tlen;
10755 if (!at_char_boundary(p, s, e, enc))
10756 continue;
10757 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
10758 return Qtrue;
10760 return Qfalse;
10764 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10765 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10767 * @param str the target
10768 * @param prefix the prefix
10769 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10770 * @retval Positive-Integer otherwise
10772 static long
10773 deleted_prefix_length(VALUE str, VALUE prefix)
10775 const char *strptr, *prefixptr;
10776 long olen, prefixlen;
10777 rb_encoding *enc = rb_enc_get(str);
10779 StringValue(prefix);
10781 if (!is_broken_string(prefix) ||
10782 !rb_enc_asciicompat(enc) ||
10783 !rb_enc_asciicompat(rb_enc_get(prefix))) {
10784 enc = rb_enc_check(str, prefix);
10787 /* return 0 if not start with prefix */
10788 prefixlen = RSTRING_LEN(prefix);
10789 if (prefixlen <= 0) return 0;
10790 olen = RSTRING_LEN(str);
10791 if (olen < prefixlen) return 0;
10792 strptr = RSTRING_PTR(str);
10793 prefixptr = RSTRING_PTR(prefix);
10794 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10795 if (is_broken_string(prefix)) {
10796 if (!is_broken_string(str)) {
10797 /* prefix in a valid string cannot be broken */
10798 return 0;
10800 const char *strend = strptr + olen;
10801 const char *after_prefix = strptr + prefixlen;
10802 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
10803 /* prefix does not end at char-boundary */
10804 return 0;
10807 /* prefix part in `str` also should be valid. */
10809 return prefixlen;
10813 * call-seq:
10814 * delete_prefix!(prefix) -> self or nil
10816 * Like String#delete_prefix, except that +self+ is modified in place.
10817 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10821 static VALUE
10822 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10824 long prefixlen;
10825 str_modify_keep_cr(str);
10827 prefixlen = deleted_prefix_length(str, prefix);
10828 if (prefixlen <= 0) return Qnil;
10830 return rb_str_drop_bytes(str, prefixlen);
10834 * call-seq:
10835 * delete_prefix(prefix) -> new_string
10837 * :include: doc/string/delete_prefix.rdoc
10841 static VALUE
10842 rb_str_delete_prefix(VALUE str, VALUE prefix)
10844 long prefixlen;
10846 prefixlen = deleted_prefix_length(str, prefix);
10847 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10849 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10853 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10854 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10856 * @param str the target
10857 * @param suffix the suffix
10858 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10859 * @retval Positive-Integer otherwise
10861 static long
10862 deleted_suffix_length(VALUE str, VALUE suffix)
10864 const char *strptr, *suffixptr;
10865 long olen, suffixlen;
10866 rb_encoding *enc;
10868 StringValue(suffix);
10869 if (is_broken_string(suffix)) return 0;
10870 enc = rb_enc_check(str, suffix);
10872 /* return 0 if not start with suffix */
10873 suffixlen = RSTRING_LEN(suffix);
10874 if (suffixlen <= 0) return 0;
10875 olen = RSTRING_LEN(str);
10876 if (olen < suffixlen) return 0;
10877 strptr = RSTRING_PTR(str);
10878 suffixptr = RSTRING_PTR(suffix);
10879 const char *strend = strptr + olen;
10880 const char *before_suffix = strend - suffixlen;
10881 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
10882 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
10884 return suffixlen;
10888 * call-seq:
10889 * delete_suffix!(suffix) -> self or nil
10891 * Like String#delete_suffix, except that +self+ is modified in place.
10892 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10896 static VALUE
10897 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10899 long olen, suffixlen, len;
10900 str_modifiable(str);
10902 suffixlen = deleted_suffix_length(str, suffix);
10903 if (suffixlen <= 0) return Qnil;
10905 olen = RSTRING_LEN(str);
10906 str_modify_keep_cr(str);
10907 len = olen - suffixlen;
10908 STR_SET_LEN(str, len);
10909 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10910 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10911 ENC_CODERANGE_CLEAR(str);
10913 return str;
10917 * call-seq:
10918 * delete_suffix(suffix) -> new_string
10920 * :include: doc/string/delete_suffix.rdoc
10924 static VALUE
10925 rb_str_delete_suffix(VALUE str, VALUE suffix)
10927 long suffixlen;
10929 suffixlen = deleted_suffix_length(str, suffix);
10930 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10932 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10935 void
10936 rb_str_setter(VALUE val, ID id, VALUE *var)
10938 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10939 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10941 *var = val;
10944 static void
10945 rb_fs_setter(VALUE val, ID id, VALUE *var)
10947 val = rb_fs_check(val);
10948 if (!val) {
10949 rb_raise(rb_eTypeError,
10950 "value of %"PRIsVALUE" must be String or Regexp",
10951 rb_id2str(id));
10953 if (!NIL_P(val)) {
10954 rb_warn_deprecated("'$;'", NULL);
10956 *var = val;
10961 * call-seq:
10962 * force_encoding(encoding) -> self
10964 * :include: doc/string/force_encoding.rdoc
10968 static VALUE
10969 rb_str_force_encoding(VALUE str, VALUE enc)
10971 str_modifiable(str);
10973 rb_encoding *encoding = rb_to_encoding(enc);
10974 int idx = rb_enc_to_index(encoding);
10976 // If the encoding is unchanged, we do nothing.
10977 if (ENCODING_GET(str) == idx) {
10978 return str;
10981 rb_enc_associate_index(str, idx);
10983 // If the coderange was 7bit and the new encoding is ASCII-compatible
10984 // we can keep the coderange.
10985 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
10986 return str;
10989 ENC_CODERANGE_CLEAR(str);
10990 return str;
10994 * call-seq:
10995 * b -> string
10997 * :include: doc/string/b.rdoc
11001 static VALUE
11002 rb_str_b(VALUE str)
11004 VALUE str2;
11005 if (STR_EMBED_P(str)) {
11006 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11008 else {
11009 str2 = str_alloc_heap(rb_cString);
11011 str_replace_shared_without_enc(str2, str);
11013 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11014 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11015 // If we know the receiver's code range then we know the result's code range.
11016 int cr = ENC_CODERANGE(str);
11017 switch (cr) {
11018 case ENC_CODERANGE_7BIT:
11019 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
11020 break;
11021 case ENC_CODERANGE_BROKEN:
11022 case ENC_CODERANGE_VALID:
11023 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
11024 break;
11025 default:
11026 ENC_CODERANGE_CLEAR(str2);
11027 break;
11031 return str2;
11035 * call-seq:
11036 * valid_encoding? -> true or false
11038 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11040 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11041 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11042 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11045 static VALUE
11046 rb_str_valid_encoding_p(VALUE str)
11048 int cr = rb_enc_str_coderange(str);
11050 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11054 * call-seq:
11055 * ascii_only? -> true or false
11057 * Returns +true+ if +self+ contains only ASCII characters,
11058 * +false+ otherwise:
11060 * 'abc'.ascii_only? # => true
11061 * "abc\u{6666}".ascii_only? # => false
11065 static VALUE
11066 rb_str_is_ascii_only_p(VALUE str)
11068 int cr = rb_enc_str_coderange(str);
11070 return RBOOL(cr == ENC_CODERANGE_7BIT);
11073 VALUE
11074 rb_str_ellipsize(VALUE str, long len)
11076 static const char ellipsis[] = "...";
11077 const long ellipsislen = sizeof(ellipsis) - 1;
11078 rb_encoding *const enc = rb_enc_get(str);
11079 const long blen = RSTRING_LEN(str);
11080 const char *const p = RSTRING_PTR(str), *e = p + blen;
11081 VALUE estr, ret = 0;
11083 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11084 if (len * rb_enc_mbminlen(enc) >= blen ||
11085 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11086 ret = str;
11088 else if (len <= ellipsislen ||
11089 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11090 if (rb_enc_asciicompat(enc)) {
11091 ret = rb_str_new(ellipsis, len);
11092 rb_enc_associate(ret, enc);
11094 else {
11095 estr = rb_usascii_str_new(ellipsis, len);
11096 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11099 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11100 rb_str_cat(ret, ellipsis, ellipsislen);
11102 else {
11103 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11104 rb_enc_from_encoding(enc), 0, Qnil);
11105 rb_str_append(ret, estr);
11107 return ret;
11110 static VALUE
11111 str_compat_and_valid(VALUE str, rb_encoding *enc)
11113 int cr;
11114 str = StringValue(str);
11115 cr = rb_enc_str_coderange(str);
11116 if (cr == ENC_CODERANGE_BROKEN) {
11117 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11119 else {
11120 rb_encoding *e = STR_ENC_GET(str);
11121 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11122 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11123 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11126 return str;
11129 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11131 VALUE
11132 rb_str_scrub(VALUE str, VALUE repl)
11134 rb_encoding *enc = STR_ENC_GET(str);
11135 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11138 VALUE
11139 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11141 int cr = ENC_CODERANGE_UNKNOWN;
11142 if (enc == STR_ENC_GET(str)) {
11143 /* cached coderange makes sense only when enc equals the
11144 * actual encoding of str */
11145 cr = ENC_CODERANGE(str);
11147 return enc_str_scrub(enc, str, repl, cr);
11150 static VALUE
11151 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11153 int encidx;
11154 VALUE buf = Qnil;
11155 const char *rep, *p, *e, *p1, *sp;
11156 long replen = -1;
11157 long slen;
11159 if (rb_block_given_p()) {
11160 if (!NIL_P(repl))
11161 rb_raise(rb_eArgError, "both of block and replacement given");
11162 replen = 0;
11165 if (ENC_CODERANGE_CLEAN_P(cr))
11166 return Qnil;
11168 if (!NIL_P(repl)) {
11169 repl = str_compat_and_valid(repl, enc);
11172 if (rb_enc_dummy_p(enc)) {
11173 return Qnil;
11175 encidx = rb_enc_to_index(enc);
11177 #define DEFAULT_REPLACE_CHAR(str) do { \
11178 static const char replace[sizeof(str)-1] = str; \
11179 rep = replace; replen = (int)sizeof(replace); \
11180 } while (0)
11182 slen = RSTRING_LEN(str);
11183 p = RSTRING_PTR(str);
11184 e = RSTRING_END(str);
11185 p1 = p;
11186 sp = p;
11188 if (rb_enc_asciicompat(enc)) {
11189 int rep7bit_p;
11190 if (!replen) {
11191 rep = NULL;
11192 rep7bit_p = FALSE;
11194 else if (!NIL_P(repl)) {
11195 rep = RSTRING_PTR(repl);
11196 replen = RSTRING_LEN(repl);
11197 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11199 else if (encidx == rb_utf8_encindex()) {
11200 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11201 rep7bit_p = FALSE;
11203 else {
11204 DEFAULT_REPLACE_CHAR("?");
11205 rep7bit_p = TRUE;
11207 cr = ENC_CODERANGE_7BIT;
11209 p = search_nonascii(p, e);
11210 if (!p) {
11211 p = e;
11213 while (p < e) {
11214 int ret = rb_enc_precise_mbclen(p, e, enc);
11215 if (MBCLEN_NEEDMORE_P(ret)) {
11216 break;
11218 else if (MBCLEN_CHARFOUND_P(ret)) {
11219 cr = ENC_CODERANGE_VALID;
11220 p += MBCLEN_CHARFOUND_LEN(ret);
11222 else if (MBCLEN_INVALID_P(ret)) {
11224 * p1~p: valid ascii/multibyte chars
11225 * p ~e: invalid bytes + unknown bytes
11227 long clen = rb_enc_mbmaxlen(enc);
11228 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11229 if (p > p1) {
11230 rb_str_buf_cat(buf, p1, p - p1);
11233 if (e - p < clen) clen = e - p;
11234 if (clen <= 2) {
11235 clen = 1;
11237 else {
11238 const char *q = p;
11239 clen--;
11240 for (; clen > 1; clen--) {
11241 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11242 if (MBCLEN_NEEDMORE_P(ret)) break;
11243 if (MBCLEN_INVALID_P(ret)) continue;
11244 UNREACHABLE;
11247 if (rep) {
11248 rb_str_buf_cat(buf, rep, replen);
11249 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11251 else {
11252 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11253 str_mod_check(str, sp, slen);
11254 repl = str_compat_and_valid(repl, enc);
11255 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11256 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11257 cr = ENC_CODERANGE_VALID;
11259 p += clen;
11260 p1 = p;
11261 p = search_nonascii(p, e);
11262 if (!p) {
11263 p = e;
11264 break;
11267 else {
11268 UNREACHABLE;
11271 if (NIL_P(buf)) {
11272 if (p == e) {
11273 ENC_CODERANGE_SET(str, cr);
11274 return Qnil;
11276 buf = rb_str_buf_new(RSTRING_LEN(str));
11278 if (p1 < p) {
11279 rb_str_buf_cat(buf, p1, p - p1);
11281 if (p < e) {
11282 if (rep) {
11283 rb_str_buf_cat(buf, rep, replen);
11284 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11286 else {
11287 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11288 str_mod_check(str, sp, slen);
11289 repl = str_compat_and_valid(repl, enc);
11290 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11291 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11292 cr = ENC_CODERANGE_VALID;
11296 else {
11297 /* ASCII incompatible */
11298 long mbminlen = rb_enc_mbminlen(enc);
11299 if (!replen) {
11300 rep = NULL;
11302 else if (!NIL_P(repl)) {
11303 rep = RSTRING_PTR(repl);
11304 replen = RSTRING_LEN(repl);
11306 else if (encidx == ENCINDEX_UTF_16BE) {
11307 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11309 else if (encidx == ENCINDEX_UTF_16LE) {
11310 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11312 else if (encidx == ENCINDEX_UTF_32BE) {
11313 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11315 else if (encidx == ENCINDEX_UTF_32LE) {
11316 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11318 else {
11319 DEFAULT_REPLACE_CHAR("?");
11322 while (p < e) {
11323 int ret = rb_enc_precise_mbclen(p, e, enc);
11324 if (MBCLEN_NEEDMORE_P(ret)) {
11325 break;
11327 else if (MBCLEN_CHARFOUND_P(ret)) {
11328 p += MBCLEN_CHARFOUND_LEN(ret);
11330 else if (MBCLEN_INVALID_P(ret)) {
11331 const char *q = p;
11332 long clen = rb_enc_mbmaxlen(enc);
11333 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11334 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11336 if (e - p < clen) clen = e - p;
11337 if (clen <= mbminlen * 2) {
11338 clen = mbminlen;
11340 else {
11341 clen -= mbminlen;
11342 for (; clen > mbminlen; clen-=mbminlen) {
11343 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11344 if (MBCLEN_NEEDMORE_P(ret)) break;
11345 if (MBCLEN_INVALID_P(ret)) continue;
11346 UNREACHABLE;
11349 if (rep) {
11350 rb_str_buf_cat(buf, rep, replen);
11352 else {
11353 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11354 str_mod_check(str, sp, slen);
11355 repl = str_compat_and_valid(repl, enc);
11356 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11358 p += clen;
11359 p1 = p;
11361 else {
11362 UNREACHABLE;
11365 if (NIL_P(buf)) {
11366 if (p == e) {
11367 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
11368 return Qnil;
11370 buf = rb_str_buf_new(RSTRING_LEN(str));
11372 if (p1 < p) {
11373 rb_str_buf_cat(buf, p1, p - p1);
11375 if (p < e) {
11376 if (rep) {
11377 rb_str_buf_cat(buf, rep, replen);
11379 else {
11380 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11381 str_mod_check(str, sp, slen);
11382 repl = str_compat_and_valid(repl, enc);
11383 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11386 cr = ENC_CODERANGE_VALID;
11388 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11389 return buf;
11393 * call-seq:
11394 * scrub(replacement_string = default_replacement) -> new_string
11395 * scrub{|bytes| ... } -> new_string
11397 * :include: doc/string/scrub.rdoc
11400 static VALUE
11401 str_scrub(int argc, VALUE *argv, VALUE str)
11403 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11404 VALUE new = rb_str_scrub(str, repl);
11405 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11409 * call-seq:
11410 * scrub! -> self
11411 * scrub!(replacement_string = default_replacement) -> self
11412 * scrub!{|bytes| ... } -> self
11414 * Like String#scrub, except that any replacements are made in +self+.
11417 static VALUE
11418 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11420 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11421 VALUE new = rb_str_scrub(str, repl);
11422 if (!NIL_P(new)) rb_str_replace(str, new);
11423 return str;
11426 static ID id_normalize;
11427 static ID id_normalized_p;
11428 static VALUE mUnicodeNormalize;
11430 static VALUE
11431 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11433 static int UnicodeNormalizeRequired = 0;
11434 VALUE argv2[2];
11436 if (!UnicodeNormalizeRequired) {
11437 rb_require("unicode_normalize/normalize.rb");
11438 UnicodeNormalizeRequired = 1;
11440 argv2[0] = str;
11441 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11442 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11446 * call-seq:
11447 * unicode_normalize(form = :nfc) -> string
11449 * Returns a copy of +self+ with
11450 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11452 * Argument +form+ must be one of the following symbols
11453 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11455 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11456 * - +:nfd+: Canonical decomposition.
11457 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11458 * - +:nfkd+: Compatibility decomposition.
11460 * The encoding of +self+ must be one of:
11462 * - Encoding::UTF_8
11463 * - Encoding::UTF_16BE
11464 * - Encoding::UTF_16LE
11465 * - Encoding::UTF_32BE
11466 * - Encoding::UTF_32LE
11467 * - Encoding::GB18030
11468 * - Encoding::UCS_2BE
11469 * - Encoding::UCS_4BE
11471 * Examples:
11473 * "a\u0300".unicode_normalize # => "a"
11474 * "\u00E0".unicode_normalize(:nfd) # => "a "
11476 * Related: String#unicode_normalize!, String#unicode_normalized?.
11478 static VALUE
11479 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11481 return unicode_normalize_common(argc, argv, str, id_normalize);
11485 * call-seq:
11486 * unicode_normalize!(form = :nfc) -> self
11488 * Like String#unicode_normalize, except that the normalization
11489 * is performed on +self+.
11491 * Related String#unicode_normalized?.
11494 static VALUE
11495 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11497 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11500 /* call-seq:
11501 * unicode_normalized?(form = :nfc) -> true or false
11503 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11504 * +false+ otherwise.
11505 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11507 * Examples:
11509 * "a\u0300".unicode_normalized? # => false
11510 * "a\u0300".unicode_normalized?(:nfd) # => true
11511 * "\u00E0".unicode_normalized? # => true
11512 * "\u00E0".unicode_normalized?(:nfd) # => false
11515 * Raises an exception if +self+ is not in a Unicode encoding:
11517 * s = "\xE0".force_encoding('ISO-8859-1')
11518 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11520 * Related: String#unicode_normalize, String#unicode_normalize!.
11523 static VALUE
11524 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11526 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11529 /**********************************************************************
11530 * Document-class: Symbol
11532 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11534 * You can create a +Symbol+ object explicitly with:
11536 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11538 * The same +Symbol+ object will be
11539 * created for a given name or string for the duration of a program's
11540 * execution, regardless of the context or meaning of that name. Thus
11541 * if <code>Fred</code> is a constant in one context, a method in
11542 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11543 * will be the same object in all three contexts.
11545 * module One
11546 * class Fred
11547 * end
11548 * $f1 = :Fred
11549 * end
11550 * module Two
11551 * Fred = 1
11552 * $f2 = :Fred
11553 * end
11554 * def Fred()
11555 * end
11556 * $f3 = :Fred
11557 * $f1.object_id #=> 2514190
11558 * $f2.object_id #=> 2514190
11559 * $f3.object_id #=> 2514190
11561 * Constant, method, and variable names are returned as symbols:
11563 * module One
11564 * Two = 2
11565 * def three; 3 end
11566 * @four = 4
11567 * @@five = 5
11568 * $six = 6
11569 * end
11570 * seven = 7
11572 * One.constants
11573 * # => [:Two]
11574 * One.instance_methods(true)
11575 * # => [:three]
11576 * One.instance_variables
11577 * # => [:@four]
11578 * One.class_variables
11579 * # => [:@@five]
11580 * global_variables.grep(/six/)
11581 * # => [:$six]
11582 * local_variables
11583 * # => [:seven]
11585 * A +Symbol+ object differs from a String object in that
11586 * a +Symbol+ object represents an identifier, while a String object
11587 * represents text or data.
11589 * == What's Here
11591 * First, what's elsewhere. \Class +Symbol+:
11593 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11594 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11596 * Here, class +Symbol+ provides methods that are useful for:
11598 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11599 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11600 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11602 * === Methods for Querying
11604 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11605 * - #=~: Returns the index of the first substring in symbol that matches a
11606 * given Regexp or other object; returns +nil+ if no match is found.
11607 * - #[], #slice : Returns a substring of symbol
11608 * determined by a given index, start/length, or range, or string.
11609 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11610 * - #encoding: Returns the Encoding object that represents the encoding
11611 * of symbol.
11612 * - #end_with?: Returns +true+ if symbol ends with
11613 * any of the given strings.
11614 * - #match: Returns a MatchData object if symbol
11615 * matches a given Regexp; +nil+ otherwise.
11616 * - #match?: Returns +true+ if symbol
11617 * matches a given Regexp; +false+ otherwise.
11618 * - #length, #size: Returns the number of characters in symbol.
11619 * - #start_with?: Returns +true+ if symbol starts with
11620 * any of the given strings.
11622 * === Methods for Comparing
11624 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11625 * or larger than symbol.
11626 * - #==, #===: Returns +true+ if a given symbol has the same content and
11627 * encoding.
11628 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11629 * symbol is smaller than, equal to, or larger than symbol.
11630 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11631 * after Unicode case folding; +false+ otherwise.
11633 * === Methods for Converting
11635 * - #capitalize: Returns symbol with the first character upcased
11636 * and all other characters downcased.
11637 * - #downcase: Returns symbol with all characters downcased.
11638 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11639 * - #name: Returns the frozen string corresponding to symbol.
11640 * - #succ, #next: Returns the symbol that is the successor to symbol.
11641 * - #swapcase: Returns symbol with all upcase characters downcased
11642 * and all downcase characters upcased.
11643 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11644 * - #to_s, #id2name: Returns the string corresponding to +self+.
11645 * - #to_sym, #intern: Returns +self+.
11646 * - #upcase: Returns symbol with all characters upcased.
11652 * call-seq:
11653 * symbol == object -> true or false
11655 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11658 #define sym_equal rb_obj_equal /* alias registered as Symbol#== and Symbol#=== in Init_String */
11660 static int
11661 sym_printable(const char *s, const char *send, rb_encoding *enc)
11663 while (s < send) {
11664 int n;
11665 int c = rb_enc_precise_mbclen(s, send, enc);
11667 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11668 n = MBCLEN_CHARFOUND_LEN(c);
11669 c = rb_enc_mbc_to_codepoint(s, send, enc);
11670 if (!rb_enc_isprint(c, enc)) return FALSE;
11671 s += n;
11673 return TRUE;
11677 rb_str_symname_p(VALUE sym)
11679 rb_encoding *enc;
11680 const char *ptr;
11681 long len;
11682 rb_encoding *resenc = rb_default_internal_encoding();
11684 if (resenc == NULL) resenc = rb_default_external_encoding();
11685 enc = STR_ENC_GET(sym);
11686 ptr = RSTRING_PTR(sym);
11687 len = RSTRING_LEN(sym);
11688 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11689 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11690 return FALSE;
11692 return TRUE;
11695 VALUE
11696 rb_str_quote_unprintable(VALUE str)
11698 rb_encoding *enc;
11699 const char *ptr;
11700 long len;
11701 rb_encoding *resenc;
11703 Check_Type(str, T_STRING);
11704 resenc = rb_default_internal_encoding();
11705 if (resenc == NULL) resenc = rb_default_external_encoding();
11706 enc = STR_ENC_GET(str);
11707 ptr = RSTRING_PTR(str);
11708 len = RSTRING_LEN(str);
11709 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11710 !sym_printable(ptr, ptr + len, enc)) {
11711 return rb_str_escape(str);
11713 return str;
11716 VALUE
11717 rb_id_quote_unprintable(ID id)
11719 VALUE str = rb_id2str(id);
11720 if (!rb_str_symname_p(str)) {
11721 return rb_str_escape(str);
11723 return str;
11727 * call-seq:
11728 * inspect -> string
11730 * Returns a string representation of +self+ (including the leading colon):
11732 * :foo.inspect # => ":foo"
11734 * Related: Symbol#to_s, Symbol#name.
11738 static VALUE
11739 sym_inspect(VALUE sym)
11741 VALUE str = rb_sym2str(sym);
11742 const char *ptr;
11743 long len;
11744 char *dest;
11746 if (!rb_str_symname_p(str)) {
11747 str = rb_str_inspect(str);
11748 len = RSTRING_LEN(str);
11749 rb_str_resize(str, len + 1);
11750 dest = RSTRING_PTR(str);
11751 memmove(dest + 1, dest, len);
11753 else {
11754 rb_encoding *enc = STR_ENC_GET(str);
11755 VALUE orig_str = str;
11757 len = RSTRING_LEN(orig_str);
11758 str = rb_enc_str_new(0, len + 1, enc);
11760 // Get data pointer after allocation
11761 ptr = RSTRING_PTR(orig_str);
11762 dest = RSTRING_PTR(str);
11763 memcpy(dest + 1, ptr, len);
11765 RB_GC_GUARD(orig_str);
11767 dest[0] = ':';
11769 RUBY_ASSERT_BUILTIN_TYPE(str, T_STRING);
11771 return str;
11774 VALUE
11775 rb_sym_to_s(VALUE sym)
11777 return str_new_shared(rb_cString, rb_sym2str(sym));
11780 VALUE
11781 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11783 VALUE obj;
11785 if (argc < 1) {
11786 rb_raise(rb_eArgError, "no receiver given");
11788 obj = argv[0];
11789 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11793 * call-seq:
11794 * succ
11796 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11798 * :foo.succ # => :fop
11800 * Related: String#succ.
11803 static VALUE
11804 sym_succ(VALUE sym)
11806 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11810 * call-seq:
11811 * symbol <=> object -> -1, 0, +1, or nil
11813 * If +object+ is a symbol,
11814 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11816 * :bar <=> :foo # => -1
11817 * :foo <=> :foo # => 0
11818 * :foo <=> :bar # => 1
11820 * Otherwise, returns +nil+:
11822 * :foo <=> 'bar' # => nil
11824 * Related: String#<=>.
11827 static VALUE
11828 sym_cmp(VALUE sym, VALUE other)
11830 if (!SYMBOL_P(other)) {
11831 return Qnil;
11833 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11837 * call-seq:
11838 * casecmp(object) -> -1, 0, 1, or nil
11840 * :include: doc/symbol/casecmp.rdoc
11844 static VALUE
11845 sym_casecmp(VALUE sym, VALUE other)
11847 if (!SYMBOL_P(other)) {
11848 return Qnil;
11850 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11854 * call-seq:
11855 * casecmp?(object) -> true, false, or nil
11857 * :include: doc/symbol/casecmp_p.rdoc
11861 static VALUE
11862 sym_casecmp_p(VALUE sym, VALUE other)
11864 if (!SYMBOL_P(other)) {
11865 return Qnil;
11867 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11871 * call-seq:
11872 * symbol =~ object -> integer or nil
11874 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11875 * including possible updates to global variables;
11876 * see String#=~.
11880 static VALUE
11881 sym_match(VALUE sym, VALUE other)
11883 return rb_str_match(rb_sym2str(sym), other);
11887 * call-seq:
11888 * match(pattern, offset = 0) -> matchdata or nil
11889 * match(pattern, offset = 0) {|matchdata| } -> object
11891 * Equivalent to <tt>self.to_s.match</tt>,
11892 * including possible updates to global variables;
11893 * see String#match.
11897 static VALUE
11898 sym_match_m(int argc, VALUE *argv, VALUE sym)
11900 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11904 * call-seq:
11905 * match?(pattern, offset = 0) -> true or false
11907 * Equivalent to <tt>self.to_s.match?</tt>;
11908 * see String#match?.
11912 static VALUE
11913 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11915 return rb_str_match_m_p(argc, argv, sym);
11919 * call-seq:
11920 * symbol[index] -> string or nil
11921 * symbol[start, length] -> string or nil
11922 * symbol[range] -> string or nil
11923 * symbol[regexp, capture = 0] -> string or nil
11924 * symbol[substring] -> string or nil
11926 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11930 static VALUE
11931 sym_aref(int argc, VALUE *argv, VALUE sym)
11933 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11937 * call-seq:
11938 * length -> integer
11940 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11943 static VALUE
11944 sym_length(VALUE sym)
11946 return rb_str_length(rb_sym2str(sym));
11950 * call-seq:
11951 * empty? -> true or false
11953 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11957 static VALUE
11958 sym_empty(VALUE sym)
11960 return rb_str_empty(rb_sym2str(sym));
11964 * call-seq:
11965 * upcase(*options) -> symbol
11967 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11969 * See String#upcase.
11973 static VALUE
11974 sym_upcase(int argc, VALUE *argv, VALUE sym)
11976 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11980 * call-seq:
11981 * downcase(*options) -> symbol
11983 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11985 * See String#downcase.
11987 * Related: Symbol#upcase.
11991 static VALUE
11992 sym_downcase(int argc, VALUE *argv, VALUE sym)
11994 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11998 * call-seq:
11999 * capitalize(*options) -> symbol
12001 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12003 * See String#capitalize.
12007 static VALUE
12008 sym_capitalize(int argc, VALUE *argv, VALUE sym)
12010 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12014 * call-seq:
12015 * swapcase(*options) -> symbol
12017 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12019 * See String#swapcase.
12023 static VALUE
12024 sym_swapcase(int argc, VALUE *argv, VALUE sym)
12026 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12030 * call-seq:
12031 * start_with?(*string_or_regexp) -> true or false
12033 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12037 static VALUE
12038 sym_start_with(int argc, VALUE *argv, VALUE sym)
12040 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12044 * call-seq:
12045 * end_with?(*strings) -> true or false
12048 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12052 static VALUE
12053 sym_end_with(int argc, VALUE *argv, VALUE sym)
12055 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12059 * call-seq:
12060 * encoding -> encoding
12062 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12066 static VALUE
12067 sym_encoding(VALUE sym)
12069 return rb_obj_encoding(rb_sym2str(sym));
12072 static VALUE
12073 string_for_symbol(VALUE name)
12075 if (!RB_TYPE_P(name, T_STRING)) {
12076 VALUE tmp = rb_check_string_type(name);
12077 if (NIL_P(tmp)) {
12078 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
12079 name);
12081 name = tmp;
12083 return name;
12087 rb_to_id(VALUE name)
12089 if (SYMBOL_P(name)) {
12090 return SYM2ID(name);
12092 name = string_for_symbol(name);
12093 return rb_intern_str(name);
12096 VALUE
12097 rb_to_symbol(VALUE name)
12099 if (SYMBOL_P(name)) {
12100 return name;
12102 name = string_for_symbol(name);
12103 return rb_str_intern(name);
12107 * call-seq:
12108 * Symbol.all_symbols -> array_of_symbols
12110 * Returns an array of all symbols currently in Ruby's symbol table:
12112 * Symbol.all_symbols.size # => 9334
12113 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12117 static VALUE
12118 sym_all_symbols(VALUE _)
12120 return rb_sym_all_symbols();
12123 VALUE
12124 rb_str_to_interned_str(VALUE str)
12126 return rb_fstring(str);
12129 VALUE
12130 rb_interned_str(const char *ptr, long len)
12132 struct RString fake_str;
12133 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
12136 VALUE
12137 rb_interned_str_cstr(const char *ptr)
12139 return rb_interned_str(ptr, strlen(ptr));
12142 VALUE
12143 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12145 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12146 rb_enc_autoload(enc);
12149 struct RString fake_str;
12150 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
12153 VALUE
12154 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12156 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12159 void
12160 Init_String(void)
12162 rb_cString = rb_define_class("String", rb_cObject);
12163 RUBY_ASSERT(rb_vm_fstring_table());
12164 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12165 rb_include_module(rb_cString, rb_mComparable);
12166 rb_define_alloc_func(rb_cString, empty_str_alloc);
12167 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12168 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12169 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12170 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12171 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12172 rb_define_method(rb_cString, "==", rb_str_equal, 1);
12173 rb_define_method(rb_cString, "===", rb_str_equal, 1);
12174 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12175 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12176 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12177 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12178 rb_define_method(rb_cString, "+", rb_str_plus, 1);
12179 rb_define_method(rb_cString, "*", rb_str_times, 1);
12180 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12181 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12182 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12183 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12184 rb_define_method(rb_cString, "length", rb_str_length, 0);
12185 rb_define_method(rb_cString, "size", rb_str_length, 0);
12186 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12187 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12188 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12189 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12190 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12191 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12192 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12193 rb_define_method(rb_cString, "next", rb_str_succ, 0);
12194 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12195 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12196 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12197 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12198 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12199 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12200 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12201 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12202 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12203 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12204 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12205 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12206 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12207 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12208 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12209 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12210 rb_define_method(rb_cString, "+@", str_uplus, 0);
12211 rb_define_method(rb_cString, "-@", str_uminus, 0);
12212 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12213 rb_define_alias(rb_cString, "dedup", "-@");
12215 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12216 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12217 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12218 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12219 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12220 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12221 rb_define_method(rb_cString, "undump", str_undump, 0);
12223 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12224 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12225 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12226 sym_fold = ID2SYM(rb_intern_const("fold"));
12228 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12229 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12230 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12231 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12233 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12234 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12235 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12236 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12238 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12239 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12240 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12241 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12242 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12243 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12244 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12245 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12246 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12247 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12248 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12249 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12250 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12251 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12252 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12253 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12254 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12256 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12257 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12258 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12260 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12262 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12263 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12264 rb_define_method(rb_cString, "center", rb_str_center, -1);
12266 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12267 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12268 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12269 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12270 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12271 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12272 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12273 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12274 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12276 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12277 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12278 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12279 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12280 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12281 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12282 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12283 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12284 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12286 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12287 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12288 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12289 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12290 rb_define_method(rb_cString, "count", rb_str_count, -1);
12292 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12293 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12294 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12295 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12297 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12298 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12299 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12300 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12301 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12303 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12305 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12306 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12308 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12309 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12311 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12312 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12313 rb_define_method(rb_cString, "b", rb_str_b, 0);
12314 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12315 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12317 /* define UnicodeNormalize module here so that we don't have to look it up */
12318 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12319 id_normalize = rb_intern_const("normalize");
12320 id_normalized_p = rb_intern_const("normalized?");
12322 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12323 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12324 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12326 rb_fs = Qnil;
12327 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12328 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12329 rb_gc_register_address(&rb_fs);
12331 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12332 rb_include_module(rb_cSymbol, rb_mComparable);
12333 rb_undef_alloc_func(rb_cSymbol);
12334 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12335 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12337 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12338 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12339 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12340 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12341 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12342 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12343 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12345 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12346 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12347 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12348 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12350 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12351 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12352 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12353 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12354 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12355 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12356 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12358 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12359 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12360 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12361 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12363 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12364 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12366 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);