1 /**********************************************************************
6 created at: Mon Aug 9 18:24:49 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
10 **********************************************************************/
12 #include "ruby/internal/config.h"
18 #include "internal/hash.h"
19 #include "internal/imemo.h"
20 #include "internal/re.h"
21 #include "internal/string.h"
22 #include "internal/variable.h"
24 #include "ruby/encoding.h"
26 #include "ruby/util.h"
28 VALUE rb_eRegexpError
;
30 typedef char onig_errmsg_buffer
[ONIG_MAX_ERROR_MESSAGE_LEN
];
31 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
33 #define BEG(no) (regs->beg[(no)])
34 #define END(no) (regs->end[(no)])
36 #if 'a' == 97 /* it's ascii */
/*
 * Byte-indexed ASCII case-folding table.  Entry N holds N itself, except
 * that the entries for 'A'..'Z' ('\101'..'\132') hold the corresponding
 * lowercase letters ('\141'..'\172').  Bytes '\200' and above map to
 * themselves.
 */
37 static const char casetable
[] = {
38 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
39 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
40 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
41 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
42 /* ' ' '!' '"' '#' '$' '%' '&' ''' */
43 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
44 /* '(' ')' '*' '+' ',' '-' '.' '/' */
45 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
46 /* '0' '1' '2' '3' '4' '5' '6' '7' */
47 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
48 /* '8' '9' ':' ';' '<' '=' '>' '?' */
49 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
50 /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */
51 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
52 /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */
53 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
54 /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */
55 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
56 /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */
57 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
58 /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */
59 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
60 /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */
61 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
62 /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */
63 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
64 /* 'x' 'y' 'z' '{' '|' '}' '~' */
65 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
66 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
67 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
68 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
69 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
70 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
71 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
72 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
73 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
74 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
75 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
76 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
77 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
78 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
79 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
80 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
81 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
84 # error >>> "You lose. You will need a translation table for your character set." <<<
/*
 * Case-insensitive comparison of the byte buffers x and y.
 *
 * Each byte is folded through casetable before the subtraction, so ASCII
 * upper- and lowercase letters compare as equal; a nonzero difference is
 * captured in tmp.
 *
 * NOTE(review): the loop over len and the return statements are not visible
 * in this excerpt -- presumably tmp is returned on the first mismatch;
 * confirm against the full source.
 */
88 rb_memcicmp(const void *x
, const void *y
, long len
)
90 const unsigned char *p1
= x
, *p2
= y
;
94 if ((tmp
= casetable
[(unsigned)*p1
++] - casetable
[(unsigned)*p2
++]))
/*
 * memmem()-based variant: looks for the m-byte pattern xs inside the
 * n-byte buffer ys (ys/n are passed to memmem() as the haystack, xs/m as
 * the needle).
 *
 * NOTE(review): a second rb_memsearch_ss definition follows; presumably a
 * preprocessor conditional (not visible here) selects between them.  The
 * value returned on a hit or miss is also outside this excerpt.
 */
102 rb_memsearch_ss(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
104 const unsigned char *y
;
106 if ((y
= memmem(ys
, n
, xs
, m
)) != NULL
)
/*
 * Hand-rolled variant for short patterns: searches the n-byte buffer ys
 * for the m-byte pattern xs by accumulating bytes into VALUE-sized hash
 * words (hx for the pattern, hy for the text).
 *
 * The pattern must fit in one VALUE: m > SIZEOF_VALUE is reported through
 * rb_bug().
 */
113 rb_memsearch_ss(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
115 const unsigned char *x
= xs
, *xe
= xs
+ m
;
116 const unsigned char *y
= ys
, *ye
= ys
+ n
;
117 #define VALUE_MAX ((VALUE)~(VALUE)0)
/*
 * NOTE(review): mask is initialised before the m > SIZEOF_VALUE check
 * below, so for an oversized m the shift count is already negative when
 * it is evaluated.  The visible caller only dispatches here when
 * m <= SIZEOF_VALUE; confirm there are no other callers.
 */
118 VALUE hx
, hy
, mask
= VALUE_MAX
>> ((SIZEOF_VALUE
- m
) * CHAR_BIT
);
120 if (m
> SIZEOF_VALUE
)
121 rb_bug("!!too long pattern string!!");
/* Jump to the first occurrence of the pattern's first byte among the
 * positions where a full match can still start (n - m + 1 of them). */
123 if (!(y
= memchr(y
, *x
, n
- m
+ 1)))
126 /* Prepare hash value */
127 for (hx
= *x
++, hy
= *y
++; x
< xe
; ++x
, ++y
) {
/*
 * Searches the n-byte buffer ys for the m-byte pattern xs using a
 * 256-entry shift table (qstable) indexed by byte value.
 *
 * For every pattern byte the table stores its distance from the pattern
 * end (xe - x).  After each candidate position the scan pointer advances
 * by the table entry for y[m], the byte just past the current window.
 *
 * NOTE(review): when y + m == ys + n the increment reads ys[n], one byte
 * past the stated length -- assumes the buffer has a readable byte there
 * (e.g. a terminator); confirm with callers.
 */
147 rb_memsearch_qs(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
149 const unsigned char *x
= xs
, *xe
= xs
+ m
;
150 const unsigned char *y
= ys
;
151 VALUE i
, qstable
[256];
154 for (i
= 0; i
< 256; ++i
)
157 qstable
[*x
] = xe
- x
;
159 for (; y
+ m
<= ys
+ n
; y
+= *(qstable
+ y
[m
])) {
/* Cheap first-byte test before the full memcmp(). */
160 if (*xs
== *y
&& memcmp(xs
, y
, m
) == 0)
/*
 * Computes a small table index for the character starting at x.
 * The hash is seeded with the first byte (*x) and uses the constant 8353
 * as mixing factor; the result is truncated to an unsigned char, so the
 * returned value is in 0..255.
 *
 * NOTE(review): the mixing steps themselves are not visible in this
 * excerpt.
 */
166 static inline unsigned int
167 rb_memsearch_qs_utf8_hash(const unsigned char *x
)
169 register const unsigned int mix
= 8353;
170 register unsigned int h
= *x
;
195 return (unsigned char)h
;
/*
 * Shift-table search like rb_memsearch_qs, but the table is indexed by
 * rb_memsearch_qs_utf8_hash() of the position instead of by the raw byte.
 *
 * qstable has 512 slots; each pattern position stores its distance from
 * the pattern end (xe - x) under its hash.  The scan pointer advances by
 * the entry for the hash taken at y + m, just past the current window.
 *
 * NOTE(review): as in rb_memsearch_qs, the hash is taken at ys + n on the
 * last iteration -- assumes readable bytes past the stated length; confirm.
 */
199 rb_memsearch_qs_utf8(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
201 const unsigned char *x
= xs
, *xe
= xs
+ m
;
202 const unsigned char *y
= ys
;
203 VALUE i
, qstable
[512];
206 for (i
= 0; i
< 512; ++i
) {
209 for (; x
< xe
; ++x
) {
210 qstable
[rb_memsearch_qs_utf8_hash(x
)] = xe
- x
;
213 for (; y
+ m
<= ys
+ n
; y
+= qstable
[rb_memsearch_qs_utf8_hash(y
+m
)]) {
214 if (*xs
== *y
&& memcmp(xs
, y
, m
) == 0)
/*
 * Linear search for the m-byte pattern xs in the n-byte buffer ys that
 * only tests candidate positions at multiples of char_size (2 bytes).
 *
 * n is first reduced by m so that it counts the remaining valid start
 * offsets; each step compares the first byte and then the remaining
 * m - 1 bytes with memcmp().
 */
221 rb_memsearch_wchar(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
223 const unsigned char *x
= xs
, x0
= *xs
, *y
= ys
;
224 enum {char_size
= 2};
226 for (n
-= m
; n
>= 0; n
-= char_size
, y
+= char_size
) {
227 if (x0
== *y
&& memcmp(x
+1, y
+1, m
-1) == 0)
/*
 * Linear search for the m-byte pattern xs in the n-byte buffer ys that
 * only tests candidate positions at multiples of char_size (4 bytes).
 *
 * Same structure as rb_memsearch_wchar: n is reduced by m up front, and
 * each step compares the first byte and then the remaining m - 1 bytes.
 */
234 rb_memsearch_qchar(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
236 const unsigned char *x
= xs
, x0
= *xs
, *y
= ys
;
237 enum {char_size
= 4};
239 for (n
-= m
; n
>= 0; n
-= char_size
, y
+= char_size
) {
240 if (x0
== *y
&& memcmp(x
+1, y
+1, m
-1) == 0)
/*
 * Entry point for substring search: looks for the m-byte pattern x0 in
 * the n-byte buffer y0 and picks a strategy from the encoding.
 *
 * Visible dispatch:
 *   - m > n                         -> -1 (cannot match)
 *   - a memcmp() shortcut yielding 0 or -1, and a memchr() shortcut
 *     (NOTE(review): their guarding conditions are not visible here;
 *     presumably m == n and m == 1 respectively -- confirm)
 *   - rb_enc_mbminlen(enc) == 1     -> rb_memsearch_ss when
 *                                      m <= SIZEOF_VALUE, or
 *                                      rb_memsearch_qs_utf8 for UTF-8
 *   - rb_enc_mbminlen(enc) == 2     -> rb_memsearch_wchar
 *   - rb_enc_mbminlen(enc) == 4     -> rb_memsearch_qchar
 *   - otherwise                     -> rb_memsearch_qs
 */
247 rb_memsearch(const void *x0
, long m
, const void *y0
, long n
, rb_encoding
*enc
)
249 const unsigned char *x
= x0
, *y
= y0
;
251 if (m
> n
) return -1;
253 return memcmp(x0
, y0
, m
) == 0 ? 0 : -1;
259 const unsigned char *ys
= memchr(y
, *x
, n
);
266 else if (LIKELY(rb_enc_mbminlen(enc
) == 1)) {
267 if (m
<= SIZEOF_VALUE
) {
268 return rb_memsearch_ss(x0
, m
, y0
, n
);
270 else if (enc
== rb_utf8_encoding()){
271 return rb_memsearch_qs_utf8(x0
, m
, y0
, n
);
274 else if (LIKELY(rb_enc_mbminlen(enc
) == 2)) {
275 return rb_memsearch_wchar(x0
, m
, y0
, n
);
277 else if (LIKELY(rb_enc_mbminlen(enc
) == 4)) {
278 return rb_memsearch_qchar(x0
, m
, y0
, n
);
280 return rb_memsearch_qs(x0
, m
, y0
, n
);
283 #define REG_LITERAL FL_USER5
284 #define REG_ENCODING_NONE FL_USER6
286 #define KCODE_FIXED FL_USER4
288 #define ARG_REG_OPTION_MASK \
289 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
290 #define ARG_ENCODING_FIXED 16
291 #define ARG_ENCODING_NONE 32
294 char_to_option(int c
)
300 val
= ONIG_OPTION_IGNORECASE
;
303 val
= ONIG_OPTION_EXTEND
;
306 val
= ONIG_OPTION_MULTILINE
;
315 enum { OPTBUF_SIZE
= 4 };
/*
 * Writes the option letters for `options` into str, always in the order
 * 'm' (ONIG_OPTION_MULTILINE), 'i' (ONIG_OPTION_IGNORECASE),
 * 'x' (ONIG_OPTION_EXTEND); flags that are not set contribute nothing.
 *
 * str has room for OPTBUF_SIZE (4) chars, i.e. the three letters plus one
 * more byte -- presumably the terminating NUL, which is written outside
 * this excerpt (callers test *option_to_str(...), so the buffer itself
 * appears to be returned; confirm).
 */
318 option_to_str(char str
[OPTBUF_SIZE
], int options
)
321 if (options
& ONIG_OPTION_MULTILINE
) *p
++ = 'm';
322 if (options
& ONIG_OPTION_IGNORECASE
) *p
++ = 'i';
323 if (options
& ONIG_OPTION_EXTEND
) *p
++ = 'x';
329 rb_char_to_option_kcode(int c
, int *option
, int *kcode
)
335 *kcode
= rb_ascii8bit_encindex();
336 return (*option
= ARG_ENCODING_NONE
);
338 *kcode
= ENCINDEX_EUC_JP
;
341 *kcode
= ENCINDEX_Windows_31J
;
344 *kcode
= rb_utf8_encindex();
348 return (*option
= char_to_option(c
));
350 *option
= ARG_ENCODING_FIXED
;
/*
 * Guards against use of a Regexp that has not been initialized: raises
 * TypeError ("uninitialized Regexp") when the compiled pattern pointer,
 * the source string, or the source string's buffer is missing.
 */
355 rb_reg_check(VALUE re
)
357 if (!RREGEXP_PTR(re
) || !RREGEXP_SRC(re
) || !RREGEXP_SRC_PTR(re
)) {
358 rb_raise(rb_eTypeError
, "uninitialized Regexp");
363 rb_reg_expr_str(VALUE str
, const char *s
, long len
,
364 rb_encoding
*enc
, rb_encoding
*resenc
, int term
)
366 const char *p
, *pend
;
367 int cr
= ENC_CODERANGE_UNKNOWN
;
371 p
= s
; pend
= p
+ len
;
372 rb_str_coderange_scan_restartable(p
, pend
, enc
, &cr
);
373 if (rb_enc_asciicompat(enc
) && ENC_CODERANGE_CLEAN_P(cr
)) {
375 c
= rb_enc_ascget(p
, pend
, &clen
, enc
);
378 p
+= mbclen(p
, pend
, enc
);
385 else if (c
!= term
&& rb_enc_isprint(c
, enc
)) {
399 rb_str_buf_cat(str
, s
, len
);
402 int unicode_p
= rb_enc_unicode_p(enc
);
405 c
= rb_enc_ascget(p
, pend
, &clen
, enc
);
406 if (c
== '\\' && p
+clen
< pend
) {
407 int n
= clen
+ mbclen(p
+clen
, pend
, enc
);
408 rb_str_buf_cat(str
, p
, n
);
413 clen
= rb_enc_precise_mbclen(p
, pend
, enc
);
414 if (!MBCLEN_CHARFOUND_P(clen
)) {
415 c
= (unsigned char)*p
;
420 unsigned int c
= rb_enc_mbc_to_codepoint(p
, pend
, enc
);
421 rb_str_buf_cat_escaped_char(str
, c
, unicode_p
);
424 clen
= MBCLEN_CHARFOUND_LEN(clen
);
425 rb_str_buf_cat(str
, p
, clen
);
428 else if (c
== term
) {
430 rb_str_buf_cat(str
, &c
, 1);
431 rb_str_buf_cat(str
, p
, clen
);
433 else if (rb_enc_isprint(c
, enc
)) {
434 rb_str_buf_cat(str
, p
, clen
);
436 else if (!rb_enc_isspace(c
, enc
)) {
440 snprintf(b
, sizeof(b
), "\\x%02X", c
);
441 rb_str_buf_cat(str
, b
, 4);
444 rb_str_buf_cat(str
, p
, clen
);
452 rb_reg_desc(const char *s
, long len
, VALUE re
)
454 rb_encoding
*enc
= rb_enc_get(re
);
455 VALUE str
= rb_str_buf_new2("/");
456 rb_encoding
*resenc
= rb_default_internal_encoding();
457 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
459 if (re
&& rb_enc_asciicompat(enc
)) {
460 rb_enc_copy(str
, re
);
463 rb_enc_associate(str
, rb_usascii_encoding());
465 rb_reg_expr_str(str
, s
, len
, enc
, resenc
, '/');
466 rb_str_buf_cat2(str
, "/");
468 char opts
[OPTBUF_SIZE
];
470 if (*option_to_str(opts
, RREGEXP_PTR(re
)->options
))
471 rb_str_buf_cat2(str
, opts
);
472 if (RBASIC(re
)->flags
& REG_ENCODING_NONE
)
473 rb_str_buf_cat2(str
, "n");
483 * Returns the original string of the pattern.
485 * /ab+c/ix.source #=> "ab+c"
487 * Note that escape sequences are retained as is.
489 * /\x20\+/.source #=> "\\x20\\+"
494 rb_reg_source(VALUE re
)
499 str
= rb_str_dup(RREGEXP_SRC(re
));
505 * rxp.inspect -> string
507 * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
508 * <code>#inspect</code> actually produces the more natural version of
509 * the string than <code>#to_s</code>.
511 * /ab+c/ix.inspect #=> "/ab+c/ix"
/*
 * Implementation of Regexp#inspect.  An uninitialized Regexp (missing
 * compiled pattern or source) falls back to rb_any_to_s() instead of
 * raising; otherwise the description is built by rb_reg_desc() from the
 * source bytes.
 */
516 rb_reg_inspect(VALUE re
)
518 if (!RREGEXP_PTR(re
) || !RREGEXP_SRC(re
) || !RREGEXP_SRC_PTR(re
)) {
519 return rb_any_to_s(re
);
521 return rb_reg_desc(RREGEXP_SRC_PTR(re
), RREGEXP_SRC_LEN(re
), re
);
524 static VALUE
rb_reg_str_with_term(VALUE re
, int term
);
530 * Returns a string containing the regular expression and its options (using the
531 * <code>(?opts:source)</code> notation). This string can be fed back in to
532 * Regexp::new to create a regular expression with the same semantics as the
533 * original. (However, <code>Regexp#==</code> may not return true
534 * when comparing the two, as the source of the regular expression
535 * itself may differ, as the example shows). Regexp#inspect produces
536 * a generally more readable version of <i>rxp</i>.
538 * r1 = /ab+c/ix #=> /ab+c/ix
539 * s1 = r1.to_s #=> "(?ix-m:ab+c)"
540 * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/
542 * r1.source #=> "ab+c"
543 * r2.source #=> "(?ix-m:ab+c)"
547 rb_reg_to_s(VALUE re
)
549 return rb_reg_str_with_term(re
, '/');
553 rb_reg_str_with_term(VALUE re
, int term
)
556 const int embeddable
= ONIG_OPTION_MULTILINE
|ONIG_OPTION_IGNORECASE
|ONIG_OPTION_EXTEND
;
559 VALUE str
= rb_str_buf_new2("(?");
560 char optbuf
[OPTBUF_SIZE
+ 1]; /* for '-' */
561 rb_encoding
*enc
= rb_enc_get(re
);
565 rb_enc_copy(str
, re
);
566 options
= RREGEXP_PTR(re
)->options
;
567 ptr
= (UChar
*)RREGEXP_SRC_PTR(re
);
568 len
= RREGEXP_SRC_LEN(re
);
570 if (len
>= 4 && ptr
[0] == '(' && ptr
[1] == '?') {
573 if ((len
-= 2) > 0) {
575 opt
= char_to_option((int )*ptr
);
585 if (len
> 1 && *ptr
== '-') {
589 opt
= char_to_option((int )*ptr
);
604 if (*ptr
== ':' && ptr
[len
-1] == ')') {
606 VALUE verbose
= ruby_verbose
;
607 ruby_verbose
= Qfalse
;
611 err
= onig_new(&rp
, ptr
, ptr
+ len
, options
,
612 enc
, OnigDefaultSyntax
, NULL
);
614 ruby_verbose
= verbose
;
617 options
= RREGEXP_PTR(re
)->options
;
618 ptr
= (UChar
*)RREGEXP_SRC_PTR(re
);
619 len
= RREGEXP_SRC_LEN(re
);
623 if (*option_to_str(optbuf
, options
)) rb_str_buf_cat2(str
, optbuf
);
625 if ((options
& embeddable
) != embeddable
) {
627 option_to_str(optbuf
+ 1, ~options
);
628 rb_str_buf_cat2(str
, optbuf
);
631 rb_str_buf_cat2(str
, ":");
632 if (rb_enc_asciicompat(enc
)) {
633 rb_reg_expr_str(str
, (char*)ptr
, len
, enc
, NULL
, term
);
634 rb_str_buf_cat2(str
, ")");
640 rb_str_buf_cat2(str
, ")");
641 rb_enc_associate(str
, rb_usascii_encoding());
642 str
= rb_str_encode(str
, rb_enc_from_encoding(enc
), 0, Qnil
);
644 /* backup encoded ")" to paren */
645 s
= RSTRING_PTR(str
);
646 e
= RSTRING_END(str
);
647 s
= rb_enc_left_char_head(s
, e
-1, e
, enc
);
649 paren
= ALLOCA_N(char, n
);
651 rb_str_resize(str
, RSTRING_LEN(str
) - n
);
653 rb_reg_expr_str(str
, (char*)ptr
, len
, enc
, NULL
, term
);
654 rb_str_buf_cat(str
, paren
, n
);
656 rb_enc_copy(str
, re
);
661 NORETURN(static void rb_reg_raise(const char *s
, long len
, const char *err
, VALUE re
));
664 rb_reg_raise(const char *s
, long len
, const char *err
, VALUE re
)
666 VALUE desc
= rb_reg_desc(s
, len
, re
);
668 rb_raise(rb_eRegexpError
, "%s: %"PRIsVALUE
, err
, desc
);
672 rb_enc_reg_error_desc(const char *s
, long len
, rb_encoding
*enc
, int options
, const char *err
)
674 char opts
[OPTBUF_SIZE
+ 1]; /* for '/' */
675 VALUE desc
= rb_str_buf_new2(err
);
676 rb_encoding
*resenc
= rb_default_internal_encoding();
677 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
679 rb_enc_associate(desc
, enc
);
680 rb_str_buf_cat2(desc
, ": /");
681 rb_reg_expr_str(desc
, s
, len
, enc
, resenc
, '/');
683 option_to_str(opts
+ 1, options
);
684 rb_str_buf_cat2(desc
, opts
);
685 return rb_exc_new3(rb_eRegexpError
, desc
);
688 NORETURN(static void rb_enc_reg_raise(const char *s
, long len
, rb_encoding
*enc
, int options
, const char *err
));
691 rb_enc_reg_raise(const char *s
, long len
, rb_encoding
*enc
, int options
, const char *err
)
693 rb_exc_raise(rb_enc_reg_error_desc(s
, len
, enc
, options
, err
));
697 rb_reg_error_desc(VALUE str
, int options
, const char *err
)
699 return rb_enc_reg_error_desc(RSTRING_PTR(str
), RSTRING_LEN(str
),
700 rb_enc_get(str
), options
, err
);
703 NORETURN(static void rb_reg_raise_str(VALUE str
, int options
, const char *err
));
706 rb_reg_raise_str(VALUE str
, int options
, const char *err
)
708 rb_exc_raise(rb_reg_error_desc(str
, options
, err
));
714 * rxp.casefold? -> true or false
716 * Returns the value of the case-insensitive flag.
718 * /a/.casefold? #=> false
719 * /a/i.casefold? #=> true
720 * /(?i:a)/.casefold? #=> false
724 rb_reg_casefold_p(VALUE re
)
727 return RBOOL(RREGEXP_PTR(re
)->options
& ONIG_OPTION_IGNORECASE
);
733 * rxp.options -> integer
735 * Returns the set of bits corresponding to the options used when
736 * creating this Regexp (see Regexp::new for details). Note that
737 * additional bits may be set in the returned options: these are used
738 * internally by the regular expression code. These extra bits are
739 * ignored if the options are passed to Regexp::new.
741 * Regexp::IGNORECASE #=> 1
742 * Regexp::EXTENDED #=> 2
743 * Regexp::MULTILINE #=> 4
745 * /cat/.options #=> 0
746 * /cat/ix.options #=> 3
747 * Regexp.new('cat', true).options #=> 1
748 * /\xa1\xa2/e.options #=> 16
751 * Regexp.new(r.source, r.options) #=> /cat/ix
755 rb_reg_options_m(VALUE re
)
757 int options
= rb_reg_options(re
);
758 return INT2NUM(options
);
/*
 * onig_foreach_name() callback used by rb_reg_names: `arg` carries the
 * result array (a VALUE smuggled through void *), and each group name
 * [name, name_end) is appended to it as a new string tagged with the
 * regex's encoding.  back_num and back_refs are not used in the visible
 * lines.
 */
762 reg_names_iter(const OnigUChar
*name
, const OnigUChar
*name_end
,
763 int back_num
, int *back_refs
, OnigRegex regex
, void *arg
)
765 VALUE ary
= (VALUE
)arg
;
766 rb_ary_push(ary
, rb_enc_str_new((const char *)name
, name_end
-name
, regex
->enc
));
772 * rxp.names -> [name1, name2, ...]
774 * Returns a list of names of captures as an array of strings.
776 * /(?<foo>.)(?<bar>.)(?<baz>.)/.names
777 * #=> ["foo", "bar", "baz"]
779 * /(?<foo>.)(?<foo>.)/.names
787 rb_reg_names(VALUE re
)
791 ary
= rb_ary_new_capa(onig_number_of_names(RREGEXP_PTR(re
)));
792 onig_foreach_name(RREGEXP_PTR(re
), reg_names_iter
, (void*)ary
);
/*
 * onig_foreach_name() callback used by rb_reg_named_captures: `arg`
 * carries the result hash.  For each group name it builds an array of the
 * back_num group numbers in back_refs and stores it under the name.
 *
 * NOTE(review): the key is created with rb_str_new(), i.e. without the
 * regex's encoding, whereas reg_names_iter uses rb_enc_str_new() with
 * regex->enc -- check whether the difference is intentional.
 */
797 reg_named_captures_iter(const OnigUChar
*name
, const OnigUChar
*name_end
,
798 int back_num
, int *back_refs
, OnigRegex regex
, void *arg
)
800 VALUE hash
= (VALUE
)arg
;
801 VALUE ary
= rb_ary_new2(back_num
);
804 for (i
= 0; i
< back_num
; i
++)
805 rb_ary_store(ary
, i
, INT2NUM(back_refs
[i
]));
807 rb_hash_aset(hash
, rb_str_new((const char*)name
, name_end
-name
),ary
);
814 * rxp.named_captures -> hash
816 * Returns a hash representing information about named captures of <i>rxp</i>.
818 * A key of the hash is a name of the named captures.
819 * A value of the hash is an array which is the list of indexes of the corresponding
822 * /(?<foo>.)(?<bar>.)/.named_captures
823 * #=> {"foo"=>[1], "bar"=>[2]}
825 * /(?<foo>.)(?<foo>.)/.named_captures
826 * #=> {"foo"=>[1, 2]}
828 * If there are no named captures, an empty hash is returned.
830 * /(.)(.)/.named_captures
835 rb_reg_named_captures(VALUE re
)
837 regex_t
*reg
= (rb_reg_check(re
), RREGEXP_PTR(re
));
838 VALUE hash
= rb_hash_new_with_size(onig_number_of_names(reg
));
839 onig_foreach_name(reg
, reg_named_captures_iter
, (void*)hash
);
844 onig_new_with_source(regex_t
** reg
, const UChar
* pattern
, const UChar
* pattern_end
,
845 OnigOptionType option
, OnigEncoding enc
, const OnigSyntaxType
* syntax
,
846 OnigErrorInfo
* einfo
, const char *sourcefile
, int sourceline
)
850 *reg
= (regex_t
* )malloc(sizeof(regex_t
));
851 if (IS_NULL(*reg
)) return ONIGERR_MEMORY
;
853 r
= onig_reg_init(*reg
, option
, ONIGENC_CASE_FOLD_DEFAULT
, enc
, syntax
);
856 r
= onig_compile_ruby(*reg
, pattern
, pattern_end
, einfo
, sourcefile
, sourceline
);
866 make_regexp(const char *s
, long len
, rb_encoding
*enc
, int flags
, onig_errmsg_buffer err
,
867 const char *sourcefile
, int sourceline
)
873 /* Handle escaped characters first. */
875 /* Build a copy of the string (in dest) with the
876 escaped characters translated, and generate the regex
880 r
= onig_new_with_source(&rp
, (UChar
*)s
, (UChar
*)(s
+ len
), flags
,
881 enc
, OnigDefaultSyntax
, &einfo
, sourcefile
, sourceline
);
883 onig_error_code_to_str((UChar
*)err
, r
, &einfo
);
891 * Document-class: MatchData
893 * MatchData encapsulates the result of matching a Regexp against
894 * a string. It is returned by Regexp#match and String#match, and also
895 * stored in a global variable returned by Regexp.last_match.
899 * url = 'https://docs.ruby-lang.org/en/2.5.0/MatchData.html'
900 * m = url.match(/(\d\.?)+/) # => #<MatchData "2.5.0" 1:"0">
901 * m.string # => "https://docs.ruby-lang.org/en/2.5.0/MatchData.html"
902 * m.regexp # => /(\d\.?)+/
903 * # entire matched substring:
906 * # Working with unnamed captures
907 * m = url.match(%r{([^/]+)/([^/]+)\.html$})
908 * m.captures # => ["2.5.0", "MatchData"]
910 * m.values_at(1, 2) # => ["2.5.0", "MatchData"]
912 * # Working with named captures
913 * m = url.match(%r{(?<version>[^/]+)/(?<module>[^/]+)\.html$})
914 * m.captures # => ["2.5.0", "MatchData"]
915 * m.named_captures # => {"version"=>"2.5.0", "module"=>"MatchData"}
916 * m[:version] # => "2.5.0"
917 * m.values_at(:version, :module)
918 * # => ["2.5.0", "MatchData"]
919 * # Numerical indexes work, too
921 * m.values_at(1, 2) # => ["2.5.0", "MatchData"]
923 * == Global variables equivalence
925 * Parts of last MatchData (returned by Regexp.last_match) are also
926 * aliased as global variables:
928 * * <code>$~</code> is Regexp.last_match;
929 * * <code>$&</code> is Regexp.last_match<code>[ 0 ]</code>;
930 * * <code>$1</code>, <code>$2</code>, and so on are
931 * Regexp.last_match<code>[ i ]</code> (captures by number);
932 * * <code>$`</code> is Regexp.last_match<code>.pre_match</code>;
933 * * <code>$'</code> is Regexp.last_match<code>.post_match</code>;
934 * * <code>$+</code> is Regexp.last_match<code>[ -1 ]</code> (the last capture).
936 * See also "Special global variables" section in Regexp documentation.
942 match_alloc(VALUE klass
)
944 NEWOBJ_OF(match
, struct RMatch
, klass
, T_MATCH
);
949 match
->rmatch
= ZALLOC(struct rmatch
);
/*
 * Copies the match registers `from` into `to`.
 *
 * Returns 0 once `to->allocated` is nonzero after a copy, and
 * ONIGERR_MEMORY if it is still zero after the second attempt.
 *
 * NOTE(review): a statement between the two attempts is missing from this
 * excerpt -- presumably something that frees memory before the retry;
 * confirm against the full source.
 */
955 rb_reg_region_copy(struct re_registers
*to
, const struct re_registers
*from
)
957 onig_region_copy(to
, (OnigRegion
*)from
);
958 if (to
->allocated
) return 0;
960 onig_region_copy(to
, (OnigRegion
*)from
);
961 if (to
->allocated
) return 0;
962 return ONIGERR_MEMORY
;
/*
 * Comparison callback (used with qsort()/bsearch() in update_char_offset)
 * that orders pair_t entries by their byte_pos field.
 *
 * When long is wider than int the difference cannot be returned directly,
 * so it is collapsed to -1, 0 or 1.
 * NOTE(review): the branch for SIZEOF_LONG <= SIZEOF_INT is not visible
 * in this excerpt.
 */
971 pair_byte_cmp(const void *pair1
, const void *pair2
)
973 long diff
= ((pair_t
*)pair1
)->byte_pos
- ((pair_t
*)pair2
)->byte_pos
;
974 #if SIZEOF_LONG > SIZEOF_INT
975 return diff
? diff
> 0 ? 1 : -1 : 0;
982 update_char_offset(VALUE match
)
984 struct rmatch
*rm
= RMATCH(match
)->rmatch
;
985 struct re_registers
*regs
;
986 int i
, num_regs
, num_pos
;
992 if (rm
->char_offset_num_allocated
)
996 num_regs
= rm
->regs
.num_regs
;
998 if (rm
->char_offset_num_allocated
< num_regs
) {
999 REALLOC_N(rm
->char_offset
, struct rmatch_offset
, num_regs
);
1000 rm
->char_offset_num_allocated
= num_regs
;
1003 enc
= rb_enc_get(RMATCH(match
)->str
);
1004 if (rb_enc_mbmaxlen(enc
) == 1) {
1005 for (i
= 0; i
< num_regs
; i
++) {
1006 rm
->char_offset
[i
].beg
= BEG(i
);
1007 rm
->char_offset
[i
].end
= END(i
);
1012 pairs
= ALLOCA_N(pair_t
, num_regs
*2);
1014 for (i
= 0; i
< num_regs
; i
++) {
1017 pairs
[num_pos
++].byte_pos
= BEG(i
);
1018 pairs
[num_pos
++].byte_pos
= END(i
);
1020 qsort(pairs
, num_pos
, sizeof(pair_t
), pair_byte_cmp
);
1022 s
= p
= RSTRING_PTR(RMATCH(match
)->str
);
1024 for (i
= 0; i
< num_pos
; i
++) {
1025 q
= s
+ pairs
[i
].byte_pos
;
1026 c
+= rb_enc_strlen(p
, q
, enc
);
1027 pairs
[i
].char_pos
= c
;
1031 for (i
= 0; i
< num_regs
; i
++) {
1034 rm
->char_offset
[i
].beg
= -1;
1035 rm
->char_offset
[i
].end
= -1;
1039 key
.byte_pos
= BEG(i
);
1040 found
= bsearch(&key
, pairs
, num_pos
, sizeof(pair_t
), pair_byte_cmp
);
1041 rm
->char_offset
[i
].beg
= found
->char_pos
;
1043 key
.byte_pos
= END(i
);
1044 found
= bsearch(&key
, pairs
, num_pos
, sizeof(pair_t
), pair_byte_cmp
);
1045 rm
->char_offset
[i
].end
= found
->char_pos
;
1050 match_check(VALUE match
)
1052 if (!RMATCH(match
)->regexp
) {
1053 rb_raise(rb_eTypeError
, "uninitialized MatchData");
1059 match_init_copy(VALUE obj
, VALUE orig
)
1063 if (!OBJ_INIT_COPY(obj
, orig
)) return obj
;
1065 RMATCH(obj
)->str
= RMATCH(orig
)->str
;
1066 RMATCH(obj
)->regexp
= RMATCH(orig
)->regexp
;
1068 rm
= RMATCH(obj
)->rmatch
;
1069 if (rb_reg_region_copy(&rm
->regs
, RMATCH_REGS(orig
)))
1072 if (RMATCH(orig
)->rmatch
->char_offset_num_allocated
) {
1073 if (rm
->char_offset_num_allocated
< rm
->regs
.num_regs
) {
1074 REALLOC_N(rm
->char_offset
, struct rmatch_offset
, rm
->regs
.num_regs
);
1075 rm
->char_offset_num_allocated
= rm
->regs
.num_regs
;
1077 MEMCPY(rm
->char_offset
, RMATCH(orig
)->rmatch
->char_offset
,
1078 struct rmatch_offset
, rm
->regs
.num_regs
);
1088 * mtch.regexp -> regexp
1090 * Returns the regexp.
1092 * m = /a.*b/.match("abc")
1093 * m.regexp #=> /a.*b/
/*
 * Implementation of MatchData#regexp.
 *
 * A match may carry nil instead of a regexp (match_set_string stores
 * Qnil).  In that case a regexp is synthesised by quoting the text of the
 * whole match (group 0) and compiling it, and the result is cached back
 * into the match so later calls reuse it.
 */
1097 match_regexp(VALUE match
)
1101 regexp
= RMATCH(match
)->regexp
;
1102 if (NIL_P(regexp
)) {
1103 VALUE str
= rb_reg_nth_match(0, match
);
1104 regexp
= rb_reg_regcomp(rb_reg_quote(str
));
1105 RMATCH(match
)->regexp
= regexp
;
1112 * mtch.names -> [name1, name2, ...]
1114 * Returns a list of names of captures as an array of strings.
1115 * This is the same as mtch.regexp.names.
1117 * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
1118 * #=> ["foo", "bar", "baz"]
1120 * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
1121 * m.names #=> ["x", "y"]
1125 match_names(VALUE match
)
1128 if (NIL_P(RMATCH(match
)->regexp
))
1129 return rb_ary_new_capa(0);
1130 return rb_reg_names(RMATCH(match
)->regexp
);
1135 * mtch.length -> integer
1136 * mtch.size -> integer
1138 * Returns the number of elements in the match array.
1140 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1146 match_size(VALUE match
)
1149 return INT2FIX(RMATCH_REGS(match
)->num_regs
);
1152 static int name_to_backref_number(struct re_registers
*, VALUE
, const char*, const char*);
1153 NORETURN(static void name_to_backref_error(VALUE name
));
1156 name_to_backref_error(VALUE name
)
1158 rb_raise(rb_eIndexError
, "undefined group name reference: % "PRIsVALUE
,
/*
 * Validates a group index against the registers of a match: raises
 * IndexError ("index %d out of matches") when i is negative or not less
 * than regs->num_regs.
 */
1163 backref_number_check(struct re_registers
*regs
, int i
)
1165 if (i
< 0 || regs
->num_regs
<= i
)
1166 rb_raise(rb_eIndexError
, "index %d out of matches", i
);
/*
 * Resolves a back-reference designator to a group number.
 *
 *   - Symbol: converted to its string and handled as a name.
 *   - Anything that is not a String: converted with NUM2INT() and
 *     returned directly (no range check here).
 *   - String: looked up with name_to_backref_number(); an unknown name
 *     ends in name_to_backref_error(), which raises IndexError.
 *
 * NOTE(review): the condition guarding name_to_backref_error() and the
 * final return are not visible in this excerpt.
 */
1170 match_backref_number(VALUE match
, VALUE backref
)
1175 struct re_registers
*regs
= RMATCH_REGS(match
);
1176 VALUE regexp
= RMATCH(match
)->regexp
;
1179 if (SYMBOL_P(backref
)) {
1180 backref
= rb_sym2str(backref
);
1182 else if (!RB_TYPE_P(backref
, T_STRING
)) {
1183 return NUM2INT(backref
);
1185 name
= StringValueCStr(backref
);
1187 num
= name_to_backref_number(regs
, regexp
, name
, name
+ RSTRING_LEN(backref
));
1190 name_to_backref_error(backref
);
1197 rb_reg_backref_number(VALUE match
, VALUE backref
)
1199 return match_backref_number(match
, backref
);
1204 * mtch.offset(n) -> array
1206 * Returns a two-element array containing the beginning and ending offsets of
1207 * the <em>n</em>th match.
1208 * <em>n</em> can be a string or symbol to reference a named capture.
1210 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1211 * m.offset(0) #=> [1, 7]
1212 * m.offset(4) #=> [6, 7]
1214 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1215 * p m.offset(:foo) #=> [0, 1]
1216 * p m.offset(:bar) #=> [2, 3]
/*
 * Implementation of MatchData#offset(n): returns a two-element array with
 * the begin and end offsets of group n, taken from the char_offset table
 * that update_char_offset() fills in.
 *
 * n is resolved with match_backref_number() and range-checked with
 * backref_number_check().  A [nil, nil] pair is returned on one path
 * whose condition is not visible in this excerpt -- presumably when the
 * group did not participate in the match; confirm.
 */
1221 match_offset(VALUE match
, VALUE n
)
1223 int i
= match_backref_number(match
, n
);
1224 struct re_registers
*regs
= RMATCH_REGS(match
);
1227 backref_number_check(regs
, i
);
1230 return rb_assoc_new(Qnil
, Qnil
);
1232 update_char_offset(match
);
1233 return rb_assoc_new(INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].beg
),
1234 INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].end
));
1240 * mtch.begin(n) -> integer
1242 * Returns the offset of the start of the <em>n</em>th element of the match
1243 * array in the string.
1244 * <em>n</em> can be a string or symbol to reference a named capture.
1246 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1250 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1251 * p m.begin(:foo) #=> 0
1252 * p m.begin(:bar) #=> 2
1256 match_begin(VALUE match
, VALUE n
)
1258 int i
= match_backref_number(match
, n
);
1259 struct re_registers
*regs
= RMATCH_REGS(match
);
1262 backref_number_check(regs
, i
);
1267 update_char_offset(match
);
1268 return INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].beg
);
1274 * mtch.end(n) -> integer
1276 * Returns the offset of the character immediately following the end of the
1277 * <em>n</em>th element of the match array in the string.
1278 * <em>n</em> can be a string or symbol to reference a named capture.
1280 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1284 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1285 * p m.end(:foo) #=> 1
1286 * p m.end(:bar) #=> 3
1290 match_end(VALUE match
, VALUE n
)
1292 int i
= match_backref_number(match
, n
);
1293 struct re_registers
*regs
= RMATCH_REGS(match
);
1296 backref_number_check(regs
, i
);
1301 update_char_offset(match
);
1302 return INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].end
);
1307 * mtch.match(n) -> string or nil
1309 * Returns the captured substring corresponding to the argument.
1310 * <em>n</em> can be a string or symbol to reference a named capture.
1312 * m = /(.)(.)(\d+)(\d)(\w)?/.match("THX1138.")
1313 * m.match(0) #=> "HX1138"
1314 * m.match(4) #=> "8"
1315 * m.match(5) #=> nil
1317 * m = /(?<foo>.)(.)(?<bar>.+)/.match("hoge")
1318 * m.match(:foo) #=> "h"
1319 * m.match(:bar) #=> "ge"
1324 match_nth(VALUE match
, VALUE n
)
1326 int i
= match_backref_number(match
, n
);
1327 struct re_registers
*regs
= RMATCH_REGS(match
);
1329 backref_number_check(regs
, i
);
1331 long start
= BEG(i
), end
= END(i
);
1335 return rb_str_subseq(RMATCH(match
)->str
, start
, end
- start
);
1340 * mtch.match_length(n) -> integer or nil
1342 * Returns the length of the captured substring corresponding to the argument.
1343 * <em>n</em> can be a string or symbol to reference a named capture.
1345 * m = /(.)(.)(\d+)(\d)(\w)?/.match("THX1138.")
1346 * m.match_length(0) #=> 6
1347 * m.match_length(4) #=> 1
1348 * m.match_length(5) #=> nil
1350 * m = /(?<foo>.)(.)(?<bar>.+)/.match("hoge")
1351 * m.match_length(:foo) #=> 1
1352 * m.match_length(:bar) #=> 2
/*
 * Implementation of MatchData#match_length(n): returns the length of
 * group n as an Integer, computed as end - beg from the char_offset entry
 * that update_char_offset() fills in.
 *
 * n is resolved with match_backref_number() and range-checked with
 * backref_number_check().
 * NOTE(review): the documented nil result for an unmatched group comes
 * from lines not visible in this excerpt.
 */
1357 match_nth_length(VALUE match
, VALUE n
)
1359 int i
= match_backref_number(match
, n
);
1360 struct re_registers
*regs
= RMATCH_REGS(match
);
1363 backref_number_check(regs
, i
);
1368 update_char_offset(match
);
1369 const struct rmatch_offset
*const ofs
=
1370 &RMATCH(match
)->rmatch
->char_offset
[i
];
1371 return LONG2NUM(ofs
->end
- ofs
->beg
);
1374 #define MATCH_BUSY FL_USER2
1377 rb_match_busy(VALUE match
)
1379 FL_SET(match
, MATCH_BUSY
);
1383 rb_match_unbusy(VALUE match
)
1385 FL_UNSET(match
, MATCH_BUSY
);
/*
 * Returns the number of registers (whole match plus groups) recorded in
 * `match`, or -1 when match is nil or has no register set.
 */
1389 rb_match_count(VALUE match
)
1391 struct re_registers
*regs
;
1392 if (NIL_P(match
)) return -1;
1393 regs
= RMATCH_REGS(match
);
1394 if (!regs
) return -1;
1395 return regs
->num_regs
;
/*
 * Reports whether group `nth` of `match` took part in the match, i.e.
 * whether its begin offset is not -1.
 *
 * Returns FALSE for a nil match or a match without registers.  An index
 * is offset by regs->num_regs on one path and rejected if the result is
 * <= 0.
 * NOTE(review): the guard for that path (presumably nth < 0) and the
 * handling of nth >= num_regs are not visible in this excerpt.
 */
1399 rb_match_nth_defined(int nth
, VALUE match
)
1401 struct re_registers
*regs
;
1402 if (NIL_P(match
)) return FALSE
;
1403 regs
= RMATCH_REGS(match
);
1404 if (!regs
) return FALSE
;
1405 if (nth
>= regs
->num_regs
) {
1409 nth
+= regs
->num_regs
;
1410 if (nth
<= 0) return FALSE
;
1412 return (BEG(nth
) != -1);
/*
 * Turns the match object m into the result of a plain substring match:
 * records `string` as the subject, clears the regexp to nil, and resizes
 * the registers to a single entry covering bytes [pos, pos + len).
 *
 * Calls rb_memerror() if the register resize fails.
 */
1416 match_set_string(VALUE m
, VALUE string
, long pos
, long len
)
1418 struct RMatch
*match
= (struct RMatch
*)m
;
1419 struct rmatch
*rmatch
= match
->rmatch
;
1421 match
->str
= string
;
1422 match
->regexp
= Qnil
;
1423 int err
= onig_region_resize(&rmatch
->regs
, 1);
1424 if (err
) rb_memerror();
1425 rmatch
->regs
.beg
[0] = pos
;
1426 rmatch
->regs
.end
[0] = pos
+ len
;
/*
 * Records a substring match of `string` at [pos, pos + len) as the
 * current backref.
 *
 * The existing backref object is reused unless it is nil or flagged
 * MATCH_BUSY, in which case a fresh MatchData is allocated so the busy
 * one is left untouched.
 */
1430 rb_backref_set_string(VALUE string
, long pos
, long len
)
1432 VALUE match
= rb_backref_get();
1433 if (NIL_P(match
) || FL_TEST(match
, MATCH_BUSY
)) {
1434 match
= match_alloc(rb_cMatch
);
1436 match_set_string(match
, string
, pos
, len
);
1437 rb_backref_set(match
);
1442 * rxp.fixed_encoding? -> true or false
1444 * Returns false if rxp is applicable to
1445 * a string with any ASCII compatible encoding.
1446 * Returns true otherwise.
1449 * r.fixed_encoding? #=> false
1450 * r =~ "\u{6666} a" #=> 2
1451 * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2
1452 * r =~ "abc".force_encoding("euc-jp") #=> 0
1455 * r.fixed_encoding? #=> true
1456 * r.encoding #=> #<Encoding:UTF-8>
1457 * r =~ "\u{6666} a" #=> 2
1458 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> Encoding::CompatibilityError
1459 * r =~ "abc".force_encoding("euc-jp") #=> 0
1462 * r.fixed_encoding? #=> true
1463 * r.encoding #=> #<Encoding:UTF-8>
1464 * r =~ "\u{6666} a" #=> 0
1465 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> Encoding::CompatibilityError
1466 * r =~ "abc".force_encoding("euc-jp") #=> nil
1470 rb_reg_fixed_encoding_p(VALUE re
)
1472 return RBOOL(FL_TEST(re
, KCODE_FIXED
));
1476 rb_reg_preprocess(const char *p
, const char *end
, rb_encoding
*enc
,
1477 rb_encoding
**fixed_enc
, onig_errmsg_buffer err
);
1479 NORETURN(static void reg_enc_error(VALUE re
, VALUE str
));
1482 reg_enc_error(VALUE re
, VALUE str
)
1484 rb_raise(rb_eEncCompatError
,
1485 "incompatible encoding regexp match (%s regexp with %s string)",
1486 rb_enc_name(rb_enc_get(re
)),
1487 rb_enc_name(rb_enc_get(str
)));
1491 str_coderange(VALUE str
)
1493 int cr
= ENC_CODERANGE(str
);
1494 if (cr
== ENC_CODERANGE_UNKNOWN
) {
1495 cr
= rb_enc_str_coderange(str
);
/*
 * Chooses the encoding to use when matching regexp `re` against `str`,
 * raising when the two cannot be combined.
 *
 * Visible decision chain, starting from enc = encoding of str:
 *   - str has a broken code range        -> ArgumentError
 *                                           ("invalid byte sequence in ...")
 *   - regexp encoding == enc             -> (body not visible here)
 *   - str is 7-bit and the regexp is
 *     US-ASCII                           -> use the regexp's encoding
 *   - enc is not ASCII-compatible        -> reg_enc_error()
 *   - regexp has a fixed encoding        -> reg_enc_error() unless the
 *                                           regexp encoding is
 *                                           ASCII-compatible and str is
 *                                           7-bit; then use the regexp's
 *                                           encoding
 *   - `warn` set, regexp flagged
 *     REG_ENCODING_NONE, enc not
 *     ASCII-8BIT, str not 7-bit          -> rb_warn() about a historical
 *                                           binary regexp match
 *
 * NOTE(review): the return statement is outside this excerpt; callers
 * assign the result to an rb_encoding *.
 */
1501 rb_reg_prepare_enc(VALUE re
, VALUE str
, int warn
)
1503 rb_encoding
*enc
= 0;
1504 int cr
= str_coderange(str
);
1506 if (cr
== ENC_CODERANGE_BROKEN
) {
1507 rb_raise(rb_eArgError
,
1508 "invalid byte sequence in %s",
1509 rb_enc_name(rb_enc_get(str
)));
1513 enc
= rb_enc_get(str
);
1514 if (RREGEXP_PTR(re
)->enc
== enc
) {
1516 else if (cr
== ENC_CODERANGE_7BIT
&&
1517 RREGEXP_PTR(re
)->enc
== rb_usascii_encoding()) {
1518 enc
= RREGEXP_PTR(re
)->enc
;
1520 else if (!rb_enc_asciicompat(enc
)) {
1521 reg_enc_error(re
, str
);
1523 else if (rb_reg_fixed_encoding_p(re
)) {
1524 if ((!rb_enc_asciicompat(RREGEXP_PTR(re
)->enc
) ||
1525 cr
!= ENC_CODERANGE_7BIT
)) {
1526 reg_enc_error(re
, str
);
1528 enc
= RREGEXP_PTR(re
)->enc
;
1530 else if (warn
&& (RBASIC(re
)->flags
& REG_ENCODING_NONE
) &&
1531 enc
!= rb_ascii8bit_encoding() &&
1532 cr
!= ENC_CODERANGE_7BIT
) {
1533 rb_warn("historical binary regexp match /.../n against %s string",
1540 rb_reg_prepare_re0(VALUE re
, VALUE str
, onig_errmsg_buffer err
)
1542 regex_t
*reg
= RREGEXP_PTR(re
);
1544 OnigErrorInfo einfo
;
1545 const char *pattern
;
1547 rb_encoding
*fixed_enc
= 0;
1548 rb_encoding
*enc
= rb_reg_prepare_enc(re
, str
, 1);
1550 if (reg
->enc
== enc
) return reg
;
1553 reg
= RREGEXP_PTR(re
);
1554 pattern
= RREGEXP_SRC_PTR(re
);
1556 unescaped
= rb_reg_preprocess(
1557 pattern
, pattern
+ RREGEXP_SRC_LEN(re
), enc
,
1560 if (NIL_P(unescaped
)) {
1561 rb_raise(rb_eArgError
, "regexp preprocess failed: %s", err
);
1566 RSTRING_GETMEM(unescaped
, ptr
, len
);
1567 r
= onig_new(®
, (UChar
*)ptr
, (UChar
*)(ptr
+ len
),
1569 OnigDefaultSyntax
, &einfo
);
1571 onig_error_code_to_str((UChar
*)err
, r
, &einfo
);
1572 rb_reg_raise(pattern
, RREGEXP_SRC_LEN(re
), err
, re
);
1575 RB_GC_GUARD(unescaped
);
1580 rb_reg_prepare_re(VALUE re
, VALUE str
)
1582 onig_errmsg_buffer err
= "";
1583 return rb_reg_prepare_re0(re
, str
, err
);
1587 rb_reg_adjust_startpos(VALUE re
, VALUE str
, long pos
, int reverse
)
1593 enc
= rb_reg_prepare_enc(re
, str
, 0);
1599 range
= RSTRING_LEN(str
) - pos
;
1602 if (pos
> 0 && ONIGENC_MBC_MAXLEN(enc
) != 1 && pos
< RSTRING_LEN(str
)) {
1603 string
= (UChar
*)RSTRING_PTR(str
);
1606 p
= onigenc_get_right_adjust_char_head(enc
, string
, string
+ pos
, string
+ RSTRING_LEN(str
));
1609 p
= ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc
, string
, string
+ pos
, string
+ RSTRING_LEN(str
));
1617 /* returns byte offset */
1619 rb_reg_search_set_match(VALUE re
, VALUE str
, long pos
, int reverse
, int set_backref_str
, VALUE
*set_match
)
1623 struct re_registers regi
, *regs
= ®i
;
1624 char *start
, *range
;
1628 onig_errmsg_buffer err
= "";
1630 RSTRING_GETMEM(str
, start
, len
);
1632 if (pos
> len
|| pos
< 0) {
1633 rb_backref_set(Qnil
);
1637 reg
= rb_reg_prepare_re0(re
, str
, err
);
1638 tmpreg
= reg
!= RREGEXP_PTR(re
);
1639 if (!tmpreg
) RREGEXP(re
)->usecnt
++;
1641 MEMZERO(regs
, struct re_registers
, 1);
1645 result
= onig_search(reg
,
1647 ((UChar
*)(start
+ len
)),
1648 ((UChar
*)(start
+ pos
)),
1650 regs
, ONIG_OPTION_NONE
);
1651 if (!tmpreg
) RREGEXP(re
)->usecnt
--;
1653 if (RREGEXP(re
)->usecnt
) {
1657 onig_free(RREGEXP_PTR(re
));
1658 RREGEXP_PTR(re
) = reg
;
1663 onig_region_free(regs
, 0);
1664 if (result
== ONIG_MISMATCH
) {
1665 rb_backref_set(Qnil
);
1669 onig_error_code_to_str((UChar
*)err
, (int)result
);
1670 rb_reg_raise(RREGEXP_SRC_PTR(re
), RREGEXP_SRC_LEN(re
), err
, re
);
1674 match
= match_alloc(rb_cMatch
);
1675 int copy_err
= rb_reg_region_copy(RMATCH_REGS(match
), regs
);
1676 onig_region_free(regs
, 0);
1677 if (copy_err
) rb_memerror();
1679 if (set_backref_str
) {
1680 RMATCH(match
)->str
= rb_str_new4(str
);
1683 RMATCH(match
)->regexp
= re
;
1684 rb_backref_set(match
);
1685 if (set_match
) *set_match
= match
;
1691 rb_reg_search0(VALUE re
, VALUE str
, long pos
, int reverse
, int set_backref_str
)
1693 return rb_reg_search_set_match(re
, str
, pos
, reverse
, set_backref_str
, NULL
);
1697 rb_reg_search(VALUE re
, VALUE str
, long pos
, int reverse
)
1699 return rb_reg_search0(re
, str
, pos
, reverse
, 1);
1703 rb_reg_start_with_p(VALUE re
, VALUE str
)
1707 struct re_registers regi
, *regs
= ®i
;
1710 onig_errmsg_buffer err
= "";
1712 reg
= rb_reg_prepare_re0(re
, str
, err
);
1713 tmpreg
= reg
!= RREGEXP_PTR(re
);
1714 if (!tmpreg
) RREGEXP(re
)->usecnt
++;
1716 match
= rb_backref_get();
1717 if (!NIL_P(match
)) {
1718 if (FL_TEST(match
, MATCH_BUSY
)) {
1722 regs
= RMATCH_REGS(match
);
1726 MEMZERO(regs
, struct re_registers
, 1);
1730 RSTRING_GETMEM(str
, ptr
, len
);
1731 result
= onig_match(reg
,
1733 ((UChar
*)(ptr
+ len
)),
1735 regs
, ONIG_OPTION_NONE
);
1736 if (!tmpreg
) RREGEXP(re
)->usecnt
--;
1738 if (RREGEXP(re
)->usecnt
) {
1742 onig_free(RREGEXP_PTR(re
));
1743 RREGEXP_PTR(re
) = reg
;
1748 onig_region_free(regs
, 0);
1749 if (result
== ONIG_MISMATCH
) {
1750 rb_backref_set(Qnil
);
1754 onig_error_code_to_str((UChar
*)err
, (int)result
);
1755 rb_reg_raise(RREGEXP_SRC_PTR(re
), RREGEXP_SRC_LEN(re
), err
, re
);
1761 match
= match_alloc(rb_cMatch
);
1762 err
= rb_reg_region_copy(RMATCH_REGS(match
), regs
);
1763 onig_region_free(regs
, 0);
1764 if (err
) rb_memerror();
1767 RMATCH(match
)->str
= rb_str_new4(str
);
1769 RMATCH(match
)->regexp
= re
;
1770 rb_backref_set(match
);
1776 rb_reg_nth_defined(int nth
, VALUE match
)
1778 struct re_registers
*regs
;
1779 if (NIL_P(match
)) return Qnil
;
1781 regs
= RMATCH_REGS(match
);
1782 if (nth
>= regs
->num_regs
) {
1786 nth
+= regs
->num_regs
;
1787 if (nth
<= 0) return Qnil
;
1789 return RBOOL(BEG(nth
) != -1);
1793 rb_reg_nth_match(int nth
, VALUE match
)
1796 long start
, end
, len
;
1797 struct re_registers
*regs
;
1799 if (NIL_P(match
)) return Qnil
;
1801 regs
= RMATCH_REGS(match
);
1802 if (nth
>= regs
->num_regs
) {
1806 nth
+= regs
->num_regs
;
1807 if (nth
<= 0) return Qnil
;
1810 if (start
== -1) return Qnil
;
1813 str
= rb_str_subseq(RMATCH(match
)->str
, start
, len
);
1818 rb_reg_last_match(VALUE match
)
1820 return rb_reg_nth_match(0, match
);
1826 * mtch.pre_match -> str
1828 * Returns the portion of the original string before the current match.
1829 * Equivalent to the special variable <code>$`</code>.
1831 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1832 * m.pre_match #=> "T"
1836 rb_reg_match_pre(VALUE match
)
1839 struct re_registers
*regs
;
1841 if (NIL_P(match
)) return Qnil
;
1843 regs
= RMATCH_REGS(match
);
1844 if (BEG(0) == -1) return Qnil
;
1845 str
= rb_str_subseq(RMATCH(match
)->str
, 0, BEG(0));
1852 * mtch.post_match -> str
1854 * Returns the portion of the original string after the current match.
1855 * Equivalent to the special variable <code>$'</code>.
1857 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1858 * m.post_match #=> ": The Movie"
1862 rb_reg_match_post(VALUE match
)
1866 struct re_registers
*regs
;
1868 if (NIL_P(match
)) return Qnil
;
1870 regs
= RMATCH_REGS(match
);
1871 if (BEG(0) == -1) return Qnil
;
1872 str
= RMATCH(match
)->str
;
1874 str
= rb_str_subseq(str
, pos
, RSTRING_LEN(str
) - pos
);
1879 rb_reg_match_last(VALUE match
)
1882 struct re_registers
*regs
;
1884 if (NIL_P(match
)) return Qnil
;
1886 regs
= RMATCH_REGS(match
);
1887 if (BEG(0) == -1) return Qnil
;
1889 for (i
=regs
->num_regs
-1; BEG(i
) == -1 && i
> 0; i
--)
1891 if (i
== 0) return Qnil
;
1892 return rb_reg_nth_match(i
, match
);
1896 last_match_getter(ID _x
, VALUE
*_y
)
1898 return rb_reg_last_match(rb_backref_get());
1902 prematch_getter(ID _x
, VALUE
*_y
)
1904 return rb_reg_match_pre(rb_backref_get());
1908 postmatch_getter(ID _x
, VALUE
*_y
)
1910 return rb_reg_match_post(rb_backref_get());
1914 last_paren_match_getter(ID _x
, VALUE
*_y
)
1916 return rb_reg_match_last(rb_backref_get());
1920 match_array(VALUE match
, int start
)
1922 struct re_registers
*regs
;
1928 regs
= RMATCH_REGS(match
);
1929 ary
= rb_ary_new2(regs
->num_regs
);
1930 target
= RMATCH(match
)->str
;
1932 for (i
=start
; i
<regs
->num_regs
; i
++) {
1933 if (regs
->beg
[i
] == -1) {
1934 rb_ary_push(ary
, Qnil
);
1937 VALUE str
= rb_str_subseq(target
, regs
->beg
[i
], regs
->end
[i
]-regs
->beg
[i
]);
1938 rb_ary_push(ary
, str
);
1947 * mtch.to_a -> anArray
1949 * Returns the array of matches.
1951 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1952 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1954 * Because <code>to_a</code> is called when expanding
1955 * <code>*</code><em>variable</em>, there's a useful assignment
1956 * shortcut for extracting matched fields. This is slightly slower than
1957 * accessing the fields directly (as an intermediate array is generated).
1960 * all,f1,f2,f3 = * /(.)(.)(\d+)(\d)/.match("THX1138.")
1968 match_to_a(VALUE match
)
1970 return match_array(match
, 0);
1976 * mtch.captures -> array
1978 * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1980 * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1987 match_captures(VALUE match
)
1989 return match_array(match
, 1);
1993 name_to_backref_number(struct re_registers
*regs
, VALUE regexp
, const char* name
, const char* name_end
)
1995 if (NIL_P(regexp
)) return -1;
1996 return onig_name_to_backref_number(RREGEXP_PTR(regexp
),
1997 (const unsigned char *)name
, (const unsigned char *)name_end
, regs
);
/*
 * Group number for a name; evaluates to 0 when re is nil or when the
 * encodings of the regexp source and of name are incompatible.
 */
#define NAME_TO_NUMBER(regs, re, name, name_ptr, name_end) \
    (NIL_P(re) ? 0 : \
     !rb_enc_compatible(RREGEXP_SRC(re), (name)) ? 0 : \
     name_to_backref_number((regs), (re), (name_ptr), (name_end)))
2006 namev_to_backref_number(struct re_registers
*regs
, VALUE re
, VALUE name
)
2010 if (SYMBOL_P(name
)) {
2011 name
= rb_sym2str(name
);
2013 else if (!RB_TYPE_P(name
, T_STRING
)) {
2016 num
= NAME_TO_NUMBER(regs
, re
, name
,
2017 RSTRING_PTR(name
), RSTRING_END(name
));
2019 name_to_backref_error(name
);
2025 match_ary_subseq(VALUE match
, long beg
, long len
, VALUE result
)
2027 long olen
= RMATCH_REGS(match
)->num_regs
;
2028 long j
, end
= olen
< beg
+len
? olen
: beg
+len
;
2029 if (NIL_P(result
)) result
= rb_ary_new_capa(len
);
2030 if (len
== 0) return result
;
2032 for (j
= beg
; j
< end
; j
++) {
2033 rb_ary_push(result
, rb_reg_nth_match((int)j
, match
));
2035 if (beg
+ len
> j
) {
2036 rb_ary_resize(result
, RARRAY_LEN(result
) + (beg
+ len
) - j
);
2042 match_ary_aref(VALUE match
, VALUE idx
, VALUE result
)
2045 int num_regs
= RMATCH_REGS(match
)->num_regs
;
2047 /* check if idx is Range */
2048 switch (rb_range_beg_len(idx
, &beg
, &len
, (long)num_regs
, !NIL_P(result
))) {
2050 if (NIL_P(result
)) return rb_reg_nth_match(NUM2INT(idx
), match
);
2051 rb_ary_push(result
, rb_reg_nth_match(NUM2INT(idx
), match
));
2056 return match_ary_subseq(match
, beg
, len
, result
);
2062 * mtch[i] -> str or nil
2063 * mtch[start, length] -> array
2064 * mtch[range] -> array
2065 * mtch[name] -> str or nil
2067 * Match Reference -- MatchData acts as an array, and may be accessed
2068 * using the normal array indexing techniques. <code>mtch[0]</code>
2069 * is equivalent to the special variable <code>$&</code>, and returns
2070 * the entire matched string. <code>mtch[1]</code>,
2071 * <code>mtch[2]</code>, and so on return the values of the matched
2072 * backreferences (portions of the pattern between parentheses).
2074 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2075 * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
2077 * m[1, 2] #=> ["H", "X"]
2078 * m[1..3] #=> ["H", "X", "113"]
2079 * m[-3, 2] #=> ["X", "113"]
2081 * m = /(?<foo>a+)b/.match("ccaaab")
2082 * m #=> #<MatchData "aaab" foo:"aaa">
2083 * m["foo"] #=> "aaa"
2088 match_aref(int argc
, VALUE
*argv
, VALUE match
)
2093 rb_scan_args(argc
, argv
, "11", &idx
, &length
);
2095 if (NIL_P(length
)) {
2096 if (FIXNUM_P(idx
)) {
2097 return rb_reg_nth_match(FIX2INT(idx
), match
);
2100 int num
= namev_to_backref_number(RMATCH_REGS(match
), RMATCH(match
)->regexp
, idx
);
2102 return rb_reg_nth_match(num
, match
);
2105 return match_ary_aref(match
, idx
, Qnil
);
2110 long beg
= NUM2LONG(idx
);
2111 long len
= NUM2LONG(length
);
2112 long num_regs
= RMATCH_REGS(match
)->num_regs
;
2118 if (beg
< 0) return Qnil
;
2120 else if (beg
> num_regs
) {
2123 else if (beg
+len
> num_regs
) {
2124 len
= num_regs
- beg
;
2126 return match_ary_subseq(match
, beg
, len
, Qnil
);
2133 * mtch.values_at(index, ...) -> array
2135 * Uses each <i>index</i> to access the matching values, returning an array of
2136 * the corresponding matches.
2138 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
2139 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
2140 * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"]
2141 * m.values_at(1..2, -1) #=> ["H", "X", "8"]
2143 * m = /(?<a>\d+) *(?<op>[+\-*\/]) *(?<b>\d+)/.match("1 + 2")
2144 * m.to_a #=> ["1 + 2", "1", "+", "2"]
2145 * m.values_at(:a, :b, :op) #=> ["1", "2", "+"]
2149 match_values_at(int argc
, VALUE
*argv
, VALUE match
)
2155 result
= rb_ary_new2(argc
);
2157 for (i
=0; i
<argc
; i
++) {
2158 if (FIXNUM_P(argv
[i
])) {
2159 rb_ary_push(result
, rb_reg_nth_match(FIX2INT(argv
[i
]), match
));
2162 int num
= namev_to_backref_number(RMATCH_REGS(match
), RMATCH(match
)->regexp
, argv
[i
]);
2164 rb_ary_push(result
, rb_reg_nth_match(num
, match
));
2167 match_ary_aref(match
, argv
[i
], result
);
2179 * Returns the entire matched string.
2181 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2182 * m.to_s #=> "HX1138"
2186 match_to_s(VALUE match
)
2188 VALUE str
= rb_reg_last_match(match
);
2191 if (NIL_P(str
)) str
= rb_str_new(0,0);
2196 match_named_captures_iter(const OnigUChar
*name
, const OnigUChar
*name_end
,
2197 int back_num
, int *back_refs
, OnigRegex regex
, void *arg
) {
2198 struct MEMO
*memo
= MEMO_CAST(arg
);
2199 VALUE hash
= memo
->v1
;
2200 VALUE match
= memo
->v2
;
2202 VALUE key
= rb_enc_str_new((const char *)name
, name_end
-name
, regex
->enc
);
2208 for (i
= 0; i
< back_num
; i
++) {
2209 value
= rb_reg_nth_match(back_refs
[i
], match
);
2211 rb_hash_aset(hash
, key
, value
);
2217 rb_hash_aset(hash
, key
, Qnil
);
2225 * mtch.named_captures -> hash
2227 * Returns a Hash of the named captures.
2229 * Each key of the hash is the name of a named capture.
2230 * Each value of the hash is the string of the last successful capture of the corresponding group.
2233 * m = /(?<a>.)(?<b>.)/.match("01")
2234 * m.named_captures #=> {"a" => "0", "b" => "1"}
2236 * m = /(?<a>.)(?<b>.)?/.match("0")
2237 * m.named_captures #=> {"a" => "0", "b" => nil}
2239 * m = /(?<a>.)(?<a>.)/.match("01")
2240 * m.named_captures #=> {"a" => "1"}
2242 * m = /(?<a>x)|(?<a>y)/.match("x")
2243 * m.named_captures #=> {"a" => "x"}
2247 match_named_captures(VALUE match
)
2253 if (NIL_P(RMATCH(match
)->regexp
))
2254 return rb_hash_new();
2256 hash
= rb_hash_new();
2257 memo
= MEMO_NEW(hash
, match
, 0);
2259 onig_foreach_name(RREGEXP(RMATCH(match
)->regexp
)->ptr
, match_named_captures_iter
, (void*)memo
);
2266 * mtch.string -> str
2268 * Returns a frozen copy of the string passed in to <code>match</code>.
2270 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2271 * m.string #=> "THX1138."
2275 match_string(VALUE match
)
2278 return RMATCH(match
)->str
; /* str is frozen */
2281 struct backref_name_tag
{
2287 match_inspect_name_iter(const OnigUChar
*name
, const OnigUChar
*name_end
,
2288 int back_num
, int *back_refs
, OnigRegex regex
, void *arg0
)
2290 struct backref_name_tag
*arg
= (struct backref_name_tag
*)arg0
;
2293 for (i
= 0; i
< back_num
; i
++) {
2294 arg
[back_refs
[i
]].name
= name
;
2295 arg
[back_refs
[i
]].len
= name_end
- name
;
2302 * mtch.inspect -> str
2304 * Returns a printable version of <i>mtch</i>.
2306 * puts /.$/.match("foo").inspect
2307 * #=> #<MatchData "o">
2309 * puts /(.)(.)(.)/.match("foo").inspect
2310 * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
2312 * puts /(.)(.)?(.)/.match("fo").inspect
2313 * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
2315 * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
2316 * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
2321 match_inspect(VALUE match
)
2323 VALUE cname
= rb_class_path(rb_obj_class(match
));
2326 struct re_registers
*regs
= RMATCH_REGS(match
);
2327 int num_regs
= regs
->num_regs
;
2328 struct backref_name_tag
*names
;
2329 VALUE regexp
= RMATCH(match
)->regexp
;
2332 return rb_sprintf("#<%"PRIsVALUE
":%p>", cname
, (void*)match
);
2334 else if (NIL_P(regexp
)) {
2335 return rb_sprintf("#<%"PRIsVALUE
": %"PRIsVALUE
">",
2336 cname
, rb_reg_nth_match(0, match
));
2339 names
= ALLOCA_N(struct backref_name_tag
, num_regs
);
2340 MEMZERO(names
, struct backref_name_tag
, num_regs
);
2342 onig_foreach_name(RREGEXP_PTR(regexp
),
2343 match_inspect_name_iter
, names
);
2345 str
= rb_str_buf_new2("#<");
2346 rb_str_append(str
, cname
);
2348 for (i
= 0; i
< num_regs
; i
++) {
2350 rb_str_buf_cat2(str
, " ");
2353 rb_str_buf_cat(str
, (const char *)names
[i
].name
, names
[i
].len
);
2355 rb_str_catf(str
, "%d", i
);
2357 rb_str_buf_cat2(str
, ":");
2359 v
= rb_reg_nth_match(i
, match
);
2361 rb_str_buf_cat2(str
, "nil");
2363 rb_str_buf_append(str
, rb_str_inspect(v
));
2365 rb_str_buf_cat2(str
, ">");
2373 read_escaped_byte(const char **pp
, const char *end
, onig_errmsg_buffer err
)
2375 const char *p
= *pp
;
2377 int meta_prefix
= 0, ctrl_prefix
= 0;
2380 if (p
== end
|| *p
++ != '\\') {
2381 errcpy(err
, "too short escaped multibyte character");
2387 errcpy(err
, "too short escape sequence");
2391 case '\\': code
= '\\'; break;
2392 case 'n': code
= '\n'; break;
2393 case 't': code
= '\t'; break;
2394 case 'r': code
= '\r'; break;
2395 case 'f': code
= '\f'; break;
2396 case 'v': code
= '\013'; break;
2397 case 'a': code
= '\007'; break;
2398 case 'e': code
= '\033'; break;
2401 case '0': case '1': case '2': case '3':
2402 case '4': case '5': case '6': case '7':
2404 code
= scan_oct(p
, end
< p
+3 ? end
-p
: 3, &len
);
2408 case 'x': /* \xHH */
2409 code
= scan_hex(p
, end
< p
+2 ? end
-p
: 2, &len
);
2411 errcpy(err
, "invalid hex escape");
2417 case 'M': /* \M-X, \M-\C-X, \M-\cX */
2419 errcpy(err
, "duplicate meta escape");
2423 if (p
+1 < end
&& *p
++ == '-' && (*p
& 0x80) == 0) {
2433 errcpy(err
, "too short meta escape");
2436 case 'C': /* \C-X, \C-\M-X */
2437 if (p
== end
|| *p
++ != '-') {
2438 errcpy(err
, "too short control escape");
2441 case 'c': /* \cX, \c\M-X */
2443 errcpy(err
, "duplicate control escape");
2447 if (p
< end
&& (*p
& 0x80) == 0) {
2457 errcpy(err
, "too short control escape");
2461 errcpy(err
, "unexpected escape sequence");
2464 if (code
< 0 || 0xff < code
) {
2465 errcpy(err
, "invalid escape code");
2479 unescape_escaped_nonascii(const char **pp
, const char *end
, rb_encoding
*enc
,
2480 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
2482 const char *p
= *pp
;
2483 int chmaxlen
= rb_enc_mbmaxlen(enc
);
2484 unsigned char *area
= ALLOCA_N(unsigned char, chmaxlen
);
2485 char *chbuf
= (char *)area
;
2490 memset(chbuf
, 0, chmaxlen
);
2492 byte
= read_escaped_byte(&p
, end
, err
);
2497 area
[chlen
++] = byte
;
2498 while (chlen
< chmaxlen
&&
2499 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf
, chbuf
+chlen
, enc
))) {
2500 byte
= read_escaped_byte(&p
, end
, err
);
2504 area
[chlen
++] = byte
;
2507 l
= rb_enc_precise_mbclen(chbuf
, chbuf
+chlen
, enc
);
2508 if (MBCLEN_INVALID_P(l
)) {
2509 errcpy(err
, "invalid multibyte escape");
2512 if (1 < chlen
|| (area
[0] & 0x80)) {
2513 rb_str_buf_cat(buf
, chbuf
, chlen
);
2517 else if (*encp
!= enc
) {
2518 errcpy(err
, "escaped non ASCII character in UTF-8 regexp");
2524 snprintf(escbuf
, sizeof(escbuf
), "\\x%02X", area
[0]&0xff);
2525 rb_str_buf_cat(buf
, escbuf
, 4);
2532 check_unicode_range(unsigned long code
, onig_errmsg_buffer err
)
2534 if ((0xd800 <= code
&& code
<= 0xdfff) || /* Surrogates */
2536 errcpy(err
, "invalid Unicode range");
2543 append_utf8(unsigned long uv
,
2544 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
2546 if (check_unicode_range(uv
, err
) != 0)
2550 snprintf(escbuf
, sizeof(escbuf
), "\\x%02X", (int)uv
);
2551 rb_str_buf_cat(buf
, escbuf
, 4);
2556 len
= rb_uv_to_utf8(utf8buf
, uv
);
2557 rb_str_buf_cat(buf
, utf8buf
, len
);
2560 *encp
= rb_utf8_encoding();
2561 else if (*encp
!= rb_utf8_encoding()) {
2562 errcpy(err
, "UTF-8 character in non UTF-8 regexp");
2570 unescape_unicode_list(const char **pp
, const char *end
,
2571 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
2573 const char *p
= *pp
;
2574 int has_unicode
= 0;
2578 while (p
< end
&& ISSPACE(*p
)) p
++;
2581 code
= ruby_scan_hex(p
, end
-p
, &len
);
2584 if (6 < len
) { /* max 10FFFF */
2585 errcpy(err
, "invalid Unicode range");
2589 if (append_utf8(code
, buf
, encp
, err
) != 0)
2593 while (p
< end
&& ISSPACE(*p
)) p
++;
2596 if (has_unicode
== 0) {
2597 errcpy(err
, "invalid Unicode list");
2607 unescape_unicode_bmp(const char **pp
, const char *end
,
2608 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
2610 const char *p
= *pp
;
2615 errcpy(err
, "invalid Unicode escape");
2618 code
= ruby_scan_hex(p
, 4, &len
);
2620 errcpy(err
, "invalid Unicode escape");
2623 if (append_utf8(code
, buf
, encp
, err
) != 0)
/*
 * unescape_nonascii: copy the regexp source in [p, end) into buf while
 * normalizing escapes, reading one character of enc at a time
 * (rb_enc_precise_mbclen).
 *
 * Visible behavior:
 *  - a byte sequence that is not a character of enc is reported as
 *    "invalid multibyte character";
 *  - multibyte / high-bit characters are appended unchanged; if *encp is
 *    already set to a different encoding the function reports
 *    "non ASCII character in UTF-8 regexp";
 *  - after a backslash: octal escapes up to 0177 are kept as written
 *    (they may be back-references); \0, \x, \c, \C and \M go through
 *    read_escaped_byte when enc is US-ASCII (the original escape text is
 *    appended) or through unescape_escaped_nonascii otherwise; the \u
 *    forms go through unescape_unicode_list / unescape_unicode_bmp;
 *    remaining escapes are appended as the two bytes in smallbuf;
 *  - any other single byte is appended as is.
 *
 * NOTE(review): this listing has lost source lines (see the gaps in the
 * embedded line numbers), including labels, returns and the statement
 * that presumably records \p / \P through has_property -- confirm
 * against the complete file before changing this function.
 */
2630 unescape_nonascii(const char *p
, const char *end
, rb_encoding
*enc
,
2631 VALUE buf
, rb_encoding
**encp
, int *has_property
,
2632 onig_errmsg_buffer err
)
/* classify the next character of the source */
2638 int chlen
= rb_enc_precise_mbclen(p
, end
, enc
);
2639 if (!MBCLEN_CHARFOUND_P(chlen
)) {
2641 errcpy(err
, "invalid multibyte character");
2644 chlen
= MBCLEN_CHARFOUND_LEN(chlen
);
/* multibyte or high-bit character: copied verbatim */
2645 if (1 < chlen
|| (*p
& 0x80)) {
2647 rb_str_buf_cat(buf
, p
, chlen
);
2651 else if (*encp
!= enc
) {
2652 errcpy(err
, "non ASCII character in UTF-8 regexp");
2661 errcpy(err
, "too short escape sequence");
/* character following the backslash */
2664 chlen
= rb_enc_precise_mbclen(p
, end
, enc
);
2665 if (!MBCLEN_CHARFOUND_P(chlen
)) {
2666 goto invalid_multibyte
;
2668 if ((chlen
= MBCLEN_CHARFOUND_LEN(chlen
)) > 1) {
2669 /* include the previous backslash */
2675 case '1': case '2': case '3':
2676 case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
2678 size_t len
= end
-(p
-1), octlen
;
2679 if (ruby_scan_oct(p
-1, len
< 3 ? len
: 3, &octlen
) <= 0177) {
2680 /* backref or 7bit octal.
2681 no need to unescape anyway.
2682 re-escaping may break backref */
2686 /* xxx: How about more than 199 subexpressions? */
2688 case '0': /* \0, \0O, \0OO */
2690 case 'x': /* \xHH */
2691 case 'c': /* \cX, \c\M-X */
2692 case 'C': /* \C-X, \C-\M-X */
2693 case 'M': /* \M-X, \M-\C-X, \M-\cX */
/* US-ASCII source: validate the escape, then keep its original text */
2695 if (enc
== rb_usascii_encoding()) {
2696 const char *pbeg
= p
;
2697 int byte
= read_escaped_byte(&p
, end
, err
);
2698 if (byte
== -1) return -1;
2700 rb_str_buf_cat(buf
, pbeg
, p
-pbeg
);
2703 if (unescape_escaped_nonascii(&p
, end
, enc
, buf
, encp
, err
) != 0)
2710 errcpy(err
, "too short escape sequence");
2714 /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
2716 if (unescape_unicode_list(&p
, end
, buf
, encp
, err
) != 0)
2718 if (p
== end
|| *p
++ != '}') {
2719 errcpy(err
, "invalid Unicode list");
2726 if (unescape_unicode_bmp(&p
, end
, buf
, encp
, err
) != 0)
2731 case 'p': /* \p{Hiragana} */
2738 default: /* \n, \\, \d, \9, etc. */
2742 rb_str_buf_cat(buf
, smallbuf
, 2);
/* ordinary single byte outside any escape */
2748 rb_str_buf_cat(buf
, (char *)&c
, 1);
2757 rb_reg_preprocess(const char *p
, const char *end
, rb_encoding
*enc
,
2758 rb_encoding
**fixed_enc
, onig_errmsg_buffer err
)
2761 int has_property
= 0;
2763 buf
= rb_str_buf_new(0);
2765 if (rb_enc_asciicompat(enc
))
2769 rb_enc_associate(buf
, enc
);
2772 if (unescape_nonascii(p
, end
, enc
, buf
, fixed_enc
, &has_property
, err
) != 0)
2775 if (has_property
&& !*fixed_enc
) {
2780 rb_enc_associate(buf
, *fixed_enc
);
2787 rb_reg_check_preprocess(VALUE str
)
2789 rb_encoding
*fixed_enc
= 0;
2790 onig_errmsg_buffer err
= "";
2796 p
= RSTRING_PTR(str
);
2797 end
= p
+ RSTRING_LEN(str
);
2798 enc
= rb_enc_get(str
);
2800 buf
= rb_reg_preprocess(p
, end
, enc
, &fixed_enc
, err
);
2804 return rb_reg_error_desc(str
, 0, err
);
2810 rb_reg_preprocess_dregexp(VALUE ary
, int options
)
2812 rb_encoding
*fixed_enc
= 0;
2813 rb_encoding
*regexp_enc
= 0;
2814 onig_errmsg_buffer err
= "";
2817 rb_encoding
*ascii8bit
= rb_ascii8bit_encoding();
2819 if (RARRAY_LEN(ary
) == 0) {
2820 rb_raise(rb_eArgError
, "no arguments given");
2823 for (i
= 0; i
< RARRAY_LEN(ary
); i
++) {
2824 VALUE str
= RARRAY_AREF(ary
, i
);
2827 rb_encoding
*src_enc
;
2829 src_enc
= rb_enc_get(str
);
2830 if (options
& ARG_ENCODING_NONE
&&
2831 src_enc
!= ascii8bit
) {
2832 if (str_coderange(str
) != ENC_CODERANGE_7BIT
)
2833 rb_raise(rb_eRegexpError
, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2835 src_enc
= ascii8bit
;
2839 p
= RSTRING_PTR(str
);
2840 end
= p
+ RSTRING_LEN(str
);
2842 buf
= rb_reg_preprocess(p
, end
, src_enc
, &fixed_enc
, err
);
2845 rb_raise(rb_eArgError
, "%s", err
);
2847 if (fixed_enc
!= 0) {
2848 if (regexp_enc
!= 0 && regexp_enc
!= fixed_enc
) {
2849 rb_raise(rb_eRegexpError
, "encoding mismatch in dynamic regexp : %s and %s",
2850 rb_enc_name(regexp_enc
), rb_enc_name(fixed_enc
));
2852 regexp_enc
= fixed_enc
;
2856 result
= rb_str_new3(str
);
2858 rb_str_buf_append(result
, str
);
2861 rb_enc_associate(result
, regexp_enc
);
2868 rb_reg_initialize(VALUE obj
, const char *s
, long len
, rb_encoding
*enc
,
2869 int options
, onig_errmsg_buffer err
,
2870 const char *sourcefile
, int sourceline
)
2872 struct RRegexp
*re
= RREGEXP(obj
);
2874 rb_encoding
*fixed_enc
= 0;
2875 rb_encoding
*a_enc
= rb_ascii8bit_encoding();
2877 rb_check_frozen(obj
);
2878 if (FL_TEST(obj
, REG_LITERAL
))
2879 rb_raise(rb_eSecurityError
, "can't modify literal regexp");
2881 rb_raise(rb_eTypeError
, "already initialized regexp");
2884 if (rb_enc_dummy_p(enc
)) {
2885 errcpy(err
, "can't make regexp with dummy encoding");
2889 unescaped
= rb_reg_preprocess(s
, s
+len
, enc
, &fixed_enc
, err
);
2890 if (NIL_P(unescaped
))
2894 if ((fixed_enc
!= enc
&& (options
& ARG_ENCODING_FIXED
)) ||
2895 (fixed_enc
!= a_enc
&& (options
& ARG_ENCODING_NONE
))) {
2896 errcpy(err
, "incompatible character encoding");
2899 if (fixed_enc
!= a_enc
) {
2900 options
|= ARG_ENCODING_FIXED
;
2904 else if (!(options
& ARG_ENCODING_FIXED
)) {
2905 enc
= rb_usascii_encoding();
2908 rb_enc_associate((VALUE
)re
, enc
);
2909 if ((options
& ARG_ENCODING_FIXED
) || fixed_enc
) {
2910 re
->basic
.flags
|= KCODE_FIXED
;
2912 if (options
& ARG_ENCODING_NONE
) {
2913 re
->basic
.flags
|= REG_ENCODING_NONE
;
2916 re
->ptr
= make_regexp(RSTRING_PTR(unescaped
), RSTRING_LEN(unescaped
), enc
,
2917 options
& ARG_REG_OPTION_MASK
, err
,
2918 sourcefile
, sourceline
);
2919 if (!re
->ptr
) return -1;
2920 RB_GC_GUARD(unescaped
);
2925 reg_set_source(VALUE reg
, VALUE str
, rb_encoding
*enc
)
2927 rb_encoding
*regenc
= rb_enc_get(reg
);
2928 if (regenc
!= enc
) {
2929 str
= rb_enc_associate(rb_str_dup(str
), enc
= regenc
);
2931 RB_OBJ_WRITE(reg
, &RREGEXP(reg
)->src
, rb_fstring(str
));
2935 rb_reg_initialize_str(VALUE obj
, VALUE str
, int options
, onig_errmsg_buffer err
,
2936 const char *sourcefile
, int sourceline
)
2939 rb_encoding
*str_enc
= rb_enc_get(str
), *enc
= str_enc
;
2940 if (options
& ARG_ENCODING_NONE
) {
2941 rb_encoding
*ascii8bit
= rb_ascii8bit_encoding();
2942 if (enc
!= ascii8bit
) {
2943 if (str_coderange(str
) != ENC_CODERANGE_7BIT
) {
2944 errcpy(err
, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2950 ret
= rb_reg_initialize(obj
, RSTRING_PTR(str
), RSTRING_LEN(str
), enc
,
2951 options
, err
, sourcefile
, sourceline
);
2952 if (ret
== 0) reg_set_source(obj
, str
, str_enc
);
2957 rb_reg_s_alloc(VALUE klass
)
2959 NEWOBJ_OF(re
, struct RRegexp
, klass
, T_REGEXP
| (RGENGC_WB_PROTECTED_REGEXP
? FL_WB_PROTECTED
: 0));
2962 RB_OBJ_WRITE(re
, &re
->src
, 0);
/* NOTE(review): the header line of this definition is missing from the
 * listing; presumably this is the body of rb_reg_alloc() -- confirm.
 * The surviving statement returns the result of rb_reg_s_alloc called
 * with rb_cRegexp. */
2971 return rb_reg_s_alloc(rb_cRegexp
);
2975 rb_reg_new_str(VALUE s
, int options
)
2977 return rb_reg_init_str(rb_reg_alloc(), s
, options
);
2981 rb_reg_init_str(VALUE re
, VALUE s
, int options
)
2983 onig_errmsg_buffer err
= "";
2985 if (rb_reg_initialize_str(re
, s
, options
, err
, NULL
, 0) != 0) {
2986 rb_reg_raise_str(s
, options
, err
);
2993 rb_reg_init_str_enc(VALUE re
, VALUE s
, rb_encoding
*enc
, int options
)
2995 onig_errmsg_buffer err
= "";
2997 if (rb_reg_initialize(re
, RSTRING_PTR(s
), RSTRING_LEN(s
),
2998 enc
, options
, err
, NULL
, 0) != 0) {
2999 rb_reg_raise_str(s
, options
, err
);
3001 reg_set_source(re
, s
, enc
);
3006 MJIT_FUNC_EXPORTED VALUE
3007 rb_reg_new_ary(VALUE ary
, int opt
)
3009 VALUE re
= rb_reg_new_str(rb_reg_preprocess_dregexp(ary
, opt
), opt
);
3015 rb_enc_reg_new(const char *s
, long len
, rb_encoding
*enc
, int options
)
3017 VALUE re
= rb_reg_alloc();
3018 onig_errmsg_buffer err
= "";
3020 if (rb_reg_initialize(re
, s
, len
, enc
, options
, err
, NULL
, 0) != 0) {
3021 rb_enc_reg_raise(s
, len
, enc
, options
, err
);
3023 RB_OBJ_WRITE(re
, &RREGEXP(re
)->src
, rb_fstring(rb_enc_str_new(s
, len
, enc
)));
3029 rb_reg_new(const char *s
, long len
, int options
)
3031 return rb_enc_reg_new(s
, len
, rb_ascii8bit_encoding(), options
);
3035 rb_reg_compile(VALUE str
, int options
, const char *sourcefile
, int sourceline
)
3037 VALUE re
= rb_reg_alloc();
3038 onig_errmsg_buffer err
= "";
3040 if (!str
) str
= rb_str_new(0,0);
3041 if (rb_reg_initialize_str(re
, str
, options
, err
, sourcefile
, sourceline
) != 0) {
3042 rb_set_errinfo(rb_reg_error_desc(str
, options
, err
));
3045 FL_SET(re
, REG_LITERAL
);
3050 static VALUE reg_cache
;
3053 rb_reg_regcomp(VALUE str
)
3055 if (reg_cache
&& RREGEXP_SRC_LEN(reg_cache
) == RSTRING_LEN(str
)
3056 && ENCODING_GET(reg_cache
) == ENCODING_GET(str
)
3057 && memcmp(RREGEXP_SRC_PTR(reg_cache
), RSTRING_PTR(str
), RSTRING_LEN(str
)) == 0)
3060 return reg_cache
= rb_reg_new_str(str
, 0);
3063 static st_index_t
reg_hash(VALUE re
);
3066 * rxp.hash -> integer
3068 * Produces a hash based on the text and options of this regular expression.
3070 * See also Object#hash.
3074 rb_reg_hash(VALUE re
)
3076 st_index_t hashval
= reg_hash(re
);
3077 return ST2FIX(hashval
);
/* NOTE(review): the header line of this definition is missing from the
 * listing; presumably this is the body of reg_hash(re) -- confirm.
 * The surviving statements start the hash from the option bits of the
 * compiled pattern, mix in a memhash of the source bytes, and return the
 * finalized value. */
3086 hashval
= RREGEXP_PTR(re
)->options
;
3087 hashval
= rb_hash_uint(hashval
, rb_memhash(RREGEXP_SRC_PTR(re
), RREGEXP_SRC_LEN(re
)));
3088 return rb_hash_end(hashval
);
3094 * rxp == other_rxp -> true or false
3095 * rxp.eql?(other_rxp) -> true or false
3097 * Equality---Two regexps are equal if their patterns are identical, they have
3098 * the same character set code, and their <code>casefold?</code> values are the same.
3101 * /abc/ == /abc/x #=> false
3102 * /abc/ == /abc/i #=> false
3103 * /abc/ == /abc/u #=> false
3104 * /abc/u == /abc/n #=> false
3108 rb_reg_equal(VALUE re1
, VALUE re2
)
3110 if (re1
== re2
) return Qtrue
;
3111 if (!RB_TYPE_P(re2
, T_REGEXP
)) return Qfalse
;
3112 rb_reg_check(re1
); rb_reg_check(re2
);
3113 if (FL_TEST(re1
, KCODE_FIXED
) != FL_TEST(re2
, KCODE_FIXED
)) return Qfalse
;
3114 if (RREGEXP_PTR(re1
)->options
!= RREGEXP_PTR(re2
)->options
) return Qfalse
;
3115 if (RREGEXP_SRC_LEN(re1
) != RREGEXP_SRC_LEN(re2
)) return Qfalse
;
3116 if (ENCODING_GET(re1
) != ENCODING_GET(re2
)) return Qfalse
;
3117 return RBOOL(memcmp(RREGEXP_SRC_PTR(re1
), RREGEXP_SRC_PTR(re2
), RREGEXP_SRC_LEN(re1
)) == 0);
3122 * mtch.hash -> integer
3124 * Produces a hash based on the target string, regexp and matched
3125 * positions of this matchdata.
3127 * See also Object#hash.
3131 match_hash(VALUE match
)
3133 const struct re_registers
*regs
;
3137 hashval
= rb_hash_start(rb_str_hash(RMATCH(match
)->str
));
3138 hashval
= rb_hash_uint(hashval
, reg_hash(match_regexp(match
)));
3139 regs
= RMATCH_REGS(match
);
3140 hashval
= rb_hash_uint(hashval
, regs
->num_regs
);
3141 hashval
= rb_hash_uint(hashval
, rb_memhash(regs
->beg
, regs
->num_regs
* sizeof(*regs
->beg
)));
3142 hashval
= rb_hash_uint(hashval
, rb_memhash(regs
->end
, regs
->num_regs
* sizeof(*regs
->end
)));
3143 hashval
= rb_hash_end(hashval
);
3144 return ST2FIX(hashval
);
3149 * mtch == mtch2 -> true or false
3150 * mtch.eql?(mtch2) -> true or false
3152 * Equality---Two matchdata are equal if their target strings,
3153 * patterns, and matched positions are identical.
3157 match_equal(VALUE match1
, VALUE match2
)
3159 const struct re_registers
*regs1
, *regs2
;
3161 if (match1
== match2
) return Qtrue
;
3162 if (!RB_TYPE_P(match2
, T_MATCH
)) return Qfalse
;
3163 if (!RMATCH(match1
)->regexp
|| !RMATCH(match2
)->regexp
) return Qfalse
;
3164 if (!rb_str_equal(RMATCH(match1
)->str
, RMATCH(match2
)->str
)) return Qfalse
;
3165 if (!rb_reg_equal(match_regexp(match1
), match_regexp(match2
))) return Qfalse
;
3166 regs1
= RMATCH_REGS(match1
);
3167 regs2
= RMATCH_REGS(match2
);
3168 if (regs1
->num_regs
!= regs2
->num_regs
) return Qfalse
;
3169 if (memcmp(regs1
->beg
, regs2
->beg
, regs1
->num_regs
* sizeof(*regs1
->beg
))) return Qfalse
;
3170 if (memcmp(regs1
->end
, regs2
->end
, regs1
->num_regs
* sizeof(*regs1
->end
))) return Qfalse
;
3175 reg_operand(VALUE s
, int check
)
3178 return rb_sym2str(s
);
3180 else if (RB_TYPE_P(s
, T_STRING
)) {
3184 return check
? rb_str_to_str(s
) : rb_check_string_type(s
);
3189 reg_match_pos(VALUE re
, VALUE
*strp
, long pos
, VALUE
* set_match
)
3194 rb_backref_set(Qnil
);
3197 *strp
= str
= reg_operand(str
, TRUE
);
3200 VALUE l
= rb_str_length(str
);
3206 pos
= rb_str_offset(str
, pos
);
3208 return rb_reg_search_set_match(re
, str
, pos
, 0, 1, set_match
);
3213 * rxp =~ str -> integer or nil
3215 * Match---Matches <i>rxp</i> against <i>str</i>.
3217 * /at/ =~ "input data" #=> 7
3218 * /ax/ =~ "input data" #=> nil
3220 * If <code>=~</code> is used with a regexp literal with named captures,
3221 * captured strings (or nil) are assigned to local variables named by
3222 * the capture names.
3224 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y "
3228 * If it is not matched, nil is assigned for the variables.
3230 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = "
3234 * This assignment is implemented in the Ruby parser.
3235 * The parser detects 'regexp-literal =~ expression' for the assignment.
3236 * The regexp must be a literal without interpolation and placed at left hand side.
3238 * The assignment does not occur if the regexp is not a literal.
3240 * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
3242 * p lhs # undefined local variable
3243 * p rhs # undefined local variable
3245 * A regexp interpolation, <code>#{}</code>, also disables
3248 * rhs_pat = /(?<rhs>\w+)/
3249 * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
3250 * p lhs # undefined local variable
3252 * The assignment does not occur if the regexp is placed at the right hand side.
3254 * " x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
3255 * p lhs, rhs # undefined local variable
3260 rb_reg_match(VALUE re
, VALUE str
)
3262 long pos
= reg_match_pos(re
, &str
, 0, NULL
);
3263 if (pos
< 0) return Qnil
;
3264 pos
= rb_str_sublen(str
, pos
);
3265 return LONG2FIX(pos
);
3270 * rxp === str -> true or false
3272 * Case Equality---Used in case statements.
3276 * when /\A[a-z]*\z/; print "Lower case\n"
3277 * when /\A[A-Z]*\z/; print "Upper case\n"
3278 * else; print "Mixed case\n"
3282 * Following a regular expression literal with the #=== operator allows you to
3283 * compare against a String.
3285 * /^[a-z]*$/ === "HELLO" #=> false
3286 * /^[A-Z]*$/ === "HELLO" #=> true
3290 rb_reg_eqq(VALUE re
, VALUE str
)
3294 str
= reg_operand(str
, FALSE
);
3296 rb_backref_set(Qnil
);
3299 start
= rb_reg_search(re
, str
, 0, 0);
3309 * ~ rxp -> integer or nil
3311 * Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
3312 * Equivalent to <code><i>rxp</i> =~ $_</code>.
3319 rb_reg_match2(VALUE re
)
3322 VALUE line
= rb_lastline_get();
3324 if (!RB_TYPE_P(line
, T_STRING
)) {
3325 rb_backref_set(Qnil
);
3329 start
= rb_reg_search(re
, line
, 0, 0);
3333 start
= rb_str_sublen(line
, start
);
3334 return LONG2FIX(start
);
3340 * rxp.match(str, pos=0) -> matchdata or nil
3341 * rxp.match(str, pos=0) {|match| block } -> obj
3343 * Returns a MatchData object describing the match, or
3344 * <code>nil</code> if there was no match. This is equivalent to
3345 * retrieving the value of the special variable <code>$~</code>
3346 * following a normal match. If the second parameter is present, it
3347 * specifies the position in the string to begin the search.
3349 * /(.)(.)(.)/.match("abc")[2] #=> "b"
3350 * /(.)(.)/.match("abc", 1)[2] #=> "c"
3352 * If a block is given, invokes the block with the MatchData if the match succeeds, so
3353 * that you can write
3355 * /M(.*)/.match("Matz") do |m|
3362 * if m = /M(.*)/.match("Matz")
3367 * The return value is a value from block execution in this case.
3371 rb_reg_match_m(int argc
, VALUE
*argv
, VALUE re
)
3373 VALUE result
= Qnil
, str
, initpos
;
3376 if (rb_scan_args(argc
, argv
, "11", &str
, &initpos
) == 2) {
3377 pos
= NUM2LONG(initpos
);
3383 pos
= reg_match_pos(re
, &str
, pos
, &result
);
3385 rb_backref_set(Qnil
);
3388 rb_match_busy(result
);
3389 if (!NIL_P(result
) && rb_block_given_p()) {
3390 return rb_yield(result
);
3397 * rxp.match?(str) -> true or false
3398 * rxp.match?(str, pos=0) -> true or false
3400 * Returns <code>true</code> or <code>false</code> to indicate whether the
3401 * regexp is matched or not without updating $~ and other related variables.
3402 * If the second parameter is present, it specifies the position in the string
3403 * to begin the search.
3405 * /R.../.match?("Ruby") #=> true
3406 * /R.../.match?("Ruby", 1) #=> false
3407 * /P.../.match?("Ruby") #=> false
3412 rb_reg_match_m_p(int argc
, VALUE
*argv
, VALUE re
)
3414 long pos
= rb_check_arity(argc
, 1, 2) > 1 ? NUM2LONG(argv
[1]) : 0;
3415 return rb_reg_match_p(re
, argv
[0], pos
);
3419 rb_reg_match_p(VALUE re
, VALUE str
, long pos
)
3422 onig_errmsg_buffer err
= "";
3423 OnigPosition result
;
3424 const UChar
*start
, *end
;
3427 if (NIL_P(str
)) return Qfalse
;
3428 str
= SYMBOL_P(str
) ? rb_sym2str(str
) : StringValue(str
);
3431 pos
+= NUM2LONG(rb_str_length(str
));
3432 if (pos
< 0) return Qfalse
;
3436 const char *beg
= rb_str_subpos(str
, pos
, &len
);
3437 if (!beg
) return Qfalse
;
3438 pos
= beg
- RSTRING_PTR(str
);
3441 reg
= rb_reg_prepare_re0(re
, str
, err
);
3442 tmpreg
= reg
!= RREGEXP_PTR(re
);
3443 if (!tmpreg
) RREGEXP(re
)->usecnt
++;
3444 start
= ((UChar
*)RSTRING_PTR(str
));
3445 end
= start
+ RSTRING_LEN(str
);
3446 result
= onig_search(reg
, start
, end
, start
+ pos
, end
,
3447 NULL
, ONIG_OPTION_NONE
);
3448 if (!tmpreg
) RREGEXP(re
)->usecnt
--;
3450 if (RREGEXP(re
)->usecnt
) {
3454 onig_free(RREGEXP_PTR(re
));
3455 RREGEXP_PTR(re
) = reg
;
3459 if (result
== ONIG_MISMATCH
) {
3463 onig_error_code_to_str((UChar
*)err
, (int)result
);
3464 rb_reg_raise(RREGEXP_SRC_PTR(re
), RREGEXP_SRC_LEN(re
), err
, re
);
3471 * Document-method: compile
3473 * Alias for Regexp.new
3478 * Regexp.new(string, [options]) -> regexp
3479 * Regexp.new(regexp) -> regexp
3480 * Regexp.compile(string, [options]) -> regexp
3481 * Regexp.compile(regexp) -> regexp
3483 * Constructs a new regular expression from +pattern+, which can be either a
3484 * String or a Regexp (in which case that regexp's options are propagated),
3485 * and new options may not be specified (a change as of Ruby 1.8).
3487 * If +options+ is an Integer, it should be one or more of the constants
3488 * Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE,
3489 * <em>or</em>-ed together. Otherwise, if +options+ is not
3490 * +nil+ or +false+, the regexp will be case insensitive.
3492 * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
3493 * r2 = Regexp.new('cat', true) #=> /cat/i
3494 * r3 = Regexp.new(r2) #=> /cat/i
3495 * r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix
3499 rb_reg_initialize_m(int argc
, VALUE
*argv
, VALUE self
)
3503 rb_encoding
*enc
= 0;
3505 rb_check_arity(argc
, 1, 3);
3506 if (RB_TYPE_P(argv
[0], T_REGEXP
)) {
3510 rb_warn("flags ignored");
3513 flags
= rb_reg_options(re
);
3514 str
= RREGEXP_SRC(re
);
3518 if (FIXNUM_P(argv
[1])) flags
= FIX2INT(argv
[1]);
3519 else if (RTEST(argv
[1])) flags
= ONIG_OPTION_IGNORECASE
;
3521 if (argc
== 3 && !NIL_P(argv
[2])) {
3522 char *kcode
= StringValuePtr(argv
[2]);
3523 if (kcode
[0] == 'n' || kcode
[0] == 'N') {
3524 enc
= rb_ascii8bit_encoding();
3525 flags
|= ARG_ENCODING_NONE
;
3528 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "encoding option is ignored - %s", kcode
);
3531 str
= StringValue(argv
[0]);
3533 if (enc
&& rb_enc_get(str
) != enc
)
3534 rb_reg_init_str_enc(self
, str
, enc
, flags
);
3536 rb_reg_init_str(self
, str
, flags
);
3541 rb_reg_quote(VALUE str
)
3543 rb_encoding
*enc
= rb_enc_get(str
);
3547 int ascii_only
= rb_enc_str_asciionly_p(str
);
3549 s
= RSTRING_PTR(str
);
3550 send
= s
+ RSTRING_LEN(str
);
3552 c
= rb_enc_ascget(s
, send
, &clen
, enc
);
3554 s
+= mbclen(s
, send
, enc
);
3558 case '[': case ']': case '{': case '}':
3559 case '(': case ')': case '|': case '-':
3560 case '*': case '.': case '\\':
3561 case '?': case '+': case '^': case '$':
3563 case '\t': case '\f': case '\v': case '\n': case '\r':
3568 tmp
= rb_str_new3(str
);
3570 rb_enc_associate(tmp
, rb_usascii_encoding());
3575 tmp
= rb_str_new(0, RSTRING_LEN(str
)*2);
3577 rb_enc_associate(tmp
, rb_usascii_encoding());
3580 rb_enc_copy(tmp
, str
);
3582 t
= RSTRING_PTR(tmp
);
3583 /* copy up to the metacharacter */
3584 const char *p
= RSTRING_PTR(str
);
3585 memcpy(t
, p
, s
- p
);
3589 c
= rb_enc_ascget(s
, send
, &clen
, enc
);
3591 int n
= mbclen(s
, send
, enc
);
3599 case '[': case ']': case '{': case '}':
3600 case '(': case ')': case '|': case '-':
3601 case '*': case '.': case '\\':
3602 case '?': case '+': case '^': case '$':
3604 t
+= rb_enc_mbcput('\\', t
, enc
);
3607 t
+= rb_enc_mbcput('\\', t
, enc
);
3608 t
+= rb_enc_mbcput(' ', t
, enc
);
3611 t
+= rb_enc_mbcput('\\', t
, enc
);
3612 t
+= rb_enc_mbcput('t', t
, enc
);
3615 t
+= rb_enc_mbcput('\\', t
, enc
);
3616 t
+= rb_enc_mbcput('n', t
, enc
);
3619 t
+= rb_enc_mbcput('\\', t
, enc
);
3620 t
+= rb_enc_mbcput('r', t
, enc
);
3623 t
+= rb_enc_mbcput('\\', t
, enc
);
3624 t
+= rb_enc_mbcput('f', t
, enc
);
3627 t
+= rb_enc_mbcput('\\', t
, enc
);
3628 t
+= rb_enc_mbcput('v', t
, enc
);
3631 t
+= rb_enc_mbcput(c
, t
, enc
);
3633 rb_str_resize(tmp
, t
- RSTRING_PTR(tmp
));
3640 * Regexp.escape(str) -> string
3641 * Regexp.quote(str) -> string
3643 * Escapes any characters that would have special meaning in a regular
3644 * expression. Returns a new escaped string with the same or compatible
3645 * encoding. For any string,
3646 * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
3648 * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
3653 rb_reg_s_quote(VALUE c
, VALUE str
)
3655 return rb_reg_quote(reg_operand(str
, TRUE
));
3659 rb_reg_options(VALUE re
)
3664 options
= RREGEXP_PTR(re
)->options
& ARG_REG_OPTION_MASK
;
3665 if (RBASIC(re
)->flags
& KCODE_FIXED
) options
|= ARG_ENCODING_FIXED
;
3666 if (RBASIC(re
)->flags
& REG_ENCODING_NONE
) options
|= ARG_ENCODING_NONE
;
3671 rb_check_regexp_type(VALUE re
)
3673 return rb_check_convert_type(re
, T_REGEXP
, "Regexp", "to_regexp");
3678 * Regexp.try_convert(obj) -> re or nil
3680 * Tries to convert <i>obj</i> into a Regexp, using the to_regexp method.
3681 * Returns converted regexp or nil if <i>obj</i> cannot be converted
3684 * Regexp.try_convert(/re/) #=> /re/
3685 * Regexp.try_convert("re") #=> nil
3688 * Regexp.try_convert(o) #=> nil
3689 * def o.to_regexp() /foo/ end
3690 * Regexp.try_convert(o) #=> /foo/
3694 rb_reg_s_try_convert(VALUE dummy
, VALUE re
)
3696 return rb_check_regexp_type(re
);
3700 rb_reg_s_union(VALUE self
, VALUE args0
)
3702 long argc
= RARRAY_LEN(args0
);
3706 args
[0] = rb_str_new2("(?!)");
3707 return rb_class_new_instance(1, args
, rb_cRegexp
);
3709 else if (argc
== 1) {
3710 VALUE arg
= rb_ary_entry(args0
, 0);
3711 VALUE re
= rb_check_regexp_type(arg
);
3716 quoted
= rb_reg_s_quote(Qnil
, arg
);
3717 return rb_reg_new_str(quoted
, 0);
3722 VALUE source
= rb_str_buf_new(0);
3723 rb_encoding
*result_enc
;
3725 int has_asciionly
= 0;
3726 rb_encoding
*has_ascii_compat_fixed
= 0;
3727 rb_encoding
*has_ascii_incompat
= 0;
3729 for (i
= 0; i
< argc
; i
++) {
3731 VALUE e
= rb_ary_entry(args0
, i
);
3734 rb_str_buf_cat_ascii(source
, "|");
3736 v
= rb_check_regexp_type(e
);
3738 rb_encoding
*enc
= rb_enc_get(v
);
3739 if (!rb_enc_asciicompat(enc
)) {
3740 if (!has_ascii_incompat
)
3741 has_ascii_incompat
= enc
;
3742 else if (has_ascii_incompat
!= enc
)
3743 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3744 rb_enc_name(has_ascii_incompat
), rb_enc_name(enc
));
3746 else if (rb_reg_fixed_encoding_p(v
)) {
3747 if (!has_ascii_compat_fixed
)
3748 has_ascii_compat_fixed
= enc
;
3749 else if (has_ascii_compat_fixed
!= enc
)
3750 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3751 rb_enc_name(has_ascii_compat_fixed
), rb_enc_name(enc
));
3756 v
= rb_reg_str_with_term(v
, -1);
3761 enc
= rb_enc_get(e
);
3762 if (!rb_enc_asciicompat(enc
)) {
3763 if (!has_ascii_incompat
)
3764 has_ascii_incompat
= enc
;
3765 else if (has_ascii_incompat
!= enc
)
3766 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3767 rb_enc_name(has_ascii_incompat
), rb_enc_name(enc
));
3769 else if (rb_enc_str_asciionly_p(e
)) {
3773 if (!has_ascii_compat_fixed
)
3774 has_ascii_compat_fixed
= enc
;
3775 else if (has_ascii_compat_fixed
!= enc
)
3776 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3777 rb_enc_name(has_ascii_compat_fixed
), rb_enc_name(enc
));
3779 v
= rb_reg_s_quote(Qnil
, e
);
3781 if (has_ascii_incompat
) {
3782 if (has_asciionly
) {
3783 rb_raise(rb_eArgError
, "ASCII incompatible encoding: %s",
3784 rb_enc_name(has_ascii_incompat
));
3786 if (has_ascii_compat_fixed
) {
3787 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3788 rb_enc_name(has_ascii_incompat
), rb_enc_name(has_ascii_compat_fixed
));
3793 rb_enc_copy(source
, v
);
3795 rb_str_append(source
, v
);
3798 if (has_ascii_incompat
) {
3799 result_enc
= has_ascii_incompat
;
3801 else if (has_ascii_compat_fixed
) {
3802 result_enc
= has_ascii_compat_fixed
;
3805 result_enc
= rb_ascii8bit_encoding();
3808 rb_enc_associate(source
, result_enc
);
3809 return rb_class_new_instance(1, &source
, rb_cRegexp
);
3815 * Regexp.union(pat1, pat2, ...) -> new_regexp
3816 * Regexp.union(pats_ary) -> new_regexp
3818 * Return a Regexp object that is the union of the given
3819 * <em>pattern</em>s, i.e., will match any of its parts. The
3820 * <em>pattern</em>s can be Regexp objects, in which case their
3821 * options will be preserved, or Strings. If no patterns are given,
3822 * returns <code>/(?!)/</code>. The behavior is unspecified if any
3823 * given <em>pattern</em> contains capture.
3825 * Regexp.union #=> /(?!)/
3826 * Regexp.union("penzance") #=> /penzance/
3827 * Regexp.union("a+b*c") #=> /a\+b\*c/
3828 * Regexp.union("skiing", "sledding") #=> /skiing|sledding/
3829 * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3830 * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
3832 * Note: ::union tries to convert each argument into a regular
3833 * expression via #to_regexp.
3836 rb_reg_s_union_m(VALUE self
, VALUE args
)
3839 if (RARRAY_LEN(args
) == 1 &&
3840 !NIL_P(v
= rb_check_array_type(rb_ary_entry(args
, 0)))) {
3841 return rb_reg_s_union(self
, v
);
3843 return rb_reg_s_union(self
, args
);
3848 rb_reg_init_copy(VALUE copy
, VALUE re
)
3850 if (!OBJ_INIT_COPY(copy
, re
)) return copy
;
3852 return rb_reg_init_str(copy
, RREGEXP_SRC(re
), rb_reg_options(re
));
3856 rb_reg_regsub(VALUE str
, VALUE src
, struct re_registers
*regs
, VALUE regexp
)
3861 rb_encoding
*str_enc
= rb_enc_get(str
);
3862 rb_encoding
*src_enc
= rb_enc_get(src
);
3863 int acompat
= rb_enc_asciicompat(str_enc
);
3865 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
3867 RSTRING_GETMEM(str
, s
, n
);
3872 int c
= ASCGET(s
, e
, &clen
);
3876 s
+= mbclen(s
, e
, str_enc
);
3882 if (c
!= '\\' || s
== e
) continue;
3885 val
= rb_str_buf_new(ss
-p
);
3887 rb_enc_str_buf_cat(val
, p
, ss
-p
, str_enc
);
3889 c
= ASCGET(s
, e
, &clen
);
3891 s
+= mbclen(s
, e
, str_enc
);
3892 rb_enc_str_buf_cat(val
, ss
, s
-ss
, str_enc
);
3900 case '1': case '2': case '3': case '4':
3901 case '5': case '6': case '7': case '8': case '9':
3902 if (!NIL_P(regexp
) && onig_noname_group_capture_is_active(RREGEXP_PTR(regexp
))) {
3911 if (s
< e
&& ASCGET(s
, e
, &clen
) == '<') {
3912 char *name
, *name_end
;
3914 name_end
= name
= s
+ clen
;
3915 while (name_end
< e
) {
3916 c
= ASCGET(name_end
, e
, &clen
);
3917 if (c
== '>') break;
3918 name_end
+= c
== -1 ? mbclen(name_end
, e
, str_enc
) : clen
;
3921 VALUE n
= rb_str_subseq(str
, (long)(name
- RSTRING_PTR(str
)),
3922 (long)(name_end
- name
));
3923 if ((no
= NAME_TO_NUMBER(regs
, regexp
, n
, name
, name_end
)) < 1) {
3924 name_to_backref_error(n
);
3926 p
= s
= name_end
+ clen
;
3930 rb_raise(rb_eRuntimeError
, "invalid group name reference format");
3934 rb_enc_str_buf_cat(val
, ss
, s
-ss
, str_enc
);
3943 rb_enc_str_buf_cat(val
, RSTRING_PTR(src
), BEG(0), src_enc
);
3947 rb_enc_str_buf_cat(val
, RSTRING_PTR(src
)+END(0), RSTRING_LEN(src
)-END(0), src_enc
);
3951 no
= regs
->num_regs
-1;
3952 while (BEG(no
) == -1 && no
> 0) no
--;
3953 if (no
== 0) continue;
3957 rb_enc_str_buf_cat(val
, s
-clen
, clen
, str_enc
);
3961 rb_enc_str_buf_cat(val
, ss
, s
-ss
, str_enc
);
3966 if (no
>= regs
->num_regs
) continue;
3967 if (BEG(no
) == -1) continue;
3968 rb_enc_str_buf_cat(val
, RSTRING_PTR(src
)+BEG(no
), END(no
)-BEG(no
), src_enc
);
3972 if (!val
) return str
;
3974 rb_enc_str_buf_cat(val
, p
, e
-p
, str_enc
);
3981 ignorecase_getter(ID _x
, VALUE
*_y
)
3983 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "variable $= is no longer effective");
3988 ignorecase_setter(VALUE val
, ID id
, VALUE
*_
)
3990 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "variable $= is no longer effective; ignored");
3996 VALUE match
= rb_backref_get();
3998 if (NIL_P(match
)) return Qnil
;
3999 rb_match_busy(match
);
4004 get_LAST_MATCH_INFO(ID _x
, VALUE
*_y
)
4006 return match_getter();
4010 match_setter(VALUE val
, ID _x
, VALUE
*_y
)
4013 Check_Type(val
, T_MATCH
);
4015 rb_backref_set(val
);
4020 * Regexp.last_match -> matchdata
4021 * Regexp.last_match(n) -> str
4023 * The first form returns the MatchData object generated by the
4024 * last successful pattern match. Equivalent to reading the special global
4025 * variable <code>$~</code> (see Special global variables in Regexp for
4028 * The second form returns the <i>n</i>th field in this MatchData object.
4029 * _n_ can be a string or symbol to reference a named capture.
4031 * Note that the last_match is local to the thread and method scope of the
4032 * method that did the pattern match.
4034 * /c(.)t/ =~ 'cat' #=> 0
4035 * Regexp.last_match #=> #<MatchData "cat" 1:"a">
4036 * Regexp.last_match(0) #=> "cat"
4037 * Regexp.last_match(1) #=> "a"
4038 * Regexp.last_match(2) #=> nil
4040 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
4041 * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val">
4042 * Regexp.last_match(:lhs) #=> "var"
4043 * Regexp.last_match(:rhs) #=> "val"
4047 rb_reg_s_last_match(int argc
, VALUE
*argv
, VALUE _
)
4049 if (rb_check_arity(argc
, 0, 1) == 1) {
4050 VALUE match
= rb_backref_get();
4052 if (NIL_P(match
)) return Qnil
;
4053 n
= match_backref_number(match
, argv
[0]);
4054 return rb_reg_nth_match(n
, match
);
4056 return match_getter();
4060 re_warn(const char *s
)
4066 * Document-class: RegexpError
4068 * Raised when given an invalid regular expression.
4072 * <em>raises the exception:</em>
4074 * RegexpError: target of repeat operator is not specified: /?/
4078 * Document-class: Regexp
4080 * A Regexp holds a regular expression, used to match a pattern
4081 * against strings. Regexps are created using the <code>/.../</code>
4082 * and <code>%r{...}</code> literals, and by the Regexp::new
4085 * You can create a \Regexp object explicitly with:
4087 * - A {regexp literal}[doc/syntax/literals_rdoc.html#label-Regexp+Literals].
4089 * :include: doc/regexp.rdoc
4095 rb_eRegexpError
= rb_define_class("RegexpError", rb_eStandardError
);
4097 onigenc_set_default_encoding(ONIG_ENCODING_ASCII
);
4098 onig_set_warn_func(re_warn
);
4099 onig_set_verb_warn_func(re_warn
);
4101 rb_define_virtual_variable("$~", get_LAST_MATCH_INFO
, match_setter
);
4102 rb_define_virtual_variable("$&", last_match_getter
, 0);
4103 rb_define_virtual_variable("$`", prematch_getter
, 0);
4104 rb_define_virtual_variable("$'", postmatch_getter
, 0);
4105 rb_define_virtual_variable("$+", last_paren_match_getter
, 0);
4107 rb_gvar_ractor_local("$~");
4108 rb_gvar_ractor_local("$&");
4109 rb_gvar_ractor_local("$`");
4110 rb_gvar_ractor_local("$'");
4111 rb_gvar_ractor_local("$+");
4113 rb_define_virtual_variable("$=", ignorecase_getter
, ignorecase_setter
);
4115 rb_cRegexp
= rb_define_class("Regexp", rb_cObject
);
4116 rb_define_alloc_func(rb_cRegexp
, rb_reg_s_alloc
);
4117 rb_define_singleton_method(rb_cRegexp
, "compile", rb_class_new_instance
, -1);
4118 rb_define_singleton_method(rb_cRegexp
, "quote", rb_reg_s_quote
, 1);
4119 rb_define_singleton_method(rb_cRegexp
, "escape", rb_reg_s_quote
, 1);
4120 rb_define_singleton_method(rb_cRegexp
, "union", rb_reg_s_union_m
, -2);
4121 rb_define_singleton_method(rb_cRegexp
, "last_match", rb_reg_s_last_match
, -1);
4122 rb_define_singleton_method(rb_cRegexp
, "try_convert", rb_reg_s_try_convert
, 1);
4124 rb_define_method(rb_cRegexp
, "initialize", rb_reg_initialize_m
, -1);
4125 rb_define_method(rb_cRegexp
, "initialize_copy", rb_reg_init_copy
, 1);
4126 rb_define_method(rb_cRegexp
, "hash", rb_reg_hash
, 0);
4127 rb_define_method(rb_cRegexp
, "eql?", rb_reg_equal
, 1);
4128 rb_define_method(rb_cRegexp
, "==", rb_reg_equal
, 1);
4129 rb_define_method(rb_cRegexp
, "=~", rb_reg_match
, 1);
4130 rb_define_method(rb_cRegexp
, "===", rb_reg_eqq
, 1);
4131 rb_define_method(rb_cRegexp
, "~", rb_reg_match2
, 0);
4132 rb_define_method(rb_cRegexp
, "match", rb_reg_match_m
, -1);
4133 rb_define_method(rb_cRegexp
, "match?", rb_reg_match_m_p
, -1);
4134 rb_define_method(rb_cRegexp
, "to_s", rb_reg_to_s
, 0);
4135 rb_define_method(rb_cRegexp
, "inspect", rb_reg_inspect
, 0);
4136 rb_define_method(rb_cRegexp
, "source", rb_reg_source
, 0);
4137 rb_define_method(rb_cRegexp
, "casefold?", rb_reg_casefold_p
, 0);
4138 rb_define_method(rb_cRegexp
, "options", rb_reg_options_m
, 0);
4139 rb_define_method(rb_cRegexp
, "encoding", rb_obj_encoding
, 0); /* in encoding.c */
4140 rb_define_method(rb_cRegexp
, "fixed_encoding?", rb_reg_fixed_encoding_p
, 0);
4141 rb_define_method(rb_cRegexp
, "names", rb_reg_names
, 0);
4142 rb_define_method(rb_cRegexp
, "named_captures", rb_reg_named_captures
, 0);
4144 /* see Regexp.options and Regexp.new */
4145 rb_define_const(rb_cRegexp
, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE
));
4146 /* see Regexp.options and Regexp.new */
4147 rb_define_const(rb_cRegexp
, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND
));
4148 /* see Regexp.options and Regexp.new */
4149 rb_define_const(rb_cRegexp
, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE
));
4150 /* see Regexp.options and Regexp.new */
4151 rb_define_const(rb_cRegexp
, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED
));
4152 /* see Regexp.options and Regexp.new */
4153 rb_define_const(rb_cRegexp
, "NOENCODING", INT2FIX(ARG_ENCODING_NONE
));
4155 rb_global_variable(®_cache
);
4157 rb_cMatch
= rb_define_class("MatchData", rb_cObject
);
4158 rb_define_alloc_func(rb_cMatch
, match_alloc
);
4159 rb_undef_method(CLASS_OF(rb_cMatch
), "new");
4160 rb_undef_method(CLASS_OF(rb_cMatch
), "allocate");
4162 rb_define_method(rb_cMatch
, "initialize_copy", match_init_copy
, 1);
4163 rb_define_method(rb_cMatch
, "regexp", match_regexp
, 0);
4164 rb_define_method(rb_cMatch
, "names", match_names
, 0);
4165 rb_define_method(rb_cMatch
, "size", match_size
, 0);
4166 rb_define_method(rb_cMatch
, "length", match_size
, 0);
4167 rb_define_method(rb_cMatch
, "offset", match_offset
, 1);
4168 rb_define_method(rb_cMatch
, "begin", match_begin
, 1);
4169 rb_define_method(rb_cMatch
, "end", match_end
, 1);
4170 rb_define_method(rb_cMatch
, "match", match_nth
, 1);
4171 rb_define_method(rb_cMatch
, "match_length", match_nth_length
, 1);
4172 rb_define_method(rb_cMatch
, "to_a", match_to_a
, 0);
4173 rb_define_method(rb_cMatch
, "[]", match_aref
, -1);
4174 rb_define_method(rb_cMatch
, "captures", match_captures
, 0);
4175 rb_define_method(rb_cMatch
, "named_captures", match_named_captures
, 0);
4176 rb_define_method(rb_cMatch
, "values_at", match_values_at
, -1);
4177 rb_define_method(rb_cMatch
, "pre_match", rb_reg_match_pre
, 0);
4178 rb_define_method(rb_cMatch
, "post_match", rb_reg_match_post
, 0);
4179 rb_define_method(rb_cMatch
, "to_s", match_to_s
, 0);
4180 rb_define_method(rb_cMatch
, "inspect", match_inspect
, 0);
4181 rb_define_method(rb_cMatch
, "string", match_string
, 0);
4182 rb_define_method(rb_cMatch
, "hash", match_hash
, 0);
4183 rb_define_method(rb_cMatch
, "eql?", match_equal
, 1);
4184 rb_define_method(rb_cMatch
, "==", match_equal
, 1);