1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
3 internationalization features.)
5 Copyright (C) 1993,94,95,96,97,98,2000 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
23 - structure the opcode space into opcode+flag.
24 - merge with glibc's regex.[ch]
27 /* AIX requires this to be the first thing in the file. */
28 #if defined (_AIX) && !defined (REGEX_MALLOC)
36 /* Converts the pointer to the char to BEG-based offset from the start. */
37 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
38 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
45 /* We need this for `regex.h', and perhaps for the Emacs include files. */
46 #include <sys/types.h>
48 /* This is for other GNU distributions with internationalized messages. */
49 #if HAVE_LIBINTL_H || defined (_LIBC)
52 # define gettext(msgid) (msgid)
56 /* This define is so xgettext can find the internationalizable
58 #define gettext_noop(String) String
61 /* The `emacs' switch turns on certain matching commands
62 that make sense only in Emacs. */
68 /* Make syntax table lookup grant data in gl_state. */
69 #define SYNTAX_ENTRY_VIA_PROPERTY
75 #define malloc xmalloc
76 #define realloc xrealloc
79 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
80 #define RE_STRING_CHAR(p, s) \
81 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
82 #define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
83 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
85 /* Set C a (possibly multibyte) character before P. P points into a
86 string which is the virtual concatenation of STR1 (which ends at
87 END1) or STR2 (which ends at END2). */
88 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
92 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
93 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
94 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
95 c = STRING_CHAR (dtemp, (p) - dtemp); \
98 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
102 #else /* not emacs */
104 /* If we are not linking with Emacs proper,
105 we can't use the relocating allocator
106 even if config.h says that we can. */
109 #if defined (STDC_HEADERS) || defined (_LIBC)
116 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
117 If nothing else has been done, use the method below. */
118 #ifdef INHIBIT_STRING_HEADER
119 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
120 #if !defined (bzero) && !defined (bcopy)
121 #undef INHIBIT_STRING_HEADER
126 /* This is the normal way of making sure we have a bcopy and a bzero.
127 This is used in most programs--a few other programs avoid this
128 by defining INHIBIT_STRING_HEADER. */
129 #ifndef INHIBIT_STRING_HEADER
130 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
133 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
136 #define bcopy(s, d, n) memcpy ((d), (s), (n))
139 #define bzero(s, n) memset ((s), 0, (n))
146 /* Define the syntax stuff for \<, \>, etc. */
148 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
149 enum syntaxcode
{ Swhitespace
= 0, Sword
= 1 };
151 #ifdef SWITCH_ENUM_BUG
152 #define SWITCH_ENUM_CAST(x) ((int)(x))
154 #define SWITCH_ENUM_CAST(x) (x)
159 extern char *re_syntax_table
;
161 #else /* not SYNTAX_TABLE */
163 /* How many characters in the character set. */
164 #define CHAR_SET_SIZE 256
166 static char re_syntax_table
[CHAR_SET_SIZE
];
177 bzero (re_syntax_table
, sizeof re_syntax_table
);
179 for (c
= 'a'; c
<= 'z'; c
++)
180 re_syntax_table
[c
] = Sword
;
182 for (c
= 'A'; c
<= 'Z'; c
++)
183 re_syntax_table
[c
] = Sword
;
185 for (c
= '0'; c
<= '9'; c
++)
186 re_syntax_table
[c
] = Sword
;
188 re_syntax_table
['_'] = Sword
;
193 #endif /* not SYNTAX_TABLE */
195 #define SYNTAX(c) re_syntax_table[c]
197 /* Dummy macros for non-Emacs environments. */
198 #define BASE_LEADING_CODE_P(c) (0)
199 #define CHAR_CHARSET(c) 0
200 #define CHARSET_LEADING_CODE_BASE(c) 0
201 #define MAX_MULTIBYTE_LENGTH 1
202 #define RE_MULTIBYTE_P(x) 0
203 #define WORD_BOUNDARY_P(c1, c2) (0)
204 #define CHAR_HEAD_P(p) (1)
205 #define SINGLE_BYTE_CHAR_P(c) (1)
206 #define SAME_CHARSET_P(c1, c2) (1)
207 #define MULTIBYTE_FORM_LENGTH(p, s) (1)
208 #define STRING_CHAR(p, s) (*(p))
209 #define RE_STRING_CHAR STRING_CHAR
210 #define CHAR_STRING(c, s) (*(s) = (c), 1)
211 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
212 #define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
213 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
214 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
215 #define MAKE_CHAR(charset, c1, c2) (c1)
216 #endif /* not emacs */
219 #define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
220 #define RE_TRANSLATE_P(TBL) (TBL)
223 /* Get the interface, including the syntax bits. */
226 /* isalpha etc. are used for the character classes. */
231 /* 1 if C is an ASCII character. */
232 #define IS_REAL_ASCII(c) ((c) < 0200)
234 /* 1 if C is a unibyte character. */
235 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
237 /* The Emacs definitions should not be directly affected by locales. */
239 /* In Emacs, these are only used for single-byte characters. */
240 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
241 #define ISCNTRL(c) ((c) < ' ')
242 #define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
243 || ((c) >= 'a' && (c) <= 'f') \
244 || ((c) >= 'A' && (c) <= 'F'))
246 /* This is only used for single-byte characters. */
247 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
249 /* The rest must handle multibyte characters. */
251 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
252 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
255 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
256 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
259 #define ISALNUM(c) (IS_REAL_ASCII (c) \
260 ? (((c) >= 'a' && (c) <= 'z') \
261 || ((c) >= 'A' && (c) <= 'Z') \
262 || ((c) >= '0' && (c) <= '9')) \
263 : SYNTAX (c) == Sword)
265 #define ISALPHA(c) (IS_REAL_ASCII (c) \
266 ? (((c) >= 'a' && (c) <= 'z') \
267 || ((c) >= 'A' && (c) <= 'Z')) \
268 : SYNTAX (c) == Sword)
270 #define ISLOWER(c) (LOWERCASEP (c))
272 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
273 ? ((c) > ' ' && (c) < 0177 \
274 && !(((c) >= 'a' && (c) <= 'z') \
275 || ((c) >= 'A' && (c) <= 'Z') \
276 || ((c) >= '0' && (c) <= '9'))) \
277 : SYNTAX (c) != Sword)
279 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
281 #define ISUPPER(c) (UPPERCASEP (c))
283 #define ISWORD(c) (SYNTAX (c) == Sword)
285 #else /* not emacs */
287 /* Jim Meyering writes:
289 "... Some ctype macros are valid only for character codes that
290 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
291 using /bin/cc or gcc but without giving an ansi option). So, all
292 ctype uses should be through macros like ISPRINT... If
293 STDC_HEADERS is defined, then autoconf has verified that the ctype
294 macros don't need to be guarded with references to isascii. ...
295 Defining isascii to 1 should let any compiler worth its salt
296 eliminate the && through constant folding." */
298 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
301 #define ISASCII(c) isascii(c)
304 /* 1 if C is an ASCII character. */
305 #define IS_REAL_ASCII(c) ((c) < 0200)
307 /* This distinction is not meaningful, except in Emacs. */
308 #define ISUNIBYTE(c) 1
310 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
311 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
312 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
315 #define ISBLANK(c) (ISASCII (c) && isblank (c))
317 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
320 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
322 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
325 #define ISPRINT(c) (ISASCII (c) && isprint (c))
326 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
327 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
328 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
329 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
330 #define ISLOWER(c) (ISASCII (c) && islower (c))
331 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
332 #define ISSPACE(c) (ISASCII (c) && isspace (c))
333 #define ISUPPER(c) (ISASCII (c) && isupper (c))
334 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
336 #define ISWORD(c) ISALPHA(c)
338 #endif /* not emacs */
341 #define NULL (void *)0
344 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
345 since ours (we hope) works properly with all combinations of
346 machines, compilers, `char' and `unsigned char' argument types.
347 (Per Bothner suggested the basic approach.) */
348 #undef SIGN_EXTEND_CHAR
350 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
351 #else /* not __STDC__ */
352 /* As in Harbison and Steele. */
353 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
356 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
357 use `alloca' instead of `malloc'. This is because using malloc in
358 re_search* or re_match* could cause memory leaks when C-g is used in
359 Emacs; also, malloc is slower and causes storage fragmentation. On
360 the other hand, malloc is more portable, and easier to debug.
362 Because we sometimes use alloca, some routines have to be macros,
363 not functions -- `alloca'-allocated space disappears at the end of the
364 function it is called in. */
368 #define REGEX_ALLOCATE malloc
369 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
370 #define REGEX_FREE free
372 #else /* not REGEX_MALLOC */
374 /* Emacs already defines alloca, sometimes. */
377 /* Make alloca work the best possible way. */
379 #define alloca __builtin_alloca
380 #else /* not __GNUC__ */
383 #else /* not __GNUC__ or HAVE_ALLOCA_H */
384 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */
385 #ifndef _AIX /* Already did AIX, up at the top. */
387 #endif /* not _AIX */
389 #endif /* not HAVE_ALLOCA_H */
390 #endif /* not __GNUC__ */
392 #endif /* not alloca */
394 #define REGEX_ALLOCATE alloca
396 /* Assumes a `char *destination' variable. */
397 #define REGEX_REALLOCATE(source, osize, nsize) \
398 (destination = (char *) alloca (nsize), \
399 bcopy (source, destination, osize), \
402 /* No need to do anything to free, after alloca. */
403 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
405 #endif /* not REGEX_MALLOC */
407 /* Define how to allocate the failure stack. */
409 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
411 #define REGEX_ALLOCATE_STACK(size) \
412 r_alloc (&failure_stack_ptr, (size))
413 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
414 r_re_alloc (&failure_stack_ptr, (nsize))
415 #define REGEX_FREE_STACK(ptr) \
416 r_alloc_free (&failure_stack_ptr)
418 #else /* not using relocating allocator */
422 #define REGEX_ALLOCATE_STACK malloc
423 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
424 #define REGEX_FREE_STACK free
426 #else /* not REGEX_MALLOC */
428 #define REGEX_ALLOCATE_STACK alloca
430 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
431 REGEX_REALLOCATE (source, osize, nsize)
432 /* No need to explicitly free anything. */
433 #define REGEX_FREE_STACK(arg) ((void)0)
435 #endif /* not REGEX_MALLOC */
436 #endif /* not using relocating allocator */
439 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
440 `string1' or just past its end. This works if PTR is NULL, which is
442 #define FIRST_STRING_P(ptr) \
443 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
445 /* (Re)Allocate N items of type T using malloc, or fail. */
446 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
447 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
448 #define RETALLOC_IF(addr, n, t) \
449 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
450 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
452 #define BYTEWIDTH 8 /* In bits. */
454 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
458 #define MAX(a, b) ((a) > (b) ? (a) : (b))
459 #define MIN(a, b) ((a) < (b) ? (a) : (b))
461 /* Type of source-pattern and string chars. */
462 typedef const unsigned char re_char
;
464 typedef char boolean
;
468 static int re_match_2_internal ();
470 /* These are the command codes that appear in compiled regular
471 expressions. Some opcodes are followed by argument bytes. A
472 command code can specify any interpretation whatsoever for its
473 arguments. Zero bytes may appear in the compiled regular expression. */
479 /* Succeed right away--no more backtracking. */
482 /* Followed by one byte giving n, then by n literal bytes. */
485 /* Matches any (more or less) character. */
488 /* Matches any one char belonging to specified set. First
489 following byte is number of bitmap bytes. Then come bytes
490 for a bitmap saying which chars are in. Bits in each byte
491 are ordered low-bit-first. A character is in the set if its
492 bit is 1. A character too large to have a bit in the map is
493 automatically not in the set.
495 If the length byte has the 0x80 bit set, then that stuff
496 is followed by a range table:
497 2 bytes of flags for character sets (low 8 bits, high 8 bits)
498 See RANGE_TABLE_WORK_BITS below.
499 2 bytes, the number of pairs that follow
500 pairs, each 2 multibyte characters,
501 each multibyte character represented as 3 bytes. */
504 /* Same parameters as charset, but match any character that is
505 not one of those specified. */
508 /* Start remembering the text that is matched, for storing in a
509 register. Followed by one byte with the register number, in
510 the range 0 to one less than the pattern buffer's re_nsub
514 /* Stop remembering the text that is matched and store it in a
515 memory register. Followed by one byte with the register
516 number, in the range 0 to one less than `re_nsub' in the
520 /* Match a duplicate of something remembered. Followed by one
521 byte containing the register number. */
524 /* Fail unless at beginning of line. */
527 /* Fail unless at end of line. */
530 /* Succeeds if at beginning of buffer (if emacs) or at beginning
531 of string to be matched (if not). */
534 /* Analogously, for end of buffer/string. */
537 /* Followed by two byte relative address to which to jump. */
540 /* Followed by two-byte relative address of place to resume at
541 in case of failure. */
544 /* Like on_failure_jump, but pushes a placeholder instead of the
545 current string position when executed. */
546 on_failure_keep_string_jump
,
548 /* Just like `on_failure_jump', except that it checks that we
549 don't get stuck in an infinite loop (matching an empty string
551 on_failure_jump_loop
,
553 /* Just like `on_failure_jump_loop', except that it checks for
554 a different kind of loop (the kind that shows up with non-greedy
555 operators). This operation has to be immediately preceded
557 on_failure_jump_nastyloop
,
559 /* A smart `on_failure_jump' used for greedy * and + operators.
560 It analyses the loop before which it is put and if the
561 loop does not require backtracking, it changes itself to
562 `on_failure_keep_string_jump' and short-circuits the loop,
563 else it just defaults to changing itself into `on_failure_jump'.
564 It assumes that it is pointing to just past a `jump'. */
565 on_failure_jump_smart
,
567 /* Followed by two-byte relative address and two-byte number n.
568 After matching N times, jump to the address upon failure.
569 Does not work if N starts at 0: use on_failure_jump_loop
573 /* Followed by two-byte relative address, and two-byte number n.
574 Jump to the address N times, then fail. */
577 /* Set the following two-byte relative address to the
578 subsequent two-byte number. The address *includes* the two
582 wordbeg
, /* Succeeds if at word beginning. */
583 wordend
, /* Succeeds if at word end. */
585 wordbound
, /* Succeeds if at a word boundary. */
586 notwordbound
, /* Succeeds if not at a word boundary. */
588 /* Matches any character whose syntax is specified. Followed by
589 a byte which contains a syntax code, e.g., Sword. */
592 /* Matches any character whose syntax is not that specified. */
596 ,before_dot
, /* Succeeds if before point. */
597 at_dot
, /* Succeeds if at point. */
598 after_dot
, /* Succeeds if after point. */
600 /* Matches any character whose category-set contains the specified
601 category. The operator is followed by a byte which contains a
602 category code (mnemonic ASCII character). */
605 /* Matches any character whose category-set does not contain the
606 specified category. The operator is followed by a byte which
607 contains the category code (mnemonic ASCII character). */
612 /* Common operations on the compiled pattern. */
614 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
616 #define STORE_NUMBER(destination, number) \
618 (destination)[0] = (number) & 0377; \
619 (destination)[1] = (number) >> 8; \
622 /* Same as STORE_NUMBER, except increment DESTINATION to
623 the byte after where the number is stored. Therefore, DESTINATION
624 must be an lvalue. */
626 #define STORE_NUMBER_AND_INCR(destination, number) \
628 STORE_NUMBER (destination, number); \
629 (destination) += 2; \
632 /* Put into DESTINATION a number stored in two contiguous bytes starting
635 #define EXTRACT_NUMBER(destination, source) \
637 (destination) = *(source) & 0377; \
638 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
643 extract_number (dest
, source
)
645 unsigned char *source
;
647 int temp
= SIGN_EXTEND_CHAR (*(source
+ 1));
648 *dest
= *source
& 0377;
652 #ifndef EXTRACT_MACROS /* To debug the macros. */
653 #undef EXTRACT_NUMBER
654 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
655 #endif /* not EXTRACT_MACROS */
659 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
660 SOURCE must be an lvalue. */
662 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
664 EXTRACT_NUMBER (destination, source); \
670 extract_number_and_incr (destination
, source
)
672 unsigned char **source
;
674 extract_number (destination
, *source
);
678 #ifndef EXTRACT_MACROS
679 #undef EXTRACT_NUMBER_AND_INCR
680 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
681 extract_number_and_incr (&dest, &src)
682 #endif /* not EXTRACT_MACROS */
686 /* Store a multibyte character in three contiguous bytes starting
687 DESTINATION, and increment DESTINATION to the byte after where the
688 character is stored. Therefore, DESTINATION must be an lvalue. */
690 #define STORE_CHARACTER_AND_INCR(destination, character) \
692 (destination)[0] = (character) & 0377; \
693 (destination)[1] = ((character) >> 8) & 0377; \
694 (destination)[2] = (character) >> 16; \
695 (destination) += 3; \
698 /* Put into DESTINATION a character stored in three contiguous bytes
699 starting at SOURCE. */
701 #define EXTRACT_CHARACTER(destination, source) \
703 (destination) = ((source)[0] \
704 | ((source)[1] << 8) \
705 | ((source)[2] << 16)); \
709 /* Macros for charset. */
711 /* Size of bitmap of charset P in bytes. P is a start of charset,
712 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
713 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
715 /* Nonzero if charset P has range table. */
716 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
718 /* Return the address of range table of charset P. But not the start
719 of table itself, but the before where the number of ranges is
720 stored. `2 +' means to skip re_opcode_t and size of bitmap,
721 and the 2 bytes of flags at the start of the range table. */
722 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
724 /* Extract the bit flags that start a range table. */
725 #define CHARSET_RANGE_TABLE_BITS(p) \
726 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
727 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
729 /* Test if C is listed in the bitmap of charset P. */
730 #define CHARSET_LOOKUP_BITMAP(p, c) \
731 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
732 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
734 /* Return the address of end of RANGE_TABLE. COUNT is number of
735 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
736 is start of range and end of range. `* 3' is size of each start
738 #define CHARSET_RANGE_TABLE_END(range_table, count) \
739 ((range_table) + (count) * 2 * 3)
741 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
742 COUNT is number of ranges in RANGE_TABLE. */
743 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
746 int range_start, range_end; \
748 unsigned char *range_table_end \
749 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
751 for (p = (range_table); p < range_table_end; p += 2 * 3) \
753 EXTRACT_CHARACTER (range_start, p); \
754 EXTRACT_CHARACTER (range_end, p + 3); \
756 if (range_start <= (c) && (c) <= range_end) \
765 /* Test if C is in range table of CHARSET. The flag NOT is negated if
766 C is listed in it. */
767 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
770 /* Number of ranges in range table. */ \
772 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
774 EXTRACT_NUMBER_AND_INCR (count, range_table); \
775 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
779 /* If DEBUG is defined, Regex prints many voluminous messages about what
780 it is doing (if the variable `debug' is nonzero). If linked with the
781 main program in `iregex.c', you can enter patterns and strings
782 interactively. And if linked with the main program in `main.c' and
783 the other test files, you can run the already-written tests. */
787 /* We use standard I/O for debugging. */
790 /* It is useful to test things that ``must'' be true when debugging. */
793 static int debug
= -100000;
795 #define DEBUG_STATEMENT(e) e
796 #define DEBUG_PRINT1(x) if (debug > 0) printf (x)
797 #define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
798 #define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
799 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
800 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
801 if (debug > 0) print_partial_compiled_pattern (s, e)
802 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
803 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
806 /* Print the fastmap in human-readable form. */
809 print_fastmap (fastmap
)
812 unsigned was_a_range
= 0;
815 while (i
< (1 << BYTEWIDTH
))
821 while (i
< (1 << BYTEWIDTH
) && fastmap
[i
])
837 /* Print a compiled pattern string in human-readable form, starting at
838 the START pointer into it and ending just before the pointer END. */
841 print_partial_compiled_pattern (start
, end
)
842 unsigned char *start
;
846 unsigned char *p
= start
;
847 unsigned char *pend
= end
;
855 /* Loop over pattern commands. */
858 printf ("%d:\t", p
- start
);
860 switch ((re_opcode_t
) *p
++)
872 printf ("/exactn/%d", mcnt
);
882 printf ("/start_memory/%d", *p
++);
886 printf ("/stop_memory/%d", *p
++);
890 printf ("/duplicate/%d", *p
++);
900 register int c
, last
= -100;
901 register int in_range
= 0;
902 int length
= CHARSET_BITMAP_SIZE (p
- 1);
903 int has_range_table
= CHARSET_RANGE_TABLE_EXISTS_P (p
- 1);
905 printf ("/charset [%s",
906 (re_opcode_t
) *(p
- 1) == charset_not
? "^" : "");
908 assert (p
+ *p
< pend
);
910 for (c
= 0; c
< 256; c
++)
912 && (p
[1 + (c
/8)] & (1 << (c
% 8))))
914 /* Are we starting a range? */
915 if (last
+ 1 == c
&& ! in_range
)
920 /* Have we broken a range? */
921 else if (last
+ 1 != c
&& in_range
)
943 printf ("has-range-table");
945 /* ??? Should print the range table; for now, just skip it. */
946 p
+= 2; /* skip range table bits */
947 EXTRACT_NUMBER_AND_INCR (count
, p
);
948 p
= CHARSET_RANGE_TABLE_END (p
, count
);
961 case on_failure_jump
:
962 extract_number_and_incr (&mcnt
, &p
);
963 printf ("/on_failure_jump to %d", p
+ mcnt
- start
);
966 case on_failure_keep_string_jump
:
967 extract_number_and_incr (&mcnt
, &p
);
968 printf ("/on_failure_keep_string_jump to %d", p
+ mcnt
- start
);
971 case on_failure_jump_nastyloop
:
972 extract_number_and_incr (&mcnt
, &p
);
973 printf ("/on_failure_jump_nastyloop to %d", p
+ mcnt
- start
);
976 case on_failure_jump_loop
:
977 extract_number_and_incr (&mcnt
, &p
);
978 printf ("/on_failure_jump_loop to %d", p
+ mcnt
- start
);
981 case on_failure_jump_smart
:
982 extract_number_and_incr (&mcnt
, &p
);
983 printf ("/on_failure_jump_smart to %d", p
+ mcnt
- start
);
987 extract_number_and_incr (&mcnt
, &p
);
988 printf ("/jump to %d", p
+ mcnt
- start
);
992 extract_number_and_incr (&mcnt
, &p
);
993 extract_number_and_incr (&mcnt2
, &p
);
994 printf ("/succeed_n to %d, %d times", p
- 2 + mcnt
- start
, mcnt2
);
998 extract_number_and_incr (&mcnt
, &p
);
999 extract_number_and_incr (&mcnt2
, &p
);
1000 printf ("/jump_n to %d, %d times", p
- 2 + mcnt
- start
, mcnt2
);
1004 extract_number_and_incr (&mcnt
, &p
);
1005 extract_number_and_incr (&mcnt2
, &p
);
1006 printf ("/set_number_at location %d to %d", p
- 2 + mcnt
- start
, mcnt2
);
1010 printf ("/wordbound");
1014 printf ("/notwordbound");
1018 printf ("/wordbeg");
1022 printf ("/wordend");
1025 printf ("/syntaxspec");
1027 printf ("/%d", mcnt
);
1031 printf ("/notsyntaxspec");
1033 printf ("/%d", mcnt
);
1038 printf ("/before_dot");
1046 printf ("/after_dot");
1050 printf ("/categoryspec");
1052 printf ("/%d", mcnt
);
1055 case notcategoryspec
:
1056 printf ("/notcategoryspec");
1058 printf ("/%d", mcnt
);
1071 printf ("?%d", *(p
-1));
1077 printf ("%d:\tend of pattern.\n", p
- start
);
1082 print_compiled_pattern (bufp
)
1083 struct re_pattern_buffer
*bufp
;
1085 unsigned char *buffer
= bufp
->buffer
;
1087 print_partial_compiled_pattern (buffer
, buffer
+ bufp
->used
);
1088 printf ("%ld bytes used/%ld bytes allocated.\n", bufp
->used
, bufp
->allocated
);
1090 if (bufp
->fastmap_accurate
&& bufp
->fastmap
)
1092 printf ("fastmap: ");
1093 print_fastmap (bufp
->fastmap
);
1096 printf ("re_nsub: %d\t", bufp
->re_nsub
);
1097 printf ("regs_alloc: %d\t", bufp
->regs_allocated
);
1098 printf ("can_be_null: %d\t", bufp
->can_be_null
);
1099 printf ("newline_anchor: %d\n", bufp
->newline_anchor
);
1100 printf ("no_sub: %d\t", bufp
->no_sub
);
1101 printf ("not_bol: %d\t", bufp
->not_bol
);
1102 printf ("not_eol: %d\t", bufp
->not_eol
);
1103 printf ("syntax: %d\n", bufp
->syntax
);
1105 /* Perhaps we should print the translate table? */
1110 print_double_string (where
, string1
, size1
, string2
, size2
)
1123 if (FIRST_STRING_P (where
))
1125 for (this_char
= where
- string1
; this_char
< size1
; this_char
++)
1126 putchar (string1
[this_char
]);
1131 for (this_char
= where
- string2
; this_char
< size2
; this_char
++)
1132 putchar (string2
[this_char
]);
1136 #else /* not DEBUG */
1141 #define DEBUG_STATEMENT(e)
1142 #define DEBUG_PRINT1(x)
1143 #define DEBUG_PRINT2(x1, x2)
1144 #define DEBUG_PRINT3(x1, x2, x3)
1145 #define DEBUG_PRINT4(x1, x2, x3, x4)
1146 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1147 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1149 #endif /* not DEBUG */
1151 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1152 also be assigned to arbitrarily: each pattern buffer stores its own
1153 syntax, so it can be changed between regex compilations. */
1154 /* This has no initializer because initialized variables in Emacs
1155 become read-only after dumping. */
1156 reg_syntax_t re_syntax_options
;
1159 /* Specify the precise syntax of regexps for compilation. This provides
1160 for compatibility for various utilities which historically have
1161 different, incompatible syntaxes.
1163 The argument SYNTAX is a bit mask comprised of the various bits
1164 defined in regex.h. We return the old syntax. */
1167 re_set_syntax (syntax
)
1168 reg_syntax_t syntax
;
1170 reg_syntax_t ret
= re_syntax_options
;
1172 re_syntax_options
= syntax
;
1176 /* This table gives an error message for each of the error codes listed
1177 in regex.h. Obviously the order here has to be same as there.
1178 POSIX doesn't require that we do anything for REG_NOERROR,
1179 but why not be nice? */
1181 static const char *re_error_msgid
[] =
1183 gettext_noop ("Success"), /* REG_NOERROR */
1184 gettext_noop ("No match"), /* REG_NOMATCH */
1185 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1186 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1187 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1188 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1189 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1190 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1191 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1192 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1193 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1194 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1195 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1196 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1197 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1198 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1199 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1202 /* Avoiding alloca during matching, to placate r_alloc. */
1204 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1205 searching and matching functions should not call alloca. On some
1206 systems, alloca is implemented in terms of malloc, and if we're
1207 using the relocating allocator routines, then malloc could cause a
1208 relocation, which might (if the strings being searched are in the
1209 ralloc heap) shift the data out from underneath the regexp
1212 Here's another reason to avoid allocation: Emacs
1213 processes input from X in a signal handler; processing X input may
1214 call malloc; if input arrives while a matching routine is calling
1215 malloc, then we're scrod. But Emacs can't just block input while
1216 calling matching routines; then we don't notice interrupts when
1217 they come in. So, Emacs blocks input around all regexp calls
1218 except the matching calls, which it leaves unprotected, in the
1219 faith that they will not malloc. */
1221 /* Normally, this is fine. */
1222 #define MATCH_MAY_ALLOCATE
1224 /* When using GNU C, we are not REALLY using the C alloca, no matter
1225 what config.h may say. So don't take precautions for it. */
1230 /* The match routines may not allocate if (1) they would do it with malloc
1231 and (2) it's not safe for them to use malloc.
1232 Note that if REL_ALLOC is defined, matching would not use malloc for the
1233 failure stack, but we would still use it for the register vectors;
1234 so REL_ALLOC should not affect this. */
1235 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1236 #undef MATCH_MAY_ALLOCATE
1240 /* Failure stack declarations and macros; both re_compile_fastmap and
1241 re_match_2 use a failure stack. These have to be macros because of
1242 REGEX_ALLOCATE_STACK. */
1245 /* Approximate number of failure points for which to initially allocate space
1246 when matching. If this number is exceeded, we allocate more
1247 space, so it is not a hard limit. */
1248 #ifndef INIT_FAILURE_ALLOC
1249 #define INIT_FAILURE_ALLOC 20
1252 /* Roughly the maximum number of failure points on the stack. Would be
1253 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1254 This is a variable only so users of regex can assign to it; we never
1255 change it ourselves. */
1256 #if defined (MATCH_MAY_ALLOCATE)
1257 /* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1258 whose default stack limit is 2mb. In order for a larger
1259 value to work reliably, you have to try to make it accord
1260 with the process stack limit. */
1261 int re_max_failures
= 40000;
1263 int re_max_failures
= 4000;
1266 union fail_stack_elt
1268 const unsigned char *pointer
;
1269 unsigned int integer
;
1272 typedef union fail_stack_elt fail_stack_elt_t
;
1276 fail_stack_elt_t
*stack
;
1278 unsigned avail
; /* Offset of next open position. */
1279 unsigned frame
; /* Offset of the cur constructed frame. */
1282 #define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
1283 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
1284 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1287 /* Define macros to initialize and free the failure stack.
1288 Do `return -2' if the alloc fails. */
1290 #ifdef MATCH_MAY_ALLOCATE
1291 #define INIT_FAIL_STACK() \
1293 fail_stack.stack = (fail_stack_elt_t *) \
1294 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1295 * sizeof (fail_stack_elt_t)); \
1297 if (fail_stack.stack == NULL) \
1300 fail_stack.size = INIT_FAILURE_ALLOC; \
1301 fail_stack.avail = 0; \
1302 fail_stack.frame = 0; \
1305 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1307 #define INIT_FAIL_STACK() \
1309 fail_stack.avail = 0; \
1310 fail_stack.frame = 0; \
1313 #define RESET_FAIL_STACK() ((void)0)
1317 /* Double the size of FAIL_STACK, up to a limit
1318 which allows approximately `re_max_failures' items.
1320 Return 1 if succeeds, and 0 if either ran out of memory
1321 allocating space for it or it was already too large.
1323 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1325 /* Factor to increase the failure stack size by
1326 when we increase it.
1327 This used to be 2, but 2 was too wasteful
1328 because the old discarded stacks added up to as much space
1329 were as ultimate, maximum-size stack. */
1330 #define FAIL_STACK_GROWTH_FACTOR 4
1332 #define GROW_FAIL_STACK(fail_stack) \
1333 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1334 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1336 : ((fail_stack).stack \
1337 = (fail_stack_elt_t *) \
1338 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1339 (fail_stack).size * sizeof (fail_stack_elt_t), \
1340 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1341 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1342 * FAIL_STACK_GROWTH_FACTOR))), \
1344 (fail_stack).stack == NULL \
1346 : ((fail_stack).size \
1347 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1348 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1349 * FAIL_STACK_GROWTH_FACTOR)) \
1350 / sizeof (fail_stack_elt_t)), \
1354 /* Push pointer POINTER on FAIL_STACK.
1355 Return 1 if was able to do so and 0 if ran out of memory allocating
1357 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1358 ((FAIL_STACK_FULL () \
1359 && !GROW_FAIL_STACK (FAIL_STACK)) \
1361 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1363 #define POP_PATTERN_OP() POP_FAILURE_POINTER ()
1365 /* Push a pointer value onto the failure stack.
1366 Assumes the variable `fail_stack'. Probably should only
1367 be called from within `PUSH_FAILURE_POINT'. */
1368 #define PUSH_FAILURE_POINTER(item) \
1369 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1371 /* This pushes an integer-valued item onto the failure stack.
1372 Assumes the variable `fail_stack'. Probably should only
1373 be called from within `PUSH_FAILURE_POINT'. */
1374 #define PUSH_FAILURE_INT(item) \
1375 fail_stack.stack[fail_stack.avail++].integer = (item)
1377 /* Push a fail_stack_elt_t value onto the failure stack.
1378 Assumes the variable `fail_stack'. Probably should only
1379 be called from within `PUSH_FAILURE_POINT'. */
1380 #define PUSH_FAILURE_ELT(item) \
1381 fail_stack.stack[fail_stack.avail++] = (item)
1383 /* These three POP... operations complement the three PUSH... operations.
1384 All assume that `fail_stack' is nonempty. */
1385 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1386 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1387 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1389 /* Individual items aside from the registers. */
1390 #define NUM_NONREG_ITEMS 3
1392 /* Used to examine the stack (to detect infinite loops). */
1393 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
1394 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
1395 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1396 #define TOP_FAILURE_HANDLE() fail_stack.frame
1399 #define ENSURE_FAIL_STACK(space) \
1400 while (REMAINING_AVAIL_SLOTS <= space) { \
1401 if (!GROW_FAIL_STACK (fail_stack)) \
1403 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1404 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1407 /* Push register NUM onto the stack. */
1408 #define PUSH_FAILURE_REG(num) \
1410 char *destination; \
1411 ENSURE_FAIL_STACK(3); \
1412 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1413 num, regstart[num], regend[num]); \
1414 PUSH_FAILURE_POINTER (regstart[num]); \
1415 PUSH_FAILURE_POINTER (regend[num]); \
1416 PUSH_FAILURE_INT (num); \
1419 /* Pop a saved register off the stack. */
1420 #define POP_FAILURE_REG() \
1422 int reg = POP_FAILURE_INT (); \
1423 regend[reg] = POP_FAILURE_POINTER (); \
1424 regstart[reg] = POP_FAILURE_POINTER (); \
1425 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1426 reg, regstart[reg], regend[reg]); \
1429 /* Check that we are not stuck in an infinite loop. */
1430 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1432 int failure = TOP_FAILURE_HANDLE(); \
1433 /* Check for infinite matching loops */ \
1434 while (failure > 0 && \
1435 (FAILURE_STR (failure) == string_place \
1436 || FAILURE_STR (failure) == NULL)) \
1438 assert (FAILURE_PAT (failure) >= bufp->buffer \
1439 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1440 if (FAILURE_PAT (failure) == pat_cur) \
1442 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1443 failure = NEXT_FAILURE_HANDLE(failure); \
1445 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1448 /* Push the information about the state we will need
1449 if we ever fail back to it.
1451 Requires variables fail_stack, regstart, regend and
1452 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1455 Does `return FAILURE_CODE' if runs out of memory. */
1457 #define PUSH_FAILURE_POINT(pattern, string_place) \
1459 char *destination; \
1460 /* Must be int, so when we don't save any registers, the arithmetic \
1461 of 0 + -1 isn't done as unsigned. */ \
1463 DEBUG_STATEMENT (failure_id++); \
1464 DEBUG_STATEMENT (nfailure_points_pushed++); \
1465 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1466 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1467 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1469 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1471 DEBUG_PRINT1 ("\n"); \
1473 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1474 PUSH_FAILURE_INT (fail_stack.frame); \
1476 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1477 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1478 DEBUG_PRINT1 ("'\n"); \
1479 PUSH_FAILURE_POINTER (string_place); \
1481 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1482 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1483 PUSH_FAILURE_POINTER (pattern); \
1485 /* Close the frame by moving the frame pointer past it. */ \
1486 fail_stack.frame = fail_stack.avail; \
1489 /* Estimate the size of data pushed by a typical failure stack entry.
1490 An estimate is all we need, because all we use this for
1491 is to choose a limit for how big to make the failure stack. */
1493 #define TYPICAL_FAILURE_SIZE 20
1495 /* How many items can still be added to the stack without overflowing it. */
1496 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1499 /* Pops what PUSH_FAIL_STACK pushes.
1501 We restore into the parameters, all of which should be lvalues:
1502 STR -- the saved data position.
1503 PAT -- the saved pattern position.
1504 REGSTART, REGEND -- arrays of string positions.
1506 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1507 `pend', `string1', `size1', `string2', and `size2'. */
1509 #define POP_FAILURE_POINT(str, pat) \
1511 assert (!FAIL_STACK_EMPTY ()); \
1513 /* Remove failure points and point to how many regs pushed. */ \
1514 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1515 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1516 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1518 /* Pop the saved registers. */ \
1519 while (fail_stack.frame < fail_stack.avail) \
1520 POP_FAILURE_REG (); \
1522 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1523 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1524 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1526 /* If the saved string location is NULL, it came from an \
1527 on_failure_keep_string_jump opcode, and we want to throw away the \
1528 saved NULL, thus retaining our current position in the string. */ \
1529 str = (re_char *) POP_FAILURE_POINTER (); \
1530 DEBUG_PRINT2 (" Popping string %p: `", str); \
1531 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1532 DEBUG_PRINT1 ("'\n"); \
1534 fail_stack.frame = POP_FAILURE_INT (); \
1535 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
1537 assert (fail_stack.avail >= 0); \
1538 assert (fail_stack.frame <= fail_stack.avail); \
1540 DEBUG_STATEMENT (nfailure_points_popped++); \
1541 } while (0) /* POP_FAILURE_POINT */
1545 /* Registers are set to a sentinel when they haven't yet matched. */
1546 #define REG_UNSET_VALUE NULL
1547 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1549 /* Subroutine declarations and macros for regex_compile. */
1551 static void store_op1
_RE_ARGS((re_opcode_t op
, unsigned char *loc
, int arg
));
1552 static void store_op2
_RE_ARGS((re_opcode_t op
, unsigned char *loc
,
1553 int arg1
, int arg2
));
1554 static void insert_op1
_RE_ARGS((re_opcode_t op
, unsigned char *loc
,
1555 int arg
, unsigned char *end
));
1556 static void insert_op2
_RE_ARGS((re_opcode_t op
, unsigned char *loc
,
1557 int arg1
, int arg2
, unsigned char *end
));
1558 static boolean at_begline_loc_p
_RE_ARGS((const unsigned char *pattern
,
1559 const unsigned char *p
,
1560 reg_syntax_t syntax
));
1561 static boolean at_endline_loc_p
_RE_ARGS((const unsigned char *p
,
1562 const unsigned char *pend
,
1563 reg_syntax_t syntax
));
1564 static unsigned char *skip_one_char
_RE_ARGS((unsigned char *p
));
1565 static int analyse_first
_RE_ARGS((unsigned char *p
, unsigned char *pend
,
1566 char *fastmap
, const int multibyte
));
1568 /* Fetch the next character in the uncompiled pattern---translating it
1569 if necessary. Also cast from a signed character in the constant
1570 string passed to us by the user to an unsigned char that we can use
1571 as an array index (in, e.g., `translate'). */
1572 #define PATFETCH(c) \
1575 c = TRANSLATE (c); \
1578 /* Fetch the next character in the uncompiled pattern, with no
1580 #define PATFETCH_RAW(c) \
1583 if (p == pend) return REG_EEND; \
1584 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1589 /* If `translate' is non-null, return translate[D], else just D. We
1590 cast the subscript to translate because some data is declared as
1591 `char *', to avoid warnings when a string constant is passed. But
1592 when we use a character as a subscript we must make it unsigned. */
1594 #define TRANSLATE(d) \
1595 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1599 /* Macros for outputting the compiled pattern into `buffer'. */
1601 /* If the buffer isn't allocated when it comes in, use this. */
1602 #define INIT_BUF_SIZE 32
1604 /* Make sure we have at least N more bytes of space in buffer. */
1605 #define GET_BUFFER_SPACE(n) \
1606 while (b - bufp->buffer + (n) > bufp->allocated) \
1609 /* Make sure we have one more byte of buffer space and then add C to it. */
1610 #define BUF_PUSH(c) \
1612 GET_BUFFER_SPACE (1); \
1613 *b++ = (unsigned char) (c); \
1617 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1618 #define BUF_PUSH_2(c1, c2) \
1620 GET_BUFFER_SPACE (2); \
1621 *b++ = (unsigned char) (c1); \
1622 *b++ = (unsigned char) (c2); \
1626 /* As with BUF_PUSH_2, except for three bytes. */
1627 #define BUF_PUSH_3(c1, c2, c3) \
1629 GET_BUFFER_SPACE (3); \
1630 *b++ = (unsigned char) (c1); \
1631 *b++ = (unsigned char) (c2); \
1632 *b++ = (unsigned char) (c3); \
1636 /* Store a jump with opcode OP at LOC to location TO. We store a
1637 relative address offset by the three bytes the jump itself occupies. */
1638 #define STORE_JUMP(op, loc, to) \
1639 store_op1 (op, loc, (to) - (loc) - 3)
1641 /* Likewise, for a two-argument jump. */
1642 #define STORE_JUMP2(op, loc, to, arg) \
1643 store_op2 (op, loc, (to) - (loc) - 3, arg)
1645 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1646 #define INSERT_JUMP(op, loc, to) \
1647 insert_op1 (op, loc, (to) - (loc) - 3, b)
1649 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1650 #define INSERT_JUMP2(op, loc, to, arg) \
1651 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1654 /* This is not an arbitrary limit: the arguments which represent offsets
1655 into the pattern are two bytes long. So if 2^16 bytes turns out to
1656 be too small, many things would have to change. */
1657 #define MAX_BUF_SIZE (1L << 16)
1660 /* Extend the buffer by twice its current size via realloc and
1661 reset the pointers that pointed into the old block to point to the
1662 correct places in the new one. If extending the buffer results in it
1663 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1664 #define EXTEND_BUFFER() \
1666 unsigned char *old_buffer = bufp->buffer; \
1667 if (bufp->allocated == MAX_BUF_SIZE) \
1669 bufp->allocated <<= 1; \
1670 if (bufp->allocated > MAX_BUF_SIZE) \
1671 bufp->allocated = MAX_BUF_SIZE; \
1672 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1673 if (bufp->buffer == NULL) \
1674 return REG_ESPACE; \
1675 /* If the buffer moved, move all the pointers into it. */ \
1676 if (old_buffer != bufp->buffer) \
1678 b = (b - old_buffer) + bufp->buffer; \
1679 begalt = (begalt - old_buffer) + bufp->buffer; \
1680 if (fixup_alt_jump) \
1681 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1683 laststart = (laststart - old_buffer) + bufp->buffer; \
1684 if (pending_exact) \
1685 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1690 /* Since we have one byte reserved for the register number argument to
1691 {start,stop}_memory, the maximum number of groups we can report
1692 things about is what fits in that byte. */
1693 #define MAX_REGNUM 255
1695 /* But patterns can have more than `MAX_REGNUM' registers. We just
1696 ignore the excess. */
1697 typedef unsigned regnum_t
;
1700 /* Macros for the compile stack. */
1702 /* Since offsets can go either forwards or backwards, this type needs to
1703 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1704 typedef int pattern_offset_t
;
1708 pattern_offset_t begalt_offset
;
1709 pattern_offset_t fixup_alt_jump
;
1710 pattern_offset_t laststart_offset
;
1712 } compile_stack_elt_t
;
1717 compile_stack_elt_t
*stack
;
1719 unsigned avail
; /* Offset of next open position. */
1720 } compile_stack_type
;
1723 #define INIT_COMPILE_STACK_SIZE 32
1725 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1726 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1728 /* The next available element. */
1729 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1732 /* Structure to manage work area for range table. */
1733 struct range_table_work_area
1735 int *table
; /* actual work area. */
1736 int allocated
; /* allocated size for work area in bytes. */
1737 int used
; /* actually used size in words. */
1738 int bits
; /* flag to record character classes */
1741 /* Make sure that WORK_AREA can hold more N multibyte characters. */
1742 #define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1744 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1746 (work_area).allocated += 16 * sizeof (int); \
1747 if ((work_area).table) \
1749 = (int *) realloc ((work_area).table, (work_area).allocated); \
1752 = (int *) malloc ((work_area).allocated); \
1753 if ((work_area).table == 0) \
1754 FREE_STACK_RETURN (REG_ESPACE); \
1758 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1759 (work_area).bits |= (bit)
1761 /* These bits represent the various character classes such as [:alnum:]
1762 in a charset's range table. */
1763 #define BIT_ALNUM 0x1
1764 #define BIT_ALPHA 0x2
1765 #define BIT_WORD 0x4
1766 #define BIT_ASCII 0x8
1767 #define BIT_NONASCII 0x10
1768 #define BIT_GRAPH 0x20
1769 #define BIT_LOWER 0x40
1770 #define BIT_PRINT 0x80
1771 #define BIT_PUNCT 0x100
1772 #define BIT_SPACE 0x200
1773 #define BIT_UPPER 0x400
1774 #define BIT_UNIBYTE 0x800
1775 #define BIT_MULTIBYTE 0x1000
1777 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1778 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1780 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1781 (work_area).table[(work_area).used++] = (range_start); \
1782 (work_area).table[(work_area).used++] = (range_end); \
1785 /* Free allocated memory for WORK_AREA. */
1786 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1788 if ((work_area).table) \
1789 free ((work_area).table); \
1792 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1793 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1794 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1795 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1798 /* Set the bit for character C in a list. */
1799 #define SET_LIST_BIT(c) \
1800 (b[((unsigned char) (c)) / BYTEWIDTH] \
1801 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1804 /* Get the next unsigned number in the uncompiled pattern. */
1805 #define GET_UNSIGNED_NUMBER(num) \
1806 do { if (p != pend) \
1809 while (ISDIGIT (c)) \
1813 num = num * 10 + c - '0'; \
1821 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1823 #define IS_CHAR_CLASS(string) \
1824 (STREQ (string, "alpha") || STREQ (string, "upper") \
1825 || STREQ (string, "lower") || STREQ (string, "digit") \
1826 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1827 || STREQ (string, "space") || STREQ (string, "print") \
1828 || STREQ (string, "punct") || STREQ (string, "graph") \
1829 || STREQ (string, "cntrl") || STREQ (string, "blank") \
1830 || STREQ (string, "word") \
1831 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1832 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
1834 /* QUIT is only used on NTemacs. */
1835 #if !defined (WINDOWSNT) || !defined (emacs)
1840 #ifndef MATCH_MAY_ALLOCATE
1842 /* If we cannot allocate large objects within re_match_2_internal,
1843 we make the fail stack and register vectors global.
1844 The fail stack, we grow to the maximum size when a regexp
1846 The register vectors, we adjust in size each time we
1847 compile a regexp, according to the number of registers it needs. */
1849 static fail_stack_type fail_stack
;
1851 /* Size with which the following vectors are currently allocated.
1852 That is so we can make them bigger as needed,
1853 but never make them smaller. */
1854 static int regs_allocated_size
;
1856 static re_char
** regstart
, ** regend
;
1857 static re_char
**best_regstart
, **best_regend
;
1859 /* Make the register vectors big enough for NUM_REGS registers,
1860 but don't make them smaller. */
1863 regex_grow_registers (num_regs
)
1866 if (num_regs
> regs_allocated_size
)
1868 RETALLOC_IF (regstart
, num_regs
, re_char
*);
1869 RETALLOC_IF (regend
, num_regs
, re_char
*);
1870 RETALLOC_IF (best_regstart
, num_regs
, re_char
*);
1871 RETALLOC_IF (best_regend
, num_regs
, re_char
*);
1873 regs_allocated_size
= num_regs
;
1877 #endif /* not MATCH_MAY_ALLOCATE */
1879 static boolean group_in_compile_stack
_RE_ARGS ((compile_stack_type
1883 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1884 Returns one of error codes defined in `regex.h', or zero for success.
1886 Assumes the `allocated' (and perhaps `buffer') and `translate'
1887 fields are set in BUFP on entry.
1889 If it succeeds, results are put in BUFP (if it returns an error, the
1890 contents of BUFP are undefined):
1891 `buffer' is the compiled pattern;
1892 `syntax' is set to SYNTAX;
1893 `used' is set to the length of the compiled pattern;
1894 `fastmap_accurate' is zero;
1895 `re_nsub' is the number of subexpressions in PATTERN;
1896 `not_bol' and `not_eol' are zero;
1898 The `fastmap' and `newline_anchor' fields are neither
1899 examined nor set. */
1901 /* Insert the `jump' from the end of last alternative to "here".
1902 The space for the jump has already been allocated. */
1903 #define FIXUP_ALT_JUMP() \
1905 if (fixup_alt_jump) \
1906 STORE_JUMP (jump, fixup_alt_jump, b); \
1910 /* Return, freeing storage we allocated. */
1911 #define FREE_STACK_RETURN(value) \
1913 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1914 free (compile_stack.stack); \
1918 static reg_errcode_t
1919 regex_compile (pattern
, size
, syntax
, bufp
)
1922 reg_syntax_t syntax
;
1923 struct re_pattern_buffer
*bufp
;
1925 /* We fetch characters from PATTERN here. Even though PATTERN is
1926 `char *' (i.e., signed), we declare these variables as unsigned, so
1927 they can be reliably used as array indices. */
1928 register unsigned int c
, c1
;
1930 /* A random temporary spot in PATTERN. */
1933 /* Points to the end of the buffer, where we should append. */
1934 register unsigned char *b
;
1936 /* Keeps track of unclosed groups. */
1937 compile_stack_type compile_stack
;
1939 /* Points to the current (ending) position in the pattern. */
1941 /* `const' makes AIX compiler fail. */
1942 unsigned char *p
= pattern
;
1944 re_char
*p
= pattern
;
1946 re_char
*pend
= pattern
+ size
;
1948 /* How to translate the characters in the pattern. */
1949 RE_TRANSLATE_TYPE translate
= bufp
->translate
;
1951 /* Address of the count-byte of the most recently inserted `exactn'
1952 command. This makes it possible to tell if a new exact-match
1953 character can be added to that command or if the character requires
1954 a new `exactn' command. */
1955 unsigned char *pending_exact
= 0;
1957 /* Address of start of the most recently finished expression.
1958 This tells, e.g., postfix * where to find the start of its
1959 operand. Reset at the beginning of groups and alternatives. */
1960 unsigned char *laststart
= 0;
1962 /* Address of beginning of regexp, or inside of last group. */
1963 unsigned char *begalt
;
1965 /* Place in the uncompiled pattern (i.e., the {) to
1966 which to go back if the interval is invalid. */
1967 re_char
*beg_interval
;
1969 /* Address of the place where a forward jump should go to the end of
1970 the containing expression. Each alternative of an `or' -- except the
1971 last -- ends with a forward jump of this sort. */
1972 unsigned char *fixup_alt_jump
= 0;
1974 /* Counts open-groups as they are encountered. Remembered for the
1975 matching close-group on the compile stack, so the same register
1976 number is put in the stop_memory as the start_memory. */
1977 regnum_t regnum
= 0;
1979 /* Work area for range table of charset. */
1980 struct range_table_work_area range_table_work
;
1982 /* If the object matched can contain multibyte characters. */
1983 const boolean multibyte
= RE_MULTIBYTE_P (bufp
);
1987 DEBUG_PRINT1 ("\nCompiling pattern: ");
1990 unsigned debug_count
;
1992 for (debug_count
= 0; debug_count
< size
; debug_count
++)
1993 putchar (pattern
[debug_count
]);
1998 /* Initialize the compile stack. */
1999 compile_stack
.stack
= TALLOC (INIT_COMPILE_STACK_SIZE
, compile_stack_elt_t
);
2000 if (compile_stack
.stack
== NULL
)
2003 compile_stack
.size
= INIT_COMPILE_STACK_SIZE
;
2004 compile_stack
.avail
= 0;
2006 range_table_work
.table
= 0;
2007 range_table_work
.allocated
= 0;
2009 /* Initialize the pattern buffer. */
2010 bufp
->syntax
= syntax
;
2011 bufp
->fastmap_accurate
= 0;
2012 bufp
->not_bol
= bufp
->not_eol
= 0;
2014 /* Set `used' to zero, so that if we return an error, the pattern
2015 printer (for debugging) will think there's no pattern. We reset it
2019 /* Always count groups, whether or not bufp->no_sub is set. */
2022 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2023 /* Initialize the syntax table. */
2024 init_syntax_once ();
2027 if (bufp
->allocated
== 0)
2030 { /* If zero allocated, but buffer is non-null, try to realloc
2031 enough space. This loses if buffer's address is bogus, but
2032 that is the user's responsibility. */
2033 RETALLOC (bufp
->buffer
, INIT_BUF_SIZE
, unsigned char);
2036 { /* Caller did not allocate a buffer. Do it for them. */
2037 bufp
->buffer
= TALLOC (INIT_BUF_SIZE
, unsigned char);
2039 if (!bufp
->buffer
) FREE_STACK_RETURN (REG_ESPACE
);
2041 bufp
->allocated
= INIT_BUF_SIZE
;
2044 begalt
= b
= bufp
->buffer
;
2046 /* Loop through the uncompiled pattern until we're at the end. */
2055 if ( /* If at start of pattern, it's an operator. */
2057 /* If context independent, it's an operator. */
2058 || syntax
& RE_CONTEXT_INDEP_ANCHORS
2059 /* Otherwise, depends on what's come before. */
2060 || at_begline_loc_p (pattern
, p
, syntax
))
2070 if ( /* If at end of pattern, it's an operator. */
2072 /* If context independent, it's an operator. */
2073 || syntax
& RE_CONTEXT_INDEP_ANCHORS
2074 /* Otherwise, depends on what's next. */
2075 || at_endline_loc_p (p
, pend
, syntax
))
2085 if ((syntax
& RE_BK_PLUS_QM
)
2086 || (syntax
& RE_LIMITED_OPS
))
2090 /* If there is no previous pattern... */
2093 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2094 FREE_STACK_RETURN (REG_BADRPT
);
2095 else if (!(syntax
& RE_CONTEXT_INDEP_OPS
))
2100 /* 1 means zero (many) matches is allowed. */
2101 boolean zero_times_ok
= 0, many_times_ok
= 0;
2104 /* If there is a sequence of repetition chars, collapse it
2105 down to just one (the right one). We can't combine
2106 interval operators with these because of, e.g., `a{2}*',
2107 which should only match an even number of `a's. */
2111 if (!(syntax
& RE_ALL_GREEDY
)
2112 && c
== '?' && (zero_times_ok
|| many_times_ok
))
2116 zero_times_ok
|= c
!= '+';
2117 many_times_ok
|= c
!= '?';
2123 || (!(syntax
& RE_BK_PLUS_QM
)
2124 && (*p
== '+' || *p
== '?')))
2126 else if (syntax
& RE_BK_PLUS_QM
&& *p
== '\\')
2129 FREE_STACK_RETURN (REG_EESCAPE
);
2130 if (p
[1] == '+' || p
[1] == '?')
2131 PATFETCH (c
); /* Gobble up the backslash. */
2137 /* If we get here, we found another repeat character. */
2141 /* Star, etc. applied to an empty pattern is equivalent
2142 to an empty pattern. */
2143 if (!laststart
|| laststart
== b
)
2146 /* Now we know whether or not zero matches is allowed
2147 and also whether or not two or more matches is allowed. */
2152 boolean simple
= skip_one_char (laststart
) == b
;
2153 unsigned int startoffset
= 0;
2155 (simple
|| !analyse_first (laststart
, b
, NULL
, 0)) ?
2156 on_failure_jump
: on_failure_jump_loop
;
2157 assert (skip_one_char (laststart
) <= b
);
2159 if (!zero_times_ok
&& simple
)
2160 { /* Since simple * loops can be made faster by using
2161 on_failure_keep_string_jump, we turn simple P+
2162 into PP* if P is simple. */
2163 unsigned char *p1
, *p2
;
2164 startoffset
= b
- laststart
;
2165 GET_BUFFER_SPACE (startoffset
);
2166 p1
= b
; p2
= laststart
;
2172 GET_BUFFER_SPACE (6);
2175 STORE_JUMP (ofj
, b
, b
+ 6);
2177 /* Simple * loops can use on_failure_keep_string_jump
2178 depending on what follows. But since we don't know
2179 that yet, we leave the decision up to
2180 on_failure_jump_smart. */
2181 INSERT_JUMP (simple
? on_failure_jump_smart
: ofj
,
2182 laststart
+ startoffset
, b
+ 6);
2184 STORE_JUMP (jump
, b
, laststart
+ startoffset
);
2189 /* A simple ? pattern. */
2190 assert (zero_times_ok
);
2191 GET_BUFFER_SPACE (3);
2192 INSERT_JUMP (on_failure_jump
, laststart
, b
+ 3);
2196 else /* not greedy */
2197 { /* I wish the greedy and non-greedy cases could be merged. */
2199 GET_BUFFER_SPACE (7); /* We might use less. */
2202 boolean emptyp
= analyse_first (laststart
, b
, NULL
, 0);
2204 /* The non-greedy multiple match looks like a repeat..until:
2205 we only need a conditional jump at the end of the loop */
2206 if (emptyp
) BUF_PUSH (no_op
);
2207 STORE_JUMP (emptyp
? on_failure_jump_nastyloop
2208 : on_failure_jump
, b
, laststart
);
2212 /* The repeat...until naturally matches one or more.
2213 To also match zero times, we need to first jump to
2214 the end of the loop (its conditional jump). */
2215 INSERT_JUMP (jump
, laststart
, b
);
2221 /* non-greedy a?? */
2222 INSERT_JUMP (jump
, laststart
, b
+ 3);
2224 INSERT_JUMP (on_failure_jump
, laststart
, laststart
+ 6);
2241 CLEAR_RANGE_TABLE_WORK_USED (range_table_work
);
2243 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2245 /* Ensure that we have enough space to push a charset: the
2246 opcode, the length count, and the bitset; 34 bytes in all. */
2247 GET_BUFFER_SPACE (34);
2251 /* We test `*p == '^' twice, instead of using an if
2252 statement, so we only need one BUF_PUSH. */
2253 BUF_PUSH (*p
== '^' ? charset_not
: charset
);
2257 /* Remember the first position in the bracket expression. */
2260 /* Push the number of bytes in the bitmap. */
2261 BUF_PUSH ((1 << BYTEWIDTH
) / BYTEWIDTH
);
2263 /* Clear the whole map. */
2264 bzero (b
, (1 << BYTEWIDTH
) / BYTEWIDTH
);
2266 /* charset_not matches newline according to a syntax bit. */
2267 if ((re_opcode_t
) b
[-2] == charset_not
2268 && (syntax
& RE_HAT_LISTS_NOT_NEWLINE
))
2269 SET_LIST_BIT ('\n');
2271 /* Read in characters and ranges, setting map bits. */
2274 boolean escaped_char
= false;
2275 const unsigned char *p2
= p
;
2277 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2281 /* \ might escape characters inside [...] and [^...]. */
2282 if ((syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
) && c
== '\\')
2284 if (p
== pend
) FREE_STACK_RETURN (REG_EESCAPE
);
2287 escaped_char
= true;
2291 /* Could be the end of the bracket expression. If it's
2292 not (i.e., when the bracket expression is `[]' so
2293 far), the ']' character bit gets set way below. */
2294 if (c
== ']' && p2
!= p1
)
2298 /* What should we do for the character which is
2299 greater than 0x7F, but not BASE_LEADING_CODE_P?
2302 /* See if we're at the beginning of a possible character
2305 if (!escaped_char
&&
2306 syntax
& RE_CHAR_CLASSES
&& c
== '[' && *p
== ':')
2308 /* Leave room for the null. */
2309 char str
[CHAR_CLASS_MAX_LENGTH
+ 1];
2310 const unsigned char *class_beg
;
2316 /* If pattern is `[[:'. */
2317 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2322 if (c
== ':' || c
== ']' || p
== pend
2323 || c1
== CHAR_CLASS_MAX_LENGTH
)
2329 /* If isn't a word bracketed by `[:' and `:]':
2330 undo the ending character, the letters, and
2331 leave the leading `:' and `[' (but set bits for
2333 if (c
== ':' && *p
== ']')
2336 boolean is_alnum
= STREQ (str
, "alnum");
2337 boolean is_alpha
= STREQ (str
, "alpha");
2338 boolean is_ascii
= STREQ (str
, "ascii");
2339 boolean is_blank
= STREQ (str
, "blank");
2340 boolean is_cntrl
= STREQ (str
, "cntrl");
2341 boolean is_digit
= STREQ (str
, "digit");
2342 boolean is_graph
= STREQ (str
, "graph");
2343 boolean is_lower
= STREQ (str
, "lower");
2344 boolean is_multibyte
= STREQ (str
, "multibyte");
2345 boolean is_nonascii
= STREQ (str
, "nonascii");
2346 boolean is_print
= STREQ (str
, "print");
2347 boolean is_punct
= STREQ (str
, "punct");
2348 boolean is_space
= STREQ (str
, "space");
2349 boolean is_unibyte
= STREQ (str
, "unibyte");
2350 boolean is_upper
= STREQ (str
, "upper");
2351 boolean is_word
= STREQ (str
, "word");
2352 boolean is_xdigit
= STREQ (str
, "xdigit");
2354 if (!IS_CHAR_CLASS (str
))
2355 FREE_STACK_RETURN (REG_ECTYPE
);
2357 /* Throw away the ] at the end of the character
2361 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2363 /* Most character classes in a multibyte match
2364 just set a flag. Exceptions are is_blank,
2365 is_digit, is_cntrl, and is_xdigit, since
2366 they can only match ASCII characters. We
2367 don't need to handle them for multibyte. */
2373 if (is_alnum
) bit
= BIT_ALNUM
;
2374 if (is_alpha
) bit
= BIT_ALPHA
;
2375 if (is_ascii
) bit
= BIT_ASCII
;
2376 if (is_graph
) bit
= BIT_GRAPH
;
2377 if (is_lower
) bit
= BIT_LOWER
;
2378 if (is_multibyte
) bit
= BIT_MULTIBYTE
;
2379 if (is_nonascii
) bit
= BIT_NONASCII
;
2380 if (is_print
) bit
= BIT_PRINT
;
2381 if (is_punct
) bit
= BIT_PUNCT
;
2382 if (is_space
) bit
= BIT_SPACE
;
2383 if (is_unibyte
) bit
= BIT_UNIBYTE
;
2384 if (is_upper
) bit
= BIT_UPPER
;
2385 if (is_word
) bit
= BIT_WORD
;
2387 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work
,
2391 /* Handle character classes for ASCII characters. */
2392 for (ch
= 0; ch
< 1 << BYTEWIDTH
; ch
++)
2394 int translated
= TRANSLATE (ch
);
2395 /* This was split into 3 if's to
2396 avoid an arbitrary limit in some compiler. */
2397 if ( (is_alnum
&& ISALNUM (ch
))
2398 || (is_alpha
&& ISALPHA (ch
))
2399 || (is_blank
&& ISBLANK (ch
))
2400 || (is_cntrl
&& ISCNTRL (ch
)))
2401 SET_LIST_BIT (translated
);
2402 if ( (is_digit
&& ISDIGIT (ch
))
2403 || (is_graph
&& ISGRAPH (ch
))
2404 || (is_lower
&& ISLOWER (ch
))
2405 || (is_print
&& ISPRINT (ch
)))
2406 SET_LIST_BIT (translated
);
2407 if ( (is_punct
&& ISPUNCT (ch
))
2408 || (is_space
&& ISSPACE (ch
))
2409 || (is_upper
&& ISUPPER (ch
))
2410 || (is_xdigit
&& ISXDIGIT (ch
)))
2411 SET_LIST_BIT (translated
);
2412 if ( (is_ascii
&& IS_REAL_ASCII (ch
))
2413 || (is_nonascii
&& !IS_REAL_ASCII (ch
))
2414 || (is_unibyte
&& ISUNIBYTE (ch
))
2415 || (is_multibyte
&& !ISUNIBYTE (ch
)))
2416 SET_LIST_BIT (translated
);
2418 if ( (is_word
&& ISWORD (ch
)))
2419 SET_LIST_BIT (translated
);
2422 /* Repeat the loop. */
2427 /* Go back to right after the "[:". */
2431 /* Because the `:' may starts the range, we
2432 can't simply set bit and repeat the loop.
2433 Instead, just set it to C and handle below. */
2438 if (p
< pend
&& p
[0] == '-' && p
[1] != ']')
2441 /* Discard the `-'. */
2444 /* Fetch the character which ends the range. */
2447 if (SINGLE_BYTE_CHAR_P (c
))
2449 if (! SINGLE_BYTE_CHAR_P (c1
))
2451 /* Handle a range such as \177-\377 in
2452 multibyte mode. Split that into two
2453 ranges, the low one ending at 0237, and
2454 the high one starting at the smallest
2455 character in the charset of C1 and
2457 int charset
= CHAR_CHARSET (c1
);
2458 int c2
= MAKE_CHAR (charset
, 0, 0);
2460 SET_RANGE_TABLE_WORK_AREA (range_table_work
,
2465 else if (!SAME_CHARSET_P (c
, c1
))
2466 FREE_STACK_RETURN (REG_ERANGE
);
2469 /* Range from C to C. */
2472 /* Set the range ... */
2473 if (SINGLE_BYTE_CHAR_P (c
))
2474 /* ... into bitmap. */
2477 int range_start
= c
, range_end
= c1
;
2479 /* If the start is after the end, the range is empty. */
2480 if (range_start
> range_end
)
2482 if (syntax
& RE_NO_EMPTY_RANGES
)
2483 FREE_STACK_RETURN (REG_ERANGE
);
2484 /* Else, repeat the loop. */
2488 for (this_char
= range_start
; this_char
<= range_end
;
2490 SET_LIST_BIT (TRANSLATE (this_char
));
2494 /* ... into range table. */
2495 SET_RANGE_TABLE_WORK_AREA (range_table_work
, c
, c1
);
2498 /* Discard any (non)matching list bytes that are all 0 at the
2499 end of the map. Decrease the map-length byte too. */
2500 while ((int) b
[-1] > 0 && b
[b
[-1] - 1] == 0)
2504 /* Build real range table from work area. */
2505 if (RANGE_TABLE_WORK_USED (range_table_work
)
2506 || RANGE_TABLE_WORK_BITS (range_table_work
))
2509 int used
= RANGE_TABLE_WORK_USED (range_table_work
);
2511 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2512 bytes for flags, two for COUNT, and three bytes for
2514 GET_BUFFER_SPACE (4 + used
* 3);
2516 /* Indicate the existence of range table. */
2517 laststart
[1] |= 0x80;
2519 /* Store the character class flag bits into the range table.
2520 If not in emacs, these flag bits are always 0. */
2521 *b
++ = RANGE_TABLE_WORK_BITS (range_table_work
) & 0xff;
2522 *b
++ = RANGE_TABLE_WORK_BITS (range_table_work
) >> 8;
2524 STORE_NUMBER_AND_INCR (b
, used
/ 2);
2525 for (i
= 0; i
< used
; i
++)
2526 STORE_CHARACTER_AND_INCR
2527 (b
, RANGE_TABLE_WORK_ELT (range_table_work
, i
));
2534 if (syntax
& RE_NO_BK_PARENS
)
2541 if (syntax
& RE_NO_BK_PARENS
)
2548 if (syntax
& RE_NEWLINE_ALT
)
2555 if (syntax
& RE_NO_BK_VBAR
)
2562 if (syntax
& RE_INTERVALS
&& syntax
& RE_NO_BK_BRACES
)
2563 goto handle_interval
;
2569 if (p
== pend
) FREE_STACK_RETURN (REG_EESCAPE
);
2571 /* Do not translate the character after the \, so that we can
2572 distinguish, e.g., \B from \b, even if we normally would
2573 translate, e.g., B to b. */
2579 if (syntax
& RE_NO_BK_PARENS
)
2580 goto normal_backslash
;
2587 /* Look for a special (?...) construct */
2588 if ((syntax
& RE_SHY_GROUPS
) && *p
== '?')
2590 PATFETCH (c
); /* Gobble up the '?'. */
2594 case ':': shy
= 1; break;
2596 /* Only (?:...) is supported right now. */
2597 FREE_STACK_RETURN (REG_BADPAT
);
2608 if (COMPILE_STACK_FULL
)
2610 RETALLOC (compile_stack
.stack
, compile_stack
.size
<< 1,
2611 compile_stack_elt_t
);
2612 if (compile_stack
.stack
== NULL
) return REG_ESPACE
;
2614 compile_stack
.size
<<= 1;
2617 /* These are the values to restore when we hit end of this
2618 group. They are all relative offsets, so that if the
2619 whole pattern moves because of realloc, they will still
2621 COMPILE_STACK_TOP
.begalt_offset
= begalt
- bufp
->buffer
;
2622 COMPILE_STACK_TOP
.fixup_alt_jump
2623 = fixup_alt_jump
? fixup_alt_jump
- bufp
->buffer
+ 1 : 0;
2624 COMPILE_STACK_TOP
.laststart_offset
= b
- bufp
->buffer
;
2625 COMPILE_STACK_TOP
.regnum
= shy
? -regnum
: regnum
;
2628 start_memory for groups beyond the last one we can
2629 represent in the compiled pattern. */
2630 if (regnum
<= MAX_REGNUM
&& !shy
)
2631 BUF_PUSH_2 (start_memory
, regnum
);
2633 compile_stack
.avail
++;
2638 /* If we've reached MAX_REGNUM groups, then this open
2639 won't actually generate any code, so we'll have to
2640 clear pending_exact explicitly. */
2646 if (syntax
& RE_NO_BK_PARENS
) goto normal_backslash
;
2648 if (COMPILE_STACK_EMPTY
)
2650 if (syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
2651 goto normal_backslash
;
2653 FREE_STACK_RETURN (REG_ERPAREN
);
2659 /* See similar code for backslashed left paren above. */
2660 if (COMPILE_STACK_EMPTY
)
2662 if (syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
2665 FREE_STACK_RETURN (REG_ERPAREN
);
2668 /* Since we just checked for an empty stack above, this
2669 ``can't happen''. */
2670 assert (compile_stack
.avail
!= 0);
2672 /* We don't just want to restore into `regnum', because
2673 later groups should continue to be numbered higher,
2674 as in `(ab)c(de)' -- the second group is #2. */
2675 regnum_t this_group_regnum
;
2677 compile_stack
.avail
--;
2678 begalt
= bufp
->buffer
+ COMPILE_STACK_TOP
.begalt_offset
;
2680 = COMPILE_STACK_TOP
.fixup_alt_jump
2681 ? bufp
->buffer
+ COMPILE_STACK_TOP
.fixup_alt_jump
- 1
2683 laststart
= bufp
->buffer
+ COMPILE_STACK_TOP
.laststart_offset
;
2684 this_group_regnum
= COMPILE_STACK_TOP
.regnum
;
2685 /* If we've reached MAX_REGNUM groups, then this open
2686 won't actually generate any code, so we'll have to
2687 clear pending_exact explicitly. */
2690 /* We're at the end of the group, so now we know how many
2691 groups were inside this one. */
2692 if (this_group_regnum
<= MAX_REGNUM
&& this_group_regnum
> 0)
2693 BUF_PUSH_2 (stop_memory
, this_group_regnum
);
2698 case '|': /* `\|'. */
2699 if (syntax
& RE_LIMITED_OPS
|| syntax
& RE_NO_BK_VBAR
)
2700 goto normal_backslash
;
2702 if (syntax
& RE_LIMITED_OPS
)
2705 /* Insert before the previous alternative a jump which
2706 jumps to this alternative if the former fails. */
2707 GET_BUFFER_SPACE (3);
2708 INSERT_JUMP (on_failure_jump
, begalt
, b
+ 6);
2712 /* The alternative before this one has a jump after it
2713 which gets executed if it gets matched. Adjust that
2714 jump so it will jump to this alternative's analogous
2715 jump (put in below, which in turn will jump to the next
2716 (if any) alternative's such jump, etc.). The last such
2717 jump jumps to the correct final destination. A picture:
2723 If we are at `b', then fixup_alt_jump right now points to a
2724 three-byte space after `a'. We'll put in the jump, set
2725 fixup_alt_jump to right after `b', and leave behind three
2726 bytes which we'll fill in when we get to after `c'. */
2730 /* Mark and leave space for a jump after this alternative,
2731 to be filled in later either by next alternative or
2732 when know we're at the end of a series of alternatives. */
2734 GET_BUFFER_SPACE (3);
2743 /* If \{ is a literal. */
2744 if (!(syntax
& RE_INTERVALS
)
2745 /* If we're at `\{' and it's not the open-interval
2747 || (syntax
& RE_NO_BK_BRACES
)
2748 /* What is that? -sm */
2749 /* || (p - 2 == pattern && p == pend) */)
2750 goto normal_backslash
;
2754 /* If got here, then the syntax allows intervals. */
2756 /* At least (most) this many matches must be made. */
2757 int lower_bound
= 0, upper_bound
= -1;
2763 if (syntax
& RE_NO_BK_BRACES
)
2764 goto unfetch_interval
;
2766 FREE_STACK_RETURN (REG_EBRACE
);
2769 GET_UNSIGNED_NUMBER (lower_bound
);
2772 GET_UNSIGNED_NUMBER (upper_bound
);
2774 /* Interval such as `{1}' => match exactly once. */
2775 upper_bound
= lower_bound
;
2777 if (lower_bound
< 0 || upper_bound
> RE_DUP_MAX
2778 || (upper_bound
>= 0 && lower_bound
> upper_bound
))
2780 if (syntax
& RE_NO_BK_BRACES
)
2781 goto unfetch_interval
;
2783 FREE_STACK_RETURN (REG_BADBR
);
2786 if (!(syntax
& RE_NO_BK_BRACES
))
2788 if (c
!= '\\') FREE_STACK_RETURN (REG_EBRACE
);
2795 if (syntax
& RE_NO_BK_BRACES
)
2796 goto unfetch_interval
;
2798 FREE_STACK_RETURN (REG_BADBR
);
2801 /* We just parsed a valid interval. */
2803 /* If it's invalid to have no preceding re. */
2806 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2807 FREE_STACK_RETURN (REG_BADRPT
);
2808 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2811 goto unfetch_interval
;
2814 if (upper_bound
== 0)
2815 /* If the upper bound is zero, just drop the sub pattern
2818 else if (lower_bound
== 1 && upper_bound
== 1)
2819 /* Just match it once: nothing to do here. */
2822 /* Otherwise, we have a nontrivial interval. When
2823 we're all done, the pattern will look like:
2824 set_number_at <jump count> <upper bound>
2825 set_number_at <succeed_n count> <lower bound>
2826 succeed_n <after jump addr> <succeed_n count>
2828 jump_n <succeed_n addr> <jump count>
2829 (The upper bound and `jump_n' are omitted if
2830 `upper_bound' is 1, though.) */
2832 { /* If the upper bound is > 1, we need to insert
2833 more at the end of the loop. */
2834 unsigned int nbytes
= (upper_bound
< 0 ? 3
2835 : upper_bound
> 1 ? 5 : 0);
2836 unsigned int startoffset
= 0;
2838 GET_BUFFER_SPACE (20); /* We might use less. */
2840 if (lower_bound
== 0)
2842 /* A succeed_n that starts with 0 is really a
2843 a simple on_failure_jump_loop. */
2844 INSERT_JUMP (on_failure_jump_loop
, laststart
,
2850 /* Initialize lower bound of the `succeed_n', even
2851 though it will be set during matching by its
2852 attendant `set_number_at' (inserted next),
2853 because `re_compile_fastmap' needs to know.
2854 Jump to the `jump_n' we might insert below. */
2855 INSERT_JUMP2 (succeed_n
, laststart
,
2860 /* Code to initialize the lower bound. Insert
2861 before the `succeed_n'. The `5' is the last two
2862 bytes of this `set_number_at', plus 3 bytes of
2863 the following `succeed_n'. */
2864 insert_op2 (set_number_at
, laststart
, 5, lower_bound
, b
);
2869 if (upper_bound
< 0)
2871 /* A negative upper bound stands for infinity,
2872 in which case it degenerates to a plain jump. */
2873 STORE_JUMP (jump
, b
, laststart
+ startoffset
);
2876 else if (upper_bound
> 1)
2877 { /* More than one repetition is allowed, so
2878 append a backward jump to the `succeed_n'
2879 that starts this interval.
2881 When we've reached this during matching,
2882 we'll have matched the interval once, so
2883 jump back only `upper_bound - 1' times. */
2884 STORE_JUMP2 (jump_n
, b
, laststart
+ startoffset
,
2888 /* The location we want to set is the second
2889 parameter of the `jump_n'; that is `b-2' as
2890 an absolute address. `laststart' will be
2891 the `set_number_at' we're about to insert;
2892 `laststart+3' the number to set, the source
2893 for the relative address. But we are
2894 inserting into the middle of the pattern --
2895 so everything is getting moved up by 5.
2896 Conclusion: (b - 2) - (laststart + 3) + 5,
2897 i.e., b - laststart.
2899 We insert this at the beginning of the loop
2900 so that if we fail during matching, we'll
2901 reinitialize the bounds. */
2902 insert_op2 (set_number_at
, laststart
, b
- laststart
,
2903 upper_bound
- 1, b
);
2908 beg_interval
= NULL
;
2913 /* If an invalid interval, match the characters as literals. */
2914 assert (beg_interval
);
2916 beg_interval
= NULL
;
2918 /* normal_char and normal_backslash need `c'. */
2921 if (!(syntax
& RE_NO_BK_BRACES
))
2923 assert (p
> pattern
&& p
[-1] == '\\');
2924 goto normal_backslash
;
2930 /* There is no way to specify the before_dot and after_dot
2931 operators. rms says this is ok. --karl */
2939 BUF_PUSH_2 (syntaxspec
, syntax_spec_code
[c
]);
2945 BUF_PUSH_2 (notsyntaxspec
, syntax_spec_code
[c
]);
2951 BUF_PUSH_2 (categoryspec
, c
);
2957 BUF_PUSH_2 (notcategoryspec
, c
);
2964 BUF_PUSH_2 (syntaxspec
, Sword
);
2970 BUF_PUSH_2 (notsyntaxspec
, Sword
);
2983 BUF_PUSH (wordbound
);
2987 BUF_PUSH (notwordbound
);
2998 case '1': case '2': case '3': case '4': case '5':
2999 case '6': case '7': case '8': case '9':
3000 if (syntax
& RE_NO_BK_REFS
)
3006 FREE_STACK_RETURN (REG_ESUBREG
);
3008 /* Can't back reference to a subexpression if inside of it. */
3009 if (group_in_compile_stack (compile_stack
, c1
))
3013 BUF_PUSH_2 (duplicate
, c1
);
3019 if (syntax
& RE_BK_PLUS_QM
)
3022 goto normal_backslash
;
3026 /* You might think it would be useful for \ to mean
3027 not to translate; but if we don't translate it
3028 it will never match anything. */
3036 /* Expects the character in `c'. */
3038 /* If no exactn currently being built. */
3041 /* If last exactn not at current position. */
3042 || pending_exact
+ *pending_exact
+ 1 != b
3044 /* We have only one byte following the exactn for the count. */
3045 || *pending_exact
>= (1 << BYTEWIDTH
) - MAX_MULTIBYTE_LENGTH
3047 /* If followed by a repetition operator. */
3048 || (p
!= pend
&& (*p
== '*' || *p
== '^'))
3049 || ((syntax
& RE_BK_PLUS_QM
)
3050 ? p
+ 1 < pend
&& *p
== '\\' && (p
[1] == '+' || p
[1] == '?')
3051 : p
!= pend
&& (*p
== '+' || *p
== '?'))
3052 || ((syntax
& RE_INTERVALS
)
3053 && ((syntax
& RE_NO_BK_BRACES
)
3054 ? p
!= pend
&& *p
== '{'
3055 : p
+ 1 < pend
&& p
[0] == '\\' && p
[1] == '{')))
3057 /* Start building a new exactn. */
3061 BUF_PUSH_2 (exactn
, 0);
3062 pending_exact
= b
- 1;
3065 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH
);
3067 int len
= CHAR_STRING (c
, b
);
3069 (*pending_exact
) += len
;
3074 } /* while p != pend */
3077 /* Through the pattern now. */
3081 if (!COMPILE_STACK_EMPTY
)
3082 FREE_STACK_RETURN (REG_EPAREN
);
3084 /* If we don't want backtracking, force success
3085 the first time we reach the end of the compiled pattern. */
3086 if (syntax
& RE_NO_POSIX_BACKTRACKING
)
3089 free (compile_stack
.stack
);
3091 /* We have succeeded; set the length of the buffer. */
3092 bufp
->used
= b
- bufp
->buffer
;
3097 re_compile_fastmap (bufp
);
3098 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3099 print_compiled_pattern (bufp
);
3104 #ifndef MATCH_MAY_ALLOCATE
3105 /* Initialize the failure stack to the largest possible stack. This
3106 isn't necessary unless we're trying to avoid calling alloca in
3107 the search and match routines. */
3109 int num_regs
= bufp
->re_nsub
+ 1;
3111 if (fail_stack
.size
< re_max_failures
* TYPICAL_FAILURE_SIZE
)
3113 fail_stack
.size
= re_max_failures
* TYPICAL_FAILURE_SIZE
;
3115 if (! fail_stack
.stack
)
3117 = (fail_stack_elt_t
*) malloc (fail_stack
.size
3118 * sizeof (fail_stack_elt_t
));
3121 = (fail_stack_elt_t
*) realloc (fail_stack
.stack
,
3123 * sizeof (fail_stack_elt_t
)));
3126 regex_grow_registers (num_regs
);
3128 #endif /* not MATCH_MAY_ALLOCATE */
3131 } /* regex_compile */
3133 /* Subroutines for `regex_compile'. */
3135 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3138 store_op1 (op
, loc
, arg
)
3143 *loc
= (unsigned char) op
;
3144 STORE_NUMBER (loc
+ 1, arg
);
3148 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3151 store_op2 (op
, loc
, arg1
, arg2
)
3156 *loc
= (unsigned char) op
;
3157 STORE_NUMBER (loc
+ 1, arg1
);
3158 STORE_NUMBER (loc
+ 3, arg2
);
3162 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3163 for OP followed by two-byte integer parameter ARG. */
3166 insert_op1 (op
, loc
, arg
, end
)
3172 register unsigned char *pfrom
= end
;
3173 register unsigned char *pto
= end
+ 3;
3175 while (pfrom
!= loc
)
3178 store_op1 (op
, loc
, arg
);
3182 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3185 insert_op2 (op
, loc
, arg1
, arg2
, end
)
3191 register unsigned char *pfrom
= end
;
3192 register unsigned char *pto
= end
+ 5;
3194 while (pfrom
!= loc
)
3197 store_op2 (op
, loc
, arg1
, arg2
);
3201 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3202 after an alternative or a begin-subexpression. We assume there is at
3203 least one character before the ^. */
3206 at_begline_loc_p (pattern
, p
, syntax
)
3207 const unsigned char *pattern
, *p
;
3208 reg_syntax_t syntax
;
3210 const unsigned char *prev
= p
- 2;
3211 boolean prev_prev_backslash
= prev
> pattern
&& prev
[-1] == '\\';
3214 /* After a subexpression? */
3215 (*prev
== '(' && (syntax
& RE_NO_BK_PARENS
|| prev_prev_backslash
))
3216 /* After an alternative? */
3217 || (*prev
== '|' && (syntax
& RE_NO_BK_VBAR
|| prev_prev_backslash
))
3218 /* After a shy subexpression? */
3219 || ((syntax
& RE_SHY_GROUPS
) && prev
- 2 >= pattern
3220 && prev
[-1] == '?' && prev
[-2] == '('
3221 && (syntax
& RE_NO_BK_PARENS
3222 || (prev
- 3 >= pattern
&& prev
[-3] == '\\')));
3226 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3227 at least one character after the $, i.e., `P < PEND'. */
3230 at_endline_loc_p (p
, pend
, syntax
)
3231 const unsigned char *p
, *pend
;
3232 reg_syntax_t syntax
;
3234 const unsigned char *next
= p
;
3235 boolean next_backslash
= *next
== '\\';
3236 const unsigned char *next_next
= p
+ 1 < pend
? p
+ 1 : 0;
3239 /* Before a subexpression? */
3240 (syntax
& RE_NO_BK_PARENS
? *next
== ')'
3241 : next_backslash
&& next_next
&& *next_next
== ')')
3242 /* Before an alternative? */
3243 || (syntax
& RE_NO_BK_VBAR
? *next
== '|'
3244 : next_backslash
&& next_next
&& *next_next
== '|');
3248 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3249 false if it's not. */
3252 group_in_compile_stack (compile_stack
, regnum
)
3253 compile_stack_type compile_stack
;
3258 for (this_element
= compile_stack
.avail
- 1;
3261 if (compile_stack
.stack
[this_element
].regnum
== regnum
)
3268 If fastmap is non-NULL, go through the pattern and fill fastmap
3269 with all the possible leading chars. If fastmap is NULL, don't
3270 bother filling it up (obviously) and only return whether the
3271 pattern could potentially match the empty string.
3273 Return 1 if p..pend might match the empty string.
3274 Return 0 if p..pend matches at least one char.
3275 Return -1 if p..pend matches at least one char, but fastmap was not
3277 Return -2 if an error occurred. */
3280 analyse_first (p
, pend
, fastmap
, multibyte
)
3281 unsigned char *p
, *pend
;
3283 const int multibyte
;
3287 #ifdef MATCH_MAY_ALLOCATE
3288 fail_stack_type fail_stack
;
3290 #ifndef REGEX_MALLOC
3294 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
3295 /* This holds the pointer to the failure stack, when
3296 it is allocated relocatably. */
3297 fail_stack_elt_t
*failure_stack_ptr
;
3300 /* Assume that each path through the pattern can be null until
3301 proven otherwise. We set this false at the bottom of switch
3302 statement, to which we get only if a particular path doesn't
3303 match the empty string. */
3304 boolean path_can_be_null
= true;
3306 /* If all elements for base leading-codes in fastmap is set, this
3307 flag is set true. */
3308 boolean match_any_multibyte_characters
= false;
3314 /* The loop below works as follows:
3315 - It has a working-list kept in the PATTERN_STACK and which basically
3316 starts by only containing a pointer to the first operation.
3317 - If the opcode we're looking at is a match against some set of
3318 chars, then we add those chars to the fastmap and go on to the
3319 next work element from the worklist (done via `break').
3320 - If the opcode is a control operator on the other hand, we either
3321 ignore it (if it's meaningless at this point, such as `start_memory')
3322 or execute it (if it's a jump). If the jump has several destinations
3323 (i.e. `on_failure_jump'), then we push the other destination onto the
3325 We guarantee termination by ignoring backward jumps (more or less),
3326 so that `p' is monotonically increasing. More to the point, we
3327 never set `p' (or push) anything `<= p1'. */
3329 /* If can_be_null is set, then the fastmap will not be used anyway. */
3332 /* `p1' is used as a marker of how far back a `on_failure_jump'
3333 can go without being ignored. It is normally equal to `p'
3334 (which prevents any backward `on_failure_jump') except right
3335 after a plain `jump', to allow patterns such as:
3338 10: on_failure_jump 3
3339 as used for the *? operator. */
3340 unsigned char *p1
= p
;
3344 if (path_can_be_null
)
3345 return (RESET_FAIL_STACK (), 1);
3347 /* We have reached the (effective) end of pattern. */
3348 if (PATTERN_STACK_EMPTY ())
3349 return (RESET_FAIL_STACK (), 0);
3351 p
= (unsigned char*) POP_PATTERN_OP ();
3352 path_can_be_null
= true;
3356 /* We should never be about to go beyond the end of the pattern. */
3359 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
++))
3366 /* If the first character has to match a backreference, that means
3367 that the group was empty (since it already matched). Since this
3368 is the only case that interests us here, we can assume that the
3369 backreference must match the empty string. */
3374 /* Following are the cases which match a character. These end
3378 if (fastmap
) fastmap
[p
[1]] = 1;
3383 /* We could put all the chars except for \n (and maybe \0)
3384 but we don't bother since it is generally not worth it. */
3385 if (!fastmap
) break;
3386 return (RESET_FAIL_STACK (), -1);
3390 /* Chars beyond end of bitmap are possible matches.
3391 All the single-byte codes can occur in multibyte buffers.
3392 So any that are not listed in the charset
3393 are possible matches, even in multibyte buffers. */
3394 if (!fastmap
) break;
3395 for (j
= CHARSET_BITMAP_SIZE (&p
[-1]) * BYTEWIDTH
;
3396 j
< (1 << BYTEWIDTH
); j
++)
3400 if (!fastmap
) break;
3401 not = (re_opcode_t
) *(p
- 1) == charset_not
;
3402 for (j
= CHARSET_BITMAP_SIZE (&p
[-1]) * BYTEWIDTH
- 1, p
++;
3404 if (!!(p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
))) ^ not)
3407 if ((not && multibyte
)
3408 /* Any character set can possibly contain a character
3409 which doesn't match the specified set of characters. */
3410 || (CHARSET_RANGE_TABLE_EXISTS_P (&p
[-2])
3411 && CHARSET_RANGE_TABLE_BITS (&p
[-2]) != 0))
3412 /* If we can match a character class, we can match
3413 any character set. */
3415 set_fastmap_for_multibyte_characters
:
3416 if (match_any_multibyte_characters
== false)
3418 for (j
= 0x80; j
< 0xA0; j
++) /* XXX */
3419 if (BASE_LEADING_CODE_P (j
))
3421 match_any_multibyte_characters
= true;
3425 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p
[-2])
3426 && match_any_multibyte_characters
== false)
3428 /* Set fastmap[I] 1 where I is a base leading code of each
3429 multibyte character in the range table. */
3432 /* Make P points the range table. `+ 2' is to skip flag
3433 bits for a character class. */
3434 p
+= CHARSET_BITMAP_SIZE (&p
[-2]) + 2;
3436 /* Extract the number of ranges in range table into COUNT. */
3437 EXTRACT_NUMBER_AND_INCR (count
, p
);
3438 for (; count
> 0; count
--, p
+= 2 * 3) /* XXX */
3440 /* Extract the start of each range. */
3441 EXTRACT_CHARACTER (c
, p
);
3442 j
= CHAR_CHARSET (c
);
3443 fastmap
[CHARSET_LEADING_CODE_BASE (j
)] = 1;
3450 if (!fastmap
) break;
3452 not = (re_opcode_t
)p
[-1] == notsyntaxspec
;
3454 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
3455 if ((SYNTAX (j
) == (enum syntaxcode
) k
) ^ not)
3459 /* This match depends on text properties. These end with
3460 aborting optimizations. */
3461 return (RESET_FAIL_STACK (), -1);
3464 case notcategoryspec
:
3465 if (!fastmap
) break;
3466 not = (re_opcode_t
)p
[-1] == notcategoryspec
;
3468 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
3469 if ((CHAR_HAS_CATEGORY (j
, k
)) ^ not)
3473 /* Any character set can possibly contain a character
3474 whose category is K (or not). */
3475 goto set_fastmap_for_multibyte_characters
;
3478 /* All cases after this match the empty string. These end with
3498 EXTRACT_NUMBER_AND_INCR (j
, p
);
3500 /* Backward jumps can only go back to code that we've already
3501 visited. `re_compile' should make sure this is true. */
3504 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
))
3506 case on_failure_jump
:
3507 case on_failure_keep_string_jump
:
3508 case on_failure_jump_loop
:
3509 case on_failure_jump_nastyloop
:
3510 case on_failure_jump_smart
:
3516 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3517 to jump back to "just after here". */
3520 case on_failure_jump
:
3521 case on_failure_keep_string_jump
:
3522 case on_failure_jump_nastyloop
:
3523 case on_failure_jump_loop
:
3524 case on_failure_jump_smart
:
3525 EXTRACT_NUMBER_AND_INCR (j
, p
);
3527 ; /* Backward jump to be ignored. */
3528 else if (!PUSH_PATTERN_OP (p
+ j
, fail_stack
))
3529 return (RESET_FAIL_STACK (), -2);
3534 /* This code simply does not properly handle forward jump_n. */
3535 DEBUG_STATEMENT (EXTRACT_NUMBER (j
, p
); assert (j
< 0));
3537 /* jump_n can either jump or fall through. The (backward) jump
3538 case has already been handled, so we only need to look at the
3539 fallthrough case. */
3543 /* If N == 0, it should be an on_failure_jump_loop instead. */
3544 DEBUG_STATEMENT (EXTRACT_NUMBER (j
, p
+ 2); assert (j
> 0));
3546 /* We only care about one iteration of the loop, so we don't
3547 need to consider the case where this behaves like an
3564 abort (); /* We have listed all the cases. */
3567 /* Getting here means we have found the possible starting
3568 characters for one path of the pattern -- and that the empty
3569 string does not match. We need not follow this path further.
3570 Instead, look at the next alternative (remembered on the
3571 stack), or quit if no more. The test at the top of the loop
3572 does these things. */
3573 path_can_be_null
= false;
3577 return (RESET_FAIL_STACK (), 0);
3578 } /* analyse_first */
3580 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3581 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3582 characters can start a string that matches the pattern. This fastmap
3583 is used by re_search to skip quickly over impossible starting points.
3585 Character codes above (1 << BYTEWIDTH) are not represented in the
3586 fastmap, but the leading codes are represented. Thus, the fastmap
3587 indicates which character sets could start a match.
3589 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3590 area as BUFP->fastmap.
3592 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3595 Returns 0 if we succeed, -2 if an internal error. */
3598 re_compile_fastmap (bufp
)
3599 struct re_pattern_buffer
*bufp
;
3601 char *fastmap
= bufp
->fastmap
;
3604 assert (fastmap
&& bufp
->buffer
);
3606 bzero (fastmap
, 1 << BYTEWIDTH
); /* Assume nothing's valid. */
3607 bufp
->fastmap_accurate
= 1; /* It will be when we're done. */
3609 analysis
= analyse_first (bufp
->buffer
, bufp
->buffer
+ bufp
->used
,
3610 fastmap
, RE_MULTIBYTE_P (bufp
));
3613 bufp
->can_be_null
= (analysis
!= 0);
3615 } /* re_compile_fastmap */
3617 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3618 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3619 this memory for recording register information. STARTS and ENDS
3620 must be allocated using the malloc library routine, and must each
3621 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3623 If NUM_REGS == 0, then subsequent matches should allocate their own
3626 Unless this function is called, the first search or match using
3627 PATTERN_BUFFER will allocate its own register data, without
3628 freeing the old data. */
3631 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
3632 struct re_pattern_buffer
*bufp
;
3633 struct re_registers
*regs
;
3635 regoff_t
*starts
, *ends
;
3639 bufp
->regs_allocated
= REGS_REALLOCATE
;
3640 regs
->num_regs
= num_regs
;
3641 regs
->start
= starts
;
3646 bufp
->regs_allocated
= REGS_UNALLOCATED
;
3648 regs
->start
= regs
->end
= (regoff_t
*) 0;
3652 /* Searching routines. */
3654 /* Like re_search_2, below, but only one string is specified, and
3655 doesn't let you say where to stop matching. */
3658 re_search (bufp
, string
, size
, startpos
, range
, regs
)
3659 struct re_pattern_buffer
*bufp
;
3661 int size
, startpos
, range
;
3662 struct re_registers
*regs
;
3664 return re_search_2 (bufp
, NULL
, 0, string
, size
, startpos
, range
,
3668 /* End address of virtual concatenation of string. */
3669 #define STOP_ADDR_VSTRING(P) \
3670 (((P) >= size1 ? string2 + size2 : string1 + size1))
3672 /* Address of POS in the concatenation of virtual string. */
3673 #define POS_ADDR_VSTRING(POS) \
3674 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3676 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3677 virtual concatenation of STRING1 and STRING2, starting first at index
3678 STARTPOS, then at STARTPOS + 1, and so on.
3680 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3682 RANGE is how far to scan while trying to match. RANGE = 0 means try
3683 only at STARTPOS; in general, the last start tried is STARTPOS +
3686 In REGS, return the indices of the virtual concatenation of STRING1
3687 and STRING2 that matched the entire BUFP->buffer and its contained
3690 Do not consider matching one past the index STOP in the virtual
3691 concatenation of STRING1 and STRING2.
3693 We return either the position in the strings at which the match was
3694 found, -1 if no match, or -2 if error (such as failure
3698 re_search_2 (bufp
, str1
, size1
, str2
, size2
, startpos
, range
, regs
, stop
)
3699 struct re_pattern_buffer
*bufp
;
3700 const char *str1
, *str2
;
3704 struct re_registers
*regs
;
3708 re_char
*string1
= (re_char
*) str1
;
3709 re_char
*string2
= (re_char
*) str2
;
3710 register char *fastmap
= bufp
->fastmap
;
3711 register RE_TRANSLATE_TYPE translate
= bufp
->translate
;
3712 int total_size
= size1
+ size2
;
3713 int endpos
= startpos
+ range
;
3714 int anchored_start
= 0;
3716 /* Nonzero if we have to concern multibyte character. */
3717 const boolean multibyte
= RE_MULTIBYTE_P (bufp
);
3719 /* Check for out-of-range STARTPOS. */
3720 if (startpos
< 0 || startpos
> total_size
)
3723 /* Fix up RANGE if it might eventually take us outside
3724 the virtual concatenation of STRING1 and STRING2.
3725 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3727 range
= 0 - startpos
;
3728 else if (endpos
> total_size
)
3729 range
= total_size
- startpos
;
3731 /* If the search isn't to be a backwards one, don't waste time in a
3732 search for a pattern anchored at beginning of buffer. */
3733 if (bufp
->used
> 0 && (re_opcode_t
) bufp
->buffer
[0] == begbuf
&& range
> 0)
3742 /* In a forward search for something that starts with \=.
3743 don't keep searching past point. */
3744 if (bufp
->used
> 0 && (re_opcode_t
) bufp
->buffer
[0] == at_dot
&& range
> 0)
3746 range
= PT_BYTE
- BEGV_BYTE
- startpos
;
3752 /* Update the fastmap now if not correct already. */
3753 if (fastmap
&& !bufp
->fastmap_accurate
)
3754 if (re_compile_fastmap (bufp
) == -2)
3757 /* See whether the pattern is anchored. */
3758 if (bufp
->buffer
[0] == begline
)
3762 gl_state
.object
= re_match_object
;
3764 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos
));
3766 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object
, charpos
, 1);
3770 /* Loop through the string, looking for a place to start matching. */
3773 /* If the pattern is anchored,
3774 skip quickly past places we cannot match.
3775 We don't bother to treat startpos == 0 specially
3776 because that case doesn't repeat. */
3777 if (anchored_start
&& startpos
> 0)
3779 if (! (bufp
->newline_anchor
3780 && ((startpos
<= size1
? string1
[startpos
- 1]
3781 : string2
[startpos
- size1
- 1])
3786 /* If a fastmap is supplied, skip quickly over characters that
3787 cannot be the start of a match. If the pattern can match the
3788 null string, however, we don't need to skip characters; we want
3789 the first null string. */
3790 if (fastmap
&& startpos
< total_size
&& !bufp
->can_be_null
)
3792 register re_char
*d
;
3793 register unsigned int buf_ch
;
3795 d
= POS_ADDR_VSTRING (startpos
);
3797 if (range
> 0) /* Searching forwards. */
3799 register int lim
= 0;
3802 if (startpos
< size1
&& startpos
+ range
>= size1
)
3803 lim
= range
- (size1
- startpos
);
3805 /* Written out as an if-else to avoid testing `translate'
3807 if (RE_TRANSLATE_P (translate
))
3814 buf_ch
= STRING_CHAR_AND_LENGTH (d
, range
- lim
,
3817 buf_ch
= RE_TRANSLATE (translate
, buf_ch
);
3822 range
-= buf_charlen
;
3827 && !fastmap
[RE_TRANSLATE (translate
, *d
)])
3834 while (range
> lim
&& !fastmap
[*d
])
3840 startpos
+= irange
- range
;
3842 else /* Searching backwards. */
3844 int room
= (startpos
>= size1
3845 ? size2
+ size1
- startpos
3846 : size1
- startpos
);
3847 buf_ch
= RE_STRING_CHAR (d
, room
);
3848 buf_ch
= TRANSLATE (buf_ch
);
3850 if (! (buf_ch
>= 0400
3851 || fastmap
[buf_ch
]))
3856 /* If can't match the null string, and that's all we have left, fail. */
3857 if (range
>= 0 && startpos
== total_size
&& fastmap
3858 && !bufp
->can_be_null
)
3861 val
= re_match_2_internal (bufp
, string1
, size1
, string2
, size2
,
3862 startpos
, regs
, stop
);
3863 #ifndef REGEX_MALLOC
3880 /* Update STARTPOS to the next character boundary. */
3883 re_char
*p
= POS_ADDR_VSTRING (startpos
);
3884 re_char
*pend
= STOP_ADDR_VSTRING (startpos
);
3885 int len
= MULTIBYTE_FORM_LENGTH (p
, pend
- p
);
3903 /* Update STARTPOS to the previous character boundary. */
3906 re_char
*p
= POS_ADDR_VSTRING (startpos
);
3909 /* Find the head of multibyte form. */
3910 while (!CHAR_HEAD_P (*p
))
3915 if (MULTIBYTE_FORM_LENGTH (p
, len
+ 1) != (len
+ 1))
3932 /* Declarations and macros for re_match_2. */
3934 static int bcmp_translate
_RE_ARGS((re_char
*s1
, re_char
*s2
,
3936 RE_TRANSLATE_TYPE translate
,
3937 const int multibyte
));
3939 /* This converts PTR, a pointer into one of the search strings `string1'
3940 and `string2' into an offset from the beginning of that string. */
3941 #define POINTER_TO_OFFSET(ptr) \
3942 (FIRST_STRING_P (ptr) \
3943 ? ((regoff_t) ((ptr) - string1)) \
3944 : ((regoff_t) ((ptr) - string2 + size1)))
3946 /* Call before fetching a character with *d. This switches over to
3947 string2 if necessary.
3948 Check re_match_2_internal for a discussion of why end_match_2 might
3949 not be within string2 (but be equal to end_match_1 instead). */
3950 #define PREFETCH() \
3953 /* End of string2 => fail. */ \
3954 if (dend == end_match_2) \
3956 /* End of string1 => advance to string2. */ \
3958 dend = end_match_2; \
3961 /* Call before fetching a char with *d if you already checked other limits.
3962 This is meant for use in lookahead operations like wordend, etc..
3963 where we might need to look at parts of the string that might be
3964 outside of the LIMITs (i.e past `stop'). */
3965 #define PREFETCH_NOLIMIT() \
3969 dend = end_match_2; \
3972 /* Test if at very beginning or at very end of the virtual concatenation
3973 of `string1' and `string2'. If only one string, it's `string2'. */
3974 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3975 #define AT_STRINGS_END(d) ((d) == end2)
3978 /* Test if D points to a character which is word-constituent. We have
3979 two special cases to check for: if past the end of string1, look at
3980 the first character in string2; and if before the beginning of
3981 string2, look at the last character in string1. */
3982 #define WORDCHAR_P(d) \
3983 (SYNTAX ((d) == end1 ? *string2 \
3984 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3987 /* Disabled due to a compiler bug -- see comment at case wordbound */
3989 /* The comment at case wordbound is following one, but we don't use
3990 AT_WORD_BOUNDARY anymore to support multibyte form.
3992 The DEC Alpha C compiler 3.x generates incorrect code for the
3993 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
3994 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
3995 macro and introducing temporary variables works around the bug. */
3998 /* Test if the character before D and the one at D differ with respect
3999 to being word-constituent. */
4000 #define AT_WORD_BOUNDARY(d) \
4001 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4002 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4005 /* Free everything we malloc. */
4006 #ifdef MATCH_MAY_ALLOCATE
4007 #define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4008 #define FREE_VARIABLES() \
4010 REGEX_FREE_STACK (fail_stack.stack); \
4011 FREE_VAR (regstart); \
4012 FREE_VAR (regend); \
4013 FREE_VAR (best_regstart); \
4014 FREE_VAR (best_regend); \
4017 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4018 #endif /* not MATCH_MAY_ALLOCATE */
4021 /* Optimization routines. */
4023 /* If the operation is a match against one or more chars,
4024 return a pointer to the next operation, else return NULL. */
4025 static unsigned char *
4029 switch (SWITCH_ENUM_CAST (*p
++))
4040 if (CHARSET_RANGE_TABLE_EXISTS_P (p
- 1))
4043 p
= CHARSET_RANGE_TABLE (p
- 1);
4044 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
4045 p
= CHARSET_RANGE_TABLE_END (p
, mcnt
);
4048 p
+= 1 + CHARSET_BITMAP_SIZE (p
- 1);
4055 case notcategoryspec
:
4067 /* Jump over non-matching operations. */
4068 static unsigned char *
4069 skip_noops (p
, pend
)
4070 unsigned char *p
, *pend
;
4075 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
))
4084 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
4095 /* Non-zero if "p1 matches something" implies "p2 fails". */
4097 mutually_exclusive_p (bufp
, p1
, p2
)
4098 struct re_pattern_buffer
*bufp
;
4099 unsigned char *p1
, *p2
;
4102 const boolean multibyte
= RE_MULTIBYTE_P (bufp
);
4103 unsigned char *pend
= bufp
->buffer
+ bufp
->used
;
4105 assert (p1
>= bufp
->buffer
&& p1
< pend
4106 && p2
>= bufp
->buffer
&& p2
<= pend
);
4108 /* Skip over open/close-group commands.
4109 If what follows this loop is a ...+ construct,
4110 look at what begins its body, since we will have to
4111 match at least one of that. */
4112 p2
= skip_noops (p2
, pend
);
4113 /* The same skip can be done for p1, except that this function
4114 is only used in the case where p1 is a simple match operator. */
4115 /* p1 = skip_noops (p1, pend); */
4117 assert (p1
>= bufp
->buffer
&& p1
< pend
4118 && p2
>= bufp
->buffer
&& p2
<= pend
);
4120 op2
= p2
== pend
? succeed
: *p2
;
4122 switch (SWITCH_ENUM_CAST (op2
))
4126 /* If we're at the end of the pattern, we can change. */
4127 if (skip_one_char (p1
))
4129 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4135 if (!bufp
->newline_anchor
)
4140 register unsigned int c
4141 = (re_opcode_t
) *p2
== endline
? '\n'
4142 : RE_STRING_CHAR(p2
+ 2, pend
- p2
- 2);
4144 if ((re_opcode_t
) *p1
== exactn
)
4146 if (c
!= RE_STRING_CHAR (p1
+ 2, pend
- p1
- 2))
4148 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c
, p1
[2]);
4153 else if ((re_opcode_t
) *p1
== charset
4154 || (re_opcode_t
) *p1
== charset_not
)
4156 int not = (re_opcode_t
) *p1
== charset_not
;
4158 /* Test if C is listed in charset (or charset_not)
4160 if (SINGLE_BYTE_CHAR_P (c
))
4162 if (c
< CHARSET_BITMAP_SIZE (p1
) * BYTEWIDTH
4163 && p1
[2 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
4166 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1
))
4167 CHARSET_LOOKUP_RANGE_TABLE (not, c
, p1
);
4169 /* `not' is equal to 1 if c would match, which means
4170 that we can't change to pop_failure_jump. */
4173 DEBUG_PRINT1 (" No match => fast loop.\n");
4177 else if ((re_opcode_t
) *p1
== anychar
4180 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4189 if ((re_opcode_t
) *p1
== exactn
)
4190 /* Reuse the code above. */
4191 return mutually_exclusive_p (bufp
, p2
, p1
);
4194 /* It is hard to list up all the character in charset
4195 P2 if it includes multibyte character. Give up in
4197 else if (!multibyte
|| !CHARSET_RANGE_TABLE_EXISTS_P (p2
))
4199 /* Now, we are sure that P2 has no range table.
4200 So, for the size of bitmap in P2, `p2[1]' is
4201 enough. But P1 may have range table, so the
4202 size of bitmap table of P1 is extracted by
4203 using macro `CHARSET_BITMAP_SIZE'.
4205 Since we know that all the character listed in
4206 P2 is ASCII, it is enough to test only bitmap
4212 /* We win if the charset inside the loop
4213 has no overlap with the one after the loop. */
4216 && idx
< CHARSET_BITMAP_SIZE (p1
));
4218 if ((p2
[2 + idx
] & p1
[2 + idx
]) != 0)
4222 || idx
== CHARSET_BITMAP_SIZE (p1
))
4224 DEBUG_PRINT1 (" No match => fast loop.\n");
4228 else if ((re_opcode_t
) *p1
== charset
4229 || (re_opcode_t
) *p1
== charset_not
)
4232 /* We win if the charset_not inside the loop lists
4233 every character listed in the charset after. */
4234 for (idx
= 0; idx
< (int) p2
[1]; idx
++)
4235 if (! (p2
[2 + idx
] == 0
4236 || (idx
< CHARSET_BITMAP_SIZE (p1
)
4237 && ((p2
[2 + idx
] & ~ p1
[2 + idx
]) == 0))))
4242 DEBUG_PRINT1 (" No match => fast loop.\n");
4251 return ((re_opcode_t
) *p1
== syntaxspec
4252 && p1
[1] == (op2
== wordend
? Sword
: p2
[1]));
4256 return ((re_opcode_t
) *p1
== notsyntaxspec
4257 && p1
[1] == (op2
== wordend
? Sword
: p2
[1]));
4260 return (((re_opcode_t
) *p1
== notsyntaxspec
4261 || (re_opcode_t
) *p1
== syntaxspec
)
4266 return ((re_opcode_t
) *p1
== notcategoryspec
&& p1
[1] == p2
[1]);
4267 case notcategoryspec
:
4268 return ((re_opcode_t
) *p1
== categoryspec
&& p1
[1] == p2
[1]);
4280 /* Matching routines. */
4282 #ifndef emacs /* Emacs never uses this. */
4283 /* re_match is like re_match_2 except it takes only a single string. */
4286 re_match (bufp
, string
, size
, pos
, regs
)
4287 struct re_pattern_buffer
*bufp
;
4290 struct re_registers
*regs
;
4292 int result
= re_match_2_internal (bufp
, NULL
, 0, string
, size
,
4297 #endif /* not emacs */
4300 /* In Emacs, this is the string or buffer in which we
4301 are matching. It is used for looking up syntax properties. */
4302 Lisp_Object re_match_object
;
4305 /* re_match_2 matches the compiled pattern in BUFP against the
4306 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4307 and SIZE2, respectively). We start matching at POS, and stop
4310 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4311 store offsets for the substring each group matched in REGS. See the
4312 documentation for exactly how many groups we fill.
4314 We return -1 if no match, -2 if an internal error (such as the
4315 failure stack overflowing). Otherwise, we return the length of the
4316 matched substring. */
4319 re_match_2 (bufp
, string1
, size1
, string2
, size2
, pos
, regs
, stop
)
4320 struct re_pattern_buffer
*bufp
;
4321 const char *string1
, *string2
;
4324 struct re_registers
*regs
;
4331 gl_state
.object
= re_match_object
;
4332 charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos
));
4333 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object
, charpos
, 1);
4336 result
= re_match_2_internal (bufp
, string1
, size1
, string2
, size2
,
4342 /* This is a separate function so that we can force an alloca cleanup
4345 re_match_2_internal (bufp
, string1
, size1
, string2
, size2
, pos
, regs
, stop
)
4346 struct re_pattern_buffer
*bufp
;
4347 re_char
*string1
, *string2
;
4350 struct re_registers
*regs
;
4353 /* General temporaries. */
4358 /* Just past the end of the corresponding string. */
4359 re_char
*end1
, *end2
;
4361 /* Pointers into string1 and string2, just past the last characters in
4362 each to consider matching. */
4363 re_char
*end_match_1
, *end_match_2
;
4365 /* Where we are in the data, and the end of the current string. */
4368 /* Used sometimes to remember where we were before starting matching
4369 an operator so that we can go back in case of failure. This "atomic"
4370 behavior of matching opcodes is indispensable to the correctness
4371 of the on_failure_keep_string_jump optimization. */
4374 /* Where we are in the pattern, and the end of the pattern. */
4375 unsigned char *p
= bufp
->buffer
;
4376 register unsigned char *pend
= p
+ bufp
->used
;
4378 /* We use this to map every character in the string. */
4379 RE_TRANSLATE_TYPE translate
= bufp
->translate
;
4381 /* Nonzero if we have to concern multibyte character. */
4382 const boolean multibyte
= RE_MULTIBYTE_P (bufp
);
4384 /* Failure point stack. Each place that can handle a failure further
4385 down the line pushes a failure point on this stack. It consists of
4386 regstart, and regend for all registers corresponding to
4387 the subexpressions we're currently inside, plus the number of such
4388 registers, and, finally, two char *'s. The first char * is where
4389 to resume scanning the pattern; the second one is where to resume
4390 scanning the strings. */
4391 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4392 fail_stack_type fail_stack
;
4395 static unsigned failure_id
= 0;
4396 unsigned nfailure_points_pushed
= 0, nfailure_points_popped
= 0;
4399 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
4400 /* This holds the pointer to the failure stack, when
4401 it is allocated relocatably. */
4402 fail_stack_elt_t
*failure_stack_ptr
;
4405 /* We fill all the registers internally, independent of what we
4406 return, for use in backreferences. The number here includes
4407 an element for register zero. */
4408 unsigned num_regs
= bufp
->re_nsub
+ 1;
4410 /* Information on the contents of registers. These are pointers into
4411 the input strings; they record just what was matched (on this
4412 attempt) by a subexpression part of the pattern, that is, the
4413 regnum-th regstart pointer points to where in the pattern we began
4414 matching and the regnum-th regend points to right after where we
4415 stopped matching the regnum-th subexpression. (The zeroth register
4416 keeps track of what the whole pattern matches.) */
4417 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4418 re_char
**regstart
, **regend
;
4421 /* The following record the register info as found in the above
4422 variables when we find a match better than any we've seen before.
4423 This happens as we backtrack through the failure points, which in
4424 turn happens only if we have not yet matched the entire string. */
4425 unsigned best_regs_set
= false;
4426 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4427 re_char
**best_regstart
, **best_regend
;
4430 /* Logically, this is `best_regend[0]'. But we don't want to have to
4431 allocate space for that if we're not allocating space for anything
4432 else (see below). Also, we never need info about register 0 for
4433 any of the other register vectors, and it seems rather a kludge to
4434 treat `best_regend' differently than the rest. So we keep track of
4435 the end of the best match so far in a separate variable. We
4436 initialize this to NULL so that when we backtrack the first time
4437 and need to test it, it's not garbage. */
4438 re_char
*match_end
= NULL
;
4441 /* Counts the total number of registers pushed. */
4442 unsigned num_regs_pushed
= 0;
4445 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4449 #ifdef MATCH_MAY_ALLOCATE
4450 /* Do not bother to initialize all the register variables if there are
4451 no groups in the pattern, as it takes a fair amount of time. If
4452 there are groups, we include space for register 0 (the whole
4453 pattern), even though we never use it, since it simplifies the
4454 array indexing. We should fix this. */
4457 regstart
= REGEX_TALLOC (num_regs
, re_char
*);
4458 regend
= REGEX_TALLOC (num_regs
, re_char
*);
4459 best_regstart
= REGEX_TALLOC (num_regs
, re_char
*);
4460 best_regend
= REGEX_TALLOC (num_regs
, re_char
*);
4462 if (!(regstart
&& regend
&& best_regstart
&& best_regend
))
4470 /* We must initialize all our variables to NULL, so that
4471 `FREE_VARIABLES' doesn't try to free them. */
4472 regstart
= regend
= best_regstart
= best_regend
= NULL
;
4474 #endif /* MATCH_MAY_ALLOCATE */
4476 /* The starting position is bogus. */
4477 if (pos
< 0 || pos
> size1
+ size2
)
4483 /* Initialize subexpression text positions to -1 to mark ones that no
4484 start_memory/stop_memory has been seen for. Also initialize the
4485 register information struct. */
4486 for (mcnt
= 1; mcnt
< num_regs
; mcnt
++)
4487 regstart
[mcnt
] = regend
[mcnt
] = REG_UNSET_VALUE
;
4489 /* We move `string1' into `string2' if the latter's empty -- but not if
4490 `string1' is null. */
4491 if (size2
== 0 && string1
!= NULL
)
4498 end1
= string1
+ size1
;
4499 end2
= string2
+ size2
;
4501 /* `p' scans through the pattern as `d' scans through the data.
4502 `dend' is the end of the input string that `d' points within. `d'
4503 is advanced into the following input string whenever necessary, but
4504 this happens before fetching; therefore, at the beginning of the
4505 loop, `d' can be pointing at the end of a string, but it cannot
4509 /* Only match within string2. */
4510 d
= string2
+ pos
- size1
;
4511 dend
= end_match_2
= string2
+ stop
- size1
;
4512 end_match_1
= end1
; /* Just to give it a value. */
4518 /* Only match within string1. */
4519 end_match_1
= string1
+ stop
;
4521 When we reach end_match_1, PREFETCH normally switches to string2.
4522 But in the present case, this means that just doing a PREFETCH
4523 makes us jump from `stop' to `gap' within the string.
4524 What we really want here is for the search to stop as
4525 soon as we hit end_match_1. That's why we set end_match_2
4526 to end_match_1 (since PREFETCH fails as soon as we hit
4528 end_match_2
= end_match_1
;
4531 { /* It's important to use this code when stop == size so that
4532 moving `d' from end1 to string2 will not prevent the d == dend
4533 check from catching the end of string. */
4535 end_match_2
= string2
+ stop
- size1
;
4541 DEBUG_PRINT1 ("The compiled pattern is: ");
4542 DEBUG_PRINT_COMPILED_PATTERN (bufp
, p
, pend
);
4543 DEBUG_PRINT1 ("The string to match is: `");
4544 DEBUG_PRINT_DOUBLE_STRING (d
, string1
, size1
, string2
, size2
);
4545 DEBUG_PRINT1 ("'\n");
4547 /* This loops over pattern commands. It exits by returning from the
4548 function if the match is complete, or it drops through if the match
4549 fails at this starting point in the input data. */
4552 DEBUG_PRINT2 ("\n%p: ", p
);
4555 { /* End of pattern means we might have succeeded. */
4556 DEBUG_PRINT1 ("end of pattern ... ");
4558 /* If we haven't matched the entire string, and we want the
4559 longest match, try backtracking. */
4560 if (d
!= end_match_2
)
4562 /* 1 if this match ends in the same string (string1 or string2)
4563 as the best previous match. */
4564 boolean same_str_p
= (FIRST_STRING_P (match_end
)
4565 == FIRST_STRING_P (d
));
4566 /* 1 if this match is the best seen so far. */
4567 boolean best_match_p
;
4569 /* AIX compiler got confused when this was combined
4570 with the previous declaration. */
4572 best_match_p
= d
> match_end
;
4574 best_match_p
= !FIRST_STRING_P (d
);
4576 DEBUG_PRINT1 ("backtracking.\n");
4578 if (!FAIL_STACK_EMPTY ())
4579 { /* More failure points to try. */
4581 /* If exceeds best match so far, save it. */
4582 if (!best_regs_set
|| best_match_p
)
4584 best_regs_set
= true;
4587 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4589 for (mcnt
= 1; mcnt
< num_regs
; mcnt
++)
4591 best_regstart
[mcnt
] = regstart
[mcnt
];
4592 best_regend
[mcnt
] = regend
[mcnt
];
4598 /* If no failure points, don't restore garbage. And if
4599 last match is real best match, don't restore second
4601 else if (best_regs_set
&& !best_match_p
)
4604 /* Restore best match. It may happen that `dend ==
4605 end_match_1' while the restored d is in string2.
4606 For example, the pattern `x.*y.*z' against the
4607 strings `x-' and `y-z-', if the two strings are
4608 not consecutive in memory. */
4609 DEBUG_PRINT1 ("Restoring best registers.\n");
4612 dend
= ((d
>= string1
&& d
<= end1
)
4613 ? end_match_1
: end_match_2
);
4615 for (mcnt
= 1; mcnt
< num_regs
; mcnt
++)
4617 regstart
[mcnt
] = best_regstart
[mcnt
];
4618 regend
[mcnt
] = best_regend
[mcnt
];
4621 } /* d != end_match_2 */
4624 DEBUG_PRINT1 ("Accepting match.\n");
4626 /* If caller wants register contents data back, do it. */
4627 if (regs
&& !bufp
->no_sub
)
4629 /* Have the register data arrays been allocated? */
4630 if (bufp
->regs_allocated
== REGS_UNALLOCATED
)
4631 { /* No. So allocate them with malloc. We need one
4632 extra element beyond `num_regs' for the `-1' marker
4634 regs
->num_regs
= MAX (RE_NREGS
, num_regs
+ 1);
4635 regs
->start
= TALLOC (regs
->num_regs
, regoff_t
);
4636 regs
->end
= TALLOC (regs
->num_regs
, regoff_t
);
4637 if (regs
->start
== NULL
|| regs
->end
== NULL
)
4642 bufp
->regs_allocated
= REGS_REALLOCATE
;
4644 else if (bufp
->regs_allocated
== REGS_REALLOCATE
)
4645 { /* Yes. If we need more elements than were already
4646 allocated, reallocate them. If we need fewer, just
4648 if (regs
->num_regs
< num_regs
+ 1)
4650 regs
->num_regs
= num_regs
+ 1;
4651 RETALLOC (regs
->start
, regs
->num_regs
, regoff_t
);
4652 RETALLOC (regs
->end
, regs
->num_regs
, regoff_t
);
4653 if (regs
->start
== NULL
|| regs
->end
== NULL
)
4662 /* These braces fend off a "empty body in an else-statement"
4663 warning under GCC when assert expands to nothing. */
4664 assert (bufp
->regs_allocated
== REGS_FIXED
);
4667 /* Convert the pointer data in `regstart' and `regend' to
4668 indices. Register zero has to be set differently,
4669 since we haven't kept track of any info for it. */
4670 if (regs
->num_regs
> 0)
4672 regs
->start
[0] = pos
;
4673 regs
->end
[0] = POINTER_TO_OFFSET (d
);
4676 /* Go through the first `min (num_regs, regs->num_regs)'
4677 registers, since that is all we initialized. */
4678 for (mcnt
= 1; mcnt
< MIN (num_regs
, regs
->num_regs
); mcnt
++)
4680 if (REG_UNSET (regstart
[mcnt
]) || REG_UNSET (regend
[mcnt
]))
4681 regs
->start
[mcnt
] = regs
->end
[mcnt
] = -1;
4685 = (regoff_t
) POINTER_TO_OFFSET (regstart
[mcnt
]);
4687 = (regoff_t
) POINTER_TO_OFFSET (regend
[mcnt
]);
4691 /* If the regs structure we return has more elements than
4692 were in the pattern, set the extra elements to -1. If
4693 we (re)allocated the registers, this is the case,
4694 because we always allocate enough to have at least one
4696 for (mcnt
= num_regs
; mcnt
< regs
->num_regs
; mcnt
++)
4697 regs
->start
[mcnt
] = regs
->end
[mcnt
] = -1;
4698 } /* regs && !bufp->no_sub */
4700 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4701 nfailure_points_pushed
, nfailure_points_popped
,
4702 nfailure_points_pushed
- nfailure_points_popped
);
4703 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed
);
4705 mcnt
= POINTER_TO_OFFSET (d
) - pos
;
4707 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt
);
4713 /* Otherwise match next pattern command. */
4714 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
++))
4716 /* Ignore these. Used to ignore the n of succeed_n's which
4717 currently have n == 0. */
4719 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4723 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4726 /* Match the next n pattern characters exactly. The following
4727 byte in the pattern defines n, and the n bytes after that
4728 are the characters to match. */
4731 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt
);
4733 /* Remember the start point to rollback upon failure. */
4736 /* This is written out as an if-else so we don't waste time
4737 testing `translate' inside the loop. */
4738 if (RE_TRANSLATE_P (translate
))
4743 int pat_charlen
, buf_charlen
;
4744 unsigned int pat_ch
, buf_ch
;
4747 pat_ch
= STRING_CHAR_AND_LENGTH (p
, pend
- p
, pat_charlen
);
4748 buf_ch
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, buf_charlen
);
4750 if (RE_TRANSLATE (translate
, buf_ch
)
4759 mcnt
-= pat_charlen
;
4766 if (RE_TRANSLATE (translate
, *d
) != *p
++)
4791 /* Match any character except possibly a newline or a null. */
4795 unsigned int buf_ch
;
4797 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4800 buf_ch
= RE_STRING_CHAR_AND_LENGTH (d
, dend
- d
, buf_charlen
);
4801 buf_ch
= TRANSLATE (buf_ch
);
4803 if ((!(bufp
->syntax
& RE_DOT_NEWLINE
)
4805 || ((bufp
->syntax
& RE_DOT_NOT_NULL
)
4806 && buf_ch
== '\000'))
4809 DEBUG_PRINT2 (" Matched `%d'.\n", *d
);
4818 register unsigned int c
;
4819 boolean
not = (re_opcode_t
) *(p
- 1) == charset_not
;
4822 /* Start of actual range_table, or end of bitmap if there is no
4824 unsigned char *range_table
;
4826 /* Nonzero if there is a range table. */
4827 int range_table_exists
;
4829 /* Number of ranges of range table. This is not included
4830 in the initial byte-length of the command. */
4833 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4835 range_table_exists
= CHARSET_RANGE_TABLE_EXISTS_P (&p
[-1]);
4837 if (range_table_exists
)
4839 range_table
= CHARSET_RANGE_TABLE (&p
[-1]); /* Past the bitmap. */
4840 EXTRACT_NUMBER_AND_INCR (count
, range_table
);
4844 c
= RE_STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
4845 c
= TRANSLATE (c
); /* The character to match. */
4847 if (SINGLE_BYTE_CHAR_P (c
))
4848 { /* Lookup bitmap. */
4849 /* Cast to `unsigned' instead of `unsigned char' in
4850 case the bit list is a full 32 bytes long. */
4851 if (c
< (unsigned) (CHARSET_BITMAP_SIZE (&p
[-1]) * BYTEWIDTH
)
4852 && p
[1 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
4856 else if (range_table_exists
)
4858 int class_bits
= CHARSET_RANGE_TABLE_BITS (&p
[-1]);
4860 if ( (class_bits
& BIT_ALNUM
&& ISALNUM (c
))
4861 | (class_bits
& BIT_ALPHA
&& ISALPHA (c
))
4862 | (class_bits
& BIT_ASCII
&& IS_REAL_ASCII (c
))
4863 | (class_bits
& BIT_GRAPH
&& ISGRAPH (c
))
4864 | (class_bits
& BIT_LOWER
&& ISLOWER (c
))
4865 | (class_bits
& BIT_MULTIBYTE
&& !ISUNIBYTE (c
))
4866 | (class_bits
& BIT_NONASCII
&& !IS_REAL_ASCII (c
))
4867 | (class_bits
& BIT_PRINT
&& ISPRINT (c
))
4868 | (class_bits
& BIT_PUNCT
&& ISPUNCT (c
))
4869 | (class_bits
& BIT_SPACE
&& ISSPACE (c
))
4870 | (class_bits
& BIT_UNIBYTE
&& ISUNIBYTE (c
))
4871 | (class_bits
& BIT_UPPER
&& ISUPPER (c
))
4872 | (class_bits
& BIT_WORD
&& ISWORD (c
)))
4875 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c
, range_table
, count
);
4879 if (range_table_exists
)
4880 p
= CHARSET_RANGE_TABLE_END (range_table
, count
);
4882 p
+= CHARSET_BITMAP_SIZE (&p
[-1]) + 1;
4884 if (!not) goto fail
;
4891 /* The beginning of a group is represented by start_memory.
4892 The argument is the register number. The text
4893 matched within the group is recorded (in the internal
4894 registers data structure) under the register number. */
4896 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p
);
4898 /* In case we need to undo this operation (via backtracking). */
4899 PUSH_FAILURE_REG ((unsigned int)*p
);
4902 regend
[*p
] = REG_UNSET_VALUE
; /* probably unnecessary. -sm */
4903 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart
[*p
]));
4905 /* Move past the register number and inner group count. */
4910 /* The stop_memory opcode represents the end of a group. Its
4911 argument is the same as start_memory's: the register number. */
4913 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p
);
4915 assert (!REG_UNSET (regstart
[*p
]));
4916 /* Strictly speaking, there should be code such as:
4918 assert (REG_UNSET (regend[*p]));
4919 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
4921 But the only info to be pushed is regend[*p] and it is known to
4922 be UNSET, so there really isn't anything to push.
4923 Not pushing anything, on the other hand deprives us from the
4924 guarantee that regend[*p] is UNSET since undoing this operation
4925 will not reset its value properly. This is not important since
4926 the value will only be read on the next start_memory or at
4927 the very end and both events can only happen if this stop_memory
4931 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend
[*p
]));
4933 /* Move past the register number and the inner group count. */
4938 /* \<digit> has been turned into a `duplicate' command which is
4939 followed by the numeric value of <digit> as the register number. */
4942 register re_char
*d2
, *dend2
;
4943 int regno
= *p
++; /* Get which register to match against. */
4944 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno
);
4946 /* Can't back reference a group which we've never matched. */
4947 if (REG_UNSET (regstart
[regno
]) || REG_UNSET (regend
[regno
]))
4950 /* Where in input to try to start matching. */
4951 d2
= regstart
[regno
];
4953 /* Remember the start point to rollback upon failure. */
4956 /* Where to stop matching; if both the place to start and
4957 the place to stop matching are in the same string, then
4958 set to the place to stop, otherwise, for now have to use
4959 the end of the first string. */
4961 dend2
= ((FIRST_STRING_P (regstart
[regno
])
4962 == FIRST_STRING_P (regend
[regno
]))
4963 ? regend
[regno
] : end_match_1
);
4966 /* If necessary, advance to next segment in register
4970 if (dend2
== end_match_2
) break;
4971 if (dend2
== regend
[regno
]) break;
4973 /* End of string1 => advance to string2. */
4975 dend2
= regend
[regno
];
4977 /* At end of register contents => success */
4978 if (d2
== dend2
) break;
4980 /* If necessary, advance to next segment in data. */
4983 /* How many characters left in this segment to match. */
4986 /* Want how many consecutive characters we can match in
4987 one shot, so, if necessary, adjust the count. */
4988 if (mcnt
> dend2
- d2
)
4991 /* Compare that many; failure if mismatch, else move
4993 if (RE_TRANSLATE_P (translate
)
4994 ? bcmp_translate (d
, d2
, mcnt
, translate
, multibyte
)
4995 : bcmp (d
, d2
, mcnt
))
5000 d
+= mcnt
, d2
+= mcnt
;
5006 /* begline matches the empty string at the beginning of the string
5007 (unless `not_bol' is set in `bufp'), and, if
5008 `newline_anchor' is set, after newlines. */
5010 DEBUG_PRINT1 ("EXECUTING begline.\n");
5012 if (AT_STRINGS_BEG (d
))
5014 if (!bufp
->not_bol
) break;
5019 GET_CHAR_BEFORE_2 (c
, d
, string1
, end1
, string2
, end2
);
5020 if (c
== '\n' && bufp
->newline_anchor
)
5023 /* In all other cases, we fail. */
5027 /* endline is the dual of begline. */
5029 DEBUG_PRINT1 ("EXECUTING endline.\n");
5031 if (AT_STRINGS_END (d
))
5033 if (!bufp
->not_eol
) break;
5037 PREFETCH_NOLIMIT ();
5038 if (*d
== '\n' && bufp
->newline_anchor
)
5044 /* Match at the very beginning of the data. */
5046 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5047 if (AT_STRINGS_BEG (d
))
5052 /* Match at the very end of the data. */
5054 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5055 if (AT_STRINGS_END (d
))
5060 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5061 pushes NULL as the value for the string on the stack. Then
5062 `POP_FAILURE_POINT' will keep the current value for the
5063 string, instead of restoring it. To see why, consider
5064 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5065 then the . fails against the \n. But the next thing we want
5066 to do is match the \n against the \n; if we restored the
5067 string value, we would be back at the foo.
5069 Because this is used only in specific cases, we don't need to
5070 check all the things that `on_failure_jump' does, to make
5071 sure the right things get saved on the stack. Hence we don't
5072 share its code. The only reason to push anything on the
5073 stack at all is that otherwise we would have to change
5074 `anychar's code to do something besides goto fail in this
5075 case; that seems worse than this. */
5076 case on_failure_keep_string_jump
:
5077 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5078 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5081 PUSH_FAILURE_POINT (p
- 3, NULL
);
5084 /* A nasty loop is introduced by the non-greedy *? and +?.
5085 With such loops, the stack only ever contains one failure point
5086 at a time, so that a plain on_failure_jump_loop kind of
5087 cycle detection cannot work. Worse yet, such a detection
5088 can not only fail to detect a cycle, but it can also wrongly
5089 detect a cycle (between different instantiations of the same
5091 So the method used for those nasty loops is a little different:
5092 We use a special cycle-detection-stack-frame which is pushed
5093 when the on_failure_jump_nastyloop failure-point is *popped*.
5094 This special frame thus marks the beginning of one iteration
5095 through the loop and we can hence easily check right here
5096 whether something matched between the beginning and the end of
5098 case on_failure_jump_nastyloop
:
5099 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5100 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5103 assert ((re_opcode_t
)p
[-4] == no_op
);
5104 CHECK_INFINITE_LOOP (p
- 4, d
);
5105 PUSH_FAILURE_POINT (p
- 3, d
);
5109 /* Simple loop detecting on_failure_jump: just check on the
5110 failure stack if the same spot was already hit earlier. */
5111 case on_failure_jump_loop
:
5113 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5114 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5117 CHECK_INFINITE_LOOP (p
- 3, d
);
5118 PUSH_FAILURE_POINT (p
- 3, d
);
5122 /* Uses of on_failure_jump:
5124 Each alternative starts with an on_failure_jump that points
5125 to the beginning of the next alternative. Each alternative
5126 except the last ends with a jump that in effect jumps past
5127 the rest of the alternatives. (They really jump to the
5128 ending jump of the following alternative, because tensioning
5129 these jumps is a hassle.)
5131 Repeats start with an on_failure_jump that points past both
5132 the repetition text and either the following jump or
5133 pop_failure_jump back to this on_failure_jump. */
5134 case on_failure_jump
:
5136 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5137 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5140 PUSH_FAILURE_POINT (p
-3, d
);
5143 /* This operation is used for greedy *.
5144 Compare the beginning of the repeat with what in the
5145 pattern follows its end. If we can establish that there
5146 is nothing that they would both match, i.e., that we
5147 would have to backtrack because of (as in, e.g., `a*a')
5148 then we can use a non-backtracking loop based on
5149 on_failure_keep_string_jump instead of on_failure_jump. */
5150 case on_failure_jump_smart
:
5152 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5153 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5156 unsigned char *p1
= p
; /* Next operation. */
5157 unsigned char *p2
= p
+ mcnt
; /* Destination of the jump. */
5159 p
-= 3; /* Reset so that we will re-execute the
5160 instruction once it's been changed. */
5162 EXTRACT_NUMBER (mcnt
, p2
- 2);
5164 /* Ensure this is a indeed the trivial kind of loop
5165 we are expecting. */
5166 assert (skip_one_char (p1
) == p2
- 3);
5167 assert ((re_opcode_t
) p2
[-3] == jump
&& p2
+ mcnt
== p
);
5168 DEBUG_STATEMENT (debug
+= 2);
5169 if (mutually_exclusive_p (bufp
, p1
, p2
))
5171 /* Use a fast `on_failure_keep_string_jump' loop. */
5172 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5173 *p
= (unsigned char) on_failure_keep_string_jump
;
5174 STORE_NUMBER (p2
- 2, mcnt
+ 3);
5178 /* Default to a safe `on_failure_jump' loop. */
5179 DEBUG_PRINT1 (" smart default => slow loop.\n");
5180 *p
= (unsigned char) on_failure_jump
;
5182 DEBUG_STATEMENT (debug
-= 2);
5186 /* Unconditionally jump (without popping any failure points). */
5190 EXTRACT_NUMBER_AND_INCR (mcnt
, p
); /* Get the amount to jump. */
5191 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt
);
5192 p
+= mcnt
; /* Do the jump. */
5193 DEBUG_PRINT2 ("(to %p).\n", p
);
5197 /* Have to succeed matching what follows at least n times.
5198 After that, handle like `on_failure_jump'. */
5200 EXTRACT_NUMBER (mcnt
, p
+ 2);
5201 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt
);
5204 /* Originally, this is how many times we HAVE to succeed. */
5209 STORE_NUMBER_AND_INCR (p
, mcnt
);
5210 DEBUG_PRINT3 (" Setting %p to %d.\n", p
, mcnt
);
5214 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p
+2);
5215 p
[2] = (unsigned char) no_op
;
5216 p
[3] = (unsigned char) no_op
;
5222 EXTRACT_NUMBER (mcnt
, p
+ 2);
5223 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt
);
5225 /* Originally, this is how many times we CAN jump. */
5229 STORE_NUMBER (p
+ 2, mcnt
);
5230 goto unconditional_jump
;
5232 /* If don't have to jump any more, skip over the rest of command. */
5239 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5241 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5243 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5244 DEBUG_PRINT3 (" Setting %p to %d.\n", p1
, mcnt
);
5245 STORE_NUMBER (p1
, mcnt
);
5251 not = (re_opcode_t
) *(p
- 1) == notwordbound
;
5252 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
5254 /* We SUCCEED (or FAIL) in one of the following cases: */
5256 /* Case 1: D is at the beginning or the end of string. */
5257 if (AT_STRINGS_BEG (d
) || AT_STRINGS_END (d
))
5261 /* C1 is the character before D, S1 is the syntax of C1, C2
5262 is the character at D, and S2 is the syntax of C2. */
5265 int offset
= PTR_TO_OFFSET (d
- 1);
5266 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (offset
);
5267 UPDATE_SYNTAX_TABLE (charpos
);
5269 GET_CHAR_BEFORE_2 (c1
, d
, string1
, end1
, string2
, end2
);
5272 UPDATE_SYNTAX_TABLE_FORWARD (charpos
+ 1);
5274 PREFETCH_NOLIMIT ();
5275 c2
= RE_STRING_CHAR (d
, dend
- d
);
5278 if (/* Case 2: Only one of S1 and S2 is Sword. */
5279 ((s1
== Sword
) != (s2
== Sword
))
5280 /* Case 3: Both of S1 and S2 are Sword, and macro
5281 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5282 || ((s1
== Sword
) && WORD_BOUNDARY_P (c1
, c2
)))
5291 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5293 /* We FAIL in one of the following cases: */
5295 /* Case 1: D is at the end of string. */
5296 if (AT_STRINGS_END (d
))
5300 /* C1 is the character before D, S1 is the syntax of C1, C2
5301 is the character at D, and S2 is the syntax of C2. */
5304 int offset
= PTR_TO_OFFSET (d
);
5305 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (offset
);
5306 UPDATE_SYNTAX_TABLE (charpos
);
5309 c2
= RE_STRING_CHAR (d
, dend
- d
);
5312 /* Case 2: S2 is not Sword. */
5316 /* Case 3: D is not at the beginning of string ... */
5317 if (!AT_STRINGS_BEG (d
))
5319 GET_CHAR_BEFORE_2 (c1
, d
, string1
, end1
, string2
, end2
);
5321 UPDATE_SYNTAX_TABLE_BACKWARD (charpos
- 1);
5325 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5327 if ((s1
== Sword
) && !WORD_BOUNDARY_P (c1
, c2
))
5334 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5336 /* We FAIL in one of the following cases: */
5338 /* Case 1: D is at the beginning of string. */
5339 if (AT_STRINGS_BEG (d
))
5343 /* C1 is the character before D, S1 is the syntax of C1, C2
5344 is the character at D, and S2 is the syntax of C2. */
5347 int offset
= PTR_TO_OFFSET (d
) - 1;
5348 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (offset
);
5349 UPDATE_SYNTAX_TABLE (charpos
);
5351 GET_CHAR_BEFORE_2 (c1
, d
, string1
, end1
, string2
, end2
);
5354 /* Case 2: S1 is not Sword. */
5358 /* Case 3: D is not at the end of string ... */
5359 if (!AT_STRINGS_END (d
))
5361 PREFETCH_NOLIMIT ();
5362 c2
= RE_STRING_CHAR (d
, dend
- d
);
5364 UPDATE_SYNTAX_TABLE_FORWARD (charpos
);
5368 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
5370 if ((s2
== Sword
) && !WORD_BOUNDARY_P (c1
, c2
))
5378 not = (re_opcode_t
) *(p
- 1) == notsyntaxspec
;
5380 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt
);
5384 int offset
= PTR_TO_OFFSET (d
);
5385 int pos1
= SYNTAX_TABLE_BYTE_TO_CHAR (offset
);
5386 UPDATE_SYNTAX_TABLE (pos1
);
5392 c
= RE_STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
5394 if ((SYNTAX (c
) != (enum syntaxcode
) mcnt
) ^ not)
5402 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5403 if (PTR_BYTE_POS (d
) >= PT_BYTE
)
5408 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5409 if (PTR_BYTE_POS (d
) != PT_BYTE
)
5414 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5415 if (PTR_BYTE_POS (d
) <= PT_BYTE
)
5420 case notcategoryspec
:
5421 not = (re_opcode_t
) *(p
- 1) == notcategoryspec
;
5423 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt
);
5427 c
= RE_STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
5429 if ((!CHAR_HAS_CATEGORY (c
, mcnt
)) ^ not)
5440 continue; /* Successfully executed one pattern command; keep going. */
5443 /* We goto here if a matching operation fails. */
5446 if (!FAIL_STACK_EMPTY ())
5450 /* A restart point is known. Restore to that state. */
5451 DEBUG_PRINT1 ("\nFAIL:\n");
5452 POP_FAILURE_POINT (str
, pat
);
5453 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *pat
++))
5455 case on_failure_keep_string_jump
:
5456 assert (str
== NULL
);
5457 goto continue_failure_jump
;
5459 case on_failure_jump_nastyloop
:
5460 assert ((re_opcode_t
)pat
[-2] == no_op
);
5461 PUSH_FAILURE_POINT (pat
- 2, str
);
5464 case on_failure_jump_loop
:
5465 case on_failure_jump
:
5468 continue_failure_jump
:
5469 EXTRACT_NUMBER_AND_INCR (mcnt
, pat
);
5474 /* A special frame used for nastyloops. */
5481 assert (p
>= bufp
->buffer
&& p
<= pend
);
5483 if (d
>= string1
&& d
<= end1
)
5487 break; /* Matching at this starting point really fails. */
5491 goto restore_best_regs
;
5495 return -1; /* Failure to match. */
5498 /* Subroutine definitions for re_match_2. */
5500 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5501 bytes; nonzero otherwise. */
5504 bcmp_translate (s1
, s2
, len
, translate
, multibyte
)
5507 RE_TRANSLATE_TYPE translate
;
5508 const int multibyte
;
5510 register re_char
*p1
= s1
, *p2
= s2
;
5511 re_char
*p1_end
= s1
+ len
;
5512 re_char
*p2_end
= s2
+ len
;
5514 while (p1
!= p1_end
&& p2
!= p2_end
)
5516 int p1_charlen
, p2_charlen
;
5519 p1_ch
= RE_STRING_CHAR_AND_LENGTH (p1
, p1_end
- p1
, p1_charlen
);
5520 p2_ch
= RE_STRING_CHAR_AND_LENGTH (p2
, p2_end
- p2
, p2_charlen
);
5522 if (RE_TRANSLATE (translate
, p1_ch
)
5523 != RE_TRANSLATE (translate
, p2_ch
))
5526 p1
+= p1_charlen
, p2
+= p2_charlen
;
5529 if (p1
!= p1_end
|| p2
!= p2_end
)
5535 /* Entry points for GNU code. */
5537 /* re_compile_pattern is the GNU regular expression compiler: it
5538 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5539 Returns 0 if the pattern was valid, otherwise an error string.
5541 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5542 are set in BUFP on entry.
5544 We call regex_compile to do the actual compilation. */
5547 re_compile_pattern (pattern
, length
, bufp
)
5548 const char *pattern
;
5550 struct re_pattern_buffer
*bufp
;
5554 /* GNU code is written to assume at least RE_NREGS registers will be set
5555 (and at least one extra will be -1). */
5556 bufp
->regs_allocated
= REGS_UNALLOCATED
;
5558 /* And GNU code determines whether or not to get register information
5559 by passing null for the REGS argument to re_match, etc., not by
5563 /* Match anchors at newline. */
5564 bufp
->newline_anchor
= 1;
5566 ret
= regex_compile (pattern
, length
, re_syntax_options
, bufp
);
5570 return gettext (re_error_msgid
[(int) ret
]);
5573 /* Entry points compatible with 4.2 BSD regex library. We don't define
5574 them unless specifically requested. */
5576 #if defined (_REGEX_RE_COMP) || defined (_LIBC)
5578 /* BSD has one and only one pattern buffer. */
5579 static struct re_pattern_buffer re_comp_buf
;
5583 /* Make these definitions weak in libc, so POSIX programs can redefine
5584 these names if they don't use our functions, and still use
5585 regcomp/regexec below without link errors. */
5595 if (!re_comp_buf
.buffer
)
5596 return gettext ("No previous regular expression");
5600 if (!re_comp_buf
.buffer
)
5602 re_comp_buf
.buffer
= (unsigned char *) malloc (200);
5603 if (re_comp_buf
.buffer
== NULL
)
5604 return gettext (re_error_msgid
[(int) REG_ESPACE
]);
5605 re_comp_buf
.allocated
= 200;
5607 re_comp_buf
.fastmap
= (char *) malloc (1 << BYTEWIDTH
);
5608 if (re_comp_buf
.fastmap
== NULL
)
5609 return gettext (re_error_msgid
[(int) REG_ESPACE
]);
5612 /* Since `re_exec' always passes NULL for the `regs' argument, we
5613 don't need to initialize the pattern buffer fields which affect it. */
5615 /* Match anchors at newlines. */
5616 re_comp_buf
.newline_anchor
= 1;
5618 ret
= regex_compile (s
, strlen (s
), re_syntax_options
, &re_comp_buf
);
5623 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5624 return (char *) gettext (re_error_msgid
[(int) ret
]);
5635 const int len
= strlen (s
);
5637 0 <= re_search (&re_comp_buf
, s
, len
, 0, len
, (struct re_registers
*) 0);
5639 #endif /* _REGEX_RE_COMP */
5641 /* POSIX.2 functions. Don't define these for Emacs. */
5645 /* regcomp takes a regular expression as a string and compiles it.
5647 PREG is a regex_t *. We do not expect any fields to be initialized,
5648 since POSIX says we shouldn't. Thus, we set
5650 `buffer' to the compiled pattern;
5651 `used' to the length of the compiled pattern;
5652 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5653 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5654 RE_SYNTAX_POSIX_BASIC;
5655 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5656 `fastmap' and `fastmap_accurate' to zero;
5657 `re_nsub' to the number of subexpressions in PATTERN.
5659 PATTERN is the address of the pattern string.
5661 CFLAGS is a series of bits which affect compilation.
5663 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5664 use POSIX basic syntax.
5666 If REG_NEWLINE is set, then . and [^...] don't match newline.
5667 Also, regexec will try a match beginning after every newline.
5669 If REG_ICASE is set, then we considers upper- and lowercase
5670 versions of letters to be equivalent when matching.
5672 If REG_NOSUB is set, then when PREG is passed to regexec, that
5673 routine will report only success or failure, and nothing about the
5676 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5677 the return codes and their meanings.) */
5680 regcomp (preg
, pattern
, cflags
)
5682 const char *pattern
;
5687 = (cflags
& REG_EXTENDED
) ?
5688 RE_SYNTAX_POSIX_EXTENDED
: RE_SYNTAX_POSIX_BASIC
;
5690 /* regex_compile will allocate the space for the compiled pattern. */
5692 preg
->allocated
= 0;
5695 /* Don't bother to use a fastmap when searching. This simplifies the
5696 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5697 characters after newlines into the fastmap. This way, we just try
5701 if (cflags
& REG_ICASE
)
5706 = (RE_TRANSLATE_TYPE
) malloc (CHAR_SET_SIZE
5707 * sizeof (*(RE_TRANSLATE_TYPE
)0));
5708 if (preg
->translate
== NULL
)
5709 return (int) REG_ESPACE
;
5711 /* Map uppercase characters to corresponding lowercase ones. */
5712 for (i
= 0; i
< CHAR_SET_SIZE
; i
++)
5713 preg
->translate
[i
] = ISUPPER (i
) ? tolower (i
) : i
;
5716 preg
->translate
= NULL
;
5718 /* If REG_NEWLINE is set, newlines are treated differently. */
5719 if (cflags
& REG_NEWLINE
)
5720 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5721 syntax
&= ~RE_DOT_NEWLINE
;
5722 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
5723 /* It also changes the matching behavior. */
5724 preg
->newline_anchor
= 1;
5727 preg
->newline_anchor
= 0;
5729 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
5731 /* POSIX says a null character in the pattern terminates it, so we
5732 can use strlen here in compiling the pattern. */
5733 ret
= regex_compile (pattern
, strlen (pattern
), syntax
, preg
);
5735 /* POSIX doesn't distinguish between an unmatched open-group and an
5736 unmatched close-group: both are REG_EPAREN. */
5737 if (ret
== REG_ERPAREN
) ret
= REG_EPAREN
;
5743 /* regexec searches for a given pattern, specified by PREG, in the
5746 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5747 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5748 least NMATCH elements, and we set them to the offsets of the
5749 corresponding matched substrings.
5751 EFLAGS specifies `execution flags' which affect matching: if
5752 REG_NOTBOL is set, then ^ does not match at the beginning of the
5753 string; if REG_NOTEOL is set, then $ does not match at the end.
5755 We return 0 if we find a match and REG_NOMATCH if not. */
5758 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
5759 const regex_t
*preg
;
5762 regmatch_t pmatch
[];
5766 struct re_registers regs
;
5767 regex_t private_preg
;
5768 int len
= strlen (string
);
5769 boolean want_reg_info
= !preg
->no_sub
&& nmatch
> 0;
5771 private_preg
= *preg
;
5773 private_preg
.not_bol
= !!(eflags
& REG_NOTBOL
);
5774 private_preg
.not_eol
= !!(eflags
& REG_NOTEOL
);
5776 /* The user has told us exactly how many registers to return
5777 information about, via `nmatch'. We have to pass that on to the
5778 matching routines. */
5779 private_preg
.regs_allocated
= REGS_FIXED
;
5783 regs
.num_regs
= nmatch
;
5784 regs
.start
= TALLOC (nmatch
, regoff_t
);
5785 regs
.end
= TALLOC (nmatch
, regoff_t
);
5786 if (regs
.start
== NULL
|| regs
.end
== NULL
)
5787 return (int) REG_NOMATCH
;
5790 /* Perform the searching operation. */
5791 ret
= re_search (&private_preg
, string
, len
,
5792 /* start: */ 0, /* range: */ len
,
5793 want_reg_info
? ®s
: (struct re_registers
*) 0);
5795 /* Copy the register information to the POSIX structure. */
5802 for (r
= 0; r
< nmatch
; r
++)
5804 pmatch
[r
].rm_so
= regs
.start
[r
];
5805 pmatch
[r
].rm_eo
= regs
.end
[r
];
5809 /* If we needed the temporary register info, free the space now. */
5814 /* We want zero return to mean success, unlike `re_search'. */
5815 return ret
>= 0 ? (int) REG_NOERROR
: (int) REG_NOMATCH
;
5819 /* Returns a message corresponding to an error code, ERRCODE, returned
5820 from either regcomp or regexec. We don't use PREG here. */
5823 regerror (errcode
, preg
, errbuf
, errbuf_size
)
5825 const regex_t
*preg
;
5833 || errcode
>= (sizeof (re_error_msgid
) / sizeof (re_error_msgid
[0])))
5834 /* Only error codes returned by the rest of the code should be passed
5835 to this routine. If we are given anything else, or if other regex
5836 code generates an invalid error code, then the program has a bug.
5837 Dump core so we can fix it. */
5840 msg
= gettext (re_error_msgid
[errcode
]);
5842 msg_size
= strlen (msg
) + 1; /* Includes the null. */
5844 if (errbuf_size
!= 0)
5846 if (msg_size
> errbuf_size
)
5848 strncpy (errbuf
, msg
, errbuf_size
- 1);
5849 errbuf
[errbuf_size
- 1] = 0;
5852 strcpy (errbuf
, msg
);
5859 /* Free dynamically allocated space used by PREG. */
5865 if (preg
->buffer
!= NULL
)
5866 free (preg
->buffer
);
5867 preg
->buffer
= NULL
;
5869 preg
->allocated
= 0;
5872 if (preg
->fastmap
!= NULL
)
5873 free (preg
->fastmap
);
5874 preg
->fastmap
= NULL
;
5875 preg
->fastmap_accurate
= 0;
5877 if (preg
->translate
!= NULL
)
5878 free (preg
->translate
);
5879 preg
->translate
= NULL
;
5882 #endif /* not emacs */