1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
3 internationalization features.)
5 Copyright (C) 1993,94,95,96,97,98,2000 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
23 - detect nasty infinite loops like "\\(\\)+?ab" when matching "ac".
24 - use `keep_string' more often than just .*\n.
25 - structure the opcode space into opcode+flag.
26 - merge with glic's regex.[ch]
28 That's it for now -sm */
30 /* AIX requires this to be the first thing in the file. */
31 #if defined (_AIX) && !defined (REGEX_MALLOC)
39 /* Converts the pointer to the char to BEG-based offset from the start. */
40 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
41 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
43 #define PTR_TO_OFFSET(d) 0
50 /* We need this for `regex.h', and perhaps for the Emacs include files. */
51 #include <sys/types.h>
53 /* This is for other GNU distributions with internationalized messages. */
54 #if HAVE_LIBINTL_H || defined (_LIBC)
57 # define gettext(msgid) (msgid)
61 /* This define is so xgettext can find the internationalizable
63 #define gettext_noop(String) String
66 /* The `emacs' switch turns on certain matching commands
67 that make sense only in Emacs. */
73 /* Make syntax table lookup grant data in gl_state. */
74 #define SYNTAX_ENTRY_VIA_PROPERTY
80 #define malloc xmalloc
81 #define realloc xrealloc
86 /* If we are not linking with Emacs proper,
87 we can't use the relocating allocator
88 even if config.h says that we can. */
91 #if defined (STDC_HEADERS) || defined (_LIBC)
98 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
99 If nothing else has been done, use the method below. */
100 #ifdef INHIBIT_STRING_HEADER
101 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
102 #if !defined (bzero) && !defined (bcopy)
103 #undef INHIBIT_STRING_HEADER
108 /* This is the normal way of making sure we have a bcopy and a bzero.
109 This is used in most programs--a few other programs avoid this
110 by defining INHIBIT_STRING_HEADER. */
111 #ifndef INHIBIT_STRING_HEADER
112 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
115 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
118 #define bcopy(s, d, n) memcpy ((d), (s), (n))
121 #define bzero(s, n) memset ((s), 0, (n))
128 /* Define the syntax stuff for \<, \>, etc. */
130 /* This must be nonzero for the wordchar and notwordchar pattern
131 commands in re_match_2. */
136 #ifdef SWITCH_ENUM_BUG
137 #define SWITCH_ENUM_CAST(x) ((int)(x))
139 #define SWITCH_ENUM_CAST(x) (x)
144 extern char *re_syntax_table
;
146 #else /* not SYNTAX_TABLE */
148 /* How many characters in the character set. */
149 #define CHAR_SET_SIZE 256
151 static char re_syntax_table
[CHAR_SET_SIZE
];
162 bzero (re_syntax_table
, sizeof re_syntax_table
);
164 for (c
= 'a'; c
<= 'z'; c
++)
165 re_syntax_table
[c
] = Sword
;
167 for (c
= 'A'; c
<= 'Z'; c
++)
168 re_syntax_table
[c
] = Sword
;
170 for (c
= '0'; c
<= '9'; c
++)
171 re_syntax_table
[c
] = Sword
;
173 re_syntax_table
['_'] = Sword
;
178 #endif /* not SYNTAX_TABLE */
180 #define SYNTAX(c) re_syntax_table[c]
182 /* Dummy macros for non-Emacs environments. */
183 #define BASE_LEADING_CODE_P(c) (0)
184 #define WORD_BOUNDARY_P(c1, c2) (0)
185 #define CHAR_HEAD_P(p) (1)
186 #define SINGLE_BYTE_CHAR_P(c) (1)
187 #define SAME_CHARSET_P(c1, c2) (1)
188 #define MULTIBYTE_FORM_LENGTH(p, s) (1)
189 #define STRING_CHAR(p, s) (*(p))
190 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
191 #define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
192 (c = ((p) == (end1) ? *(str2) : *(p)))
193 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
194 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
195 #endif /* not emacs */
197 /* Get the interface, including the syntax bits. */
200 /* isalpha etc. are used for the character classes. */
205 /* 1 if C is an ASCII character. */
206 #define IS_REAL_ASCII(c) ((c) < 0200)
208 /* 1 if C is a unibyte character. */
209 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
211 /* The Emacs definitions should not be directly affected by locales. */
213 /* In Emacs, these are only used for single-byte characters. */
214 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
215 #define ISCNTRL(c) ((c) < ' ')
216 #define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
217 || ((c) >= 'a' && (c) <= 'f') \
218 || ((c) >= 'A' && (c) <= 'F'))
220 /* This is only used for single-byte characters. */
221 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
223 /* The rest must handle multibyte characters. */
225 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
226 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
229 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
230 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
233 #define ISALNUM(c) (IS_REAL_ASCII (c) \
234 ? (((c) >= 'a' && (c) <= 'z') \
235 || ((c) >= 'A' && (c) <= 'Z') \
236 || ((c) >= '0' && (c) <= '9')) \
237 : SYNTAX (c) == Sword)
239 #define ISALPHA(c) (IS_REAL_ASCII (c) \
240 ? (((c) >= 'a' && (c) <= 'z') \
241 || ((c) >= 'A' && (c) <= 'Z')) \
242 : SYNTAX (c) == Sword)
244 #define ISLOWER(c) (LOWERCASEP (c))
246 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
247 ? ((c) > ' ' && (c) < 0177 \
248 && !(((c) >= 'a' && (c) <= 'z') \
249 || ((c) >= 'A' && (c) <= 'Z') \
250 || ((c) >= '0' && (c) <= '9'))) \
251 : SYNTAX (c) != Sword)
253 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
255 #define ISUPPER(c) (UPPERCASEP (c))
257 #define ISWORD(c) (SYNTAX (c) == Sword)
259 #else /* not emacs */
261 /* Jim Meyering writes:
263 "... Some ctype macros are valid only for character codes that
264 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
265 using /bin/cc or gcc but without giving an ansi option). So, all
266 ctype uses should be through macros like ISPRINT... If
267 STDC_HEADERS is defined, then autoconf has verified that the ctype
268 macros don't need to be guarded with references to isascii. ...
269 Defining isascii to 1 should let any compiler worth its salt
270 eliminate the && through constant folding." */
272 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
275 #define ISASCII(c) isascii(c)
278 /* 1 if C is an ASCII character. */
279 #define IS_REAL_ASCII(c) ((c) < 0200)
281 /* This distinction is not meaningful, except in Emacs. */
282 #define ISUNIBYTE(c) 1
284 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
289 #define ISBLANK(c) (ISASCII (c) && isblank (c))
291 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
294 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
296 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
299 #define ISPRINT(c) (ISASCII (c) && isprint (c))
300 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
301 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
302 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
303 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
304 #define ISLOWER(c) (ISASCII (c) && islower (c))
305 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
306 #define ISSPACE(c) (ISASCII (c) && isspace (c))
307 #define ISUPPER(c) (ISASCII (c) && isupper (c))
308 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
310 #define ISWORD(c) ISALPHA(c)
312 #endif /* not emacs */
315 #define NULL (void *)0
318 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
319 since ours (we hope) works properly with all combinations of
320 machines, compilers, `char' and `unsigned char' argument types.
321 (Per Bothner suggested the basic approach.) */
322 #undef SIGN_EXTEND_CHAR
324 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
325 #else /* not __STDC__ */
326 /* As in Harbison and Steele. */
327 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
330 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
331 use `alloca' instead of `malloc'. This is because using malloc in
332 re_search* or re_match* could cause memory leaks when C-g is used in
333 Emacs; also, malloc is slower and causes storage fragmentation. On
334 the other hand, malloc is more portable, and easier to debug.
336 Because we sometimes use alloca, some routines have to be macros,
337 not functions -- `alloca'-allocated space disappears at the end of the
338 function it is called in. */
342 #define REGEX_ALLOCATE malloc
343 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
344 #define REGEX_FREE free
346 #else /* not REGEX_MALLOC */
348 /* Emacs already defines alloca, sometimes. */
351 /* Make alloca work the best possible way. */
353 #define alloca __builtin_alloca
354 #else /* not __GNUC__ */
357 #else /* not __GNUC__ or HAVE_ALLOCA_H */
358 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */
359 #ifndef _AIX /* Already did AIX, up at the top. */
361 #endif /* not _AIX */
363 #endif /* not HAVE_ALLOCA_H */
364 #endif /* not __GNUC__ */
366 #endif /* not alloca */
368 #define REGEX_ALLOCATE alloca
370 /* Assumes a `char *destination' variable. */
371 #define REGEX_REALLOCATE(source, osize, nsize) \
372 (destination = (char *) alloca (nsize), \
373 bcopy (source, destination, osize), \
376 /* No need to do anything to free, after alloca. */
377 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
379 #endif /* not REGEX_MALLOC */
381 /* Define how to allocate the failure stack. */
383 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
385 #define REGEX_ALLOCATE_STACK(size) \
386 r_alloc (&failure_stack_ptr, (size))
387 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
388 r_re_alloc (&failure_stack_ptr, (nsize))
389 #define REGEX_FREE_STACK(ptr) \
390 r_alloc_free (&failure_stack_ptr)
392 #else /* not using relocating allocator */
396 #define REGEX_ALLOCATE_STACK malloc
397 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
398 #define REGEX_FREE_STACK free
400 #else /* not REGEX_MALLOC */
402 #define REGEX_ALLOCATE_STACK alloca
404 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
405 REGEX_REALLOCATE (source, osize, nsize)
406 /* No need to explicitly free anything. */
407 #define REGEX_FREE_STACK(arg)
409 #endif /* not REGEX_MALLOC */
410 #endif /* not using relocating allocator */
413 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
414 `string1' or just past its end. This works if PTR is NULL, which is
416 #define FIRST_STRING_P(ptr) \
417 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
419 /* (Re)Allocate N items of type T using malloc, or fail. */
420 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
421 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
422 #define RETALLOC_IF(addr, n, t) \
423 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
424 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
426 #define BYTEWIDTH 8 /* In bits. */
428 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
432 #define MAX(a, b) ((a) > (b) ? (a) : (b))
433 #define MIN(a, b) ((a) < (b) ? (a) : (b))
435 /* Type of source-pattern and string chars. */
436 typedef const unsigned char re_char
;
438 typedef char boolean
;
442 static int re_match_2_internal ();
444 /* These are the command codes that appear in compiled regular
445 expressions. Some opcodes are followed by argument bytes. A
446 command code can specify any interpretation whatsoever for its
447 arguments. Zero bytes may appear in the compiled regular expression. */
453 /* Succeed right away--no more backtracking. */
456 /* Followed by one byte giving n, then by n literal bytes. */
459 /* Matches any (more or less) character. */
462 /* Matches any one char belonging to specified set. First
463 following byte is number of bitmap bytes. Then come bytes
464 for a bitmap saying which chars are in. Bits in each byte
465 are ordered low-bit-first. A character is in the set if its
466 bit is 1. A character too large to have a bit in the map is
467 automatically not in the set.
469 If the length byte has the 0x80 bit set, then that stuff
470 is followed by a range table:
471 2 bytes of flags for character sets (low 8 bits, high 8 bits)
472 See RANGE_TABLE_WORK_BITS below.
473 2 bytes, the number of pairs that follow
474 pairs, each 2 multibyte characters,
475 each multibyte character represented as 3 bytes. */
478 /* Same parameters as charset, but match any character that is
479 not one of those specified. */
482 /* Start remembering the text that is matched, for storing in a
483 register. Followed by one byte with the register number, in
484 the range 0 to one less than the pattern buffer's re_nsub
488 /* Stop remembering the text that is matched and store it in a
489 memory register. Followed by one byte with the register
490 number, in the range 0 to one less than `re_nsub' in the
494 /* Match a duplicate of something remembered. Followed by one
495 byte containing the register number. */
498 /* Fail unless at beginning of line. */
501 /* Fail unless at end of line. */
504 /* Succeeds if at beginning of buffer (if emacs) or at beginning
505 of string to be matched (if not). */
508 /* Analogously, for end of buffer/string. */
511 /* Followed by two byte relative address to which to jump. */
514 /* Followed by two-byte relative address of place to resume at
515 in case of failure. */
518 /* Like on_failure_jump, but pushes a placeholder instead of the
519 current string position when executed. */
520 on_failure_keep_string_jump
,
522 /* Like `on_failure_jump', except that it assumes that the
523 pattern following it is mutually exclusive with the pattern
524 at the destination of the jump: if one matches something,
525 the other won't match at all.
526 Always used via `on_failure_jump_smart'. */
527 on_failure_jump_exclusive
,
529 /* Just like `on_failure_jump', except that it checks that we
530 don't get stuck in an infinite loop (matching an empty string
532 on_failure_jump_loop
,
534 /* A smart `on_failure_jump' used for greedy * and + operators.
535 It analyses the loop before which it is put and if the
536 loop does not require backtracking, it changes itself to
537 `on_failure_jump_exclusive', else it just defaults to
538 changing itself into `on_failure_jump_loop'. */
539 on_failure_jump_smart
,
541 /* Followed by two-byte relative address and two-byte number n.
542 After matching N times, jump to the address upon failure. */
545 /* Followed by two-byte relative address, and two-byte number n.
546 Jump to the address N times, then fail. */
549 /* Set the following two-byte relative address to the
550 subsequent two-byte number. The address *includes* the two
554 wordchar
, /* Matches any word-constituent character. */
555 notwordchar
, /* Matches any char that is not a word-constituent. */
557 wordbeg
, /* Succeeds if at word beginning. */
558 wordend
, /* Succeeds if at word end. */
560 wordbound
, /* Succeeds if at a word boundary. */
561 notwordbound
/* Succeeds if not at a word boundary. */
564 ,before_dot
, /* Succeeds if before point. */
565 at_dot
, /* Succeeds if at point. */
566 after_dot
, /* Succeeds if after point. */
568 /* Matches any character whose syntax is specified. Followed by
569 a byte which contains a syntax code, e.g., Sword. */
572 /* Matches any character whose syntax is not that specified. */
575 /* Matches any character whose category-set contains the specified
576 category. The operator is followed by a byte which contains a
577 category code (mnemonic ASCII character). */
580 /* Matches any character whose category-set does not contain the
581 specified category. The operator is followed by a byte which
582 contains the category code (mnemonic ASCII character). */
587 /* Common operations on the compiled pattern. */
589 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
591 #define STORE_NUMBER(destination, number) \
593 (destination)[0] = (number) & 0377; \
594 (destination)[1] = (number) >> 8; \
597 /* Same as STORE_NUMBER, except increment DESTINATION to
598 the byte after where the number is stored. Therefore, DESTINATION
599 must be an lvalue. */
601 #define STORE_NUMBER_AND_INCR(destination, number) \
603 STORE_NUMBER (destination, number); \
604 (destination) += 2; \
607 /* Put into DESTINATION a number stored in two contiguous bytes starting
610 #define EXTRACT_NUMBER(destination, source) \
612 (destination) = *(source) & 0377; \
613 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
618 extract_number (dest
, source
)
620 unsigned char *source
;
622 int temp
= SIGN_EXTEND_CHAR (*(source
+ 1));
623 *dest
= *source
& 0377;
627 #ifndef EXTRACT_MACROS /* To debug the macros. */
628 #undef EXTRACT_NUMBER
629 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
630 #endif /* not EXTRACT_MACROS */
634 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
635 SOURCE must be an lvalue. */
637 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
639 EXTRACT_NUMBER (destination, source); \
645 extract_number_and_incr (destination
, source
)
647 unsigned char **source
;
649 extract_number (destination
, *source
);
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
656 extract_number_and_incr (&dest, &src)
657 #endif /* not EXTRACT_MACROS */
661 /* Store a multibyte character in three contiguous bytes starting
662 DESTINATION, and increment DESTINATION to the byte after where the
663 character is stored. Therefore, DESTINATION must be an lvalue. */
665 #define STORE_CHARACTER_AND_INCR(destination, character) \
667 (destination)[0] = (character) & 0377; \
668 (destination)[1] = ((character) >> 8) & 0377; \
669 (destination)[2] = (character) >> 16; \
670 (destination) += 3; \
673 /* Put into DESTINATION a character stored in three contiguous bytes
674 starting at SOURCE. */
676 #define EXTRACT_CHARACTER(destination, source) \
678 (destination) = ((source)[0] \
679 | ((source)[1] << 8) \
680 | ((source)[2] << 16)); \
684 /* Macros for charset. */
686 /* Size of bitmap of charset P in bytes. P is a start of charset,
687 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
688 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
690 /* Nonzero if charset P has range table. */
691 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
693 /* Return the address of range table of charset P. But not the start
694 of table itself, but the before where the number of ranges is
695 stored. `2 +' means to skip re_opcode_t and size of bitmap,
696 and the 2 bytes of flags at the start of the range table. */
697 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
699 /* Extract the bit flags that start a range table. */
700 #define CHARSET_RANGE_TABLE_BITS(p) \
701 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
702 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
704 /* Test if C is listed in the bitmap of charset P. */
705 #define CHARSET_LOOKUP_BITMAP(p, c) \
706 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
707 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
709 /* Return the address of end of RANGE_TABLE. COUNT is number of
710 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
711 is start of range and end of range. `* 3' is size of each start
713 #define CHARSET_RANGE_TABLE_END(range_table, count) \
714 ((range_table) + (count) * 2 * 3)
716 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
717 COUNT is number of ranges in RANGE_TABLE. */
718 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
721 int range_start, range_end; \
723 unsigned char *range_table_end \
724 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
726 for (p = (range_table); p < range_table_end; p += 2 * 3) \
728 EXTRACT_CHARACTER (range_start, p); \
729 EXTRACT_CHARACTER (range_end, p + 3); \
731 if (range_start <= (c) && (c) <= range_end) \
740 /* Test if C is in range table of CHARSET. The flag NOT is negated if
741 C is listed in it. */
742 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
745 /* Number of ranges in range table. */ \
747 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
749 EXTRACT_NUMBER_AND_INCR (count, range_table); \
750 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
754 /* If DEBUG is defined, Regex prints many voluminous messages about what
755 it is doing (if the variable `debug' is nonzero). If linked with the
756 main program in `iregex.c', you can enter patterns and strings
757 interactively. And if linked with the main program in `main.c' and
758 the other test files, you can run the already-written tests. */
762 /* We use standard I/O for debugging. */
765 /* It is useful to test things that ``must'' be true when debugging. */
768 static int debug
= -100000;
770 #define DEBUG_STATEMENT(e) e
771 #define DEBUG_PRINT1(x) if (debug > 0) printf (x)
772 #define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
773 #define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
774 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
775 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
776 if (debug > 0) print_partial_compiled_pattern (s, e)
777 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
778 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
781 /* Print the fastmap in human-readable form. */
784 print_fastmap (fastmap
)
787 unsigned was_a_range
= 0;
790 while (i
< (1 << BYTEWIDTH
))
796 while (i
< (1 << BYTEWIDTH
) && fastmap
[i
])
812 /* Print a compiled pattern string in human-readable form, starting at
813 the START pointer into it and ending just before the pointer END. */
816 print_partial_compiled_pattern (start
, end
)
817 unsigned char *start
;
821 unsigned char *p
= start
;
822 unsigned char *pend
= end
;
830 /* Loop over pattern commands. */
833 printf ("%d:\t", p
- start
);
835 switch ((re_opcode_t
) *p
++)
847 printf ("/exactn/%d", mcnt
);
857 printf ("/start_memory/%d", *p
++);
861 printf ("/stop_memory/%d", *p
++);
865 printf ("/duplicate/%d", *p
++);
875 register int c
, last
= -100;
876 register int in_range
= 0;
877 int length
= CHARSET_BITMAP_SIZE (p
- 1);
878 int has_range_table
= CHARSET_RANGE_TABLE_EXISTS_P (p
- 1);
880 printf ("/charset [%s",
881 (re_opcode_t
) *(p
- 1) == charset_not
? "^" : "");
883 assert (p
+ *p
< pend
);
885 for (c
= 0; c
< 256; c
++)
887 && (p
[1 + (c
/8)] & (1 << (c
% 8))))
889 /* Are we starting a range? */
890 if (last
+ 1 == c
&& ! in_range
)
895 /* Have we broken a range? */
896 else if (last
+ 1 != c
&& in_range
)
918 printf ("has-range-table");
920 /* ??? Should print the range table; for now, just skip it. */
921 p
+= 2; /* skip range table bits */
922 EXTRACT_NUMBER_AND_INCR (count
, p
);
923 p
= CHARSET_RANGE_TABLE_END (p
, count
);
936 case on_failure_jump
:
937 extract_number_and_incr (&mcnt
, &p
);
938 printf ("/on_failure_jump to %d", p
+ mcnt
- start
);
941 case on_failure_keep_string_jump
:
942 extract_number_and_incr (&mcnt
, &p
);
943 printf ("/on_failure_keep_string_jump to %d", p
+ mcnt
- start
);
946 case on_failure_jump_exclusive
:
947 extract_number_and_incr (&mcnt
, &p
);
948 printf ("/on_failure_jump_exclusive to %d", p
+ mcnt
- start
);
951 case on_failure_jump_loop
:
952 extract_number_and_incr (&mcnt
, &p
);
953 printf ("/on_failure_jump_loop to %d", p
+ mcnt
- start
);
956 case on_failure_jump_smart
:
957 extract_number_and_incr (&mcnt
, &p
);
958 printf ("/on_failure_jump_smart to %d", p
+ mcnt
- start
);
962 extract_number_and_incr (&mcnt
, &p
);
963 printf ("/jump to %d", p
+ mcnt
- start
);
967 extract_number_and_incr (&mcnt
, &p
);
968 extract_number_and_incr (&mcnt2
, &p
);
969 printf ("/succeed_n to %d, %d times", p
- 2 + mcnt
- start
, mcnt2
);
973 extract_number_and_incr (&mcnt
, &p
);
974 extract_number_and_incr (&mcnt2
, &p
);
975 printf ("/jump_n to %d, %d times", p
- 2 + mcnt
- start
, mcnt2
);
979 extract_number_and_incr (&mcnt
, &p
);
980 extract_number_and_incr (&mcnt2
, &p
);
981 printf ("/set_number_at location %d to %d", p
- 2 + mcnt
- start
, mcnt2
);
985 printf ("/wordbound");
989 printf ("/notwordbound");
1001 printf ("/before_dot");
1009 printf ("/after_dot");
1013 printf ("/syntaxspec");
1015 printf ("/%d", mcnt
);
1019 printf ("/notsyntaxspec");
1021 printf ("/%d", mcnt
);
1026 printf ("/wordchar");
1030 printf ("/notwordchar");
1042 printf ("?%d", *(p
-1));
1048 printf ("%d:\tend of pattern.\n", p
- start
);
1053 print_compiled_pattern (bufp
)
1054 struct re_pattern_buffer
*bufp
;
1056 unsigned char *buffer
= bufp
->buffer
;
1058 print_partial_compiled_pattern (buffer
, buffer
+ bufp
->used
);
1059 printf ("%ld bytes used/%ld bytes allocated.\n", bufp
->used
, bufp
->allocated
);
1061 if (bufp
->fastmap_accurate
&& bufp
->fastmap
)
1063 printf ("fastmap: ");
1064 print_fastmap (bufp
->fastmap
);
1067 printf ("re_nsub: %d\t", bufp
->re_nsub
);
1068 printf ("regs_alloc: %d\t", bufp
->regs_allocated
);
1069 printf ("can_be_null: %d\t", bufp
->can_be_null
);
1070 printf ("newline_anchor: %d\n", bufp
->newline_anchor
);
1071 printf ("no_sub: %d\t", bufp
->no_sub
);
1072 printf ("not_bol: %d\t", bufp
->not_bol
);
1073 printf ("not_eol: %d\t", bufp
->not_eol
);
1074 printf ("syntax: %d\n", bufp
->syntax
);
1076 /* Perhaps we should print the translate table? */
1081 print_double_string (where
, string1
, size1
, string2
, size2
)
1094 if (FIRST_STRING_P (where
))
1096 for (this_char
= where
- string1
; this_char
< size1
; this_char
++)
1097 putchar (string1
[this_char
]);
1102 for (this_char
= where
- string2
; this_char
< size2
; this_char
++)
1103 putchar (string2
[this_char
]);
1107 #else /* not DEBUG */
1112 #define DEBUG_STATEMENT(e)
1113 #define DEBUG_PRINT1(x)
1114 #define DEBUG_PRINT2(x1, x2)
1115 #define DEBUG_PRINT3(x1, x2, x3)
1116 #define DEBUG_PRINT4(x1, x2, x3, x4)
1117 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1118 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1120 #endif /* not DEBUG */
1122 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1123 also be assigned to arbitrarily: each pattern buffer stores its own
1124 syntax, so it can be changed between regex compilations. */
1125 /* This has no initializer because initialized variables in Emacs
1126 become read-only after dumping. */
1127 reg_syntax_t re_syntax_options
;
1130 /* Specify the precise syntax of regexps for compilation. This provides
1131 for compatibility for various utilities which historically have
1132 different, incompatible syntaxes.
1134 The argument SYNTAX is a bit mask comprised of the various bits
1135 defined in regex.h. We return the old syntax. */
1138 re_set_syntax (syntax
)
1139 reg_syntax_t syntax
;
1141 reg_syntax_t ret
= re_syntax_options
;
1143 re_syntax_options
= syntax
;
1147 /* This table gives an error message for each of the error codes listed
1148 in regex.h. Obviously the order here has to be same as there.
1149 POSIX doesn't require that we do anything for REG_NOERROR,
1150 but why not be nice? */
1152 static const char *re_error_msgid
[] =
1154 gettext_noop ("Success"), /* REG_NOERROR */
1155 gettext_noop ("No match"), /* REG_NOMATCH */
1156 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1157 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1158 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1159 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1160 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1161 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1162 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1163 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1164 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1165 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1166 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1167 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1168 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1169 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1170 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1173 /* Avoiding alloca during matching, to placate r_alloc. */
1175 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1176 searching and matching functions should not call alloca. On some
1177 systems, alloca is implemented in terms of malloc, and if we're
1178 using the relocating allocator routines, then malloc could cause a
1179 relocation, which might (if the strings being searched are in the
1180 ralloc heap) shift the data out from underneath the regexp
1183 Here's another reason to avoid allocation: Emacs
1184 processes input from X in a signal handler; processing X input may
1185 call malloc; if input arrives while a matching routine is calling
1186 malloc, then we're scrod. But Emacs can't just block input while
1187 calling matching routines; then we don't notice interrupts when
1188 they come in. So, Emacs blocks input around all regexp calls
1189 except the matching calls, which it leaves unprotected, in the
1190 faith that they will not malloc. */
1192 /* Normally, this is fine. */
1193 #define MATCH_MAY_ALLOCATE
1195 /* When using GNU C, we are not REALLY using the C alloca, no matter
1196 what config.h may say. So don't take precautions for it. */
1201 /* The match routines may not allocate if (1) they would do it with malloc
1202 and (2) it's not safe for them to use malloc.
1203 Note that if REL_ALLOC is defined, matching would not use malloc for the
1204 failure stack, but we would still use it for the register vectors;
1205 so REL_ALLOC should not affect this. */
1206 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1207 #undef MATCH_MAY_ALLOCATE
1211 /* Failure stack declarations and macros; both re_compile_fastmap and
1212 re_match_2 use a failure stack. These have to be macros because of
1213 REGEX_ALLOCATE_STACK. */
1216 /* Approximate number of failure points for which to initially allocate space
1217 when matching. If this number is exceeded, we allocate more
1218 space, so it is not a hard limit. */
1219 #ifndef INIT_FAILURE_ALLOC
1220 #define INIT_FAILURE_ALLOC 20
1223 /* Roughly the maximum number of failure points on the stack. Would be
1224 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1225 This is a variable only so users of regex can assign to it; we never
1226 change it ourselves. */
1227 #if defined (MATCH_MAY_ALLOCATE)
1228 /* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1229 whose default stack limit is 2mb. In order for a larger
1230 value to work reliably, you have to try to make it accord
1231 with the process stack limit. */
1232 int re_max_failures
= 40000;
1234 int re_max_failures
= 4000;
1237 union fail_stack_elt
1239 const unsigned char *pointer
;
1240 unsigned int integer
;
1243 typedef union fail_stack_elt fail_stack_elt_t
;
1247 fail_stack_elt_t
*stack
;
1249 unsigned avail
; /* Offset of next open position. */
1250 unsigned frame
; /* Offset of the cur constructed frame. */
1253 #define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
1254 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
1255 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1258 /* Define macros to initialize and free the failure stack.
1259 Do `return -2' if the alloc fails. */
1261 #ifdef MATCH_MAY_ALLOCATE
1262 #define INIT_FAIL_STACK() \
1264 fail_stack.stack = (fail_stack_elt_t *) \
1265 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1266 * sizeof (fail_stack_elt_t)); \
1268 if (fail_stack.stack == NULL) \
1271 fail_stack.size = INIT_FAILURE_ALLOC; \
1272 fail_stack.avail = 0; \
1273 fail_stack.frame = 0; \
1276 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1278 #define INIT_FAIL_STACK() \
1280 fail_stack.avail = 0; \
1281 fail_stack.frame = 0; \
1284 #define RESET_FAIL_STACK()
1288 /* Double the size of FAIL_STACK, up to a limit
1289 which allows approximately `re_max_failures' items.
1291 Return 1 if succeeds, and 0 if either ran out of memory
1292 allocating space for it or it was already too large.
1294 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1296 /* Factor to increase the failure stack size by
1297 when we increase it.
1298 This used to be 2, but 2 was too wasteful
1299 because the old discarded stacks added up to as much space
1300 were as ultimate, maximum-size stack. */
1301 #define FAIL_STACK_GROWTH_FACTOR 4
1303 #define GROW_FAIL_STACK(fail_stack) \
1304 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1305 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1307 : ((fail_stack).stack \
1308 = (fail_stack_elt_t *) \
1309 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1310 (fail_stack).size * sizeof (fail_stack_elt_t), \
1311 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1312 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1313 * FAIL_STACK_GROWTH_FACTOR))), \
1315 (fail_stack).stack == NULL \
1317 : ((fail_stack).size \
1318 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1319 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1320 * FAIL_STACK_GROWTH_FACTOR)) \
1321 / sizeof (fail_stack_elt_t)), \
1325 /* Push pointer POINTER on FAIL_STACK.
1326 Return 1 if was able to do so and 0 if ran out of memory allocating
1328 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1329 ((FAIL_STACK_FULL () \
1330 && !GROW_FAIL_STACK (FAIL_STACK)) \
1332 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1334 #define POP_PATTERN_OP() POP_FAILURE_POINTER ()
1336 /* Push a pointer value onto the failure stack.
1337 Assumes the variable `fail_stack'. Probably should only
1338 be called from within `PUSH_FAILURE_POINT'. */
1339 #define PUSH_FAILURE_POINTER(item) \
1340 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1342 /* This pushes an integer-valued item onto the failure stack.
1343 Assumes the variable `fail_stack'. Probably should only
1344 be called from within `PUSH_FAILURE_POINT'. */
1345 #define PUSH_FAILURE_INT(item) \
1346 fail_stack.stack[fail_stack.avail++].integer = (item)
1348 /* Push a fail_stack_elt_t value onto the failure stack.
1349 Assumes the variable `fail_stack'. Probably should only
1350 be called from within `PUSH_FAILURE_POINT'. */
1351 #define PUSH_FAILURE_ELT(item) \
1352 fail_stack.stack[fail_stack.avail++] = (item)
1354 /* These three POP... operations complement the three PUSH... operations.
1355 All assume that `fail_stack' is nonempty. */
1356 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1357 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1358 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1360 /* Individual items aside from the registers. */
1361 #define NUM_NONREG_ITEMS 3
1363 /* Used to examine the stack (to detect infinite loops). */
1364 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
1365 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
1366 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1367 #define TOP_FAILURE_HANDLE() fail_stack.frame
1370 #define ENSURE_FAIL_STACK(space) \
1371 while (REMAINING_AVAIL_SLOTS <= space) { \
1372 if (!GROW_FAIL_STACK (fail_stack)) \
1374 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1375 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1378 /* Push register NUM onto the stack. */
1379 #define PUSH_FAILURE_REG(num) \
1381 char *destination; \
1382 ENSURE_FAIL_STACK(3); \
1383 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1384 num, regstart[num], regend[num]); \
1385 PUSH_FAILURE_POINTER (regstart[num]); \
1386 PUSH_FAILURE_POINTER (regend[num]); \
1387 PUSH_FAILURE_INT (num); \
1390 /* Pop a saved register off the stack. */
1391 #define POP_FAILURE_REG() \
1393 int reg = POP_FAILURE_INT (); \
1394 regend[reg] = POP_FAILURE_POINTER (); \
1395 regstart[reg] = POP_FAILURE_POINTER (); \
1396 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1397 reg, regstart[reg], regend[reg]); \
1400 /* Check that we are not stuck in an infinite loop. */
1401 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1403 int failure = TOP_FAILURE_HANDLE(); \
1404 /* Check for infinite matching loops */ \
1405 while (failure > 0 && \
1406 (FAILURE_STR (failure) == string_place \
1407 || FAILURE_STR (failure) == NULL)) \
1409 assert (FAILURE_PAT (failure) >= bufp->buffer \
1410 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1411 if (FAILURE_PAT (failure) == pat_cur) \
1413 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1414 failure = NEXT_FAILURE_HANDLE(failure); \
1416 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1419 /* Push the information about the state we will need
1420 if we ever fail back to it.
1422 Requires variables fail_stack, regstart, regend and
1423 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1426 Does `return FAILURE_CODE' if runs out of memory. */
1428 #define PUSH_FAILURE_POINT(pattern, string_place) \
1430 char *destination; \
1431 /* Must be int, so when we don't save any registers, the arithmetic \
1432 of 0 + -1 isn't done as unsigned. */ \
1434 DEBUG_STATEMENT (failure_id++); \
1435 DEBUG_STATEMENT (nfailure_points_pushed++); \
1436 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1437 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1438 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1440 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1442 DEBUG_PRINT1 ("\n"); \
1444 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1445 PUSH_FAILURE_INT (fail_stack.frame); \
1447 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1448 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1449 DEBUG_PRINT1 ("'\n"); \
1450 PUSH_FAILURE_POINTER (string_place); \
1452 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1453 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1454 PUSH_FAILURE_POINTER (pattern); \
1456 /* Close the frame by moving the frame pointer past it. */ \
1457 fail_stack.frame = fail_stack.avail; \
1460 /* Estimate the size of data pushed by a typical failure stack entry.
1461 An estimate is all we need, because all we use this for
1462 is to choose a limit for how big to make the failure stack. */
1464 #define TYPICAL_FAILURE_SIZE 20
1466 /* How many items can still be added to the stack without overflowing it. */
1467 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1470 /* Pops what PUSH_FAIL_STACK pushes.
1472 We restore into the parameters, all of which should be lvalues:
1473 STR -- the saved data position.
1474 PAT -- the saved pattern position.
1475 REGSTART, REGEND -- arrays of string positions.
1477 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1478 `pend', `string1', `size1', `string2', and `size2'. */
1480 #define POP_FAILURE_POINT(str, pat) \
1482 assert (!FAIL_STACK_EMPTY ()); \
1484 /* Remove failure points and point to how many regs pushed. */ \
1485 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1486 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1487 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1489 /* Pop the saved registers. */ \
1490 while (fail_stack.frame < fail_stack.avail) \
1491 POP_FAILURE_REG (); \
1493 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1494 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1495 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1497 /* If the saved string location is NULL, it came from an \
1498 on_failure_keep_string_jump opcode, and we want to throw away the \
1499 saved NULL, thus retaining our current position in the string. */ \
1500 str = (re_char *) POP_FAILURE_POINTER (); \
1501 DEBUG_PRINT2 (" Popping string %p: `", str); \
1502 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1503 DEBUG_PRINT1 ("'\n"); \
1505 fail_stack.frame = POP_FAILURE_INT (); \
1506 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
1508 assert (fail_stack.avail >= 0); \
1509 assert (fail_stack.frame <= fail_stack.avail); \
1511 DEBUG_STATEMENT (nfailure_points_popped++); \
1512 } while (0) /* POP_FAILURE_POINT */
1516 /* Registers are set to a sentinel when they haven't yet matched. */
1517 #define REG_UNSET_VALUE NULL
1518 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1520 /* Subroutine declarations and macros for regex_compile. */
1522 static void store_op1
_RE_ARGS((re_opcode_t op
, unsigned char *loc
, int arg
));
1523 static void store_op2
_RE_ARGS((re_opcode_t op
, unsigned char *loc
,
1524 int arg1
, int arg2
));
1525 static void insert_op1
_RE_ARGS((re_opcode_t op
, unsigned char *loc
,
1526 int arg
, unsigned char *end
));
1527 static void insert_op2
_RE_ARGS((re_opcode_t op
, unsigned char *loc
,
1528 int arg1
, int arg2
, unsigned char *end
));
1529 static boolean at_begline_loc_p
_RE_ARGS((const unsigned char *pattern
,
1530 const unsigned char *p
,
1531 reg_syntax_t syntax
));
1532 static boolean at_endline_loc_p
_RE_ARGS((const unsigned char *p
,
1533 const unsigned char *pend
,
1534 reg_syntax_t syntax
));
1536 /* Fetch the next character in the uncompiled pattern---translating it
1537 if necessary. Also cast from a signed character in the constant
1538 string passed to us by the user to an unsigned char that we can use
1539 as an array index (in, e.g., `translate'). */
1541 #define PATFETCH(c) \
1544 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
1548 /* Fetch the next character in the uncompiled pattern, with no
1550 #define PATFETCH_RAW(c) \
1551 do {if (p == pend) return REG_EEND; \
1555 /* Go backwards one character in the pattern. */
1556 #define PATUNFETCH p--
1559 /* If `translate' is non-null, return translate[D], else just D. We
1560 cast the subscript to translate because some data is declared as
1561 `char *', to avoid warnings when a string constant is passed. But
1562 when we use a character as a subscript we must make it unsigned. */
1564 #define TRANSLATE(d) \
1565 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1569 /* Macros for outputting the compiled pattern into `buffer'. */
1571 /* If the buffer isn't allocated when it comes in, use this. */
1572 #define INIT_BUF_SIZE 32
1574 /* Make sure we have at least N more bytes of space in buffer. */
1575 #define GET_BUFFER_SPACE(n) \
1576 while (b - bufp->buffer + (n) > bufp->allocated) \
1579 /* Make sure we have one more byte of buffer space and then add C to it. */
1580 #define BUF_PUSH(c) \
1582 GET_BUFFER_SPACE (1); \
1583 *b++ = (unsigned char) (c); \
1587 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1588 #define BUF_PUSH_2(c1, c2) \
1590 GET_BUFFER_SPACE (2); \
1591 *b++ = (unsigned char) (c1); \
1592 *b++ = (unsigned char) (c2); \
1596 /* As with BUF_PUSH_2, except for three bytes. */
1597 #define BUF_PUSH_3(c1, c2, c3) \
1599 GET_BUFFER_SPACE (3); \
1600 *b++ = (unsigned char) (c1); \
1601 *b++ = (unsigned char) (c2); \
1602 *b++ = (unsigned char) (c3); \
1606 /* Store a jump with opcode OP at LOC to location TO. We store a
1607 relative address offset by the three bytes the jump itself occupies. */
1608 #define STORE_JUMP(op, loc, to) \
1609 store_op1 (op, loc, (to) - (loc) - 3)
1611 /* Likewise, for a two-argument jump. */
1612 #define STORE_JUMP2(op, loc, to, arg) \
1613 store_op2 (op, loc, (to) - (loc) - 3, arg)
1615 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1616 #define INSERT_JUMP(op, loc, to) \
1617 insert_op1 (op, loc, (to) - (loc) - 3, b)
1619 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1620 #define INSERT_JUMP2(op, loc, to, arg) \
1621 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1624 /* This is not an arbitrary limit: the arguments which represent offsets
1625 into the pattern are two bytes long. So if 2^16 bytes turns out to
1626 be too small, many things would have to change. */
1627 #define MAX_BUF_SIZE (1L << 16)
1630 /* Extend the buffer by twice its current size via realloc and
1631 reset the pointers that pointed into the old block to point to the
1632 correct places in the new one. If extending the buffer results in it
1633 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1634 #define EXTEND_BUFFER() \
1636 unsigned char *old_buffer = bufp->buffer; \
1637 if (bufp->allocated == MAX_BUF_SIZE) \
1639 bufp->allocated <<= 1; \
1640 if (bufp->allocated > MAX_BUF_SIZE) \
1641 bufp->allocated = MAX_BUF_SIZE; \
1642 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1643 if (bufp->buffer == NULL) \
1644 return REG_ESPACE; \
1645 /* If the buffer moved, move all the pointers into it. */ \
1646 if (old_buffer != bufp->buffer) \
1648 b = (b - old_buffer) + bufp->buffer; \
1649 begalt = (begalt - old_buffer) + bufp->buffer; \
1650 if (fixup_alt_jump) \
1651 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1653 laststart = (laststart - old_buffer) + bufp->buffer; \
1654 if (pending_exact) \
1655 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1660 /* Since we have one byte reserved for the register number argument to
1661 {start,stop}_memory, the maximum number of groups we can report
1662 things about is what fits in that byte. */
1663 #define MAX_REGNUM 255
1665 /* But patterns can have more than `MAX_REGNUM' registers. We just
1666 ignore the excess. */
1667 typedef unsigned regnum_t
;
1670 /* Macros for the compile stack. */
1672 /* Since offsets can go either forwards or backwards, this type needs to
1673 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1674 typedef int pattern_offset_t
;
1678 pattern_offset_t begalt_offset
;
1679 pattern_offset_t fixup_alt_jump
;
1680 pattern_offset_t laststart_offset
;
1682 } compile_stack_elt_t
;
1687 compile_stack_elt_t
*stack
;
1689 unsigned avail
; /* Offset of next open position. */
1690 } compile_stack_type
;
1693 #define INIT_COMPILE_STACK_SIZE 32
1695 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1696 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1698 /* The next available element. */
1699 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1702 /* Structure to manage work area for range table. */
1703 struct range_table_work_area
1705 int *table
; /* actual work area. */
1706 int allocated
; /* allocated size for work area in bytes. */
1707 int used
; /* actually used size in words. */
1708 int bits
; /* flag to record character classes */
1711 /* Make sure that WORK_AREA can hold more N multibyte characters. */
1712 #define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1714 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1716 (work_area).allocated += 16 * sizeof (int); \
1717 if ((work_area).table) \
1719 = (int *) realloc ((work_area).table, (work_area).allocated); \
1722 = (int *) malloc ((work_area).allocated); \
1723 if ((work_area).table == 0) \
1724 FREE_STACK_RETURN (REG_ESPACE); \
1728 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1729 (work_area).bits |= (bit)
1731 /* These bits represent the various character classes such as [:alnum:]
1732 in a charset's range table. */
1733 #define BIT_ALNUM 0x1
1734 #define BIT_ALPHA 0x2
1735 #define BIT_WORD 0x4
1736 #define BIT_ASCII 0x8
1737 #define BIT_NONASCII 0x10
1738 #define BIT_GRAPH 0x20
1739 #define BIT_LOWER 0x40
1740 #define BIT_PRINT 0x80
1741 #define BIT_PUNCT 0x100
1742 #define BIT_SPACE 0x200
1743 #define BIT_UPPER 0x400
1744 #define BIT_UNIBYTE 0x800
1745 #define BIT_MULTIBYTE 0x1000
1747 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1748 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1750 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1751 (work_area).table[(work_area).used++] = (range_start); \
1752 (work_area).table[(work_area).used++] = (range_end); \
1755 /* Free allocated memory for WORK_AREA. */
1756 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1758 if ((work_area).table) \
1759 free ((work_area).table); \
1762 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1763 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1764 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1765 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1768 /* Set the bit for character C in a list. */
1769 #define SET_LIST_BIT(c) \
1770 (b[((unsigned char) (c)) / BYTEWIDTH] \
1771 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1774 /* Get the next unsigned number in the uncompiled pattern. */
1775 #define GET_UNSIGNED_NUMBER(num) \
1776 do { if (p != pend) \
1779 while (ISDIGIT (c)) \
1783 num = num * 10 + c - '0'; \
1791 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1793 #define IS_CHAR_CLASS(string) \
1794 (STREQ (string, "alpha") || STREQ (string, "upper") \
1795 || STREQ (string, "lower") || STREQ (string, "digit") \
1796 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1797 || STREQ (string, "space") || STREQ (string, "print") \
1798 || STREQ (string, "punct") || STREQ (string, "graph") \
1799 || STREQ (string, "cntrl") || STREQ (string, "blank") \
1800 || STREQ (string, "word") \
1801 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1802 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
1804 /* QUIT is only used on NTemacs. */
1805 #if !defined (WINDOWSNT) || !defined (emacs)
1810 #ifndef MATCH_MAY_ALLOCATE
1812 /* If we cannot allocate large objects within re_match_2_internal,
1813 we make the fail stack and register vectors global.
1814 The fail stack, we grow to the maximum size when a regexp
1816 The register vectors, we adjust in size each time we
1817 compile a regexp, according to the number of registers it needs. */
1819 static fail_stack_type fail_stack
;
1821 /* Size with which the following vectors are currently allocated.
1822 That is so we can make them bigger as needed,
1823 but never make them smaller. */
1824 static int regs_allocated_size
;
1826 static re_char
** regstart
, ** regend
;
1827 static re_char
**best_regstart
, **best_regend
;
1829 /* Make the register vectors big enough for NUM_REGS registers,
1830 but don't make them smaller. */
1833 regex_grow_registers (num_regs
)
1836 if (num_regs
> regs_allocated_size
)
1838 RETALLOC_IF (regstart
, num_regs
, re_char
*);
1839 RETALLOC_IF (regend
, num_regs
, re_char
*);
1840 RETALLOC_IF (best_regstart
, num_regs
, re_char
*);
1841 RETALLOC_IF (best_regend
, num_regs
, re_char
*);
1843 regs_allocated_size
= num_regs
;
1847 #endif /* not MATCH_MAY_ALLOCATE */
1849 static boolean group_in_compile_stack
_RE_ARGS ((compile_stack_type
1853 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1854 Returns one of error codes defined in `regex.h', or zero for success.
1856 Assumes the `allocated' (and perhaps `buffer') and `translate'
1857 fields are set in BUFP on entry.
1859 If it succeeds, results are put in BUFP (if it returns an error, the
1860 contents of BUFP are undefined):
1861 `buffer' is the compiled pattern;
1862 `syntax' is set to SYNTAX;
1863 `used' is set to the length of the compiled pattern;
1864 `fastmap_accurate' is zero;
1865 `re_nsub' is the number of subexpressions in PATTERN;
1866 `not_bol' and `not_eol' are zero;
1868 The `fastmap' and `newline_anchor' fields are neither
1869 examined nor set. */
1871 /* Insert the `jump' from the end of last alternative to "here".
1872 The space for the jump has already been allocated. */
1873 #define FIXUP_ALT_JUMP() \
1875 if (fixup_alt_jump) \
1876 STORE_JUMP (jump, fixup_alt_jump, b); \
1880 /* Return, freeing storage we allocated. */
1881 #define FREE_STACK_RETURN(value) \
1883 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1884 free (compile_stack.stack); \
1888 static reg_errcode_t
1889 regex_compile (pattern
, size
, syntax
, bufp
)
1892 reg_syntax_t syntax
;
1893 struct re_pattern_buffer
*bufp
;
1895 /* We fetch characters from PATTERN here. Even though PATTERN is
1896 `char *' (i.e., signed), we declare these variables as unsigned, so
1897 they can be reliably used as array indices. */
1898 register unsigned int c
, c1
;
1900 /* A random temporary spot in PATTERN. */
1903 /* Points to the end of the buffer, where we should append. */
1904 register unsigned char *b
;
1906 /* Keeps track of unclosed groups. */
1907 compile_stack_type compile_stack
;
1909 /* Points to the current (ending) position in the pattern. */
1911 /* `const' makes AIX compiler fail. */
1912 unsigned char *p
= pattern
;
1914 re_char
*p
= pattern
;
1916 re_char
*pend
= pattern
+ size
;
1918 /* How to translate the characters in the pattern. */
1919 RE_TRANSLATE_TYPE translate
= bufp
->translate
;
1921 /* Address of the count-byte of the most recently inserted `exactn'
1922 command. This makes it possible to tell if a new exact-match
1923 character can be added to that command or if the character requires
1924 a new `exactn' command. */
1925 unsigned char *pending_exact
= 0;
1927 /* Address of start of the most recently finished expression.
1928 This tells, e.g., postfix * where to find the start of its
1929 operand. Reset at the beginning of groups and alternatives. */
1930 unsigned char *laststart
= 0;
1932 /* Address of beginning of regexp, or inside of last group. */
1933 unsigned char *begalt
;
1935 /* Place in the uncompiled pattern (i.e., the {) to
1936 which to go back if the interval is invalid. */
1937 re_char
*beg_interval
;
1939 /* Address of the place where a forward jump should go to the end of
1940 the containing expression. Each alternative of an `or' -- except the
1941 last -- ends with a forward jump of this sort. */
1942 unsigned char *fixup_alt_jump
= 0;
1944 /* Counts open-groups as they are encountered. Remembered for the
1945 matching close-group on the compile stack, so the same register
1946 number is put in the stop_memory as the start_memory. */
1947 regnum_t regnum
= 0;
1949 /* Work area for range table of charset. */
1950 struct range_table_work_area range_table_work
;
1954 DEBUG_PRINT1 ("\nCompiling pattern: ");
1957 unsigned debug_count
;
1959 for (debug_count
= 0; debug_count
< size
; debug_count
++)
1960 putchar (pattern
[debug_count
]);
1965 /* Initialize the compile stack. */
1966 compile_stack
.stack
= TALLOC (INIT_COMPILE_STACK_SIZE
, compile_stack_elt_t
);
1967 if (compile_stack
.stack
== NULL
)
1970 compile_stack
.size
= INIT_COMPILE_STACK_SIZE
;
1971 compile_stack
.avail
= 0;
1973 range_table_work
.table
= 0;
1974 range_table_work
.allocated
= 0;
1976 /* Initialize the pattern buffer. */
1977 bufp
->syntax
= syntax
;
1978 bufp
->fastmap_accurate
= 0;
1979 bufp
->not_bol
= bufp
->not_eol
= 0;
1981 /* Set `used' to zero, so that if we return an error, the pattern
1982 printer (for debugging) will think there's no pattern. We reset it
1986 /* Always count groups, whether or not bufp->no_sub is set. */
1990 /* bufp->multibyte is set before regex_compile is called, so don't alter
1992 #else /* not emacs */
1993 /* Nothing is recognized as a multibyte character. */
1994 bufp
->multibyte
= 0;
1997 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1998 /* Initialize the syntax table. */
1999 init_syntax_once ();
2002 if (bufp
->allocated
== 0)
2005 { /* If zero allocated, but buffer is non-null, try to realloc
2006 enough space. This loses if buffer's address is bogus, but
2007 that is the user's responsibility. */
2008 RETALLOC (bufp
->buffer
, INIT_BUF_SIZE
, unsigned char);
2011 { /* Caller did not allocate a buffer. Do it for them. */
2012 bufp
->buffer
= TALLOC (INIT_BUF_SIZE
, unsigned char);
2014 if (!bufp
->buffer
) FREE_STACK_RETURN (REG_ESPACE
);
2016 bufp
->allocated
= INIT_BUF_SIZE
;
2019 begalt
= b
= bufp
->buffer
;
2021 /* Loop through the uncompiled pattern until we're at the end. */
2030 if ( /* If at start of pattern, it's an operator. */
2032 /* If context independent, it's an operator. */
2033 || syntax
& RE_CONTEXT_INDEP_ANCHORS
2034 /* Otherwise, depends on what's come before. */
2035 || at_begline_loc_p (pattern
, p
, syntax
))
2045 if ( /* If at end of pattern, it's an operator. */
2047 /* If context independent, it's an operator. */
2048 || syntax
& RE_CONTEXT_INDEP_ANCHORS
2049 /* Otherwise, depends on what's next. */
2050 || at_endline_loc_p (p
, pend
, syntax
))
2060 if ((syntax
& RE_BK_PLUS_QM
)
2061 || (syntax
& RE_LIMITED_OPS
))
2065 /* If there is no previous pattern... */
2068 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2069 FREE_STACK_RETURN (REG_BADRPT
);
2070 else if (!(syntax
& RE_CONTEXT_INDEP_OPS
))
2075 /* Are we optimizing this jump? */
2076 boolean keep_string_p
= false;
2078 /* 1 means zero (many) matches is allowed. */
2079 boolean zero_times_ok
= 0, many_times_ok
= 0;
2082 /* If there is a sequence of repetition chars, collapse it
2083 down to just one (the right one). We can't combine
2084 interval operators with these because of, e.g., `a{2}*',
2085 which should only match an even number of `a's. */
2089 if (!(syntax
& RE_ALL_GREEDY
)
2090 && c
== '?' && (zero_times_ok
|| many_times_ok
))
2094 zero_times_ok
|= c
!= '+';
2095 many_times_ok
|= c
!= '?';
2104 || (!(syntax
& RE_BK_PLUS_QM
) && (c
== '+' || c
== '?')))
2107 else if (syntax
& RE_BK_PLUS_QM
&& c
== '\\')
2109 if (p
== pend
) FREE_STACK_RETURN (REG_EESCAPE
);
2112 if (!(c1
== '+' || c1
== '?'))
2127 /* If we get here, we found another repeat character. */
2130 /* Star, etc. applied to an empty pattern is equivalent
2131 to an empty pattern. */
2135 /* Now we know whether or not zero matches is allowed
2136 and also whether or not two or more matches is allowed. */
2140 { /* More than one repetition is allowed, so put in at the
2141 end a backward relative jump from `b' to before the next
2142 jump we're going to put in below (which jumps from
2143 laststart to after this jump).
2145 But if we are at the `*' in the exact sequence `.*\n',
2146 insert an unconditional jump backwards to the .,
2147 instead of the beginning of the loop. This way we only
2148 push a failure point once, instead of every time
2149 through the loop. */
2150 assert (p
- 1 > pattern
);
2152 /* Allocate the space for the jump. */
2153 GET_BUFFER_SPACE (3);
2155 /* We know we are not at the first character of the pattern,
2156 because laststart was nonzero. And we've already
2157 incremented `p', by the way, to be the character after
2158 the `*'. Do we have to do something analogous here
2159 for null bytes, because of RE_DOT_NOT_NULL? */
2160 if (TRANSLATE (*(p
- 2)) == TRANSLATE ('.')
2163 && TRANSLATE (*p
) == TRANSLATE ('\n')
2164 && !(syntax
& RE_DOT_NEWLINE
))
2165 { /* We have .*\n. */
2166 STORE_JUMP (jump
, b
, laststart
);
2167 keep_string_p
= true;
2170 STORE_JUMP (jump
, b
, laststart
- 3);
2172 /* We've added more stuff to the buffer. */
2176 /* On failure, jump from laststart to b + 3, which will be the
2177 end of the buffer after this jump is inserted. */
2178 GET_BUFFER_SPACE (3);
2181 assert (many_times_ok
);
2182 INSERT_JUMP (on_failure_jump_smart
, b
- 3, b
+ 3);
2188 INSERT_JUMP (keep_string_p
? on_failure_keep_string_jump
2190 on_failure_jump
: on_failure_jump_smart
,
2196 else /* not greedy */
2197 { /* I wish the greedy and non-greedy cases could be merged. */
2201 /* The non-greedy multiple match looks like a repeat..until:
2202 we only need a conditional jump at the end of the loop */
2203 GET_BUFFER_SPACE (3);
2204 STORE_JUMP (on_failure_jump
, b
, laststart
);
2208 /* The repeat...until naturally matches one or more.
2209 To also match zero times, we need to first jump to
2210 the end of the loop (its conditional jump). */
2211 GET_BUFFER_SPACE (3);
2212 INSERT_JUMP (jump
, laststart
, b
);
2218 /* non-greedy a?? */
2219 GET_BUFFER_SPACE (6);
2220 INSERT_JUMP (jump
, laststart
, b
+ 3);
2222 INSERT_JUMP (on_failure_jump
, laststart
, laststart
+ 6);
2238 CLEAR_RANGE_TABLE_WORK_USED (range_table_work
);
2240 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2242 /* Ensure that we have enough space to push a charset: the
2243 opcode, the length count, and the bitset; 34 bytes in all. */
2244 GET_BUFFER_SPACE (34);
2248 /* We test `*p == '^' twice, instead of using an if
2249 statement, so we only need one BUF_PUSH. */
2250 BUF_PUSH (*p
== '^' ? charset_not
: charset
);
2254 /* Remember the first position in the bracket expression. */
2257 /* Push the number of bytes in the bitmap. */
2258 BUF_PUSH ((1 << BYTEWIDTH
) / BYTEWIDTH
);
2260 /* Clear the whole map. */
2261 bzero (b
, (1 << BYTEWIDTH
) / BYTEWIDTH
);
2263 /* charset_not matches newline according to a syntax bit. */
2264 if ((re_opcode_t
) b
[-2] == charset_not
2265 && (syntax
& RE_HAT_LISTS_NOT_NEWLINE
))
2266 SET_LIST_BIT ('\n');
2268 /* Read in characters and ranges, setting map bits. */
2272 boolean escaped_char
= false;
2274 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2278 /* \ might escape characters inside [...] and [^...]. */
2279 if ((syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
) && c
== '\\')
2281 if (p
== pend
) FREE_STACK_RETURN (REG_EESCAPE
);
2284 escaped_char
= true;
2288 /* Could be the end of the bracket expression. If it's
2289 not (i.e., when the bracket expression is `[]' so
2290 far), the ']' character bit gets set way below. */
2291 if (c
== ']' && p
!= p1
+ 1)
2295 /* If C indicates start of multibyte char, get the
2296 actual character code in C, and set the pattern
2297 pointer P to the next character boundary. */
2298 if (bufp
->multibyte
&& BASE_LEADING_CODE_P (c
))
2301 c
= STRING_CHAR_AND_LENGTH (p
, pend
- p
, len
);
2304 /* What should we do for the character which is
2305 greater than 0x7F, but not BASE_LEADING_CODE_P?
2308 /* See if we're at the beginning of a possible character
2311 else if (!escaped_char
&&
2312 syntax
& RE_CHAR_CLASSES
&& c
== '[' && *p
== ':')
2314 /* Leave room for the null. */
2315 char str
[CHAR_CLASS_MAX_LENGTH
+ 1];
2320 /* If pattern is `[[:'. */
2321 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2326 if (c
== ':' || c
== ']' || p
== pend
2327 || c1
== CHAR_CLASS_MAX_LENGTH
)
2333 /* If isn't a word bracketed by `[:' and `:]':
2334 undo the ending character, the letters, and
2335 leave the leading `:' and `[' (but set bits for
2337 if (c
== ':' && *p
== ']')
2340 boolean is_alnum
= STREQ (str
, "alnum");
2341 boolean is_alpha
= STREQ (str
, "alpha");
2342 boolean is_ascii
= STREQ (str
, "ascii");
2343 boolean is_blank
= STREQ (str
, "blank");
2344 boolean is_cntrl
= STREQ (str
, "cntrl");
2345 boolean is_digit
= STREQ (str
, "digit");
2346 boolean is_graph
= STREQ (str
, "graph");
2347 boolean is_lower
= STREQ (str
, "lower");
2348 boolean is_multibyte
= STREQ (str
, "multibyte");
2349 boolean is_nonascii
= STREQ (str
, "nonascii");
2350 boolean is_print
= STREQ (str
, "print");
2351 boolean is_punct
= STREQ (str
, "punct");
2352 boolean is_space
= STREQ (str
, "space");
2353 boolean is_unibyte
= STREQ (str
, "unibyte");
2354 boolean is_upper
= STREQ (str
, "upper");
2355 boolean is_word
= STREQ (str
, "word");
2356 boolean is_xdigit
= STREQ (str
, "xdigit");
2358 if (!IS_CHAR_CLASS (str
))
2359 FREE_STACK_RETURN (REG_ECTYPE
);
2361 /* Throw away the ] at the end of the character
2365 if (p
== pend
) FREE_STACK_RETURN (REG_EBRACK
);
2367 /* Most character classes in a multibyte match
2368 just set a flag. Exceptions are is_blank,
2369 is_digit, is_cntrl, and is_xdigit, since
2370 they can only match ASCII characters. We
2371 don't need to handle them for multibyte. */
2373 if (bufp
->multibyte
)
2377 if (is_alnum
) bit
= BIT_ALNUM
;
2378 if (is_alpha
) bit
= BIT_ALPHA
;
2379 if (is_ascii
) bit
= BIT_ASCII
;
2380 if (is_graph
) bit
= BIT_GRAPH
;
2381 if (is_lower
) bit
= BIT_LOWER
;
2382 if (is_multibyte
) bit
= BIT_MULTIBYTE
;
2383 if (is_nonascii
) bit
= BIT_NONASCII
;
2384 if (is_print
) bit
= BIT_PRINT
;
2385 if (is_punct
) bit
= BIT_PUNCT
;
2386 if (is_space
) bit
= BIT_SPACE
;
2387 if (is_unibyte
) bit
= BIT_UNIBYTE
;
2388 if (is_upper
) bit
= BIT_UPPER
;
2389 if (is_word
) bit
= BIT_WORD
;
2391 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work
,
2395 /* Handle character classes for ASCII characters. */
2396 for (ch
= 0; ch
< 1 << BYTEWIDTH
; ch
++)
2398 int translated
= TRANSLATE (ch
);
2399 /* This was split into 3 if's to
2400 avoid an arbitrary limit in some compiler. */
2401 if ( (is_alnum
&& ISALNUM (ch
))
2402 || (is_alpha
&& ISALPHA (ch
))
2403 || (is_blank
&& ISBLANK (ch
))
2404 || (is_cntrl
&& ISCNTRL (ch
)))
2405 SET_LIST_BIT (translated
);
2406 if ( (is_digit
&& ISDIGIT (ch
))
2407 || (is_graph
&& ISGRAPH (ch
))
2408 || (is_lower
&& ISLOWER (ch
))
2409 || (is_print
&& ISPRINT (ch
)))
2410 SET_LIST_BIT (translated
);
2411 if ( (is_punct
&& ISPUNCT (ch
))
2412 || (is_space
&& ISSPACE (ch
))
2413 || (is_upper
&& ISUPPER (ch
))
2414 || (is_xdigit
&& ISXDIGIT (ch
)))
2415 SET_LIST_BIT (translated
);
2416 if ( (is_ascii
&& IS_REAL_ASCII (ch
))
2417 || (is_nonascii
&& !IS_REAL_ASCII (ch
))
2418 || (is_unibyte
&& ISUNIBYTE (ch
))
2419 || (is_multibyte
&& !ISUNIBYTE (ch
)))
2420 SET_LIST_BIT (translated
);
2422 if ( (is_word
&& ISWORD (ch
)))
2423 SET_LIST_BIT (translated
);
2426 /* Repeat the loop. */
2436 /* Because the `:' may starts the range, we
2437 can't simply set bit and repeat the loop.
2438 Instead, just set it to C and handle below. */
2443 if (p
< pend
&& p
[0] == '-' && p
[1] != ']')
2446 /* Discard the `-'. */
2449 /* Fetch the character which ends the range. */
2451 if (bufp
->multibyte
&& BASE_LEADING_CODE_P (c1
))
2454 c1
= STRING_CHAR_AND_LENGTH (p
, pend
- p
, len
);
2458 if (SINGLE_BYTE_CHAR_P (c
)
2459 && ! SINGLE_BYTE_CHAR_P (c1
))
2461 /* Handle a range such as \177-\377 in multibyte mode.
2462 Split that into two ranges,,
2463 the low one ending at 0237, and the high one
2464 starting at ...040. */
2465 /* Unless I'm missing something,
2466 this line is useless. -sm
2467 int c1_base = (c1 & ~0177) | 040; */
2468 SET_RANGE_TABLE_WORK_AREA (range_table_work
, c
, c1
);
2471 else if (!SAME_CHARSET_P (c
, c1
))
2472 FREE_STACK_RETURN (REG_ERANGE
);
2475 /* Range from C to C. */
2478 /* Set the range ... */
2479 if (SINGLE_BYTE_CHAR_P (c
))
2480 /* ... into bitmap. */
2483 int range_start
= c
, range_end
= c1
;
2485 /* If the start is after the end, the range is empty. */
2486 if (range_start
> range_end
)
2488 if (syntax
& RE_NO_EMPTY_RANGES
)
2489 FREE_STACK_RETURN (REG_ERANGE
);
2490 /* Else, repeat the loop. */
2494 for (this_char
= range_start
; this_char
<= range_end
;
2496 SET_LIST_BIT (TRANSLATE (this_char
));
2500 /* ... into range table. */
2501 SET_RANGE_TABLE_WORK_AREA (range_table_work
, c
, c1
);
2504 /* Discard any (non)matching list bytes that are all 0 at the
2505 end of the map. Decrease the map-length byte too. */
2506 while ((int) b
[-1] > 0 && b
[b
[-1] - 1] == 0)
2510 /* Build real range table from work area. */
2511 if (RANGE_TABLE_WORK_USED (range_table_work
)
2512 || RANGE_TABLE_WORK_BITS (range_table_work
))
2515 int used
= RANGE_TABLE_WORK_USED (range_table_work
);
2517 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2518 bytes for flags, two for COUNT, and three bytes for
2520 GET_BUFFER_SPACE (4 + used
* 3);
2522 /* Indicate the existence of range table. */
2523 laststart
[1] |= 0x80;
2525 /* Store the character class flag bits into the range table.
2526 If not in emacs, these flag bits are always 0. */
2527 *b
++ = RANGE_TABLE_WORK_BITS (range_table_work
) & 0xff;
2528 *b
++ = RANGE_TABLE_WORK_BITS (range_table_work
) >> 8;
2530 STORE_NUMBER_AND_INCR (b
, used
/ 2);
2531 for (i
= 0; i
< used
; i
++)
2532 STORE_CHARACTER_AND_INCR
2533 (b
, RANGE_TABLE_WORK_ELT (range_table_work
, i
));
2540 if (syntax
& RE_NO_BK_PARENS
)
2547 if (syntax
& RE_NO_BK_PARENS
)
2554 if (syntax
& RE_NEWLINE_ALT
)
2561 if (syntax
& RE_NO_BK_VBAR
)
2568 if (syntax
& RE_INTERVALS
&& syntax
& RE_NO_BK_BRACES
)
2569 goto handle_interval
;
2575 if (p
== pend
) FREE_STACK_RETURN (REG_EESCAPE
);
2577 /* Do not translate the character after the \, so that we can
2578 distinguish, e.g., \B from \b, even if we normally would
2579 translate, e.g., B to b. */
2585 if (syntax
& RE_NO_BK_PARENS
)
2586 goto normal_backslash
;
2593 /* Look for a special (?...) construct */
2595 if ((syntax
& RE_SHY_GROUPS
) && c
== '?')
2600 case ':': shy
= 1; break;
2602 /* Only (?:...) is supported right now. */
2603 FREE_STACK_RETURN (REG_BADPAT
);
2615 if (COMPILE_STACK_FULL
)
2617 RETALLOC (compile_stack
.stack
, compile_stack
.size
<< 1,
2618 compile_stack_elt_t
);
2619 if (compile_stack
.stack
== NULL
) return REG_ESPACE
;
2621 compile_stack
.size
<<= 1;
2624 /* These are the values to restore when we hit end of this
2625 group. They are all relative offsets, so that if the
2626 whole pattern moves because of realloc, they will still
2628 COMPILE_STACK_TOP
.begalt_offset
= begalt
- bufp
->buffer
;
2629 COMPILE_STACK_TOP
.fixup_alt_jump
2630 = fixup_alt_jump
? fixup_alt_jump
- bufp
->buffer
+ 1 : 0;
2631 COMPILE_STACK_TOP
.laststart_offset
= b
- bufp
->buffer
;
2632 COMPILE_STACK_TOP
.regnum
= shy
? -regnum
: regnum
;
2635 start_memory for groups beyond the last one we can
2636 represent in the compiled pattern. */
2637 if (regnum
<= MAX_REGNUM
&& !shy
)
2638 BUF_PUSH_2 (start_memory
, regnum
);
2640 compile_stack
.avail
++;
2645 /* If we've reached MAX_REGNUM groups, then this open
2646 won't actually generate any code, so we'll have to
2647 clear pending_exact explicitly. */
2653 if (syntax
& RE_NO_BK_PARENS
) goto normal_backslash
;
2655 if (COMPILE_STACK_EMPTY
)
2657 if (syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
2658 goto normal_backslash
;
2660 FREE_STACK_RETURN (REG_ERPAREN
);
2666 /* See similar code for backslashed left paren above. */
2667 if (COMPILE_STACK_EMPTY
)
2669 if (syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
2672 FREE_STACK_RETURN (REG_ERPAREN
);
2675 /* Since we just checked for an empty stack above, this
2676 ``can't happen''. */
2677 assert (compile_stack
.avail
!= 0);
2679 /* We don't just want to restore into `regnum', because
2680 later groups should continue to be numbered higher,
2681 as in `(ab)c(de)' -- the second group is #2. */
2682 regnum_t this_group_regnum
;
2684 compile_stack
.avail
--;
2685 begalt
= bufp
->buffer
+ COMPILE_STACK_TOP
.begalt_offset
;
2687 = COMPILE_STACK_TOP
.fixup_alt_jump
2688 ? bufp
->buffer
+ COMPILE_STACK_TOP
.fixup_alt_jump
- 1
2690 laststart
= bufp
->buffer
+ COMPILE_STACK_TOP
.laststart_offset
;
2691 this_group_regnum
= COMPILE_STACK_TOP
.regnum
;
2692 /* If we've reached MAX_REGNUM groups, then this open
2693 won't actually generate any code, so we'll have to
2694 clear pending_exact explicitly. */
2697 /* We're at the end of the group, so now we know how many
2698 groups were inside this one. */
2699 if (this_group_regnum
<= MAX_REGNUM
&& this_group_regnum
> 0)
2700 BUF_PUSH_2 (stop_memory
, this_group_regnum
);
2705 case '|': /* `\|'. */
2706 if (syntax
& RE_LIMITED_OPS
|| syntax
& RE_NO_BK_VBAR
)
2707 goto normal_backslash
;
2709 if (syntax
& RE_LIMITED_OPS
)
2712 /* Insert before the previous alternative a jump which
2713 jumps to this alternative if the former fails. */
2714 GET_BUFFER_SPACE (3);
2715 INSERT_JUMP (on_failure_jump
, begalt
, b
+ 6);
2719 /* The alternative before this one has a jump after it
2720 which gets executed if it gets matched. Adjust that
2721 jump so it will jump to this alternative's analogous
2722 jump (put in below, which in turn will jump to the next
2723 (if any) alternative's such jump, etc.). The last such
2724 jump jumps to the correct final destination. A picture:
2730 If we are at `b', then fixup_alt_jump right now points to a
2731 three-byte space after `a'. We'll put in the jump, set
2732 fixup_alt_jump to right after `b', and leave behind three
2733 bytes which we'll fill in when we get to after `c'. */
2737 /* Mark and leave space for a jump after this alternative,
2738 to be filled in later either by next alternative or
2739 when know we're at the end of a series of alternatives. */
2741 GET_BUFFER_SPACE (3);
2750 /* If \{ is a literal. */
2751 if (!(syntax
& RE_INTERVALS
)
2752 /* If we're at `\{' and it's not the open-interval
2754 || ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
2755 || (p
- 2 == pattern
&& p
== pend
))
2756 goto normal_backslash
;
2760 /* If got here, then the syntax allows intervals. */
2762 /* At least (most) this many matches must be made. */
2763 int lower_bound
= 0, upper_bound
= -1;
2765 beg_interval
= p
- 1;
2769 if (syntax
& RE_NO_BK_BRACES
)
2770 goto unfetch_interval
;
2772 FREE_STACK_RETURN (REG_EBRACE
);
2775 GET_UNSIGNED_NUMBER (lower_bound
);
2779 GET_UNSIGNED_NUMBER (upper_bound
);
2780 if (upper_bound
< 0) upper_bound
= RE_DUP_MAX
;
2783 /* Interval such as `{1}' => match exactly once. */
2784 upper_bound
= lower_bound
;
2786 if (lower_bound
< 0 || upper_bound
> RE_DUP_MAX
2787 || lower_bound
> upper_bound
)
2789 if (syntax
& RE_NO_BK_BRACES
)
2790 goto unfetch_interval
;
2792 FREE_STACK_RETURN (REG_BADBR
);
2795 if (!(syntax
& RE_NO_BK_BRACES
))
2797 if (c
!= '\\') FREE_STACK_RETURN (REG_EBRACE
);
2804 if (syntax
& RE_NO_BK_BRACES
)
2805 goto unfetch_interval
;
2807 FREE_STACK_RETURN (REG_BADBR
);
2810 /* We just parsed a valid interval. */
2812 /* If it's invalid to have no preceding re. */
2815 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2816 FREE_STACK_RETURN (REG_BADRPT
);
2817 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2820 goto unfetch_interval
;
2823 /* If the upper bound is zero, don't want to succeed at
2824 all; jump from `laststart' to `b + 3', which will be
2825 the end of the buffer after we insert the jump. */
2826 if (upper_bound
== 0)
2828 GET_BUFFER_SPACE (3);
2829 INSERT_JUMP (jump
, laststart
, b
+ 3);
2833 /* Otherwise, we have a nontrivial interval. When
2834 we're all done, the pattern will look like:
2835 set_number_at <jump count> <upper bound>
2836 set_number_at <succeed_n count> <lower bound>
2837 succeed_n <after jump addr> <succeed_n count>
2839 jump_n <succeed_n addr> <jump count>
2840 (The upper bound and `jump_n' are omitted if
2841 `upper_bound' is 1, though.) */
2843 { /* If the upper bound is > 1, we need to insert
2844 more at the end of the loop. */
2845 unsigned nbytes
= 10 + (upper_bound
> 1) * 10;
2847 GET_BUFFER_SPACE (nbytes
);
2849 /* Initialize lower bound of the `succeed_n', even
2850 though it will be set during matching by its
2851 attendant `set_number_at' (inserted next),
2852 because `re_compile_fastmap' needs to know.
2853 Jump to the `jump_n' we might insert below. */
2854 INSERT_JUMP2 (succeed_n
, laststart
,
2855 b
+ 5 + (upper_bound
> 1) * 5,
2859 /* Code to initialize the lower bound. Insert
2860 before the `succeed_n'. The `5' is the last two
2861 bytes of this `set_number_at', plus 3 bytes of
2862 the following `succeed_n'. */
2863 insert_op2 (set_number_at
, laststart
, 5, lower_bound
, b
);
2866 if (upper_bound
> 1)
2867 { /* More than one repetition is allowed, so
2868 append a backward jump to the `succeed_n'
2869 that starts this interval.
2871 When we've reached this during matching,
2872 we'll have matched the interval once, so
2873 jump back only `upper_bound - 1' times. */
2874 STORE_JUMP2 (jump_n
, b
, laststart
+ 5,
2878 /* The location we want to set is the second
2879 parameter of the `jump_n'; that is `b-2' as
2880 an absolute address. `laststart' will be
2881 the `set_number_at' we're about to insert;
2882 `laststart+3' the number to set, the source
2883 for the relative address. But we are
2884 inserting into the middle of the pattern --
2885 so everything is getting moved up by 5.
2886 Conclusion: (b - 2) - (laststart + 3) + 5,
2887 i.e., b - laststart.
2889 We insert this at the beginning of the loop
2890 so that if we fail during matching, we'll
2891 reinitialize the bounds. */
2892 insert_op2 (set_number_at
, laststart
, b
- laststart
,
2893 upper_bound
- 1, b
);
2898 beg_interval
= NULL
;
2903 /* If an invalid interval, match the characters as literals. */
2904 assert (beg_interval
);
2906 beg_interval
= NULL
;
2908 /* normal_char and normal_backslash need `c'. */
2911 if (!(syntax
& RE_NO_BK_BRACES
))
2913 if (p
> pattern
&& p
[-1] == '\\')
2914 goto normal_backslash
;
2919 /* There is no way to specify the before_dot and after_dot
2920 operators. rms says this is ok. --karl */
2928 BUF_PUSH_2 (syntaxspec
, syntax_spec_code
[c
]);
2934 BUF_PUSH_2 (notsyntaxspec
, syntax_spec_code
[c
]);
2940 BUF_PUSH_2 (categoryspec
, c
);
2946 BUF_PUSH_2 (notcategoryspec
, c
);
2953 BUF_PUSH (wordchar
);
2959 BUF_PUSH (notwordchar
);
2972 BUF_PUSH (wordbound
);
2976 BUF_PUSH (notwordbound
);
2987 case '1': case '2': case '3': case '4': case '5':
2988 case '6': case '7': case '8': case '9':
2989 if (syntax
& RE_NO_BK_REFS
)
2995 FREE_STACK_RETURN (REG_ESUBREG
);
2997 /* Can't back reference to a subexpression if inside of it. */
2998 if (group_in_compile_stack (compile_stack
, c1
))
3002 BUF_PUSH_2 (duplicate
, c1
);
3008 if (syntax
& RE_BK_PLUS_QM
)
3011 goto normal_backslash
;
3015 /* You might think it would be useful for \ to mean
3016 not to translate; but if we don't translate it
3017 it will never match anything. */
3025 /* Expects the character in `c'. */
3027 p1
= p
- 1; /* P1 points the head of C. */
3029 if (bufp
->multibyte
)
3031 c
= STRING_CHAR (p1
, pend
- p1
);
3033 /* Set P to the next character boundary. */
3034 p
+= MULTIBYTE_FORM_LENGTH (p1
, pend
- p1
) - 1;
3037 /* If no exactn currently being built. */
3040 /* If last exactn not at current position. */
3041 || pending_exact
+ *pending_exact
+ 1 != b
3043 /* We have only one byte following the exactn for the count. */
3044 || *pending_exact
>= (1 << BYTEWIDTH
) - (p
- p1
)
3046 /* If followed by a repetition operator. */
3047 || (p
!= pend
&& (*p
== '*' || *p
== '^'))
3048 || ((syntax
& RE_BK_PLUS_QM
)
3049 ? p
+ 1 < pend
&& *p
== '\\' && (p
[1] == '+' || p
[1] == '?')
3050 : p
!= pend
&& (*p
== '+' || *p
== '?'))
3051 || ((syntax
& RE_INTERVALS
)
3052 && ((syntax
& RE_NO_BK_BRACES
)
3053 ? p
!= pend
&& *p
== '{'
3054 : p
+ 1 < pend
&& p
[0] == '\\' && p
[1] == '{')))
3056 /* Start building a new exactn. */
3060 BUF_PUSH_2 (exactn
, 0);
3061 pending_exact
= b
- 1;
3065 if (! SINGLE_BYTE_CHAR_P (c
))
3067 unsigned char str
[MAX_MULTIBYTE_LENGTH
];
3068 int i
= CHAR_STRING (c
, str
);
3070 for (j
= 0; j
< i
; j
++)
3084 } /* while p != pend */
3087 /* Through the pattern now. */
3091 if (!COMPILE_STACK_EMPTY
)
3092 FREE_STACK_RETURN (REG_EPAREN
);
3094 /* If we don't want backtracking, force success
3095 the first time we reach the end of the compiled pattern. */
3096 if (syntax
& RE_NO_POSIX_BACKTRACKING
)
3099 free (compile_stack
.stack
);
3101 /* We have succeeded; set the length of the buffer. */
3102 bufp
->used
= b
- bufp
->buffer
;
3107 re_compile_fastmap (bufp
);
3108 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3109 print_compiled_pattern (bufp
);
3114 #ifndef MATCH_MAY_ALLOCATE
3115 /* Initialize the failure stack to the largest possible stack. This
3116 isn't necessary unless we're trying to avoid calling alloca in
3117 the search and match routines. */
3119 int num_regs
= bufp
->re_nsub
+ 1;
3121 if (fail_stack
.size
< re_max_failures
* TYPICAL_FAILURE_SIZE
)
3123 fail_stack
.size
= re_max_failures
* TYPICAL_FAILURE_SIZE
;
3125 if (! fail_stack
.stack
)
3127 = (fail_stack_elt_t
*) malloc (fail_stack
.size
3128 * sizeof (fail_stack_elt_t
));
3131 = (fail_stack_elt_t
*) realloc (fail_stack
.stack
,
3133 * sizeof (fail_stack_elt_t
)));
3136 regex_grow_registers (num_regs
);
3138 #endif /* not MATCH_MAY_ALLOCATE */
3141 } /* regex_compile */
3143 /* Subroutines for `regex_compile'. */
3145 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3148 store_op1 (op
, loc
, arg
)
3153 *loc
= (unsigned char) op
;
3154 STORE_NUMBER (loc
+ 1, arg
);
3158 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3161 store_op2 (op
, loc
, arg1
, arg2
)
3166 *loc
= (unsigned char) op
;
3167 STORE_NUMBER (loc
+ 1, arg1
);
3168 STORE_NUMBER (loc
+ 3, arg2
);
3172 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3173 for OP followed by two-byte integer parameter ARG. */
3176 insert_op1 (op
, loc
, arg
, end
)
3182 register unsigned char *pfrom
= end
;
3183 register unsigned char *pto
= end
+ 3;
3185 while (pfrom
!= loc
)
3188 store_op1 (op
, loc
, arg
);
3192 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3195 insert_op2 (op
, loc
, arg1
, arg2
, end
)
3201 register unsigned char *pfrom
= end
;
3202 register unsigned char *pto
= end
+ 5;
3204 while (pfrom
!= loc
)
3207 store_op2 (op
, loc
, arg1
, arg2
);
3211 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3212 after an alternative or a begin-subexpression. We assume there is at
3213 least one character before the ^. */
3216 at_begline_loc_p (pattern
, p
, syntax
)
3217 const unsigned char *pattern
, *p
;
3218 reg_syntax_t syntax
;
3220 re_char
*prev
= p
- 2;
3221 boolean prev_prev_backslash
= prev
> pattern
&& prev
[-1] == '\\';
3224 /* After a subexpression? */
3225 (*prev
== '(' && (syntax
& RE_NO_BK_PARENS
|| prev_prev_backslash
))
3226 /* After an alternative? */
3227 || (*prev
== '|' && (syntax
& RE_NO_BK_VBAR
|| prev_prev_backslash
));
3231 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3232 at least one character after the $, i.e., `P < PEND'. */
3235 at_endline_loc_p (p
, pend
, syntax
)
3236 const unsigned char *p
, *pend
;
3237 reg_syntax_t syntax
;
3240 boolean next_backslash
= *next
== '\\';
3241 re_char
*next_next
= p
+ 1 < pend
? p
+ 1 : 0;
3244 /* Before a subexpression? */
3245 (syntax
& RE_NO_BK_PARENS
? *next
== ')'
3246 : next_backslash
&& next_next
&& *next_next
== ')')
3247 /* Before an alternative? */
3248 || (syntax
& RE_NO_BK_VBAR
? *next
== '|'
3249 : next_backslash
&& next_next
&& *next_next
== '|');
3253 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3254 false if it's not. */
3257 group_in_compile_stack (compile_stack
, regnum
)
3258 compile_stack_type compile_stack
;
3263 for (this_element
= compile_stack
.avail
- 1;
3266 if (compile_stack
.stack
[this_element
].regnum
== regnum
)
3272 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3273 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3274 characters can start a string that matches the pattern. This fastmap
3275 is used by re_search to skip quickly over impossible starting points.
3277 Character codes above (1 << BYTEWIDTH) are not represented in the
3278 fastmap, but the leading codes are represented. Thus, the fastmap
3279 indicates which character sets could start a match.
3281 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3282 area as BUFP->fastmap.
3284 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3287 Returns 0 if we succeed, -2 if an internal error. */
3290 re_compile_fastmap (bufp
)
3291 struct re_pattern_buffer
*bufp
;
3294 #ifdef MATCH_MAY_ALLOCATE
3295 fail_stack_type fail_stack
;
3297 #ifndef REGEX_MALLOC
3301 register char *fastmap
= bufp
->fastmap
;
3302 unsigned char *pattern
= bufp
->buffer
;
3303 unsigned long size
= bufp
->used
;
3304 unsigned char *p
= pattern
;
3305 register unsigned char *pend
= pattern
+ size
;
3307 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
3308 /* This holds the pointer to the failure stack, when
3309 it is allocated relocatably. */
3310 fail_stack_elt_t
*failure_stack_ptr
;
3313 /* Assume that each path through the pattern can be null until
3314 proven otherwise. We set this false at the bottom of switch
3315 statement, to which we get only if a particular path doesn't
3316 match the empty string. */
3317 boolean path_can_be_null
= true;
3319 /* We aren't doing a `succeed_n' to begin with. */
3320 boolean succeed_n_p
= false;
3322 /* If all elements for base leading-codes in fastmap is set, this
3323 flag is set true. */
3324 boolean match_any_multibyte_characters
= false;
3326 /* Maximum code of simple (single byte) character. */
3327 int simple_char_max
;
3329 assert (fastmap
!= NULL
&& p
!= NULL
);
3332 bzero (fastmap
, 1 << BYTEWIDTH
); /* Assume nothing's valid. */
3333 bufp
->fastmap_accurate
= 1; /* It will be when we're done. */
3334 bufp
->can_be_null
= 0;
3336 /* The loop below works as follows:
3337 - It has a working-list kept in the PATTERN_STACK and which basically
3338 starts by only containing a pointer to the first operation.
3339 - If the opcode we're looking at is a match against some set of
3340 chars, then we add those chars to the fastmap and go on to the
3341 next work element from the worklist (done via `break').
3342 - If the opcode is a control operator on the other hand, we either
3343 ignore it (if it's meaningless at this point, such as `start_memory')
3344 or execute it (if it's a jump). If the jump has several destinations
3345 (i.e. `on_failure_jump'), then we push the other destination onto the
3347 We guarantee termination by ignoring backward jumps (more or less),
3348 so that `p' is monotonically increasing. More to the point, we
3349 never set `p' (or push) anything `<= p1'. */
3351 /* If can_be_null is set, then the fastmap will not be used anyway. */
3352 while (!bufp
->can_be_null
)
3354 /* `p1' is used as a marker of how far back a `on_failure_jump'
3355 can go without being ignored. It is normally equal to `p'
3356 (which prevents any backward `on_failure_jump') except right
3357 after a plain `jump', to allow patterns such as:
3360 10: on_failure_jump 3
3361 as used for the *? operator. */
3362 unsigned char *p1
= p
;
3364 if (p
== pend
|| *p
== succeed
)
3366 /* We have reached the (effective) end of pattern. */
3367 if (!PATTERN_STACK_EMPTY ())
3369 bufp
->can_be_null
|= path_can_be_null
;
3371 /* Reset for next path. */
3372 path_can_be_null
= true;
3374 p
= (unsigned char*) POP_PATTERN_OP ();
3382 /* We should never be about to go beyond the end of the pattern. */
3385 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
++))
3389 /* If the first character has to match a backreference, that means
3390 that the group was empty (since it already matched). Since this
3391 is the only case that interests us here, we can assume that the
3392 backreference must match the empty string. */
3397 /* Following are the cases which match a character. These end
3408 int length
= (*p
& 0x7f);;
3411 for (j
= length
* BYTEWIDTH
- 1; j
>= 0; j
--)
3412 if (p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
)))
3418 /* Chars beyond end of map must be allowed. */
3420 int length
= (*p
& 0x7f);;
3423 for (j
= length
* BYTEWIDTH
; j
< (1 << BYTEWIDTH
); j
++)
3426 for (j
= length
* BYTEWIDTH
- 1; j
>= 0; j
--)
3427 if (!(p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
))))
3433 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
3434 if (SYNTAX (j
) == Sword
)
3440 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
3441 if (SYNTAX (j
) != Sword
)
3446 for (j
= CHARSET_BITMAP_SIZE (&p
[-1]) * BYTEWIDTH
- 1, p
++;
3448 if (p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
)))
3451 /* If we can match a character class, we can match
3452 any character set. */
3453 if (CHARSET_RANGE_TABLE_EXISTS_P (&p
[-2])
3454 && CHARSET_RANGE_TABLE_BITS (&p
[-2]) != 0)
3455 goto set_fastmap_for_multibyte_characters
;
3457 if (CHARSET_RANGE_TABLE_EXISTS_P (&p
[-2])
3458 && match_any_multibyte_characters
== false)
3460 /* Set fastmap[I] 1 where I is a base leading code of each
3461 multibyte character in the range table. */
3464 /* Make P points the range table. */
3465 p
+= CHARSET_BITMAP_SIZE (&p
[-2]);
3467 /* Extract the number of ranges in range table into COUNT. */
3468 EXTRACT_NUMBER_AND_INCR (count
, p
);
3469 for (; count
> 0; count
--, p
+= 2 * 3) /* XXX */
3471 /* Extract the start of each range. */
3472 EXTRACT_CHARACTER (c
, p
);
3473 j
= CHAR_CHARSET (c
);
3474 fastmap
[CHARSET_LEADING_CODE_BASE (j
)] = 1;
3481 /* Chars beyond end of bitmap are possible matches.
3482 All the single-byte codes can occur in multibyte buffers.
3483 So any that are not listed in the charset
3484 are possible matches, even in multibyte buffers. */
3485 simple_char_max
= (1 << BYTEWIDTH
);
3486 for (j
= CHARSET_BITMAP_SIZE (&p
[-1]) * BYTEWIDTH
;
3487 j
< simple_char_max
; j
++)
3490 for (j
= CHARSET_BITMAP_SIZE (&p
[-1]) * BYTEWIDTH
- 1, p
++;
3492 if (!(p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
))))
3495 if (bufp
->multibyte
)
3496 /* Any character set can possibly contain a character
3497 which doesn't match the specified set of characters. */
3499 set_fastmap_for_multibyte_characters
:
3500 if (match_any_multibyte_characters
== false)
3502 for (j
= 0x80; j
< 0xA0; j
++) /* XXX */
3503 if (BASE_LEADING_CODE_P (j
))
3505 match_any_multibyte_characters
= true;
3512 /* All the single-byte codes can occur in multibyte buffers,
3513 and they may have word syntax. So do consider them. */
3514 simple_char_max
= (1 << BYTEWIDTH
);
3515 for (j
= 0; j
< simple_char_max
; j
++)
3516 if (SYNTAX (j
) == Sword
)
3519 if (bufp
->multibyte
)
3520 /* Any character set can possibly contain a character
3521 whose syntax is `Sword'. */
3522 goto set_fastmap_for_multibyte_characters
;
3527 /* All the single-byte codes can occur in multibyte buffers,
3528 and they may not have word syntax. So do consider them. */
3529 simple_char_max
= (1 << BYTEWIDTH
);
3530 for (j
= 0; j
< simple_char_max
; j
++)
3531 if (SYNTAX (j
) != Sword
)
3534 if (bufp
->multibyte
)
3535 /* Any character set can possibly contain a character
3536 whose syntax is not `Sword'. */
3537 goto set_fastmap_for_multibyte_characters
;
3543 int fastmap_newline
= fastmap
['\n'];
3545 /* `.' matches anything, except perhaps newline.
3546 Even in a multibyte buffer, it should match any
3547 conceivable byte value for the fastmap. */
3548 if (bufp
->multibyte
)
3549 match_any_multibyte_characters
= true;
3551 simple_char_max
= (1 << BYTEWIDTH
);
3552 for (j
= 0; j
< simple_char_max
; j
++)
3555 /* ... except perhaps newline. */
3556 if (!(bufp
->syntax
& RE_DOT_NEWLINE
))
3557 fastmap
['\n'] = fastmap_newline
;
3559 /* Otherwise, have to check alternative paths. */
3570 /* This match depends on text properties. These end with
3571 aborting optimizations. */
3572 bufp
->can_be_null
= 1;
3576 simple_char_max
= bufp
->multibyte
? 0x80 : (1 << BYTEWIDTH
);
3577 for (j
= 0; j
< simple_char_max
; j
++)
3578 if (SYNTAX (j
) == (enum syntaxcode
) k
)
3581 if (bufp
->multibyte
)
3582 /* Any character set can possibly contain a character
3583 whose syntax is K. */
3584 goto set_fastmap_for_multibyte_characters
;
3589 simple_char_max
= bufp
->multibyte
? 0x80 : (1 << BYTEWIDTH
);
3590 for (j
= 0; j
< simple_char_max
; j
++)
3591 if (SYNTAX (j
) != (enum syntaxcode
) k
)
3594 if (bufp
->multibyte
)
3595 /* Any character set can possibly contain a character
3596 whose syntax is not K. */
3597 goto set_fastmap_for_multibyte_characters
;
3604 simple_char_max
= (1 << BYTEWIDTH
);
3605 for (j
= 0; j
< simple_char_max
; j
++)
3606 if (CHAR_HAS_CATEGORY (j
, k
))
3609 if (bufp
->multibyte
)
3610 /* Any character set can possibly contain a character
3611 whose category is K. */
3612 goto set_fastmap_for_multibyte_characters
;
3616 case notcategoryspec
:
3618 simple_char_max
= (1 << BYTEWIDTH
);
3619 for (j
= 0; j
< simple_char_max
; j
++)
3620 if (!CHAR_HAS_CATEGORY (j
, k
))
3623 if (bufp
->multibyte
)
3624 /* Any character set can possibly contain a character
3625 whose category is not K. */
3626 goto set_fastmap_for_multibyte_characters
;
3629 /* All cases after this match the empty string. These end with
3656 EXTRACT_NUMBER_AND_INCR (j
, p
);
3658 /* Backward jumps can only go back to code that we've already
3659 visited. `re_compile' should make sure this is true. */
3662 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
))
3664 case on_failure_jump
:
3665 case on_failure_keep_string_jump
:
3666 case on_failure_jump_exclusive
:
3667 case on_failure_jump_loop
:
3668 case on_failure_jump_smart
:
3674 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3675 to jump back to "just after here". */
3678 case on_failure_jump
:
3679 case on_failure_keep_string_jump
:
3680 case on_failure_jump_exclusive
:
3681 case on_failure_jump_loop
:
3682 case on_failure_jump_smart
:
3683 handle_on_failure_jump
:
3684 EXTRACT_NUMBER_AND_INCR (j
, p
);
3686 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3687 end of the pattern. We don't want to push such a point,
3688 since when we restore it above, entering the switch will
3689 increment `p' past the end of the pattern. We don't need
3690 to push such a point since we obviously won't find any more
3691 fastmap entries beyond `pend'. Such a pattern can match
3692 the null string, though. */
3694 /* Backward jump to be ignored. */
3696 else if (p
+ j
< pend
)
3698 if (!PUSH_PATTERN_OP (p
+ j
, fail_stack
))
3700 RESET_FAIL_STACK ();
3705 bufp
->can_be_null
= 1;
3709 EXTRACT_NUMBER_AND_INCR (k
, p
); /* Skip the n. */
3710 succeed_n_p
= false;
3717 /* Get to the number of times to succeed. */
3720 /* Increment p past the n for when k != 0. */
3721 EXTRACT_NUMBER_AND_INCR (k
, p
);
3725 succeed_n_p
= true; /* Spaghetti code alert. */
3726 goto handle_on_failure_jump
;
3743 abort (); /* We have listed all the cases. */
3746 /* Getting here means we have found the possible starting
3747 characters for one path of the pattern -- and that the empty
3748 string does not match. We need not follow this path further.
3749 Instead, look at the next alternative (remembered on the
3750 stack), or quit if no more. The test at the top of the loop
3751 does these things. */
3752 path_can_be_null
= false;
3756 /* Set `can_be_null' for the last path (also the first path, if the
3757 pattern is empty). */
3758 bufp
->can_be_null
|= path_can_be_null
;
3759 RESET_FAIL_STACK ();
3761 } /* re_compile_fastmap */
3763 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3764 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3765 this memory for recording register information. STARTS and ENDS
3766 must be allocated using the malloc library routine, and must each
3767 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3769 If NUM_REGS == 0, then subsequent matches should allocate their own
3772 Unless this function is called, the first search or match using
3773 PATTERN_BUFFER will allocate its own register data, without
3774 freeing the old data. */
3777 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
3778 struct re_pattern_buffer
*bufp
;
3779 struct re_registers
*regs
;
3781 regoff_t
*starts
, *ends
;
3785 bufp
->regs_allocated
= REGS_REALLOCATE
;
3786 regs
->num_regs
= num_regs
;
3787 regs
->start
= starts
;
3792 bufp
->regs_allocated
= REGS_UNALLOCATED
;
3794 regs
->start
= regs
->end
= (regoff_t
*) 0;
3798 /* Searching routines. */
3800 /* Like re_search_2, below, but only one string is specified, and
3801 doesn't let you say where to stop matching. */
3804 re_search (bufp
, string
, size
, startpos
, range
, regs
)
3805 struct re_pattern_buffer
*bufp
;
3807 int size
, startpos
, range
;
3808 struct re_registers
*regs
;
3810 return re_search_2 (bufp
, NULL
, 0, string
, size
, startpos
, range
,
3814 /* End address of virtual concatenation of string. */
3815 #define STOP_ADDR_VSTRING(P) \
3816 (((P) >= size1 ? string2 + size2 : string1 + size1))
3818 /* Address of POS in the concatenation of virtual string. */
3819 #define POS_ADDR_VSTRING(POS) \
3820 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3822 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3823 virtual concatenation of STRING1 and STRING2, starting first at index
3824 STARTPOS, then at STARTPOS + 1, and so on.
3826 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3828 RANGE is how far to scan while trying to match. RANGE = 0 means try
3829 only at STARTPOS; in general, the last start tried is STARTPOS +
3832 In REGS, return the indices of the virtual concatenation of STRING1
3833 and STRING2 that matched the entire BUFP->buffer and its contained
3836 Do not consider matching one past the index STOP in the virtual
3837 concatenation of STRING1 and STRING2.
3839 We return either the position in the strings at which the match was
3840 found, -1 if no match, or -2 if error (such as failure
3844 re_search_2 (bufp
, str1
, size1
, str2
, size2
, startpos
, range
, regs
, stop
)
3845 struct re_pattern_buffer
*bufp
;
3846 const char *str1
, *str2
;
3850 struct re_registers
*regs
;
3854 re_char
*string1
= (re_char
*) str1
;
3855 re_char
*string2
= (re_char
*) str2
;
3856 register char *fastmap
= bufp
->fastmap
;
3857 register RE_TRANSLATE_TYPE translate
= bufp
->translate
;
3858 int total_size
= size1
+ size2
;
3859 int endpos
= startpos
+ range
;
3860 int anchored_start
= 0;
3862 /* Nonzero if we have to concern multibyte character. */
3863 int multibyte
= bufp
->multibyte
;
3865 /* Check for out-of-range STARTPOS. */
3866 if (startpos
< 0 || startpos
> total_size
)
3869 /* Fix up RANGE if it might eventually take us outside
3870 the virtual concatenation of STRING1 and STRING2.
3871 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3873 range
= 0 - startpos
;
3874 else if (endpos
> total_size
)
3875 range
= total_size
- startpos
;
3877 /* If the search isn't to be a backwards one, don't waste time in a
3878 search for a pattern anchored at beginning of buffer. */
3879 if (bufp
->used
> 0 && (re_opcode_t
) bufp
->buffer
[0] == begbuf
&& range
> 0)
3888 /* In a forward search for something that starts with \=.
3889 don't keep searching past point. */
3890 if (bufp
->used
> 0 && (re_opcode_t
) bufp
->buffer
[0] == at_dot
&& range
> 0)
3892 range
= PT_BYTE
- BEGV_BYTE
- startpos
;
3898 /* Update the fastmap now if not correct already. */
3899 if (fastmap
&& !bufp
->fastmap_accurate
)
3900 if (re_compile_fastmap (bufp
) == -2)
3903 /* See whether the pattern is anchored. */
3904 if (bufp
->buffer
[0] == begline
)
3908 gl_state
.object
= re_match_object
;
3910 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos
));
3912 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object
, charpos
, 1);
3916 /* Loop through the string, looking for a place to start matching. */
3919 /* If the pattern is anchored,
3920 skip quickly past places we cannot match.
3921 We don't bother to treat startpos == 0 specially
3922 because that case doesn't repeat. */
3923 if (anchored_start
&& startpos
> 0)
3925 if (! (bufp
->newline_anchor
3926 && ((startpos
<= size1
? string1
[startpos
- 1]
3927 : string2
[startpos
- size1
- 1])
3932 /* If a fastmap is supplied, skip quickly over characters that
3933 cannot be the start of a match. If the pattern can match the
3934 null string, however, we don't need to skip characters; we want
3935 the first null string. */
3936 if (fastmap
&& startpos
< total_size
&& !bufp
->can_be_null
)
3938 register re_char
*d
;
3939 register unsigned int buf_ch
;
3941 d
= POS_ADDR_VSTRING (startpos
);
3943 if (range
> 0) /* Searching forwards. */
3945 register int lim
= 0;
3948 if (startpos
< size1
&& startpos
+ range
>= size1
)
3949 lim
= range
- (size1
- startpos
);
3951 /* Written out as an if-else to avoid testing `translate'
3953 if (RE_TRANSLATE_P (translate
))
3960 buf_ch
= STRING_CHAR_AND_LENGTH (d
, range
- lim
,
3963 buf_ch
= RE_TRANSLATE (translate
, buf_ch
);
3968 range
-= buf_charlen
;
3973 && !fastmap
[RE_TRANSLATE (translate
, *d
)])
3980 while (range
> lim
&& !fastmap
[*d
])
3986 startpos
+= irange
- range
;
3988 else /* Searching backwards. */
3990 int room
= (startpos
>= size1
3991 ? size2
+ size1
- startpos
3992 : size1
- startpos
);
3994 buf_ch
= STRING_CHAR (d
, room
);
3995 if (RE_TRANSLATE_P (translate
))
3996 buf_ch
= RE_TRANSLATE (translate
, buf_ch
);
3998 if (! (buf_ch
>= 0400
3999 || fastmap
[buf_ch
]))
4004 /* If can't match the null string, and that's all we have left, fail. */
4005 if (range
>= 0 && startpos
== total_size
&& fastmap
4006 && !bufp
->can_be_null
)
4009 val
= re_match_2_internal (bufp
, string1
, size1
, string2
, size2
,
4010 startpos
, regs
, stop
);
4011 #ifndef REGEX_MALLOC
4028 /* Update STARTPOS to the next character boundary. */
4031 re_char
*p
= POS_ADDR_VSTRING (startpos
);
4032 re_char
*pend
= STOP_ADDR_VSTRING (startpos
);
4033 int len
= MULTIBYTE_FORM_LENGTH (p
, pend
- p
);
4051 /* Update STARTPOS to the previous character boundary. */
4054 re_char
*p
= POS_ADDR_VSTRING (startpos
);
4057 /* Find the head of multibyte form. */
4058 while (!CHAR_HEAD_P (*p
))
4063 if (MULTIBYTE_FORM_LENGTH (p
, len
+ 1) != (len
+ 1))
4080 /* Declarations and macros for re_match_2. */
4082 static int bcmp_translate ();
4084 /* This converts PTR, a pointer into one of the search strings `string1'
4085 and `string2' into an offset from the beginning of that string. */
4086 #define POINTER_TO_OFFSET(ptr) \
4087 (FIRST_STRING_P (ptr) \
4088 ? ((regoff_t) ((ptr) - string1)) \
4089 : ((regoff_t) ((ptr) - string2 + size1)))
4091 /* Call before fetching a character with *d. This switches over to
4092 string2 if necessary. */
4093 #define PREFETCH() \
4096 /* End of string2 => fail. */ \
4097 if (dend == end_match_2) \
4099 /* End of string1 => advance to string2. */ \
4101 dend = end_match_2; \
4105 /* Test if at very beginning or at very end of the virtual concatenation
4106 of `string1' and `string2'. If only one string, it's `string2'. */
4107 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4108 #define AT_STRINGS_END(d) ((d) == end2)
4111 /* Test if D points to a character which is word-constituent. We have
4112 two special cases to check for: if past the end of string1, look at
4113 the first character in string2; and if before the beginning of
4114 string2, look at the last character in string1. */
4115 #define WORDCHAR_P(d) \
4116 (SYNTAX ((d) == end1 ? *string2 \
4117 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4120 /* Disabled due to a compiler bug -- see comment at case wordbound */
4122 /* The comment at case wordbound is following one, but we don't use
4123 AT_WORD_BOUNDARY anymore to support multibyte form.
4125 The DEC Alpha C compiler 3.x generates incorrect code for the
4126 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4127 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4128 macro and introducing temporary variables works around the bug. */
4131 /* Test if the character before D and the one at D differ with respect
4132 to being word-constituent. */
4133 #define AT_WORD_BOUNDARY(d) \
4134 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4135 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4138 /* Free everything we malloc. */
4139 #ifdef MATCH_MAY_ALLOCATE
4140 #define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4141 #define FREE_VARIABLES() \
4143 REGEX_FREE_STACK (fail_stack.stack); \
4144 FREE_VAR (regstart); \
4145 FREE_VAR (regend); \
4146 FREE_VAR (best_regstart); \
4147 FREE_VAR (best_regend); \
4150 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4151 #endif /* not MATCH_MAY_ALLOCATE */
4154 /* Optimization routines. */
4156 /* Jump over non-matching operations. */
4157 static unsigned char *
4158 skip_noops (p
, pend
, memory
)
4159 unsigned char *p
, *pend
;
4165 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
))
4176 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
4187 /* Non-zero if "p1 matches something" implies "p2 fails". */
4189 mutually_exclusive_p (bufp
, p1
, p2
)
4190 struct re_pattern_buffer
*bufp
;
4191 unsigned char *p1
, *p2
;
4193 int multibyte
= bufp
->multibyte
;
4194 unsigned char *pend
= bufp
->buffer
+ bufp
->used
;
4196 assert (p1
>= bufp
->buffer
&& p1
<= pend
4197 && p2
>= bufp
->buffer
&& p2
<= pend
);
4199 /* Skip over open/close-group commands.
4200 If what follows this loop is a ...+ construct,
4201 look at what begins its body, since we will have to
4202 match at least one of that. */
4203 p2
= skip_noops (p2
, pend
, 1);
4204 /* The same skip can be done for p1, except that skipping over
4205 start_memory is not a good idea (if there's a group inside
4206 the loop delimited by on_failure_jump_exclusive, then it
4207 can't optimize the push away (it still works properly, but
4208 slightly slower rather than faster)). */
4209 p1
= skip_noops (p1
, pend
, 0);
4211 /* If we're at the end of the pattern, we can change. */
4214 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p1
))
4220 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4227 else if ((re_opcode_t
) *p2
== exactn
4228 || (bufp
->newline_anchor
&& (re_opcode_t
) *p2
== endline
))
4230 register unsigned int c
4231 = *p2
== (unsigned char) endline
? '\n' : p2
[2];
4233 if ((re_opcode_t
) *p1
== exactn
)
4235 if (!(multibyte
/* && (c != '\n') */
4236 && BASE_LEADING_CODE_P (c
))
4238 : (STRING_CHAR (&p2
[2], pend
- &p2
[2])
4239 != STRING_CHAR (&p1
[2], pend
- &p1
[2])))
4241 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c
, p1
[2]);
4246 else if ((re_opcode_t
) *p1
== charset
4247 || (re_opcode_t
) *p1
== charset_not
)
4249 int not = (re_opcode_t
) *p1
== charset_not
;
4251 if (multibyte
/* && (c != '\n') */
4252 && BASE_LEADING_CODE_P (c
))
4253 c
= STRING_CHAR (&p2
[2], pend
- &p2
[2]);
4255 /* Test if C is listed in charset (or charset_not)
4257 if (SINGLE_BYTE_CHAR_P (c
))
4259 if (c
< CHARSET_BITMAP_SIZE (p1
) * BYTEWIDTH
4260 && p1
[2 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
4263 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1
))
4264 CHARSET_LOOKUP_RANGE_TABLE (not, c
, p1
);
4266 /* `not' is equal to 1 if c would match, which means
4267 that we can't change to pop_failure_jump. */
4270 DEBUG_PRINT1 (" No match => fast loop.\n");
4274 else if ((re_opcode_t
) *p1
== anychar
4277 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4281 else if ((re_opcode_t
) *p2
== charset
4282 || (re_opcode_t
) *p2
== charset_not
)
4284 if ((re_opcode_t
) *p1
== exactn
)
4285 /* Reuse the code above. */
4286 return mutually_exclusive_p (bufp
, p2
, p1
);
4289 /* It is hard to list up all the character in charset
4290 P2 if it includes multibyte character. Give up in
4292 else if (!multibyte
|| !CHARSET_RANGE_TABLE_EXISTS_P (p2
))
4294 /* Now, we are sure that P2 has no range table.
4295 So, for the size of bitmap in P2, `p2[1]' is
4296 enough. But P1 may have range table, so the
4297 size of bitmap table of P1 is extracted by
4298 using macro `CHARSET_BITMAP_SIZE'.
4300 Since we know that all the character listed in
4301 P2 is ASCII, it is enough to test only bitmap
4307 /* We win if the charset inside the loop
4308 has no overlap with the one after the loop. */
4311 && idx
< CHARSET_BITMAP_SIZE (p1
));
4313 if ((p2
[2 + idx
] & p1
[2 + idx
]) != 0)
4317 || idx
== CHARSET_BITMAP_SIZE (p1
))
4319 DEBUG_PRINT1 (" No match => fast loop.\n");
4323 else if ((re_opcode_t
) *p1
== charset
4324 || (re_opcode_t
) *p1
== charset_not
)
4327 /* We win if the charset_not inside the loop lists
4328 every character listed in the charset after. */
4329 for (idx
= 0; idx
< (int) p2
[1]; idx
++)
4330 if (! (p2
[2 + idx
] == 0
4331 || (idx
< CHARSET_BITMAP_SIZE (p1
)
4332 && ((p2
[2 + idx
] & ~ p1
[2 + idx
]) == 0))))
4337 DEBUG_PRINT1 (" No match => fast loop.\n");
4349 /* Matching routines. */
4351 #ifndef emacs /* Emacs never uses this. */
4352 /* re_match is like re_match_2 except it takes only a single string. */
4355 re_match (bufp
, string
, size
, pos
, regs
)
4356 struct re_pattern_buffer
*bufp
;
4359 struct re_registers
*regs
;
4361 int result
= re_match_2_internal (bufp
, NULL
, 0, string
, size
,
4366 #endif /* not emacs */
4369 /* In Emacs, this is the string or buffer in which we
4370 are matching. It is used for looking up syntax properties. */
4371 Lisp_Object re_match_object
;
4374 /* re_match_2 matches the compiled pattern in BUFP against the
4375 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4376 and SIZE2, respectively). We start matching at POS, and stop
4379 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4380 store offsets for the substring each group matched in REGS. See the
4381 documentation for exactly how many groups we fill.
4383 We return -1 if no match, -2 if an internal error (such as the
4384 failure stack overflowing). Otherwise, we return the length of the
4385 matched substring. */
4388 re_match_2 (bufp
, string1
, size1
, string2
, size2
, pos
, regs
, stop
)
4389 struct re_pattern_buffer
*bufp
;
4390 const char *string1
, *string2
;
4393 struct re_registers
*regs
;
4400 gl_state
.object
= re_match_object
;
4401 charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos
));
4402 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object
, charpos
, 1);
4405 result
= re_match_2_internal (bufp
, string1
, size1
, string2
, size2
,
4411 /* This is a separate function so that we can force an alloca cleanup
4414 re_match_2_internal (bufp
, string1
, size1
, string2
, size2
, pos
, regs
, stop
)
4415 struct re_pattern_buffer
*bufp
;
4416 re_char
*string1
, *string2
;
4419 struct re_registers
*regs
;
4422 /* General temporaries. */
4427 /* Just past the end of the corresponding string. */
4428 re_char
*end1
, *end2
;
4430 /* Pointers into string1 and string2, just past the last characters in
4431 each to consider matching. */
4432 re_char
*end_match_1
, *end_match_2
;
4434 /* Where we are in the data, and the end of the current string. */
4437 /* Used sometimes to remember where we were before starting matching
4438 an operator so that we can go back in case of failure. This "atomic"
4439 behavior of matching opcodes is indispensable to the correctness
4440 of the on_failure_keep_string_jump optimization. */
4443 /* Where we are in the pattern, and the end of the pattern. */
4444 unsigned char *p
= bufp
->buffer
;
4445 register unsigned char *pend
= p
+ bufp
->used
;
4447 /* We use this to map every character in the string. */
4448 RE_TRANSLATE_TYPE translate
= bufp
->translate
;
4450 /* Nonzero if we have to concern multibyte character. */
4451 int multibyte
= bufp
->multibyte
;
4453 /* Failure point stack. Each place that can handle a failure further
4454 down the line pushes a failure point on this stack. It consists of
4455 regstart, and regend for all registers corresponding to
4456 the subexpressions we're currently inside, plus the number of such
4457 registers, and, finally, two char *'s. The first char * is where
4458 to resume scanning the pattern; the second one is where to resume
4459 scanning the strings. */
4460 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4461 fail_stack_type fail_stack
;
4464 static unsigned failure_id
= 0;
4465 unsigned nfailure_points_pushed
= 0, nfailure_points_popped
= 0;
4468 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
4469 /* This holds the pointer to the failure stack, when
4470 it is allocated relocatably. */
4471 fail_stack_elt_t
*failure_stack_ptr
;
4474 /* We fill all the registers internally, independent of what we
4475 return, for use in backreferences. The number here includes
4476 an element for register zero. */
4477 unsigned num_regs
= bufp
->re_nsub
+ 1;
4479 /* Information on the contents of registers. These are pointers into
4480 the input strings; they record just what was matched (on this
4481 attempt) by a subexpression part of the pattern, that is, the
4482 regnum-th regstart pointer points to where in the pattern we began
4483 matching and the regnum-th regend points to right after where we
4484 stopped matching the regnum-th subexpression. (The zeroth register
4485 keeps track of what the whole pattern matches.) */
4486 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4487 re_char
**regstart
, **regend
;
4490 /* The following record the register info as found in the above
4491 variables when we find a match better than any we've seen before.
4492 This happens as we backtrack through the failure points, which in
4493 turn happens only if we have not yet matched the entire string. */
4494 unsigned best_regs_set
= false;
4495 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4496 re_char
**best_regstart
, **best_regend
;
4499 /* Logically, this is `best_regend[0]'. But we don't want to have to
4500 allocate space for that if we're not allocating space for anything
4501 else (see below). Also, we never need info about register 0 for
4502 any of the other register vectors, and it seems rather a kludge to
4503 treat `best_regend' differently than the rest. So we keep track of
4504 the end of the best match so far in a separate variable. We
4505 initialize this to NULL so that when we backtrack the first time
4506 and need to test it, it's not garbage. */
4507 re_char
*match_end
= NULL
;
4510 /* Counts the total number of registers pushed. */
4511 unsigned num_regs_pushed
= 0;
4514 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4518 #ifdef MATCH_MAY_ALLOCATE
4519 /* Do not bother to initialize all the register variables if there are
4520 no groups in the pattern, as it takes a fair amount of time. If
4521 there are groups, we include space for register 0 (the whole
4522 pattern), even though we never use it, since it simplifies the
4523 array indexing. We should fix this. */
4526 regstart
= REGEX_TALLOC (num_regs
, re_char
*);
4527 regend
= REGEX_TALLOC (num_regs
, re_char
*);
4528 best_regstart
= REGEX_TALLOC (num_regs
, re_char
*);
4529 best_regend
= REGEX_TALLOC (num_regs
, re_char
*);
4531 if (!(regstart
&& regend
&& best_regstart
&& best_regend
))
4539 /* We must initialize all our variables to NULL, so that
4540 `FREE_VARIABLES' doesn't try to free them. */
4541 regstart
= regend
= best_regstart
= best_regend
= NULL
;
4543 #endif /* MATCH_MAY_ALLOCATE */
4545 /* The starting position is bogus. */
4546 if (pos
< 0 || pos
> size1
+ size2
)
4552 /* Initialize subexpression text positions to -1 to mark ones that no
4553 start_memory/stop_memory has been seen for. Also initialize the
4554 register information struct. */
4555 for (mcnt
= 1; mcnt
< num_regs
; mcnt
++)
4556 regstart
[mcnt
] = regend
[mcnt
] = REG_UNSET_VALUE
;
4558 /* Shorten strings to `stop'. */
4564 else if (stop
<= size1
+ size2
)
4565 size2
= stop
- size1
;
4567 /* We move `string1' into `string2' if the latter's empty -- but not if
4568 `string1' is null. */
4569 if (size2
== 0 && string1
!= NULL
)
4576 end1
= string1
+ size1
;
4577 end2
= string2
+ size2
;
4579 /* Compute where to stop matching, within the two strings. */
4583 /* `p' scans through the pattern as `d' scans through the data.
4584 `dend' is the end of the input string that `d' points within. `d'
4585 is advanced into the following input string whenever necessary, but
4586 this happens before fetching; therefore, at the beginning of the
4587 loop, `d' can be pointing at the end of a string, but it cannot
4589 if (size1
> 0 && pos
<= size1
)
4596 d
= string2
+ pos
- size1
;
4600 DEBUG_PRINT1 ("The compiled pattern is: ");
4601 DEBUG_PRINT_COMPILED_PATTERN (bufp
, p
, pend
);
4602 DEBUG_PRINT1 ("The string to match is: `");
4603 DEBUG_PRINT_DOUBLE_STRING (d
, string1
, size1
, string2
, size2
);
4604 DEBUG_PRINT1 ("'\n");
4606 /* This loops over pattern commands. It exits by returning from the
4607 function if the match is complete, or it drops through if the match
4608 fails at this starting point in the input data. */
4611 DEBUG_PRINT2 ("\n%p: ", p
);
4614 { /* End of pattern means we might have succeeded. */
4615 DEBUG_PRINT1 ("end of pattern ... ");
4617 /* If we haven't matched the entire string, and we want the
4618 longest match, try backtracking. */
4619 if (d
!= end_match_2
)
4621 /* 1 if this match ends in the same string (string1 or string2)
4622 as the best previous match. */
4623 boolean same_str_p
= (FIRST_STRING_P (match_end
)
4624 == FIRST_STRING_P (d
));
4625 /* 1 if this match is the best seen so far. */
4626 boolean best_match_p
;
4628 /* AIX compiler got confused when this was combined
4629 with the previous declaration. */
4631 best_match_p
= d
> match_end
;
4633 best_match_p
= !FIRST_STRING_P (d
);
4635 DEBUG_PRINT1 ("backtracking.\n");
4637 if (!FAIL_STACK_EMPTY ())
4638 { /* More failure points to try. */
4640 /* If exceeds best match so far, save it. */
4641 if (!best_regs_set
|| best_match_p
)
4643 best_regs_set
= true;
4646 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4648 for (mcnt
= 1; mcnt
< num_regs
; mcnt
++)
4650 best_regstart
[mcnt
] = regstart
[mcnt
];
4651 best_regend
[mcnt
] = regend
[mcnt
];
4657 /* If no failure points, don't restore garbage. And if
4658 last match is real best match, don't restore second
4660 else if (best_regs_set
&& !best_match_p
)
4663 /* Restore best match. It may happen that `dend ==
4664 end_match_1' while the restored d is in string2.
4665 For example, the pattern `x.*y.*z' against the
4666 strings `x-' and `y-z-', if the two strings are
4667 not consecutive in memory. */
4668 DEBUG_PRINT1 ("Restoring best registers.\n");
4671 dend
= ((d
>= string1
&& d
<= end1
)
4672 ? end_match_1
: end_match_2
);
4674 for (mcnt
= 1; mcnt
< num_regs
; mcnt
++)
4676 regstart
[mcnt
] = best_regstart
[mcnt
];
4677 regend
[mcnt
] = best_regend
[mcnt
];
4680 } /* d != end_match_2 */
4683 DEBUG_PRINT1 ("Accepting match.\n");
4685 /* If caller wants register contents data back, do it. */
4686 if (regs
&& !bufp
->no_sub
)
4688 /* Have the register data arrays been allocated? */
4689 if (bufp
->regs_allocated
== REGS_UNALLOCATED
)
4690 { /* No. So allocate them with malloc. We need one
4691 extra element beyond `num_regs' for the `-1' marker
4693 regs
->num_regs
= MAX (RE_NREGS
, num_regs
+ 1);
4694 regs
->start
= TALLOC (regs
->num_regs
, regoff_t
);
4695 regs
->end
= TALLOC (regs
->num_regs
, regoff_t
);
4696 if (regs
->start
== NULL
|| regs
->end
== NULL
)
4701 bufp
->regs_allocated
= REGS_REALLOCATE
;
4703 else if (bufp
->regs_allocated
== REGS_REALLOCATE
)
4704 { /* Yes. If we need more elements than were already
4705 allocated, reallocate them. If we need fewer, just
4707 if (regs
->num_regs
< num_regs
+ 1)
4709 regs
->num_regs
= num_regs
+ 1;
4710 RETALLOC (regs
->start
, regs
->num_regs
, regoff_t
);
4711 RETALLOC (regs
->end
, regs
->num_regs
, regoff_t
);
4712 if (regs
->start
== NULL
|| regs
->end
== NULL
)
4721 /* These braces fend off a "empty body in an else-statement"
4722 warning under GCC when assert expands to nothing. */
4723 assert (bufp
->regs_allocated
== REGS_FIXED
);
4726 /* Convert the pointer data in `regstart' and `regend' to
4727 indices. Register zero has to be set differently,
4728 since we haven't kept track of any info for it. */
4729 if (regs
->num_regs
> 0)
4731 regs
->start
[0] = pos
;
4732 regs
->end
[0] = POINTER_TO_OFFSET (d
);
4735 /* Go through the first `min (num_regs, regs->num_regs)'
4736 registers, since that is all we initialized. */
4737 for (mcnt
= 1; mcnt
< MIN (num_regs
, regs
->num_regs
); mcnt
++)
4739 if (REG_UNSET (regstart
[mcnt
]) || REG_UNSET (regend
[mcnt
]))
4740 regs
->start
[mcnt
] = regs
->end
[mcnt
] = -1;
4744 = (regoff_t
) POINTER_TO_OFFSET (regstart
[mcnt
]);
4746 = (regoff_t
) POINTER_TO_OFFSET (regend
[mcnt
]);
4750 /* If the regs structure we return has more elements than
4751 were in the pattern, set the extra elements to -1. If
4752 we (re)allocated the registers, this is the case,
4753 because we always allocate enough to have at least one
4755 for (mcnt
= num_regs
; mcnt
< regs
->num_regs
; mcnt
++)
4756 regs
->start
[mcnt
] = regs
->end
[mcnt
] = -1;
4757 } /* regs && !bufp->no_sub */
4759 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4760 nfailure_points_pushed
, nfailure_points_popped
,
4761 nfailure_points_pushed
- nfailure_points_popped
);
4762 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed
);
4764 mcnt
= POINTER_TO_OFFSET (d
) - pos
;
4766 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt
);
4772 /* Otherwise match next pattern command. */
4773 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *p
++))
4775 /* Ignore these. Used to ignore the n of succeed_n's which
4776 currently have n == 0. */
4778 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4782 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4785 /* Match the next n pattern characters exactly. The following
4786 byte in the pattern defines n, and the n bytes after that
4787 are the characters to match. */
4790 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt
);
4792 /* Remember the start point to rollback upon failure. */
4795 /* This is written out as an if-else so we don't waste time
4796 testing `translate' inside the loop. */
4797 if (RE_TRANSLATE_P (translate
))
4803 int pat_charlen
, buf_charlen
;
4804 unsigned int pat_ch
, buf_ch
;
4807 pat_ch
= STRING_CHAR_AND_LENGTH (p
, pend
- p
, pat_charlen
);
4808 buf_ch
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, buf_charlen
);
4810 if (RE_TRANSLATE (translate
, buf_ch
)
4819 mcnt
-= pat_charlen
;
4823 #endif /* not emacs */
4827 if (RE_TRANSLATE (translate
, *d
) != *p
++)
4852 /* Match any character except possibly a newline or a null. */
4856 unsigned int buf_ch
;
4858 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4864 buf_ch
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, buf_charlen
);
4866 #endif /* not emacs */
4872 buf_ch
= TRANSLATE (buf_ch
);
4874 if ((!(bufp
->syntax
& RE_DOT_NEWLINE
)
4876 || ((bufp
->syntax
& RE_DOT_NOT_NULL
)
4877 && buf_ch
== '\000'))
4880 DEBUG_PRINT2 (" Matched `%d'.\n", *d
);
4889 register unsigned int c
;
4890 boolean
not = (re_opcode_t
) *(p
- 1) == charset_not
;
4893 /* Start of actual range_table, or end of bitmap if there is no
4895 unsigned char *range_table
;
4897 /* Nonzero if there is a range table. */
4898 int range_table_exists
;
4900 /* Number of ranges of range table. This is not included
4901 in the initial byte-length of the command. */
4904 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4909 range_table_exists
= CHARSET_RANGE_TABLE_EXISTS_P (&p
[-1]);
4912 if (range_table_exists
)
4914 range_table
= CHARSET_RANGE_TABLE (&p
[-1]); /* Past the bitmap. */
4915 EXTRACT_NUMBER_AND_INCR (count
, range_table
);
4918 if (multibyte
&& BASE_LEADING_CODE_P (c
))
4919 c
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
4922 if (SINGLE_BYTE_CHAR_P (c
))
4923 { /* Lookup bitmap. */
4924 c
= TRANSLATE (c
); /* The character to match. */
4927 /* Cast to `unsigned' instead of `unsigned char' in
4928 case the bit list is a full 32 bytes long. */
4929 if (c
< (unsigned) (CHARSET_BITMAP_SIZE (&p
[-1]) * BYTEWIDTH
)
4930 && p
[1 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
4934 else if (range_table_exists
)
4936 int class_bits
= CHARSET_RANGE_TABLE_BITS (&p
[-1]);
4938 if ( (class_bits
& BIT_ALNUM
&& ISALNUM (c
))
4939 | (class_bits
& BIT_ALPHA
&& ISALPHA (c
))
4940 | (class_bits
& BIT_ASCII
&& IS_REAL_ASCII (c
))
4941 | (class_bits
& BIT_GRAPH
&& ISGRAPH (c
))
4942 | (class_bits
& BIT_LOWER
&& ISLOWER (c
))
4943 | (class_bits
& BIT_MULTIBYTE
&& !ISUNIBYTE (c
))
4944 | (class_bits
& BIT_NONASCII
&& !IS_REAL_ASCII (c
))
4945 | (class_bits
& BIT_PRINT
&& ISPRINT (c
))
4946 | (class_bits
& BIT_PUNCT
&& ISPUNCT (c
))
4947 | (class_bits
& BIT_SPACE
&& ISSPACE (c
))
4948 | (class_bits
& BIT_UNIBYTE
&& ISUNIBYTE (c
))
4949 | (class_bits
& BIT_UPPER
&& ISUPPER (c
))
4950 | (class_bits
& BIT_WORD
&& ISWORD (c
)))
4953 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c
, range_table
, count
);
4957 if (range_table_exists
)
4958 p
= CHARSET_RANGE_TABLE_END (range_table
, count
);
4960 p
+= CHARSET_BITMAP_SIZE (&p
[-1]) + 1;
4962 if (!not) goto fail
;
4969 /* The beginning of a group is represented by start_memory.
4970 The argument is the register number. The text
4971 matched within the group is recorded (in the internal
4972 registers data structure) under the register number. */
4974 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p
);
4976 /* In case we need to undo this operation (via backtracking). */
4977 PUSH_FAILURE_REG ((unsigned int)*p
);
4980 regend
[*p
] = REG_UNSET_VALUE
; /* probably unnecessary. -sm */
4981 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart
[*p
]));
4983 /* Move past the register number and inner group count. */
4988 /* The stop_memory opcode represents the end of a group. Its
4989 argument is the same as start_memory's: the register number. */
4991 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p
);
4993 assert (!REG_UNSET (regstart
[*p
]));
4994 /* Strictly speaking, there should be code such as:
4996 assert (REG_UNSET (regend[*p]));
4997 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
4999 But the only info to be pushed is regend[*p] and it is known to
5000 be UNSET, so there really isn't anything to push.
5001 Not pushing anything, on the other hand deprives us from the
5002 guarantee that regend[*p] is UNSET since undoing this operation
5003 will not reset its value properly. This is not important since
5004 the value will only be read on the next start_memory or at
5005 the very end and both events can only happen if this stop_memory
5009 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend
[*p
]));
5011 /* Move past the register number and the inner group count. */
5016 /* \<digit> has been turned into a `duplicate' command which is
5017 followed by the numeric value of <digit> as the register number. */
5020 register re_char
*d2
, *dend2
;
5021 int regno
= *p
++; /* Get which register to match against. */
5022 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno
);
5024 /* Can't back reference a group which we've never matched. */
5025 if (REG_UNSET (regstart
[regno
]) || REG_UNSET (regend
[regno
]))
5028 /* Where in input to try to start matching. */
5029 d2
= regstart
[regno
];
5031 /* Remember the start point to rollback upon failure. */
5034 /* Where to stop matching; if both the place to start and
5035 the place to stop matching are in the same string, then
5036 set to the place to stop, otherwise, for now have to use
5037 the end of the first string. */
5039 dend2
= ((FIRST_STRING_P (regstart
[regno
])
5040 == FIRST_STRING_P (regend
[regno
]))
5041 ? regend
[regno
] : end_match_1
);
5044 /* If necessary, advance to next segment in register
5048 if (dend2
== end_match_2
) break;
5049 if (dend2
== regend
[regno
]) break;
5051 /* End of string1 => advance to string2. */
5053 dend2
= regend
[regno
];
5055 /* At end of register contents => success */
5056 if (d2
== dend2
) break;
5058 /* If necessary, advance to next segment in data. */
5061 /* How many characters left in this segment to match. */
5064 /* Want how many consecutive characters we can match in
5065 one shot, so, if necessary, adjust the count. */
5066 if (mcnt
> dend2
- d2
)
5069 /* Compare that many; failure if mismatch, else move
5071 if (RE_TRANSLATE_P (translate
)
5072 ? bcmp_translate (d
, d2
, mcnt
, translate
)
5073 : bcmp (d
, d2
, mcnt
))
5078 d
+= mcnt
, d2
+= mcnt
;
5084 /* begline matches the empty string at the beginning of the string
5085 (unless `not_bol' is set in `bufp'), and, if
5086 `newline_anchor' is set, after newlines. */
5088 DEBUG_PRINT1 ("EXECUTING begline.\n");
5090 if (AT_STRINGS_BEG (d
))
5092 if (!bufp
->not_bol
) break;
5094 else if (d
[-1] == '\n' && bufp
->newline_anchor
)
5098 /* In all other cases, we fail. */
5102 /* endline is the dual of begline. */
5104 DEBUG_PRINT1 ("EXECUTING endline.\n");
5106 if (AT_STRINGS_END (d
))
5108 if (!bufp
->not_eol
) break;
5111 /* We have to ``prefetch'' the next character. */
5112 else if ((d
== end1
? *string2
: *d
) == '\n'
5113 && bufp
->newline_anchor
)
5120 /* Match at the very beginning of the data. */
5122 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5123 if (AT_STRINGS_BEG (d
))
5128 /* Match at the very end of the data. */
5130 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5131 if (AT_STRINGS_END (d
))
5136 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5137 pushes NULL as the value for the string on the stack. Then
5138 `POP_FAILURE_POINT' will keep the current value for the
5139 string, instead of restoring it. To see why, consider
5140 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5141 then the . fails against the \n. But the next thing we want
5142 to do is match the \n against the \n; if we restored the
5143 string value, we would be back at the foo.
5145 Because this is used only in specific cases, we don't need to
5146 check all the things that `on_failure_jump' does, to make
5147 sure the right things get saved on the stack. Hence we don't
5148 share its code. The only reason to push anything on the
5149 stack at all is that otherwise we would have to change
5150 `anychar's code to do something besides goto fail in this
5151 case; that seems worse than this. */
5152 case on_failure_keep_string_jump
:
5153 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5154 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5157 PUSH_FAILURE_POINT (p
- 3, NULL
);
5160 case on_failure_jump_exclusive
:
5161 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5162 DEBUG_PRINT3 ("EXECUTING on_failure_jump_exclusive %d (to %p):\n",
5165 if (! FAIL_STACK_EMPTY ()
5166 && FAILURE_PAT (TOP_FAILURE_HANDLE ()) == (p
- 3)
5167 && fail_stack
.avail
== fail_stack
.frame
)
5169 /* We are trying to push failure F2 onto the stack but there
5170 is already a failure F1 pushed from the same instruction.
5171 Between F1 and now, something has matched (else this is an
5172 improper use of on_failure_jump_exclusive), so that we know
5173 that the fail-destination of F1 cannot match, hence we can
5174 pop F1 before pushing F2. Instead of doing this pop/push,
5175 we manually turn F1 into F2.
5176 `fail_stack.avail == fail_stack.frame' makes sure
5177 that popping F1 doesn't involve registers, else
5178 this optimization cannot be done so trivially. */
5179 assert (FAILURE_STR (TOP_FAILURE_HANDLE ()) != d
);
5180 FAILURE_STR (TOP_FAILURE_HANDLE ()) = d
;
5183 PUSH_FAILURE_POINT (p
- 3, d
);
5186 case on_failure_jump_loop
:
5188 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5189 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5192 CHECK_INFINITE_LOOP (p
- 3, d
);
5193 PUSH_FAILURE_POINT (p
- 3, d
);
5197 /* Uses of on_failure_jump:
5199 Each alternative starts with an on_failure_jump that points
5200 to the beginning of the next alternative. Each alternative
5201 except the last ends with a jump that in effect jumps past
5202 the rest of the alternatives. (They really jump to the
5203 ending jump of the following alternative, because tensioning
5204 these jumps is a hassle.)
5206 Repeats start with an on_failure_jump that points past both
5207 the repetition text and either the following jump or
5208 pop_failure_jump back to this on_failure_jump. */
5209 case on_failure_jump
:
5211 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5212 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5215 PUSH_FAILURE_POINT (p
-3, d
);
5218 /* This operation is used for greedy * and +.
5219 Compare the beginning of the repeat with what in the
5220 pattern follows its end. If we can establish that there
5221 is nothing that they would both match, i.e., that we
5222 would have to backtrack because of (as in, e.g., `a*a')
5223 then we can use a non-backtracking loop based on
5224 on_failure_jump_exclusive instead of on_failure_jump_loop. */
5225 case on_failure_jump_smart
:
5227 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5228 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5231 unsigned char *p1
= p
; /* Next operation. */
5232 unsigned char *p2
= p
+ mcnt
; /* Destination of the jump. */
5234 p
-= 3; /* Reset so that we will re-execute the
5235 instruction once it's been changed. */
5237 DEBUG_STATEMENT (debug
+= 2);
5238 if (mutually_exclusive_p (bufp
, p1
, p2
))
5240 /* Use a fast `on_failure_keep_string_jump' loop. */
5241 *p
= (unsigned char) on_failure_jump_exclusive
;
5242 /* STORE_NUMBER (p2 - 2, mcnt + 3); */
5246 /* Default to a safe `on_failure_jump' loop. */
5247 DEBUG_PRINT1 (" smart default => slow loop.\n");
5248 *p
= (unsigned char) on_failure_jump_loop
;
5250 DEBUG_STATEMENT (debug
-= 2);
5254 /* Unconditionally jump (without popping any failure points). */
5258 EXTRACT_NUMBER_AND_INCR (mcnt
, p
); /* Get the amount to jump. */
5259 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt
);
5260 p
+= mcnt
; /* Do the jump. */
5261 DEBUG_PRINT2 ("(to %p).\n", p
);
5265 /* Have to succeed matching what follows at least n times.
5266 After that, handle like `on_failure_jump'. */
5268 EXTRACT_NUMBER (mcnt
, p
+ 2);
5269 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt
);
5272 /* Originally, this is how many times we HAVE to succeed. */
5277 STORE_NUMBER_AND_INCR (p
, mcnt
);
5278 DEBUG_PRINT3 (" Setting %p to %d.\n", p
, mcnt
);
5282 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p
+2);
5283 p
[2] = (unsigned char) no_op
;
5284 p
[3] = (unsigned char) no_op
;
5290 EXTRACT_NUMBER (mcnt
, p
+ 2);
5291 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt
);
5293 /* Originally, this is how many times we CAN jump. */
5297 STORE_NUMBER (p
+ 2, mcnt
);
5298 goto unconditional_jump
;
5300 /* If don't have to jump any more, skip over the rest of command. */
5307 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5309 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5311 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
5312 DEBUG_PRINT3 (" Setting %p to %d.\n", p1
, mcnt
);
5313 STORE_NUMBER (p1
, mcnt
);
5319 not = (re_opcode_t
) *(p
- 1) == notwordbound
;
5320 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
5322 /* We SUCCEED (or FAIL) in one of the following cases: */
5324 /* Case 1: D is at the beginning or the end of string. */
5325 if (AT_STRINGS_BEG (d
) || AT_STRINGS_END (d
))
5329 /* C1 is the character before D, S1 is the syntax of C1, C2
5330 is the character at D, and S2 is the syntax of C2. */
5333 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d
- 1));
5334 UPDATE_SYNTAX_TABLE (charpos
);
5336 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5337 GET_CHAR_BEFORE_2 (c1
, d
, string1
, end1
, string2
, end2
);
5340 UPDATE_SYNTAX_TABLE_FORWARD (charpos
+ 1);
5343 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5344 c2
= STRING_CHAR (d
, dend
- d
);
5347 if (/* Case 2: Only one of S1 and S2 is Sword. */
5348 ((s1
== Sword
) != (s2
== Sword
))
5349 /* Case 3: Both of S1 and S2 are Sword, and macro
5350 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5351 || ((s1
== Sword
) && WORD_BOUNDARY_P (c1
, c2
)))
5360 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5362 /* We FAIL in one of the following cases: */
5364 /* Case 1: D is at the end of string. */
5365 if (AT_STRINGS_END (d
))
5369 /* C1 is the character before D, S1 is the syntax of C1, C2
5370 is the character at D, and S2 is the syntax of C2. */
5373 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d
));
5374 UPDATE_SYNTAX_TABLE (charpos
);
5377 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5378 c2
= STRING_CHAR (d
, dend
- d
);
5381 /* Case 2: S2 is not Sword. */
5385 /* Case 3: D is not at the beginning of string ... */
5386 if (!AT_STRINGS_BEG (d
))
5388 GET_CHAR_BEFORE_2 (c1
, d
, string1
, end1
, string2
, end2
);
5390 UPDATE_SYNTAX_TABLE_BACKWARD (charpos
- 1);
5394 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5396 if ((s1
== Sword
) && !WORD_BOUNDARY_P (c1
, c2
))
5403 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5405 /* We FAIL in one of the following cases: */
5407 /* Case 1: D is at the beginning of string. */
5408 if (AT_STRINGS_BEG (d
))
5412 /* C1 is the character before D, S1 is the syntax of C1, C2
5413 is the character at D, and S2 is the syntax of C2. */
5416 int charpos
= SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d
) - 1);
5417 UPDATE_SYNTAX_TABLE (charpos
);
5419 GET_CHAR_BEFORE_2 (c1
, d
, string1
, end1
, string2
, end2
);
5422 /* Case 2: S1 is not Sword. */
5426 /* Case 3: D is not at the end of string ... */
5427 if (!AT_STRINGS_END (d
))
5430 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5431 c2
= STRING_CHAR (d
, dend
- d
);
5433 UPDATE_SYNTAX_TABLE_FORWARD (charpos
);
5437 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
5439 if ((s2
== Sword
) && !WORD_BOUNDARY_P (c1
, c2
))
5447 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5448 if (PTR_BYTE_POS (d
) >= PT_BYTE
)
5453 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5454 if (PTR_BYTE_POS (d
) != PT_BYTE
)
5459 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5460 if (PTR_BYTE_POS (d
) <= PT_BYTE
)
5465 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt
);
5470 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5476 int pos1
= SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d
));
5477 UPDATE_SYNTAX_TABLE (pos1
);
5484 /* we must concern about multibyte form, ... */
5485 c
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
5487 /* everything should be handled as ASCII, even though it
5488 looks like multibyte form. */
5491 if (SYNTAX (c
) != (enum syntaxcode
) mcnt
)
5498 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt
);
5500 goto matchnotsyntax
;
5503 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5509 int pos1
= SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d
));
5510 UPDATE_SYNTAX_TABLE (pos1
);
5517 c
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
5521 if (SYNTAX (c
) == (enum syntaxcode
) mcnt
)
5528 DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p
);
5535 c
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
5539 if (!CHAR_HAS_CATEGORY (c
, mcnt
))
5545 case notcategoryspec
:
5546 DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p
);
5553 c
= STRING_CHAR_AND_LENGTH (d
, dend
- d
, len
);
5557 if (CHAR_HAS_CATEGORY (c
, mcnt
))
5563 #else /* not emacs */
5565 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5567 if (!WORDCHAR_P (d
))
5573 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5579 #endif /* not emacs */
5584 continue; /* Successfully executed one pattern command; keep going. */
5587 /* We goto here if a matching operation fails. */
5590 if (!FAIL_STACK_EMPTY ())
5594 /* A restart point is known. Restore to that state. */
5595 DEBUG_PRINT1 ("\nFAIL:\n");
5596 POP_FAILURE_POINT (str
, pat
);
5597 switch (SWITCH_ENUM_CAST ((re_opcode_t
) *pat
++))
5599 case on_failure_keep_string_jump
:
5600 assert (str
== NULL
);
5601 goto continue_failure_jump
;
5603 case on_failure_jump_exclusive
:
5604 /* If something has matched, the alternative will not match,
5605 so we might as well keep popping right away. */
5606 if (0 /* d != str && d != string2 */) /* Don't bother. -sm */
5607 /* (d == string2 && str == end1) => (d =~ str) */
5611 case on_failure_jump_loop
:
5612 case on_failure_jump
:
5615 continue_failure_jump
:
5616 EXTRACT_NUMBER_AND_INCR (mcnt
, pat
);
5624 assert (p
>= bufp
->buffer
&& p
<= pend
);
5626 if (d
>= string1
&& d
<= end1
)
5630 break; /* Matching at this starting point really fails. */
5634 goto restore_best_regs
;
5638 return -1; /* Failure to match. */
5641 /* Subroutine definitions for re_match_2. */
5643 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5644 bytes; nonzero otherwise. */
5647 bcmp_translate (s1
, s2
, len
, translate
)
5648 unsigned char *s1
, *s2
;
5650 RE_TRANSLATE_TYPE translate
;
5652 register unsigned char *p1
= s1
, *p2
= s2
;
5653 unsigned char *p1_end
= s1
+ len
;
5654 unsigned char *p2_end
= s2
+ len
;
5656 while (p1
!= p1_end
&& p2
!= p2_end
)
5658 int p1_charlen
, p2_charlen
;
5661 /* FIXME: This assumes `multibyte = true'. */
5662 p1_ch
= STRING_CHAR_AND_LENGTH (p1
, p1_end
- p1
, p1_charlen
);
5663 p2_ch
= STRING_CHAR_AND_LENGTH (p2
, p2_end
- p2
, p2_charlen
);
5665 if (RE_TRANSLATE (translate
, p1_ch
)
5666 != RE_TRANSLATE (translate
, p2_ch
))
5669 p1
+= p1_charlen
, p2
+= p2_charlen
;
5672 if (p1
!= p1_end
|| p2
!= p2_end
)
5678 /* Entry points for GNU code. */
5680 /* re_compile_pattern is the GNU regular expression compiler: it
5681 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5682 Returns 0 if the pattern was valid, otherwise an error string.
5684 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5685 are set in BUFP on entry.
5687 We call regex_compile to do the actual compilation. */
5690 re_compile_pattern (pattern
, length
, bufp
)
5691 const char *pattern
;
5693 struct re_pattern_buffer
*bufp
;
5697 /* GNU code is written to assume at least RE_NREGS registers will be set
5698 (and at least one extra will be -1). */
5699 bufp
->regs_allocated
= REGS_UNALLOCATED
;
5701 /* And GNU code determines whether or not to get register information
5702 by passing null for the REGS argument to re_match, etc., not by
5706 /* Match anchors at newline. */
5707 bufp
->newline_anchor
= 1;
5709 ret
= regex_compile (pattern
, length
, re_syntax_options
, bufp
);
5713 return gettext (re_error_msgid
[(int) ret
]);
5716 /* Entry points compatible with 4.2 BSD regex library. We don't define
5717 them unless specifically requested. */
5719 #if defined (_REGEX_RE_COMP) || defined (_LIBC)
5721 /* BSD has one and only one pattern buffer. */
5722 static struct re_pattern_buffer re_comp_buf
;
5726 /* Make these definitions weak in libc, so POSIX programs can redefine
5727 these names if they don't use our functions, and still use
5728 regcomp/regexec below without link errors. */
5738 if (!re_comp_buf
.buffer
)
5739 return gettext ("No previous regular expression");
5743 if (!re_comp_buf
.buffer
)
5745 re_comp_buf
.buffer
= (unsigned char *) malloc (200);
5746 if (re_comp_buf
.buffer
== NULL
)
5747 return gettext (re_error_msgid
[(int) REG_ESPACE
]);
5748 re_comp_buf
.allocated
= 200;
5750 re_comp_buf
.fastmap
= (char *) malloc (1 << BYTEWIDTH
);
5751 if (re_comp_buf
.fastmap
== NULL
)
5752 return gettext (re_error_msgid
[(int) REG_ESPACE
]);
5755 /* Since `re_exec' always passes NULL for the `regs' argument, we
5756 don't need to initialize the pattern buffer fields which affect it. */
5758 /* Match anchors at newlines. */
5759 re_comp_buf
.newline_anchor
= 1;
5761 ret
= regex_compile (s
, strlen (s
), re_syntax_options
, &re_comp_buf
);
5766 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5767 return (char *) gettext (re_error_msgid
[(int) ret
]);
5778 const int len
= strlen (s
);
5780 0 <= re_search (&re_comp_buf
, s
, len
, 0, len
, (struct re_registers
*) 0);
5782 #endif /* _REGEX_RE_COMP */
5784 /* POSIX.2 functions. Don't define these for Emacs. */
5788 /* regcomp takes a regular expression as a string and compiles it.
5790 PREG is a regex_t *. We do not expect any fields to be initialized,
5791 since POSIX says we shouldn't. Thus, we set
5793 `buffer' to the compiled pattern;
5794 `used' to the length of the compiled pattern;
5795 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5796 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5797 RE_SYNTAX_POSIX_BASIC;
5798 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5799 `fastmap' and `fastmap_accurate' to zero;
5800 `re_nsub' to the number of subexpressions in PATTERN.
5802 PATTERN is the address of the pattern string.
5804 CFLAGS is a series of bits which affect compilation.
5806 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5807 use POSIX basic syntax.
5809 If REG_NEWLINE is set, then . and [^...] don't match newline.
5810 Also, regexec will try a match beginning after every newline.
5812 If REG_ICASE is set, then we considers upper- and lowercase
5813 versions of letters to be equivalent when matching.
5815 If REG_NOSUB is set, then when PREG is passed to regexec, that
5816 routine will report only success or failure, and nothing about the
5819 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5820 the return codes and their meanings.) */
5823 regcomp (preg
, pattern
, cflags
)
5825 const char *pattern
;
5830 = (cflags
& REG_EXTENDED
) ?
5831 RE_SYNTAX_POSIX_EXTENDED
: RE_SYNTAX_POSIX_BASIC
;
5833 /* regex_compile will allocate the space for the compiled pattern. */
5835 preg
->allocated
= 0;
5838 /* Don't bother to use a fastmap when searching. This simplifies the
5839 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5840 characters after newlines into the fastmap. This way, we just try
5844 if (cflags
& REG_ICASE
)
5849 = (RE_TRANSLATE_TYPE
) malloc (CHAR_SET_SIZE
5850 * sizeof (*(RE_TRANSLATE_TYPE
)0));
5851 if (preg
->translate
== NULL
)
5852 return (int) REG_ESPACE
;
5854 /* Map uppercase characters to corresponding lowercase ones. */
5855 for (i
= 0; i
< CHAR_SET_SIZE
; i
++)
5856 preg
->translate
[i
] = ISUPPER (i
) ? tolower (i
) : i
;
5859 preg
->translate
= NULL
;
5861 /* If REG_NEWLINE is set, newlines are treated differently. */
5862 if (cflags
& REG_NEWLINE
)
5863 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5864 syntax
&= ~RE_DOT_NEWLINE
;
5865 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
5866 /* It also changes the matching behavior. */
5867 preg
->newline_anchor
= 1;
5870 preg
->newline_anchor
= 0;
5872 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
5874 /* POSIX says a null character in the pattern terminates it, so we
5875 can use strlen here in compiling the pattern. */
5876 ret
= regex_compile (pattern
, strlen (pattern
), syntax
, preg
);
5878 /* POSIX doesn't distinguish between an unmatched open-group and an
5879 unmatched close-group: both are REG_EPAREN. */
5880 if (ret
== REG_ERPAREN
) ret
= REG_EPAREN
;
5886 /* regexec searches for a given pattern, specified by PREG, in the
5889 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5890 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5891 least NMATCH elements, and we set them to the offsets of the
5892 corresponding matched substrings.
5894 EFLAGS specifies `execution flags' which affect matching: if
5895 REG_NOTBOL is set, then ^ does not match at the beginning of the
5896 string; if REG_NOTEOL is set, then $ does not match at the end.
5898 We return 0 if we find a match and REG_NOMATCH if not. */
5901 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
5902 const regex_t
*preg
;
5905 regmatch_t pmatch
[];
5909 struct re_registers regs
;
5910 regex_t private_preg
;
5911 int len
= strlen (string
);
5912 boolean want_reg_info
= !preg
->no_sub
&& nmatch
> 0;
5914 private_preg
= *preg
;
5916 private_preg
.not_bol
= !!(eflags
& REG_NOTBOL
);
5917 private_preg
.not_eol
= !!(eflags
& REG_NOTEOL
);
5919 /* The user has told us exactly how many registers to return
5920 information about, via `nmatch'. We have to pass that on to the
5921 matching routines. */
5922 private_preg
.regs_allocated
= REGS_FIXED
;
5926 regs
.num_regs
= nmatch
;
5927 regs
.start
= TALLOC (nmatch
, regoff_t
);
5928 regs
.end
= TALLOC (nmatch
, regoff_t
);
5929 if (regs
.start
== NULL
|| regs
.end
== NULL
)
5930 return (int) REG_NOMATCH
;
5933 /* Perform the searching operation. */
5934 ret
= re_search (&private_preg
, string
, len
,
5935 /* start: */ 0, /* range: */ len
,
5936 want_reg_info
? ®s
: (struct re_registers
*) 0);
5938 /* Copy the register information to the POSIX structure. */
5945 for (r
= 0; r
< nmatch
; r
++)
5947 pmatch
[r
].rm_so
= regs
.start
[r
];
5948 pmatch
[r
].rm_eo
= regs
.end
[r
];
5952 /* If we needed the temporary register info, free the space now. */
5957 /* We want zero return to mean success, unlike `re_search'. */
5958 return ret
>= 0 ? (int) REG_NOERROR
: (int) REG_NOMATCH
;
5962 /* Returns a message corresponding to an error code, ERRCODE, returned
5963 from either regcomp or regexec. We don't use PREG here. */
5966 regerror (errcode
, preg
, errbuf
, errbuf_size
)
5968 const regex_t
*preg
;
5976 || errcode
>= (sizeof (re_error_msgid
) / sizeof (re_error_msgid
[0])))
5977 /* Only error codes returned by the rest of the code should be passed
5978 to this routine. If we are given anything else, or if other regex
5979 code generates an invalid error code, then the program has a bug.
5980 Dump core so we can fix it. */
5983 msg
= gettext (re_error_msgid
[errcode
]);
5985 msg_size
= strlen (msg
) + 1; /* Includes the null. */
5987 if (errbuf_size
!= 0)
5989 if (msg_size
> errbuf_size
)
5991 strncpy (errbuf
, msg
, errbuf_size
- 1);
5992 errbuf
[errbuf_size
- 1] = 0;
5995 strcpy (errbuf
, msg
);
6002 /* Free dynamically allocated space used by PREG. */
6008 if (preg
->buffer
!= NULL
)
6009 free (preg
->buffer
);
6010 preg
->buffer
= NULL
;
6012 preg
->allocated
= 0;
6015 if (preg
->fastmap
!= NULL
)
6016 free (preg
->fastmap
);
6017 preg
->fastmap
= NULL
;
6018 preg
->fastmap_accurate
= 0;
6020 if (preg
->translate
!= NULL
)
6021 free (preg
->translate
);
6022 preg
->translate
= NULL
;
6025 #endif /* not emacs */