(re_match_2): Fix string shortening (to fit `stop') to make sure
[emacs.git] / src / regex.c
blob82c5d76f4dca5e03073654ca382c39f971a547c6
1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
3 internationalization features.)
5 Copyright (C) 1993,94,95,96,97,98,2000 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA. */
22 /* TODO:
23 - detect nasty infinite loops like "\\(\\)+?ab" when matching "ac".
24 - use `keep_string' more often than just .*\n.
25 - structure the opcode space into opcode+flag.
26 - merge with glic's regex.[ch]
28 That's it for now -sm */
30 /* AIX requires this to be the first thing in the file. */
31 #if defined (_AIX) && !defined (REGEX_MALLOC)
32 #pragma alloca
33 #endif
35 #undef _GNU_SOURCE
36 #define _GNU_SOURCE
38 #ifdef emacs
39 /* Converts the pointer to the char to BEG-based offset from the start. */
40 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
41 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
42 #else
43 #define PTR_TO_OFFSET(d) 0
44 #endif
46 #ifdef HAVE_CONFIG_H
47 #include <config.h>
48 #endif
50 /* We need this for `regex.h', and perhaps for the Emacs include files. */
51 #include <sys/types.h>
53 /* This is for other GNU distributions with internationalized messages. */
54 #if HAVE_LIBINTL_H || defined (_LIBC)
55 # include <libintl.h>
56 #else
57 # define gettext(msgid) (msgid)
58 #endif
60 #ifndef gettext_noop
61 /* This define is so xgettext can find the internationalizable
62 strings. */
63 #define gettext_noop(String) String
64 #endif
66 /* The `emacs' switch turns on certain matching commands
67 that make sense only in Emacs. */
68 #ifdef emacs
70 #include "lisp.h"
71 #include "buffer.h"
73 /* Make syntax table lookup grant data in gl_state. */
74 #define SYNTAX_ENTRY_VIA_PROPERTY
76 #include "syntax.h"
77 #include "charset.h"
78 #include "category.h"
80 #define malloc xmalloc
81 #define realloc xrealloc
82 #define free xfree
84 #else /* not emacs */
86 /* If we are not linking with Emacs proper,
87 we can't use the relocating allocator
88 even if config.h says that we can. */
89 #undef REL_ALLOC
91 #if defined (STDC_HEADERS) || defined (_LIBC)
92 #include <stdlib.h>
93 #else
94 char *malloc ();
95 char *realloc ();
96 #endif
98 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
99 If nothing else has been done, use the method below. */
100 #ifdef INHIBIT_STRING_HEADER
101 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
102 #if !defined (bzero) && !defined (bcopy)
103 #undef INHIBIT_STRING_HEADER
104 #endif
105 #endif
106 #endif
108 /* This is the normal way of making sure we have a bcopy and a bzero.
109 This is used in most programs--a few other programs avoid this
110 by defining INHIBIT_STRING_HEADER. */
111 #ifndef INHIBIT_STRING_HEADER
112 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
113 #include <string.h>
114 #ifndef bcmp
115 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
116 #endif
117 #ifndef bcopy
118 #define bcopy(s, d, n) memcpy ((d), (s), (n))
119 #endif
120 #ifndef bzero
121 #define bzero(s, n) memset ((s), 0, (n))
122 #endif
123 #else
124 #include <strings.h>
125 #endif
126 #endif
128 /* Define the syntax stuff for \<, \>, etc. */
130 /* This must be nonzero for the wordchar and notwordchar pattern
131 commands in re_match_2. */
132 #ifndef Sword
133 #define Sword 1
134 #endif
136 #ifdef SWITCH_ENUM_BUG
137 #define SWITCH_ENUM_CAST(x) ((int)(x))
138 #else
139 #define SWITCH_ENUM_CAST(x) (x)
140 #endif
142 #ifdef SYNTAX_TABLE
144 extern char *re_syntax_table;
146 #else /* not SYNTAX_TABLE */
148 /* How many characters in the character set. */
149 #define CHAR_SET_SIZE 256
151 static char re_syntax_table[CHAR_SET_SIZE];
153 static void
154 init_syntax_once ()
156 register int c;
157 static int done = 0;
159 if (done)
160 return;
162 bzero (re_syntax_table, sizeof re_syntax_table);
164 for (c = 'a'; c <= 'z'; c++)
165 re_syntax_table[c] = Sword;
167 for (c = 'A'; c <= 'Z'; c++)
168 re_syntax_table[c] = Sword;
170 for (c = '0'; c <= '9'; c++)
171 re_syntax_table[c] = Sword;
173 re_syntax_table['_'] = Sword;
175 done = 1;
178 #endif /* not SYNTAX_TABLE */
180 #define SYNTAX(c) re_syntax_table[c]
182 /* Dummy macros for non-Emacs environments. */
183 #define BASE_LEADING_CODE_P(c) (0)
184 #define WORD_BOUNDARY_P(c1, c2) (0)
185 #define CHAR_HEAD_P(p) (1)
186 #define SINGLE_BYTE_CHAR_P(c) (1)
187 #define SAME_CHARSET_P(c1, c2) (1)
188 #define MULTIBYTE_FORM_LENGTH(p, s) (1)
189 #define STRING_CHAR(p, s) (*(p))
190 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
191 #define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
192 (c = ((p) == (end1) ? *(str2) : *(p)))
193 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
194 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
195 #endif /* not emacs */
197 /* Get the interface, including the syntax bits. */
198 #include "regex.h"
200 /* isalpha etc. are used for the character classes. */
201 #include <ctype.h>
203 #ifdef emacs
205 /* 1 if C is an ASCII character. */
206 #define IS_REAL_ASCII(c) ((c) < 0200)
208 /* 1 if C is a unibyte character. */
209 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
211 /* The Emacs definitions should not be directly affected by locales. */
213 /* In Emacs, these are only used for single-byte characters. */
214 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
215 #define ISCNTRL(c) ((c) < ' ')
216 #define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
217 || ((c) >= 'a' && (c) <= 'f') \
218 || ((c) >= 'A' && (c) <= 'F'))
220 /* This is only used for single-byte characters. */
221 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
223 /* The rest must handle multibyte characters. */
225 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
226 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
227 : 1)
229 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
230 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
231 : 1)
233 #define ISALNUM(c) (IS_REAL_ASCII (c) \
234 ? (((c) >= 'a' && (c) <= 'z') \
235 || ((c) >= 'A' && (c) <= 'Z') \
236 || ((c) >= '0' && (c) <= '9')) \
237 : SYNTAX (c) == Sword)
239 #define ISALPHA(c) (IS_REAL_ASCII (c) \
240 ? (((c) >= 'a' && (c) <= 'z') \
241 || ((c) >= 'A' && (c) <= 'Z')) \
242 : SYNTAX (c) == Sword)
244 #define ISLOWER(c) (LOWERCASEP (c))
246 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
247 ? ((c) > ' ' && (c) < 0177 \
248 && !(((c) >= 'a' && (c) <= 'z') \
249 || ((c) >= 'A' && (c) <= 'Z') \
250 || ((c) >= '0' && (c) <= '9'))) \
251 : SYNTAX (c) != Sword)
253 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
255 #define ISUPPER(c) (UPPERCASEP (c))
257 #define ISWORD(c) (SYNTAX (c) == Sword)
259 #else /* not emacs */
261 /* Jim Meyering writes:
263 "... Some ctype macros are valid only for character codes that
264 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
265 using /bin/cc or gcc but without giving an ansi option). So, all
266 ctype uses should be through macros like ISPRINT... If
267 STDC_HEADERS is defined, then autoconf has verified that the ctype
268 macros don't need to be guarded with references to isascii. ...
269 Defining isascii to 1 should let any compiler worth its salt
270 eliminate the && through constant folding." */
272 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
273 #define ISASCII(c) 1
274 #else
275 #define ISASCII(c) isascii(c)
276 #endif
278 /* 1 if C is an ASCII character. */
279 #define IS_REAL_ASCII(c) ((c) < 0200)
281 /* This distinction is not meaningful, except in Emacs. */
282 #define ISUNIBYTE(c) 1
284 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
288 #ifdef isblank
289 #define ISBLANK(c) (ISASCII (c) && isblank (c))
290 #else
291 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
292 #endif
293 #ifdef isgraph
294 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
295 #else
296 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
297 #endif
299 #define ISPRINT(c) (ISASCII (c) && isprint (c))
300 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
301 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
302 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
303 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
304 #define ISLOWER(c) (ISASCII (c) && islower (c))
305 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
306 #define ISSPACE(c) (ISASCII (c) && isspace (c))
307 #define ISUPPER(c) (ISASCII (c) && isupper (c))
308 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
310 #define ISWORD(c) ISALPHA(c)
312 #endif /* not emacs */
314 #ifndef NULL
315 #define NULL (void *)0
316 #endif
318 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
319 since ours (we hope) works properly with all combinations of
320 machines, compilers, `char' and `unsigned char' argument types.
321 (Per Bothner suggested the basic approach.) */
322 #undef SIGN_EXTEND_CHAR
323 #if __STDC__
324 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
325 #else /* not __STDC__ */
326 /* As in Harbison and Steele. */
327 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
328 #endif
330 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
331 use `alloca' instead of `malloc'. This is because using malloc in
332 re_search* or re_match* could cause memory leaks when C-g is used in
333 Emacs; also, malloc is slower and causes storage fragmentation. On
334 the other hand, malloc is more portable, and easier to debug.
336 Because we sometimes use alloca, some routines have to be macros,
337 not functions -- `alloca'-allocated space disappears at the end of the
338 function it is called in. */
340 #ifdef REGEX_MALLOC
342 #define REGEX_ALLOCATE malloc
343 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
344 #define REGEX_FREE free
346 #else /* not REGEX_MALLOC */
348 /* Emacs already defines alloca, sometimes. */
349 #ifndef alloca
351 /* Make alloca work the best possible way. */
352 #ifdef __GNUC__
353 #define alloca __builtin_alloca
354 #else /* not __GNUC__ */
355 #if HAVE_ALLOCA_H
356 #include <alloca.h>
357 #else /* not __GNUC__ or HAVE_ALLOCA_H */
358 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */
359 #ifndef _AIX /* Already did AIX, up at the top. */
360 char *alloca ();
361 #endif /* not _AIX */
362 #endif
363 #endif /* not HAVE_ALLOCA_H */
364 #endif /* not __GNUC__ */
366 #endif /* not alloca */
368 #define REGEX_ALLOCATE alloca
370 /* Assumes a `char *destination' variable. */
371 #define REGEX_REALLOCATE(source, osize, nsize) \
372 (destination = (char *) alloca (nsize), \
373 bcopy (source, destination, osize), \
374 destination)
376 /* No need to do anything to free, after alloca. */
377 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
379 #endif /* not REGEX_MALLOC */
381 /* Define how to allocate the failure stack. */
383 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
385 #define REGEX_ALLOCATE_STACK(size) \
386 r_alloc (&failure_stack_ptr, (size))
387 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
388 r_re_alloc (&failure_stack_ptr, (nsize))
389 #define REGEX_FREE_STACK(ptr) \
390 r_alloc_free (&failure_stack_ptr)
392 #else /* not using relocating allocator */
394 #ifdef REGEX_MALLOC
396 #define REGEX_ALLOCATE_STACK malloc
397 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
398 #define REGEX_FREE_STACK free
400 #else /* not REGEX_MALLOC */
402 #define REGEX_ALLOCATE_STACK alloca
404 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
405 REGEX_REALLOCATE (source, osize, nsize)
406 /* No need to explicitly free anything. */
407 #define REGEX_FREE_STACK(arg)
409 #endif /* not REGEX_MALLOC */
410 #endif /* not using relocating allocator */
413 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
414 `string1' or just past its end. This works if PTR is NULL, which is
415 a good thing. */
416 #define FIRST_STRING_P(ptr) \
417 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
419 /* (Re)Allocate N items of type T using malloc, or fail. */
420 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
421 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
422 #define RETALLOC_IF(addr, n, t) \
423 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
424 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
426 #define BYTEWIDTH 8 /* In bits. */
428 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
430 #undef MAX
431 #undef MIN
432 #define MAX(a, b) ((a) > (b) ? (a) : (b))
433 #define MIN(a, b) ((a) < (b) ? (a) : (b))
435 /* Type of source-pattern and string chars. */
436 typedef const unsigned char re_char;
438 typedef char boolean;
439 #define false 0
440 #define true 1
442 static int re_match_2_internal ();
444 /* These are the command codes that appear in compiled regular
445 expressions. Some opcodes are followed by argument bytes. A
446 command code can specify any interpretation whatsoever for its
447 arguments. Zero bytes may appear in the compiled regular expression. */
449 typedef enum
451 no_op = 0,
453 /* Succeed right away--no more backtracking. */
454 succeed,
456 /* Followed by one byte giving n, then by n literal bytes. */
457 exactn,
459 /* Matches any (more or less) character. */
460 anychar,
462 /* Matches any one char belonging to specified set. First
463 following byte is number of bitmap bytes. Then come bytes
464 for a bitmap saying which chars are in. Bits in each byte
465 are ordered low-bit-first. A character is in the set if its
466 bit is 1. A character too large to have a bit in the map is
467 automatically not in the set.
469 If the length byte has the 0x80 bit set, then that stuff
470 is followed by a range table:
471 2 bytes of flags for character sets (low 8 bits, high 8 bits)
472 See RANGE_TABLE_WORK_BITS below.
473 2 bytes, the number of pairs that follow
474 pairs, each 2 multibyte characters,
475 each multibyte character represented as 3 bytes. */
476 charset,
478 /* Same parameters as charset, but match any character that is
479 not one of those specified. */
480 charset_not,
482 /* Start remembering the text that is matched, for storing in a
483 register. Followed by one byte with the register number, in
484 the range 0 to one less than the pattern buffer's re_nsub
485 field. */
486 start_memory,
488 /* Stop remembering the text that is matched and store it in a
489 memory register. Followed by one byte with the register
490 number, in the range 0 to one less than `re_nsub' in the
491 pattern buffer. */
492 stop_memory,
494 /* Match a duplicate of something remembered. Followed by one
495 byte containing the register number. */
496 duplicate,
498 /* Fail unless at beginning of line. */
499 begline,
501 /* Fail unless at end of line. */
502 endline,
504 /* Succeeds if at beginning of buffer (if emacs) or at beginning
505 of string to be matched (if not). */
506 begbuf,
508 /* Analogously, for end of buffer/string. */
509 endbuf,
511 /* Followed by two byte relative address to which to jump. */
512 jump,
514 /* Followed by two-byte relative address of place to resume at
515 in case of failure. */
516 on_failure_jump,
518 /* Like on_failure_jump, but pushes a placeholder instead of the
519 current string position when executed. */
520 on_failure_keep_string_jump,
522 /* Like `on_failure_jump', except that it assumes that the
523 pattern following it is mutually exclusive with the pattern
524 at the destination of the jump: if one matches something,
525 the other won't match at all.
526 Always used via `on_failure_jump_smart'. */
527 on_failure_jump_exclusive,
529 /* Just like `on_failure_jump', except that it checks that we
530 don't get stuck in an infinite loop (matching an empty string
531 indefinitely). */
532 on_failure_jump_loop,
534 /* A smart `on_failure_jump' used for greedy * and + operators.
535 It analyses the loop before which it is put and if the
536 loop does not require backtracking, it changes itself to
537 `on_failure_jump_exclusive', else it just defaults to
538 changing itself into `on_failure_jump_loop'. */
539 on_failure_jump_smart,
541 /* Followed by two-byte relative address and two-byte number n.
542 After matching N times, jump to the address upon failure. */
543 succeed_n,
545 /* Followed by two-byte relative address, and two-byte number n.
546 Jump to the address N times, then fail. */
547 jump_n,
549 /* Set the following two-byte relative address to the
550 subsequent two-byte number. The address *includes* the two
551 bytes of number. */
552 set_number_at,
554 wordchar, /* Matches any word-constituent character. */
555 notwordchar, /* Matches any char that is not a word-constituent. */
557 wordbeg, /* Succeeds if at word beginning. */
558 wordend, /* Succeeds if at word end. */
560 wordbound, /* Succeeds if at a word boundary. */
561 notwordbound /* Succeeds if not at a word boundary. */
563 #ifdef emacs
564 ,before_dot, /* Succeeds if before point. */
565 at_dot, /* Succeeds if at point. */
566 after_dot, /* Succeeds if after point. */
568 /* Matches any character whose syntax is specified. Followed by
569 a byte which contains a syntax code, e.g., Sword. */
570 syntaxspec,
572 /* Matches any character whose syntax is not that specified. */
573 notsyntaxspec,
575 /* Matches any character whose category-set contains the specified
576 category. The operator is followed by a byte which contains a
577 category code (mnemonic ASCII character). */
578 categoryspec,
580 /* Matches any character whose category-set does not contain the
581 specified category. The operator is followed by a byte which
582 contains the category code (mnemonic ASCII character). */
583 notcategoryspec
584 #endif /* emacs */
585 } re_opcode_t;
587 /* Common operations on the compiled pattern. */
589 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
591 #define STORE_NUMBER(destination, number) \
592 do { \
593 (destination)[0] = (number) & 0377; \
594 (destination)[1] = (number) >> 8; \
595 } while (0)
597 /* Same as STORE_NUMBER, except increment DESTINATION to
598 the byte after where the number is stored. Therefore, DESTINATION
599 must be an lvalue. */
601 #define STORE_NUMBER_AND_INCR(destination, number) \
602 do { \
603 STORE_NUMBER (destination, number); \
604 (destination) += 2; \
605 } while (0)
607 /* Put into DESTINATION a number stored in two contiguous bytes starting
608 at SOURCE. */
610 #define EXTRACT_NUMBER(destination, source) \
611 do { \
612 (destination) = *(source) & 0377; \
613 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
614 } while (0)
616 #ifdef DEBUG
617 static void
618 extract_number (dest, source)
619 int *dest;
620 unsigned char *source;
622 int temp = SIGN_EXTEND_CHAR (*(source + 1));
623 *dest = *source & 0377;
624 *dest += temp << 8;
627 #ifndef EXTRACT_MACROS /* To debug the macros. */
628 #undef EXTRACT_NUMBER
629 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
630 #endif /* not EXTRACT_MACROS */
632 #endif /* DEBUG */
634 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
635 SOURCE must be an lvalue. */
637 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
638 do { \
639 EXTRACT_NUMBER (destination, source); \
640 (source) += 2; \
641 } while (0)
643 #ifdef DEBUG
644 static void
645 extract_number_and_incr (destination, source)
646 int *destination;
647 unsigned char **source;
649 extract_number (destination, *source);
650 *source += 2;
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
656 extract_number_and_incr (&dest, &src)
657 #endif /* not EXTRACT_MACROS */
659 #endif /* DEBUG */
661 /* Store a multibyte character in three contiguous bytes starting
662 DESTINATION, and increment DESTINATION to the byte after where the
663 character is stored. Therefore, DESTINATION must be an lvalue. */
665 #define STORE_CHARACTER_AND_INCR(destination, character) \
666 do { \
667 (destination)[0] = (character) & 0377; \
668 (destination)[1] = ((character) >> 8) & 0377; \
669 (destination)[2] = (character) >> 16; \
670 (destination) += 3; \
671 } while (0)
673 /* Put into DESTINATION a character stored in three contiguous bytes
674 starting at SOURCE. */
676 #define EXTRACT_CHARACTER(destination, source) \
677 do { \
678 (destination) = ((source)[0] \
679 | ((source)[1] << 8) \
680 | ((source)[2] << 16)); \
681 } while (0)
684 /* Macros for charset. */
686 /* Size of bitmap of charset P in bytes. P is a start of charset,
687 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
688 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
690 /* Nonzero if charset P has range table. */
691 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
693 /* Return the address of range table of charset P. But not the start
694 of table itself, but the before where the number of ranges is
695 stored. `2 +' means to skip re_opcode_t and size of bitmap,
696 and the 2 bytes of flags at the start of the range table. */
697 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
699 /* Extract the bit flags that start a range table. */
700 #define CHARSET_RANGE_TABLE_BITS(p) \
701 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
702 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
704 /* Test if C is listed in the bitmap of charset P. */
705 #define CHARSET_LOOKUP_BITMAP(p, c) \
706 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
707 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
709 /* Return the address of end of RANGE_TABLE. COUNT is number of
710 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
711 is start of range and end of range. `* 3' is size of each start
712 and end. */
713 #define CHARSET_RANGE_TABLE_END(range_table, count) \
714 ((range_table) + (count) * 2 * 3)
716 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
717 COUNT is number of ranges in RANGE_TABLE. */
718 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
719 do \
721 int range_start, range_end; \
722 unsigned char *p; \
723 unsigned char *range_table_end \
724 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
726 for (p = (range_table); p < range_table_end; p += 2 * 3) \
728 EXTRACT_CHARACTER (range_start, p); \
729 EXTRACT_CHARACTER (range_end, p + 3); \
731 if (range_start <= (c) && (c) <= range_end) \
733 (not) = !(not); \
734 break; \
738 while (0)
740 /* Test if C is in range table of CHARSET. The flag NOT is negated if
741 C is listed in it. */
742 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
743 do \
745 /* Number of ranges in range table. */ \
746 int count; \
747 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
749 EXTRACT_NUMBER_AND_INCR (count, range_table); \
750 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
752 while (0)
754 /* If DEBUG is defined, Regex prints many voluminous messages about what
755 it is doing (if the variable `debug' is nonzero). If linked with the
756 main program in `iregex.c', you can enter patterns and strings
757 interactively. And if linked with the main program in `main.c' and
758 the other test files, you can run the already-written tests. */
760 #ifdef DEBUG
762 /* We use standard I/O for debugging. */
763 #include <stdio.h>
765 /* It is useful to test things that ``must'' be true when debugging. */
766 #include <assert.h>
768 static int debug = -100000;
770 #define DEBUG_STATEMENT(e) e
771 #define DEBUG_PRINT1(x) if (debug > 0) printf (x)
772 #define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
773 #define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
774 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
775 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
776 if (debug > 0) print_partial_compiled_pattern (s, e)
777 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
778 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
781 /* Print the fastmap in human-readable form. */
783 void
784 print_fastmap (fastmap)
785 char *fastmap;
787 unsigned was_a_range = 0;
788 unsigned i = 0;
790 while (i < (1 << BYTEWIDTH))
792 if (fastmap[i++])
794 was_a_range = 0;
795 putchar (i - 1);
796 while (i < (1 << BYTEWIDTH) && fastmap[i])
798 was_a_range = 1;
799 i++;
801 if (was_a_range)
803 printf ("-");
804 putchar (i - 1);
808 putchar ('\n');
812 /* Print a compiled pattern string in human-readable form, starting at
813 the START pointer into it and ending just before the pointer END. */
815 void
816 print_partial_compiled_pattern (start, end)
817 unsigned char *start;
818 unsigned char *end;
820 int mcnt, mcnt2;
821 unsigned char *p = start;
822 unsigned char *pend = end;
824 if (start == NULL)
826 printf ("(null)\n");
827 return;
830 /* Loop over pattern commands. */
831 while (p < pend)
833 printf ("%d:\t", p - start);
835 switch ((re_opcode_t) *p++)
837 case no_op:
838 printf ("/no_op");
839 break;
841 case succeed:
842 printf ("/succeed");
843 break;
845 case exactn:
846 mcnt = *p++;
847 printf ("/exactn/%d", mcnt);
850 putchar ('/');
851 putchar (*p++);
853 while (--mcnt);
854 break;
856 case start_memory:
857 printf ("/start_memory/%d", *p++);
858 break;
860 case stop_memory:
861 printf ("/stop_memory/%d", *p++);
862 break;
864 case duplicate:
865 printf ("/duplicate/%d", *p++);
866 break;
868 case anychar:
869 printf ("/anychar");
870 break;
872 case charset:
873 case charset_not:
875 register int c, last = -100;
876 register int in_range = 0;
877 int length = CHARSET_BITMAP_SIZE (p - 1);
878 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
880 printf ("/charset [%s",
881 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
883 assert (p + *p < pend);
885 for (c = 0; c < 256; c++)
886 if (c / 8 < length
887 && (p[1 + (c/8)] & (1 << (c % 8))))
889 /* Are we starting a range? */
890 if (last + 1 == c && ! in_range)
892 putchar ('-');
893 in_range = 1;
895 /* Have we broken a range? */
896 else if (last + 1 != c && in_range)
898 putchar (last);
899 in_range = 0;
902 if (! in_range)
903 putchar (c);
905 last = c;
908 if (in_range)
909 putchar (last);
911 putchar (']');
913 p += 1 + length;
915 if (has_range_table)
917 int count;
918 printf ("has-range-table");
920 /* ??? Should print the range table; for now, just skip it. */
921 p += 2; /* skip range table bits */
922 EXTRACT_NUMBER_AND_INCR (count, p);
923 p = CHARSET_RANGE_TABLE_END (p, count);
926 break;
928 case begline:
929 printf ("/begline");
930 break;
932 case endline:
933 printf ("/endline");
934 break;
936 case on_failure_jump:
937 extract_number_and_incr (&mcnt, &p);
938 printf ("/on_failure_jump to %d", p + mcnt - start);
939 break;
941 case on_failure_keep_string_jump:
942 extract_number_and_incr (&mcnt, &p);
943 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
944 break;
946 case on_failure_jump_exclusive:
947 extract_number_and_incr (&mcnt, &p);
948 printf ("/on_failure_jump_exclusive to %d", p + mcnt - start);
949 break;
951 case on_failure_jump_loop:
952 extract_number_and_incr (&mcnt, &p);
953 printf ("/on_failure_jump_loop to %d", p + mcnt - start);
954 break;
956 case on_failure_jump_smart:
957 extract_number_and_incr (&mcnt, &p);
958 printf ("/on_failure_jump_smart to %d", p + mcnt - start);
959 break;
961 case jump:
962 extract_number_and_incr (&mcnt, &p);
963 printf ("/jump to %d", p + mcnt - start);
964 break;
966 case succeed_n:
967 extract_number_and_incr (&mcnt, &p);
968 extract_number_and_incr (&mcnt2, &p);
969 printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
970 break;
972 case jump_n:
973 extract_number_and_incr (&mcnt, &p);
974 extract_number_and_incr (&mcnt2, &p);
975 printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
976 break;
978 case set_number_at:
979 extract_number_and_incr (&mcnt, &p);
980 extract_number_and_incr (&mcnt2, &p);
981 printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
982 break;
984 case wordbound:
985 printf ("/wordbound");
986 break;
988 case notwordbound:
989 printf ("/notwordbound");
990 break;
992 case wordbeg:
993 printf ("/wordbeg");
994 break;
996 case wordend:
997 printf ("/wordend");
999 #ifdef emacs
1000 case before_dot:
1001 printf ("/before_dot");
1002 break;
1004 case at_dot:
1005 printf ("/at_dot");
1006 break;
1008 case after_dot:
1009 printf ("/after_dot");
1010 break;
1012 case syntaxspec:
1013 printf ("/syntaxspec");
1014 mcnt = *p++;
1015 printf ("/%d", mcnt);
1016 break;
1018 case notsyntaxspec:
1019 printf ("/notsyntaxspec");
1020 mcnt = *p++;
1021 printf ("/%d", mcnt);
1022 break;
1023 #endif /* emacs */
1025 case wordchar:
1026 printf ("/wordchar");
1027 break;
1029 case notwordchar:
1030 printf ("/notwordchar");
1031 break;
1033 case begbuf:
1034 printf ("/begbuf");
1035 break;
1037 case endbuf:
1038 printf ("/endbuf");
1039 break;
1041 default:
1042 printf ("?%d", *(p-1));
1045 putchar ('\n');
1048 printf ("%d:\tend of pattern.\n", p - start);
1052 void
1053 print_compiled_pattern (bufp)
1054 struct re_pattern_buffer *bufp;
1056 unsigned char *buffer = bufp->buffer;
1058 print_partial_compiled_pattern (buffer, buffer + bufp->used);
1059 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, bufp->allocated);
1061 if (bufp->fastmap_accurate && bufp->fastmap)
1063 printf ("fastmap: ");
1064 print_fastmap (bufp->fastmap);
1067 printf ("re_nsub: %d\t", bufp->re_nsub);
1068 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1069 printf ("can_be_null: %d\t", bufp->can_be_null);
1070 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1071 printf ("no_sub: %d\t", bufp->no_sub);
1072 printf ("not_bol: %d\t", bufp->not_bol);
1073 printf ("not_eol: %d\t", bufp->not_eol);
1074 printf ("syntax: %d\n", bufp->syntax);
1075 fflush (stdout);
1076 /* Perhaps we should print the translate table? */
1080 void
1081 print_double_string (where, string1, size1, string2, size2)
1082 re_char *where;
1083 re_char *string1;
1084 re_char *string2;
1085 int size1;
1086 int size2;
1088 unsigned this_char;
1090 if (where == NULL)
1091 printf ("(null)");
1092 else
1094 if (FIRST_STRING_P (where))
1096 for (this_char = where - string1; this_char < size1; this_char++)
1097 putchar (string1[this_char]);
1099 where = string2;
1102 for (this_char = where - string2; this_char < size2; this_char++)
1103 putchar (string2[this_char]);
1107 #else /* not DEBUG */
1109 #undef assert
1110 #define assert(e)
1112 #define DEBUG_STATEMENT(e)
1113 #define DEBUG_PRINT1(x)
1114 #define DEBUG_PRINT2(x1, x2)
1115 #define DEBUG_PRINT3(x1, x2, x3)
1116 #define DEBUG_PRINT4(x1, x2, x3, x4)
1117 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1118 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1120 #endif /* not DEBUG */
1122 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1123 also be assigned to arbitrarily: each pattern buffer stores its own
1124 syntax, so it can be changed between regex compilations. */
1125 /* This has no initializer because initialized variables in Emacs
1126 become read-only after dumping. */
1127 reg_syntax_t re_syntax_options;
1130 /* Specify the precise syntax of regexps for compilation. This provides
1131 for compatibility for various utilities which historically have
1132 different, incompatible syntaxes.
1134 The argument SYNTAX is a bit mask comprised of the various bits
1135 defined in regex.h. We return the old syntax. */
1137 reg_syntax_t
1138 re_set_syntax (syntax)
1139 reg_syntax_t syntax;
1141 reg_syntax_t ret = re_syntax_options;
1143 re_syntax_options = syntax;
1144 return ret;
1147 /* This table gives an error message for each of the error codes listed
1148 in regex.h. Obviously the order here has to be same as there.
1149 POSIX doesn't require that we do anything for REG_NOERROR,
1150 but why not be nice? */
1152 static const char *re_error_msgid[] =
1154 gettext_noop ("Success"), /* REG_NOERROR */
1155 gettext_noop ("No match"), /* REG_NOMATCH */
1156 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1157 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1158 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1159 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1160 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1161 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1162 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1163 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1164 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1165 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1166 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1167 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1168 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1169 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1170 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1173 /* Avoiding alloca during matching, to placate r_alloc. */
1175 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1176 searching and matching functions should not call alloca. On some
1177 systems, alloca is implemented in terms of malloc, and if we're
1178 using the relocating allocator routines, then malloc could cause a
1179 relocation, which might (if the strings being searched are in the
1180 ralloc heap) shift the data out from underneath the regexp
1181 routines.
1183 Here's another reason to avoid allocation: Emacs
1184 processes input from X in a signal handler; processing X input may
1185 call malloc; if input arrives while a matching routine is calling
1186 malloc, then we're scrod. But Emacs can't just block input while
1187 calling matching routines; then we don't notice interrupts when
1188 they come in. So, Emacs blocks input around all regexp calls
1189 except the matching calls, which it leaves unprotected, in the
1190 faith that they will not malloc. */
1192 /* Normally, this is fine. */
1193 #define MATCH_MAY_ALLOCATE
1195 /* When using GNU C, we are not REALLY using the C alloca, no matter
1196 what config.h may say. So don't take precautions for it. */
1197 #ifdef __GNUC__
1198 #undef C_ALLOCA
1199 #endif
1201 /* The match routines may not allocate if (1) they would do it with malloc
1202 and (2) it's not safe for them to use malloc.
1203 Note that if REL_ALLOC is defined, matching would not use malloc for the
1204 failure stack, but we would still use it for the register vectors;
1205 so REL_ALLOC should not affect this. */
1206 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1207 #undef MATCH_MAY_ALLOCATE
1208 #endif
1211 /* Failure stack declarations and macros; both re_compile_fastmap and
1212 re_match_2 use a failure stack. These have to be macros because of
1213 REGEX_ALLOCATE_STACK. */
1216 /* Approximate number of failure points for which to initially allocate space
1217 when matching. If this number is exceeded, we allocate more
1218 space, so it is not a hard limit. */
1219 #ifndef INIT_FAILURE_ALLOC
1220 #define INIT_FAILURE_ALLOC 20
1221 #endif
1223 /* Roughly the maximum number of failure points on the stack. Would be
1224 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1225 This is a variable only so users of regex can assign to it; we never
1226 change it ourselves. */
1227 #if defined (MATCH_MAY_ALLOCATE)
1228 /* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1229 whose default stack limit is 2mb. In order for a larger
1230 value to work reliably, you have to try to make it accord
1231 with the process stack limit. */
1232 int re_max_failures = 40000;
1233 #else
1234 int re_max_failures = 4000;
1235 #endif
1237 union fail_stack_elt
1239 const unsigned char *pointer;
1240 unsigned int integer;
1243 typedef union fail_stack_elt fail_stack_elt_t;
1245 typedef struct
1247 fail_stack_elt_t *stack;
1248 unsigned size;
1249 unsigned avail; /* Offset of next open position. */
1250 unsigned frame; /* Offset of the cur constructed frame. */
1251 } fail_stack_type;
1253 #define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
1254 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
1255 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1258 /* Define macros to initialize and free the failure stack.
1259 Do `return -2' if the alloc fails. */
1261 #ifdef MATCH_MAY_ALLOCATE
1262 #define INIT_FAIL_STACK() \
1263 do { \
1264 fail_stack.stack = (fail_stack_elt_t *) \
1265 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1266 * sizeof (fail_stack_elt_t)); \
1268 if (fail_stack.stack == NULL) \
1269 return -2; \
1271 fail_stack.size = INIT_FAILURE_ALLOC; \
1272 fail_stack.avail = 0; \
1273 fail_stack.frame = 0; \
1274 } while (0)
1276 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1277 #else
1278 #define INIT_FAIL_STACK() \
1279 do { \
1280 fail_stack.avail = 0; \
1281 fail_stack.frame = 0; \
1282 } while (0)
1284 #define RESET_FAIL_STACK()
1285 #endif
1288 /* Double the size of FAIL_STACK, up to a limit
1289 which allows approximately `re_max_failures' items.
1291 Return 1 if succeeds, and 0 if either ran out of memory
1292 allocating space for it or it was already too large.
1294 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1296 /* Factor to increase the failure stack size by
1297 when we increase it.
1298 This used to be 2, but 2 was too wasteful
1299 because the old discarded stacks added up to as much space
1300 were as ultimate, maximum-size stack. */
1301 #define FAIL_STACK_GROWTH_FACTOR 4
1303 #define GROW_FAIL_STACK(fail_stack) \
1304 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1305 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1306 ? 0 \
1307 : ((fail_stack).stack \
1308 = (fail_stack_elt_t *) \
1309 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1310 (fail_stack).size * sizeof (fail_stack_elt_t), \
1311 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1312 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1313 * FAIL_STACK_GROWTH_FACTOR))), \
1315 (fail_stack).stack == NULL \
1316 ? 0 \
1317 : ((fail_stack).size \
1318 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1319 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1320 * FAIL_STACK_GROWTH_FACTOR)) \
1321 / sizeof (fail_stack_elt_t)), \
1322 1)))
1325 /* Push pointer POINTER on FAIL_STACK.
1326 Return 1 if was able to do so and 0 if ran out of memory allocating
1327 space to do so. */
1328 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1329 ((FAIL_STACK_FULL () \
1330 && !GROW_FAIL_STACK (FAIL_STACK)) \
1331 ? 0 \
1332 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1334 #define POP_PATTERN_OP() POP_FAILURE_POINTER ()
1336 /* Push a pointer value onto the failure stack.
1337 Assumes the variable `fail_stack'. Probably should only
1338 be called from within `PUSH_FAILURE_POINT'. */
1339 #define PUSH_FAILURE_POINTER(item) \
1340 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1342 /* This pushes an integer-valued item onto the failure stack.
1343 Assumes the variable `fail_stack'. Probably should only
1344 be called from within `PUSH_FAILURE_POINT'. */
1345 #define PUSH_FAILURE_INT(item) \
1346 fail_stack.stack[fail_stack.avail++].integer = (item)
1348 /* Push a fail_stack_elt_t value onto the failure stack.
1349 Assumes the variable `fail_stack'. Probably should only
1350 be called from within `PUSH_FAILURE_POINT'. */
1351 #define PUSH_FAILURE_ELT(item) \
1352 fail_stack.stack[fail_stack.avail++] = (item)
1354 /* These three POP... operations complement the three PUSH... operations.
1355 All assume that `fail_stack' is nonempty. */
1356 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1357 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1358 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1360 /* Individual items aside from the registers. */
1361 #define NUM_NONREG_ITEMS 3
1363 /* Used to examine the stack (to detect infinite loops). */
1364 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
1365 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
1366 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1367 #define TOP_FAILURE_HANDLE() fail_stack.frame
1370 #define ENSURE_FAIL_STACK(space) \
1371 while (REMAINING_AVAIL_SLOTS <= space) { \
1372 if (!GROW_FAIL_STACK (fail_stack)) \
1373 return -2; \
1374 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1375 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1378 /* Push register NUM onto the stack. */
1379 #define PUSH_FAILURE_REG(num) \
1380 do { \
1381 char *destination; \
1382 ENSURE_FAIL_STACK(3); \
1383 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1384 num, regstart[num], regend[num]); \
1385 PUSH_FAILURE_POINTER (regstart[num]); \
1386 PUSH_FAILURE_POINTER (regend[num]); \
1387 PUSH_FAILURE_INT (num); \
1388 } while (0)
1390 /* Pop a saved register off the stack. */
1391 #define POP_FAILURE_REG() \
1392 do { \
1393 int reg = POP_FAILURE_INT (); \
1394 regend[reg] = POP_FAILURE_POINTER (); \
1395 regstart[reg] = POP_FAILURE_POINTER (); \
1396 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1397 reg, regstart[reg], regend[reg]); \
1398 } while (0)
1400 /* Check that we are not stuck in an infinite loop. */
1401 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1402 do { \
1403 int failure = TOP_FAILURE_HANDLE(); \
1404 /* Check for infinite matching loops */ \
1405 while (failure > 0 && \
1406 (FAILURE_STR (failure) == string_place \
1407 || FAILURE_STR (failure) == NULL)) \
1409 assert (FAILURE_PAT (failure) >= bufp->buffer \
1410 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1411 if (FAILURE_PAT (failure) == pat_cur) \
1412 goto fail; \
1413 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1414 failure = NEXT_FAILURE_HANDLE(failure); \
1416 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1417 } while (0)
1419 /* Push the information about the state we will need
1420 if we ever fail back to it.
1422 Requires variables fail_stack, regstart, regend and
1423 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1424 declared.
1426 Does `return FAILURE_CODE' if runs out of memory. */
1428 #define PUSH_FAILURE_POINT(pattern, string_place) \
1429 do { \
1430 char *destination; \
1431 /* Must be int, so when we don't save any registers, the arithmetic \
1432 of 0 + -1 isn't done as unsigned. */ \
1434 DEBUG_STATEMENT (failure_id++); \
1435 DEBUG_STATEMENT (nfailure_points_pushed++); \
1436 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1437 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1438 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1440 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1442 DEBUG_PRINT1 ("\n"); \
1444 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1445 PUSH_FAILURE_INT (fail_stack.frame); \
1447 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1448 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1449 DEBUG_PRINT1 ("'\n"); \
1450 PUSH_FAILURE_POINTER (string_place); \
1452 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1453 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1454 PUSH_FAILURE_POINTER (pattern); \
1456 /* Close the frame by moving the frame pointer past it. */ \
1457 fail_stack.frame = fail_stack.avail; \
1458 } while (0)
1460 /* Estimate the size of data pushed by a typical failure stack entry.
1461 An estimate is all we need, because all we use this for
1462 is to choose a limit for how big to make the failure stack. */
1464 #define TYPICAL_FAILURE_SIZE 20
1466 /* How many items can still be added to the stack without overflowing it. */
1467 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1470 /* Pops what PUSH_FAIL_STACK pushes.
1472 We restore into the parameters, all of which should be lvalues:
1473 STR -- the saved data position.
1474 PAT -- the saved pattern position.
1475 REGSTART, REGEND -- arrays of string positions.
1477 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1478 `pend', `string1', `size1', `string2', and `size2'. */
1480 #define POP_FAILURE_POINT(str, pat) \
1481 do { \
1482 assert (!FAIL_STACK_EMPTY ()); \
1484 /* Remove failure points and point to how many regs pushed. */ \
1485 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1486 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1487 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1489 /* Pop the saved registers. */ \
1490 while (fail_stack.frame < fail_stack.avail) \
1491 POP_FAILURE_REG (); \
1493 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1494 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1495 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1497 /* If the saved string location is NULL, it came from an \
1498 on_failure_keep_string_jump opcode, and we want to throw away the \
1499 saved NULL, thus retaining our current position in the string. */ \
1500 str = (re_char *) POP_FAILURE_POINTER (); \
1501 DEBUG_PRINT2 (" Popping string %p: `", str); \
1502 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1503 DEBUG_PRINT1 ("'\n"); \
1505 fail_stack.frame = POP_FAILURE_INT (); \
1506 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
1508 assert (fail_stack.avail >= 0); \
1509 assert (fail_stack.frame <= fail_stack.avail); \
1511 DEBUG_STATEMENT (nfailure_points_popped++); \
1512 } while (0) /* POP_FAILURE_POINT */
1516 /* Registers are set to a sentinel when they haven't yet matched. */
1517 #define REG_UNSET_VALUE NULL
1518 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1520 /* Subroutine declarations and macros for regex_compile. */
1522 static void store_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc, int arg));
1523 static void store_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1524 int arg1, int arg2));
1525 static void insert_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1526 int arg, unsigned char *end));
1527 static void insert_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1528 int arg1, int arg2, unsigned char *end));
1529 static boolean at_begline_loc_p _RE_ARGS((const unsigned char *pattern,
1530 const unsigned char *p,
1531 reg_syntax_t syntax));
1532 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p,
1533 const unsigned char *pend,
1534 reg_syntax_t syntax));
1536 /* Fetch the next character in the uncompiled pattern---translating it
1537 if necessary. Also cast from a signed character in the constant
1538 string passed to us by the user to an unsigned char that we can use
1539 as an array index (in, e.g., `translate'). */
1540 #ifndef PATFETCH
1541 #define PATFETCH(c) \
1542 do { \
1543 PATFETCH_RAW (c); \
1544 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
1545 } while (0)
1546 #endif
1548 /* Fetch the next character in the uncompiled pattern, with no
1549 translation. */
1550 #define PATFETCH_RAW(c) \
1551 do {if (p == pend) return REG_EEND; \
1552 c = *p++; \
1553 } while (0)
1555 /* Go backwards one character in the pattern. */
1556 #define PATUNFETCH p--
1559 /* If `translate' is non-null, return translate[D], else just D. We
1560 cast the subscript to translate because some data is declared as
1561 `char *', to avoid warnings when a string constant is passed. But
1562 when we use a character as a subscript we must make it unsigned. */
1563 #ifndef TRANSLATE
1564 #define TRANSLATE(d) \
1565 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1566 #endif
1569 /* Macros for outputting the compiled pattern into `buffer'. */
1571 /* If the buffer isn't allocated when it comes in, use this. */
1572 #define INIT_BUF_SIZE 32
1574 /* Make sure we have at least N more bytes of space in buffer. */
1575 #define GET_BUFFER_SPACE(n) \
1576 while (b - bufp->buffer + (n) > bufp->allocated) \
1577 EXTEND_BUFFER ()
1579 /* Make sure we have one more byte of buffer space and then add C to it. */
1580 #define BUF_PUSH(c) \
1581 do { \
1582 GET_BUFFER_SPACE (1); \
1583 *b++ = (unsigned char) (c); \
1584 } while (0)
1587 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1588 #define BUF_PUSH_2(c1, c2) \
1589 do { \
1590 GET_BUFFER_SPACE (2); \
1591 *b++ = (unsigned char) (c1); \
1592 *b++ = (unsigned char) (c2); \
1593 } while (0)
1596 /* As with BUF_PUSH_2, except for three bytes. */
1597 #define BUF_PUSH_3(c1, c2, c3) \
1598 do { \
1599 GET_BUFFER_SPACE (3); \
1600 *b++ = (unsigned char) (c1); \
1601 *b++ = (unsigned char) (c2); \
1602 *b++ = (unsigned char) (c3); \
1603 } while (0)
1606 /* Store a jump with opcode OP at LOC to location TO. We store a
1607 relative address offset by the three bytes the jump itself occupies. */
1608 #define STORE_JUMP(op, loc, to) \
1609 store_op1 (op, loc, (to) - (loc) - 3)
1611 /* Likewise, for a two-argument jump. */
1612 #define STORE_JUMP2(op, loc, to, arg) \
1613 store_op2 (op, loc, (to) - (loc) - 3, arg)
1615 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1616 #define INSERT_JUMP(op, loc, to) \
1617 insert_op1 (op, loc, (to) - (loc) - 3, b)
1619 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1620 #define INSERT_JUMP2(op, loc, to, arg) \
1621 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1624 /* This is not an arbitrary limit: the arguments which represent offsets
1625 into the pattern are two bytes long. So if 2^16 bytes turns out to
1626 be too small, many things would have to change. */
1627 #define MAX_BUF_SIZE (1L << 16)
1630 /* Extend the buffer by twice its current size via realloc and
1631 reset the pointers that pointed into the old block to point to the
1632 correct places in the new one. If extending the buffer results in it
1633 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1634 #define EXTEND_BUFFER() \
1635 do { \
1636 unsigned char *old_buffer = bufp->buffer; \
1637 if (bufp->allocated == MAX_BUF_SIZE) \
1638 return REG_ESIZE; \
1639 bufp->allocated <<= 1; \
1640 if (bufp->allocated > MAX_BUF_SIZE) \
1641 bufp->allocated = MAX_BUF_SIZE; \
1642 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1643 if (bufp->buffer == NULL) \
1644 return REG_ESPACE; \
1645 /* If the buffer moved, move all the pointers into it. */ \
1646 if (old_buffer != bufp->buffer) \
1648 b = (b - old_buffer) + bufp->buffer; \
1649 begalt = (begalt - old_buffer) + bufp->buffer; \
1650 if (fixup_alt_jump) \
1651 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1652 if (laststart) \
1653 laststart = (laststart - old_buffer) + bufp->buffer; \
1654 if (pending_exact) \
1655 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1657 } while (0)
1660 /* Since we have one byte reserved for the register number argument to
1661 {start,stop}_memory, the maximum number of groups we can report
1662 things about is what fits in that byte. */
1663 #define MAX_REGNUM 255
1665 /* But patterns can have more than `MAX_REGNUM' registers. We just
1666 ignore the excess. */
1667 typedef unsigned regnum_t;
1670 /* Macros for the compile stack. */
1672 /* Since offsets can go either forwards or backwards, this type needs to
1673 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1674 typedef int pattern_offset_t;
1676 typedef struct
1678 pattern_offset_t begalt_offset;
1679 pattern_offset_t fixup_alt_jump;
1680 pattern_offset_t laststart_offset;
1681 regnum_t regnum;
1682 } compile_stack_elt_t;
1685 typedef struct
1687 compile_stack_elt_t *stack;
1688 unsigned size;
1689 unsigned avail; /* Offset of next open position. */
1690 } compile_stack_type;
1693 #define INIT_COMPILE_STACK_SIZE 32
1695 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1696 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1698 /* The next available element. */
1699 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1702 /* Structure to manage work area for range table. */
1703 struct range_table_work_area
1705 int *table; /* actual work area. */
1706 int allocated; /* allocated size for work area in bytes. */
1707 int used; /* actually used size in words. */
1708 int bits; /* flag to record character classes */
1711 /* Make sure that WORK_AREA can hold more N multibyte characters. */
1712 #define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1713 do { \
1714 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1716 (work_area).allocated += 16 * sizeof (int); \
1717 if ((work_area).table) \
1718 (work_area).table \
1719 = (int *) realloc ((work_area).table, (work_area).allocated); \
1720 else \
1721 (work_area).table \
1722 = (int *) malloc ((work_area).allocated); \
1723 if ((work_area).table == 0) \
1724 FREE_STACK_RETURN (REG_ESPACE); \
1726 } while (0)
1728 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1729 (work_area).bits |= (bit)
1731 /* These bits represent the various character classes such as [:alnum:]
1732 in a charset's range table. */
1733 #define BIT_ALNUM 0x1
1734 #define BIT_ALPHA 0x2
1735 #define BIT_WORD 0x4
1736 #define BIT_ASCII 0x8
1737 #define BIT_NONASCII 0x10
1738 #define BIT_GRAPH 0x20
1739 #define BIT_LOWER 0x40
1740 #define BIT_PRINT 0x80
1741 #define BIT_PUNCT 0x100
1742 #define BIT_SPACE 0x200
1743 #define BIT_UPPER 0x400
1744 #define BIT_UNIBYTE 0x800
1745 #define BIT_MULTIBYTE 0x1000
1747 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1748 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1749 do { \
1750 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1751 (work_area).table[(work_area).used++] = (range_start); \
1752 (work_area).table[(work_area).used++] = (range_end); \
1753 } while (0)
1755 /* Free allocated memory for WORK_AREA. */
1756 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1757 do { \
1758 if ((work_area).table) \
1759 free ((work_area).table); \
1760 } while (0)
1762 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1763 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1764 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1765 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1768 /* Set the bit for character C in a list. */
1769 #define SET_LIST_BIT(c) \
1770 (b[((unsigned char) (c)) / BYTEWIDTH] \
1771 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1774 /* Get the next unsigned number in the uncompiled pattern. */
1775 #define GET_UNSIGNED_NUMBER(num) \
1776 do { if (p != pend) \
1778 PATFETCH (c); \
1779 while (ISDIGIT (c)) \
1781 if (num < 0) \
1782 num = 0; \
1783 num = num * 10 + c - '0'; \
1784 if (p == pend) \
1785 break; \
1786 PATFETCH (c); \
1789 } while (0)
1791 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1793 #define IS_CHAR_CLASS(string) \
1794 (STREQ (string, "alpha") || STREQ (string, "upper") \
1795 || STREQ (string, "lower") || STREQ (string, "digit") \
1796 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1797 || STREQ (string, "space") || STREQ (string, "print") \
1798 || STREQ (string, "punct") || STREQ (string, "graph") \
1799 || STREQ (string, "cntrl") || STREQ (string, "blank") \
1800 || STREQ (string, "word") \
1801 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1802 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
1804 /* QUIT is only used on NTemacs. */
1805 #if !defined (WINDOWSNT) || !defined (emacs)
1806 #undef QUIT
1807 #define QUIT
1808 #endif
1810 #ifndef MATCH_MAY_ALLOCATE
1812 /* If we cannot allocate large objects within re_match_2_internal,
1813 we make the fail stack and register vectors global.
1814 The fail stack, we grow to the maximum size when a regexp
1815 is compiled.
1816 The register vectors, we adjust in size each time we
1817 compile a regexp, according to the number of registers it needs. */
1819 static fail_stack_type fail_stack;
1821 /* Size with which the following vectors are currently allocated.
1822 That is so we can make them bigger as needed,
1823 but never make them smaller. */
1824 static int regs_allocated_size;
1826 static re_char ** regstart, ** regend;
1827 static re_char **best_regstart, **best_regend;
1829 /* Make the register vectors big enough for NUM_REGS registers,
1830 but don't make them smaller. */
1832 static
1833 regex_grow_registers (num_regs)
1834 int num_regs;
1836 if (num_regs > regs_allocated_size)
1838 RETALLOC_IF (regstart, num_regs, re_char *);
1839 RETALLOC_IF (regend, num_regs, re_char *);
1840 RETALLOC_IF (best_regstart, num_regs, re_char *);
1841 RETALLOC_IF (best_regend, num_regs, re_char *);
1843 regs_allocated_size = num_regs;
1847 #endif /* not MATCH_MAY_ALLOCATE */
1849 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1850 compile_stack,
1851 regnum_t regnum));
1853 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1854 Returns one of error codes defined in `regex.h', or zero for success.
1856 Assumes the `allocated' (and perhaps `buffer') and `translate'
1857 fields are set in BUFP on entry.
1859 If it succeeds, results are put in BUFP (if it returns an error, the
1860 contents of BUFP are undefined):
1861 `buffer' is the compiled pattern;
1862 `syntax' is set to SYNTAX;
1863 `used' is set to the length of the compiled pattern;
1864 `fastmap_accurate' is zero;
1865 `re_nsub' is the number of subexpressions in PATTERN;
1866 `not_bol' and `not_eol' are zero;
1868 The `fastmap' and `newline_anchor' fields are neither
1869 examined nor set. */
1871 /* Insert the `jump' from the end of last alternative to "here".
1872 The space for the jump has already been allocated. */
1873 #define FIXUP_ALT_JUMP() \
1874 do { \
1875 if (fixup_alt_jump) \
1876 STORE_JUMP (jump, fixup_alt_jump, b); \
1877 } while (0)
1880 /* Return, freeing storage we allocated. */
1881 #define FREE_STACK_RETURN(value) \
1882 do { \
1883 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1884 free (compile_stack.stack); \
1885 return value; \
1886 } while (0)
1888 static reg_errcode_t
1889 regex_compile (pattern, size, syntax, bufp)
1890 re_char *pattern;
1891 int size;
1892 reg_syntax_t syntax;
1893 struct re_pattern_buffer *bufp;
1895 /* We fetch characters from PATTERN here. Even though PATTERN is
1896 `char *' (i.e., signed), we declare these variables as unsigned, so
1897 they can be reliably used as array indices. */
1898 register unsigned int c, c1;
1900 /* A random temporary spot in PATTERN. */
1901 re_char *p1;
1903 /* Points to the end of the buffer, where we should append. */
1904 register unsigned char *b;
1906 /* Keeps track of unclosed groups. */
1907 compile_stack_type compile_stack;
1909 /* Points to the current (ending) position in the pattern. */
1910 #ifdef AIX
1911 /* `const' makes AIX compiler fail. */
1912 unsigned char *p = pattern;
1913 #else
1914 re_char *p = pattern;
1915 #endif
1916 re_char *pend = pattern + size;
1918 /* How to translate the characters in the pattern. */
1919 RE_TRANSLATE_TYPE translate = bufp->translate;
1921 /* Address of the count-byte of the most recently inserted `exactn'
1922 command. This makes it possible to tell if a new exact-match
1923 character can be added to that command or if the character requires
1924 a new `exactn' command. */
1925 unsigned char *pending_exact = 0;
1927 /* Address of start of the most recently finished expression.
1928 This tells, e.g., postfix * where to find the start of its
1929 operand. Reset at the beginning of groups and alternatives. */
1930 unsigned char *laststart = 0;
1932 /* Address of beginning of regexp, or inside of last group. */
1933 unsigned char *begalt;
1935 /* Place in the uncompiled pattern (i.e., the {) to
1936 which to go back if the interval is invalid. */
1937 re_char *beg_interval;
1939 /* Address of the place where a forward jump should go to the end of
1940 the containing expression. Each alternative of an `or' -- except the
1941 last -- ends with a forward jump of this sort. */
1942 unsigned char *fixup_alt_jump = 0;
1944 /* Counts open-groups as they are encountered. Remembered for the
1945 matching close-group on the compile stack, so the same register
1946 number is put in the stop_memory as the start_memory. */
1947 regnum_t regnum = 0;
1949 /* Work area for range table of charset. */
1950 struct range_table_work_area range_table_work;
1952 #ifdef DEBUG
1953 debug++;
1954 DEBUG_PRINT1 ("\nCompiling pattern: ");
1955 if (debug > 0)
1957 unsigned debug_count;
1959 for (debug_count = 0; debug_count < size; debug_count++)
1960 putchar (pattern[debug_count]);
1961 putchar ('\n');
1963 #endif /* DEBUG */
1965 /* Initialize the compile stack. */
1966 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1967 if (compile_stack.stack == NULL)
1968 return REG_ESPACE;
1970 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1971 compile_stack.avail = 0;
1973 range_table_work.table = 0;
1974 range_table_work.allocated = 0;
1976 /* Initialize the pattern buffer. */
1977 bufp->syntax = syntax;
1978 bufp->fastmap_accurate = 0;
1979 bufp->not_bol = bufp->not_eol = 0;
1981 /* Set `used' to zero, so that if we return an error, the pattern
1982 printer (for debugging) will think there's no pattern. We reset it
1983 at the end. */
1984 bufp->used = 0;
1986 /* Always count groups, whether or not bufp->no_sub is set. */
1987 bufp->re_nsub = 0;
1989 #ifdef emacs
1990 /* bufp->multibyte is set before regex_compile is called, so don't alter
1991 it. */
1992 #else /* not emacs */
1993 /* Nothing is recognized as a multibyte character. */
1994 bufp->multibyte = 0;
1995 #endif
1997 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1998 /* Initialize the syntax table. */
1999 init_syntax_once ();
2000 #endif
2002 if (bufp->allocated == 0)
2004 if (bufp->buffer)
2005 { /* If zero allocated, but buffer is non-null, try to realloc
2006 enough space. This loses if buffer's address is bogus, but
2007 that is the user's responsibility. */
2008 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2010 else
2011 { /* Caller did not allocate a buffer. Do it for them. */
2012 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2014 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2016 bufp->allocated = INIT_BUF_SIZE;
2019 begalt = b = bufp->buffer;
2021 /* Loop through the uncompiled pattern until we're at the end. */
2022 while (p != pend)
2024 PATFETCH (c);
2026 switch (c)
2028 case '^':
2030 if ( /* If at start of pattern, it's an operator. */
2031 p == pattern + 1
2032 /* If context independent, it's an operator. */
2033 || syntax & RE_CONTEXT_INDEP_ANCHORS
2034 /* Otherwise, depends on what's come before. */
2035 || at_begline_loc_p (pattern, p, syntax))
2036 BUF_PUSH (begline);
2037 else
2038 goto normal_char;
2040 break;
2043 case '$':
2045 if ( /* If at end of pattern, it's an operator. */
2046 p == pend
2047 /* If context independent, it's an operator. */
2048 || syntax & RE_CONTEXT_INDEP_ANCHORS
2049 /* Otherwise, depends on what's next. */
2050 || at_endline_loc_p (p, pend, syntax))
2051 BUF_PUSH (endline);
2052 else
2053 goto normal_char;
2055 break;
2058 case '+':
2059 case '?':
2060 if ((syntax & RE_BK_PLUS_QM)
2061 || (syntax & RE_LIMITED_OPS))
2062 goto normal_char;
2063 handle_plus:
2064 case '*':
2065 /* If there is no previous pattern... */
2066 if (!laststart)
2068 if (syntax & RE_CONTEXT_INVALID_OPS)
2069 FREE_STACK_RETURN (REG_BADRPT);
2070 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2071 goto normal_char;
2075 /* Are we optimizing this jump? */
2076 boolean keep_string_p = false;
2078 /* 1 means zero (many) matches is allowed. */
2079 boolean zero_times_ok = 0, many_times_ok = 0;
2080 boolean greedy = 1;
2082 /* If there is a sequence of repetition chars, collapse it
2083 down to just one (the right one). We can't combine
2084 interval operators with these because of, e.g., `a{2}*',
2085 which should only match an even number of `a's. */
2087 for (;;)
2089 if (!(syntax & RE_ALL_GREEDY)
2090 && c == '?' && (zero_times_ok || many_times_ok))
2091 greedy = 0;
2092 else
2094 zero_times_ok |= c != '+';
2095 many_times_ok |= c != '?';
2098 if (p == pend)
2099 break;
2101 PATFETCH (c);
2103 if (c == '*'
2104 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2107 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2109 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2111 PATFETCH (c1);
2112 if (!(c1 == '+' || c1 == '?'))
2114 PATUNFETCH;
2115 PATUNFETCH;
2116 break;
2119 c = c1;
2121 else
2123 PATUNFETCH;
2124 break;
2127 /* If we get here, we found another repeat character. */
2130 /* Star, etc. applied to an empty pattern is equivalent
2131 to an empty pattern. */
2132 if (!laststart)
2133 break;
2135 /* Now we know whether or not zero matches is allowed
2136 and also whether or not two or more matches is allowed. */
2137 if (greedy)
2139 if (many_times_ok)
2140 { /* More than one repetition is allowed, so put in at the
2141 end a backward relative jump from `b' to before the next
2142 jump we're going to put in below (which jumps from
2143 laststart to after this jump).
2145 But if we are at the `*' in the exact sequence `.*\n',
2146 insert an unconditional jump backwards to the .,
2147 instead of the beginning of the loop. This way we only
2148 push a failure point once, instead of every time
2149 through the loop. */
2150 assert (p - 1 > pattern);
2152 /* Allocate the space for the jump. */
2153 GET_BUFFER_SPACE (3);
2155 /* We know we are not at the first character of the pattern,
2156 because laststart was nonzero. And we've already
2157 incremented `p', by the way, to be the character after
2158 the `*'. Do we have to do something analogous here
2159 for null bytes, because of RE_DOT_NOT_NULL? */
2160 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2161 && zero_times_ok
2162 && p < pend
2163 && TRANSLATE (*p) == TRANSLATE ('\n')
2164 && !(syntax & RE_DOT_NEWLINE))
2165 { /* We have .*\n. */
2166 STORE_JUMP (jump, b, laststart);
2167 keep_string_p = true;
2169 else
2170 STORE_JUMP (jump, b, laststart - 3);
2172 /* We've added more stuff to the buffer. */
2173 b += 3;
2176 /* On failure, jump from laststart to b + 3, which will be the
2177 end of the buffer after this jump is inserted. */
2178 GET_BUFFER_SPACE (3);
2179 if (!zero_times_ok)
2181 assert (many_times_ok);
2182 INSERT_JUMP (on_failure_jump_smart, b - 3, b + 3);
2183 pending_exact = 0;
2184 b += 3;
2186 else
2188 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2189 : !many_times_ok ?
2190 on_failure_jump : on_failure_jump_smart,
2191 laststart, b + 3);
2192 pending_exact = 0;
2193 b += 3;
2196 else /* not greedy */
2197 { /* I wish the greedy and non-greedy cases could be merged. */
2199 if (many_times_ok)
2201 /* The non-greedy multiple match looks like a repeat..until:
2202 we only need a conditional jump at the end of the loop */
2203 GET_BUFFER_SPACE (3);
2204 STORE_JUMP (on_failure_jump, b, laststart);
2205 b += 3;
2206 if (zero_times_ok)
2208 /* The repeat...until naturally matches one or more.
2209 To also match zero times, we need to first jump to
2210 the end of the loop (its conditional jump). */
2211 GET_BUFFER_SPACE (3);
2212 INSERT_JUMP (jump, laststart, b);
2213 b += 3;
2216 else
2218 /* non-greedy a?? */
2219 GET_BUFFER_SPACE (6);
2220 INSERT_JUMP (jump, laststart, b + 3);
2221 b += 3;
2222 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2223 b += 3;
2227 break;
2230 case '.':
2231 laststart = b;
2232 BUF_PUSH (anychar);
2233 break;
2236 case '[':
2238 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2240 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2242 /* Ensure that we have enough space to push a charset: the
2243 opcode, the length count, and the bitset; 34 bytes in all. */
2244 GET_BUFFER_SPACE (34);
2246 laststart = b;
2248 /* We test `*p == '^' twice, instead of using an if
2249 statement, so we only need one BUF_PUSH. */
2250 BUF_PUSH (*p == '^' ? charset_not : charset);
2251 if (*p == '^')
2252 p++;
2254 /* Remember the first position in the bracket expression. */
2255 p1 = p;
2257 /* Push the number of bytes in the bitmap. */
2258 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2260 /* Clear the whole map. */
2261 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2263 /* charset_not matches newline according to a syntax bit. */
2264 if ((re_opcode_t) b[-2] == charset_not
2265 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2266 SET_LIST_BIT ('\n');
2268 /* Read in characters and ranges, setting map bits. */
2269 for (;;)
2271 int len;
2272 boolean escaped_char = false;
2274 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2276 PATFETCH (c);
2278 /* \ might escape characters inside [...] and [^...]. */
2279 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2281 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2283 PATFETCH (c);
2284 escaped_char = true;
2286 else
2288 /* Could be the end of the bracket expression. If it's
2289 not (i.e., when the bracket expression is `[]' so
2290 far), the ']' character bit gets set way below. */
2291 if (c == ']' && p != p1 + 1)
2292 break;
2295 /* If C indicates start of multibyte char, get the
2296 actual character code in C, and set the pattern
2297 pointer P to the next character boundary. */
2298 if (bufp->multibyte && BASE_LEADING_CODE_P (c))
2300 PATUNFETCH;
2301 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2302 p += len;
2304 /* What should we do for the character which is
2305 greater than 0x7F, but not BASE_LEADING_CODE_P?
2306 XXX */
2308 /* See if we're at the beginning of a possible character
2309 class. */
2311 else if (!escaped_char &&
2312 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2314 /* Leave room for the null. */
2315 char str[CHAR_CLASS_MAX_LENGTH + 1];
2317 PATFETCH (c);
2318 c1 = 0;
2320 /* If pattern is `[[:'. */
2321 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2323 for (;;)
2325 PATFETCH (c);
2326 if (c == ':' || c == ']' || p == pend
2327 || c1 == CHAR_CLASS_MAX_LENGTH)
2328 break;
2329 str[c1++] = c;
2331 str[c1] = '\0';
2333 /* If isn't a word bracketed by `[:' and `:]':
2334 undo the ending character, the letters, and
2335 leave the leading `:' and `[' (but set bits for
2336 them). */
2337 if (c == ':' && *p == ']')
2339 int ch;
2340 boolean is_alnum = STREQ (str, "alnum");
2341 boolean is_alpha = STREQ (str, "alpha");
2342 boolean is_ascii = STREQ (str, "ascii");
2343 boolean is_blank = STREQ (str, "blank");
2344 boolean is_cntrl = STREQ (str, "cntrl");
2345 boolean is_digit = STREQ (str, "digit");
2346 boolean is_graph = STREQ (str, "graph");
2347 boolean is_lower = STREQ (str, "lower");
2348 boolean is_multibyte = STREQ (str, "multibyte");
2349 boolean is_nonascii = STREQ (str, "nonascii");
2350 boolean is_print = STREQ (str, "print");
2351 boolean is_punct = STREQ (str, "punct");
2352 boolean is_space = STREQ (str, "space");
2353 boolean is_unibyte = STREQ (str, "unibyte");
2354 boolean is_upper = STREQ (str, "upper");
2355 boolean is_word = STREQ (str, "word");
2356 boolean is_xdigit = STREQ (str, "xdigit");
2358 if (!IS_CHAR_CLASS (str))
2359 FREE_STACK_RETURN (REG_ECTYPE);
2361 /* Throw away the ] at the end of the character
2362 class. */
2363 PATFETCH (c);
2365 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2367 /* Most character classes in a multibyte match
2368 just set a flag. Exceptions are is_blank,
2369 is_digit, is_cntrl, and is_xdigit, since
2370 they can only match ASCII characters. We
2371 don't need to handle them for multibyte. */
2373 if (bufp->multibyte)
2375 int bit = 0;
2377 if (is_alnum) bit = BIT_ALNUM;
2378 if (is_alpha) bit = BIT_ALPHA;
2379 if (is_ascii) bit = BIT_ASCII;
2380 if (is_graph) bit = BIT_GRAPH;
2381 if (is_lower) bit = BIT_LOWER;
2382 if (is_multibyte) bit = BIT_MULTIBYTE;
2383 if (is_nonascii) bit = BIT_NONASCII;
2384 if (is_print) bit = BIT_PRINT;
2385 if (is_punct) bit = BIT_PUNCT;
2386 if (is_space) bit = BIT_SPACE;
2387 if (is_unibyte) bit = BIT_UNIBYTE;
2388 if (is_upper) bit = BIT_UPPER;
2389 if (is_word) bit = BIT_WORD;
2390 if (bit)
2391 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2392 bit);
2395 /* Handle character classes for ASCII characters. */
2396 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2398 int translated = TRANSLATE (ch);
2399 /* This was split into 3 if's to
2400 avoid an arbitrary limit in some compiler. */
2401 if ( (is_alnum && ISALNUM (ch))
2402 || (is_alpha && ISALPHA (ch))
2403 || (is_blank && ISBLANK (ch))
2404 || (is_cntrl && ISCNTRL (ch)))
2405 SET_LIST_BIT (translated);
2406 if ( (is_digit && ISDIGIT (ch))
2407 || (is_graph && ISGRAPH (ch))
2408 || (is_lower && ISLOWER (ch))
2409 || (is_print && ISPRINT (ch)))
2410 SET_LIST_BIT (translated);
2411 if ( (is_punct && ISPUNCT (ch))
2412 || (is_space && ISSPACE (ch))
2413 || (is_upper && ISUPPER (ch))
2414 || (is_xdigit && ISXDIGIT (ch)))
2415 SET_LIST_BIT (translated);
2416 if ( (is_ascii && IS_REAL_ASCII (ch))
2417 || (is_nonascii && !IS_REAL_ASCII (ch))
2418 || (is_unibyte && ISUNIBYTE (ch))
2419 || (is_multibyte && !ISUNIBYTE (ch)))
2420 SET_LIST_BIT (translated);
2422 if ( (is_word && ISWORD (ch)))
2423 SET_LIST_BIT (translated);
2426 /* Repeat the loop. */
2427 continue;
2429 else
2431 c1++;
2432 while (c1--)
2433 PATUNFETCH;
2434 SET_LIST_BIT ('[');
2436 /* Because the `:' may starts the range, we
2437 can't simply set bit and repeat the loop.
2438 Instead, just set it to C and handle below. */
2439 c = ':';
2443 if (p < pend && p[0] == '-' && p[1] != ']')
2446 /* Discard the `-'. */
2447 PATFETCH (c1);
2449 /* Fetch the character which ends the range. */
2450 PATFETCH (c1);
2451 if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
2453 PATUNFETCH;
2454 c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2455 p += len;
2458 if (SINGLE_BYTE_CHAR_P (c)
2459 && ! SINGLE_BYTE_CHAR_P (c1))
2461 /* Handle a range such as \177-\377 in multibyte mode.
2462 Split that into two ranges,,
2463 the low one ending at 0237, and the high one
2464 starting at ...040. */
2465 /* Unless I'm missing something,
2466 this line is useless. -sm
2467 int c1_base = (c1 & ~0177) | 040; */
2468 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2469 c1 = 0237;
2471 else if (!SAME_CHARSET_P (c, c1))
2472 FREE_STACK_RETURN (REG_ERANGE);
2474 else
2475 /* Range from C to C. */
2476 c1 = c;
2478 /* Set the range ... */
2479 if (SINGLE_BYTE_CHAR_P (c))
2480 /* ... into bitmap. */
2482 unsigned this_char;
2483 int range_start = c, range_end = c1;
2485 /* If the start is after the end, the range is empty. */
2486 if (range_start > range_end)
2488 if (syntax & RE_NO_EMPTY_RANGES)
2489 FREE_STACK_RETURN (REG_ERANGE);
2490 /* Else, repeat the loop. */
2492 else
2494 for (this_char = range_start; this_char <= range_end;
2495 this_char++)
2496 SET_LIST_BIT (TRANSLATE (this_char));
2499 else
2500 /* ... into range table. */
2501 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2504 /* Discard any (non)matching list bytes that are all 0 at the
2505 end of the map. Decrease the map-length byte too. */
2506 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2507 b[-1]--;
2508 b += b[-1];
2510 /* Build real range table from work area. */
2511 if (RANGE_TABLE_WORK_USED (range_table_work)
2512 || RANGE_TABLE_WORK_BITS (range_table_work))
2514 int i;
2515 int used = RANGE_TABLE_WORK_USED (range_table_work);
2517 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2518 bytes for flags, two for COUNT, and three bytes for
2519 each character. */
2520 GET_BUFFER_SPACE (4 + used * 3);
2522 /* Indicate the existence of range table. */
2523 laststart[1] |= 0x80;
2525 /* Store the character class flag bits into the range table.
2526 If not in emacs, these flag bits are always 0. */
2527 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2528 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2530 STORE_NUMBER_AND_INCR (b, used / 2);
2531 for (i = 0; i < used; i++)
2532 STORE_CHARACTER_AND_INCR
2533 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2536 break;
2539 case '(':
2540 if (syntax & RE_NO_BK_PARENS)
2541 goto handle_open;
2542 else
2543 goto normal_char;
2546 case ')':
2547 if (syntax & RE_NO_BK_PARENS)
2548 goto handle_close;
2549 else
2550 goto normal_char;
2553 case '\n':
2554 if (syntax & RE_NEWLINE_ALT)
2555 goto handle_alt;
2556 else
2557 goto normal_char;
2560 case '|':
2561 if (syntax & RE_NO_BK_VBAR)
2562 goto handle_alt;
2563 else
2564 goto normal_char;
2567 case '{':
2568 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2569 goto handle_interval;
2570 else
2571 goto normal_char;
2574 case '\\':
2575 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2577 /* Do not translate the character after the \, so that we can
2578 distinguish, e.g., \B from \b, even if we normally would
2579 translate, e.g., B to b. */
2580 PATFETCH_RAW (c);
2582 switch (c)
2584 case '(':
2585 if (syntax & RE_NO_BK_PARENS)
2586 goto normal_backslash;
2588 handle_open:
2590 int shy = 0;
2591 if (p+1 < pend)
2593 /* Look for a special (?...) construct */
2594 PATFETCH (c);
2595 if ((syntax & RE_SHY_GROUPS) && c == '?')
2597 PATFETCH (c);
2598 switch (c)
2600 case ':': shy = 1; break;
2601 default:
2602 /* Only (?:...) is supported right now. */
2603 FREE_STACK_RETURN (REG_BADPAT);
2606 else PATUNFETCH;
2609 if (!shy)
2611 bufp->re_nsub++;
2612 regnum++;
2615 if (COMPILE_STACK_FULL)
2617 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2618 compile_stack_elt_t);
2619 if (compile_stack.stack == NULL) return REG_ESPACE;
2621 compile_stack.size <<= 1;
2624 /* These are the values to restore when we hit end of this
2625 group. They are all relative offsets, so that if the
2626 whole pattern moves because of realloc, they will still
2627 be valid. */
2628 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2629 COMPILE_STACK_TOP.fixup_alt_jump
2630 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2631 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2632 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
2634 /* Do not push a
2635 start_memory for groups beyond the last one we can
2636 represent in the compiled pattern. */
2637 if (regnum <= MAX_REGNUM && !shy)
2638 BUF_PUSH_2 (start_memory, regnum);
2640 compile_stack.avail++;
2642 fixup_alt_jump = 0;
2643 laststart = 0;
2644 begalt = b;
2645 /* If we've reached MAX_REGNUM groups, then this open
2646 won't actually generate any code, so we'll have to
2647 clear pending_exact explicitly. */
2648 pending_exact = 0;
2649 break;
2652 case ')':
2653 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2655 if (COMPILE_STACK_EMPTY)
2657 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2658 goto normal_backslash;
2659 else
2660 FREE_STACK_RETURN (REG_ERPAREN);
2663 handle_close:
2664 FIXUP_ALT_JUMP ();
2666 /* See similar code for backslashed left paren above. */
2667 if (COMPILE_STACK_EMPTY)
2669 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2670 goto normal_char;
2671 else
2672 FREE_STACK_RETURN (REG_ERPAREN);
2675 /* Since we just checked for an empty stack above, this
2676 ``can't happen''. */
2677 assert (compile_stack.avail != 0);
2679 /* We don't just want to restore into `regnum', because
2680 later groups should continue to be numbered higher,
2681 as in `(ab)c(de)' -- the second group is #2. */
2682 regnum_t this_group_regnum;
2684 compile_stack.avail--;
2685 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2686 fixup_alt_jump
2687 = COMPILE_STACK_TOP.fixup_alt_jump
2688 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2689 : 0;
2690 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2691 this_group_regnum = COMPILE_STACK_TOP.regnum;
2692 /* If we've reached MAX_REGNUM groups, then this open
2693 won't actually generate any code, so we'll have to
2694 clear pending_exact explicitly. */
2695 pending_exact = 0;
2697 /* We're at the end of the group, so now we know how many
2698 groups were inside this one. */
2699 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
2700 BUF_PUSH_2 (stop_memory, this_group_regnum);
2702 break;
2705 case '|': /* `\|'. */
2706 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2707 goto normal_backslash;
2708 handle_alt:
2709 if (syntax & RE_LIMITED_OPS)
2710 goto normal_char;
2712 /* Insert before the previous alternative a jump which
2713 jumps to this alternative if the former fails. */
2714 GET_BUFFER_SPACE (3);
2715 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2716 pending_exact = 0;
2717 b += 3;
2719 /* The alternative before this one has a jump after it
2720 which gets executed if it gets matched. Adjust that
2721 jump so it will jump to this alternative's analogous
2722 jump (put in below, which in turn will jump to the next
2723 (if any) alternative's such jump, etc.). The last such
2724 jump jumps to the correct final destination. A picture:
2725 _____ _____
2726 | | | |
2727 | v | v
2728 a | b | c
2730 If we are at `b', then fixup_alt_jump right now points to a
2731 three-byte space after `a'. We'll put in the jump, set
2732 fixup_alt_jump to right after `b', and leave behind three
2733 bytes which we'll fill in when we get to after `c'. */
2735 FIXUP_ALT_JUMP ();
2737 /* Mark and leave space for a jump after this alternative,
2738 to be filled in later either by next alternative or
2739 when know we're at the end of a series of alternatives. */
2740 fixup_alt_jump = b;
2741 GET_BUFFER_SPACE (3);
2742 b += 3;
2744 laststart = 0;
2745 begalt = b;
2746 break;
2749 case '{':
2750 /* If \{ is a literal. */
2751 if (!(syntax & RE_INTERVALS)
2752 /* If we're at `\{' and it's not the open-interval
2753 operator. */
2754 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2755 || (p - 2 == pattern && p == pend))
2756 goto normal_backslash;
2758 handle_interval:
2760 /* If got here, then the syntax allows intervals. */
2762 /* At least (most) this many matches must be made. */
2763 int lower_bound = 0, upper_bound = -1;
2765 beg_interval = p - 1;
2767 if (p == pend)
2769 if (syntax & RE_NO_BK_BRACES)
2770 goto unfetch_interval;
2771 else
2772 FREE_STACK_RETURN (REG_EBRACE);
2775 GET_UNSIGNED_NUMBER (lower_bound);
2777 if (c == ',')
2779 GET_UNSIGNED_NUMBER (upper_bound);
2780 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2782 else
2783 /* Interval such as `{1}' => match exactly once. */
2784 upper_bound = lower_bound;
2786 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2787 || lower_bound > upper_bound)
2789 if (syntax & RE_NO_BK_BRACES)
2790 goto unfetch_interval;
2791 else
2792 FREE_STACK_RETURN (REG_BADBR);
2795 if (!(syntax & RE_NO_BK_BRACES))
2797 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2799 PATFETCH (c);
2802 if (c != '}')
2804 if (syntax & RE_NO_BK_BRACES)
2805 goto unfetch_interval;
2806 else
2807 FREE_STACK_RETURN (REG_BADBR);
2810 /* We just parsed a valid interval. */
2812 /* If it's invalid to have no preceding re. */
2813 if (!laststart)
2815 if (syntax & RE_CONTEXT_INVALID_OPS)
2816 FREE_STACK_RETURN (REG_BADRPT);
2817 else if (syntax & RE_CONTEXT_INDEP_OPS)
2818 laststart = b;
2819 else
2820 goto unfetch_interval;
2823 /* If the upper bound is zero, don't want to succeed at
2824 all; jump from `laststart' to `b + 3', which will be
2825 the end of the buffer after we insert the jump. */
2826 if (upper_bound == 0)
2828 GET_BUFFER_SPACE (3);
2829 INSERT_JUMP (jump, laststart, b + 3);
2830 b += 3;
2833 /* Otherwise, we have a nontrivial interval. When
2834 we're all done, the pattern will look like:
2835 set_number_at <jump count> <upper bound>
2836 set_number_at <succeed_n count> <lower bound>
2837 succeed_n <after jump addr> <succeed_n count>
2838 <body of loop>
2839 jump_n <succeed_n addr> <jump count>
2840 (The upper bound and `jump_n' are omitted if
2841 `upper_bound' is 1, though.) */
2842 else
2843 { /* If the upper bound is > 1, we need to insert
2844 more at the end of the loop. */
2845 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2847 GET_BUFFER_SPACE (nbytes);
2849 /* Initialize lower bound of the `succeed_n', even
2850 though it will be set during matching by its
2851 attendant `set_number_at' (inserted next),
2852 because `re_compile_fastmap' needs to know.
2853 Jump to the `jump_n' we might insert below. */
2854 INSERT_JUMP2 (succeed_n, laststart,
2855 b + 5 + (upper_bound > 1) * 5,
2856 lower_bound);
2857 b += 5;
2859 /* Code to initialize the lower bound. Insert
2860 before the `succeed_n'. The `5' is the last two
2861 bytes of this `set_number_at', plus 3 bytes of
2862 the following `succeed_n'. */
2863 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2864 b += 5;
2866 if (upper_bound > 1)
2867 { /* More than one repetition is allowed, so
2868 append a backward jump to the `succeed_n'
2869 that starts this interval.
2871 When we've reached this during matching,
2872 we'll have matched the interval once, so
2873 jump back only `upper_bound - 1' times. */
2874 STORE_JUMP2 (jump_n, b, laststart + 5,
2875 upper_bound - 1);
2876 b += 5;
2878 /* The location we want to set is the second
2879 parameter of the `jump_n'; that is `b-2' as
2880 an absolute address. `laststart' will be
2881 the `set_number_at' we're about to insert;
2882 `laststart+3' the number to set, the source
2883 for the relative address. But we are
2884 inserting into the middle of the pattern --
2885 so everything is getting moved up by 5.
2886 Conclusion: (b - 2) - (laststart + 3) + 5,
2887 i.e., b - laststart.
2889 We insert this at the beginning of the loop
2890 so that if we fail during matching, we'll
2891 reinitialize the bounds. */
2892 insert_op2 (set_number_at, laststart, b - laststart,
2893 upper_bound - 1, b);
2894 b += 5;
2897 pending_exact = 0;
2898 beg_interval = NULL;
2900 break;
2902 unfetch_interval:
2903 /* If an invalid interval, match the characters as literals. */
2904 assert (beg_interval);
2905 p = beg_interval;
2906 beg_interval = NULL;
2908 /* normal_char and normal_backslash need `c'. */
2909 PATFETCH (c);
2911 if (!(syntax & RE_NO_BK_BRACES))
2913 if (p > pattern && p[-1] == '\\')
2914 goto normal_backslash;
2916 goto normal_char;
2918 #ifdef emacs
2919 /* There is no way to specify the before_dot and after_dot
2920 operators. rms says this is ok. --karl */
2921 case '=':
2922 BUF_PUSH (at_dot);
2923 break;
2925 case 's':
2926 laststart = b;
2927 PATFETCH (c);
2928 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2929 break;
2931 case 'S':
2932 laststart = b;
2933 PATFETCH (c);
2934 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2935 break;
2937 case 'c':
2938 laststart = b;
2939 PATFETCH_RAW (c);
2940 BUF_PUSH_2 (categoryspec, c);
2941 break;
2943 case 'C':
2944 laststart = b;
2945 PATFETCH_RAW (c);
2946 BUF_PUSH_2 (notcategoryspec, c);
2947 break;
2948 #endif /* emacs */
2951 case 'w':
2952 laststart = b;
2953 BUF_PUSH (wordchar);
2954 break;
2957 case 'W':
2958 laststart = b;
2959 BUF_PUSH (notwordchar);
2960 break;
2963 case '<':
2964 BUF_PUSH (wordbeg);
2965 break;
2967 case '>':
2968 BUF_PUSH (wordend);
2969 break;
2971 case 'b':
2972 BUF_PUSH (wordbound);
2973 break;
2975 case 'B':
2976 BUF_PUSH (notwordbound);
2977 break;
2979 case '`':
2980 BUF_PUSH (begbuf);
2981 break;
2983 case '\'':
2984 BUF_PUSH (endbuf);
2985 break;
2987 case '1': case '2': case '3': case '4': case '5':
2988 case '6': case '7': case '8': case '9':
2989 if (syntax & RE_NO_BK_REFS)
2990 goto normal_char;
2992 c1 = c - '0';
2994 if (c1 > regnum)
2995 FREE_STACK_RETURN (REG_ESUBREG);
2997 /* Can't back reference to a subexpression if inside of it. */
2998 if (group_in_compile_stack (compile_stack, c1))
2999 goto normal_char;
3001 laststart = b;
3002 BUF_PUSH_2 (duplicate, c1);
3003 break;
3006 case '+':
3007 case '?':
3008 if (syntax & RE_BK_PLUS_QM)
3009 goto handle_plus;
3010 else
3011 goto normal_backslash;
3013 default:
3014 normal_backslash:
3015 /* You might think it would be useful for \ to mean
3016 not to translate; but if we don't translate it
3017 it will never match anything. */
3018 c = TRANSLATE (c);
3019 goto normal_char;
3021 break;
3024 default:
3025 /* Expects the character in `c'. */
3026 normal_char:
3027 p1 = p - 1; /* P1 points the head of C. */
3028 #ifdef emacs
3029 if (bufp->multibyte)
3031 c = STRING_CHAR (p1, pend - p1);
3032 c = TRANSLATE (c);
3033 /* Set P to the next character boundary. */
3034 p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
3036 #endif
3037 /* If no exactn currently being built. */
3038 if (!pending_exact
3040 /* If last exactn not at current position. */
3041 || pending_exact + *pending_exact + 1 != b
3043 /* We have only one byte following the exactn for the count. */
3044 || *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
3046 /* If followed by a repetition operator. */
3047 || (p != pend && (*p == '*' || *p == '^'))
3048 || ((syntax & RE_BK_PLUS_QM)
3049 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3050 : p != pend && (*p == '+' || *p == '?'))
3051 || ((syntax & RE_INTERVALS)
3052 && ((syntax & RE_NO_BK_BRACES)
3053 ? p != pend && *p == '{'
3054 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3056 /* Start building a new exactn. */
3058 laststart = b;
3060 BUF_PUSH_2 (exactn, 0);
3061 pending_exact = b - 1;
3064 #ifdef emacs
3065 if (! SINGLE_BYTE_CHAR_P (c))
3067 unsigned char str[MAX_MULTIBYTE_LENGTH];
3068 int i = CHAR_STRING (c, str);
3069 int j;
3070 for (j = 0; j < i; j++)
3072 BUF_PUSH (str[j]);
3073 (*pending_exact)++;
3076 else
3077 #endif
3079 BUF_PUSH (c);
3080 (*pending_exact)++;
3082 break;
3083 } /* switch (c) */
3084 } /* while p != pend */
3087 /* Through the pattern now. */
3089 FIXUP_ALT_JUMP ();
3091 if (!COMPILE_STACK_EMPTY)
3092 FREE_STACK_RETURN (REG_EPAREN);
3094 /* If we don't want backtracking, force success
3095 the first time we reach the end of the compiled pattern. */
3096 if (syntax & RE_NO_POSIX_BACKTRACKING)
3097 BUF_PUSH (succeed);
3099 free (compile_stack.stack);
3101 /* We have succeeded; set the length of the buffer. */
3102 bufp->used = b - bufp->buffer;
3104 #ifdef DEBUG
3105 if (debug > 0)
3107 re_compile_fastmap (bufp);
3108 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3109 print_compiled_pattern (bufp);
3111 debug--;
3112 #endif /* DEBUG */
3114 #ifndef MATCH_MAY_ALLOCATE
3115 /* Initialize the failure stack to the largest possible stack. This
3116 isn't necessary unless we're trying to avoid calling alloca in
3117 the search and match routines. */
3119 int num_regs = bufp->re_nsub + 1;
3121 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3123 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3125 if (! fail_stack.stack)
3126 fail_stack.stack
3127 = (fail_stack_elt_t *) malloc (fail_stack.size
3128 * sizeof (fail_stack_elt_t));
3129 else
3130 fail_stack.stack
3131 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3132 (fail_stack.size
3133 * sizeof (fail_stack_elt_t)));
3136 regex_grow_registers (num_regs);
3138 #endif /* not MATCH_MAY_ALLOCATE */
3140 return REG_NOERROR;
3141 } /* regex_compile */
3143 /* Subroutines for `regex_compile'. */
3145 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3147 static void
3148 store_op1 (op, loc, arg)
3149 re_opcode_t op;
3150 unsigned char *loc;
3151 int arg;
3153 *loc = (unsigned char) op;
3154 STORE_NUMBER (loc + 1, arg);
3158 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3160 static void
3161 store_op2 (op, loc, arg1, arg2)
3162 re_opcode_t op;
3163 unsigned char *loc;
3164 int arg1, arg2;
3166 *loc = (unsigned char) op;
3167 STORE_NUMBER (loc + 1, arg1);
3168 STORE_NUMBER (loc + 3, arg2);
3172 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3173 for OP followed by two-byte integer parameter ARG. */
3175 static void
3176 insert_op1 (op, loc, arg, end)
3177 re_opcode_t op;
3178 unsigned char *loc;
3179 int arg;
3180 unsigned char *end;
3182 register unsigned char *pfrom = end;
3183 register unsigned char *pto = end + 3;
3185 while (pfrom != loc)
3186 *--pto = *--pfrom;
3188 store_op1 (op, loc, arg);
3192 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3194 static void
3195 insert_op2 (op, loc, arg1, arg2, end)
3196 re_opcode_t op;
3197 unsigned char *loc;
3198 int arg1, arg2;
3199 unsigned char *end;
3201 register unsigned char *pfrom = end;
3202 register unsigned char *pto = end + 5;
3204 while (pfrom != loc)
3205 *--pto = *--pfrom;
3207 store_op2 (op, loc, arg1, arg2);
3211 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3212 after an alternative or a begin-subexpression. We assume there is at
3213 least one character before the ^. */
3215 static boolean
3216 at_begline_loc_p (pattern, p, syntax)
3217 const unsigned char *pattern, *p;
3218 reg_syntax_t syntax;
3220 re_char *prev = p - 2;
3221 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3223 return
3224 /* After a subexpression? */
3225 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3226 /* After an alternative? */
3227 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3231 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3232 at least one character after the $, i.e., `P < PEND'. */
3234 static boolean
3235 at_endline_loc_p (p, pend, syntax)
3236 const unsigned char *p, *pend;
3237 reg_syntax_t syntax;
3239 re_char *next = p;
3240 boolean next_backslash = *next == '\\';
3241 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3243 return
3244 /* Before a subexpression? */
3245 (syntax & RE_NO_BK_PARENS ? *next == ')'
3246 : next_backslash && next_next && *next_next == ')')
3247 /* Before an alternative? */
3248 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3249 : next_backslash && next_next && *next_next == '|');
3253 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3254 false if it's not. */
3256 static boolean
3257 group_in_compile_stack (compile_stack, regnum)
3258 compile_stack_type compile_stack;
3259 regnum_t regnum;
3261 int this_element;
3263 for (this_element = compile_stack.avail - 1;
3264 this_element >= 0;
3265 this_element--)
3266 if (compile_stack.stack[this_element].regnum == regnum)
3267 return true;
3269 return false;
3272 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3273 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3274 characters can start a string that matches the pattern. This fastmap
3275 is used by re_search to skip quickly over impossible starting points.
3277 Character codes above (1 << BYTEWIDTH) are not represented in the
3278 fastmap, but the leading codes are represented. Thus, the fastmap
3279 indicates which character sets could start a match.
3281 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3282 area as BUFP->fastmap.
3284 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3285 the pattern buffer.
3287 Returns 0 if we succeed, -2 if an internal error. */
3290 re_compile_fastmap (bufp)
3291 struct re_pattern_buffer *bufp;
3293 int j, k;
3294 #ifdef MATCH_MAY_ALLOCATE
3295 fail_stack_type fail_stack;
3296 #endif
3297 #ifndef REGEX_MALLOC
3298 char *destination;
3299 #endif
3301 register char *fastmap = bufp->fastmap;
3302 unsigned char *pattern = bufp->buffer;
3303 unsigned long size = bufp->used;
3304 unsigned char *p = pattern;
3305 register unsigned char *pend = pattern + size;
3307 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
3308 /* This holds the pointer to the failure stack, when
3309 it is allocated relocatably. */
3310 fail_stack_elt_t *failure_stack_ptr;
3311 #endif
3313 /* Assume that each path through the pattern can be null until
3314 proven otherwise. We set this false at the bottom of switch
3315 statement, to which we get only if a particular path doesn't
3316 match the empty string. */
3317 boolean path_can_be_null = true;
3319 /* We aren't doing a `succeed_n' to begin with. */
3320 boolean succeed_n_p = false;
3322 /* If all elements for base leading-codes in fastmap is set, this
3323 flag is set true. */
3324 boolean match_any_multibyte_characters = false;
3326 /* Maximum code of simple (single byte) character. */
3327 int simple_char_max;
3329 assert (fastmap != NULL && p != NULL);
3331 INIT_FAIL_STACK ();
3332 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3333 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3334 bufp->can_be_null = 0;
3336 /* The loop below works as follows:
3337 - It has a working-list kept in the PATTERN_STACK and which basically
3338 starts by only containing a pointer to the first operation.
3339 - If the opcode we're looking at is a match against some set of
3340 chars, then we add those chars to the fastmap and go on to the
3341 next work element from the worklist (done via `break').
3342 - If the opcode is a control operator on the other hand, we either
3343 ignore it (if it's meaningless at this point, such as `start_memory')
3344 or execute it (if it's a jump). If the jump has several destinations
3345 (i.e. `on_failure_jump'), then we push the other destination onto the
3346 worklist.
3347 We guarantee termination by ignoring backward jumps (more or less),
3348 so that `p' is monotonically increasing. More to the point, we
3349 never set `p' (or push) anything `<= p1'. */
3351 /* If can_be_null is set, then the fastmap will not be used anyway. */
3352 while (!bufp->can_be_null)
3354 /* `p1' is used as a marker of how far back a `on_failure_jump'
3355 can go without being ignored. It is normally equal to `p'
3356 (which prevents any backward `on_failure_jump') except right
3357 after a plain `jump', to allow patterns such as:
3358 0: jump 10
3359 3..9: <body>
3360 10: on_failure_jump 3
3361 as used for the *? operator. */
3362 unsigned char *p1 = p;
3364 if (p == pend || *p == succeed)
3366 /* We have reached the (effective) end of pattern. */
3367 if (!PATTERN_STACK_EMPTY ())
3369 bufp->can_be_null |= path_can_be_null;
3371 /* Reset for next path. */
3372 path_can_be_null = true;
3374 p = (unsigned char*) POP_PATTERN_OP ();
3376 continue;
3378 else
3379 break;
3382 /* We should never be about to go beyond the end of the pattern. */
3383 assert (p < pend);
3385 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3388 case duplicate:
3389 /* If the first character has to match a backreference, that means
3390 that the group was empty (since it already matched). Since this
3391 is the only case that interests us here, we can assume that the
3392 backreference must match the empty string. */
3393 p++;
3394 continue;
3397 /* Following are the cases which match a character. These end
3398 with `break'. */
3400 case exactn:
3401 fastmap[p[1]] = 1;
3402 break;
3405 #ifndef emacs
3406 case charset:
3408 int length = (*p & 0x7f);;
3409 p++;
3411 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3412 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3413 fastmap[j] = 1;
3415 break;
3417 case charset_not:
3418 /* Chars beyond end of map must be allowed. */
3420 int length = (*p & 0x7f);;
3421 p++;
3423 for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3424 fastmap[j] = 1;
3426 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3427 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3428 fastmap[j] = 1;
3430 break;
3432 case wordchar:
3433 for (j = 0; j < (1 << BYTEWIDTH); j++)
3434 if (SYNTAX (j) == Sword)
3435 fastmap[j] = 1;
3436 break;
3439 case notwordchar:
3440 for (j = 0; j < (1 << BYTEWIDTH); j++)
3441 if (SYNTAX (j) != Sword)
3442 fastmap[j] = 1;
3443 break;
3444 #else /* emacs */
3445 case charset:
3446 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3447 j >= 0; j--)
3448 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3449 fastmap[j] = 1;
3451 /* If we can match a character class, we can match
3452 any character set. */
3453 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3454 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)
3455 goto set_fastmap_for_multibyte_characters;
3457 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3458 && match_any_multibyte_characters == false)
3460 /* Set fastmap[I] 1 where I is a base leading code of each
3461 multibyte character in the range table. */
3462 int c, count;
3464 /* Make P points the range table. */
3465 p += CHARSET_BITMAP_SIZE (&p[-2]);
3467 /* Extract the number of ranges in range table into COUNT. */
3468 EXTRACT_NUMBER_AND_INCR (count, p);
3469 for (; count > 0; count--, p += 2 * 3) /* XXX */
3471 /* Extract the start of each range. */
3472 EXTRACT_CHARACTER (c, p);
3473 j = CHAR_CHARSET (c);
3474 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3477 break;
3480 case charset_not:
3481 /* Chars beyond end of bitmap are possible matches.
3482 All the single-byte codes can occur in multibyte buffers.
3483 So any that are not listed in the charset
3484 are possible matches, even in multibyte buffers. */
3485 simple_char_max = (1 << BYTEWIDTH);
3486 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3487 j < simple_char_max; j++)
3488 fastmap[j] = 1;
3490 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3491 j >= 0; j--)
3492 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3493 fastmap[j] = 1;
3495 if (bufp->multibyte)
3496 /* Any character set can possibly contain a character
3497 which doesn't match the specified set of characters. */
3499 set_fastmap_for_multibyte_characters:
3500 if (match_any_multibyte_characters == false)
3502 for (j = 0x80; j < 0xA0; j++) /* XXX */
3503 if (BASE_LEADING_CODE_P (j))
3504 fastmap[j] = 1;
3505 match_any_multibyte_characters = true;
3508 break;
3511 case wordchar:
3512 /* All the single-byte codes can occur in multibyte buffers,
3513 and they may have word syntax. So do consider them. */
3514 simple_char_max = (1 << BYTEWIDTH);
3515 for (j = 0; j < simple_char_max; j++)
3516 if (SYNTAX (j) == Sword)
3517 fastmap[j] = 1;
3519 if (bufp->multibyte)
3520 /* Any character set can possibly contain a character
3521 whose syntax is `Sword'. */
3522 goto set_fastmap_for_multibyte_characters;
3523 break;
3526 case notwordchar:
3527 /* All the single-byte codes can occur in multibyte buffers,
3528 and they may not have word syntax. So do consider them. */
3529 simple_char_max = (1 << BYTEWIDTH);
3530 for (j = 0; j < simple_char_max; j++)
3531 if (SYNTAX (j) != Sword)
3532 fastmap[j] = 1;
3534 if (bufp->multibyte)
3535 /* Any character set can possibly contain a character
3536 whose syntax is not `Sword'. */
3537 goto set_fastmap_for_multibyte_characters;
3538 break;
3539 #endif
3541 case anychar:
3543 int fastmap_newline = fastmap['\n'];
3545 /* `.' matches anything, except perhaps newline.
3546 Even in a multibyte buffer, it should match any
3547 conceivable byte value for the fastmap. */
3548 if (bufp->multibyte)
3549 match_any_multibyte_characters = true;
3551 simple_char_max = (1 << BYTEWIDTH);
3552 for (j = 0; j < simple_char_max; j++)
3553 fastmap[j] = 1;
3555 /* ... except perhaps newline. */
3556 if (!(bufp->syntax & RE_DOT_NEWLINE))
3557 fastmap['\n'] = fastmap_newline;
3559 /* Otherwise, have to check alternative paths. */
3560 break;
3563 #ifdef emacs
3564 case wordbound:
3565 case notwordbound:
3566 case wordbeg:
3567 case wordend:
3568 case notsyntaxspec:
3569 case syntaxspec:
3570 /* This match depends on text properties. These end with
3571 aborting optimizations. */
3572 bufp->can_be_null = 1;
3573 continue;
3574 #if 0
3575 k = *p++;
3576 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3577 for (j = 0; j < simple_char_max; j++)
3578 if (SYNTAX (j) == (enum syntaxcode) k)
3579 fastmap[j] = 1;
3581 if (bufp->multibyte)
3582 /* Any character set can possibly contain a character
3583 whose syntax is K. */
3584 goto set_fastmap_for_multibyte_characters;
3585 break;
3587 case notsyntaxspec:
3588 k = *p++;
3589 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3590 for (j = 0; j < simple_char_max; j++)
3591 if (SYNTAX (j) != (enum syntaxcode) k)
3592 fastmap[j] = 1;
3594 if (bufp->multibyte)
3595 /* Any character set can possibly contain a character
3596 whose syntax is not K. */
3597 goto set_fastmap_for_multibyte_characters;
3598 break;
3599 #endif
3602 case categoryspec:
3603 k = *p++;
3604 simple_char_max = (1 << BYTEWIDTH);
3605 for (j = 0; j < simple_char_max; j++)
3606 if (CHAR_HAS_CATEGORY (j, k))
3607 fastmap[j] = 1;
3609 if (bufp->multibyte)
3610 /* Any character set can possibly contain a character
3611 whose category is K. */
3612 goto set_fastmap_for_multibyte_characters;
3613 break;
3616 case notcategoryspec:
3617 k = *p++;
3618 simple_char_max = (1 << BYTEWIDTH);
3619 for (j = 0; j < simple_char_max; j++)
3620 if (!CHAR_HAS_CATEGORY (j, k))
3621 fastmap[j] = 1;
3623 if (bufp->multibyte)
3624 /* Any character set can possibly contain a character
3625 whose category is not K. */
3626 goto set_fastmap_for_multibyte_characters;
3627 break;
3629 /* All cases after this match the empty string. These end with
3630 `continue'. */
3633 case before_dot:
3634 case at_dot:
3635 case after_dot:
3636 continue;
3637 #endif /* emacs */
3640 case no_op:
3641 case begline:
3642 case endline:
3643 case begbuf:
3644 case endbuf:
3645 #ifndef emacs
3646 case wordbound:
3647 case notwordbound:
3648 case wordbeg:
3649 case wordend:
3650 #endif
3651 continue;
3654 case jump_n:
3655 case jump:
3656 EXTRACT_NUMBER_AND_INCR (j, p);
3657 if (j < 0)
3658 /* Backward jumps can only go back to code that we've already
3659 visited. `re_compile' should make sure this is true. */
3660 break;
3661 p += j;
3662 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3664 case on_failure_jump:
3665 case on_failure_keep_string_jump:
3666 case on_failure_jump_exclusive:
3667 case on_failure_jump_loop:
3668 case on_failure_jump_smart:
3669 p++;
3670 break;
3671 default:
3672 continue;
3674 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3675 to jump back to "just after here". */
3676 /* Fallthrough */
3678 case on_failure_jump:
3679 case on_failure_keep_string_jump:
3680 case on_failure_jump_exclusive:
3681 case on_failure_jump_loop:
3682 case on_failure_jump_smart:
3683 handle_on_failure_jump:
3684 EXTRACT_NUMBER_AND_INCR (j, p);
3686 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3687 end of the pattern. We don't want to push such a point,
3688 since when we restore it above, entering the switch will
3689 increment `p' past the end of the pattern. We don't need
3690 to push such a point since we obviously won't find any more
3691 fastmap entries beyond `pend'. Such a pattern can match
3692 the null string, though. */
3693 if (p + j <= p1)
3694 /* Backward jump to be ignored. */
3696 else if (p + j < pend)
3698 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3700 RESET_FAIL_STACK ();
3701 return -2;
3704 else
3705 bufp->can_be_null = 1;
3707 if (succeed_n_p)
3709 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3710 succeed_n_p = false;
3713 continue;
3716 case succeed_n:
3717 /* Get to the number of times to succeed. */
3718 p += 2;
3720 /* Increment p past the n for when k != 0. */
3721 EXTRACT_NUMBER_AND_INCR (k, p);
3722 if (k == 0)
3724 p -= 4;
3725 succeed_n_p = true; /* Spaghetti code alert. */
3726 goto handle_on_failure_jump;
3728 continue;
3731 case set_number_at:
3732 p += 4;
3733 continue;
3736 case start_memory:
3737 case stop_memory:
3738 p += 1;
3739 continue;
3742 default:
3743 abort (); /* We have listed all the cases. */
3744 } /* switch *p++ */
3746 /* Getting here means we have found the possible starting
3747 characters for one path of the pattern -- and that the empty
3748 string does not match. We need not follow this path further.
3749 Instead, look at the next alternative (remembered on the
3750 stack), or quit if no more. The test at the top of the loop
3751 does these things. */
3752 path_can_be_null = false;
3753 p = pend;
3754 } /* while p */
3756 /* Set `can_be_null' for the last path (also the first path, if the
3757 pattern is empty). */
3758 bufp->can_be_null |= path_can_be_null;
3759 RESET_FAIL_STACK ();
3760 return 0;
3761 } /* re_compile_fastmap */
3763 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3764 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3765 this memory for recording register information. STARTS and ENDS
3766 must be allocated using the malloc library routine, and must each
3767 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3769 If NUM_REGS == 0, then subsequent matches should allocate their own
3770 register data.
3772 Unless this function is called, the first search or match using
3773 PATTERN_BUFFER will allocate its own register data, without
3774 freeing the old data. */
3776 void
3777 re_set_registers (bufp, regs, num_regs, starts, ends)
3778 struct re_pattern_buffer *bufp;
3779 struct re_registers *regs;
3780 unsigned num_regs;
3781 regoff_t *starts, *ends;
3783 if (num_regs)
3785 bufp->regs_allocated = REGS_REALLOCATE;
3786 regs->num_regs = num_regs;
3787 regs->start = starts;
3788 regs->end = ends;
3790 else
3792 bufp->regs_allocated = REGS_UNALLOCATED;
3793 regs->num_regs = 0;
3794 regs->start = regs->end = (regoff_t *) 0;
3798 /* Searching routines. */
3800 /* Like re_search_2, below, but only one string is specified, and
3801 doesn't let you say where to stop matching. */
3804 re_search (bufp, string, size, startpos, range, regs)
3805 struct re_pattern_buffer *bufp;
3806 const char *string;
3807 int size, startpos, range;
3808 struct re_registers *regs;
3810 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3811 regs, size);
3814 /* End address of virtual concatenation of string. */
3815 #define STOP_ADDR_VSTRING(P) \
3816 (((P) >= size1 ? string2 + size2 : string1 + size1))
3818 /* Address of POS in the concatenation of virtual string. */
3819 #define POS_ADDR_VSTRING(POS) \
3820 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3822 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3823 virtual concatenation of STRING1 and STRING2, starting first at index
3824 STARTPOS, then at STARTPOS + 1, and so on.
3826 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3828 RANGE is how far to scan while trying to match. RANGE = 0 means try
3829 only at STARTPOS; in general, the last start tried is STARTPOS +
3830 RANGE.
3832 In REGS, return the indices of the virtual concatenation of STRING1
3833 and STRING2 that matched the entire BUFP->buffer and its contained
3834 subexpressions.
3836 Do not consider matching one past the index STOP in the virtual
3837 concatenation of STRING1 and STRING2.
3839 We return either the position in the strings at which the match was
3840 found, -1 if no match, or -2 if error (such as failure
3841 stack overflow). */
3844 re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3845 struct re_pattern_buffer *bufp;
3846 const char *str1, *str2;
3847 int size1, size2;
3848 int startpos;
3849 int range;
3850 struct re_registers *regs;
3851 int stop;
3853 int val;
3854 re_char *string1 = (re_char*) str1;
3855 re_char *string2 = (re_char*) str2;
3856 register char *fastmap = bufp->fastmap;
3857 register RE_TRANSLATE_TYPE translate = bufp->translate;
3858 int total_size = size1 + size2;
3859 int endpos = startpos + range;
3860 int anchored_start = 0;
3862 /* Nonzero if we have to concern multibyte character. */
3863 int multibyte = bufp->multibyte;
3865 /* Check for out-of-range STARTPOS. */
3866 if (startpos < 0 || startpos > total_size)
3867 return -1;
3869 /* Fix up RANGE if it might eventually take us outside
3870 the virtual concatenation of STRING1 and STRING2.
3871 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3872 if (endpos < 0)
3873 range = 0 - startpos;
3874 else if (endpos > total_size)
3875 range = total_size - startpos;
3877 /* If the search isn't to be a backwards one, don't waste time in a
3878 search for a pattern anchored at beginning of buffer. */
3879 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3881 if (startpos > 0)
3882 return -1;
3883 else
3884 range = 0;
3887 #ifdef emacs
3888 /* In a forward search for something that starts with \=.
3889 don't keep searching past point. */
3890 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3892 range = PT_BYTE - BEGV_BYTE - startpos;
3893 if (range < 0)
3894 return -1;
3896 #endif /* emacs */
3898 /* Update the fastmap now if not correct already. */
3899 if (fastmap && !bufp->fastmap_accurate)
3900 if (re_compile_fastmap (bufp) == -2)
3901 return -2;
3903 /* See whether the pattern is anchored. */
3904 if (bufp->buffer[0] == begline)
3905 anchored_start = 1;
3907 #ifdef emacs
3908 gl_state.object = re_match_object;
3910 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
3912 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3914 #endif
3916 /* Loop through the string, looking for a place to start matching. */
3917 for (;;)
3919 /* If the pattern is anchored,
3920 skip quickly past places we cannot match.
3921 We don't bother to treat startpos == 0 specially
3922 because that case doesn't repeat. */
3923 if (anchored_start && startpos > 0)
3925 if (! (bufp->newline_anchor
3926 && ((startpos <= size1 ? string1[startpos - 1]
3927 : string2[startpos - size1 - 1])
3928 == '\n')))
3929 goto advance;
3932 /* If a fastmap is supplied, skip quickly over characters that
3933 cannot be the start of a match. If the pattern can match the
3934 null string, however, we don't need to skip characters; we want
3935 the first null string. */
3936 if (fastmap && startpos < total_size && !bufp->can_be_null)
3938 register re_char *d;
3939 register unsigned int buf_ch;
3941 d = POS_ADDR_VSTRING (startpos);
3943 if (range > 0) /* Searching forwards. */
3945 register int lim = 0;
3946 int irange = range;
3948 if (startpos < size1 && startpos + range >= size1)
3949 lim = range - (size1 - startpos);
3951 /* Written out as an if-else to avoid testing `translate'
3952 inside the loop. */
3953 if (RE_TRANSLATE_P (translate))
3955 if (multibyte)
3956 while (range > lim)
3958 int buf_charlen;
3960 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
3961 buf_charlen);
3963 buf_ch = RE_TRANSLATE (translate, buf_ch);
3964 if (buf_ch >= 0400
3965 || fastmap[buf_ch])
3966 break;
3968 range -= buf_charlen;
3969 d += buf_charlen;
3971 else
3972 while (range > lim
3973 && !fastmap[RE_TRANSLATE (translate, *d)])
3975 d++;
3976 range--;
3979 else
3980 while (range > lim && !fastmap[*d])
3982 d++;
3983 range--;
3986 startpos += irange - range;
3988 else /* Searching backwards. */
3990 int room = (startpos >= size1
3991 ? size2 + size1 - startpos
3992 : size1 - startpos);
3994 buf_ch = STRING_CHAR (d, room);
3995 if (RE_TRANSLATE_P (translate))
3996 buf_ch = RE_TRANSLATE (translate, buf_ch);
3998 if (! (buf_ch >= 0400
3999 || fastmap[buf_ch]))
4000 goto advance;
4004 /* If can't match the null string, and that's all we have left, fail. */
4005 if (range >= 0 && startpos == total_size && fastmap
4006 && !bufp->can_be_null)
4007 return -1;
4009 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4010 startpos, regs, stop);
4011 #ifndef REGEX_MALLOC
4012 #ifdef C_ALLOCA
4013 alloca (0);
4014 #endif
4015 #endif
4017 if (val >= 0)
4018 return startpos;
4020 if (val == -2)
4021 return -2;
4023 advance:
4024 if (!range)
4025 break;
4026 else if (range > 0)
4028 /* Update STARTPOS to the next character boundary. */
4029 if (multibyte)
4031 re_char *p = POS_ADDR_VSTRING (startpos);
4032 re_char *pend = STOP_ADDR_VSTRING (startpos);
4033 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4035 range -= len;
4036 if (range < 0)
4037 break;
4038 startpos += len;
4040 else
4042 range--;
4043 startpos++;
4046 else
4048 range++;
4049 startpos--;
4051 /* Update STARTPOS to the previous character boundary. */
4052 if (multibyte)
4054 re_char *p = POS_ADDR_VSTRING (startpos);
4055 int len = 0;
4057 /* Find the head of multibyte form. */
4058 while (!CHAR_HEAD_P (*p))
4059 p--, len++;
4061 /* Adjust it. */
4062 #if 0 /* XXX */
4063 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
4065 else
4066 #endif
4068 range += len;
4069 if (range > 0)
4070 break;
4072 startpos -= len;
4077 return -1;
4078 } /* re_search_2 */
4080 /* Declarations and macros for re_match_2. */
4082 static int bcmp_translate ();
4084 /* This converts PTR, a pointer into one of the search strings `string1'
4085 and `string2' into an offset from the beginning of that string. */
4086 #define POINTER_TO_OFFSET(ptr) \
4087 (FIRST_STRING_P (ptr) \
4088 ? ((regoff_t) ((ptr) - string1)) \
4089 : ((regoff_t) ((ptr) - string2 + size1)))
4091 /* Call before fetching a character with *d. This switches over to
4092 string2 if necessary. */
4093 #define PREFETCH() \
4094 while (d == dend) \
4096 /* End of string2 => fail. */ \
4097 if (dend == end_match_2) \
4098 goto fail; \
4099 /* End of string1 => advance to string2. */ \
4100 d = string2; \
4101 dend = end_match_2; \
4105 /* Test if at very beginning or at very end of the virtual concatenation
4106 of `string1' and `string2'. If only one string, it's `string2'. */
4107 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4108 #define AT_STRINGS_END(d) ((d) == end2)
4111 /* Test if D points to a character which is word-constituent. We have
4112 two special cases to check for: if past the end of string1, look at
4113 the first character in string2; and if before the beginning of
4114 string2, look at the last character in string1. */
4115 #define WORDCHAR_P(d) \
4116 (SYNTAX ((d) == end1 ? *string2 \
4117 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4118 == Sword)
4120 /* Disabled due to a compiler bug -- see comment at case wordbound */
4122 /* The comment at case wordbound is following one, but we don't use
4123 AT_WORD_BOUNDARY anymore to support multibyte form.
4125 The DEC Alpha C compiler 3.x generates incorrect code for the
4126 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4127 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4128 macro and introducing temporary variables works around the bug. */
4130 #if 0
4131 /* Test if the character before D and the one at D differ with respect
4132 to being word-constituent. */
4133 #define AT_WORD_BOUNDARY(d) \
4134 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4135 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4136 #endif
4138 /* Free everything we malloc. */
4139 #ifdef MATCH_MAY_ALLOCATE
4140 #define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4141 #define FREE_VARIABLES() \
4142 do { \
4143 REGEX_FREE_STACK (fail_stack.stack); \
4144 FREE_VAR (regstart); \
4145 FREE_VAR (regend); \
4146 FREE_VAR (best_regstart); \
4147 FREE_VAR (best_regend); \
4148 } while (0)
4149 #else
4150 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4151 #endif /* not MATCH_MAY_ALLOCATE */
4154 /* Optimization routines. */
4156 /* Jump over non-matching operations. */
4157 static unsigned char *
4158 skip_noops (p, pend, memory)
4159 unsigned char *p, *pend;
4160 int memory;
4162 int mcnt;
4163 while (p < pend)
4165 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4167 case start_memory:
4168 if (!memory)
4169 return p;
4170 case stop_memory:
4171 p += 2; break;
4172 case no_op:
4173 p += 1; break;
4174 case jump:
4175 p += 1;
4176 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4177 p += mcnt;
4178 break;
4179 default:
4180 return p;
4183 assert (p == pend);
4184 return p;
4187 /* Non-zero if "p1 matches something" implies "p2 fails". */
4188 static int
4189 mutually_exclusive_p (bufp, p1, p2)
4190 struct re_pattern_buffer *bufp;
4191 unsigned char *p1, *p2;
4193 int multibyte = bufp->multibyte;
4194 unsigned char *pend = bufp->buffer + bufp->used;
4196 assert (p1 >= bufp->buffer && p1 <= pend
4197 && p2 >= bufp->buffer && p2 <= pend);
4199 /* Skip over open/close-group commands.
4200 If what follows this loop is a ...+ construct,
4201 look at what begins its body, since we will have to
4202 match at least one of that. */
4203 p2 = skip_noops (p2, pend, 1);
4204 /* The same skip can be done for p1, except that skipping over
4205 start_memory is not a good idea (if there's a group inside
4206 the loop delimited by on_failure_jump_exclusive, then it
4207 can't optimize the push away (it still works properly, but
4208 slightly slower rather than faster)). */
4209 p1 = skip_noops (p1, pend, 0);
4211 /* If we're at the end of the pattern, we can change. */
4212 if (p2 == pend)
4214 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p1))
4216 case anychar:
4217 case charset_not:
4218 case charset:
4219 case exactn:
4220 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4221 return 1;
4222 default:
4223 return 0;
4227 else if ((re_opcode_t) *p2 == exactn
4228 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4230 register unsigned int c
4231 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4233 if ((re_opcode_t) *p1 == exactn)
4235 if (!(multibyte /* && (c != '\n') */
4236 && BASE_LEADING_CODE_P (c))
4237 ? c != p1[2]
4238 : (STRING_CHAR (&p2[2], pend - &p2[2])
4239 != STRING_CHAR (&p1[2], pend - &p1[2])))
4241 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4242 return 1;
4246 else if ((re_opcode_t) *p1 == charset
4247 || (re_opcode_t) *p1 == charset_not)
4249 int not = (re_opcode_t) *p1 == charset_not;
4251 if (multibyte /* && (c != '\n') */
4252 && BASE_LEADING_CODE_P (c))
4253 c = STRING_CHAR (&p2[2], pend - &p2[2]);
4255 /* Test if C is listed in charset (or charset_not)
4256 at `p1'. */
4257 if (SINGLE_BYTE_CHAR_P (c))
4259 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4260 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4261 not = !not;
4263 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4264 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4266 /* `not' is equal to 1 if c would match, which means
4267 that we can't change to pop_failure_jump. */
4268 if (!not)
4270 DEBUG_PRINT1 (" No match => fast loop.\n");
4271 return 1;
4274 else if ((re_opcode_t) *p1 == anychar
4275 && c == '\n')
4277 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4278 return 1;
4281 else if ((re_opcode_t) *p2 == charset
4282 || (re_opcode_t) *p2 == charset_not)
4284 if ((re_opcode_t) *p1 == exactn)
4285 /* Reuse the code above. */
4286 return mutually_exclusive_p (bufp, p2, p1);
4289 /* It is hard to list up all the character in charset
4290 P2 if it includes multibyte character. Give up in
4291 such case. */
4292 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4294 /* Now, we are sure that P2 has no range table.
4295 So, for the size of bitmap in P2, `p2[1]' is
4296 enough. But P1 may have range table, so the
4297 size of bitmap table of P1 is extracted by
4298 using macro `CHARSET_BITMAP_SIZE'.
4300 Since we know that all the character listed in
4301 P2 is ASCII, it is enough to test only bitmap
4302 table of P1. */
4304 if (*p1 == *p2)
4306 int idx;
4307 /* We win if the charset inside the loop
4308 has no overlap with the one after the loop. */
4309 for (idx = 0;
4310 (idx < (int) p2[1]
4311 && idx < CHARSET_BITMAP_SIZE (p1));
4312 idx++)
4313 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4314 break;
4316 if (idx == p2[1]
4317 || idx == CHARSET_BITMAP_SIZE (p1))
4319 DEBUG_PRINT1 (" No match => fast loop.\n");
4320 return 1;
4323 else if ((re_opcode_t) *p1 == charset
4324 || (re_opcode_t) *p1 == charset_not)
4326 int idx;
4327 /* We win if the charset_not inside the loop lists
4328 every character listed in the charset after. */
4329 for (idx = 0; idx < (int) p2[1]; idx++)
4330 if (! (p2[2 + idx] == 0
4331 || (idx < CHARSET_BITMAP_SIZE (p1)
4332 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4333 break;
4335 if (idx == p2[1])
4337 DEBUG_PRINT1 (" No match => fast loop.\n");
4338 return 1;
4344 /* Safe default. */
4345 return 0;
4349 /* Matching routines. */
4351 #ifndef emacs /* Emacs never uses this. */
4352 /* re_match is like re_match_2 except it takes only a single string. */
4355 re_match (bufp, string, size, pos, regs)
4356 struct re_pattern_buffer *bufp;
4357 const char *string;
4358 int size, pos;
4359 struct re_registers *regs;
4361 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4362 pos, regs, size);
4363 alloca (0);
4364 return result;
4366 #endif /* not emacs */
4368 #ifdef emacs
4369 /* In Emacs, this is the string or buffer in which we
4370 are matching. It is used for looking up syntax properties. */
4371 Lisp_Object re_match_object;
4372 #endif
4374 /* re_match_2 matches the compiled pattern in BUFP against the
4375 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4376 and SIZE2, respectively). We start matching at POS, and stop
4377 matching at STOP.
4379 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4380 store offsets for the substring each group matched in REGS. See the
4381 documentation for exactly how many groups we fill.
4383 We return -1 if no match, -2 if an internal error (such as the
4384 failure stack overflowing). Otherwise, we return the length of the
4385 matched substring. */
4388 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4389 struct re_pattern_buffer *bufp;
4390 const char *string1, *string2;
4391 int size1, size2;
4392 int pos;
4393 struct re_registers *regs;
4394 int stop;
4396 int result;
4398 #ifdef emacs
4399 int charpos;
4400 gl_state.object = re_match_object;
4401 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
4402 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4403 #endif
4405 result = re_match_2_internal (bufp, string1, size1, string2, size2,
4406 pos, regs, stop);
4407 alloca (0);
4408 return result;
4411 /* This is a separate function so that we can force an alloca cleanup
4412 afterwards. */
4413 static int
4414 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4415 struct re_pattern_buffer *bufp;
4416 re_char *string1, *string2;
4417 int size1, size2;
4418 int pos;
4419 struct re_registers *regs;
4420 int stop;
4422 /* General temporaries. */
4423 int mcnt;
4424 boolean not;
4425 unsigned char *p1;
4427 /* Just past the end of the corresponding string. */
4428 re_char *end1, *end2;
4430 /* Pointers into string1 and string2, just past the last characters in
4431 each to consider matching. */
4432 re_char *end_match_1, *end_match_2;
4434 /* Where we are in the data, and the end of the current string. */
4435 re_char *d, *dend;
4437 /* Used sometimes to remember where we were before starting matching
4438 an operator so that we can go back in case of failure. This "atomic"
4439 behavior of matching opcodes is indispensable to the correctness
4440 of the on_failure_keep_string_jump optimization. */
4441 re_char *dfail;
4443 /* Where we are in the pattern, and the end of the pattern. */
4444 unsigned char *p = bufp->buffer;
4445 register unsigned char *pend = p + bufp->used;
4447 /* We use this to map every character in the string. */
4448 RE_TRANSLATE_TYPE translate = bufp->translate;
4450 /* Nonzero if we have to concern multibyte character. */
4451 int multibyte = bufp->multibyte;
4453 /* Failure point stack. Each place that can handle a failure further
4454 down the line pushes a failure point on this stack. It consists of
4455 regstart, and regend for all registers corresponding to
4456 the subexpressions we're currently inside, plus the number of such
4457 registers, and, finally, two char *'s. The first char * is where
4458 to resume scanning the pattern; the second one is where to resume
4459 scanning the strings. */
4460 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4461 fail_stack_type fail_stack;
4462 #endif
4463 #ifdef DEBUG
4464 static unsigned failure_id = 0;
4465 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4466 #endif
4468 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
4469 /* This holds the pointer to the failure stack, when
4470 it is allocated relocatably. */
4471 fail_stack_elt_t *failure_stack_ptr;
4472 #endif
4474 /* We fill all the registers internally, independent of what we
4475 return, for use in backreferences. The number here includes
4476 an element for register zero. */
4477 unsigned num_regs = bufp->re_nsub + 1;
4479 /* Information on the contents of registers. These are pointers into
4480 the input strings; they record just what was matched (on this
4481 attempt) by a subexpression part of the pattern, that is, the
4482 regnum-th regstart pointer points to where in the pattern we began
4483 matching and the regnum-th regend points to right after where we
4484 stopped matching the regnum-th subexpression. (The zeroth register
4485 keeps track of what the whole pattern matches.) */
4486 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4487 re_char **regstart, **regend;
4488 #endif
4490 /* The following record the register info as found in the above
4491 variables when we find a match better than any we've seen before.
4492 This happens as we backtrack through the failure points, which in
4493 turn happens only if we have not yet matched the entire string. */
4494 unsigned best_regs_set = false;
4495 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4496 re_char **best_regstart, **best_regend;
4497 #endif
4499 /* Logically, this is `best_regend[0]'. But we don't want to have to
4500 allocate space for that if we're not allocating space for anything
4501 else (see below). Also, we never need info about register 0 for
4502 any of the other register vectors, and it seems rather a kludge to
4503 treat `best_regend' differently than the rest. So we keep track of
4504 the end of the best match so far in a separate variable. We
4505 initialize this to NULL so that when we backtrack the first time
4506 and need to test it, it's not garbage. */
4507 re_char *match_end = NULL;
4509 #ifdef DEBUG
4510 /* Counts the total number of registers pushed. */
4511 unsigned num_regs_pushed = 0;
4512 #endif
4514 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4516 INIT_FAIL_STACK ();
4518 #ifdef MATCH_MAY_ALLOCATE
4519 /* Do not bother to initialize all the register variables if there are
4520 no groups in the pattern, as it takes a fair amount of time. If
4521 there are groups, we include space for register 0 (the whole
4522 pattern), even though we never use it, since it simplifies the
4523 array indexing. We should fix this. */
4524 if (bufp->re_nsub)
4526 regstart = REGEX_TALLOC (num_regs, re_char *);
4527 regend = REGEX_TALLOC (num_regs, re_char *);
4528 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4529 best_regend = REGEX_TALLOC (num_regs, re_char *);
4531 if (!(regstart && regend && best_regstart && best_regend))
4533 FREE_VARIABLES ();
4534 return -2;
4537 else
4539 /* We must initialize all our variables to NULL, so that
4540 `FREE_VARIABLES' doesn't try to free them. */
4541 regstart = regend = best_regstart = best_regend = NULL;
4543 #endif /* MATCH_MAY_ALLOCATE */
4545 /* The starting position is bogus. */
4546 if (pos < 0 || pos > size1 + size2)
4548 FREE_VARIABLES ();
4549 return -1;
4552 /* Initialize subexpression text positions to -1 to mark ones that no
4553 start_memory/stop_memory has been seen for. Also initialize the
4554 register information struct. */
4555 for (mcnt = 1; mcnt < num_regs; mcnt++)
4556 regstart[mcnt] = regend[mcnt] = REG_UNSET_VALUE;
4558 /* Shorten strings to `stop'. */
4559 if (stop <= size1)
4561 size1 = stop;
4562 size2 = 0;
4564 else if (stop <= size1 + size2)
4565 size2 = stop - size1;
4567 /* We move `string1' into `string2' if the latter's empty -- but not if
4568 `string1' is null. */
4569 if (size2 == 0 && string1 != NULL)
4571 string2 = string1;
4572 size2 = size1;
4573 string1 = 0;
4574 size1 = 0;
4576 end1 = string1 + size1;
4577 end2 = string2 + size2;
4579 /* Compute where to stop matching, within the two strings. */
4580 end_match_1 = end1;
4581 end_match_2 = end2;
4583 /* `p' scans through the pattern as `d' scans through the data.
4584 `dend' is the end of the input string that `d' points within. `d'
4585 is advanced into the following input string whenever necessary, but
4586 this happens before fetching; therefore, at the beginning of the
4587 loop, `d' can be pointing at the end of a string, but it cannot
4588 equal `string2'. */
4589 if (size1 > 0 && pos <= size1)
4591 d = string1 + pos;
4592 dend = end_match_1;
4594 else
4596 d = string2 + pos - size1;
4597 dend = end_match_2;
4600 DEBUG_PRINT1 ("The compiled pattern is: ");
4601 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4602 DEBUG_PRINT1 ("The string to match is: `");
4603 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4604 DEBUG_PRINT1 ("'\n");
4606 /* This loops over pattern commands. It exits by returning from the
4607 function if the match is complete, or it drops through if the match
4608 fails at this starting point in the input data. */
4609 for (;;)
4611 DEBUG_PRINT2 ("\n%p: ", p);
4613 if (p == pend)
4614 { /* End of pattern means we might have succeeded. */
4615 DEBUG_PRINT1 ("end of pattern ... ");
4617 /* If we haven't matched the entire string, and we want the
4618 longest match, try backtracking. */
4619 if (d != end_match_2)
4621 /* 1 if this match ends in the same string (string1 or string2)
4622 as the best previous match. */
4623 boolean same_str_p = (FIRST_STRING_P (match_end)
4624 == FIRST_STRING_P (d));
4625 /* 1 if this match is the best seen so far. */
4626 boolean best_match_p;
4628 /* AIX compiler got confused when this was combined
4629 with the previous declaration. */
4630 if (same_str_p)
4631 best_match_p = d > match_end;
4632 else
4633 best_match_p = !FIRST_STRING_P (d);
4635 DEBUG_PRINT1 ("backtracking.\n");
4637 if (!FAIL_STACK_EMPTY ())
4638 { /* More failure points to try. */
4640 /* If exceeds best match so far, save it. */
4641 if (!best_regs_set || best_match_p)
4643 best_regs_set = true;
4644 match_end = d;
4646 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4648 for (mcnt = 1; mcnt < num_regs; mcnt++)
4650 best_regstart[mcnt] = regstart[mcnt];
4651 best_regend[mcnt] = regend[mcnt];
4654 goto fail;
4657 /* If no failure points, don't restore garbage. And if
4658 last match is real best match, don't restore second
4659 best one. */
4660 else if (best_regs_set && !best_match_p)
4662 restore_best_regs:
4663 /* Restore best match. It may happen that `dend ==
4664 end_match_1' while the restored d is in string2.
4665 For example, the pattern `x.*y.*z' against the
4666 strings `x-' and `y-z-', if the two strings are
4667 not consecutive in memory. */
4668 DEBUG_PRINT1 ("Restoring best registers.\n");
4670 d = match_end;
4671 dend = ((d >= string1 && d <= end1)
4672 ? end_match_1 : end_match_2);
4674 for (mcnt = 1; mcnt < num_regs; mcnt++)
4676 regstart[mcnt] = best_regstart[mcnt];
4677 regend[mcnt] = best_regend[mcnt];
4680 } /* d != end_match_2 */
4682 succeed_label:
4683 DEBUG_PRINT1 ("Accepting match.\n");
4685 /* If caller wants register contents data back, do it. */
4686 if (regs && !bufp->no_sub)
4688 /* Have the register data arrays been allocated? */
4689 if (bufp->regs_allocated == REGS_UNALLOCATED)
4690 { /* No. So allocate them with malloc. We need one
4691 extra element beyond `num_regs' for the `-1' marker
4692 GNU code uses. */
4693 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4694 regs->start = TALLOC (regs->num_regs, regoff_t);
4695 regs->end = TALLOC (regs->num_regs, regoff_t);
4696 if (regs->start == NULL || regs->end == NULL)
4698 FREE_VARIABLES ();
4699 return -2;
4701 bufp->regs_allocated = REGS_REALLOCATE;
4703 else if (bufp->regs_allocated == REGS_REALLOCATE)
4704 { /* Yes. If we need more elements than were already
4705 allocated, reallocate them. If we need fewer, just
4706 leave it alone. */
4707 if (regs->num_regs < num_regs + 1)
4709 regs->num_regs = num_regs + 1;
4710 RETALLOC (regs->start, regs->num_regs, regoff_t);
4711 RETALLOC (regs->end, regs->num_regs, regoff_t);
4712 if (regs->start == NULL || regs->end == NULL)
4714 FREE_VARIABLES ();
4715 return -2;
4719 else
4721 /* These braces fend off a "empty body in an else-statement"
4722 warning under GCC when assert expands to nothing. */
4723 assert (bufp->regs_allocated == REGS_FIXED);
4726 /* Convert the pointer data in `regstart' and `regend' to
4727 indices. Register zero has to be set differently,
4728 since we haven't kept track of any info for it. */
4729 if (regs->num_regs > 0)
4731 regs->start[0] = pos;
4732 regs->end[0] = POINTER_TO_OFFSET (d);
4735 /* Go through the first `min (num_regs, regs->num_regs)'
4736 registers, since that is all we initialized. */
4737 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4739 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4740 regs->start[mcnt] = regs->end[mcnt] = -1;
4741 else
4743 regs->start[mcnt]
4744 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4745 regs->end[mcnt]
4746 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4750 /* If the regs structure we return has more elements than
4751 were in the pattern, set the extra elements to -1. If
4752 we (re)allocated the registers, this is the case,
4753 because we always allocate enough to have at least one
4754 -1 at the end. */
4755 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4756 regs->start[mcnt] = regs->end[mcnt] = -1;
4757 } /* regs && !bufp->no_sub */
4759 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4760 nfailure_points_pushed, nfailure_points_popped,
4761 nfailure_points_pushed - nfailure_points_popped);
4762 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4764 mcnt = POINTER_TO_OFFSET (d) - pos;
4766 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4768 FREE_VARIABLES ();
4769 return mcnt;
4772 /* Otherwise match next pattern command. */
4773 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4775 /* Ignore these. Used to ignore the n of succeed_n's which
4776 currently have n == 0. */
4777 case no_op:
4778 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4779 break;
4781 case succeed:
4782 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4783 goto succeed_label;
4785 /* Match the next n pattern characters exactly. The following
4786 byte in the pattern defines n, and the n bytes after that
4787 are the characters to match. */
4788 case exactn:
4789 mcnt = *p++;
4790 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4792 /* Remember the start point to rollback upon failure. */
4793 dfail = d;
4795 /* This is written out as an if-else so we don't waste time
4796 testing `translate' inside the loop. */
4797 if (RE_TRANSLATE_P (translate))
4799 #ifdef emacs
4800 if (multibyte)
4803 int pat_charlen, buf_charlen;
4804 unsigned int pat_ch, buf_ch;
4806 PREFETCH ();
4807 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4808 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4810 if (RE_TRANSLATE (translate, buf_ch)
4811 != pat_ch)
4813 d = dfail;
4814 goto fail;
4817 p += pat_charlen;
4818 d += buf_charlen;
4819 mcnt -= pat_charlen;
4821 while (mcnt > 0);
4822 else
4823 #endif /* not emacs */
4826 PREFETCH ();
4827 if (RE_TRANSLATE (translate, *d) != *p++)
4829 d = dfail;
4830 goto fail;
4832 d++;
4834 while (--mcnt);
4836 else
4840 PREFETCH ();
4841 if (*d++ != *p++)
4843 d = dfail;
4844 goto fail;
4847 while (--mcnt);
4849 break;
4852 /* Match any character except possibly a newline or a null. */
4853 case anychar:
4855 int buf_charlen;
4856 unsigned int buf_ch;
4858 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4860 PREFETCH ();
4862 #ifdef emacs
4863 if (multibyte)
4864 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4865 else
4866 #endif /* not emacs */
4868 buf_ch = *d;
4869 buf_charlen = 1;
4872 buf_ch = TRANSLATE (buf_ch);
4874 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4875 && buf_ch == '\n')
4876 || ((bufp->syntax & RE_DOT_NOT_NULL)
4877 && buf_ch == '\000'))
4878 goto fail;
4880 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4881 d += buf_charlen;
4883 break;
4886 case charset:
4887 case charset_not:
4889 register unsigned int c;
4890 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4891 int len;
4893 /* Start of actual range_table, or end of bitmap if there is no
4894 range table. */
4895 unsigned char *range_table;
4897 /* Nonzero if there is a range table. */
4898 int range_table_exists;
4900 /* Number of ranges of range table. This is not included
4901 in the initial byte-length of the command. */
4902 int count = 0;
4904 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4906 PREFETCH ();
4907 c = *d;
4909 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
4911 #ifdef emacs
4912 if (range_table_exists)
4914 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4915 EXTRACT_NUMBER_AND_INCR (count, range_table);
4918 if (multibyte && BASE_LEADING_CODE_P (c))
4919 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
4920 #endif /* emacs */
4922 if (SINGLE_BYTE_CHAR_P (c))
4923 { /* Lookup bitmap. */
4924 c = TRANSLATE (c); /* The character to match. */
4925 len = 1;
4927 /* Cast to `unsigned' instead of `unsigned char' in
4928 case the bit list is a full 32 bytes long. */
4929 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
4930 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4931 not = !not;
4933 #ifdef emacs
4934 else if (range_table_exists)
4936 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4938 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4939 | (class_bits & BIT_ALPHA && ISALPHA (c))
4940 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
4941 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4942 | (class_bits & BIT_LOWER && ISLOWER (c))
4943 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4944 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
4945 | (class_bits & BIT_PRINT && ISPRINT (c))
4946 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4947 | (class_bits & BIT_SPACE && ISSPACE (c))
4948 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
4949 | (class_bits & BIT_UPPER && ISUPPER (c))
4950 | (class_bits & BIT_WORD && ISWORD (c)))
4951 not = !not;
4952 else
4953 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4955 #endif /* emacs */
4957 if (range_table_exists)
4958 p = CHARSET_RANGE_TABLE_END (range_table, count);
4959 else
4960 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
4962 if (!not) goto fail;
4964 d += len;
4965 break;
4969 /* The beginning of a group is represented by start_memory.
4970 The argument is the register number. The text
4971 matched within the group is recorded (in the internal
4972 registers data structure) under the register number. */
4973 case start_memory:
4974 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
4976 /* In case we need to undo this operation (via backtracking). */
4977 PUSH_FAILURE_REG ((unsigned int)*p);
4979 regstart[*p] = d;
4980 regend[*p] = REG_UNSET_VALUE; /* probably unnecessary. -sm */
4981 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4983 /* Move past the register number and inner group count. */
4984 p += 1;
4985 break;
4988 /* The stop_memory opcode represents the end of a group. Its
4989 argument is the same as start_memory's: the register number. */
4990 case stop_memory:
4991 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
4993 assert (!REG_UNSET (regstart[*p]));
4994 /* Strictly speaking, there should be code such as:
4996 assert (REG_UNSET (regend[*p]));
4997 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
4999 But the only info to be pushed is regend[*p] and it is known to
5000 be UNSET, so there really isn't anything to push.
5001 Not pushing anything, on the other hand deprives us from the
5002 guarantee that regend[*p] is UNSET since undoing this operation
5003 will not reset its value properly. This is not important since
5004 the value will only be read on the next start_memory or at
5005 the very end and both events can only happen if this stop_memory
5006 is *not* undone. */
5008 regend[*p] = d;
5009 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5011 /* Move past the register number and the inner group count. */
5012 p += 1;
5013 break;
5016 /* \<digit> has been turned into a `duplicate' command which is
5017 followed by the numeric value of <digit> as the register number. */
5018 case duplicate:
5020 register re_char *d2, *dend2;
5021 int regno = *p++; /* Get which register to match against. */
5022 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5024 /* Can't back reference a group which we've never matched. */
5025 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5026 goto fail;
5028 /* Where in input to try to start matching. */
5029 d2 = regstart[regno];
5031 /* Remember the start point to rollback upon failure. */
5032 dfail = d;
5034 /* Where to stop matching; if both the place to start and
5035 the place to stop matching are in the same string, then
5036 set to the place to stop, otherwise, for now have to use
5037 the end of the first string. */
5039 dend2 = ((FIRST_STRING_P (regstart[regno])
5040 == FIRST_STRING_P (regend[regno]))
5041 ? regend[regno] : end_match_1);
5042 for (;;)
5044 /* If necessary, advance to next segment in register
5045 contents. */
5046 while (d2 == dend2)
5048 if (dend2 == end_match_2) break;
5049 if (dend2 == regend[regno]) break;
5051 /* End of string1 => advance to string2. */
5052 d2 = string2;
5053 dend2 = regend[regno];
5055 /* At end of register contents => success */
5056 if (d2 == dend2) break;
5058 /* If necessary, advance to next segment in data. */
5059 PREFETCH ();
5061 /* How many characters left in this segment to match. */
5062 mcnt = dend - d;
5064 /* Want how many consecutive characters we can match in
5065 one shot, so, if necessary, adjust the count. */
5066 if (mcnt > dend2 - d2)
5067 mcnt = dend2 - d2;
5069 /* Compare that many; failure if mismatch, else move
5070 past them. */
5071 if (RE_TRANSLATE_P (translate)
5072 ? bcmp_translate (d, d2, mcnt, translate)
5073 : bcmp (d, d2, mcnt))
5075 d = dfail;
5076 goto fail;
5078 d += mcnt, d2 += mcnt;
5081 break;
5084 /* begline matches the empty string at the beginning of the string
5085 (unless `not_bol' is set in `bufp'), and, if
5086 `newline_anchor' is set, after newlines. */
5087 case begline:
5088 DEBUG_PRINT1 ("EXECUTING begline.\n");
5090 if (AT_STRINGS_BEG (d))
5092 if (!bufp->not_bol) break;
5094 else if (d[-1] == '\n' && bufp->newline_anchor)
5096 break;
5098 /* In all other cases, we fail. */
5099 goto fail;
5102 /* endline is the dual of begline. */
5103 case endline:
5104 DEBUG_PRINT1 ("EXECUTING endline.\n");
5106 if (AT_STRINGS_END (d))
5108 if (!bufp->not_eol) break;
5111 /* We have to ``prefetch'' the next character. */
5112 else if ((d == end1 ? *string2 : *d) == '\n'
5113 && bufp->newline_anchor)
5115 break;
5117 goto fail;
5120 /* Match at the very beginning of the data. */
5121 case begbuf:
5122 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5123 if (AT_STRINGS_BEG (d))
5124 break;
5125 goto fail;
5128 /* Match at the very end of the data. */
5129 case endbuf:
5130 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5131 if (AT_STRINGS_END (d))
5132 break;
5133 goto fail;
5136 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5137 pushes NULL as the value for the string on the stack. Then
5138 `POP_FAILURE_POINT' will keep the current value for the
5139 string, instead of restoring it. To see why, consider
5140 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5141 then the . fails against the \n. But the next thing we want
5142 to do is match the \n against the \n; if we restored the
5143 string value, we would be back at the foo.
5145 Because this is used only in specific cases, we don't need to
5146 check all the things that `on_failure_jump' does, to make
5147 sure the right things get saved on the stack. Hence we don't
5148 share its code. The only reason to push anything on the
5149 stack at all is that otherwise we would have to change
5150 `anychar's code to do something besides goto fail in this
5151 case; that seems worse than this. */
5152 case on_failure_keep_string_jump:
5153 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5154 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5155 mcnt, p + mcnt);
5157 PUSH_FAILURE_POINT (p - 3, NULL);
5158 break;
5160 case on_failure_jump_exclusive:
5161 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5162 DEBUG_PRINT3 ("EXECUTING on_failure_jump_exclusive %d (to %p):\n",
5163 mcnt, p + mcnt);
5165 if (! FAIL_STACK_EMPTY ()
5166 && FAILURE_PAT (TOP_FAILURE_HANDLE ()) == (p - 3)
5167 && fail_stack.avail == fail_stack.frame)
5169 /* We are trying to push failure F2 onto the stack but there
5170 is already a failure F1 pushed from the same instruction.
5171 Between F1 and now, something has matched (else this is an
5172 improper use of on_failure_jump_exclusive), so that we know
5173 that the fail-destination of F1 cannot match, hence we can
5174 pop F1 before pushing F2. Instead of doing this pop/push,
5175 we manually turn F1 into F2.
5176 `fail_stack.avail == fail_stack.frame' makes sure
5177 that popping F1 doesn't involve registers, else
5178 this optimization cannot be done so trivially. */
5179 assert (FAILURE_STR (TOP_FAILURE_HANDLE ()) != d);
5180 FAILURE_STR (TOP_FAILURE_HANDLE ()) = d;
5182 else
5183 PUSH_FAILURE_POINT (p - 3, d);
5184 break;
5186 case on_failure_jump_loop:
5187 on_failure:
5188 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5189 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5190 mcnt, p + mcnt);
5192 CHECK_INFINITE_LOOP (p - 3, d);
5193 PUSH_FAILURE_POINT (p - 3, d);
5194 break;
5197 /* Uses of on_failure_jump:
5199 Each alternative starts with an on_failure_jump that points
5200 to the beginning of the next alternative. Each alternative
5201 except the last ends with a jump that in effect jumps past
5202 the rest of the alternatives. (They really jump to the
5203 ending jump of the following alternative, because tensioning
5204 these jumps is a hassle.)
5206 Repeats start with an on_failure_jump that points past both
5207 the repetition text and either the following jump or
5208 pop_failure_jump back to this on_failure_jump. */
5209 case on_failure_jump:
5210 QUIT;
5211 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5212 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5213 mcnt, p + mcnt);
5215 PUSH_FAILURE_POINT (p -3, d);
5216 break;
5218 /* This operation is used for greedy * and +.
5219 Compare the beginning of the repeat with what in the
5220 pattern follows its end. If we can establish that there
5221 is nothing that they would both match, i.e., that we
5222 would have to backtrack because of (as in, e.g., `a*a')
5223 then we can use a non-backtracking loop based on
5224 on_failure_jump_exclusive instead of on_failure_jump_loop. */
5225 case on_failure_jump_smart:
5226 QUIT;
5227 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5228 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5229 mcnt, p + mcnt);
5231 unsigned char *p1 = p; /* Next operation. */
5232 unsigned char *p2 = p + mcnt; /* Destination of the jump. */
5234 p -= 3; /* Reset so that we will re-execute the
5235 instruction once it's been changed. */
5237 DEBUG_STATEMENT (debug += 2);
5238 if (mutually_exclusive_p (bufp, p1, p2))
5240 /* Use a fast `on_failure_keep_string_jump' loop. */
5241 *p = (unsigned char) on_failure_jump_exclusive;
5242 /* STORE_NUMBER (p2 - 2, mcnt + 3); */
5244 else
5246 /* Default to a safe `on_failure_jump' loop. */
5247 DEBUG_PRINT1 (" smart default => slow loop.\n");
5248 *p = (unsigned char) on_failure_jump_loop;
5250 DEBUG_STATEMENT (debug -= 2);
5252 break;
5254 /* Unconditionally jump (without popping any failure points). */
5255 case jump:
5256 unconditional_jump:
5257 QUIT;
5258 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5259 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5260 p += mcnt; /* Do the jump. */
5261 DEBUG_PRINT2 ("(to %p).\n", p);
5262 break;
5265 /* Have to succeed matching what follows at least n times.
5266 After that, handle like `on_failure_jump'. */
5267 case succeed_n:
5268 EXTRACT_NUMBER (mcnt, p + 2);
5269 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5271 assert (mcnt >= 0);
5272 /* Originally, this is how many times we HAVE to succeed. */
5273 if (mcnt > 0)
5275 mcnt--;
5276 p += 2;
5277 STORE_NUMBER_AND_INCR (p, mcnt);
5278 DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt);
5280 else if (mcnt == 0)
5282 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
5283 p[2] = (unsigned char) no_op;
5284 p[3] = (unsigned char) no_op;
5285 goto on_failure;
5287 break;
5289 case jump_n:
5290 EXTRACT_NUMBER (mcnt, p + 2);
5291 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5293 /* Originally, this is how many times we CAN jump. */
5294 if (mcnt)
5296 mcnt--;
5297 STORE_NUMBER (p + 2, mcnt);
5298 goto unconditional_jump;
5300 /* If don't have to jump any more, skip over the rest of command. */
5301 else
5302 p += 4;
5303 break;
5305 case set_number_at:
5307 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5309 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5310 p1 = p + mcnt;
5311 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5312 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
5313 STORE_NUMBER (p1, mcnt);
5314 break;
5317 case wordbound:
5318 case notwordbound:
5319 not = (re_opcode_t) *(p - 1) == notwordbound;
5320 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
5322 /* We SUCCEED (or FAIL) in one of the following cases: */
5324 /* Case 1: D is at the beginning or the end of string. */
5325 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5326 not = !not;
5327 else
5329 /* C1 is the character before D, S1 is the syntax of C1, C2
5330 is the character at D, and S2 is the syntax of C2. */
5331 int c1, c2, s1, s2;
5332 #ifdef emacs
5333 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d - 1));
5334 UPDATE_SYNTAX_TABLE (charpos);
5335 #endif
5336 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5337 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5338 s1 = SYNTAX (c1);
5339 #ifdef emacs
5340 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5341 #endif
5342 PREFETCH ();
5343 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5344 c2 = STRING_CHAR (d, dend - d);
5345 s2 = SYNTAX (c2);
5347 if (/* Case 2: Only one of S1 and S2 is Sword. */
5348 ((s1 == Sword) != (s2 == Sword))
5349 /* Case 3: Both of S1 and S2 are Sword, and macro
5350 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5351 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5352 not = !not;
5354 if (not)
5355 break;
5356 else
5357 goto fail;
5359 case wordbeg:
5360 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5362 /* We FAIL in one of the following cases: */
5364 /* Case 1: D is at the end of string. */
5365 if (AT_STRINGS_END (d))
5366 goto fail;
5367 else
5369 /* C1 is the character before D, S1 is the syntax of C1, C2
5370 is the character at D, and S2 is the syntax of C2. */
5371 int c1, c2, s1, s2;
5372 #ifdef emacs
5373 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5374 UPDATE_SYNTAX_TABLE (charpos);
5375 #endif
5376 PREFETCH ();
5377 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5378 c2 = STRING_CHAR (d, dend - d);
5379 s2 = SYNTAX (c2);
5381 /* Case 2: S2 is not Sword. */
5382 if (s2 != Sword)
5383 goto fail;
5385 /* Case 3: D is not at the beginning of string ... */
5386 if (!AT_STRINGS_BEG (d))
5388 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5389 #ifdef emacs
5390 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5391 #endif
5392 s1 = SYNTAX (c1);
5394 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5395 returns 0. */
5396 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5397 goto fail;
5400 break;
5402 case wordend:
5403 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5405 /* We FAIL in one of the following cases: */
5407 /* Case 1: D is at the beginning of string. */
5408 if (AT_STRINGS_BEG (d))
5409 goto fail;
5410 else
5412 /* C1 is the character before D, S1 is the syntax of C1, C2
5413 is the character at D, and S2 is the syntax of C2. */
5414 int c1, c2, s1, s2;
5415 #ifdef emacs
5416 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d) - 1);
5417 UPDATE_SYNTAX_TABLE (charpos);
5418 #endif
5419 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5420 s1 = SYNTAX (c1);
5422 /* Case 2: S1 is not Sword. */
5423 if (s1 != Sword)
5424 goto fail;
5426 /* Case 3: D is not at the end of string ... */
5427 if (!AT_STRINGS_END (d))
5429 PREFETCH ();
5430 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5431 c2 = STRING_CHAR (d, dend - d);
5432 #ifdef emacs
5433 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5434 #endif
5435 s2 = SYNTAX (c2);
5437 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
5438 returns 0. */
5439 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5440 goto fail;
5443 break;
5445 #ifdef emacs
5446 case before_dot:
5447 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5448 if (PTR_BYTE_POS (d) >= PT_BYTE)
5449 goto fail;
5450 break;
5452 case at_dot:
5453 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5454 if (PTR_BYTE_POS (d) != PT_BYTE)
5455 goto fail;
5456 break;
5458 case after_dot:
5459 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5460 if (PTR_BYTE_POS (d) <= PT_BYTE)
5461 goto fail;
5462 break;
5464 case syntaxspec:
5465 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5466 mcnt = *p++;
5467 goto matchsyntax;
5469 case wordchar:
5470 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5471 mcnt = (int) Sword;
5472 matchsyntax:
5473 PREFETCH ();
5474 #ifdef emacs
5476 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5477 UPDATE_SYNTAX_TABLE (pos1);
5479 #endif
5481 int c, len;
5483 if (multibyte)
5484 /* we must concern about multibyte form, ... */
5485 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5486 else
5487 /* everything should be handled as ASCII, even though it
5488 looks like multibyte form. */
5489 c = *d, len = 1;
5491 if (SYNTAX (c) != (enum syntaxcode) mcnt)
5492 goto fail;
5493 d += len;
5495 break;
5497 case notsyntaxspec:
5498 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5499 mcnt = *p++;
5500 goto matchnotsyntax;
5502 case notwordchar:
5503 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5504 mcnt = (int) Sword;
5505 matchnotsyntax:
5506 PREFETCH ();
5507 #ifdef emacs
5509 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5510 UPDATE_SYNTAX_TABLE (pos1);
5512 #endif
5514 int c, len;
5516 if (multibyte)
5517 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5518 else
5519 c = *d, len = 1;
5521 if (SYNTAX (c) == (enum syntaxcode) mcnt)
5522 goto fail;
5523 d += len;
5525 break;
5527 case categoryspec:
5528 DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p);
5529 mcnt = *p++;
5530 PREFETCH ();
5532 int c, len;
5534 if (multibyte)
5535 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5536 else
5537 c = *d, len = 1;
5539 if (!CHAR_HAS_CATEGORY (c, mcnt))
5540 goto fail;
5541 d += len;
5543 break;
5545 case notcategoryspec:
5546 DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p);
5547 mcnt = *p++;
5548 PREFETCH ();
5550 int c, len;
5552 if (multibyte)
5553 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5554 else
5555 c = *d, len = 1;
5557 if (CHAR_HAS_CATEGORY (c, mcnt))
5558 goto fail;
5559 d += len;
5561 break;
5563 #else /* not emacs */
5564 case wordchar:
5565 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5566 PREFETCH ();
5567 if (!WORDCHAR_P (d))
5568 goto fail;
5569 d++;
5570 break;
5572 case notwordchar:
5573 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5574 PREFETCH ();
5575 if (WORDCHAR_P (d))
5576 goto fail;
5577 d++;
5578 break;
5579 #endif /* not emacs */
5581 default:
5582 abort ();
5584 continue; /* Successfully executed one pattern command; keep going. */
5587 /* We goto here if a matching operation fails. */
5588 fail:
5589 QUIT;
5590 if (!FAIL_STACK_EMPTY ())
5592 re_char *str;
5593 unsigned char *pat;
5594 /* A restart point is known. Restore to that state. */
5595 DEBUG_PRINT1 ("\nFAIL:\n");
5596 POP_FAILURE_POINT (str, pat);
5597 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
5599 case on_failure_keep_string_jump:
5600 assert (str == NULL);
5601 goto continue_failure_jump;
5603 case on_failure_jump_exclusive:
5604 /* If something has matched, the alternative will not match,
5605 so we might as well keep popping right away. */
5606 if (0 /* d != str && d != string2 */) /* Don't bother. -sm */
5607 /* (d == string2 && str == end1) => (d =~ str) */
5608 goto fail;
5609 /* Fallthrough */
5611 case on_failure_jump_loop:
5612 case on_failure_jump:
5613 case succeed_n:
5614 d = str;
5615 continue_failure_jump:
5616 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5617 p = pat + mcnt;
5618 break;
5620 default:
5621 abort();
5624 assert (p >= bufp->buffer && p <= pend);
5626 if (d >= string1 && d <= end1)
5627 dend = end_match_1;
5629 else
5630 break; /* Matching at this starting point really fails. */
5631 } /* for (;;) */
5633 if (best_regs_set)
5634 goto restore_best_regs;
5636 FREE_VARIABLES ();
5638 return -1; /* Failure to match. */
5639 } /* re_match_2 */
5641 /* Subroutine definitions for re_match_2. */
5643 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5644 bytes; nonzero otherwise. */
5646 static int
5647 bcmp_translate (s1, s2, len, translate)
5648 unsigned char *s1, *s2;
5649 register int len;
5650 RE_TRANSLATE_TYPE translate;
5652 register unsigned char *p1 = s1, *p2 = s2;
5653 unsigned char *p1_end = s1 + len;
5654 unsigned char *p2_end = s2 + len;
5656 while (p1 != p1_end && p2 != p2_end)
5658 int p1_charlen, p2_charlen;
5659 int p1_ch, p2_ch;
5661 /* FIXME: This assumes `multibyte = true'. */
5662 p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5663 p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
5665 if (RE_TRANSLATE (translate, p1_ch)
5666 != RE_TRANSLATE (translate, p2_ch))
5667 return 1;
5669 p1 += p1_charlen, p2 += p2_charlen;
5672 if (p1 != p1_end || p2 != p2_end)
5673 return 1;
5675 return 0;
5678 /* Entry points for GNU code. */
5680 /* re_compile_pattern is the GNU regular expression compiler: it
5681 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5682 Returns 0 if the pattern was valid, otherwise an error string.
5684 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5685 are set in BUFP on entry.
5687 We call regex_compile to do the actual compilation. */
5689 const char *
5690 re_compile_pattern (pattern, length, bufp)
5691 const char *pattern;
5692 int length;
5693 struct re_pattern_buffer *bufp;
5695 reg_errcode_t ret;
5697 /* GNU code is written to assume at least RE_NREGS registers will be set
5698 (and at least one extra will be -1). */
5699 bufp->regs_allocated = REGS_UNALLOCATED;
5701 /* And GNU code determines whether or not to get register information
5702 by passing null for the REGS argument to re_match, etc., not by
5703 setting no_sub. */
5704 bufp->no_sub = 0;
5706 /* Match anchors at newline. */
5707 bufp->newline_anchor = 1;
5709 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5711 if (!ret)
5712 return NULL;
5713 return gettext (re_error_msgid[(int) ret]);
5716 /* Entry points compatible with 4.2 BSD regex library. We don't define
5717 them unless specifically requested. */
5719 #if defined (_REGEX_RE_COMP) || defined (_LIBC)
5721 /* BSD has one and only one pattern buffer. */
5722 static struct re_pattern_buffer re_comp_buf;
5724 char *
5725 #ifdef _LIBC
5726 /* Make these definitions weak in libc, so POSIX programs can redefine
5727 these names if they don't use our functions, and still use
5728 regcomp/regexec below without link errors. */
5729 weak_function
5730 #endif
5731 re_comp (s)
5732 const char *s;
5734 reg_errcode_t ret;
5736 if (!s)
5738 if (!re_comp_buf.buffer)
5739 return gettext ("No previous regular expression");
5740 return 0;
5743 if (!re_comp_buf.buffer)
5745 re_comp_buf.buffer = (unsigned char *) malloc (200);
5746 if (re_comp_buf.buffer == NULL)
5747 return gettext (re_error_msgid[(int) REG_ESPACE]);
5748 re_comp_buf.allocated = 200;
5750 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5751 if (re_comp_buf.fastmap == NULL)
5752 return gettext (re_error_msgid[(int) REG_ESPACE]);
5755 /* Since `re_exec' always passes NULL for the `regs' argument, we
5756 don't need to initialize the pattern buffer fields which affect it. */
5758 /* Match anchors at newlines. */
5759 re_comp_buf.newline_anchor = 1;
5761 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5763 if (!ret)
5764 return NULL;
5766 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5767 return (char *) gettext (re_error_msgid[(int) ret]);
5772 #ifdef _LIBC
5773 weak_function
5774 #endif
5775 re_exec (s)
5776 const char *s;
5778 const int len = strlen (s);
5779 return
5780 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5782 #endif /* _REGEX_RE_COMP */
5784 /* POSIX.2 functions. Don't define these for Emacs. */
5786 #ifndef emacs
5788 /* regcomp takes a regular expression as a string and compiles it.
5790 PREG is a regex_t *. We do not expect any fields to be initialized,
5791 since POSIX says we shouldn't. Thus, we set
5793 `buffer' to the compiled pattern;
5794 `used' to the length of the compiled pattern;
5795 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5796 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5797 RE_SYNTAX_POSIX_BASIC;
5798 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5799 `fastmap' and `fastmap_accurate' to zero;
5800 `re_nsub' to the number of subexpressions in PATTERN.
5802 PATTERN is the address of the pattern string.
5804 CFLAGS is a series of bits which affect compilation.
5806 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5807 use POSIX basic syntax.
5809 If REG_NEWLINE is set, then . and [^...] don't match newline.
5810 Also, regexec will try a match beginning after every newline.
5812 If REG_ICASE is set, then we considers upper- and lowercase
5813 versions of letters to be equivalent when matching.
5815 If REG_NOSUB is set, then when PREG is passed to regexec, that
5816 routine will report only success or failure, and nothing about the
5817 registers.
5819 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5820 the return codes and their meanings.) */
5823 regcomp (preg, pattern, cflags)
5824 regex_t *preg;
5825 const char *pattern;
5826 int cflags;
5828 reg_errcode_t ret;
5829 unsigned syntax
5830 = (cflags & REG_EXTENDED) ?
5831 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5833 /* regex_compile will allocate the space for the compiled pattern. */
5834 preg->buffer = 0;
5835 preg->allocated = 0;
5836 preg->used = 0;
5838 /* Don't bother to use a fastmap when searching. This simplifies the
5839 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5840 characters after newlines into the fastmap. This way, we just try
5841 every character. */
5842 preg->fastmap = 0;
5844 if (cflags & REG_ICASE)
5846 unsigned i;
5848 preg->translate
5849 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5850 * sizeof (*(RE_TRANSLATE_TYPE)0));
5851 if (preg->translate == NULL)
5852 return (int) REG_ESPACE;
5854 /* Map uppercase characters to corresponding lowercase ones. */
5855 for (i = 0; i < CHAR_SET_SIZE; i++)
5856 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
5858 else
5859 preg->translate = NULL;
5861 /* If REG_NEWLINE is set, newlines are treated differently. */
5862 if (cflags & REG_NEWLINE)
5863 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5864 syntax &= ~RE_DOT_NEWLINE;
5865 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5866 /* It also changes the matching behavior. */
5867 preg->newline_anchor = 1;
5869 else
5870 preg->newline_anchor = 0;
5872 preg->no_sub = !!(cflags & REG_NOSUB);
5874 /* POSIX says a null character in the pattern terminates it, so we
5875 can use strlen here in compiling the pattern. */
5876 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5878 /* POSIX doesn't distinguish between an unmatched open-group and an
5879 unmatched close-group: both are REG_EPAREN. */
5880 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5882 return (int) ret;
5886 /* regexec searches for a given pattern, specified by PREG, in the
5887 string STRING.
5889 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5890 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5891 least NMATCH elements, and we set them to the offsets of the
5892 corresponding matched substrings.
5894 EFLAGS specifies `execution flags' which affect matching: if
5895 REG_NOTBOL is set, then ^ does not match at the beginning of the
5896 string; if REG_NOTEOL is set, then $ does not match at the end.
5898 We return 0 if we find a match and REG_NOMATCH if not. */
5901 regexec (preg, string, nmatch, pmatch, eflags)
5902 const regex_t *preg;
5903 const char *string;
5904 size_t nmatch;
5905 regmatch_t pmatch[];
5906 int eflags;
5908 int ret;
5909 struct re_registers regs;
5910 regex_t private_preg;
5911 int len = strlen (string);
5912 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5914 private_preg = *preg;
5916 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5917 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5919 /* The user has told us exactly how many registers to return
5920 information about, via `nmatch'. We have to pass that on to the
5921 matching routines. */
5922 private_preg.regs_allocated = REGS_FIXED;
5924 if (want_reg_info)
5926 regs.num_regs = nmatch;
5927 regs.start = TALLOC (nmatch, regoff_t);
5928 regs.end = TALLOC (nmatch, regoff_t);
5929 if (regs.start == NULL || regs.end == NULL)
5930 return (int) REG_NOMATCH;
5933 /* Perform the searching operation. */
5934 ret = re_search (&private_preg, string, len,
5935 /* start: */ 0, /* range: */ len,
5936 want_reg_info ? &regs : (struct re_registers *) 0);
5938 /* Copy the register information to the POSIX structure. */
5939 if (want_reg_info)
5941 if (ret >= 0)
5943 unsigned r;
5945 for (r = 0; r < nmatch; r++)
5947 pmatch[r].rm_so = regs.start[r];
5948 pmatch[r].rm_eo = regs.end[r];
5952 /* If we needed the temporary register info, free the space now. */
5953 free (regs.start);
5954 free (regs.end);
5957 /* We want zero return to mean success, unlike `re_search'. */
5958 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5962 /* Returns a message corresponding to an error code, ERRCODE, returned
5963 from either regcomp or regexec. We don't use PREG here. */
5965 size_t
5966 regerror (errcode, preg, errbuf, errbuf_size)
5967 int errcode;
5968 const regex_t *preg;
5969 char *errbuf;
5970 size_t errbuf_size;
5972 const char *msg;
5973 size_t msg_size;
5975 if (errcode < 0
5976 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5977 /* Only error codes returned by the rest of the code should be passed
5978 to this routine. If we are given anything else, or if other regex
5979 code generates an invalid error code, then the program has a bug.
5980 Dump core so we can fix it. */
5981 abort ();
5983 msg = gettext (re_error_msgid[errcode]);
5985 msg_size = strlen (msg) + 1; /* Includes the null. */
5987 if (errbuf_size != 0)
5989 if (msg_size > errbuf_size)
5991 strncpy (errbuf, msg, errbuf_size - 1);
5992 errbuf[errbuf_size - 1] = 0;
5994 else
5995 strcpy (errbuf, msg);
5998 return msg_size;
6002 /* Free dynamically allocated space used by PREG. */
6004 void
6005 regfree (preg)
6006 regex_t *preg;
6008 if (preg->buffer != NULL)
6009 free (preg->buffer);
6010 preg->buffer = NULL;
6012 preg->allocated = 0;
6013 preg->used = 0;
6015 if (preg->fastmap != NULL)
6016 free (preg->fastmap);
6017 preg->fastmap = NULL;
6018 preg->fastmap_accurate = 0;
6020 if (preg->translate != NULL)
6021 free (preg->translate);
6022 preg->translate = NULL;
6025 #endif /* not emacs */