* keyboard.c (Qratio): New symbol.
[emacs.git] / src / regex.c
blob3cf8a139a466d55c037d2f5724bd616939581079
1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
3 internationalization features.)
5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA. */
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
27 #undef _GNU_SOURCE
28 #define _GNU_SOURCE
30 #ifdef emacs
31 /* Converts the pointer to the char to BEG-based offset from the start. */
32 #define PTR_TO_OFFSET(d) \
33 POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \
34 ? (d) - string1 : (d) - (string2 - size1))
35 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
36 #else
37 #define PTR_TO_OFFSET(d) 0
38 #endif
40 #ifdef HAVE_CONFIG_H
41 #include <config.h>
42 #endif
44 /* We need this for `regex.h', and perhaps for the Emacs include files. */
45 #include <sys/types.h>
47 /* This is for other GNU distributions with internationalized messages. */
48 #if HAVE_LIBINTL_H || defined (_LIBC)
49 # include <libintl.h>
50 #else
51 # define gettext(msgid) (msgid)
52 #endif
54 #ifndef gettext_noop
55 /* This define is so xgettext can find the internationalizable
56 strings. */
57 #define gettext_noop(String) String
58 #endif
60 /* The `emacs' switch turns on certain matching commands
61 that make sense only in Emacs. */
62 #ifdef emacs
64 #include "lisp.h"
65 #include "buffer.h"
67 /* Make syntax table lookup grant data in gl_state. */
68 #define SYNTAX_ENTRY_VIA_PROPERTY
70 #include "syntax.h"
71 #include "charset.h"
72 #include "category.h"
74 #define malloc xmalloc
75 #define realloc xrealloc
76 #define free xfree
78 #else /* not emacs */
80 /* If we are not linking with Emacs proper,
81 we can't use the relocating allocator
82 even if config.h says that we can. */
83 #undef REL_ALLOC
85 #if defined (STDC_HEADERS) || defined (_LIBC)
86 #include <stdlib.h>
87 #else
88 char *malloc ();
89 char *realloc ();
90 #endif
92 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
93 If nothing else has been done, use the method below. */
94 #ifdef INHIBIT_STRING_HEADER
95 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
96 #if !defined (bzero) && !defined (bcopy)
97 #undef INHIBIT_STRING_HEADER
98 #endif
99 #endif
100 #endif
102 /* This is the normal way of making sure we have a bcopy and a bzero.
103 This is used in most programs--a few other programs avoid this
104 by defining INHIBIT_STRING_HEADER. */
105 #ifndef INHIBIT_STRING_HEADER
106 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
107 #include <string.h>
108 #ifndef bcmp
109 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
110 #endif
111 #ifndef bcopy
112 #define bcopy(s, d, n) memcpy ((d), (s), (n))
113 #endif
114 #ifndef bzero
115 #define bzero(s, n) memset ((s), 0, (n))
116 #endif
117 #else
118 #include <strings.h>
119 #endif
120 #endif
122 /* Define the syntax stuff for \<, \>, etc. */
124 /* This must be nonzero for the wordchar and notwordchar pattern
125 commands in re_match_2. */
126 #ifndef Sword
127 #define Sword 1
128 #endif
130 #ifdef SWITCH_ENUM_BUG
131 #define SWITCH_ENUM_CAST(x) ((int)(x))
132 #else
133 #define SWITCH_ENUM_CAST(x) (x)
134 #endif
136 #ifdef SYNTAX_TABLE
138 extern char *re_syntax_table;
140 #else /* not SYNTAX_TABLE */
142 /* How many characters in the character set. */
143 #define CHAR_SET_SIZE 256
145 static char re_syntax_table[CHAR_SET_SIZE];
147 static void
148 init_syntax_once ()
150 register int c;
151 static int done = 0;
153 if (done)
154 return;
156 bzero (re_syntax_table, sizeof re_syntax_table);
158 for (c = 'a'; c <= 'z'; c++)
159 re_syntax_table[c] = Sword;
161 for (c = 'A'; c <= 'Z'; c++)
162 re_syntax_table[c] = Sword;
164 for (c = '0'; c <= '9'; c++)
165 re_syntax_table[c] = Sword;
167 re_syntax_table['_'] = Sword;
169 done = 1;
172 #endif /* not SYNTAX_TABLE */
174 #define SYNTAX(c) re_syntax_table[c]
176 /* Dummy macros for non-Emacs environments. */
177 #define BASE_LEADING_CODE_P(c) (0)
178 #define WORD_BOUNDARY_P(c1, c2) (0)
179 #define CHAR_HEAD_P(p) (1)
180 #define SINGLE_BYTE_CHAR_P(c) (1)
181 #define SAME_CHARSET_P(c1, c2) (1)
182 #define MULTIBYTE_FORM_LENGTH(p, s) (1)
183 #define STRING_CHAR(p, s) (*(p))
184 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
185 #define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
186 (c = ((p) == (end1) ? *(str2) : *(p)))
187 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
188 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
189 #endif /* not emacs */
191 /* Get the interface, including the syntax bits. */
192 #include "regex.h"
194 /* isalpha etc. are used for the character classes. */
195 #include <ctype.h>
197 #ifdef emacs
199 /* 1 if C is an ASCII character. */
200 #define IS_REAL_ASCII(c) ((c) < 0200)
202 /* 1 if C is a unibyte character. */
203 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
205 /* The Emacs definitions should not be directly affected by locales. */
207 /* In Emacs, these are only used for single-byte characters. */
208 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
209 #define ISCNTRL(c) ((c) < ' ')
210 #define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
211 || ((c) >= 'a' && (c) <= 'f') \
212 || ((c) >= 'A' && (c) <= 'F'))
214 /* This is only used for single-byte characters. */
215 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
217 /* The rest must handle multibyte characters. */
219 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
220 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
221 : 1)
223 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
224 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
225 : 1)
227 #define ISALNUM(c) (IS_REAL_ASCII (c) \
228 ? (((c) >= 'a' && (c) <= 'z') \
229 || ((c) >= 'A' && (c) <= 'Z') \
230 || ((c) >= '0' && (c) <= '9')) \
231 : SYNTAX (c) == Sword)
233 #define ISALPHA(c) (IS_REAL_ASCII (c) \
234 ? (((c) >= 'a' && (c) <= 'z') \
235 || ((c) >= 'A' && (c) <= 'Z')) \
236 : SYNTAX (c) == Sword)
238 #define ISLOWER(c) (LOWERCASEP (c))
240 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
241 ? ((c) > ' ' && (c) < 0177 \
242 && !(((c) >= 'a' && (c) <= 'z') \
243 || ((c) >= 'A' && (c) <= 'Z') \
244 || ((c) >= '0' && (c) <= '9'))) \
245 : SYNTAX (c) != Sword)
247 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
249 #define ISUPPER(c) (UPPERCASEP (c))
251 #define ISWORD(c) (SYNTAX (c) == Sword)
253 #else /* not emacs */
255 /* Jim Meyering writes:
257 "... Some ctype macros are valid only for character codes that
258 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
259 using /bin/cc or gcc but without giving an ansi option). So, all
260 ctype uses should be through macros like ISPRINT... If
261 STDC_HEADERS is defined, then autoconf has verified that the ctype
262 macros don't need to be guarded with references to isascii. ...
263 Defining isascii to 1 should let any compiler worth its salt
264 eliminate the && through constant folding." */
266 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
267 #define ISASCII(c) 1
268 #else
269 #define ISASCII(c) isascii(c)
270 #endif
272 /* 1 if C is an ASCII character. */
273 #define IS_REAL_ASCII(c) ((c) < 0200)
275 /* This distinction is not meaningful, except in Emacs. */
276 #define ISUNIBYTE(c) 1
278 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
279 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
280 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
282 #ifdef isblank
283 #define ISBLANK(c) (ISASCII (c) && isblank (c))
284 #else
285 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
286 #endif
287 #ifdef isgraph
288 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
289 #else
290 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
291 #endif
293 #define ISPRINT(c) (ISASCII (c) && isprint (c))
294 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
295 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
296 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
297 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
298 #define ISLOWER(c) (ISASCII (c) && islower (c))
299 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
300 #define ISSPACE(c) (ISASCII (c) && isspace (c))
301 #define ISUPPER(c) (ISASCII (c) && isupper (c))
302 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
304 #define ISWORD(c) ISALPHA(c)
306 #endif /* not emacs */
308 #ifndef NULL
309 #define NULL (void *)0
310 #endif
312 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
313 since ours (we hope) works properly with all combinations of
314 machines, compilers, `char' and `unsigned char' argument types.
315 (Per Bothner suggested the basic approach.) */
316 #undef SIGN_EXTEND_CHAR
317 #if __STDC__
318 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
319 #else /* not __STDC__ */
320 /* As in Harbison and Steele. */
321 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
322 #endif
324 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
325 use `alloca' instead of `malloc'. This is because using malloc in
326 re_search* or re_match* could cause memory leaks when C-g is used in
327 Emacs; also, malloc is slower and causes storage fragmentation. On
328 the other hand, malloc is more portable, and easier to debug.
330 Because we sometimes use alloca, some routines have to be macros,
331 not functions -- `alloca'-allocated space disappears at the end of the
332 function it is called in. */
334 #ifdef REGEX_MALLOC
336 #define REGEX_ALLOCATE malloc
337 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
338 #define REGEX_FREE free
340 #else /* not REGEX_MALLOC */
342 /* Emacs already defines alloca, sometimes. */
343 #ifndef alloca
345 /* Make alloca work the best possible way. */
346 #ifdef __GNUC__
347 #define alloca __builtin_alloca
348 #else /* not __GNUC__ */
349 #if HAVE_ALLOCA_H
350 #include <alloca.h>
351 #else /* not __GNUC__ or HAVE_ALLOCA_H */
352 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */
353 #ifndef _AIX /* Already did AIX, up at the top. */
354 char *alloca ();
355 #endif /* not _AIX */
356 #endif
357 #endif /* not HAVE_ALLOCA_H */
358 #endif /* not __GNUC__ */
360 #endif /* not alloca */
362 #define REGEX_ALLOCATE alloca
364 /* Assumes a `char *destination' variable. */
365 #define REGEX_REALLOCATE(source, osize, nsize) \
366 (destination = (char *) alloca (nsize), \
367 bcopy (source, destination, osize), \
368 destination)
370 /* No need to do anything to free, after alloca. */
371 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
373 #endif /* not REGEX_MALLOC */
375 /* Define how to allocate the failure stack. */
377 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
379 #define REGEX_ALLOCATE_STACK(size) \
380 r_alloc (&failure_stack_ptr, (size))
381 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
382 r_re_alloc (&failure_stack_ptr, (nsize))
383 #define REGEX_FREE_STACK(ptr) \
384 r_alloc_free (&failure_stack_ptr)
386 #else /* not using relocating allocator */
388 #ifdef REGEX_MALLOC
390 #define REGEX_ALLOCATE_STACK malloc
391 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
392 #define REGEX_FREE_STACK free
394 #else /* not REGEX_MALLOC */
396 #define REGEX_ALLOCATE_STACK alloca
398 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
399 REGEX_REALLOCATE (source, osize, nsize)
400 /* No need to explicitly free anything. */
401 #define REGEX_FREE_STACK(arg)
403 #endif /* not REGEX_MALLOC */
404 #endif /* not using relocating allocator */
407 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
408 `string1' or just past its end. This works if PTR is NULL, which is
409 a good thing. */
410 #define FIRST_STRING_P(ptr) \
411 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
413 /* (Re)Allocate N items of type T using malloc, or fail. */
414 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
415 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t) \
417 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
418 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
420 #define BYTEWIDTH 8 /* In bits. */
422 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
424 #undef MAX
425 #undef MIN
426 #define MAX(a, b) ((a) > (b) ? (a) : (b))
427 #define MIN(a, b) ((a) < (b) ? (a) : (b))
429 typedef char boolean;
430 #define false 0
431 #define true 1
433 static int re_match_2_internal ();
435 /* These are the command codes that appear in compiled regular
436 expressions. Some opcodes are followed by argument bytes. A
437 command code can specify any interpretation whatsoever for its
438 arguments. Zero bytes may appear in the compiled regular expression. */
440 typedef enum
442 no_op = 0,
444 /* Succeed right away--no more backtracking. */
445 succeed,
447 /* Followed by one byte giving n, then by n literal bytes. */
448 exactn,
450 /* Matches any (more or less) character. */
451 anychar,
453 /* Matches any one char belonging to specified set. First
454 following byte is number of bitmap bytes. Then come bytes
455 for a bitmap saying which chars are in. Bits in each byte
456 are ordered low-bit-first. A character is in the set if its
457 bit is 1. A character too large to have a bit in the map is
458 automatically not in the set.
460 If the length byte has the 0x80 bit set, then that stuff
461 is followed by a range table:
462 2 bytes of flags for character sets (low 8 bits, high 8 bits)
463 See RANGE_TABLE_WORK_BITS below.
464 2 bytes, the number of pairs that follow
465 pairs, each 2 multibyte characters,
466 each multibyte character represented as 3 bytes. */
467 charset,
469 /* Same parameters as charset, but match any character that is
470 not one of those specified. */
471 charset_not,
473 /* Start remembering the text that is matched, for storing in a
474 register. Followed by one byte with the register number, in
475 the range 0 to one less than the pattern buffer's re_nsub
476 field. Then followed by one byte with the number of groups
477 inner to this one. (This last has to be part of the
478 start_memory only because we need it in the on_failure_jump
479 of re_match_2.) */
480 start_memory,
482 /* Stop remembering the text that is matched and store it in a
483 memory register. Followed by one byte with the register
484 number, in the range 0 to one less than `re_nsub' in the
485 pattern buffer, and one byte with the number of inner groups,
486 just like `start_memory'. (We need the number of inner
487 groups here because we don't have any easy way of finding the
488 corresponding start_memory when we're at a stop_memory.) */
489 stop_memory,
491 /* Match a duplicate of something remembered. Followed by one
492 byte containing the register number. */
493 duplicate,
495 /* Fail unless at beginning of line. */
496 begline,
498 /* Fail unless at end of line. */
499 endline,
501 /* Succeeds if at beginning of buffer (if emacs) or at beginning
502 of string to be matched (if not). */
503 begbuf,
505 /* Analogously, for end of buffer/string. */
506 endbuf,
508 /* Followed by two byte relative address to which to jump. */
509 jump,
511 /* Same as jump, but marks the end of an alternative. */
512 jump_past_alt,
514 /* Followed by two-byte relative address of place to resume at
515 in case of failure. */
516 on_failure_jump,
518 /* Like on_failure_jump, but pushes a placeholder instead of the
519 current string position when executed. */
520 on_failure_keep_string_jump,
522 /* Throw away latest failure point and then jump to following
523 two-byte relative address. */
524 pop_failure_jump,
526 /* Change to pop_failure_jump if know won't have to backtrack to
527 match; otherwise change to jump. This is used to jump
528 back to the beginning of a repeat. If what follows this jump
529 clearly won't match what the repeat does, such that we can be
530 sure that there is no use backtracking out of repetitions
531 already matched, then we change it to a pop_failure_jump.
532 Followed by two-byte address. */
533 maybe_pop_jump,
535 /* Jump to following two-byte address, and push a dummy failure
536 point. This failure point will be thrown away if an attempt
537 is made to use it for a failure. A `+' construct makes this
538 before the first repeat. Also used as an intermediary kind
539 of jump when compiling an alternative. */
540 dummy_failure_jump,
542 /* Push a dummy failure point and continue. Used at the end of
543 alternatives. */
544 push_dummy_failure,
546 /* Followed by two-byte relative address and two-byte number n.
547 After matching N times, jump to the address upon failure. */
548 succeed_n,
550 /* Followed by two-byte relative address, and two-byte number n.
551 Jump to the address N times, then fail. */
552 jump_n,
554 /* Set the following two-byte relative address to the
555 subsequent two-byte number. The address *includes* the two
556 bytes of number. */
557 set_number_at,
559 wordchar, /* Matches any word-constituent character. */
560 notwordchar, /* Matches any char that is not a word-constituent. */
562 wordbeg, /* Succeeds if at word beginning. */
563 wordend, /* Succeeds if at word end. */
565 wordbound, /* Succeeds if at a word boundary. */
566 notwordbound /* Succeeds if not at a word boundary. */
568 #ifdef emacs
569 ,before_dot, /* Succeeds if before point. */
570 at_dot, /* Succeeds if at point. */
571 after_dot, /* Succeeds if after point. */
573 /* Matches any character whose syntax is specified. Followed by
574 a byte which contains a syntax code, e.g., Sword. */
575 syntaxspec,
577 /* Matches any character whose syntax is not that specified. */
578 notsyntaxspec,
580 /* Matches any character whose category-set contains the specified
581 category. The operator is followed by a byte which contains a
582 category code (mnemonic ASCII character). */
583 categoryspec,
585 /* Matches any character whose category-set does not contain the
586 specified category. The operator is followed by a byte which
587 contains the category code (mnemonic ASCII character). */
588 notcategoryspec
589 #endif /* emacs */
590 } re_opcode_t;
592 /* Common operations on the compiled pattern. */
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
596 #define STORE_NUMBER(destination, number) \
597 do { \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
600 } while (0)
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
606 #define STORE_NUMBER_AND_INCR(destination, number) \
607 do { \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
610 } while (0)
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
613 at SOURCE. */
615 #define EXTRACT_NUMBER(destination, source) \
616 do { \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
619 } while (0)
621 #ifdef DEBUG
622 static void
623 extract_number (dest, source)
624 int *dest;
625 unsigned char *source;
627 int temp = SIGN_EXTEND_CHAR (*(source + 1));
628 *dest = *source & 0377;
629 *dest += temp << 8;
632 #ifndef EXTRACT_MACROS /* To debug the macros. */
633 #undef EXTRACT_NUMBER
634 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
635 #endif /* not EXTRACT_MACROS */
637 #endif /* DEBUG */
639 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
640 SOURCE must be an lvalue. */
642 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
643 do { \
644 EXTRACT_NUMBER (destination, source); \
645 (source) += 2; \
646 } while (0)
648 #ifdef DEBUG
649 static void
650 extract_number_and_incr (destination, source)
651 int *destination;
652 unsigned char **source;
654 extract_number (destination, *source);
655 *source += 2;
658 #ifndef EXTRACT_MACROS
659 #undef EXTRACT_NUMBER_AND_INCR
660 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
661 extract_number_and_incr (&dest, &src)
662 #endif /* not EXTRACT_MACROS */
664 #endif /* DEBUG */
666 /* Store a multibyte character in three contiguous bytes starting
667 DESTINATION, and increment DESTINATION to the byte after where the
668 character is stored. Therefore, DESTINATION must be an lvalue. */
670 #define STORE_CHARACTER_AND_INCR(destination, character) \
671 do { \
672 (destination)[0] = (character) & 0377; \
673 (destination)[1] = ((character) >> 8) & 0377; \
674 (destination)[2] = (character) >> 16; \
675 (destination) += 3; \
676 } while (0)
678 /* Put into DESTINATION a character stored in three contiguous bytes
679 starting at SOURCE. */
681 #define EXTRACT_CHARACTER(destination, source) \
682 do { \
683 (destination) = ((source)[0] \
684 | ((source)[1] << 8) \
685 | ((source)[2] << 16)); \
686 } while (0)
689 /* Macros for charset. */
691 /* Size of bitmap of charset P in bytes. P is a start of charset,
692 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
693 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
695 /* Nonzero if charset P has range table. */
696 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
698 /* Return the address of range table of charset P. But not the start
699 of table itself, but the before where the number of ranges is
700 stored. `2 +' means to skip re_opcode_t and size of bitmap,
701 and the 2 bytes of flags at the start of the range table. */
702 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
704 /* Extract the bit flags that start a range table. */
705 #define CHARSET_RANGE_TABLE_BITS(p) \
706 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
707 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
709 /* Test if C is listed in the bitmap of charset P. */
710 #define CHARSET_LOOKUP_BITMAP(p, c) \
711 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
712 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
714 /* Return the address of end of RANGE_TABLE. COUNT is number of
715 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
716 is start of range and end of range. `* 3' is size of each start
717 and end. */
718 #define CHARSET_RANGE_TABLE_END(range_table, count) \
719 ((range_table) + (count) * 2 * 3)
721 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
722 COUNT is number of ranges in RANGE_TABLE. */
723 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
724 do \
726 int range_start, range_end; \
727 unsigned char *p; \
728 unsigned char *range_table_end \
729 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
731 for (p = (range_table); p < range_table_end; p += 2 * 3) \
733 EXTRACT_CHARACTER (range_start, p); \
734 EXTRACT_CHARACTER (range_end, p + 3); \
736 if (range_start <= (c) && (c) <= range_end) \
738 (not) = !(not); \
739 break; \
743 while (0)
745 /* Test if C is in range table of CHARSET. The flag NOT is negated if
746 C is listed in it. */
747 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
748 do \
750 /* Number of ranges in range table. */ \
751 int count; \
752 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
754 EXTRACT_NUMBER_AND_INCR (count, range_table); \
755 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
757 while (0)
759 /* If DEBUG is defined, Regex prints many voluminous messages about what
760 it is doing (if the variable `debug' is nonzero). If linked with the
761 main program in `iregex.c', you can enter patterns and strings
762 interactively. And if linked with the main program in `main.c' and
763 the other test files, you can run the already-written tests. */
765 #ifdef DEBUG
767 /* We use standard I/O for debugging. */
768 #include <stdio.h>
770 /* It is useful to test things that ``must'' be true when debugging. */
771 #include <assert.h>
773 static int debug = 0;
775 #define DEBUG_STATEMENT(e) e
776 #define DEBUG_PRINT1(x) if (debug) printf (x)
777 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
778 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
779 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
780 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
781 if (debug) print_partial_compiled_pattern (s, e)
782 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
783 if (debug) print_double_string (w, s1, sz1, s2, sz2)
786 /* Print the fastmap in human-readable form. */
788 void
789 print_fastmap (fastmap)
790 char *fastmap;
792 unsigned was_a_range = 0;
793 unsigned i = 0;
795 while (i < (1 << BYTEWIDTH))
797 if (fastmap[i++])
799 was_a_range = 0;
800 putchar (i - 1);
801 while (i < (1 << BYTEWIDTH) && fastmap[i])
803 was_a_range = 1;
804 i++;
806 if (was_a_range)
808 printf ("-");
809 putchar (i - 1);
813 putchar ('\n');
817 /* Print a compiled pattern string in human-readable form, starting at
818 the START pointer into it and ending just before the pointer END. */
820 void
821 print_partial_compiled_pattern (start, end)
822 unsigned char *start;
823 unsigned char *end;
825 int mcnt, mcnt2;
826 unsigned char *p = start;
827 unsigned char *pend = end;
829 if (start == NULL)
831 printf ("(null)\n");
832 return;
835 /* Loop over pattern commands. */
836 while (p < pend)
838 printf ("%d:\t", p - start);
840 switch ((re_opcode_t) *p++)
842 case no_op:
843 printf ("/no_op");
844 break;
846 case exactn:
847 mcnt = *p++;
848 printf ("/exactn/%d", mcnt);
851 putchar ('/');
852 putchar (*p++);
854 while (--mcnt);
855 break;
857 case start_memory:
858 mcnt = *p++;
859 printf ("/start_memory/%d/%d", mcnt, *p++);
860 break;
862 case stop_memory:
863 mcnt = *p++;
864 printf ("/stop_memory/%d/%d", mcnt, *p++);
865 break;
867 case duplicate:
868 printf ("/duplicate/%d", *p++);
869 break;
871 case anychar:
872 printf ("/anychar");
873 break;
875 case charset:
876 case charset_not:
878 register int c, last = -100;
879 register int in_range = 0;
880 int length = *p & 0x7f;
881 int has_range_table = *p & 0x80;
882 int range_length = p[length + 2] + p[length + 3] * 0x100;
884 printf ("/charset [%s",
885 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
887 assert (p + *p < pend);
889 for (c = 0; c < 256; c++)
890 if (c / 8 < length
891 && (p[1 + (c/8)] & (1 << (c % 8))))
893 /* Are we starting a range? */
894 if (last + 1 == c && ! in_range)
896 putchar ('-');
897 in_range = 1;
899 /* Have we broken a range? */
900 else if (last + 1 != c && in_range)
902 putchar (last);
903 in_range = 0;
906 if (! in_range)
907 putchar (c);
909 last = c;
912 p += 1 + length;
914 if (in_range)
915 putchar (last);
917 putchar (']');
919 if (has_range_table)
920 printf ("has-range-table");
922 /* ??? Should print the range table; for now,
923 just skip it. */
924 if (has_range_table)
925 p += 4 + 6 * range_length;
927 break;
929 case begline:
930 printf ("/begline");
931 break;
933 case endline:
934 printf ("/endline");
935 break;
937 case on_failure_jump:
938 extract_number_and_incr (&mcnt, &p);
939 printf ("/on_failure_jump to %d", p + mcnt - start);
940 break;
942 case on_failure_keep_string_jump:
943 extract_number_and_incr (&mcnt, &p);
944 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
945 break;
947 case dummy_failure_jump:
948 extract_number_and_incr (&mcnt, &p);
949 printf ("/dummy_failure_jump to %d", p + mcnt - start);
950 break;
952 case push_dummy_failure:
953 printf ("/push_dummy_failure");
954 break;
956 case maybe_pop_jump:
957 extract_number_and_incr (&mcnt, &p);
958 printf ("/maybe_pop_jump to %d", p + mcnt - start);
959 break;
961 case pop_failure_jump:
962 extract_number_and_incr (&mcnt, &p);
963 printf ("/pop_failure_jump to %d", p + mcnt - start);
964 break;
966 case jump_past_alt:
967 extract_number_and_incr (&mcnt, &p);
968 printf ("/jump_past_alt to %d", p + mcnt - start);
969 break;
971 case jump:
972 extract_number_and_incr (&mcnt, &p);
973 printf ("/jump to %d", p + mcnt - start);
974 break;
976 case succeed_n:
977 extract_number_and_incr (&mcnt, &p);
978 extract_number_and_incr (&mcnt2, &p);
979 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2);
980 break;
982 case jump_n:
983 extract_number_and_incr (&mcnt, &p);
984 extract_number_and_incr (&mcnt2, &p);
985 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2);
986 break;
988 case set_number_at:
989 extract_number_and_incr (&mcnt, &p);
990 extract_number_and_incr (&mcnt2, &p);
991 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2);
992 break;
994 case wordbound:
995 printf ("/wordbound");
996 break;
998 case notwordbound:
999 printf ("/notwordbound");
1000 break;
1002 case wordbeg:
1003 printf ("/wordbeg");
1004 break;
1006 case wordend:
1007 printf ("/wordend");
1009 #ifdef emacs
1010 case before_dot:
1011 printf ("/before_dot");
1012 break;
1014 case at_dot:
1015 printf ("/at_dot");
1016 break;
1018 case after_dot:
1019 printf ("/after_dot");
1020 break;
1022 case syntaxspec:
1023 printf ("/syntaxspec");
1024 mcnt = *p++;
1025 printf ("/%d", mcnt);
1026 break;
1028 case notsyntaxspec:
1029 printf ("/notsyntaxspec");
1030 mcnt = *p++;
1031 printf ("/%d", mcnt);
1032 break;
1033 #endif /* emacs */
1035 case wordchar:
1036 printf ("/wordchar");
1037 break;
1039 case notwordchar:
1040 printf ("/notwordchar");
1041 break;
1043 case begbuf:
1044 printf ("/begbuf");
1045 break;
1047 case endbuf:
1048 printf ("/endbuf");
1049 break;
1051 default:
1052 printf ("?%d", *(p-1));
1055 putchar ('\n');
1058 printf ("%d:\tend of pattern.\n", p - start);
1062 void
1063 print_compiled_pattern (bufp)
1064 struct re_pattern_buffer *bufp;
1066 unsigned char *buffer = bufp->buffer;
1068 print_partial_compiled_pattern (buffer, buffer + bufp->used);
1069 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
1071 if (bufp->fastmap_accurate && bufp->fastmap)
1073 printf ("fastmap: ");
1074 print_fastmap (bufp->fastmap);
1077 printf ("re_nsub: %d\t", bufp->re_nsub);
1078 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1079 printf ("can_be_null: %d\t", bufp->can_be_null);
1080 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1081 printf ("no_sub: %d\t", bufp->no_sub);
1082 printf ("not_bol: %d\t", bufp->not_bol);
1083 printf ("not_eol: %d\t", bufp->not_eol);
1084 printf ("syntax: %d\n", bufp->syntax);
1085 /* Perhaps we should print the translate table? */
1089 void
1090 print_double_string (where, string1, size1, string2, size2)
1091 const char *where;
1092 const char *string1;
1093 const char *string2;
1094 int size1;
1095 int size2;
1097 unsigned this_char;
1099 if (where == NULL)
1100 printf ("(null)");
1101 else
1103 if (FIRST_STRING_P (where))
1105 for (this_char = where - string1; this_char < size1; this_char++)
1106 putchar (string1[this_char]);
1108 where = string2;
1111 for (this_char = where - string2; this_char < size2; this_char++)
1112 putchar (string2[this_char]);
1116 #else /* not DEBUG */
1118 #undef assert
1119 #define assert(e)
1121 #define DEBUG_STATEMENT(e)
1122 #define DEBUG_PRINT1(x)
1123 #define DEBUG_PRINT2(x1, x2)
1124 #define DEBUG_PRINT3(x1, x2, x3)
1125 #define DEBUG_PRINT4(x1, x2, x3, x4)
1126 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1127 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1129 #endif /* not DEBUG */
1131 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1132 also be assigned to arbitrarily: each pattern buffer stores its own
1133 syntax, so it can be changed between regex compilations. */
1134 /* This has no initializer because initialized variables in Emacs
1135 become read-only after dumping. */
1136 reg_syntax_t re_syntax_options;
1139 /* Specify the precise syntax of regexps for compilation. This provides
1140 for compatibility for various utilities which historically have
1141 different, incompatible syntaxes.
1143 The argument SYNTAX is a bit mask comprised of the various bits
1144 defined in regex.h. We return the old syntax. */
1146 reg_syntax_t
1147 re_set_syntax (syntax)
1148 reg_syntax_t syntax;
1150 reg_syntax_t ret = re_syntax_options;
1152 re_syntax_options = syntax;
1153 return ret;
1156 /* This table gives an error message for each of the error codes listed
1157 in regex.h. Obviously the order here has to be same as there.
1158 POSIX doesn't require that we do anything for REG_NOERROR,
1159 but why not be nice? */
1161 static const char *re_error_msgid[] =
1163 gettext_noop ("Success"), /* REG_NOERROR */
1164 gettext_noop ("No match"), /* REG_NOMATCH */
1165 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1166 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1167 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1168 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1169 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1170 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1171 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1172 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1173 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1174 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1175 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1176 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1177 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1178 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1179 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1182 /* Avoiding alloca during matching, to placate r_alloc. */
1184 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1185 searching and matching functions should not call alloca. On some
1186 systems, alloca is implemented in terms of malloc, and if we're
1187 using the relocating allocator routines, then malloc could cause a
1188 relocation, which might (if the strings being searched are in the
1189 ralloc heap) shift the data out from underneath the regexp
1190 routines.
1192 Here's another reason to avoid allocation: Emacs
1193 processes input from X in a signal handler; processing X input may
1194 call malloc; if input arrives while a matching routine is calling
1195 malloc, then we're scrod. But Emacs can't just block input while
1196 calling matching routines; then we don't notice interrupts when
1197 they come in. So, Emacs blocks input around all regexp calls
1198 except the matching calls, which it leaves unprotected, in the
1199 faith that they will not malloc. */
1201 /* Normally, this is fine. */
1202 #define MATCH_MAY_ALLOCATE
1204 /* When using GNU C, we are not REALLY using the C alloca, no matter
1205 what config.h may say. So don't take precautions for it. */
1206 #ifdef __GNUC__
1207 #undef C_ALLOCA
1208 #endif
1210 /* The match routines may not allocate if (1) they would do it with malloc
1211 and (2) it's not safe for them to use malloc.
1212 Note that if REL_ALLOC is defined, matching would not use malloc for the
1213 failure stack, but we would still use it for the register vectors;
1214 so REL_ALLOC should not affect this. */
1215 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1216 #undef MATCH_MAY_ALLOCATE
1217 #endif
1220 /* Failure stack declarations and macros; both re_compile_fastmap and
1221 re_match_2 use a failure stack. These have to be macros because of
1222 REGEX_ALLOCATE_STACK. */
1225 /* Approximate number of failure points for which to initially allocate space
1226 when matching. If this number is exceeded, we allocate more
1227 space, so it is not a hard limit. */
1228 #ifndef INIT_FAILURE_ALLOC
1229 #define INIT_FAILURE_ALLOC 20
1230 #endif
1232 /* Roughly the maximum number of failure points on the stack. Would be
1233 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1234 This is a variable only so users of regex can assign to it; we never
1235 change it ourselves. */
1236 #if defined (MATCH_MAY_ALLOCATE)
1237 /* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1238 whose default stack limit is 2mb. In order for a larger
1239 value to work reliably, you have to try to make it accord
1240 with the process stack limit. */
1241 int re_max_failures = 40000;
1242 #else
1243 int re_max_failures = 4000;
1244 #endif
1246 union fail_stack_elt
1248 unsigned char *pointer;
1249 int integer;
1252 typedef union fail_stack_elt fail_stack_elt_t;
1254 typedef struct
1256 fail_stack_elt_t *stack;
1257 unsigned size;
1258 unsigned avail; /* Offset of next open position. */
1259 } fail_stack_type;
1261 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1262 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1263 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1266 /* Define macros to initialize and free the failure stack.
1267 Do `return -2' if the alloc fails. */
1269 #ifdef MATCH_MAY_ALLOCATE
1270 #define INIT_FAIL_STACK() \
1271 do { \
1272 fail_stack.stack = (fail_stack_elt_t *) \
1273 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1274 * sizeof (fail_stack_elt_t)); \
1276 if (fail_stack.stack == NULL) \
1277 return -2; \
1279 fail_stack.size = INIT_FAILURE_ALLOC; \
1280 fail_stack.avail = 0; \
1281 } while (0)
1283 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1284 #else
1285 #define INIT_FAIL_STACK() \
1286 do { \
1287 fail_stack.avail = 0; \
1288 } while (0)
1290 #define RESET_FAIL_STACK()
1291 #endif
1294 /* Double the size of FAIL_STACK, up to a limit
1295 which allows approximately `re_max_failures' items.
1297 Return 1 if succeeds, and 0 if either ran out of memory
1298 allocating space for it or it was already too large.
1300 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1302 /* Factor to increase the failure stack size by
1303 when we increase it.
1304 This used to be 2, but 2 was too wasteful
1305 because the old discarded stacks added up to as much space
1306 were as ultimate, maximum-size stack. */
1307 #define FAIL_STACK_GROWTH_FACTOR 4
1309 #define GROW_FAIL_STACK(fail_stack) \
1310 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1311 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1312 ? 0 \
1313 : ((fail_stack).stack \
1314 = (fail_stack_elt_t *) \
1315 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1316 (fail_stack).size * sizeof (fail_stack_elt_t), \
1317 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1318 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1319 * FAIL_STACK_GROWTH_FACTOR))), \
1321 (fail_stack).stack == NULL \
1322 ? 0 \
1323 : ((fail_stack).size \
1324 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1325 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1326 * FAIL_STACK_GROWTH_FACTOR)) \
1327 / sizeof (fail_stack_elt_t)), \
1328 1)))
1331 /* Push pointer POINTER on FAIL_STACK.
1332 Return 1 if was able to do so and 0 if ran out of memory allocating
1333 space to do so. */
1334 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1335 ((FAIL_STACK_FULL () \
1336 && !GROW_FAIL_STACK (FAIL_STACK)) \
1337 ? 0 \
1338 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1341 /* Push a pointer value onto the failure stack.
1342 Assumes the variable `fail_stack'. Probably should only
1343 be called from within `PUSH_FAILURE_POINT'. */
1344 #define PUSH_FAILURE_POINTER(item) \
1345 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1347 /* This pushes an integer-valued item onto the failure stack.
1348 Assumes the variable `fail_stack'. Probably should only
1349 be called from within `PUSH_FAILURE_POINT'. */
1350 #define PUSH_FAILURE_INT(item) \
1351 fail_stack.stack[fail_stack.avail++].integer = (item)
1353 /* Push a fail_stack_elt_t value onto the failure stack.
1354 Assumes the variable `fail_stack'. Probably should only
1355 be called from within `PUSH_FAILURE_POINT'. */
1356 #define PUSH_FAILURE_ELT(item) \
1357 fail_stack.stack[fail_stack.avail++] = (item)
1359 /* These three POP... operations complement the three PUSH... operations.
1360 All assume that `fail_stack' is nonempty. */
1361 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1362 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1363 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1365 /* Used to omit pushing failure point id's when we're not debugging. */
1366 #ifdef DEBUG
1367 #define DEBUG_PUSH PUSH_FAILURE_INT
1368 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1369 #else
1370 #define DEBUG_PUSH(item)
1371 #define DEBUG_POP(item_addr)
1372 #endif
1375 /* Push the information about the state we will need
1376 if we ever fail back to it.
1378 Requires variables fail_stack, regstart, regend, reg_info, and
1379 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1380 declared.
1382 Does `return FAILURE_CODE' if runs out of memory. */
1384 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1385 do { \
1386 char *destination; \
1387 /* Must be int, so when we don't save any registers, the arithmetic \
1388 of 0 + -1 isn't done as unsigned. */ \
1389 int this_reg; \
1391 DEBUG_STATEMENT (failure_id++); \
1392 DEBUG_STATEMENT (nfailure_points_pushed++); \
1393 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1394 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1395 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1397 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1398 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1400 /* Ensure we have enough space allocated for what we will push. */ \
1401 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1403 if (!GROW_FAIL_STACK (fail_stack)) \
1404 return failure_code; \
1406 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1407 (fail_stack).size); \
1408 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1411 /* Push the info, starting with the registers. */ \
1412 DEBUG_PRINT1 ("\n"); \
1414 if (1) \
1415 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1416 this_reg++) \
1418 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1419 DEBUG_STATEMENT (num_regs_pushed++); \
1421 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1422 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1424 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1425 PUSH_FAILURE_POINTER (regend[this_reg]); \
1427 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
1428 DEBUG_PRINT2 (" match_null=%d", \
1429 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1430 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1431 DEBUG_PRINT2 (" matched_something=%d", \
1432 MATCHED_SOMETHING (reg_info[this_reg])); \
1433 DEBUG_PRINT2 (" ever_matched=%d", \
1434 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1435 DEBUG_PRINT1 ("\n"); \
1436 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1439 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1440 PUSH_FAILURE_INT (lowest_active_reg); \
1442 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1443 PUSH_FAILURE_INT (highest_active_reg); \
1445 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
1446 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1447 PUSH_FAILURE_POINTER (pattern_place); \
1449 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
1450 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1451 size2); \
1452 DEBUG_PRINT1 ("'\n"); \
1453 PUSH_FAILURE_POINTER (string_place); \
1455 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1456 DEBUG_PUSH (failure_id); \
1457 } while (0)
1459 /* This is the number of items that are pushed and popped on the stack
1460 for each register. */
1461 #define NUM_REG_ITEMS 3
1463 /* Individual items aside from the registers. */
1464 #ifdef DEBUG
1465 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1466 #else
1467 #define NUM_NONREG_ITEMS 4
1468 #endif
1470 /* Estimate the size of data pushed by a typical failure stack entry.
1471 An estimate is all we need, because all we use this for
1472 is to choose a limit for how big to make the failure stack. */
1474 #define TYPICAL_FAILURE_SIZE 20
1476 /* This is how many items we actually use for a failure point.
1477 It depends on the regexp. */
1478 #define NUM_FAILURE_ITEMS \
1479 (((0 \
1480 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1481 * NUM_REG_ITEMS) \
1482 + NUM_NONREG_ITEMS)
1484 /* How many items can still be added to the stack without overflowing it. */
1485 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1488 /* Pops what PUSH_FAIL_STACK pushes.
1490 We restore into the parameters, all of which should be lvalues:
1491 STR -- the saved data position.
1492 PAT -- the saved pattern position.
1493 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1494 REGSTART, REGEND -- arrays of string positions.
1495 REG_INFO -- array of information about each subexpression.
1497 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1498 `pend', `string1', `size1', `string2', and `size2'. */
1500 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1502 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
1503 int this_reg; \
1504 const unsigned char *string_temp; \
1506 assert (!FAIL_STACK_EMPTY ()); \
1508 /* Remove failure points and point to how many regs pushed. */ \
1509 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1510 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1511 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1513 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1515 DEBUG_POP (&failure_id.integer); \
1516 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id.integer); \
1518 /* If the saved string location is NULL, it came from an \
1519 on_failure_keep_string_jump opcode, and we want to throw away the \
1520 saved NULL, thus retaining our current position in the string. */ \
1521 string_temp = POP_FAILURE_POINTER (); \
1522 if (string_temp != NULL) \
1523 str = (const char *) string_temp; \
1525 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
1526 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1527 DEBUG_PRINT1 ("'\n"); \
1529 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1530 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
1531 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1533 /* Restore register info. */ \
1534 high_reg = (unsigned) POP_FAILURE_INT (); \
1535 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1537 low_reg = (unsigned) POP_FAILURE_INT (); \
1538 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1540 if (1) \
1541 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1543 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1545 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1546 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
1548 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1549 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1551 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1552 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1554 else \
1556 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1558 reg_info[this_reg].word.integer = 0; \
1559 regend[this_reg] = 0; \
1560 regstart[this_reg] = 0; \
1562 highest_active_reg = high_reg; \
1565 set_regs_matched_done = 0; \
1566 DEBUG_STATEMENT (nfailure_points_popped++); \
1567 } /* POP_FAILURE_POINT */
1571 /* Structure for per-register (a.k.a. per-group) information.
1572 Other register information, such as the
1573 starting and ending positions (which are addresses), and the list of
1574 inner groups (which is a bits list) are maintained in separate
1575 variables.
1577 We are making a (strictly speaking) nonportable assumption here: that
1578 the compiler will pack our bit fields into something that fits into
1579 the type of `word', i.e., is something that fits into one item on the
1580 failure stack. */
1582 typedef union
1584 fail_stack_elt_t word;
1585 struct
1587 /* This field is one if this group can match the empty string,
1588 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1589 #define MATCH_NULL_UNSET_VALUE 3
1590 unsigned match_null_string_p : 2;
1591 unsigned is_active : 1;
1592 unsigned matched_something : 1;
1593 unsigned ever_matched_something : 1;
1594 } bits;
1595 } register_info_type;
1597 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1598 #define IS_ACTIVE(R) ((R).bits.is_active)
1599 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1600 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1603 /* Call this when have matched a real character; it sets `matched' flags
1604 for the subexpressions which we are currently inside. Also records
1605 that those subexprs have matched. */
1606 #define SET_REGS_MATCHED() \
1607 do \
1609 if (!set_regs_matched_done) \
1611 unsigned r; \
1612 set_regs_matched_done = 1; \
1613 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1615 MATCHED_SOMETHING (reg_info[r]) \
1616 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1617 = 1; \
1621 while (0)
1623 /* Registers are set to a sentinel when they haven't yet matched. */
1624 static char reg_unset_dummy;
1625 #define REG_UNSET_VALUE (&reg_unset_dummy)
1626 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1628 /* Subroutine declarations and macros for regex_compile. */
1630 static void store_op1 (), store_op2 ();
1631 static void insert_op1 (), insert_op2 ();
1632 static boolean at_begline_loc_p (), at_endline_loc_p ();
1633 static boolean group_in_compile_stack ();
1634 static reg_errcode_t compile_range ();
1636 /* Fetch the next character in the uncompiled pattern---translating it
1637 if necessary. Also cast from a signed character in the constant
1638 string passed to us by the user to an unsigned char that we can use
1639 as an array index (in, e.g., `translate'). */
1640 #ifndef PATFETCH
1641 #define PATFETCH(c) \
1642 do {if (p == pend) return REG_EEND; \
1643 c = (unsigned char) *p++; \
1644 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
1645 } while (0)
1646 #endif
1648 /* Fetch the next character in the uncompiled pattern, with no
1649 translation. */
1650 #define PATFETCH_RAW(c) \
1651 do {if (p == pend) return REG_EEND; \
1652 c = (unsigned char) *p++; \
1653 } while (0)
1655 /* Go backwards one character in the pattern. */
1656 #define PATUNFETCH p--
1659 /* If `translate' is non-null, return translate[D], else just D. We
1660 cast the subscript to translate because some data is declared as
1661 `char *', to avoid warnings when a string constant is passed. But
1662 when we use a character as a subscript we must make it unsigned. */
1663 #ifndef TRANSLATE
1664 #define TRANSLATE(d) \
1665 (RE_TRANSLATE_P (translate) \
1666 ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d))
1667 #endif
1670 /* Macros for outputting the compiled pattern into `buffer'. */
1672 /* If the buffer isn't allocated when it comes in, use this. */
1673 #define INIT_BUF_SIZE 32
1675 /* Make sure we have at least N more bytes of space in buffer. */
1676 #define GET_BUFFER_SPACE(n) \
1677 while (b - bufp->buffer + (n) > bufp->allocated) \
1678 EXTEND_BUFFER ()
1680 /* Make sure we have one more byte of buffer space and then add C to it. */
1681 #define BUF_PUSH(c) \
1682 do { \
1683 GET_BUFFER_SPACE (1); \
1684 *b++ = (unsigned char) (c); \
1685 } while (0)
1688 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1689 #define BUF_PUSH_2(c1, c2) \
1690 do { \
1691 GET_BUFFER_SPACE (2); \
1692 *b++ = (unsigned char) (c1); \
1693 *b++ = (unsigned char) (c2); \
1694 } while (0)
1697 /* As with BUF_PUSH_2, except for three bytes. */
1698 #define BUF_PUSH_3(c1, c2, c3) \
1699 do { \
1700 GET_BUFFER_SPACE (3); \
1701 *b++ = (unsigned char) (c1); \
1702 *b++ = (unsigned char) (c2); \
1703 *b++ = (unsigned char) (c3); \
1704 } while (0)
1707 /* Store a jump with opcode OP at LOC to location TO. We store a
1708 relative address offset by the three bytes the jump itself occupies. */
1709 #define STORE_JUMP(op, loc, to) \
1710 store_op1 (op, loc, (to) - (loc) - 3)
1712 /* Likewise, for a two-argument jump. */
1713 #define STORE_JUMP2(op, loc, to, arg) \
1714 store_op2 (op, loc, (to) - (loc) - 3, arg)
1716 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1717 #define INSERT_JUMP(op, loc, to) \
1718 insert_op1 (op, loc, (to) - (loc) - 3, b)
1720 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1721 #define INSERT_JUMP2(op, loc, to, arg) \
1722 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1725 /* This is not an arbitrary limit: the arguments which represent offsets
1726 into the pattern are two bytes long. So if 2^16 bytes turns out to
1727 be too small, many things would have to change. */
1728 #define MAX_BUF_SIZE (1L << 16)
1731 /* Extend the buffer by twice its current size via realloc and
1732 reset the pointers that pointed into the old block to point to the
1733 correct places in the new one. If extending the buffer results in it
1734 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1735 #define EXTEND_BUFFER() \
1736 do { \
1737 unsigned char *old_buffer = bufp->buffer; \
1738 if (bufp->allocated == MAX_BUF_SIZE) \
1739 return REG_ESIZE; \
1740 bufp->allocated <<= 1; \
1741 if (bufp->allocated > MAX_BUF_SIZE) \
1742 bufp->allocated = MAX_BUF_SIZE; \
1743 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1744 if (bufp->buffer == NULL) \
1745 return REG_ESPACE; \
1746 /* If the buffer moved, move all the pointers into it. */ \
1747 if (old_buffer != bufp->buffer) \
1749 b = (b - old_buffer) + bufp->buffer; \
1750 begalt = (begalt - old_buffer) + bufp->buffer; \
1751 if (fixup_alt_jump) \
1752 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1753 if (laststart) \
1754 laststart = (laststart - old_buffer) + bufp->buffer; \
1755 if (pending_exact) \
1756 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1758 } while (0)
1761 /* Since we have one byte reserved for the register number argument to
1762 {start,stop}_memory, the maximum number of groups we can report
1763 things about is what fits in that byte. */
1764 #define MAX_REGNUM 255
1766 /* But patterns can have more than `MAX_REGNUM' registers. We just
1767 ignore the excess. */
1768 typedef unsigned regnum_t;
1771 /* Macros for the compile stack. */
1773 /* Since offsets can go either forwards or backwards, this type needs to
1774 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1775 typedef int pattern_offset_t;
1777 typedef struct
1779 pattern_offset_t begalt_offset;
1780 pattern_offset_t fixup_alt_jump;
1781 pattern_offset_t inner_group_offset;
1782 pattern_offset_t laststart_offset;
1783 regnum_t regnum;
1784 } compile_stack_elt_t;
1787 typedef struct
1789 compile_stack_elt_t *stack;
1790 unsigned size;
1791 unsigned avail; /* Offset of next open position. */
1792 } compile_stack_type;
1795 #define INIT_COMPILE_STACK_SIZE 32
1797 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1798 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1800 /* The next available element. */
1801 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1804 /* Structure to manage work area for range table. */
1805 struct range_table_work_area
1807 int *table; /* actual work area. */
1808 int allocated; /* allocated size for work area in bytes. */
1809 int used; /* actually used size in words. */
1810 int bits; /* flag to record character classes */
1813 /* Make sure that WORK_AREA can hold more N multibyte characters. */
1814 #define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1815 do { \
1816 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1818 (work_area).allocated += 16 * sizeof (int); \
1819 if ((work_area).table) \
1820 (work_area).table \
1821 = (int *) realloc ((work_area).table, (work_area).allocated); \
1822 else \
1823 (work_area).table \
1824 = (int *) malloc ((work_area).allocated); \
1825 if ((work_area).table == 0) \
1826 FREE_STACK_RETURN (REG_ESPACE); \
1828 } while (0)
1830 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1831 (work_area).bits |= (bit)
1833 /* These bits represent the various character classes such as [:alnum:]
1834 in a charset's range table. */
1835 #define BIT_ALNUM 0x1
1836 #define BIT_ALPHA 0x2
1837 #define BIT_WORD 0x4
1838 #define BIT_ASCII 0x8
1839 #define BIT_NONASCII 0x10
1840 #define BIT_GRAPH 0x20
1841 #define BIT_LOWER 0x40
1842 #define BIT_PRINT 0x80
1843 #define BIT_PUNCT 0x100
1844 #define BIT_SPACE 0x200
1845 #define BIT_UPPER 0x400
1846 #define BIT_UNIBYTE 0x800
1847 #define BIT_MULTIBYTE 0x1000
1849 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1850 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1851 do { \
1852 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1853 (work_area).table[(work_area).used++] = (range_start); \
1854 (work_area).table[(work_area).used++] = (range_end); \
1855 } while (0)
1857 /* Free allocated memory for WORK_AREA. */
1858 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1859 do { \
1860 if ((work_area).table) \
1861 free ((work_area).table); \
1862 } while (0)
1864 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1865 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1866 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1867 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1870 /* Set the bit for character C in a list. */
1871 #define SET_LIST_BIT(c) \
1872 (b[((unsigned char) (c)) / BYTEWIDTH] \
1873 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1876 /* Get the next unsigned number in the uncompiled pattern. */
1877 #define GET_UNSIGNED_NUMBER(num) \
1878 { if (p != pend) \
1880 PATFETCH (c); \
1881 while (ISDIGIT (c)) \
1883 if (num < 0) \
1884 num = 0; \
1885 num = num * 10 + c - '0'; \
1886 if (p == pend) \
1887 break; \
1888 PATFETCH (c); \
1893 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1895 #define IS_CHAR_CLASS(string) \
1896 (STREQ (string, "alpha") || STREQ (string, "upper") \
1897 || STREQ (string, "lower") || STREQ (string, "digit") \
1898 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1899 || STREQ (string, "space") || STREQ (string, "print") \
1900 || STREQ (string, "punct") || STREQ (string, "graph") \
1901 || STREQ (string, "cntrl") || STREQ (string, "blank") \
1902 || STREQ (string, "word") \
1903 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1904 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
1906 #ifndef MATCH_MAY_ALLOCATE
1908 /* If we cannot allocate large objects within re_match_2_internal,
1909 we make the fail stack and register vectors global.
1910 The fail stack, we grow to the maximum size when a regexp
1911 is compiled.
1912 The register vectors, we adjust in size each time we
1913 compile a regexp, according to the number of registers it needs. */
1915 static fail_stack_type fail_stack;
1917 /* Size with which the following vectors are currently allocated.
1918 That is so we can make them bigger as needed,
1919 but never make them smaller. */
1920 static int regs_allocated_size;
1922 static const char ** regstart, ** regend;
1923 static const char ** old_regstart, ** old_regend;
1924 static const char **best_regstart, **best_regend;
1925 static register_info_type *reg_info;
1926 static const char **reg_dummy;
1927 static register_info_type *reg_info_dummy;
1929 /* Make the register vectors big enough for NUM_REGS registers,
1930 but don't make them smaller. */
1932 static
1933 regex_grow_registers (num_regs)
1934 int num_regs;
1936 if (num_regs > regs_allocated_size)
1938 RETALLOC_IF (regstart, num_regs, const char *);
1939 RETALLOC_IF (regend, num_regs, const char *);
1940 RETALLOC_IF (old_regstart, num_regs, const char *);
1941 RETALLOC_IF (old_regend, num_regs, const char *);
1942 RETALLOC_IF (best_regstart, num_regs, const char *);
1943 RETALLOC_IF (best_regend, num_regs, const char *);
1944 RETALLOC_IF (reg_info, num_regs, register_info_type);
1945 RETALLOC_IF (reg_dummy, num_regs, const char *);
1946 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1948 regs_allocated_size = num_regs;
1952 #endif /* not MATCH_MAY_ALLOCATE */
1954 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1955 Returns one of error codes defined in `regex.h', or zero for success.
1957 Assumes the `allocated' (and perhaps `buffer') and `translate'
1958 fields are set in BUFP on entry.
1960 If it succeeds, results are put in BUFP (if it returns an error, the
1961 contents of BUFP are undefined):
1962 `buffer' is the compiled pattern;
1963 `syntax' is set to SYNTAX;
1964 `used' is set to the length of the compiled pattern;
1965 `fastmap_accurate' is zero;
1966 `re_nsub' is the number of subexpressions in PATTERN;
1967 `not_bol' and `not_eol' are zero;
1969 The `fastmap' and `newline_anchor' fields are neither
1970 examined nor set. */
1972 /* Return, freeing storage we allocated. */
1973 #define FREE_STACK_RETURN(value) \
1974 do { \
1975 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1976 free (compile_stack.stack); \
1977 return value; \
1978 } while (0)
1980 static reg_errcode_t
1981 regex_compile (pattern, size, syntax, bufp)
1982 const char *pattern;
1983 int size;
1984 reg_syntax_t syntax;
1985 struct re_pattern_buffer *bufp;
1987 /* We fetch characters from PATTERN here. Even though PATTERN is
1988 `char *' (i.e., signed), we declare these variables as unsigned, so
1989 they can be reliably used as array indices. */
1990 register unsigned int c, c1;
1992 /* A random temporary spot in PATTERN. */
1993 const char *p1;
1995 /* Points to the end of the buffer, where we should append. */
1996 register unsigned char *b;
1998 /* Keeps track of unclosed groups. */
1999 compile_stack_type compile_stack;
2001 /* Points to the current (ending) position in the pattern. */
2002 #ifdef AIX
2003 /* `const' makes AIX compiler fail. */
2004 char *p = pattern;
2005 #else
2006 const char *p = pattern;
2007 #endif
2008 const char *pend = pattern + size;
2010 /* How to translate the characters in the pattern. */
2011 RE_TRANSLATE_TYPE translate = bufp->translate;
2013 /* Address of the count-byte of the most recently inserted `exactn'
2014 command. This makes it possible to tell if a new exact-match
2015 character can be added to that command or if the character requires
2016 a new `exactn' command. */
2017 unsigned char *pending_exact = 0;
2019 /* Address of start of the most recently finished expression.
2020 This tells, e.g., postfix * where to find the start of its
2021 operand. Reset at the beginning of groups and alternatives. */
2022 unsigned char *laststart = 0;
2024 /* Address of beginning of regexp, or inside of last group. */
2025 unsigned char *begalt;
2027 /* Place in the uncompiled pattern (i.e., the {) to
2028 which to go back if the interval is invalid. */
2029 const char *beg_interval;
2031 /* Address of the place where a forward jump should go to the end of
2032 the containing expression. Each alternative of an `or' -- except the
2033 last -- ends with a forward jump of this sort. */
2034 unsigned char *fixup_alt_jump = 0;
2036 /* Counts open-groups as they are encountered. Remembered for the
2037 matching close-group on the compile stack, so the same register
2038 number is put in the stop_memory as the start_memory. */
2039 regnum_t regnum = 0;
2041 /* Work area for range table of charset. */
2042 struct range_table_work_area range_table_work;
2044 #ifdef DEBUG
2045 DEBUG_PRINT1 ("\nCompiling pattern: ");
2046 if (debug)
2048 unsigned debug_count;
2050 for (debug_count = 0; debug_count < size; debug_count++)
2051 putchar (pattern[debug_count]);
2052 putchar ('\n');
2054 #endif /* DEBUG */
2056 /* Initialize the compile stack. */
2057 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2058 if (compile_stack.stack == NULL)
2059 return REG_ESPACE;
2061 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2062 compile_stack.avail = 0;
2064 range_table_work.table = 0;
2065 range_table_work.allocated = 0;
2067 /* Initialize the pattern buffer. */
2068 bufp->syntax = syntax;
2069 bufp->fastmap_accurate = 0;
2070 bufp->not_bol = bufp->not_eol = 0;
2072 /* Set `used' to zero, so that if we return an error, the pattern
2073 printer (for debugging) will think there's no pattern. We reset it
2074 at the end. */
2075 bufp->used = 0;
2077 /* Always count groups, whether or not bufp->no_sub is set. */
2078 bufp->re_nsub = 0;
2080 #ifdef emacs
2081 /* bufp->multibyte is set before regex_compile is called, so don't alter
2082 it. */
2083 #else /* not emacs */
2084 /* Nothing is recognized as a multibyte character. */
2085 bufp->multibyte = 0;
2086 #endif
2088 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2089 /* Initialize the syntax table. */
2090 init_syntax_once ();
2091 #endif
2093 if (bufp->allocated == 0)
2095 if (bufp->buffer)
2096 { /* If zero allocated, but buffer is non-null, try to realloc
2097 enough space. This loses if buffer's address is bogus, but
2098 that is the user's responsibility. */
2099 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2101 else
2102 { /* Caller did not allocate a buffer. Do it for them. */
2103 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2105 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2107 bufp->allocated = INIT_BUF_SIZE;
2110 begalt = b = bufp->buffer;
2112 /* Loop through the uncompiled pattern until we're at the end. */
2113 while (p != pend)
2115 PATFETCH (c);
2117 switch (c)
2119 case '^':
2121 if ( /* If at start of pattern, it's an operator. */
2122 p == pattern + 1
2123 /* If context independent, it's an operator. */
2124 || syntax & RE_CONTEXT_INDEP_ANCHORS
2125 /* Otherwise, depends on what's come before. */
2126 || at_begline_loc_p (pattern, p, syntax))
2127 BUF_PUSH (begline);
2128 else
2129 goto normal_char;
2131 break;
2134 case '$':
2136 if ( /* If at end of pattern, it's an operator. */
2137 p == pend
2138 /* If context independent, it's an operator. */
2139 || syntax & RE_CONTEXT_INDEP_ANCHORS
2140 /* Otherwise, depends on what's next. */
2141 || at_endline_loc_p (p, pend, syntax))
2142 BUF_PUSH (endline);
2143 else
2144 goto normal_char;
2146 break;
2149 case '+':
2150 case '?':
2151 if ((syntax & RE_BK_PLUS_QM)
2152 || (syntax & RE_LIMITED_OPS))
2153 goto normal_char;
2154 handle_plus:
2155 case '*':
2156 /* If there is no previous pattern... */
2157 if (!laststart)
2159 if (syntax & RE_CONTEXT_INVALID_OPS)
2160 FREE_STACK_RETURN (REG_BADRPT);
2161 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2162 goto normal_char;
2166 /* Are we optimizing this jump? */
2167 boolean keep_string_p = false;
2169 /* 1 means zero (many) matches is allowed. */
2170 char zero_times_ok = 0, many_times_ok = 0;
2172 /* If there is a sequence of repetition chars, collapse it
2173 down to just one (the right one). We can't combine
2174 interval operators with these because of, e.g., `a{2}*',
2175 which should only match an even number of `a's. */
2177 for (;;)
2179 zero_times_ok |= c != '+';
2180 many_times_ok |= c != '?';
2182 if (p == pend)
2183 break;
2185 PATFETCH (c);
2187 if (c == '*'
2188 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2191 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2193 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2195 PATFETCH (c1);
2196 if (!(c1 == '+' || c1 == '?'))
2198 PATUNFETCH;
2199 PATUNFETCH;
2200 break;
2203 c = c1;
2205 else
2207 PATUNFETCH;
2208 break;
2211 /* If we get here, we found another repeat character. */
2214 /* Star, etc. applied to an empty pattern is equivalent
2215 to an empty pattern. */
2216 if (!laststart)
2217 break;
2219 /* Now we know whether or not zero matches is allowed
2220 and also whether or not two or more matches is allowed. */
2221 if (many_times_ok)
2222 { /* More than one repetition is allowed, so put in at the
2223 end a backward relative jump from `b' to before the next
2224 jump we're going to put in below (which jumps from
2225 laststart to after this jump).
2227 But if we are at the `*' in the exact sequence `.*\n',
2228 insert an unconditional jump backwards to the .,
2229 instead of the beginning of the loop. This way we only
2230 push a failure point once, instead of every time
2231 through the loop. */
2232 assert (p - 1 > pattern);
2234 /* Allocate the space for the jump. */
2235 GET_BUFFER_SPACE (3);
2237 /* We know we are not at the first character of the pattern,
2238 because laststart was nonzero. And we've already
2239 incremented `p', by the way, to be the character after
2240 the `*'. Do we have to do something analogous here
2241 for null bytes, because of RE_DOT_NOT_NULL? */
2242 if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')
2243 && zero_times_ok
2244 && p < pend
2245 && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')
2246 && !(syntax & RE_DOT_NEWLINE))
2247 { /* We have .*\n. */
2248 STORE_JUMP (jump, b, laststart);
2249 keep_string_p = true;
2251 else
2252 /* Anything else. */
2253 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2255 /* We've added more stuff to the buffer. */
2256 b += 3;
2259 /* On failure, jump from laststart to b + 3, which will be the
2260 end of the buffer after this jump is inserted. */
2261 GET_BUFFER_SPACE (3);
2262 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2263 : on_failure_jump,
2264 laststart, b + 3);
2265 pending_exact = 0;
2266 b += 3;
2268 if (!zero_times_ok)
2270 /* At least one repetition is required, so insert a
2271 `dummy_failure_jump' before the initial
2272 `on_failure_jump' instruction of the loop. This
2273 effects a skip over that instruction the first time
2274 we hit that loop. */
2275 GET_BUFFER_SPACE (3);
2276 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2277 b += 3;
2280 break;
2283 case '.':
2284 laststart = b;
2285 BUF_PUSH (anychar);
2286 break;
2289 case '[':
2291 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2293 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2295 /* Ensure that we have enough space to push a charset: the
2296 opcode, the length count, and the bitset; 34 bytes in all. */
2297 GET_BUFFER_SPACE (34);
2299 laststart = b;
2301 /* We test `*p == '^' twice, instead of using an if
2302 statement, so we only need one BUF_PUSH. */
2303 BUF_PUSH (*p == '^' ? charset_not : charset);
2304 if (*p == '^')
2305 p++;
2307 /* Remember the first position in the bracket expression. */
2308 p1 = p;
2310 /* Push the number of bytes in the bitmap. */
2311 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2313 /* Clear the whole map. */
2314 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2316 /* charset_not matches newline according to a syntax bit. */
2317 if ((re_opcode_t) b[-2] == charset_not
2318 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2319 SET_LIST_BIT ('\n');
2321 /* Read in characters and ranges, setting map bits. */
2322 for (;;)
2324 int len;
2325 boolean escaped_char = false;
2327 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2329 PATFETCH (c);
2331 /* \ might escape characters inside [...] and [^...]. */
2332 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2334 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2336 PATFETCH (c);
2337 escaped_char = true;
2339 else
2341 /* Could be the end of the bracket expression. If it's
2342 not (i.e., when the bracket expression is `[]' so
2343 far), the ']' character bit gets set way below. */
2344 if (c == ']' && p != p1 + 1)
2345 break;
2348 /* If C indicates start of multibyte char, get the
2349 actual character code in C, and set the pattern
2350 pointer P to the next character boundary. */
2351 if (bufp->multibyte && BASE_LEADING_CODE_P (c))
2353 PATUNFETCH;
2354 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2355 p += len;
2357 /* What should we do for the character which is
2358 greater than 0x7F, but not BASE_LEADING_CODE_P?
2359 XXX */
2361 /* See if we're at the beginning of a possible character
2362 class. */
2364 else if (!escaped_char &&
2365 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2367 /* Leave room for the null. */
2368 char str[CHAR_CLASS_MAX_LENGTH + 1];
2370 PATFETCH (c);
2371 c1 = 0;
2373 /* If pattern is `[[:'. */
2374 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2376 for (;;)
2378 PATFETCH (c);
2379 if (c == ':' || c == ']' || p == pend
2380 || c1 == CHAR_CLASS_MAX_LENGTH)
2381 break;
2382 str[c1++] = c;
2384 str[c1] = '\0';
2386 /* If isn't a word bracketed by `[:' and `:]':
2387 undo the ending character, the letters, and
2388 leave the leading `:' and `[' (but set bits for
2389 them). */
2390 if (c == ':' && *p == ']')
2392 int ch;
2393 boolean is_alnum = STREQ (str, "alnum");
2394 boolean is_alpha = STREQ (str, "alpha");
2395 boolean is_ascii = STREQ (str, "ascii");
2396 boolean is_blank = STREQ (str, "blank");
2397 boolean is_cntrl = STREQ (str, "cntrl");
2398 boolean is_digit = STREQ (str, "digit");
2399 boolean is_graph = STREQ (str, "graph");
2400 boolean is_lower = STREQ (str, "lower");
2401 boolean is_multibyte = STREQ (str, "multibyte");
2402 boolean is_nonascii = STREQ (str, "nonascii");
2403 boolean is_print = STREQ (str, "print");
2404 boolean is_punct = STREQ (str, "punct");
2405 boolean is_space = STREQ (str, "space");
2406 boolean is_unibyte = STREQ (str, "unibyte");
2407 boolean is_upper = STREQ (str, "upper");
2408 boolean is_word = STREQ (str, "word");
2409 boolean is_xdigit = STREQ (str, "xdigit");
2411 if (!IS_CHAR_CLASS (str))
2412 FREE_STACK_RETURN (REG_ECTYPE);
2414 /* Throw away the ] at the end of the character
2415 class. */
2416 PATFETCH (c);
2418 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2420 /* Most character classes in a multibyte match
2421 just set a flag. Exceptions are is_blank,
2422 is_digit, is_cntrl, and is_xdigit, since
2423 they can only match ASCII characters. We
2424 don't need to handle them for multibyte. */
2426 if (bufp->multibyte)
2428 int bit = 0;
2430 if (is_alnum) bit = BIT_ALNUM;
2431 if (is_alpha) bit = BIT_ALPHA;
2432 if (is_ascii) bit = BIT_ASCII;
2433 if (is_graph) bit = BIT_GRAPH;
2434 if (is_lower) bit = BIT_LOWER;
2435 if (is_multibyte) bit = BIT_MULTIBYTE;
2436 if (is_nonascii) bit = BIT_NONASCII;
2437 if (is_print) bit = BIT_PRINT;
2438 if (is_punct) bit = BIT_PUNCT;
2439 if (is_space) bit = BIT_SPACE;
2440 if (is_unibyte) bit = BIT_UNIBYTE;
2441 if (is_upper) bit = BIT_UPPER;
2442 if (is_word) bit = BIT_WORD;
2443 if (bit)
2444 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2445 bit);
2448 /* Handle character classes for ASCII characters. */
2449 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2451 int translated = TRANSLATE (ch);
2452 /* This was split into 3 if's to
2453 avoid an arbitrary limit in some compiler. */
2454 if ( (is_alnum && ISALNUM (ch))
2455 || (is_alpha && ISALPHA (ch))
2456 || (is_blank && ISBLANK (ch))
2457 || (is_cntrl && ISCNTRL (ch)))
2458 SET_LIST_BIT (translated);
2459 if ( (is_digit && ISDIGIT (ch))
2460 || (is_graph && ISGRAPH (ch))
2461 || (is_lower && ISLOWER (ch))
2462 || (is_print && ISPRINT (ch)))
2463 SET_LIST_BIT (translated);
2464 if ( (is_punct && ISPUNCT (ch))
2465 || (is_space && ISSPACE (ch))
2466 || (is_upper && ISUPPER (ch))
2467 || (is_xdigit && ISXDIGIT (ch)))
2468 SET_LIST_BIT (translated);
2469 if ( (is_ascii && IS_REAL_ASCII (ch))
2470 || (is_nonascii && !IS_REAL_ASCII (ch))
2471 || (is_unibyte && ISUNIBYTE (ch))
2472 || (is_multibyte && !ISUNIBYTE (ch)))
2473 SET_LIST_BIT (translated);
2475 if ( (is_word && ISWORD (ch)))
2476 SET_LIST_BIT (translated);
2479 /* Repeat the loop. */
2480 continue;
2482 else
2484 c1++;
2485 while (c1--)
2486 PATUNFETCH;
2487 SET_LIST_BIT ('[');
2489 /* Because the `:' may starts the range, we
2490 can't simply set bit and repeat the loop.
2491 Instead, just set it to C and handle below. */
2492 c = ':';
2496 if (p < pend && p[0] == '-' && p[1] != ']')
2499 /* Discard the `-'. */
2500 PATFETCH (c1);
2502 /* Fetch the character which ends the range. */
2503 PATFETCH (c1);
2504 if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
2506 PATUNFETCH;
2507 c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2508 p += len;
2511 if (SINGLE_BYTE_CHAR_P (c)
2512 && ! SINGLE_BYTE_CHAR_P (c1))
2514 /* Handle a range such as \177-\377 in multibyte mode.
2515 Split that into two ranges,,
2516 the low one ending at 0237, and the high one
2517 starting at ...040. */
2518 int c1_base = (c1 & ~0177) | 040;
2519 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2520 c1 = 0237;
2522 else if (!SAME_CHARSET_P (c, c1))
2523 FREE_STACK_RETURN (REG_ERANGE);
2525 else
2526 /* Range from C to C. */
2527 c1 = c;
2529 /* Set the range ... */
2530 if (SINGLE_BYTE_CHAR_P (c))
2531 /* ... into bitmap. */
2533 unsigned this_char;
2534 int range_start = c, range_end = c1;
2536 /* If the start is after the end, the range is empty. */
2537 if (range_start > range_end)
2539 if (syntax & RE_NO_EMPTY_RANGES)
2540 FREE_STACK_RETURN (REG_ERANGE);
2541 /* Else, repeat the loop. */
2543 else
2545 for (this_char = range_start; this_char <= range_end;
2546 this_char++)
2547 SET_LIST_BIT (TRANSLATE (this_char));
2550 else
2551 /* ... into range table. */
2552 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2555 /* Discard any (non)matching list bytes that are all 0 at the
2556 end of the map. Decrease the map-length byte too. */
2557 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2558 b[-1]--;
2559 b += b[-1];
2561 /* Build real range table from work area. */
2562 if (RANGE_TABLE_WORK_USED (range_table_work)
2563 || RANGE_TABLE_WORK_BITS (range_table_work))
2565 int i;
2566 int used = RANGE_TABLE_WORK_USED (range_table_work);
2568 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2569 bytes for flags, two for COUNT, and three bytes for
2570 each character. */
2571 GET_BUFFER_SPACE (4 + used * 3);
2573 /* Indicate the existence of range table. */
2574 laststart[1] |= 0x80;
2576 /* Store the character class flag bits into the range table.
2577 If not in emacs, these flag bits are always 0. */
2578 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2579 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2581 STORE_NUMBER_AND_INCR (b, used / 2);
2582 for (i = 0; i < used; i++)
2583 STORE_CHARACTER_AND_INCR
2584 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2587 break;
2590 case '(':
2591 if (syntax & RE_NO_BK_PARENS)
2592 goto handle_open;
2593 else
2594 goto normal_char;
2597 case ')':
2598 if (syntax & RE_NO_BK_PARENS)
2599 goto handle_close;
2600 else
2601 goto normal_char;
2604 case '\n':
2605 if (syntax & RE_NEWLINE_ALT)
2606 goto handle_alt;
2607 else
2608 goto normal_char;
2611 case '|':
2612 if (syntax & RE_NO_BK_VBAR)
2613 goto handle_alt;
2614 else
2615 goto normal_char;
2618 case '{':
2619 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2620 goto handle_interval;
2621 else
2622 goto normal_char;
2625 case '\\':
2626 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2628 /* Do not translate the character after the \, so that we can
2629 distinguish, e.g., \B from \b, even if we normally would
2630 translate, e.g., B to b. */
2631 PATFETCH_RAW (c);
2633 switch (c)
2635 case '(':
2636 if (syntax & RE_NO_BK_PARENS)
2637 goto normal_backslash;
2639 handle_open:
2640 bufp->re_nsub++;
2641 regnum++;
2643 if (COMPILE_STACK_FULL)
2645 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2646 compile_stack_elt_t);
2647 if (compile_stack.stack == NULL) return REG_ESPACE;
2649 compile_stack.size <<= 1;
2652 /* These are the values to restore when we hit end of this
2653 group. They are all relative offsets, so that if the
2654 whole pattern moves because of realloc, they will still
2655 be valid. */
2656 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2657 COMPILE_STACK_TOP.fixup_alt_jump
2658 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2659 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2660 COMPILE_STACK_TOP.regnum = regnum;
2662 /* We will eventually replace the 0 with the number of
2663 groups inner to this one. But do not push a
2664 start_memory for groups beyond the last one we can
2665 represent in the compiled pattern. */
2666 if (regnum <= MAX_REGNUM)
2668 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2669 BUF_PUSH_3 (start_memory, regnum, 0);
2672 compile_stack.avail++;
2674 fixup_alt_jump = 0;
2675 laststart = 0;
2676 begalt = b;
2677 /* If we've reached MAX_REGNUM groups, then this open
2678 won't actually generate any code, so we'll have to
2679 clear pending_exact explicitly. */
2680 pending_exact = 0;
2681 break;
2684 case ')':
2685 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2687 if (COMPILE_STACK_EMPTY)
2688 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2689 goto normal_backslash;
2690 else
2691 FREE_STACK_RETURN (REG_ERPAREN);
2693 handle_close:
2694 if (fixup_alt_jump)
2695 { /* Push a dummy failure point at the end of the
2696 alternative for a possible future
2697 `pop_failure_jump' to pop. See comments at
2698 `push_dummy_failure' in `re_match_2'. */
2699 BUF_PUSH (push_dummy_failure);
2701 /* We allocated space for this jump when we assigned
2702 to `fixup_alt_jump', in the `handle_alt' case below. */
2703 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2706 /* See similar code for backslashed left paren above. */
2707 if (COMPILE_STACK_EMPTY)
2708 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2709 goto normal_char;
2710 else
2711 FREE_STACK_RETURN (REG_ERPAREN);
2713 /* Since we just checked for an empty stack above, this
2714 ``can't happen''. */
2715 assert (compile_stack.avail != 0);
2717 /* We don't just want to restore into `regnum', because
2718 later groups should continue to be numbered higher,
2719 as in `(ab)c(de)' -- the second group is #2. */
2720 regnum_t this_group_regnum;
2722 compile_stack.avail--;
2723 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2724 fixup_alt_jump
2725 = COMPILE_STACK_TOP.fixup_alt_jump
2726 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2727 : 0;
2728 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2729 this_group_regnum = COMPILE_STACK_TOP.regnum;
2730 /* If we've reached MAX_REGNUM groups, then this open
2731 won't actually generate any code, so we'll have to
2732 clear pending_exact explicitly. */
2733 pending_exact = 0;
2735 /* We're at the end of the group, so now we know how many
2736 groups were inside this one. */
2737 if (this_group_regnum <= MAX_REGNUM)
2739 unsigned char *inner_group_loc
2740 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2742 *inner_group_loc = regnum - this_group_regnum;
2743 BUF_PUSH_3 (stop_memory, this_group_regnum,
2744 regnum - this_group_regnum);
2747 break;
2750 case '|': /* `\|'. */
2751 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2752 goto normal_backslash;
2753 handle_alt:
2754 if (syntax & RE_LIMITED_OPS)
2755 goto normal_char;
2757 /* Insert before the previous alternative a jump which
2758 jumps to this alternative if the former fails. */
2759 GET_BUFFER_SPACE (3);
2760 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2761 pending_exact = 0;
2762 b += 3;
2764 /* The alternative before this one has a jump after it
2765 which gets executed if it gets matched. Adjust that
2766 jump so it will jump to this alternative's analogous
2767 jump (put in below, which in turn will jump to the next
2768 (if any) alternative's such jump, etc.). The last such
2769 jump jumps to the correct final destination. A picture:
2770 _____ _____
2771 | | | |
2772 | v | v
2773 a | b | c
2775 If we are at `b', then fixup_alt_jump right now points to a
2776 three-byte space after `a'. We'll put in the jump, set
2777 fixup_alt_jump to right after `b', and leave behind three
2778 bytes which we'll fill in when we get to after `c'. */
2780 if (fixup_alt_jump)
2781 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2783 /* Mark and leave space for a jump after this alternative,
2784 to be filled in later either by next alternative or
2785 when know we're at the end of a series of alternatives. */
2786 fixup_alt_jump = b;
2787 GET_BUFFER_SPACE (3);
2788 b += 3;
2790 laststart = 0;
2791 begalt = b;
2792 break;
2795 case '{':
2796 /* If \{ is a literal. */
2797 if (!(syntax & RE_INTERVALS)
2798 /* If we're at `\{' and it's not the open-interval
2799 operator. */
2800 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2801 || (p - 2 == pattern && p == pend))
2802 goto normal_backslash;
2804 handle_interval:
2806 /* If got here, then the syntax allows intervals. */
2808 /* At least (most) this many matches must be made. */
2809 int lower_bound = -1, upper_bound = -1;
2811 beg_interval = p - 1;
2813 if (p == pend)
2815 if (syntax & RE_NO_BK_BRACES)
2816 goto unfetch_interval;
2817 else
2818 FREE_STACK_RETURN (REG_EBRACE);
2821 GET_UNSIGNED_NUMBER (lower_bound);
2823 if (c == ',')
2825 GET_UNSIGNED_NUMBER (upper_bound);
2826 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2828 else
2829 /* Interval such as `{1}' => match exactly once. */
2830 upper_bound = lower_bound;
2832 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2833 || lower_bound > upper_bound)
2835 if (syntax & RE_NO_BK_BRACES)
2836 goto unfetch_interval;
2837 else
2838 FREE_STACK_RETURN (REG_BADBR);
2841 if (!(syntax & RE_NO_BK_BRACES))
2843 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2845 PATFETCH (c);
2848 if (c != '}')
2850 if (syntax & RE_NO_BK_BRACES)
2851 goto unfetch_interval;
2852 else
2853 FREE_STACK_RETURN (REG_BADBR);
2856 /* We just parsed a valid interval. */
2858 /* If it's invalid to have no preceding re. */
2859 if (!laststart)
2861 if (syntax & RE_CONTEXT_INVALID_OPS)
2862 FREE_STACK_RETURN (REG_BADRPT);
2863 else if (syntax & RE_CONTEXT_INDEP_OPS)
2864 laststart = b;
2865 else
2866 goto unfetch_interval;
2869 /* If the upper bound is zero, don't want to succeed at
2870 all; jump from `laststart' to `b + 3', which will be
2871 the end of the buffer after we insert the jump. */
2872 if (upper_bound == 0)
2874 GET_BUFFER_SPACE (3);
2875 INSERT_JUMP (jump, laststart, b + 3);
2876 b += 3;
2879 /* Otherwise, we have a nontrivial interval. When
2880 we're all done, the pattern will look like:
2881 set_number_at <jump count> <upper bound>
2882 set_number_at <succeed_n count> <lower bound>
2883 succeed_n <after jump addr> <succeed_n count>
2884 <body of loop>
2885 jump_n <succeed_n addr> <jump count>
2886 (The upper bound and `jump_n' are omitted if
2887 `upper_bound' is 1, though.) */
2888 else
2889 { /* If the upper bound is > 1, we need to insert
2890 more at the end of the loop. */
2891 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2893 GET_BUFFER_SPACE (nbytes);
2895 /* Initialize lower bound of the `succeed_n', even
2896 though it will be set during matching by its
2897 attendant `set_number_at' (inserted next),
2898 because `re_compile_fastmap' needs to know.
2899 Jump to the `jump_n' we might insert below. */
2900 INSERT_JUMP2 (succeed_n, laststart,
2901 b + 5 + (upper_bound > 1) * 5,
2902 lower_bound);
2903 b += 5;
2905 /* Code to initialize the lower bound. Insert
2906 before the `succeed_n'. The `5' is the last two
2907 bytes of this `set_number_at', plus 3 bytes of
2908 the following `succeed_n'. */
2909 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2910 b += 5;
2912 if (upper_bound > 1)
2913 { /* More than one repetition is allowed, so
2914 append a backward jump to the `succeed_n'
2915 that starts this interval.
2917 When we've reached this during matching,
2918 we'll have matched the interval once, so
2919 jump back only `upper_bound - 1' times. */
2920 STORE_JUMP2 (jump_n, b, laststart + 5,
2921 upper_bound - 1);
2922 b += 5;
2924 /* The location we want to set is the second
2925 parameter of the `jump_n'; that is `b-2' as
2926 an absolute address. `laststart' will be
2927 the `set_number_at' we're about to insert;
2928 `laststart+3' the number to set, the source
2929 for the relative address. But we are
2930 inserting into the middle of the pattern --
2931 so everything is getting moved up by 5.
2932 Conclusion: (b - 2) - (laststart + 3) + 5,
2933 i.e., b - laststart.
2935 We insert this at the beginning of the loop
2936 so that if we fail during matching, we'll
2937 reinitialize the bounds. */
2938 insert_op2 (set_number_at, laststart, b - laststart,
2939 upper_bound - 1, b);
2940 b += 5;
2943 pending_exact = 0;
2944 beg_interval = NULL;
2946 break;
2948 unfetch_interval:
2949 /* If an invalid interval, match the characters as literals. */
2950 assert (beg_interval);
2951 p = beg_interval;
2952 beg_interval = NULL;
2954 /* normal_char and normal_backslash need `c'. */
2955 PATFETCH (c);
2957 if (!(syntax & RE_NO_BK_BRACES))
2959 if (p > pattern && p[-1] == '\\')
2960 goto normal_backslash;
2962 goto normal_char;
2964 #ifdef emacs
2965 /* There is no way to specify the before_dot and after_dot
2966 operators. rms says this is ok. --karl */
2967 case '=':
2968 BUF_PUSH (at_dot);
2969 break;
2971 case 's':
2972 laststart = b;
2973 PATFETCH (c);
2974 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2975 break;
2977 case 'S':
2978 laststart = b;
2979 PATFETCH (c);
2980 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2981 break;
2983 case 'c':
2984 laststart = b;
2985 PATFETCH_RAW (c);
2986 BUF_PUSH_2 (categoryspec, c);
2987 break;
2989 case 'C':
2990 laststart = b;
2991 PATFETCH_RAW (c);
2992 BUF_PUSH_2 (notcategoryspec, c);
2993 break;
2994 #endif /* emacs */
2997 case 'w':
2998 laststart = b;
2999 BUF_PUSH (wordchar);
3000 break;
3003 case 'W':
3004 laststart = b;
3005 BUF_PUSH (notwordchar);
3006 break;
3009 case '<':
3010 BUF_PUSH (wordbeg);
3011 break;
3013 case '>':
3014 BUF_PUSH (wordend);
3015 break;
3017 case 'b':
3018 BUF_PUSH (wordbound);
3019 break;
3021 case 'B':
3022 BUF_PUSH (notwordbound);
3023 break;
3025 case '`':
3026 BUF_PUSH (begbuf);
3027 break;
3029 case '\'':
3030 BUF_PUSH (endbuf);
3031 break;
3033 case '1': case '2': case '3': case '4': case '5':
3034 case '6': case '7': case '8': case '9':
3035 if (syntax & RE_NO_BK_REFS)
3036 goto normal_char;
3038 c1 = c - '0';
3040 if (c1 > regnum)
3041 FREE_STACK_RETURN (REG_ESUBREG);
3043 /* Can't back reference to a subexpression if inside of it. */
3044 if (group_in_compile_stack (compile_stack, c1))
3045 goto normal_char;
3047 laststart = b;
3048 BUF_PUSH_2 (duplicate, c1);
3049 break;
3052 case '+':
3053 case '?':
3054 if (syntax & RE_BK_PLUS_QM)
3055 goto handle_plus;
3056 else
3057 goto normal_backslash;
3059 default:
3060 normal_backslash:
3061 /* You might think it would be useful for \ to mean
3062 not to translate; but if we don't translate it
3063 it will never match anything. */
3064 c = TRANSLATE (c);
3065 goto normal_char;
3067 break;
3070 default:
3071 /* Expects the character in `c'. */
3072 normal_char:
3073 p1 = p - 1; /* P1 points the head of C. */
3074 #ifdef emacs
3075 if (bufp->multibyte)
3077 c = STRING_CHAR (p1, pend - p1);
3078 c = TRANSLATE (c);
3079 /* Set P to the next character boundary. */
3080 p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
3082 #endif
3083 /* If no exactn currently being built. */
3084 if (!pending_exact
3086 /* If last exactn not at current position. */
3087 || pending_exact + *pending_exact + 1 != b
3089 /* We have only one byte following the exactn for the count. */
3090 || *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
3092 /* If followed by a repetition operator. */
3093 || (p != pend && (*p == '*' || *p == '^'))
3094 || ((syntax & RE_BK_PLUS_QM)
3095 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3096 : p != pend && (*p == '+' || *p == '?'))
3097 || ((syntax & RE_INTERVALS)
3098 && ((syntax & RE_NO_BK_BRACES)
3099 ? p != pend && *p == '{'
3100 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3102 /* Start building a new exactn. */
3104 laststart = b;
3106 BUF_PUSH_2 (exactn, 0);
3107 pending_exact = b - 1;
3110 #ifdef emacs
3111 if (! SINGLE_BYTE_CHAR_P (c))
3113 unsigned char work[4], *str;
3114 int i = CHAR_STRING (c, work, str);
3115 int j;
3116 for (j = 0; j < i; j++)
3118 BUF_PUSH (str[j]);
3119 (*pending_exact)++;
3122 else
3123 #endif
3125 BUF_PUSH (c);
3126 (*pending_exact)++;
3128 break;
3129 } /* switch (c) */
3130 } /* while p != pend */
3133 /* Through the pattern now. */
3135 if (fixup_alt_jump)
3136 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3138 if (!COMPILE_STACK_EMPTY)
3139 FREE_STACK_RETURN (REG_EPAREN);
3141 /* If we don't want backtracking, force success
3142 the first time we reach the end of the compiled pattern. */
3143 if (syntax & RE_NO_POSIX_BACKTRACKING)
3144 BUF_PUSH (succeed);
3146 free (compile_stack.stack);
3148 /* We have succeeded; set the length of the buffer. */
3149 bufp->used = b - bufp->buffer;
3151 #ifdef DEBUG
3152 if (debug)
3154 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3155 print_compiled_pattern (bufp);
3157 #endif /* DEBUG */
3159 #ifndef MATCH_MAY_ALLOCATE
3160 /* Initialize the failure stack to the largest possible stack. This
3161 isn't necessary unless we're trying to avoid calling alloca in
3162 the search and match routines. */
3164 int num_regs = bufp->re_nsub + 1;
3166 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3168 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3170 #ifdef emacs
3171 if (! fail_stack.stack)
3172 fail_stack.stack
3173 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3174 * sizeof (fail_stack_elt_t));
3175 else
3176 fail_stack.stack
3177 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3178 (fail_stack.size
3179 * sizeof (fail_stack_elt_t)));
3180 #else /* not emacs */
3181 if (! fail_stack.stack)
3182 fail_stack.stack
3183 = (fail_stack_elt_t *) malloc (fail_stack.size
3184 * sizeof (fail_stack_elt_t));
3185 else
3186 fail_stack.stack
3187 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3188 (fail_stack.size
3189 * sizeof (fail_stack_elt_t)));
3190 #endif /* not emacs */
3193 regex_grow_registers (num_regs);
3195 #endif /* not MATCH_MAY_ALLOCATE */
3197 return REG_NOERROR;
3198 } /* regex_compile */
3200 /* Subroutines for `regex_compile'. */
3202 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3204 static void
3205 store_op1 (op, loc, arg)
3206 re_opcode_t op;
3207 unsigned char *loc;
3208 int arg;
3210 *loc = (unsigned char) op;
3211 STORE_NUMBER (loc + 1, arg);
3215 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3217 static void
3218 store_op2 (op, loc, arg1, arg2)
3219 re_opcode_t op;
3220 unsigned char *loc;
3221 int arg1, arg2;
3223 *loc = (unsigned char) op;
3224 STORE_NUMBER (loc + 1, arg1);
3225 STORE_NUMBER (loc + 3, arg2);
3229 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3230 for OP followed by two-byte integer parameter ARG. */
3232 static void
3233 insert_op1 (op, loc, arg, end)
3234 re_opcode_t op;
3235 unsigned char *loc;
3236 int arg;
3237 unsigned char *end;
3239 register unsigned char *pfrom = end;
3240 register unsigned char *pto = end + 3;
3242 while (pfrom != loc)
3243 *--pto = *--pfrom;
3245 store_op1 (op, loc, arg);
3249 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3251 static void
3252 insert_op2 (op, loc, arg1, arg2, end)
3253 re_opcode_t op;
3254 unsigned char *loc;
3255 int arg1, arg2;
3256 unsigned char *end;
3258 register unsigned char *pfrom = end;
3259 register unsigned char *pto = end + 5;
3261 while (pfrom != loc)
3262 *--pto = *--pfrom;
3264 store_op2 (op, loc, arg1, arg2);
3268 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3269 after an alternative or a begin-subexpression. We assume there is at
3270 least one character before the ^. */
3272 static boolean
3273 at_begline_loc_p (pattern, p, syntax)
3274 const char *pattern, *p;
3275 reg_syntax_t syntax;
3277 const char *prev = p - 2;
3278 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3280 return
3281 /* After a subexpression? */
3282 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3283 /* After an alternative? */
3284 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3288 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3289 at least one character after the $, i.e., `P < PEND'. */
3291 static boolean
3292 at_endline_loc_p (p, pend, syntax)
3293 const char *p, *pend;
3294 int syntax;
3296 const char *next = p;
3297 boolean next_backslash = *next == '\\';
3298 const char *next_next = p + 1 < pend ? p + 1 : 0;
3300 return
3301 /* Before a subexpression? */
3302 (syntax & RE_NO_BK_PARENS ? *next == ')'
3303 : next_backslash && next_next && *next_next == ')')
3304 /* Before an alternative? */
3305 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3306 : next_backslash && next_next && *next_next == '|');
3310 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3311 false if it's not. */
3313 static boolean
3314 group_in_compile_stack (compile_stack, regnum)
3315 compile_stack_type compile_stack;
3316 regnum_t regnum;
3318 int this_element;
3320 for (this_element = compile_stack.avail - 1;
3321 this_element >= 0;
3322 this_element--)
3323 if (compile_stack.stack[this_element].regnum == regnum)
3324 return true;
3326 return false;
3329 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3330 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3331 characters can start a string that matches the pattern. This fastmap
3332 is used by re_search to skip quickly over impossible starting points.
3334 Character codes above (1 << BYTEWIDTH) are not represented in the
3335 fastmap, but the leading codes are represented. Thus, the fastmap
3336 indicates which character sets could start a match.
3338 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3339 area as BUFP->fastmap.
3341 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3342 the pattern buffer.
3344 Returns 0 if we succeed, -2 if an internal error. */
3347 re_compile_fastmap (bufp)
3348 struct re_pattern_buffer *bufp;
3350 int i, j, k;
3351 #ifdef MATCH_MAY_ALLOCATE
3352 fail_stack_type fail_stack;
3353 #endif
3354 #ifndef REGEX_MALLOC
3355 char *destination;
3356 #endif
3357 /* We don't push any register information onto the failure stack. */
3358 unsigned num_regs = 0;
3360 register char *fastmap = bufp->fastmap;
3361 unsigned char *pattern = bufp->buffer;
3362 unsigned long size = bufp->used;
3363 unsigned char *p = pattern;
3364 register unsigned char *pend = pattern + size;
3366 /* This holds the pointer to the failure stack, when
3367 it is allocated relocatably. */
3368 fail_stack_elt_t *failure_stack_ptr;
3370 /* Assume that each path through the pattern can be null until
3371 proven otherwise. We set this false at the bottom of switch
3372 statement, to which we get only if a particular path doesn't
3373 match the empty string. */
3374 boolean path_can_be_null = true;
3376 /* We aren't doing a `succeed_n' to begin with. */
3377 boolean succeed_n_p = false;
3379 /* If all elements for base leading-codes in fastmap is set, this
3380 flag is set true. */
3381 boolean match_any_multibyte_characters = false;
3383 /* Maximum code of simple (single byte) character. */
3384 int simple_char_max;
3386 assert (fastmap != NULL && p != NULL);
3388 INIT_FAIL_STACK ();
3389 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3390 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3391 bufp->can_be_null = 0;
3393 while (1)
3395 if (p == pend || *p == succeed)
3397 /* We have reached the (effective) end of pattern. */
3398 if (!FAIL_STACK_EMPTY ())
3400 bufp->can_be_null |= path_can_be_null;
3402 /* Reset for next path. */
3403 path_can_be_null = true;
3405 p = fail_stack.stack[--fail_stack.avail].pointer;
3407 continue;
3409 else
3410 break;
3413 /* We should never be about to go beyond the end of the pattern. */
3414 assert (p < pend);
3416 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3419 /* I guess the idea here is to simply not bother with a fastmap
3420 if a backreference is used, since it's too hard to figure out
3421 the fastmap for the corresponding group. Setting
3422 `can_be_null' stops `re_search_2' from using the fastmap, so
3423 that is all we do. */
3424 case duplicate:
3425 bufp->can_be_null = 1;
3426 goto done;
3429 /* Following are the cases which match a character. These end
3430 with `break'. */
3432 case exactn:
3433 fastmap[p[1]] = 1;
3434 break;
3437 #ifndef emacs
3438 case charset:
3440 int length = (*p & 0x7f);;
3441 p++;
3443 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3444 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3445 fastmap[j] = 1;
3447 break;
3449 case charset_not:
3450 /* Chars beyond end of map must be allowed. */
3452 int length = (*p & 0x7f);;
3453 p++;
3455 for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3456 fastmap[j] = 1;
3458 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3459 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3460 fastmap[j] = 1;
3462 break;
3464 case wordchar:
3465 for (j = 0; j < (1 << BYTEWIDTH); j++)
3466 if (SYNTAX (j) == Sword)
3467 fastmap[j] = 1;
3468 break;
3471 case notwordchar:
3472 for (j = 0; j < (1 << BYTEWIDTH); j++)
3473 if (SYNTAX (j) != Sword)
3474 fastmap[j] = 1;
3475 break;
3476 #else /* emacs */
3477 case charset:
3478 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3479 j >= 0; j--)
3480 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3481 fastmap[j] = 1;
3483 /* If we can match a character class, we can match
3484 any character set. */
3485 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3486 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)
3487 goto set_fastmap_for_multibyte_characters;
3489 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3490 && match_any_multibyte_characters == false)
3492 /* Set fastmap[I] 1 where I is a base leading code of each
3493 multibyte character in the range table. */
3494 int c, count;
3496 /* Make P points the range table. */
3497 p += CHARSET_BITMAP_SIZE (&p[-2]);
3499 /* Extract the number of ranges in range table into COUNT. */
3500 EXTRACT_NUMBER_AND_INCR (count, p);
3501 for (; count > 0; count--, p += 2 * 3) /* XXX */
3503 /* Extract the start of each range. */
3504 EXTRACT_CHARACTER (c, p);
3505 j = CHAR_CHARSET (c);
3506 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3509 break;
3512 case charset_not:
3513 /* Chars beyond end of bitmap are possible matches.
3514 All the single-byte codes can occur in multibyte buffers.
3515 So any that are not listed in the charset
3516 are possible matches, even in multibyte buffers. */
3517 simple_char_max = (1 << BYTEWIDTH);
3518 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3519 j < simple_char_max; j++)
3520 fastmap[j] = 1;
3522 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3523 j >= 0; j--)
3524 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3525 fastmap[j] = 1;
3527 if (bufp->multibyte)
3528 /* Any character set can possibly contain a character
3529 which doesn't match the specified set of characters. */
3531 set_fastmap_for_multibyte_characters:
3532 if (match_any_multibyte_characters == false)
3534 for (j = 0x80; j < 0xA0; j++) /* XXX */
3535 if (BASE_LEADING_CODE_P (j))
3536 fastmap[j] = 1;
3537 match_any_multibyte_characters = true;
3540 break;
3543 case wordchar:
3544 /* All the single-byte codes can occur in multibyte buffers,
3545 and they may have word syntax. So do consider them. */
3546 simple_char_max = (1 << BYTEWIDTH);
3547 for (j = 0; j < simple_char_max; j++)
3548 if (SYNTAX (j) == Sword)
3549 fastmap[j] = 1;
3551 if (bufp->multibyte)
3552 /* Any character set can possibly contain a character
3553 whose syntax is `Sword'. */
3554 goto set_fastmap_for_multibyte_characters;
3555 break;
3558 case notwordchar:
3559 /* All the single-byte codes can occur in multibyte buffers,
3560 and they may not have word syntax. So do consider them. */
3561 simple_char_max = (1 << BYTEWIDTH);
3562 for (j = 0; j < simple_char_max; j++)
3563 if (SYNTAX (j) != Sword)
3564 fastmap[j] = 1;
3566 if (bufp->multibyte)
3567 /* Any character set can possibly contain a character
3568 whose syntax is not `Sword'. */
3569 goto set_fastmap_for_multibyte_characters;
3570 break;
3571 #endif
3573 case anychar:
3575 int fastmap_newline = fastmap['\n'];
3577 /* `.' matches anything, except perhaps newline.
3578 Even in a multibyte buffer, it should match any
3579 conceivable byte value for the fastmap. */
3580 if (bufp->multibyte)
3581 match_any_multibyte_characters = true;
3583 simple_char_max = (1 << BYTEWIDTH);
3584 for (j = 0; j < simple_char_max; j++)
3585 fastmap[j] = 1;
3587 /* ... except perhaps newline. */
3588 if (!(bufp->syntax & RE_DOT_NEWLINE))
3589 fastmap['\n'] = fastmap_newline;
3591 /* Return if we have already set `can_be_null'; if we have,
3592 then the fastmap is irrelevant. Something's wrong here. */
3593 else if (bufp->can_be_null)
3594 goto done;
3596 /* Otherwise, have to check alternative paths. */
3597 break;
3600 #ifdef emacs
3601 case wordbound:
3602 case notwordbound:
3603 case wordbeg:
3604 case wordend:
3605 case notsyntaxspec:
3606 case syntaxspec:
3607 /* This match depends on text properties. These end with
3608 aborting optimizations. */
3609 bufp->can_be_null = 1;
3610 goto done;
3611 #if 0
3612 k = *p++;
3613 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3614 for (j = 0; j < simple_char_max; j++)
3615 if (SYNTAX (j) == (enum syntaxcode) k)
3616 fastmap[j] = 1;
3618 if (bufp->multibyte)
3619 /* Any character set can possibly contain a character
3620 whose syntax is K. */
3621 goto set_fastmap_for_multibyte_characters;
3622 break;
3624 case notsyntaxspec:
3625 k = *p++;
3626 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3627 for (j = 0; j < simple_char_max; j++)
3628 if (SYNTAX (j) != (enum syntaxcode) k)
3629 fastmap[j] = 1;
3631 if (bufp->multibyte)
3632 /* Any character set can possibly contain a character
3633 whose syntax is not K. */
3634 goto set_fastmap_for_multibyte_characters;
3635 break;
3636 #endif
3639 case categoryspec:
3640 k = *p++;
3641 simple_char_max = (1 << BYTEWIDTH);
3642 for (j = 0; j < simple_char_max; j++)
3643 if (CHAR_HAS_CATEGORY (j, k))
3644 fastmap[j] = 1;
3646 if (bufp->multibyte)
3647 /* Any character set can possibly contain a character
3648 whose category is K. */
3649 goto set_fastmap_for_multibyte_characters;
3650 break;
3653 case notcategoryspec:
3654 k = *p++;
3655 simple_char_max = (1 << BYTEWIDTH);
3656 for (j = 0; j < simple_char_max; j++)
3657 if (!CHAR_HAS_CATEGORY (j, k))
3658 fastmap[j] = 1;
3660 if (bufp->multibyte)
3661 /* Any character set can possibly contain a character
3662 whose category is not K. */
3663 goto set_fastmap_for_multibyte_characters;
3664 break;
3666 /* All cases after this match the empty string. These end with
3667 `continue'. */
3670 case before_dot:
3671 case at_dot:
3672 case after_dot:
3673 continue;
3674 #endif /* emacs */
3677 case no_op:
3678 case begline:
3679 case endline:
3680 case begbuf:
3681 case endbuf:
3682 #ifndef emacs
3683 case wordbound:
3684 case notwordbound:
3685 case wordbeg:
3686 case wordend:
3687 #endif
3688 case push_dummy_failure:
3689 continue;
3692 case jump_n:
3693 case pop_failure_jump:
3694 case maybe_pop_jump:
3695 case jump:
3696 case jump_past_alt:
3697 case dummy_failure_jump:
3698 EXTRACT_NUMBER_AND_INCR (j, p);
3699 p += j;
3700 if (j > 0)
3701 continue;
3703 /* Jump backward implies we just went through the body of a
3704 loop and matched nothing. Opcode jumped to should be
3705 `on_failure_jump' or `succeed_n'. Just treat it like an
3706 ordinary jump. For a * loop, it has pushed its failure
3707 point already; if so, discard that as redundant. */
3708 if ((re_opcode_t) *p != on_failure_jump
3709 && (re_opcode_t) *p != succeed_n)
3710 continue;
3712 p++;
3713 EXTRACT_NUMBER_AND_INCR (j, p);
3714 p += j;
3716 /* If what's on the stack is where we are now, pop it. */
3717 if (!FAIL_STACK_EMPTY ()
3718 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3719 fail_stack.avail--;
3721 continue;
3724 case on_failure_jump:
3725 case on_failure_keep_string_jump:
3726 handle_on_failure_jump:
3727 EXTRACT_NUMBER_AND_INCR (j, p);
3729 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3730 end of the pattern. We don't want to push such a point,
3731 since when we restore it above, entering the switch will
3732 increment `p' past the end of the pattern. We don't need
3733 to push such a point since we obviously won't find any more
3734 fastmap entries beyond `pend'. Such a pattern can match
3735 the null string, though. */
3736 if (p + j < pend)
3738 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3740 RESET_FAIL_STACK ();
3741 return -2;
3744 else
3745 bufp->can_be_null = 1;
3747 if (succeed_n_p)
3749 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3750 succeed_n_p = false;
3753 continue;
3756 case succeed_n:
3757 /* Get to the number of times to succeed. */
3758 p += 2;
3760 /* Increment p past the n for when k != 0. */
3761 EXTRACT_NUMBER_AND_INCR (k, p);
3762 if (k == 0)
3764 p -= 4;
3765 succeed_n_p = true; /* Spaghetti code alert. */
3766 goto handle_on_failure_jump;
3768 continue;
3771 case set_number_at:
3772 p += 4;
3773 continue;
3776 case start_memory:
3777 case stop_memory:
3778 p += 2;
3779 continue;
3782 default:
3783 abort (); /* We have listed all the cases. */
3784 } /* switch *p++ */
3786 /* Getting here means we have found the possible starting
3787 characters for one path of the pattern -- and that the empty
3788 string does not match. We need not follow this path further.
3789 Instead, look at the next alternative (remembered on the
3790 stack), or quit if no more. The test at the top of the loop
3791 does these things. */
3792 path_can_be_null = false;
3793 p = pend;
3794 } /* while p */
3796 /* Set `can_be_null' for the last path (also the first path, if the
3797 pattern is empty). */
3798 bufp->can_be_null |= path_can_be_null;
3800 done:
3801 RESET_FAIL_STACK ();
3802 return 0;
3803 } /* re_compile_fastmap */
3805 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3806 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3807 this memory for recording register information. STARTS and ENDS
3808 must be allocated using the malloc library routine, and must each
3809 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3811 If NUM_REGS == 0, then subsequent matches should allocate their own
3812 register data.
3814 Unless this function is called, the first search or match using
3815 PATTERN_BUFFER will allocate its own register data, without
3816 freeing the old data. */
3818 void
3819 re_set_registers (bufp, regs, num_regs, starts, ends)
3820 struct re_pattern_buffer *bufp;
3821 struct re_registers *regs;
3822 unsigned num_regs;
3823 regoff_t *starts, *ends;
3825 if (num_regs)
3827 bufp->regs_allocated = REGS_REALLOCATE;
3828 regs->num_regs = num_regs;
3829 regs->start = starts;
3830 regs->end = ends;
3832 else
3834 bufp->regs_allocated = REGS_UNALLOCATED;
3835 regs->num_regs = 0;
3836 regs->start = regs->end = (regoff_t *) 0;
3840 /* Searching routines. */
3842 /* Like re_search_2, below, but only one string is specified, and
3843 doesn't let you say where to stop matching. */
3846 re_search (bufp, string, size, startpos, range, regs)
3847 struct re_pattern_buffer *bufp;
3848 const char *string;
3849 int size, startpos, range;
3850 struct re_registers *regs;
3852 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3853 regs, size);
3856 /* End address of virtual concatenation of string. */
3857 #define STOP_ADDR_VSTRING(P) \
3858 (((P) >= size1 ? string2 + size2 : string1 + size1))
3860 /* Address of POS in the concatenation of virtual string. */
3861 #define POS_ADDR_VSTRING(POS) \
3862 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3864 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3865 virtual concatenation of STRING1 and STRING2, starting first at index
3866 STARTPOS, then at STARTPOS + 1, and so on.
3868 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3870 RANGE is how far to scan while trying to match. RANGE = 0 means try
3871 only at STARTPOS; in general, the last start tried is STARTPOS +
3872 RANGE.
3874 In REGS, return the indices of the virtual concatenation of STRING1
3875 and STRING2 that matched the entire BUFP->buffer and its contained
3876 subexpressions.
3878 Do not consider matching one past the index STOP in the virtual
3879 concatenation of STRING1 and STRING2.
3881 We return either the position in the strings at which the match was
3882 found, -1 if no match, or -2 if error (such as failure
3883 stack overflow). */
3886 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3887 struct re_pattern_buffer *bufp;
3888 const char *string1, *string2;
3889 int size1, size2;
3890 int startpos;
3891 int range;
3892 struct re_registers *regs;
3893 int stop;
3895 int val;
3896 register char *fastmap = bufp->fastmap;
3897 register RE_TRANSLATE_TYPE translate = bufp->translate;
3898 int total_size = size1 + size2;
3899 int endpos = startpos + range;
3900 int anchored_start = 0;
3902 /* Nonzero if we have to concern multibyte character. */
3903 int multibyte = bufp->multibyte;
3905 /* Check for out-of-range STARTPOS. */
3906 if (startpos < 0 || startpos > total_size)
3907 return -1;
3909 /* Fix up RANGE if it might eventually take us outside
3910 the virtual concatenation of STRING1 and STRING2.
3911 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3912 if (endpos < 0)
3913 range = 0 - startpos;
3914 else if (endpos > total_size)
3915 range = total_size - startpos;
3917 /* If the search isn't to be a backwards one, don't waste time in a
3918 search for a pattern anchored at beginning of buffer. */
3919 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3921 if (startpos > 0)
3922 return -1;
3923 else
3924 range = 0;
3927 #ifdef emacs
3928 /* In a forward search for something that starts with \=.
3929 don't keep searching past point. */
3930 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3932 range = PT_BYTE - BEGV_BYTE - startpos;
3933 if (range < 0)
3934 return -1;
3936 #endif /* emacs */
3938 /* Update the fastmap now if not correct already. */
3939 if (fastmap && !bufp->fastmap_accurate)
3940 if (re_compile_fastmap (bufp) == -2)
3941 return -2;
3943 /* See whether the pattern is anchored. */
3944 if (bufp->buffer[0] == begline)
3945 anchored_start = 1;
3947 #ifdef emacs
3948 gl_state.object = re_match_object;
3950 int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
3951 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos);
3953 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3955 #endif
3957 /* Loop through the string, looking for a place to start matching. */
3958 for (;;)
3960 /* If the pattern is anchored,
3961 skip quickly past places we cannot match.
3962 We don't bother to treat startpos == 0 specially
3963 because that case doesn't repeat. */
3964 if (anchored_start && startpos > 0)
3966 if (! (bufp->newline_anchor
3967 && ((startpos <= size1 ? string1[startpos - 1]
3968 : string2[startpos - size1 - 1])
3969 == '\n')))
3970 goto advance;
3973 /* If a fastmap is supplied, skip quickly over characters that
3974 cannot be the start of a match. If the pattern can match the
3975 null string, however, we don't need to skip characters; we want
3976 the first null string. */
3977 if (fastmap && startpos < total_size && !bufp->can_be_null)
3979 register const char *d;
3980 register unsigned int buf_ch;
3982 d = POS_ADDR_VSTRING (startpos);
3984 if (range > 0) /* Searching forwards. */
3986 register int lim = 0;
3987 int irange = range;
3989 if (startpos < size1 && startpos + range >= size1)
3990 lim = range - (size1 - startpos);
3992 /* Written out as an if-else to avoid testing `translate'
3993 inside the loop. */
3994 if (RE_TRANSLATE_P (translate))
3996 if (multibyte)
3997 while (range > lim)
3999 int buf_charlen;
4001 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4002 buf_charlen);
4004 buf_ch = RE_TRANSLATE (translate, buf_ch);
4005 if (buf_ch >= 0400
4006 || fastmap[buf_ch])
4007 break;
4009 range -= buf_charlen;
4010 d += buf_charlen;
4012 else
4013 while (range > lim
4014 && !fastmap[(unsigned char)
4015 RE_TRANSLATE (translate, (unsigned char) *d)])
4017 d++;
4018 range--;
4021 else
4022 while (range > lim && !fastmap[(unsigned char) *d])
4024 d++;
4025 range--;
4028 startpos += irange - range;
4030 else /* Searching backwards. */
4032 int room = (size1 == 0 || startpos >= size1
4033 ? size2 + size1 - startpos
4034 : size1 - startpos);
4036 buf_ch = STRING_CHAR (d, room);
4037 if (RE_TRANSLATE_P (translate))
4038 buf_ch = RE_TRANSLATE (translate, buf_ch);
4040 if (! (buf_ch >= 0400
4041 || fastmap[buf_ch]))
4042 goto advance;
4046 /* If can't match the null string, and that's all we have left, fail. */
4047 if (range >= 0 && startpos == total_size && fastmap
4048 && !bufp->can_be_null)
4049 return -1;
4051 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4052 startpos, regs, stop);
4053 #ifndef REGEX_MALLOC
4054 #ifdef C_ALLOCA
4055 alloca (0);
4056 #endif
4057 #endif
4059 if (val >= 0)
4060 return startpos;
4062 if (val == -2)
4063 return -2;
4065 advance:
4066 if (!range)
4067 break;
4068 else if (range > 0)
4070 /* Update STARTPOS to the next character boundary. */
4071 if (multibyte)
4073 const unsigned char *p
4074 = (const unsigned char *) POS_ADDR_VSTRING (startpos);
4075 const unsigned char *pend
4076 = (const unsigned char *) STOP_ADDR_VSTRING (startpos);
4077 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4079 range -= len;
4080 if (range < 0)
4081 break;
4082 startpos += len;
4084 else
4086 range--;
4087 startpos++;
4090 else
4092 range++;
4093 startpos--;
4095 /* Update STARTPOS to the previous character boundary. */
4096 if (multibyte)
4098 const unsigned char *p
4099 = (const unsigned char *) POS_ADDR_VSTRING (startpos);
4100 int len = 0;
4102 /* Find the head of multibyte form. */
4103 while (!CHAR_HEAD_P (*p))
4104 p--, len++;
4106 /* Adjust it. */
4107 #if 0 /* XXX */
4108 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
4110 else
4111 #endif
4113 range += len;
4114 if (range > 0)
4115 break;
4117 startpos -= len;
4122 return -1;
4123 } /* re_search_2 */
4125 /* Declarations and macros for re_match_2. */
4127 static int bcmp_translate ();
4128 static boolean alt_match_null_string_p (),
4129 common_op_match_null_string_p (),
4130 group_match_null_string_p ();
4132 /* This converts PTR, a pointer into one of the search strings `string1'
4133 and `string2' into an offset from the beginning of that string. */
4134 #define POINTER_TO_OFFSET(ptr) \
4135 (FIRST_STRING_P (ptr) \
4136 ? ((regoff_t) ((ptr) - string1)) \
4137 : ((regoff_t) ((ptr) - string2 + size1)))
4139 /* Macros for dealing with the split strings in re_match_2. */
4141 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4143 /* Call before fetching a character with *d. This switches over to
4144 string2 if necessary. */
4145 #define PREFETCH() \
4146 while (d == dend) \
4148 /* End of string2 => fail. */ \
4149 if (dend == end_match_2) \
4150 goto fail; \
4151 /* End of string1 => advance to string2. */ \
4152 d = string2; \
4153 dend = end_match_2; \
4157 /* Test if at very beginning or at very end of the virtual concatenation
4158 of `string1' and `string2'. If only one string, it's `string2'. */
4159 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4160 #define AT_STRINGS_END(d) ((d) == end2)
4163 /* Test if D points to a character which is word-constituent. We have
4164 two special cases to check for: if past the end of string1, look at
4165 the first character in string2; and if before the beginning of
4166 string2, look at the last character in string1. */
4167 #define WORDCHAR_P(d) \
4168 (SYNTAX ((d) == end1 ? *string2 \
4169 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4170 == Sword)
4172 /* Disabled due to a compiler bug -- see comment at case wordbound */
4174 /* The comment at case wordbound is following one, but we don't use
4175 AT_WORD_BOUNDARY anymore to support multibyte form.
4177 The DEC Alpha C compiler 3.x generates incorrect code for the
4178 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4179 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4180 macro and introducing temporary variables works around the bug. */
4182 #if 0
4183 /* Test if the character before D and the one at D differ with respect
4184 to being word-constituent. */
4185 #define AT_WORD_BOUNDARY(d) \
4186 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4187 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4188 #endif
4190 /* Free everything we malloc. */
4191 #ifdef MATCH_MAY_ALLOCATE
4192 #define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4193 #define FREE_VARIABLES() \
4194 do { \
4195 REGEX_FREE_STACK (fail_stack.stack); \
4196 FREE_VAR (regstart); \
4197 FREE_VAR (regend); \
4198 FREE_VAR (old_regstart); \
4199 FREE_VAR (old_regend); \
4200 FREE_VAR (best_regstart); \
4201 FREE_VAR (best_regend); \
4202 FREE_VAR (reg_info); \
4203 FREE_VAR (reg_dummy); \
4204 FREE_VAR (reg_info_dummy); \
4205 } while (0)
4206 #else
4207 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4208 #endif /* not MATCH_MAY_ALLOCATE */
4210 /* These values must meet several constraints. They must not be valid
4211 register values; since we have a limit of 255 registers (because
4212 we use only one byte in the pattern for the register number), we can
4213 use numbers larger than 255. They must differ by 1, because of
4214 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4215 be larger than the value for the highest register, so we do not try
4216 to actually save any registers when none are active. */
4217 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4218 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4220 /* Matching routines. */
4222 #ifndef emacs /* Emacs never uses this. */
4223 /* re_match is like re_match_2 except it takes only a single string. */
4226 re_match (bufp, string, size, pos, regs)
4227 struct re_pattern_buffer *bufp;
4228 const char *string;
4229 int size, pos;
4230 struct re_registers *regs;
4232 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4233 pos, regs, size);
4234 alloca (0);
4235 return result;
4237 #endif /* not emacs */
4239 #ifdef emacs
4240 /* In Emacs, this is the string or buffer in which we
4241 are matching. It is used for looking up syntax properties. */
4242 Lisp_Object re_match_object;
4243 #endif
4245 /* re_match_2 matches the compiled pattern in BUFP against the
4246 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4247 and SIZE2, respectively). We start matching at POS, and stop
4248 matching at STOP.
4250 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4251 store offsets for the substring each group matched in REGS. See the
4252 documentation for exactly how many groups we fill.
4254 We return -1 if no match, -2 if an internal error (such as the
4255 failure stack overflowing). Otherwise, we return the length of the
4256 matched substring. */
4259 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4260 struct re_pattern_buffer *bufp;
4261 const char *string1, *string2;
4262 int size1, size2;
4263 int pos;
4264 struct re_registers *regs;
4265 int stop;
4267 int result;
4269 #ifdef emacs
4270 int charpos;
4271 int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
4272 gl_state.object = re_match_object;
4273 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos);
4274 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4275 #endif
4277 result = re_match_2_internal (bufp, string1, size1, string2, size2,
4278 pos, regs, stop);
4279 alloca (0);
4280 return result;
4283 /* This is a separate function so that we can force an alloca cleanup
4284 afterwards. */
4285 static int
4286 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4287 struct re_pattern_buffer *bufp;
4288 const char *string1, *string2;
4289 int size1, size2;
4290 int pos;
4291 struct re_registers *regs;
4292 int stop;
4294 /* General temporaries. */
4295 int mcnt;
4296 unsigned char *p1;
4298 /* Just past the end of the corresponding string. */
4299 const char *end1, *end2;
4301 /* Pointers into string1 and string2, just past the last characters in
4302 each to consider matching. */
4303 const char *end_match_1, *end_match_2;
4305 /* Where we are in the data, and the end of the current string. */
4306 const char *d, *dend;
4308 /* Where we are in the pattern, and the end of the pattern. */
4309 unsigned char *p = bufp->buffer;
4310 register unsigned char *pend = p + bufp->used;
4312 /* Mark the opcode just after a start_memory, so we can test for an
4313 empty subpattern when we get to the stop_memory. */
4314 unsigned char *just_past_start_mem = 0;
4316 /* We use this to map every character in the string. */
4317 RE_TRANSLATE_TYPE translate = bufp->translate;
4319 /* Nonzero if we have to concern multibyte character. */
4320 int multibyte = bufp->multibyte;
4322 /* Failure point stack. Each place that can handle a failure further
4323 down the line pushes a failure point on this stack. It consists of
4324 restart, regend, and reg_info for all registers corresponding to
4325 the subexpressions we're currently inside, plus the number of such
4326 registers, and, finally, two char *'s. The first char * is where
4327 to resume scanning the pattern; the second one is where to resume
4328 scanning the strings. If the latter is zero, the failure point is
4329 a ``dummy''; if a failure happens and the failure point is a dummy,
4330 it gets discarded and the next next one is tried. */
4331 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4332 fail_stack_type fail_stack;
4333 #endif
4334 #ifdef DEBUG
4335 static unsigned failure_id = 0;
4336 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4337 #endif
4339 /* This holds the pointer to the failure stack, when
4340 it is allocated relocatably. */
4341 fail_stack_elt_t *failure_stack_ptr;
4343 /* We fill all the registers internally, independent of what we
4344 return, for use in backreferences. The number here includes
4345 an element for register zero. */
4346 unsigned num_regs = bufp->re_nsub + 1;
4348 /* The currently active registers. */
4349 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4350 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4352 /* Information on the contents of registers. These are pointers into
4353 the input strings; they record just what was matched (on this
4354 attempt) by a subexpression part of the pattern, that is, the
4355 regnum-th regstart pointer points to where in the pattern we began
4356 matching and the regnum-th regend points to right after where we
4357 stopped matching the regnum-th subexpression. (The zeroth register
4358 keeps track of what the whole pattern matches.) */
4359 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4360 const char **regstart, **regend;
4361 #endif
4363 /* If a group that's operated upon by a repetition operator fails to
4364 match anything, then the register for its start will need to be
4365 restored because it will have been set to wherever in the string we
4366 are when we last see its open-group operator. Similarly for a
4367 register's end. */
4368 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4369 const char **old_regstart, **old_regend;
4370 #endif
4372 /* The is_active field of reg_info helps us keep track of which (possibly
4373 nested) subexpressions we are currently in. The matched_something
4374 field of reg_info[reg_num] helps us tell whether or not we have
4375 matched any of the pattern so far this time through the reg_num-th
4376 subexpression. These two fields get reset each time through any
4377 loop their register is in. */
4378 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4379 register_info_type *reg_info;
4380 #endif
4382 /* The following record the register info as found in the above
4383 variables when we find a match better than any we've seen before.
4384 This happens as we backtrack through the failure points, which in
4385 turn happens only if we have not yet matched the entire string. */
4386 unsigned best_regs_set = false;
4387 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4388 const char **best_regstart, **best_regend;
4389 #endif
4391 /* Logically, this is `best_regend[0]'. But we don't want to have to
4392 allocate space for that if we're not allocating space for anything
4393 else (see below). Also, we never need info about register 0 for
4394 any of the other register vectors, and it seems rather a kludge to
4395 treat `best_regend' differently than the rest. So we keep track of
4396 the end of the best match so far in a separate variable. We
4397 initialize this to NULL so that when we backtrack the first time
4398 and need to test it, it's not garbage. */
4399 const char *match_end = NULL;
4401 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4402 int set_regs_matched_done = 0;
4404 /* Used when we pop values we don't care about. */
4405 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4406 const char **reg_dummy;
4407 register_info_type *reg_info_dummy;
4408 #endif
4410 #ifdef DEBUG
4411 /* Counts the total number of registers pushed. */
4412 unsigned num_regs_pushed = 0;
4413 #endif
4415 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4417 INIT_FAIL_STACK ();
4419 #ifdef MATCH_MAY_ALLOCATE
4420 /* Do not bother to initialize all the register variables if there are
4421 no groups in the pattern, as it takes a fair amount of time. If
4422 there are groups, we include space for register 0 (the whole
4423 pattern), even though we never use it, since it simplifies the
4424 array indexing. We should fix this. */
4425 if (bufp->re_nsub)
4427 regstart = REGEX_TALLOC (num_regs, const char *);
4428 regend = REGEX_TALLOC (num_regs, const char *);
4429 old_regstart = REGEX_TALLOC (num_regs, const char *);
4430 old_regend = REGEX_TALLOC (num_regs, const char *);
4431 best_regstart = REGEX_TALLOC (num_regs, const char *);
4432 best_regend = REGEX_TALLOC (num_regs, const char *);
4433 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4434 reg_dummy = REGEX_TALLOC (num_regs, const char *);
4435 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4437 if (!(regstart && regend && old_regstart && old_regend && reg_info
4438 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4440 FREE_VARIABLES ();
4441 return -2;
4444 else
4446 /* We must initialize all our variables to NULL, so that
4447 `FREE_VARIABLES' doesn't try to free them. */
4448 regstart = regend = old_regstart = old_regend = best_regstart
4449 = best_regend = reg_dummy = NULL;
4450 reg_info = reg_info_dummy = (register_info_type *) NULL;
4452 #endif /* MATCH_MAY_ALLOCATE */
4454 /* The starting position is bogus. */
4455 if (pos < 0 || pos > size1 + size2)
4457 FREE_VARIABLES ();
4458 return -1;
4461 /* Initialize subexpression text positions to -1 to mark ones that no
4462 start_memory/stop_memory has been seen for. Also initialize the
4463 register information struct. */
4464 for (mcnt = 1; mcnt < num_regs; mcnt++)
4466 regstart[mcnt] = regend[mcnt]
4467 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4469 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4470 IS_ACTIVE (reg_info[mcnt]) = 0;
4471 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4472 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4475 /* We move `string1' into `string2' if the latter's empty -- but not if
4476 `string1' is null. */
4477 if (size2 == 0 && string1 != NULL)
4479 string2 = string1;
4480 size2 = size1;
4481 string1 = 0;
4482 size1 = 0;
4484 end1 = string1 + size1;
4485 end2 = string2 + size2;
4487 /* Compute where to stop matching, within the two strings. */
4488 if (stop <= size1)
4490 end_match_1 = string1 + stop;
4491 end_match_2 = string2;
4493 else
4495 end_match_1 = end1;
4496 end_match_2 = string2 + stop - size1;
4499 /* `p' scans through the pattern as `d' scans through the data.
4500 `dend' is the end of the input string that `d' points within. `d'
4501 is advanced into the following input string whenever necessary, but
4502 this happens before fetching; therefore, at the beginning of the
4503 loop, `d' can be pointing at the end of a string, but it cannot
4504 equal `string2'. */
4505 if (size1 > 0 && pos <= size1)
4507 d = string1 + pos;
4508 dend = end_match_1;
4510 else
4512 d = string2 + pos - size1;
4513 dend = end_match_2;
4516 DEBUG_PRINT1 ("The compiled pattern is: ");
4517 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4518 DEBUG_PRINT1 ("The string to match is: `");
4519 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4520 DEBUG_PRINT1 ("'\n");
4522 /* This loops over pattern commands. It exits by returning from the
4523 function if the match is complete, or it drops through if the match
4524 fails at this starting point in the input data. */
4525 for (;;)
4527 DEBUG_PRINT2 ("\n0x%x: ", p);
4529 if (p == pend)
4530 { /* End of pattern means we might have succeeded. */
4531 DEBUG_PRINT1 ("end of pattern ... ");
4533 /* If we haven't matched the entire string, and we want the
4534 longest match, try backtracking. */
4535 if (d != end_match_2)
4537 /* 1 if this match ends in the same string (string1 or string2)
4538 as the best previous match. */
4539 boolean same_str_p = (FIRST_STRING_P (match_end)
4540 == MATCHING_IN_FIRST_STRING);
4541 /* 1 if this match is the best seen so far. */
4542 boolean best_match_p;
4544 /* AIX compiler got confused when this was combined
4545 with the previous declaration. */
4546 if (same_str_p)
4547 best_match_p = d > match_end;
4548 else
4549 best_match_p = !MATCHING_IN_FIRST_STRING;
4551 DEBUG_PRINT1 ("backtracking.\n");
4553 if (!FAIL_STACK_EMPTY ())
4554 { /* More failure points to try. */
4556 /* If exceeds best match so far, save it. */
4557 if (!best_regs_set || best_match_p)
4559 best_regs_set = true;
4560 match_end = d;
4562 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4564 for (mcnt = 1; mcnt < num_regs; mcnt++)
4566 best_regstart[mcnt] = regstart[mcnt];
4567 best_regend[mcnt] = regend[mcnt];
4570 goto fail;
4573 /* If no failure points, don't restore garbage. And if
4574 last match is real best match, don't restore second
4575 best one. */
4576 else if (best_regs_set && !best_match_p)
4578 restore_best_regs:
4579 /* Restore best match. It may happen that `dend ==
4580 end_match_1' while the restored d is in string2.
4581 For example, the pattern `x.*y.*z' against the
4582 strings `x-' and `y-z-', if the two strings are
4583 not consecutive in memory. */
4584 DEBUG_PRINT1 ("Restoring best registers.\n");
4586 d = match_end;
4587 dend = ((d >= string1 && d <= end1)
4588 ? end_match_1 : end_match_2);
4590 for (mcnt = 1; mcnt < num_regs; mcnt++)
4592 regstart[mcnt] = best_regstart[mcnt];
4593 regend[mcnt] = best_regend[mcnt];
4596 } /* d != end_match_2 */
4598 succeed_label:
4599 DEBUG_PRINT1 ("Accepting match.\n");
4601 /* If caller wants register contents data back, do it. */
4602 if (regs && !bufp->no_sub)
4604 /* Have the register data arrays been allocated? */
4605 if (bufp->regs_allocated == REGS_UNALLOCATED)
4606 { /* No. So allocate them with malloc. We need one
4607 extra element beyond `num_regs' for the `-1' marker
4608 GNU code uses. */
4609 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4610 regs->start = TALLOC (regs->num_regs, regoff_t);
4611 regs->end = TALLOC (regs->num_regs, regoff_t);
4612 if (regs->start == NULL || regs->end == NULL)
4614 FREE_VARIABLES ();
4615 return -2;
4617 bufp->regs_allocated = REGS_REALLOCATE;
4619 else if (bufp->regs_allocated == REGS_REALLOCATE)
4620 { /* Yes. If we need more elements than were already
4621 allocated, reallocate them. If we need fewer, just
4622 leave it alone. */
4623 if (regs->num_regs < num_regs + 1)
4625 regs->num_regs = num_regs + 1;
4626 RETALLOC (regs->start, regs->num_regs, regoff_t);
4627 RETALLOC (regs->end, regs->num_regs, regoff_t);
4628 if (regs->start == NULL || regs->end == NULL)
4630 FREE_VARIABLES ();
4631 return -2;
4635 else
4637 /* These braces fend off a "empty body in an else-statement"
4638 warning under GCC when assert expands to nothing. */
4639 assert (bufp->regs_allocated == REGS_FIXED);
4642 /* Convert the pointer data in `regstart' and `regend' to
4643 indices. Register zero has to be set differently,
4644 since we haven't kept track of any info for it. */
4645 if (regs->num_regs > 0)
4647 regs->start[0] = pos;
4648 regs->end[0] = (MATCHING_IN_FIRST_STRING
4649 ? ((regoff_t) (d - string1))
4650 : ((regoff_t) (d - string2 + size1)));
4653 /* Go through the first `min (num_regs, regs->num_regs)'
4654 registers, since that is all we initialized. */
4655 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4657 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4658 regs->start[mcnt] = regs->end[mcnt] = -1;
4659 else
4661 regs->start[mcnt]
4662 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4663 regs->end[mcnt]
4664 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4668 /* If the regs structure we return has more elements than
4669 were in the pattern, set the extra elements to -1. If
4670 we (re)allocated the registers, this is the case,
4671 because we always allocate enough to have at least one
4672 -1 at the end. */
4673 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4674 regs->start[mcnt] = regs->end[mcnt] = -1;
4675 } /* regs && !bufp->no_sub */
4677 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4678 nfailure_points_pushed, nfailure_points_popped,
4679 nfailure_points_pushed - nfailure_points_popped);
4680 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4682 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4683 ? string1
4684 : string2 - size1);
4686 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4688 FREE_VARIABLES ();
4689 return mcnt;
4692 /* Otherwise match next pattern command. */
4693 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4695 /* Ignore these. Used to ignore the n of succeed_n's which
4696 currently have n == 0. */
4697 case no_op:
4698 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4699 break;
4701 case succeed:
4702 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4703 goto succeed_label;
4705 /* Match the next n pattern characters exactly. The following
4706 byte in the pattern defines n, and the n bytes after that
4707 are the characters to match. */
4708 case exactn:
4709 mcnt = *p++;
4710 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4712 /* This is written out as an if-else so we don't waste time
4713 testing `translate' inside the loop. */
4714 if (RE_TRANSLATE_P (translate))
4716 #ifdef emacs
4717 if (multibyte)
4720 int pat_charlen, buf_charlen;
4721 unsigned int pat_ch, buf_ch;
4723 PREFETCH ();
4724 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4725 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4727 if (RE_TRANSLATE (translate, buf_ch)
4728 != pat_ch)
4729 goto fail;
4731 p += pat_charlen;
4732 d += buf_charlen;
4733 mcnt -= pat_charlen;
4735 while (mcnt > 0);
4736 else
4737 #endif /* not emacs */
4740 PREFETCH ();
4741 if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d)
4742 != (unsigned char) *p++)
4743 goto fail;
4744 d++;
4746 while (--mcnt);
4748 else
4752 PREFETCH ();
4753 if (*d++ != (char) *p++) goto fail;
4755 while (--mcnt);
4757 SET_REGS_MATCHED ();
4758 break;
4761 /* Match any character except possibly a newline or a null. */
4762 case anychar:
4764 int buf_charlen;
4765 unsigned int buf_ch;
4767 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4769 PREFETCH ();
4771 #ifdef emacs
4772 if (multibyte)
4773 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4774 else
4775 #endif /* not emacs */
4777 buf_ch = (unsigned char) *d;
4778 buf_charlen = 1;
4781 buf_ch = TRANSLATE (buf_ch);
4783 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4784 && buf_ch == '\n')
4785 || ((bufp->syntax & RE_DOT_NOT_NULL)
4786 && buf_ch == '\000'))
4787 goto fail;
4789 SET_REGS_MATCHED ();
4790 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4791 d += buf_charlen;
4793 break;
4796 case charset:
4797 case charset_not:
4799 register unsigned int c;
4800 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4801 int len;
4803 /* Start of actual range_table, or end of bitmap if there is no
4804 range table. */
4805 unsigned char *range_table;
4807 /* Nonzero if there is a range table. */
4808 int range_table_exists;
4810 /* Number of ranges of range table. This is not included
4811 in the initial byte-length of the command. */
4812 int count = 0;
4814 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4816 PREFETCH ();
4817 c = (unsigned char) *d;
4819 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
4821 #ifdef emacs
4822 if (range_table_exists)
4824 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4825 EXTRACT_NUMBER_AND_INCR (count, range_table);
4828 if (multibyte && BASE_LEADING_CODE_P (c))
4829 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
4830 #endif /* emacs */
4832 if (SINGLE_BYTE_CHAR_P (c))
4833 { /* Lookup bitmap. */
4834 c = TRANSLATE (c); /* The character to match. */
4835 len = 1;
4837 /* Cast to `unsigned' instead of `unsigned char' in
4838 case the bit list is a full 32 bytes long. */
4839 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
4840 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4841 not = !not;
4843 #ifdef emacs
4844 else if (range_table_exists)
4846 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4848 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4849 | (class_bits & BIT_ALPHA && ISALPHA (c))
4850 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
4851 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4852 | (class_bits & BIT_LOWER && ISLOWER (c))
4853 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4854 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
4855 | (class_bits & BIT_PRINT && ISPRINT (c))
4856 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4857 | (class_bits & BIT_SPACE && ISSPACE (c))
4858 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
4859 | (class_bits & BIT_UPPER && ISUPPER (c))
4860 | (class_bits & BIT_WORD && ISWORD (c)))
4861 not = !not;
4862 else
4863 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4865 #endif /* emacs */
4867 if (range_table_exists)
4868 p = CHARSET_RANGE_TABLE_END (range_table, count);
4869 else
4870 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
4872 if (!not) goto fail;
4874 SET_REGS_MATCHED ();
4875 d += len;
4876 break;
4880 /* The beginning of a group is represented by start_memory.
4881 The arguments are the register number in the next byte, and the
4882 number of groups inner to this one in the next. The text
4883 matched within the group is recorded (in the internal
4884 registers data structure) under the register number. */
4885 case start_memory:
4886 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4888 /* Find out if this group can match the empty string. */
4889 p1 = p; /* To send to group_match_null_string_p. */
4891 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4892 REG_MATCH_NULL_STRING_P (reg_info[*p])
4893 = group_match_null_string_p (&p1, pend, reg_info);
4895 /* Save the position in the string where we were the last time
4896 we were at this open-group operator in case the group is
4897 operated upon by a repetition operator, e.g., with `(a*)*b'
4898 against `ab'; then we want to ignore where we are now in
4899 the string in case this attempt to match fails. */
4900 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4901 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4902 : regstart[*p];
4903 DEBUG_PRINT2 (" old_regstart: %d\n",
4904 POINTER_TO_OFFSET (old_regstart[*p]));
4906 regstart[*p] = d;
4907 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4909 IS_ACTIVE (reg_info[*p]) = 1;
4910 MATCHED_SOMETHING (reg_info[*p]) = 0;
4912 /* Clear this whenever we change the register activity status. */
4913 set_regs_matched_done = 0;
4915 /* This is the new highest active register. */
4916 highest_active_reg = *p;
4918 /* If nothing was active before, this is the new lowest active
4919 register. */
4920 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4921 lowest_active_reg = *p;
4923 /* Move past the register number and inner group count. */
4924 p += 2;
4925 just_past_start_mem = p;
4927 break;
4930 /* The stop_memory opcode represents the end of a group. Its
4931 arguments are the same as start_memory's: the register
4932 number, and the number of inner groups. */
4933 case stop_memory:
4934 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4936 /* We need to save the string position the last time we were at
4937 this close-group operator in case the group is operated
4938 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4939 against `aba'; then we want to ignore where we are now in
4940 the string in case this attempt to match fails. */
4941 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4942 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4943 : regend[*p];
4944 DEBUG_PRINT2 (" old_regend: %d\n",
4945 POINTER_TO_OFFSET (old_regend[*p]));
4947 regend[*p] = d;
4948 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4950 /* This register isn't active anymore. */
4951 IS_ACTIVE (reg_info[*p]) = 0;
4953 /* Clear this whenever we change the register activity status. */
4954 set_regs_matched_done = 0;
4956 /* If this was the only register active, nothing is active
4957 anymore. */
4958 if (lowest_active_reg == highest_active_reg)
4960 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4961 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4963 else
4964 { /* We must scan for the new highest active register, since
4965 it isn't necessarily one less than now: consider
4966 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4967 new highest active register is 1. */
4968 unsigned char r = *p - 1;
4969 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4970 r--;
4972 /* If we end up at register zero, that means that we saved
4973 the registers as the result of an `on_failure_jump', not
4974 a `start_memory', and we jumped to past the innermost
4975 `stop_memory'. For example, in ((.)*) we save
4976 registers 1 and 2 as a result of the *, but when we pop
4977 back to the second ), we are at the stop_memory 1.
4978 Thus, nothing is active. */
4979 if (r == 0)
4981 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4982 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4984 else
4985 highest_active_reg = r;
4988 /* If just failed to match something this time around with a
4989 group that's operated on by a repetition operator, try to
4990 force exit from the ``loop'', and restore the register
4991 information for this group that we had before trying this
4992 last match. */
4993 if ((!MATCHED_SOMETHING (reg_info[*p])
4994 || just_past_start_mem == p - 1)
4995 && (p + 2) < pend)
4997 boolean is_a_jump_n = false;
4999 p1 = p + 2;
5000 mcnt = 0;
5001 switch ((re_opcode_t) *p1++)
5003 case jump_n:
5004 is_a_jump_n = true;
5005 case pop_failure_jump:
5006 case maybe_pop_jump:
5007 case jump:
5008 case dummy_failure_jump:
5009 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5010 if (is_a_jump_n)
5011 p1 += 2;
5012 break;
5014 default:
5015 /* do nothing */ ;
5017 p1 += mcnt;
5019 /* If the next operation is a jump backwards in the pattern
5020 to an on_failure_jump right before the start_memory
5021 corresponding to this stop_memory, exit from the loop
5022 by forcing a failure after pushing on the stack the
5023 on_failure_jump's jump in the pattern, and d. */
5024 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5025 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
5027 /* If this group ever matched anything, then restore
5028 what its registers were before trying this last
5029 failed match, e.g., with `(a*)*b' against `ab' for
5030 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5031 against `aba' for regend[3].
5033 Also restore the registers for inner groups for,
5034 e.g., `((a*)(b*))*' against `aba' (register 3 would
5035 otherwise get trashed). */
5037 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
5039 unsigned r;
5041 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5043 /* Restore this and inner groups' (if any) registers. */
5044 for (r = *p; r < *p + *(p + 1); r++)
5046 regstart[r] = old_regstart[r];
5048 /* xx why this test? */
5049 if (old_regend[r] >= regstart[r])
5050 regend[r] = old_regend[r];
5053 p1++;
5054 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5055 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
5057 goto fail;
5061 /* Move past the register number and the inner group count. */
5062 p += 2;
5063 break;
5066 /* \<digit> has been turned into a `duplicate' command which is
5067 followed by the numeric value of <digit> as the register number. */
5068 case duplicate:
5070 register const char *d2, *dend2;
5071 int regno = *p++; /* Get which register to match against. */
5072 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5074 /* Can't back reference a group which we've never matched. */
5075 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5076 goto fail;
5078 /* Where in input to try to start matching. */
5079 d2 = regstart[regno];
5081 /* Where to stop matching; if both the place to start and
5082 the place to stop matching are in the same string, then
5083 set to the place to stop, otherwise, for now have to use
5084 the end of the first string. */
5086 dend2 = ((FIRST_STRING_P (regstart[regno])
5087 == FIRST_STRING_P (regend[regno]))
5088 ? regend[regno] : end_match_1);
5089 for (;;)
5091 /* If necessary, advance to next segment in register
5092 contents. */
5093 while (d2 == dend2)
5095 if (dend2 == end_match_2) break;
5096 if (dend2 == regend[regno]) break;
5098 /* End of string1 => advance to string2. */
5099 d2 = string2;
5100 dend2 = regend[regno];
5102 /* At end of register contents => success */
5103 if (d2 == dend2) break;
5105 /* If necessary, advance to next segment in data. */
5106 PREFETCH ();
5108 /* How many characters left in this segment to match. */
5109 mcnt = dend - d;
5111 /* Want how many consecutive characters we can match in
5112 one shot, so, if necessary, adjust the count. */
5113 if (mcnt > dend2 - d2)
5114 mcnt = dend2 - d2;
5116 /* Compare that many; failure if mismatch, else move
5117 past them. */
5118 if (RE_TRANSLATE_P (translate)
5119 ? bcmp_translate (d, d2, mcnt, translate)
5120 : bcmp (d, d2, mcnt))
5121 goto fail;
5122 d += mcnt, d2 += mcnt;
5124 /* Do this because we've match some characters. */
5125 SET_REGS_MATCHED ();
5128 break;
5131 /* begline matches the empty string at the beginning of the string
5132 (unless `not_bol' is set in `bufp'), and, if
5133 `newline_anchor' is set, after newlines. */
5134 case begline:
5135 DEBUG_PRINT1 ("EXECUTING begline.\n");
5137 if (AT_STRINGS_BEG (d))
5139 if (!bufp->not_bol) break;
5141 else if (d[-1] == '\n' && bufp->newline_anchor)
5143 break;
5145 /* In all other cases, we fail. */
5146 goto fail;
5149 /* endline is the dual of begline. */
5150 case endline:
5151 DEBUG_PRINT1 ("EXECUTING endline.\n");
5153 if (AT_STRINGS_END (d))
5155 if (!bufp->not_eol) break;
5158 /* We have to ``prefetch'' the next character. */
5159 else if ((d == end1 ? *string2 : *d) == '\n'
5160 && bufp->newline_anchor)
5162 break;
5164 goto fail;
5167 /* Match at the very beginning of the data. */
5168 case begbuf:
5169 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5170 if (AT_STRINGS_BEG (d))
5171 break;
5172 goto fail;
5175 /* Match at the very end of the data. */
5176 case endbuf:
5177 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5178 if (AT_STRINGS_END (d))
5179 break;
5180 goto fail;
5183 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5184 pushes NULL as the value for the string on the stack. Then
5185 `pop_failure_point' will keep the current value for the
5186 string, instead of restoring it. To see why, consider
5187 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5188 then the . fails against the \n. But the next thing we want
5189 to do is match the \n against the \n; if we restored the
5190 string value, we would be back at the foo.
5192 Because this is used only in specific cases, we don't need to
5193 check all the things that `on_failure_jump' does, to make
5194 sure the right things get saved on the stack. Hence we don't
5195 share its code. The only reason to push anything on the
5196 stack at all is that otherwise we would have to change
5197 `anychar's code to do something besides goto fail in this
5198 case; that seems worse than this. */
5199 case on_failure_keep_string_jump:
5200 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5202 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5203 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
5205 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
5206 break;
5209 /* Uses of on_failure_jump:
5211 Each alternative starts with an on_failure_jump that points
5212 to the beginning of the next alternative. Each alternative
5213 except the last ends with a jump that in effect jumps past
5214 the rest of the alternatives. (They really jump to the
5215 ending jump of the following alternative, because tensioning
5216 these jumps is a hassle.)
5218 Repeats start with an on_failure_jump that points past both
5219 the repetition text and either the following jump or
5220 pop_failure_jump back to this on_failure_jump. */
5221 case on_failure_jump:
5222 on_failure:
5223 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5225 #if defined (WINDOWSNT) && defined (emacs)
5226 QUIT;
5227 #endif
5229 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5230 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
5232 /* If this on_failure_jump comes right before a group (i.e.,
5233 the original * applied to a group), save the information
5234 for that group and all inner ones, so that if we fail back
5235 to this point, the group's information will be correct.
5236 For example, in \(a*\)*\1, we need the preceding group,
5237 and in \(zz\(a*\)b*\)\2, we need the inner group. */
5239 /* We can't use `p' to check ahead because we push
5240 a failure point to `p + mcnt' after we do this. */
5241 p1 = p;
5243 /* We need to skip no_op's before we look for the
5244 start_memory in case this on_failure_jump is happening as
5245 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5246 against aba. */
5247 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5248 p1++;
5250 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5252 /* We have a new highest active register now. This will
5253 get reset at the start_memory we are about to get to,
5254 but we will have saved all the registers relevant to
5255 this repetition op, as described above. */
5256 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5257 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5258 lowest_active_reg = *(p1 + 1);
5261 DEBUG_PRINT1 (":\n");
5262 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5263 break;
5266 /* A smart repeat ends with `maybe_pop_jump'.
5267 We change it to either `pop_failure_jump' or `jump'. */
5268 case maybe_pop_jump:
5269 #if defined (WINDOWSNT) && defined (emacs)
5270 QUIT;
5271 #endif
5272 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5273 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5275 register unsigned char *p2 = p;
5277 /* Compare the beginning of the repeat with what in the
5278 pattern follows its end. If we can establish that there
5279 is nothing that they would both match, i.e., that we
5280 would have to backtrack because of (as in, e.g., `a*a')
5281 then we can change to pop_failure_jump, because we'll
5282 never have to backtrack.
5284 This is not true in the case of alternatives: in
5285 `(a|ab)*' we do need to backtrack to the `ab' alternative
5286 (e.g., if the string was `ab'). But instead of trying to
5287 detect that here, the alternative has put on a dummy
5288 failure point which is what we will end up popping. */
5290 /* Skip over open/close-group commands.
5291 If what follows this loop is a ...+ construct,
5292 look at what begins its body, since we will have to
5293 match at least one of that. */
5294 while (1)
5296 if (p2 + 2 < pend
5297 && ((re_opcode_t) *p2 == stop_memory
5298 || (re_opcode_t) *p2 == start_memory))
5299 p2 += 3;
5300 else if (p2 + 6 < pend
5301 && (re_opcode_t) *p2 == dummy_failure_jump)
5302 p2 += 6;
5303 else
5304 break;
5307 p1 = p + mcnt;
5308 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5309 to the `maybe_finalize_jump' of this case. Examine what
5310 follows. */
5312 /* If we're at the end of the pattern, we can change. */
5313 if (p2 == pend)
5315 /* Consider what happens when matching ":\(.*\)"
5316 against ":/". I don't really understand this code
5317 yet. */
5318 p[-3] = (unsigned char) pop_failure_jump;
5319 DEBUG_PRINT1
5320 (" End of pattern: change to `pop_failure_jump'.\n");
5323 else if ((re_opcode_t) *p2 == exactn
5324 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5326 register unsigned int c
5327 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5329 if ((re_opcode_t) p1[3] == exactn)
5331 if (!(multibyte /* && (c != '\n') */
5332 && BASE_LEADING_CODE_P (c))
5333 ? c != p1[5]
5334 : (STRING_CHAR (&p2[2], pend - &p2[2])
5335 != STRING_CHAR (&p1[5], pend - &p1[5])))
5337 p[-3] = (unsigned char) pop_failure_jump;
5338 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5339 c, p1[5]);
5343 else if ((re_opcode_t) p1[3] == charset
5344 || (re_opcode_t) p1[3] == charset_not)
5346 int not = (re_opcode_t) p1[3] == charset_not;
5348 if (multibyte /* && (c != '\n') */
5349 && BASE_LEADING_CODE_P (c))
5350 c = STRING_CHAR (&p2[2], pend - &p2[2]);
5352 /* Test if C is listed in charset (or charset_not)
5353 at `&p1[3]'. */
5354 if (SINGLE_BYTE_CHAR_P (c))
5356 if (c < CHARSET_BITMAP_SIZE (&p1[3]) * BYTEWIDTH
5357 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5358 not = !not;
5360 else if (CHARSET_RANGE_TABLE_EXISTS_P (&p1[3]))
5361 CHARSET_LOOKUP_RANGE_TABLE (not, c, &p1[3]);
5363 /* `not' is equal to 1 if c would match, which means
5364 that we can't change to pop_failure_jump. */
5365 if (!not)
5367 p[-3] = (unsigned char) pop_failure_jump;
5368 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5372 else if ((re_opcode_t) *p2 == charset)
5374 if ((re_opcode_t) p1[3] == exactn)
5376 register unsigned int c = p1[5];
5377 int not = 0;
5379 if (multibyte && BASE_LEADING_CODE_P (c))
5380 c = STRING_CHAR (&p1[5], pend - &p1[5]);
5382 /* Test if C is listed in charset at `p2'. */
5383 if (SINGLE_BYTE_CHAR_P (c))
5385 if (c < CHARSET_BITMAP_SIZE (p2) * BYTEWIDTH
5386 && (p2[2 + c / BYTEWIDTH]
5387 & (1 << (c % BYTEWIDTH))))
5388 not = !not;
5390 else if (CHARSET_RANGE_TABLE_EXISTS_P (p2))
5391 CHARSET_LOOKUP_RANGE_TABLE (not, c, p2);
5393 if (!not)
5395 p[-3] = (unsigned char) pop_failure_jump;
5396 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5400 /* It is hard to list up all the character in charset
5401 P2 if it includes multibyte character. Give up in
5402 such case. */
5403 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
5405 /* Now, we are sure that P2 has no range table.
5406 So, for the size of bitmap in P2, `p2[1]' is
5407 enough. But P1 may have range table, so the
5408 size of bitmap table of P1 is extracted by
5409 using macro `CHARSET_BITMAP_SIZE'.
5411 Since we know that all the character listed in
5412 P2 is ASCII, it is enough to test only bitmap
5413 table of P1. */
5415 if ((re_opcode_t) p1[3] == charset_not)
5417 int idx;
5418 /* We win if the charset_not inside the loop lists
5419 every character listed in the charset after. */
5420 for (idx = 0; idx < (int) p2[1]; idx++)
5421 if (! (p2[2 + idx] == 0
5422 || (idx < CHARSET_BITMAP_SIZE (&p1[3])
5423 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5424 break;
5426 if (idx == p2[1])
5428 p[-3] = (unsigned char) pop_failure_jump;
5429 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5432 else if ((re_opcode_t) p1[3] == charset)
5434 int idx;
5435 /* We win if the charset inside the loop
5436 has no overlap with the one after the loop. */
5437 for (idx = 0;
5438 (idx < (int) p2[1]
5439 && idx < CHARSET_BITMAP_SIZE (&p1[3]));
5440 idx++)
5441 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5442 break;
5444 if (idx == p2[1]
5445 || idx == CHARSET_BITMAP_SIZE (&p1[3]))
5447 p[-3] = (unsigned char) pop_failure_jump;
5448 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5454 p -= 2; /* Point at relative address again. */
5455 if ((re_opcode_t) p[-1] != pop_failure_jump)
5457 p[-1] = (unsigned char) jump;
5458 DEBUG_PRINT1 (" Match => jump.\n");
5459 goto unconditional_jump;
5461 /* Note fall through. */
5464 /* The end of a simple repeat has a pop_failure_jump back to
5465 its matching on_failure_jump, where the latter will push a
5466 failure point. The pop_failure_jump takes off failure
5467 points put on by this pop_failure_jump's matching
5468 on_failure_jump; we got through the pattern to here from the
5469 matching on_failure_jump, so didn't fail. */
5470 case pop_failure_jump:
5472 /* We need to pass separate storage for the lowest and
5473 highest registers, even though we don't care about the
5474 actual values. Otherwise, we will restore only one
5475 register from the stack, since lowest will == highest in
5476 `pop_failure_point'. */
5477 unsigned dummy_low_reg, dummy_high_reg;
5478 unsigned char *pdummy;
5479 const char *sdummy;
5481 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5482 POP_FAILURE_POINT (sdummy, pdummy,
5483 dummy_low_reg, dummy_high_reg,
5484 reg_dummy, reg_dummy, reg_info_dummy);
5486 /* Note fall through. */
5489 /* Unconditionally jump (without popping any failure points). */
5490 case jump:
5491 unconditional_jump:
5492 #if defined (WINDOWSNT) && defined (emacs)
5493 QUIT;
5494 #endif
5495 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5496 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5497 p += mcnt; /* Do the jump. */
5498 DEBUG_PRINT2 ("(to 0x%x).\n", p);
5499 break;
5502 /* We need this opcode so we can detect where alternatives end
5503 in `group_match_null_string_p' et al. */
5504 case jump_past_alt:
5505 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5506 goto unconditional_jump;
5509 /* Normally, the on_failure_jump pushes a failure point, which
5510 then gets popped at pop_failure_jump. We will end up at
5511 pop_failure_jump, also, and with a pattern of, say, `a+', we
5512 are skipping over the on_failure_jump, so we have to push
5513 something meaningless for pop_failure_jump to pop. */
5514 case dummy_failure_jump:
5515 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5516 /* It doesn't matter what we push for the string here. What
5517 the code at `fail' tests is the value for the pattern. */
5518 PUSH_FAILURE_POINT (0, 0, -2);
5519 goto unconditional_jump;
5522 /* At the end of an alternative, we need to push a dummy failure
5523 point in case we are followed by a `pop_failure_jump', because
5524 we don't want the failure point for the alternative to be
5525 popped. For example, matching `(a|ab)*' against `aab'
5526 requires that we match the `ab' alternative. */
5527 case push_dummy_failure:
5528 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5529 /* See comments just above at `dummy_failure_jump' about the
5530 two zeroes. */
5531 PUSH_FAILURE_POINT (0, 0, -2);
5532 break;
5534 /* Have to succeed matching what follows at least n times.
5535 After that, handle like `on_failure_jump'. */
5536 case succeed_n:
5537 EXTRACT_NUMBER (mcnt, p + 2);
5538 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5540 assert (mcnt >= 0);
5541 /* Originally, this is how many times we HAVE to succeed. */
5542 if (mcnt > 0)
5544 mcnt--;
5545 p += 2;
5546 STORE_NUMBER_AND_INCR (p, mcnt);
5547 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
5549 else if (mcnt == 0)
5551 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
5552 p[2] = (unsigned char) no_op;
5553 p[3] = (unsigned char) no_op;
5554 goto on_failure;
5556 break;
5558 case jump_n:
5559 EXTRACT_NUMBER (mcnt, p + 2);
5560 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5562 /* Originally, this is how many times we CAN jump. */
5563 if (mcnt)
5565 mcnt--;
5566 STORE_NUMBER (p + 2, mcnt);
5567 goto unconditional_jump;
5569 /* If don't have to jump any more, skip over the rest of command. */
5570 else
5571 p += 4;
5572 break;
5574 case set_number_at:
5576 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5578 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5579 p1 = p + mcnt;
5580 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5581 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
5582 STORE_NUMBER (p1, mcnt);
5583 break;
5586 case wordbound:
5587 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5589 /* We SUCCEED in one of the following cases: */
5591 /* Case 1: D is at the beginning or the end of string. */
5592 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5593 break;
5594 else
5596 /* C1 is the character before D, S1 is the syntax of C1, C2
5597 is the character at D, and S2 is the syntax of C2. */
5598 int c1, c2, s1, s2;
5599 int pos1 = PTR_TO_OFFSET (d - 1);
5600 int charpos;
5602 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5603 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5604 #ifdef emacs
5605 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5606 UPDATE_SYNTAX_TABLE (charpos);
5607 #endif
5608 s1 = SYNTAX (c1);
5609 #ifdef emacs
5610 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5611 #endif
5612 s2 = SYNTAX (c2);
5614 if (/* Case 2: Only one of S1 and S2 is Sword. */
5615 ((s1 == Sword) != (s2 == Sword))
5616 /* Case 3: Both of S1 and S2 are Sword, and macro
5617 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5618 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5619 break;
5621 goto fail;
5623 case notwordbound:
5624 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5626 /* We FAIL in one of the following cases: */
5628 /* Case 1: D is at the beginning or the end of string. */
5629 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5630 goto fail;
5631 else
5633 /* C1 is the character before D, S1 is the syntax of C1, C2
5634 is the character at D, and S2 is the syntax of C2. */
5635 int c1, c2, s1, s2;
5636 int pos1 = PTR_TO_OFFSET (d - 1);
5637 int charpos;
5639 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5640 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5641 #ifdef emacs
5642 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5643 UPDATE_SYNTAX_TABLE (charpos);
5644 #endif
5645 s1 = SYNTAX (c1);
5646 #ifdef emacs
5647 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5648 #endif
5649 s2 = SYNTAX (c2);
5651 if (/* Case 2: Only one of S1 and S2 is Sword. */
5652 ((s1 == Sword) != (s2 == Sword))
5653 /* Case 3: Both of S1 and S2 are Sword, and macro
5654 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5655 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5656 goto fail;
5658 break;
5660 case wordbeg:
5661 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5663 /* We FAIL in one of the following cases: */
5665 /* Case 1: D is at the end of string. */
5666 if (AT_STRINGS_END (d))
5667 goto fail;
5668 else
5670 /* C1 is the character before D, S1 is the syntax of C1, C2
5671 is the character at D, and S2 is the syntax of C2. */
5672 int c1, c2, s1, s2;
5673 int pos1 = PTR_TO_OFFSET (d);
5674 int charpos;
5676 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5677 #ifdef emacs
5678 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5679 UPDATE_SYNTAX_TABLE (charpos);
5680 #endif
5681 s2 = SYNTAX (c2);
5683 /* Case 2: S2 is not Sword. */
5684 if (s2 != Sword)
5685 goto fail;
5687 /* Case 3: D is not at the beginning of string ... */
5688 if (!AT_STRINGS_BEG (d))
5690 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5691 #ifdef emacs
5692 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5693 #endif
5694 s1 = SYNTAX (c1);
5696 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5697 returns 0. */
5698 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5699 goto fail;
5702 break;
5704 case wordend:
5705 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5707 /* We FAIL in one of the following cases: */
5709 /* Case 1: D is at the beginning of string. */
5710 if (AT_STRINGS_BEG (d))
5711 goto fail;
5712 else
5714 /* C1 is the character before D, S1 is the syntax of C1, C2
5715 is the character at D, and S2 is the syntax of C2. */
5716 int c1, c2, s1, s2;
5717 int pos1 = PTR_TO_OFFSET (d);
5718 int charpos;
5720 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5721 #ifdef emacs
5722 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 - 1);
5723 UPDATE_SYNTAX_TABLE (charpos);
5724 #endif
5725 s1 = SYNTAX (c1);
5727 /* Case 2: S1 is not Sword. */
5728 if (s1 != Sword)
5729 goto fail;
5731 /* Case 3: D is not at the end of string ... */
5732 if (!AT_STRINGS_END (d))
5734 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5735 #ifdef emacs
5736 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5737 #endif
5738 s2 = SYNTAX (c2);
5740 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
5741 returns 0. */
5742 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5743 goto fail;
5746 break;
5748 #ifdef emacs
5749 case before_dot:
5750 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5751 if (PTR_BYTE_POS ((unsigned char *) d) >= PT_BYTE)
5752 goto fail;
5753 break;
5755 case at_dot:
5756 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5757 if (PTR_BYTE_POS ((unsigned char *) d) != PT_BYTE)
5758 goto fail;
5759 break;
5761 case after_dot:
5762 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5763 if (PTR_BYTE_POS ((unsigned char *) d) <= PT_BYTE)
5764 goto fail;
5765 break;
5767 case syntaxspec:
5768 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5769 mcnt = *p++;
5770 goto matchsyntax;
5772 case wordchar:
5773 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5774 mcnt = (int) Sword;
5775 matchsyntax:
5776 PREFETCH ();
5777 #ifdef emacs
5779 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5780 UPDATE_SYNTAX_TABLE (pos1);
5782 #endif
5784 int c, len;
5786 if (multibyte)
5787 /* we must concern about multibyte form, ... */
5788 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5789 else
5790 /* everything should be handled as ASCII, even though it
5791 looks like multibyte form. */
5792 c = *d, len = 1;
5794 if (SYNTAX (c) != (enum syntaxcode) mcnt)
5795 goto fail;
5796 d += len;
5798 SET_REGS_MATCHED ();
5799 break;
5801 case notsyntaxspec:
5802 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5803 mcnt = *p++;
5804 goto matchnotsyntax;
5806 case notwordchar:
5807 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5808 mcnt = (int) Sword;
5809 matchnotsyntax:
5810 PREFETCH ();
5811 #ifdef emacs
5813 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5814 UPDATE_SYNTAX_TABLE (pos1);
5816 #endif
5818 int c, len;
5820 if (multibyte)
5821 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5822 else
5823 c = *d, len = 1;
5825 if (SYNTAX (c) == (enum syntaxcode) mcnt)
5826 goto fail;
5827 d += len;
5829 SET_REGS_MATCHED ();
5830 break;
5832 case categoryspec:
5833 DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p);
5834 mcnt = *p++;
5835 PREFETCH ();
5837 int c, len;
5839 if (multibyte)
5840 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5841 else
5842 c = *d, len = 1;
5844 if (!CHAR_HAS_CATEGORY (c, mcnt))
5845 goto fail;
5846 d += len;
5848 SET_REGS_MATCHED ();
5849 break;
5851 case notcategoryspec:
5852 DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p);
5853 mcnt = *p++;
5854 PREFETCH ();
5856 int c, len;
5858 if (multibyte)
5859 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5860 else
5861 c = *d, len = 1;
5863 if (CHAR_HAS_CATEGORY (c, mcnt))
5864 goto fail;
5865 d += len;
5867 SET_REGS_MATCHED ();
5868 break;
5870 #else /* not emacs */
5871 case wordchar:
5872 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5873 PREFETCH ();
5874 if (!WORDCHAR_P (d))
5875 goto fail;
5876 SET_REGS_MATCHED ();
5877 d++;
5878 break;
5880 case notwordchar:
5881 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5882 PREFETCH ();
5883 if (WORDCHAR_P (d))
5884 goto fail;
5885 SET_REGS_MATCHED ();
5886 d++;
5887 break;
5888 #endif /* not emacs */
5890 default:
5891 abort ();
5893 continue; /* Successfully executed one pattern command; keep going. */
5896 /* We goto here if a matching operation fails. */
5897 fail:
5898 #if defined (WINDOWSNT) && defined (emacs)
5899 QUIT;
5900 #endif
5901 if (!FAIL_STACK_EMPTY ())
5902 { /* A restart point is known. Restore to that state. */
5903 DEBUG_PRINT1 ("\nFAIL:\n");
5904 POP_FAILURE_POINT (d, p,
5905 lowest_active_reg, highest_active_reg,
5906 regstart, regend, reg_info);
5908 /* If this failure point is a dummy, try the next one. */
5909 if (!p)
5910 goto fail;
5912 /* If we failed to the end of the pattern, don't examine *p. */
5913 assert (p <= pend);
5914 if (p < pend)
5916 boolean is_a_jump_n = false;
5918 /* If failed to a backwards jump that's part of a repetition
5919 loop, need to pop this failure point and use the next one. */
5920 switch ((re_opcode_t) *p)
5922 case jump_n:
5923 is_a_jump_n = true;
5924 case maybe_pop_jump:
5925 case pop_failure_jump:
5926 case jump:
5927 p1 = p + 1;
5928 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5929 p1 += mcnt;
5931 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5932 || (!is_a_jump_n
5933 && (re_opcode_t) *p1 == on_failure_jump))
5934 goto fail;
5935 break;
5936 default:
5937 /* do nothing */ ;
5941 if (d >= string1 && d <= end1)
5942 dend = end_match_1;
5944 else
5945 break; /* Matching at this starting point really fails. */
5946 } /* for (;;) */
5948 if (best_regs_set)
5949 goto restore_best_regs;
5951 FREE_VARIABLES ();
5953 return -1; /* Failure to match. */
5954 } /* re_match_2 */
5956 /* Subroutine definitions for re_match_2. */
5959 /* We are passed P pointing to a register number after a start_memory.
5961 Return true if the pattern up to the corresponding stop_memory can
5962 match the empty string, and false otherwise.
5964 If we find the matching stop_memory, sets P to point to one past its number.
5965 Otherwise, sets P to an undefined byte less than or equal to END.
5967 We don't handle duplicates properly (yet). */
5969 static boolean
5970 group_match_null_string_p (p, end, reg_info)
5971 unsigned char **p, *end;
5972 register_info_type *reg_info;
5974 int mcnt;
5975 /* Point to after the args to the start_memory. */
5976 unsigned char *p1 = *p + 2;
5978 while (p1 < end)
5980 /* Skip over opcodes that can match nothing, and return true or
5981 false, as appropriate, when we get to one that can't, or to the
5982 matching stop_memory. */
5984 switch ((re_opcode_t) *p1)
5986 /* Could be either a loop or a series of alternatives. */
5987 case on_failure_jump:
5988 p1++;
5989 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5991 /* If the next operation is not a jump backwards in the
5992 pattern. */
5994 if (mcnt >= 0)
5996 /* Go through the on_failure_jumps of the alternatives,
5997 seeing if any of the alternatives cannot match nothing.
5998 The last alternative starts with only a jump,
5999 whereas the rest start with on_failure_jump and end
6000 with a jump, e.g., here is the pattern for `a|b|c':
6002 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6003 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6004 /exactn/1/c
6006 So, we have to first go through the first (n-1)
6007 alternatives and then deal with the last one separately. */
6010 /* Deal with the first (n-1) alternatives, which start
6011 with an on_failure_jump (see above) that jumps to right
6012 past a jump_past_alt. */
6014 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
6016 /* `mcnt' holds how many bytes long the alternative
6017 is, including the ending `jump_past_alt' and
6018 its number. */
6020 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
6021 reg_info))
6022 return false;
6024 /* Move to right after this alternative, including the
6025 jump_past_alt. */
6026 p1 += mcnt;
6028 /* Break if it's the beginning of an n-th alternative
6029 that doesn't begin with an on_failure_jump. */
6030 if ((re_opcode_t) *p1 != on_failure_jump)
6031 break;
6033 /* Still have to check that it's not an n-th
6034 alternative that starts with an on_failure_jump. */
6035 p1++;
6036 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6037 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
6039 /* Get to the beginning of the n-th alternative. */
6040 p1 -= 3;
6041 break;
6045 /* Deal with the last alternative: go back and get number
6046 of the `jump_past_alt' just before it. `mcnt' contains
6047 the length of the alternative. */
6048 EXTRACT_NUMBER (mcnt, p1 - 2);
6050 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
6051 return false;
6053 p1 += mcnt; /* Get past the n-th alternative. */
6054 } /* if mcnt > 0 */
6055 break;
6058 case stop_memory:
6059 assert (p1[1] == **p);
6060 *p = p1 + 2;
6061 return true;
6064 default:
6065 if (!common_op_match_null_string_p (&p1, end, reg_info))
6066 return false;
6068 } /* while p1 < end */
6070 return false;
6071 } /* group_match_null_string_p */
6074 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6075 It expects P to be the first byte of a single alternative and END one
6076 byte past the last. The alternative can contain groups. */
6078 static boolean
6079 alt_match_null_string_p (p, end, reg_info)
6080 unsigned char *p, *end;
6081 register_info_type *reg_info;
6083 int mcnt;
6084 unsigned char *p1 = p;
6086 while (p1 < end)
6088 /* Skip over opcodes that can match nothing, and break when we get
6089 to one that can't. */
6091 switch ((re_opcode_t) *p1)
6093 /* It's a loop. */
6094 case on_failure_jump:
6095 p1++;
6096 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6097 p1 += mcnt;
6098 break;
6100 default:
6101 if (!common_op_match_null_string_p (&p1, end, reg_info))
6102 return false;
6104 } /* while p1 < end */
6106 return true;
6107 } /* alt_match_null_string_p */
6110 /* Deals with the ops common to group_match_null_string_p and
6111 alt_match_null_string_p.
6113 Sets P to one after the op and its arguments, if any. */
6115 static boolean
6116 common_op_match_null_string_p (p, end, reg_info)
6117 unsigned char **p, *end;
6118 register_info_type *reg_info;
6120 int mcnt;
6121 boolean ret;
6122 int reg_no;
6123 unsigned char *p1 = *p;
6125 switch ((re_opcode_t) *p1++)
6127 case no_op:
6128 case begline:
6129 case endline:
6130 case begbuf:
6131 case endbuf:
6132 case wordbeg:
6133 case wordend:
6134 case wordbound:
6135 case notwordbound:
6136 #ifdef emacs
6137 case before_dot:
6138 case at_dot:
6139 case after_dot:
6140 #endif
6141 break;
6143 case start_memory:
6144 reg_no = *p1;
6145 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6146 ret = group_match_null_string_p (&p1, end, reg_info);
6148 /* Have to set this here in case we're checking a group which
6149 contains a group and a back reference to it. */
6151 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
6152 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
6154 if (!ret)
6155 return false;
6156 break;
6158 /* If this is an optimized succeed_n for zero times, make the jump. */
6159 case jump:
6160 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6161 if (mcnt >= 0)
6162 p1 += mcnt;
6163 else
6164 return false;
6165 break;
6167 case succeed_n:
6168 /* Get to the number of times to succeed. */
6169 p1 += 2;
6170 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6172 if (mcnt == 0)
6174 p1 -= 4;
6175 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6176 p1 += mcnt;
6178 else
6179 return false;
6180 break;
6182 case duplicate:
6183 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
6184 return false;
6185 break;
6187 case set_number_at:
6188 p1 += 4;
6190 default:
6191 /* All other opcodes mean we cannot match the empty string. */
6192 return false;
6195 *p = p1;
6196 return true;
6197 } /* common_op_match_null_string_p */
6200 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6201 bytes; nonzero otherwise. */
6203 static int
6204 bcmp_translate (s1, s2, len, translate)
6205 unsigned char *s1, *s2;
6206 register int len;
6207 RE_TRANSLATE_TYPE translate;
6209 register unsigned char *p1 = s1, *p2 = s2;
6210 unsigned char *p1_end = s1 + len;
6211 unsigned char *p2_end = s2 + len;
6213 while (p1 != p1_end && p2 != p2_end)
6215 int p1_charlen, p2_charlen;
6216 int p1_ch, p2_ch;
6218 p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6219 p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
6221 if (RE_TRANSLATE (translate, p1_ch)
6222 != RE_TRANSLATE (translate, p2_ch))
6223 return 1;
6225 p1 += p1_charlen, p2 += p2_charlen;
6228 if (p1 != p1_end || p2 != p2_end)
6229 return 1;
6231 return 0;
6234 /* Entry points for GNU code. */
6236 /* re_compile_pattern is the GNU regular expression compiler: it
6237 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6238 Returns 0 if the pattern was valid, otherwise an error string.
6240 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6241 are set in BUFP on entry.
6243 We call regex_compile to do the actual compilation. */
6245 const char *
6246 re_compile_pattern (pattern, length, bufp)
6247 const char *pattern;
6248 int length;
6249 struct re_pattern_buffer *bufp;
6251 reg_errcode_t ret;
6253 /* GNU code is written to assume at least RE_NREGS registers will be set
6254 (and at least one extra will be -1). */
6255 bufp->regs_allocated = REGS_UNALLOCATED;
6257 /* And GNU code determines whether or not to get register information
6258 by passing null for the REGS argument to re_match, etc., not by
6259 setting no_sub. */
6260 bufp->no_sub = 0;
6262 /* Match anchors at newline. */
6263 bufp->newline_anchor = 1;
6265 ret = regex_compile (pattern, length, re_syntax_options, bufp);
6267 if (!ret)
6268 return NULL;
6269 return gettext (re_error_msgid[(int) ret]);
6272 /* Entry points compatible with 4.2 BSD regex library. We don't define
6273 them unless specifically requested. */
6275 #if defined (_REGEX_RE_COMP) || defined (_LIBC)
6277 /* BSD has one and only one pattern buffer. */
6278 static struct re_pattern_buffer re_comp_buf;
6280 char *
6281 #ifdef _LIBC
6282 /* Make these definitions weak in libc, so POSIX programs can redefine
6283 these names if they don't use our functions, and still use
6284 regcomp/regexec below without link errors. */
6285 weak_function
6286 #endif
6287 re_comp (s)
6288 const char *s;
6290 reg_errcode_t ret;
6292 if (!s)
6294 if (!re_comp_buf.buffer)
6295 return gettext ("No previous regular expression");
6296 return 0;
6299 if (!re_comp_buf.buffer)
6301 re_comp_buf.buffer = (unsigned char *) malloc (200);
6302 if (re_comp_buf.buffer == NULL)
6303 return gettext (re_error_msgid[(int) REG_ESPACE]);
6304 re_comp_buf.allocated = 200;
6306 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6307 if (re_comp_buf.fastmap == NULL)
6308 return gettext (re_error_msgid[(int) REG_ESPACE]);
6311 /* Since `re_exec' always passes NULL for the `regs' argument, we
6312 don't need to initialize the pattern buffer fields which affect it. */
6314 /* Match anchors at newlines. */
6315 re_comp_buf.newline_anchor = 1;
6317 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6319 if (!ret)
6320 return NULL;
6322 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6323 return (char *) gettext (re_error_msgid[(int) ret]);
6328 #ifdef _LIBC
6329 weak_function
6330 #endif
6331 re_exec (s)
6332 const char *s;
6334 const int len = strlen (s);
6335 return
6336 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6338 #endif /* _REGEX_RE_COMP */
6340 /* POSIX.2 functions. Don't define these for Emacs. */
6342 #ifndef emacs
6344 /* regcomp takes a regular expression as a string and compiles it.
6346 PREG is a regex_t *. We do not expect any fields to be initialized,
6347 since POSIX says we shouldn't. Thus, we set
6349 `buffer' to the compiled pattern;
6350 `used' to the length of the compiled pattern;
6351 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6352 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6353 RE_SYNTAX_POSIX_BASIC;
6354 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6355 `fastmap' and `fastmap_accurate' to zero;
6356 `re_nsub' to the number of subexpressions in PATTERN.
6358 PATTERN is the address of the pattern string.
6360 CFLAGS is a series of bits which affect compilation.
6362 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6363 use POSIX basic syntax.
6365 If REG_NEWLINE is set, then . and [^...] don't match newline.
6366 Also, regexec will try a match beginning after every newline.
6368 If REG_ICASE is set, then we considers upper- and lowercase
6369 versions of letters to be equivalent when matching.
6371 If REG_NOSUB is set, then when PREG is passed to regexec, that
6372 routine will report only success or failure, and nothing about the
6373 registers.
6375 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6376 the return codes and their meanings.) */
6379 regcomp (preg, pattern, cflags)
6380 regex_t *preg;
6381 const char *pattern;
6382 int cflags;
6384 reg_errcode_t ret;
6385 unsigned syntax
6386 = (cflags & REG_EXTENDED) ?
6387 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6389 /* regex_compile will allocate the space for the compiled pattern. */
6390 preg->buffer = 0;
6391 preg->allocated = 0;
6392 preg->used = 0;
6394 /* Don't bother to use a fastmap when searching. This simplifies the
6395 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6396 characters after newlines into the fastmap. This way, we just try
6397 every character. */
6398 preg->fastmap = 0;
6400 if (cflags & REG_ICASE)
6402 unsigned i;
6404 preg->translate
6405 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6406 * sizeof (*(RE_TRANSLATE_TYPE)0));
6407 if (preg->translate == NULL)
6408 return (int) REG_ESPACE;
6410 /* Map uppercase characters to corresponding lowercase ones. */
6411 for (i = 0; i < CHAR_SET_SIZE; i++)
6412 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6414 else
6415 preg->translate = NULL;
6417 /* If REG_NEWLINE is set, newlines are treated differently. */
6418 if (cflags & REG_NEWLINE)
6419 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6420 syntax &= ~RE_DOT_NEWLINE;
6421 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6422 /* It also changes the matching behavior. */
6423 preg->newline_anchor = 1;
6425 else
6426 preg->newline_anchor = 0;
6428 preg->no_sub = !!(cflags & REG_NOSUB);
6430 /* POSIX says a null character in the pattern terminates it, so we
6431 can use strlen here in compiling the pattern. */
6432 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
6434 /* POSIX doesn't distinguish between an unmatched open-group and an
6435 unmatched close-group: both are REG_EPAREN. */
6436 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6438 return (int) ret;
6442 /* regexec searches for a given pattern, specified by PREG, in the
6443 string STRING.
6445 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6446 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6447 least NMATCH elements, and we set them to the offsets of the
6448 corresponding matched substrings.
6450 EFLAGS specifies `execution flags' which affect matching: if
6451 REG_NOTBOL is set, then ^ does not match at the beginning of the
6452 string; if REG_NOTEOL is set, then $ does not match at the end.
6454 We return 0 if we find a match and REG_NOMATCH if not. */
6457 regexec (preg, string, nmatch, pmatch, eflags)
6458 const regex_t *preg;
6459 const char *string;
6460 size_t nmatch;
6461 regmatch_t pmatch[];
6462 int eflags;
6464 int ret;
6465 struct re_registers regs;
6466 regex_t private_preg;
6467 int len = strlen (string);
6468 boolean want_reg_info = !preg->no_sub && nmatch > 0;
6470 private_preg = *preg;
6472 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6473 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6475 /* The user has told us exactly how many registers to return
6476 information about, via `nmatch'. We have to pass that on to the
6477 matching routines. */
6478 private_preg.regs_allocated = REGS_FIXED;
6480 if (want_reg_info)
6482 regs.num_regs = nmatch;
6483 regs.start = TALLOC (nmatch, regoff_t);
6484 regs.end = TALLOC (nmatch, regoff_t);
6485 if (regs.start == NULL || regs.end == NULL)
6486 return (int) REG_NOMATCH;
6489 /* Perform the searching operation. */
6490 ret = re_search (&private_preg, string, len,
6491 /* start: */ 0, /* range: */ len,
6492 want_reg_info ? &regs : (struct re_registers *) 0);
6494 /* Copy the register information to the POSIX structure. */
6495 if (want_reg_info)
6497 if (ret >= 0)
6499 unsigned r;
6501 for (r = 0; r < nmatch; r++)
6503 pmatch[r].rm_so = regs.start[r];
6504 pmatch[r].rm_eo = regs.end[r];
6508 /* If we needed the temporary register info, free the space now. */
6509 free (regs.start);
6510 free (regs.end);
6513 /* We want zero return to mean success, unlike `re_search'. */
6514 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6518 /* Returns a message corresponding to an error code, ERRCODE, returned
6519 from either regcomp or regexec. We don't use PREG here. */
6521 size_t
6522 regerror (errcode, preg, errbuf, errbuf_size)
6523 int errcode;
6524 const regex_t *preg;
6525 char *errbuf;
6526 size_t errbuf_size;
6528 const char *msg;
6529 size_t msg_size;
6531 if (errcode < 0
6532 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6533 /* Only error codes returned by the rest of the code should be passed
6534 to this routine. If we are given anything else, or if other regex
6535 code generates an invalid error code, then the program has a bug.
6536 Dump core so we can fix it. */
6537 abort ();
6539 msg = gettext (re_error_msgid[errcode]);
6541 msg_size = strlen (msg) + 1; /* Includes the null. */
6543 if (errbuf_size != 0)
6545 if (msg_size > errbuf_size)
6547 strncpy (errbuf, msg, errbuf_size - 1);
6548 errbuf[errbuf_size - 1] = 0;
6550 else
6551 strcpy (errbuf, msg);
6554 return msg_size;
6558 /* Free dynamically allocated space used by PREG. */
6560 void
6561 regfree (preg)
6562 regex_t *preg;
6564 if (preg->buffer != NULL)
6565 free (preg->buffer);
6566 preg->buffer = NULL;
6568 preg->allocated = 0;
6569 preg->used = 0;
6571 if (preg->fastmap != NULL)
6572 free (preg->fastmap);
6573 preg->fastmap = NULL;
6574 preg->fastmap_accurate = 0;
6576 if (preg->translate != NULL)
6577 free (preg->translate);
6578 preg->translate = NULL;
6581 #endif /* not emacs */