src/regex.c

   1 /* Extended regular expression matching and search library, version
   2    0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
   3    internationalization features.)
   4
   5    Copyright (C) 1993-2018 Free Software Foundation, Inc.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  19
  20 /* TODO:
  21    - structure the opcode space into opcode+flag.
  22    - merge with glibc's regex.[ch].
  23    - replace (succeed_n + jump_n + set_number_at) with something that doesn't
  24      need to modify the compiled regexp so that re_match can be reentrant.
  25    - get rid of on_failure_jump_smart by doing the optimization in re_comp
  26      rather than at run-time, so that re_match can be reentrant.
  27 */
  28
  29 /* AIX requires this to be the first thing in the file.  */
  30 #if defined _AIX && !defined REGEX_MALLOC
  31   #pragma alloca
  32 #endif
  33
  34 /* Ignore some GCC warnings for now.  This section should go away
  35    once the Emacs and Gnulib regex code is merged.  */
  36 #if 4 < __GNUC__ + (5 <= __GNUC_MINOR__) || defined __clang__
  37 # pragma GCC diagnostic ignored "-Wstrict-overflow"
  38 # ifndef emacs
  39 #  pragma GCC diagnostic ignored "-Wunused-function"
  40 #  pragma GCC diagnostic ignored "-Wunused-macros"
  41 #  pragma GCC diagnostic ignored "-Wunused-result"
  42 #  pragma GCC diagnostic ignored "-Wunused-variable"
  43 # endif
  44 #endif
  45
  46 #if 4 < __GNUC__ + (6 <= __GNUC_MINOR__) && ! defined __clang__
  47 # pragma GCC diagnostic ignored "-Wunused-but-set-variable"
  48 #endif
  49
  50 #include <config.h>
  51
  52 #include <stddef.h>
  53 #include <stdlib.h>
  54
  55 #ifdef emacs
  56 /* We need this for `regex.h', and perhaps for the Emacs include files.  */
  57 # include <sys/types.h>
  58 #endif
  59
  60 /* Whether to use ISO C Amendment 1 wide char functions.
  61    Those should not be used for Emacs since it uses its own.  */
  62 #if defined _LIBC
  63 #define WIDE_CHAR_SUPPORT 1
  64 #else
  65 #define WIDE_CHAR_SUPPORT \
  66         (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
  67 #endif
  68
  69 /* For platforms which support the ISO C amendment 1 functionality we
  70    support user-defined character classes.  */
  71 #if WIDE_CHAR_SUPPORT
  72 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
  73 # include <wchar.h>
  74 # include <wctype.h>
  75 #endif
  76
  77 #ifdef _LIBC
  78 /* We have to keep the namespace clean.  */
  79 # define regfree(preg) __regfree (preg)
  80 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
  81 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
  82 # define regerror(err_code, preg, errbuf, errbuf_size) \
  83         __regerror (err_code, preg, errbuf, errbuf_size)
  84 # define re_set_registers(bu, re, nu, st, en) \
  85         __re_set_registers (bu, re, nu, st, en)
  86 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
  87         __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
  88 # define re_match(bufp, string, size, pos, regs) \
  89         __re_match (bufp, string, size, pos, regs)
  90 # define re_search(bufp, string, size, startpos, range, regs) \
  91         __re_search (bufp, string, size, startpos, range, regs)
  92 # define re_compile_pattern(pattern, length, bufp) \
  93         __re_compile_pattern (pattern, length, bufp)
  94 # define re_set_syntax(syntax) __re_set_syntax (syntax)
  95 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
  96         __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
  97 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
  98
  99 /* Make sure we call libc's function even if the user overrides them.  */
 100 # define btowc __btowc
 101 # define iswctype __iswctype
 102 # define wctype __wctype
 103
 104 # define WEAK_ALIAS(a,b) weak_alias (a, b)
 105
 106 /* We are also using some library internals.  */
 107 # include <locale/localeinfo.h>
 108 # include <locale/elem-hash.h>
 109 # include <langinfo.h>
 110 #else
 111 # define WEAK_ALIAS(a,b)
 112 #endif
 113
 114 /* This is for other GNU distributions with internationalized messages.  */
 115 #if HAVE_LIBINTL_H || defined _LIBC
 116 # include <libintl.h>
 117 #else
 118 # define gettext(msgid) (msgid)
 119 #endif
 120
 121 #ifndef gettext_noop
 122 /* This define is so xgettext can find the internationalizable
 123    strings.  */
 124 # define gettext_noop(String) String
 125 #endif
 126
 127 /* The `emacs' switch turns on certain matching commands
 128    that make sense only in Emacs. */
 129 #ifdef emacs
 130
 131 # include "lisp.h"
 132 # include "character.h"
 133 # include "buffer.h"
 134
 135 # include "syntax.h"
 136 # include "category.h"
 137
 138 /* Make syntax table lookup grant data in gl_state.  */
 139 # define SYNTAX(c) syntax_property (c, 1)
 140
 141 # ifdef malloc
 142 #  undef malloc
 143 # endif
 144 # define malloc xmalloc
 145 # ifdef realloc
 146 #  undef realloc
 147 # endif
 148 # define realloc xrealloc
 149 # ifdef free
 150 #  undef free
 151 # endif
 152 # define free xfree
 153
 154 /* Converts the pointer to the char to BEG-based offset from the start.  */
 155 # define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
 156 /* Strings are 0-indexed, buffers are 1-indexed; we pun on the boolean
 157    result to get the right base index.  */
 158 # define POS_AS_IN_BUFFER(p)                                    \
 159   ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
 160
 161 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
 162 # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
 163 # define RE_STRING_CHAR(p, multibyte) \
 164   (multibyte ? (STRING_CHAR (p)) : (*(p)))
 165 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
 166   (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
 167
 168 # define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
 169
 170 # define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
 171
 172 /* Set C to the (possibly converted to multibyte) character before P.
 173    P points into a string which is the virtual concatenation of STR1
 174    (which ends at END1) and STR2 (which ends at END2).  */
 175 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)                     \
 176   do {                                                                       \
 177     if (target_multibyte)                                                    \
 178       {                                                                      \
 179         re_char *dtemp = (p) == (str2) ? (end1) : (p);                       \
 180         re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
 181         while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));                   \
 182         c = STRING_CHAR (dtemp);                                             \
 183       }                                                                      \
 184     else                                                                     \
 185       {                                                                      \
 186         (c = ((p) == (str2) ? (end1) : (p))[-1]);                            \
 187         (c) = RE_CHAR_TO_MULTIBYTE (c);                                      \
 188       }                                                                      \
 189   } while (0)
 190
 191 /* Set C a (possibly converted to multibyte) character at P, and set
 192    LEN to the byte length of that character.  */
 193 # define GET_CHAR_AFTER(c, p, len)              \
 194   do {                                          \
 195     if (target_multibyte)                       \
 196       (c) = STRING_CHAR_AND_LENGTH (p, len);    \
 197     else                                        \
 198       {                                         \
 199         (c) = *p;                               \
 200         len = 1;                                \
 201         (c) = RE_CHAR_TO_MULTIBYTE (c);         \
 202       }                                         \
 203    } while (0)
 204
 205 #else  /* not emacs */
 206
 207 /* If we are not linking with Emacs proper,
 208    we can't use the relocating allocator
 209    even if config.h says that we can.  */
 210 # undef REL_ALLOC
 211
 212 # include <unistd.h>
 213
 214 /* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
 215
 216 static ATTRIBUTE_MALLOC void *
 217 xmalloc (size_t size)
 218 {
 219   void *val = malloc (size);
 220   if (!val && size)
 221     {
 222       write (STDERR_FILENO, "virtual memory exhausted\n", 25);
 223       exit (1);
 224     }
 225   return val;
 226 }
 227
 228 static void *
 229 xrealloc (void *block, size_t size)
 230 {
 231   void *val;
 232   /* We must call malloc explicitly when BLOCK is 0, since some
 233      reallocs don't do this.  */
 234   if (! block)
 235     val = malloc (size);
 236   else
 237     val = realloc (block, size);
 238   if (!val && size)
 239     {
 240       write (STDERR_FILENO, "virtual memory exhausted\n", 25);
 241       exit (1);
 242     }
 243   return val;
 244 }
 245
 246 # ifdef malloc
 247 #  undef malloc
 248 # endif
 249 # define malloc xmalloc
 250 # ifdef realloc
 251 #  undef realloc
 252 # endif
 253 # define realloc xrealloc
 254
 255 # include <stdbool.h>
 256 # include <string.h>
 257
 258 /* Define the syntax stuff for \<, \>, etc.  */
 259
 260 /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
 261 enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
 262
 263 /* Dummy macros for non-Emacs environments.  */
 264 # define MAX_MULTIBYTE_LENGTH 1
 265 # define RE_MULTIBYTE_P(x) 0
 266 # define RE_TARGET_MULTIBYTE_P(x) 0
 267 # define WORD_BOUNDARY_P(c1, c2) (0)
 268 # define BYTES_BY_CHAR_HEAD(p) (1)
 269 # define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
 270 # define STRING_CHAR(p) (*(p))
 271 # define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
 272 # define CHAR_STRING(c, s) (*(s) = (c), 1)
 273 # define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
 274 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
 275 # define RE_CHAR_TO_MULTIBYTE(c) (c)
 276 # define RE_CHAR_TO_UNIBYTE(c) (c)
 277 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
 278   (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
 279 # define GET_CHAR_AFTER(c, p, len)      \
 280   (c = *p, len = 1)
 281 # define CHAR_BYTE8_P(c) (0)
 282 # define CHAR_LEADING_CODE(c) (c)
 283
 284 #endif /* not emacs */
 285
 286 #ifndef RE_TRANSLATE
 287 # define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
 288 # define RE_TRANSLATE_P(TBL) (TBL)
 289 #endif
 290 \f
 291 /* Get the interface, including the syntax bits.  */
 292 #include "regex.h"
 293
 294 /* isalpha etc. are used for the character classes.  */
 295 #include <ctype.h>
 296
 297 #ifdef emacs
 298
 299 /* 1 if C is an ASCII character.  */
 300 # define IS_REAL_ASCII(c) ((c) < 0200)
 301
 302 /* 1 if C is a unibyte character.  */
 303 # define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
 304
 305 /* The Emacs definitions should not be directly affected by locales.  */
 306
 307 /* In Emacs, these are only used for single-byte characters.  */
 308 # define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
 309 # define ISCNTRL(c) ((c) < ' ')
 310 # define ISXDIGIT(c) (0 <= char_hexdigit (c))
 311
 312 /* The rest must handle multibyte characters.  */
 313
 314 # define ISBLANK(c) (IS_REAL_ASCII (c)                  \
 315                      ? ((c) == ' ' || (c) == '\t')      \
 316                      : blankp (c))
 317
 318 # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)                             \
 319                      ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)       \
 320                      : graphicp (c))
 321
 322 # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)                             \
 323                     ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)       \
 324                      : printablep (c))
 325
 326 # define ISALNUM(c) (IS_REAL_ASCII (c)                  \
 327                     ? (((c) >= 'a' && (c) <= 'z')       \
 328                        || ((c) >= 'A' && (c) <= 'Z')    \
 329                        || ((c) >= '0' && (c) <= '9'))   \
 330                     : alphanumericp (c))
 331
 332 # define ISALPHA(c) (IS_REAL_ASCII (c)                  \
 333                     ? (((c) >= 'a' && (c) <= 'z')       \
 334                        || ((c) >= 'A' && (c) <= 'Z'))   \
 335                     : alphabeticp (c))
 336
 337 # define ISLOWER(c) lowercasep (c)
 338
 339 # define ISPUNCT(c) (IS_REAL_ASCII (c)                          \
 340                     ? ((c) > ' ' && (c) < 0177                  \
 341                        && !(((c) >= 'a' && (c) <= 'z')          \
 342                             || ((c) >= 'A' && (c) <= 'Z')       \
 343                             || ((c) >= '0' && (c) <= '9')))     \
 344                     : SYNTAX (c) != Sword)
 345
 346 # define ISSPACE(c) (SYNTAX (c) == Swhitespace)
 347
 348 # define ISUPPER(c) uppercasep (c)
 349
 350 # define ISWORD(c) (SYNTAX (c) == Sword)
 351
 352 #else /* not emacs */
 353
 354 /* 1 if C is an ASCII character.  */
 355 # define IS_REAL_ASCII(c) ((c) < 0200)
 356
 357 /* This distinction is not meaningful, except in Emacs.  */
 358 # define ISUNIBYTE(c) 1
 359
 360 # ifdef isblank
 361 #  define ISBLANK(c) isblank (c)
 362 # else
 363 #  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
 364 # endif
 365 # ifdef isgraph
 366 #  define ISGRAPH(c) isgraph (c)
 367 # else
 368 #  define ISGRAPH(c) (isprint (c) && !isspace (c))
 369 # endif
 370
 371 /* Solaris defines ISPRINT so we must undefine it first.  */
 372 # undef ISPRINT
 373 # define ISPRINT(c) isprint (c)
 374 # define ISDIGIT(c) isdigit (c)
 375 # define ISALNUM(c) isalnum (c)
 376 # define ISALPHA(c) isalpha (c)
 377 # define ISCNTRL(c) iscntrl (c)
 378 # define ISLOWER(c) islower (c)
 379 # define ISPUNCT(c) ispunct (c)
 380 # define ISSPACE(c) isspace (c)
 381 # define ISUPPER(c) isupper (c)
 382 # define ISXDIGIT(c) isxdigit (c)
 383
 384 # define ISWORD(c) ISALPHA (c)
 385
 386 # ifdef _tolower
 387 #  define TOLOWER(c) _tolower (c)
 388 # else
 389 #  define TOLOWER(c) tolower (c)
 390 # endif
 391
 392 /* How many characters in the character set.  */
 393 # define CHAR_SET_SIZE 256
 394
 395 # ifdef SYNTAX_TABLE
 396
 397 extern char *re_syntax_table;
 398
 399 # else /* not SYNTAX_TABLE */
 400
 401 static char re_syntax_table[CHAR_SET_SIZE];
 402
 403 static void
 404 init_syntax_once (void)
 405 {
 406    register int c;
 407    static int done = 0;
 408
 409    if (done)
 410      return;
 411
 412    memset (re_syntax_table, 0, sizeof re_syntax_table);
 413
 414    for (c = 0; c < CHAR_SET_SIZE; ++c)
 415      if (ISALNUM (c))
 416         re_syntax_table[c] = Sword;
 417
 418    re_syntax_table['_'] = Ssymbol;
 419
 420    done = 1;
 421 }
 422
 423 # endif /* not SYNTAX_TABLE */
 424
 425 # define SYNTAX(c) re_syntax_table[(c)]
 426
 427 #endif /* not emacs */
 428 \f
 429 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
 430 \f
 431 /* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
 432    use `alloca' instead of `malloc'.  This is because using malloc in
 433    re_search* or re_match* could cause memory leaks when C-g is used
 434    in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
 435    via `record_xmalloc' which uses `unwind_protect' to ensure the
 436    memory is freed even in case of non-local exits); also, malloc is
 437    slower and causes storage fragmentation.  On the other hand, malloc
 438    is more portable, and easier to debug.
 439
 440    Because we sometimes use alloca, some routines have to be macros,
 441    not functions -- `alloca'-allocated space disappears at the end of the
 442    function it is called in.  */
 443
 444 #ifdef REGEX_MALLOC
 445
 446 # define REGEX_ALLOCATE malloc
 447 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
 448 # define REGEX_FREE free
 449
 450 #else /* not REGEX_MALLOC  */
 451
 452 # ifdef emacs
 453 /* This may be adjusted in main(), if the stack is successfully grown.  */
 454 ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
 455 /* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca.  */
 456 #  define REGEX_USE_SAFE_ALLOCA                                        \
 457   ptrdiff_t sa_avail = emacs_re_safe_alloca;                           \
 458   ptrdiff_t sa_count = SPECPDL_INDEX (); bool sa_must_free = false
 459
 460 #  define REGEX_SAFE_FREE() SAFE_FREE ()
 461 #  define REGEX_ALLOCATE SAFE_ALLOCA
 462 # else
 463 #  include <alloca.h>
 464 #  define REGEX_ALLOCATE alloca
 465 # endif
 466
 467 /* Assumes a `char *destination' variable.  */
 468 # define REGEX_REALLOCATE(source, osize, nsize)                         \
 469   (destination = REGEX_ALLOCATE (nsize),                                \
 470    memcpy (destination, source, osize))
 471
 472 /* No need to do anything to free, after alloca.  */
 473 # define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
 474
 475 #endif /* not REGEX_MALLOC */
 476
 477 #ifndef REGEX_USE_SAFE_ALLOCA
 478 # define REGEX_USE_SAFE_ALLOCA ((void) 0)
 479 # define REGEX_SAFE_FREE() ((void) 0)
 480 #endif
 481
 482 /* Define how to allocate the failure stack.  */
 483
 484 #if defined REL_ALLOC && defined REGEX_MALLOC
 485
 486 # define REGEX_ALLOCATE_STACK(size)                             \
 487   r_alloc (&failure_stack_ptr, (size))
 488 # define REGEX_REALLOCATE_STACK(source, osize, nsize)           \
 489   r_re_alloc (&failure_stack_ptr, (nsize))
 490 # define REGEX_FREE_STACK(ptr)                                  \
 491   r_alloc_free (&failure_stack_ptr)
 492
 493 #else /* not using relocating allocator */
 494
 495 # define REGEX_ALLOCATE_STACK(size) REGEX_ALLOCATE (size)
 496 # define REGEX_REALLOCATE_STACK(source, o, n) REGEX_REALLOCATE (source, o, n)
 497 # define REGEX_FREE_STACK(ptr) REGEX_FREE (ptr)
 498
 499 #endif /* not using relocating allocator */
 500
 501
 502 /* True if `size1' is nonzero and PTR is pointing anywhere inside
 503    `string1' or just past its end.  This works if PTR is NULL, which is
 504    a good thing.  */
 505 #define FIRST_STRING_P(ptr)                                     \
 506   (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
 507
 508 /* (Re)Allocate N items of type T using malloc, or fail.  */
 509 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
 510 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
 511 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
 512
 513 #define BYTEWIDTH 8 /* In bits.  */
 514
 515 #ifndef emacs
 516 # undef max
 517 # undef min
 518 # define max(a, b) ((a) > (b) ? (a) : (b))
 519 # define min(a, b) ((a) < (b) ? (a) : (b))
 520 #endif
 521
 522 /* Type of source-pattern and string chars.  */
 523 typedef const unsigned char re_char;
 524
 525 typedef char boolean;
 526
 527 static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
 528                                      re_char *string1, size_t size1,
 529                                      re_char *string2, size_t size2,
 530                                      ssize_t pos,
 531                                      struct re_registers *regs,
 532                                      ssize_t stop);
 533 \f
 534 /* These are the command codes that appear in compiled regular
 535    expressions.  Some opcodes are followed by argument bytes.  A
 536    command code can specify any interpretation whatsoever for its
 537    arguments.  Zero bytes may appear in the compiled regular expression.  */
 538
 539 typedef enum
 540 {
 541   no_op = 0,
 542
 543   /* Succeed right away--no more backtracking.  */
 544   succeed,
 545
 546         /* Followed by one byte giving n, then by n literal bytes.  */
 547   exactn,
 548
 549         /* Matches any (more or less) character.  */
 550   anychar,
 551
 552         /* Matches any one char belonging to specified set.  First
 553            following byte is number of bitmap bytes.  Then come bytes
 554            for a bitmap saying which chars are in.  Bits in each byte
 555            are ordered low-bit-first.  A character is in the set if its
 556            bit is 1.  A character too large to have a bit in the map is
 557            automatically not in the set.
 558
 559            If the length byte has the 0x80 bit set, then that stuff
 560            is followed by a range table:
 561                2 bytes of flags for character sets (low 8 bits, high 8 bits)
 562                    See RANGE_TABLE_WORK_BITS below.
 563                2 bytes, the number of pairs that follow (up to 32767)
 564                pairs, each 2 multibyte characters,
 565                    each multibyte character represented as 3 bytes.  */
 566   charset,
 567
 568         /* Same parameters as charset, but match any character that is
 569            not one of those specified.  */
 570   charset_not,
 571
 572         /* Start remembering the text that is matched, for storing in a
 573            register.  Followed by one byte with the register number, in
 574            the range 0 to one less than the pattern buffer's re_nsub
 575            field.  */
 576   start_memory,
 577
 578         /* Stop remembering the text that is matched and store it in a
 579            memory register.  Followed by one byte with the register
 580            number, in the range 0 to one less than `re_nsub' in the
 581            pattern buffer.  */
 582   stop_memory,
 583
 584         /* Match a duplicate of something remembered. Followed by one
 585            byte containing the register number.  */
 586   duplicate,
 587
 588         /* Fail unless at beginning of line.  */
 589   begline,
 590
 591         /* Fail unless at end of line.  */
 592   endline,
 593
 594         /* Succeeds if at beginning of buffer (if emacs) or at beginning
 595            of string to be matched (if not).  */
 596   begbuf,
 597
 598         /* Analogously, for end of buffer/string.  */
 599   endbuf,
 600
 601         /* Followed by two byte relative address to which to jump.  */
 602   jump,
 603
 604         /* Followed by two-byte relative address of place to resume at
 605            in case of failure.  */
 606   on_failure_jump,
 607
 608         /* Like on_failure_jump, but pushes a placeholder instead of the
 609            current string position when executed.  */
 610   on_failure_keep_string_jump,
 611
 612         /* Just like `on_failure_jump', except that it checks that we
 613            don't get stuck in an infinite loop (matching an empty string
 614            indefinitely).  */
 615   on_failure_jump_loop,
 616
 617         /* Just like `on_failure_jump_loop', except that it checks for
 618            a different kind of loop (the kind that shows up with non-greedy
 619            operators).  This operation has to be immediately preceded
 620            by a `no_op'.  */
 621   on_failure_jump_nastyloop,
 622
 623         /* A smart `on_failure_jump' used for greedy * and + operators.
 624            It analyzes the loop before which it is put and if the
 625            loop does not require backtracking, it changes itself to
 626            `on_failure_keep_string_jump' and short-circuits the loop,
 627            else it just defaults to changing itself into `on_failure_jump'.
 628            It assumes that it is pointing to just past a `jump'.  */
 629   on_failure_jump_smart,
 630
 631         /* Followed by two-byte relative address and two-byte number n.
 632            After matching N times, jump to the address upon failure.
 633            Does not work if N starts at 0: use on_failure_jump_loop
 634            instead.  */
 635   succeed_n,
 636
 637         /* Followed by two-byte relative address, and two-byte number n.
 638            Jump to the address N times, then fail.  */
 639   jump_n,
 640
 641         /* Set the following two-byte relative address to the
 642            subsequent two-byte number.  The address *includes* the two
 643            bytes of number.  */
 644   set_number_at,
 645
 646   wordbeg,      /* Succeeds if at word beginning.  */
 647   wordend,      /* Succeeds if at word end.  */
 648
 649   wordbound,    /* Succeeds if at a word boundary.  */
 650   notwordbound, /* Succeeds if not at a word boundary.  */
 651
 652   symbeg,       /* Succeeds if at symbol beginning.  */
 653   symend,       /* Succeeds if at symbol end.  */
 654
 655         /* Matches any character whose syntax is specified.  Followed by
 656            a byte which contains a syntax code, e.g., Sword.  */
 657   syntaxspec,
 658
 659         /* Matches any character whose syntax is not that specified.  */
 660   notsyntaxspec
 661
 662 #ifdef emacs
 663   , at_dot,     /* Succeeds if at point.  */
 664
 665   /* Matches any character whose category-set contains the specified
 666      category.  The operator is followed by a byte which contains a
 667      category code (mnemonic ASCII character).  */
 668   categoryspec,
 669
 670   /* Matches any character whose category-set does not contain the
 671      specified category.  The operator is followed by a byte which
 672      contains the category code (mnemonic ASCII character).  */
 673   notcategoryspec
 674 #endif /* emacs */
 675 } re_opcode_t;
 676 \f
 677 /* Common operations on the compiled pattern.  */
 678
 679 /* Store NUMBER in two contiguous bytes starting at DESTINATION.  */
 680
 681 #define STORE_NUMBER(destination, number)                               \
 682   do {                                                                  \
 683     (destination)[0] = (number) & 0377;                                 \
 684     (destination)[1] = (number) >> 8;                                   \
 685   } while (0)
 686
 687 /* Same as STORE_NUMBER, except increment DESTINATION to
 688    the byte after where the number is stored.  Therefore, DESTINATION
 689    must be an lvalue.  */
 690
 691 #define STORE_NUMBER_AND_INCR(destination, number)                      \
 692   do {                                                                  \
 693     STORE_NUMBER (destination, number);                                 \
 694     (destination) += 2;                                                 \
 695   } while (0)
 696
 697 /* Put into DESTINATION a number stored in two contiguous bytes starting
 698    at SOURCE.  */
 699
 700 #define EXTRACT_NUMBER(destination, source)                             \
 701   ((destination) = extract_number (source))
 702
 703 static int
 704 extract_number (re_char *source)
 705 {
 706   unsigned leading_byte = SIGN_EXTEND_CHAR (source[1]);
 707   return (leading_byte << 8) + source[0];
 708 }
 709
 710 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
 711    SOURCE must be an lvalue.  */
 712
 713 #define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
 714   ((destination) = extract_number_and_incr (&source))
 715
 716 static int
 717 extract_number_and_incr (re_char **source)
 718 {
 719   int num = extract_number (*source);
 720   *source += 2;
 721   return num;
 722 }
 723 \f
 724 /* Store a multibyte character in three contiguous bytes starting at
 725    DESTINATION, and increment DESTINATION to the byte after where the
 726    character is stored.  Therefore, DESTINATION must be an lvalue.  */
 727
 728 #define STORE_CHARACTER_AND_INCR(destination, character)        \
 729   do {                                                          \
 730     (destination)[0] = (character) & 0377;                      \
 731     (destination)[1] = ((character) >> 8) & 0377;               \
 732     (destination)[2] = (character) >> 16;                       \
 733     (destination) += 3;                                         \
 734   } while (0)
 735
 736 /* Put into DESTINATION a character stored in three contiguous bytes
 737    starting at SOURCE.  */
 738
 739 #define EXTRACT_CHARACTER(destination, source)  \
 740   do {                                          \
 741     (destination) = ((source)[0]                \
 742                      | ((source)[1] << 8)       \
 743                      | ((source)[2] << 16));    \
 744   } while (0)
 745
 746
 747 /* Macros for charset. */
 748
 749 /* Size of bitmap of charset P in bytes.  P is a start of charset,
 750    i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
 751 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
 752
 753 /* Nonzero if charset P has range table.  */
 754 #define CHARSET_RANGE_TABLE_EXISTS_P(p)  ((p)[1] & 0x80)
 755
 756 /* Return the address of the range table of charset P.  This is not the
 757    start of the table itself, but the position where the number of
 758    ranges is stored.  `4 +' means to skip re_opcode_t, the size of the
 759    bitmap, and the 2 bytes of flags at the start of the range table.  */
 760 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
 761
 762 #ifdef emacs
 763 /* Extract the bit flags that start a range table.  */
 764 #define CHARSET_RANGE_TABLE_BITS(p)             \
 765   ((p)[2 + CHARSET_BITMAP_SIZE (p)]             \
 766    + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
 767 #endif
 768
 769 /* Return the address of end of RANGE_TABLE.  COUNT is number of
 770    ranges (which is a pair of (start, end)) in the RANGE_TABLE.  `* 2'
 771    is start of range and end of range.  `* 3' is size of each start
 772    and end.  */
 773 #define CHARSET_RANGE_TABLE_END(range_table, count)     \
 774   ((range_table) + (count) * 2 * 3)
 775 \f
 776 /* If DEBUG is defined, Regex prints many voluminous messages about what
 777    it is doing (if the variable `debug' is nonzero).  If linked with the
 778    main program in `iregex.c', you can enter patterns and strings
 779    interactively.  And if linked with the main program in `main.c' and
 780    the other test files, you can run the already-written tests.  */
 781
 782 #ifdef DEBUG
 783
 784 /* We use standard I/O for debugging.  */
 785 # include <stdio.h>
 786
 787 /* It is useful to test things that ``must'' be true when debugging.  */
 788 # include <assert.h>
 789
 790 static int debug = -100000;
 791
 792 # define DEBUG_STATEMENT(e) e
 793 # define DEBUG_PRINT(...) if (debug > 0) printf (__VA_ARGS__)
 794 # define DEBUG_COMPILES_ARGUMENTS
 795 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                          \
 796   if (debug > 0) print_partial_compiled_pattern (s, e)
 797 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                 \
 798   if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
 799
 800
 801 /* Print the fastmap in human-readable form.  */
 802
 803 static void
 804 print_fastmap (char *fastmap)
 805 {
 806   unsigned was_a_range = 0;
 807   unsigned i = 0;
 808
 809   while (i < (1 << BYTEWIDTH))
 810     {
 811       if (fastmap[i++])
 812         {
 813           was_a_range = 0;
 814           putchar (i - 1);
 815           while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
 816             {
 817               was_a_range = 1;
 818               i++;
 819             }
 820           if (was_a_range)
 821             {
 822               printf ("-");
 823               putchar (i - 1);
 824             }
 825         }
 826     }
 827   putchar ('\n');
 828 }
 829
 830
 831 /* Print a compiled pattern string in human-readable form, starting at
 832    the START pointer into it and ending just before the pointer END.  */
 833
 834 static void
 835 print_partial_compiled_pattern (re_char *start, re_char *end)
 836 {
 837   int mcnt, mcnt2;
 838   re_char *p = start;
 839   re_char *pend = end;
 840
 841   if (start == NULL)
 842     {
 843       fprintf (stderr, "(null)\n");
 844       return;
 845     }
 846
 847   /* Loop over pattern commands.  */
 848   while (p < pend)
 849     {
 850       fprintf (stderr, "%td:\t", p - start);
 851
 852       switch ((re_opcode_t) *p++)
 853         {
 854         case no_op:
 855           fprintf (stderr, "/no_op");
 856           break;
 857
 858         case succeed:
 859           fprintf (stderr, "/succeed");
 860           break;
 861
 862         case exactn:
 863           mcnt = *p++;
 864           fprintf (stderr, "/exactn/%d", mcnt);
 865           do
 866             {
 867               fprintf (stderr, "/%c", *p++);
 868             }
 869           while (--mcnt);
 870           break;
 871
 872         case start_memory:
 873           fprintf (stderr, "/start_memory/%d", *p++);
 874           break;
 875
 876         case stop_memory:
 877           fprintf (stderr, "/stop_memory/%d", *p++);
 878           break;
 879
 880         case duplicate:
 881           fprintf (stderr, "/duplicate/%d", *p++);
 882           break;
 883
 884         case anychar:
 885           fprintf (stderr, "/anychar");
 886           break;
 887
 888         case charset:
 889         case charset_not:
 890           {
 891             register int c, last = -100;
 892             register int in_range = 0;
 893             int length = CHARSET_BITMAP_SIZE (p - 1);
 894             int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
 895
 896             fprintf (stderr, "/charset [%s",
 897                      (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
 898
 899             if (p + *p >= pend)
 900               fprintf (stderr, " !extends past end of pattern! ");
 901
 902             for (c = 0; c < 256; c++)
 903               if (c / 8 < length
 904                   && (p[1 + (c/8)] & (1 << (c % 8))))
 905                 {
 906                   /* Are we starting a range?  */
 907                   if (last + 1 == c && ! in_range)
 908                     {
 909                       fprintf (stderr, "-");
 910                       in_range = 1;
 911                     }
 912                   /* Have we broken a range?  */
 913                   else if (last + 1 != c && in_range)
 914                     {
 915                       fprintf (stderr, "%c", last);
 916                       in_range = 0;
 917                     }
 918
 919                   if (! in_range)
 920                     fprintf (stderr, "%c", c);
 921
 922                   last = c;
 923               }
 924
 925             if (in_range)
 926               fprintf (stderr, "%c", last);
 927
 928             fprintf (stderr, "]");
 929
 930             p += 1 + length;
 931
 932             if (has_range_table)
 933               {
 934                 int count;
 935                 fprintf (stderr, "has-range-table");
 936
 937                 /* ??? Should print the range table; for now, just skip it.  */
 938                 p += 2;         /* skip range table bits */
 939                 EXTRACT_NUMBER_AND_INCR (count, p);
 940                 p = CHARSET_RANGE_TABLE_END (p, count);
 941               }
 942           }
 943           break;
 944
 945         case begline:
 946           fprintf (stderr, "/begline");
 947           break;
 948
 949         case endline:
 950           fprintf (stderr, "/endline");
 951           break;
 952
 953         case on_failure_jump:
 954           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 955           fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
 956           break;
 957
 958         case on_failure_keep_string_jump:
 959           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 960           fprintf (stderr, "/on_failure_keep_string_jump to %td",
 961                    p + mcnt - start);
 962           break;
 963
 964         case on_failure_jump_nastyloop:
 965           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 966           fprintf (stderr, "/on_failure_jump_nastyloop to %td",
 967                    p + mcnt - start);
 968           break;
 969
 970         case on_failure_jump_loop:
 971           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 972           fprintf (stderr, "/on_failure_jump_loop to %td",
 973                    p + mcnt - start);
 974           break;
 975
 976         case on_failure_jump_smart:
 977           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 978           fprintf (stderr, "/on_failure_jump_smart to %td",
 979                    p + mcnt - start);
 980           break;
 981
 982         case jump:
 983           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 984           fprintf (stderr, "/jump to %td", p + mcnt - start);
 985           break;
 986
 987         case succeed_n:
 988           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 989           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
 990           fprintf (stderr, "/succeed_n to %td, %d times",
 991                    p - 2 + mcnt - start, mcnt2);
 992           break;
 993
 994         case jump_n:
 995           EXTRACT_NUMBER_AND_INCR (mcnt, p);
 996           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
 997           fprintf (stderr, "/jump_n to %td, %d times",
 998                    p - 2 + mcnt - start, mcnt2);
 999           break;
1000
1001         case set_number_at:
1002           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1003           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1004           fprintf (stderr, "/set_number_at location %td to %d",
1005                    p - 2 + mcnt - start, mcnt2);
1006           break;
1007
1008         case wordbound:
1009           fprintf (stderr, "/wordbound");
1010           break;
1011
1012         case notwordbound:
1013           fprintf (stderr, "/notwordbound");
1014           break;
1015
1016         case wordbeg:
1017           fprintf (stderr, "/wordbeg");
1018           break;
1019
1020         case wordend:
1021           fprintf (stderr, "/wordend");
1022           break;
1023
1024         case symbeg:
1025           fprintf (stderr, "/symbeg");
1026           break;
1027
1028         case symend:
1029           fprintf (stderr, "/symend");
1030           break;
1031
1032         case syntaxspec:
1033           fprintf (stderr, "/syntaxspec");
1034           mcnt = *p++;
1035           fprintf (stderr, "/%d", mcnt);
1036           break;
1037
1038         case notsyntaxspec:
1039           fprintf (stderr, "/notsyntaxspec");
1040           mcnt = *p++;
1041           fprintf (stderr, "/%d", mcnt);
1042           break;
1043
1044 # ifdef emacs
1045         case at_dot:
1046           fprintf (stderr, "/at_dot");
1047           break;
1048
1049         case categoryspec:
1050           fprintf (stderr, "/categoryspec");
1051           mcnt = *p++;
1052           fprintf (stderr, "/%d", mcnt);
1053           break;
1054
1055         case notcategoryspec:
1056           fprintf (stderr, "/notcategoryspec");
1057           mcnt = *p++;
1058           fprintf (stderr, "/%d", mcnt);
1059           break;
1060 # endif /* emacs */
1061
1062         case begbuf:
1063           fprintf (stderr, "/begbuf");
1064           break;
1065
1066         case endbuf:
1067           fprintf (stderr, "/endbuf");
1068           break;
1069
1070         default:
1071           fprintf (stderr, "?%d", *(p-1));
1072         }
1073
1074       fprintf (stderr, "\n");
1075     }
1076
1077   fprintf (stderr, "%td:\tend of pattern.\n", p - start);
1078 }
1079
1080
1081 static void
1082 print_compiled_pattern (struct re_pattern_buffer *bufp)
1083 {
1084   re_char *buffer = bufp->buffer;
1085
1086   print_partial_compiled_pattern (buffer, buffer + bufp->used);
1087   printf ("%ld bytes used/%ld bytes allocated.\n",
1088           bufp->used, bufp->allocated);
1089
1090   if (bufp->fastmap_accurate && bufp->fastmap)
1091     {
1092       printf ("fastmap: ");
1093       print_fastmap (bufp->fastmap);
1094     }
1095
1096   printf ("re_nsub: %zu\t", bufp->re_nsub);
1097   printf ("regs_alloc: %d\t", bufp->regs_allocated);
1098   printf ("can_be_null: %d\t", bufp->can_be_null);
1099   printf ("no_sub: %d\t", bufp->no_sub);
1100   printf ("not_bol: %d\t", bufp->not_bol);
1101   printf ("not_eol: %d\t", bufp->not_eol);
1102 #ifndef emacs
1103   printf ("syntax: %lx\n", bufp->syntax);
1104 #endif
1105   fflush (stdout);
1106   /* Perhaps we should print the translate table?  */
1107 }
1108
1109
1110 static void
1111 print_double_string (re_char *where, re_char *string1, ssize_t size1,
1112                      re_char *string2, ssize_t size2)
1113 {
1114   ssize_t this_char;
1115
1116   if (where == NULL)
1117     printf ("(null)");
1118   else
1119     {
1120       if (FIRST_STRING_P (where))
1121         {
1122           for (this_char = where - string1; this_char < size1; this_char++)
1123             putchar (string1[this_char]);
1124
1125           where = string2;
1126         }
1127
1128       for (this_char = where - string2; this_char < size2; this_char++)
1129         putchar (string2[this_char]);
1130     }
1131 }
1132
1133 #else /* not DEBUG */
1134
/* Without DEBUG, assertions and all debugging macros expand to
   nothing, so their arguments are never evaluated.  */
# undef assert
# define assert(e)

# define DEBUG_STATEMENT(e)
# define DEBUG_PRINT(...)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1142
1143 #endif /* not DEBUG */
1144 \f
1145 #ifndef emacs
1146
1147 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
1148    also be assigned to arbitrarily: each pattern buffer stores its own
1149    syntax, so it can be changed between regex compilations.  */
1150 /* This has no initializer because initialized variables in Emacs
1151    become read-only after dumping.  */
1152 reg_syntax_t re_syntax_options;
1153
1154
1155 /* Specify the precise syntax of regexps for compilation.  This provides
1156    for compatibility for various utilities which historically have
1157    different, incompatible syntaxes.
1158
1159    The argument SYNTAX is a bit mask comprised of the various bits
1160    defined in regex.h.  We return the old syntax.  */
1161
1162 reg_syntax_t
1163 re_set_syntax (reg_syntax_t syntax)
1164 {
1165   reg_syntax_t ret = re_syntax_options;
1166
1167   re_syntax_options = syntax;
1168   return ret;
1169 }
1170 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1171
1172 #endif
1173 \f
/* This table gives an error message for each of the error codes listed
   in regex.h, one entry per code; the trailing comment on each line
   names the code the entry belongs to.  The entries must stay in the
   same order as the codes are listed there.
   POSIX doesn't require that we do anything for REG_NOERROR,
   but why not be nice?  */

static const char *re_error_msgid[] =
  {
    gettext_noop ("Success"),   /* REG_NOERROR */
    gettext_noop ("No match"),  /* REG_NOMATCH */
    gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
    gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
    gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
    gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
    gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
    gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
    gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
    gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
    gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
    gettext_noop ("Invalid range end"), /* REG_ERANGE */
    gettext_noop ("Memory exhausted"), /* REG_ESPACE */
    gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
    gettext_noop ("Premature end of regular expression"), /* REG_EEND */
    gettext_noop ("Regular expression too big"), /* REG_ESIZE */
    gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
    gettext_noop ("Range striding over charsets"), /* REG_ERANGEX  */
    gettext_noop ("Invalid content of \\{\\}, repetitions too big") /* REG_ESIZEBR  */
  };
1201 \f
1202 /* Whether to allocate memory during matching.  */
1203
/* Define MATCH_MAY_ALLOCATE to allow the searching and matching
   functions to allocate memory for the failure stack and registers.
   It should normally be defined, because otherwise the searching and
   matching routines will have much smaller memory resources at their
   disposal, and therefore might fail to handle complex regexps.
   Therefore undefine MATCH_MAY_ALLOCATE only in the following
   exceptional situations:

   . When running on a system where memory is at a premium.
   . When alloca cannot be used at all, perhaps due to bugs in
     its implementation, or its being unavailable, or due to a
     very small stack size.  This requires defining REGEX_MALLOC
     to use malloc instead, which in turn could lead to memory
     leaks if a search is interrupted by a signal.  (For these
     reasons, defining REGEX_MALLOC when building Emacs
     automatically undefines MATCH_MAY_ALLOCATE, but outside
     Emacs you may not care about memory leaks.)  If you want to
     prevent the memory leaks, undefine MATCH_MAY_ALLOCATE.
   . When code that calls the searching and matching functions
     cannot allow memory allocation, for whatever reason.  */
1224
/* Normally, this is fine.  */
#define MATCH_MAY_ALLOCATE

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
#if defined REGEX_MALLOC && defined emacs
# undef MATCH_MAY_ALLOCATE
#endif

/* While regex matching of a single compiled pattern isn't reentrant
   (because we compile regexes to bytecode programs, and the bytecode
   programs are self-modifying), the regex machinery must nevertheless
   be reentrant with respect to _different_ patterns, and we do that
   by avoiding global variables and using MATCH_MAY_ALLOCATE.
   Taken together with the conditional just above, this means that
   building with both `emacs' and REGEX_MALLOC defined stops here with
   the error below.  */
#if !defined MATCH_MAY_ALLOCATE && defined emacs
# error "Emacs requires MATCH_MAY_ALLOCATE"
#endif
1245
1246 \f
1247 /* Failure stack declarations and macros; both re_compile_fastmap and
1248    re_match_2 use a failure stack.  These have to be macros because of
1249    REGEX_ALLOCATE_STACK.  */
1250
1251
/* Approximate number of failure points for which to initially allocate space
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  May be overridden by defining
   it before this point.  */
#ifndef INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 20
#endif

/* Roughly the maximum number of failure points on the stack.  It would
   be exactly that if every failure point used TYPICAL_FAILURE_SIZE
   items.  This is a variable only so users of regex can assign to it;
   we never change it ourselves.  We always multiply it by
   TYPICAL_FAILURE_SIZE before using it, so it should probably be a
   byte-count instead.  */
# if defined MATCH_MAY_ALLOCATE
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
size_t emacs_re_max_failures = 40000;
# else
/* Smaller default used when the matcher may not allocate.  */
size_t emacs_re_max_failures = 4000;
# endif
1273
1274 union fail_stack_elt
1275 {
1276   re_char *pointer;
1277   /* This should be the biggest `int' that's no bigger than a pointer.  */
1278   long integer;
1279 };
1280
1281 typedef union fail_stack_elt fail_stack_elt_t;
1282
1283 typedef struct
1284 {
1285   fail_stack_elt_t *stack;
1286   size_t size;
1287   size_t avail; /* Offset of next open position.  */
1288   size_t frame; /* Offset of the cur constructed frame.  */
1289 } fail_stack_type;
1290
1291 #define FAIL_STACK_EMPTY()     (fail_stack.frame == 0)
1292
1293
/* Define macros to initialize and free the failure stack.
   Both variants assume the variable `fail_stack' and reset it to
   the empty state (avail == frame == 0).

   When MATCH_MAY_ALLOCATE is defined, INIT_FAIL_STACK also allocates
   the initial slots with REGEX_ALLOCATE_STACK and does `return -2'
   from the enclosing function if the alloc fails.  */

#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.stack =                                                  \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE   \
                            * sizeof (fail_stack_elt_t));               \
                                                                        \
    if (fail_stack.stack == NULL)                                       \
      return -2;                                                        \
                                                                        \
    /* `size' counts slots, so record every slot just allocated;       \
       recording only INIT_FAILURE_ALLOC would leave most of the       \
       block unused and force an early GROW_FAIL_STACK.  */            \
    fail_stack.size = INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE;        \
    fail_stack.avail = 0;                                               \
    fail_stack.frame = 0;                                               \
  } while (0)
#else
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.avail = 0;                                               \
    fail_stack.frame = 0;                                               \
  } while (0)

/* Resize ADDR to N elements of type T if it is already allocated,
   otherwise allocate it afresh.  ADDR is evaluated more than once.  */
# define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#endif
1321
1322
/* Multiply the size of FAIL_STACK by FAIL_STACK_GROWTH_FACTOR, up to
   a limit of `emacs_re_max_failures' * TYPICAL_FAILURE_SIZE slots,
   which allows approximately `emacs_re_max_failures' failure points.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.   */

/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
#define FAIL_STACK_GROWTH_FACTOR 4

/* The expansion is a single expression.  The new capacity is the
   smaller of the limit and the grown size; `size' is updated only
   after the reallocation has succeeded.
   NOTE(review): the stack pointer is overwritten before it is tested
   for NULL; whether the old block is still reachable after a failure
   depends on REGEX_REALLOCATE_STACK, which is not visible here.  */
#define GROW_FAIL_STACK(fail_stack)                                     \
  (((fail_stack).size >= emacs_re_max_failures * TYPICAL_FAILURE_SIZE)        \
   ? 0                                                                  \
   : ((fail_stack).stack                                                \
      = REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
          (fail_stack).size * sizeof (fail_stack_elt_t),                \
          min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE,                  \
               ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR))          \
          * sizeof (fail_stack_elt_t)),                                 \
                                                                        \
      (fail_stack).stack == NULL                                        \
      ? 0                                                               \
      : ((fail_stack).size                                              \
         = (min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE,                \
                 ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR))),      \
         1)))
1354
1355
/* Every macro in this group assumes the variable `fail_stack' and
   expands to one fully parenthesized expression, so that it combines
   safely with neighbouring operators at the point of use.  None of
   them checks for room or for emptiness; that is the caller's job.  */

/* Push a pointer value onto the failure stack.
   Probably should only be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_POINTER(item)                                      \
  (fail_stack.stack[fail_stack.avail++].pointer = (item))

/* This pushes an integer-valued item onto the failure stack.
   Probably should only be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item)                                  \
  (fail_stack.stack[fail_stack.avail++].integer = (item))

/* These POP... operations complement the PUSH... operations.
   All assume that `fail_stack' is nonempty.  */
#define POP_FAILURE_POINTER() (fail_stack.stack[--fail_stack.avail].pointer)
#define POP_FAILURE_INT() (fail_stack.stack[--fail_stack.avail].integer)

/* Individual items aside from the registers.  */
#define NUM_NONREG_ITEMS 3

/* Used to examine the stack (to detect infinite loops).  A handle H
   is the offset just past a failure point's three fixed slots: the
   pattern position is at H - 1, the string position at H - 2, and
   the handle of the next older failure point at H - 3.  */
#define FAILURE_PAT(h) (fail_stack.stack[(h) - 1].pointer)
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) (fail_stack.stack[(h) - 3].integer)
#define TOP_FAILURE_HANDLE() (fail_stack.frame)
1381
1382
/* Grow the failure stack until strictly more than SPACE slots are
   free.  Does `return -2' from the enclosing function when
   GROW_FAIL_STACK fails.  Assumes the variable `fail_stack';
   GROW_FAIL_STACK additionally requires `destination' be declared.
   The sizes printed are size_t values, hence `%zu'.  */
#define ENSURE_FAIL_STACK(space)                                        \
while (REMAINING_AVAIL_SLOTS <= (space)) {                              \
  if (!GROW_FAIL_STACK (fail_stack))                                    \
    return -2;                                                          \
  DEBUG_PRINT ("\n  Doubled stack; size now: %zu\n", (fail_stack).size);\
  DEBUG_PRINT ("         slots available: %zu\n", REMAINING_AVAIL_SLOTS);\
}
1390
/* Push register NUM onto the stack.
   Three slots are used, pushed in this order: regstart[NUM],
   regend[NUM], and NUM itself.  POP_FAILURE_REG_OR_COUNT reads them
   back in the reverse order.

   Assumes the variables `fail_stack', `regstart' and `regend'.
   May `return -2' from the enclosing function (via ENSURE_FAIL_STACK)
   if the stack cannot be grown.  `destination' is declared here only
   because growing the stack requires it.  */
#define PUSH_FAILURE_REG(num)                                           \
do {                                                                    \
  char *destination;                                                    \
  long n = num;                                                         \
  ENSURE_FAIL_STACK(3);                                                 \
  DEBUG_PRINT ("    Push reg %ld (spanning %p -> %p)\n",                \
               n, regstart[n], regend[n]);                              \
  PUSH_FAILURE_POINTER (regstart[n]);                                   \
  PUSH_FAILURE_POINTER (regend[n]);                                     \
  PUSH_FAILURE_INT (n);                                                 \
} while (0)
1403
/* Change the counter's value to VAL, but make sure that it will
   be reset when backtracking.
   The number currently stored at PTR is read with EXTRACT_NUMBER and
   saved on the failure stack as three slots, pushed in this order:
   the old value, PTR, and the marker -1.  The marker is what lets
   POP_FAILURE_REG_OR_COUNT tell a saved counter from a saved
   register.  Only then is VAL stored at PTR.

   Assumes the variable `fail_stack'.  May `return -2' from the
   enclosing function (via ENSURE_FAIL_STACK) if the stack cannot be
   grown.  `destination' is declared here only because growing the
   stack requires it.  */
#define PUSH_NUMBER(ptr,val)                                            \
do {                                                                    \
  char *destination;                                                    \
  int c;                                                                \
  ENSURE_FAIL_STACK(3);                                                 \
  EXTRACT_NUMBER (c, ptr);                                              \
  DEBUG_PRINT ("    Push number %p = %d -> %d\n", ptr, c, val);         \
  PUSH_FAILURE_INT (c);                                                 \
  PUSH_FAILURE_POINTER (ptr);                                           \
  PUSH_FAILURE_INT (-1);                                                \
  STORE_NUMBER (ptr, val);                                              \
} while (0)
1418
/* Pop one saved item off the stack and restore it.
   The topmost slot says what the item is: -1 marks a counter saved
   by PUSH_NUMBER, anything else is the number of a register saved by
   PUSH_FAILURE_REG.  A counter is restored by storing its old value
   back at the saved pattern address; a register is restored by
   reloading regend[] and regstart[].

   Assumes the variables `fail_stack', `regstart' and `regend'.  */
#define POP_FAILURE_REG_OR_COUNT()                                      \
do {                                                                    \
  long pfreg = POP_FAILURE_INT ();                                      \
  if (pfreg == -1)                                                      \
    {                                                                   \
      /* It's a counter.  */                                            \
      /* Here, we discard `const', making re_match non-reentrant.  */   \
      unsigned char *ptr = (unsigned char *) POP_FAILURE_POINTER ();    \
      pfreg = POP_FAILURE_INT ();                                       \
      STORE_NUMBER (ptr, pfreg);                                        \
      DEBUG_PRINT ("     Pop counter %p = %ld\n", ptr, pfreg);          \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      /* It's a register: end was pushed last, so it comes off first.  */ \
      regend[pfreg] = POP_FAILURE_POINTER ();                           \
      regstart[pfreg] = POP_FAILURE_POINTER ();                         \
      DEBUG_PRINT ("     Pop reg %ld (spanning %p -> %p)\n",            \
                   pfreg, regstart[pfreg], regend[pfreg]);              \
    }                                                                   \
} while (0)
1440
/* Check that we are not stuck in an infinite loop.
   Walk the chain of failure points, newest first, for as long as
   their saved string position is STRING_PLACE or NULL.  If one of
   them was pushed for the pattern position PAT_CUR, set the variable
   `cycle' to 1 and stop.

   Assumes the variables `fail_stack' and `cycle', and (for the
   assertion, in DEBUG builds) `bufp'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)                      \
do {                                                                    \
  ssize_t failure = TOP_FAILURE_HANDLE ();                              \
  /* Check for infinite matching loops */                               \
  while (failure > 0                                                    \
         && (FAILURE_STR (failure) == (string_place)                    \
             || FAILURE_STR (failure) == NULL))                         \
    {                                                                   \
      assert (FAILURE_PAT (failure) >= bufp->buffer                     \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
      if (FAILURE_PAT (failure) == (pat_cur))                           \
        {                                                               \
          cycle = 1;                                                    \
          break;                                                        \
        }                                                               \
      DEBUG_PRINT ("  Other pattern: %p\n", FAILURE_PAT (failure));     \
      failure = NEXT_FAILURE_HANDLE(failure);                           \
    }                                                                   \
  /* Handle 0 means the walk ran off the bottom of the stack; there    \
     is no slot to inspect then, so do not index the stack.  */        \
  DEBUG_PRINT ("  Other string: %p\n",                                  \
               failure > 0 ? FAILURE_STR (failure) : NULL);             \
} while (0)
1462
/* Push the information about the state we will need
   if we ever fail back to it.

   A failure point occupies NUM_NONREG_ITEMS slots, pushed in this
   order: the index of the previous frame, STRING_PLACE, and PATTERN.
   Afterwards the frame index is set to the first slot past them.

   Requires the variable `fail_stack' be declared; DEBUG builds also
   use `nfailure_points_pushed', `string1', `size1', `string2',
   `size2' and `pend'.  GROW_FAIL_STACK requires `destination' be
   declared, which is why it is declared here.

   Does `return -2' (via ENSURE_FAIL_STACK) if runs out of memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n");                              \
  DEBUG_PRINT ("  Before push, next avail: %zu\n", (fail_stack).avail); \
  DEBUG_PRINT ("                        size: %zu\n", (fail_stack).size);\
                                                                        \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT ("\n");                                                   \
                                                                        \
  DEBUG_PRINT ("  Push frame index: %zu\n", fail_stack.frame);          \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT ("  Push string %p: \"", string_place);                   \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT ("\"\n");                                                 \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT ("  Push pattern %p: ", pattern);                         \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
1502
/* Estimate the size of data pushed by a typical failure stack entry,
   counted in stack slots.
   An estimate is all we need, because all we use this for
   is to choose a limit for how big to make the failure stack.  */
/* BEWARE, the value `20' is hard-coded in emacs.c:main().  */
#define TYPICAL_FAILURE_SIZE 20

/* How many items can still be added to the stack without overflowing it.
   Assumes the variable `fail_stack'.  */
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1511
1512
/* Pops what PUSH_FAILURE_POINT pushes, together with every register
   or counter that was saved on top of it.

   We restore into the parameters, both of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
   The saved registers and counters are restored by
   POP_FAILURE_REG_OR_COUNT into `regstart', `regend' and the pattern.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */

#define POP_FAILURE_POINT(str, pat)                                     \
do {                                                                    \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Remove failure points and point to how many regs pushed.  */       \
  DEBUG_PRINT ("POP_FAILURE_POINT:\n");                                 \
  DEBUG_PRINT ("  Before pop, next avail: %zu\n", fail_stack.avail);    \
  DEBUG_PRINT ("                     size: %zu\n", fail_stack.size);    \
                                                                        \
  /* Pop the saved registers.  */                                       \
  while (fail_stack.frame < fail_stack.avail)                           \
    POP_FAILURE_REG_OR_COUNT ();                                        \
                                                                        \
  pat = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT ("  Popping pattern %p: ", pat);                          \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* The saved string location is stored into STR unconditionally.     \
     It is NULL when the failure point came from an                    \
     on_failure_keep_string_jump opcode; keeping the current position  \
     in that case is left to the code that uses this macro.  */        \
  str = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT ("  Popping string %p: \"", str);                         \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT ("\"\n");                                                 \
                                                                        \
  fail_stack.frame = POP_FAILURE_INT ();                                \
  DEBUG_PRINT ("  Popping  frame index: %zu\n", fail_stack.frame);      \
                                                                        \
  /* `avail' is unsigned, so only its relation to `frame' is worth     \
     asserting.  */                                                    \
  assert (fail_stack.frame <= fail_stack.avail);                        \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} while (0) /* POP_FAILURE_POINT */
1556
1557
1558 \f
/* A register that has not matched yet holds a null pointer; this tests
   for that sentinel.  */
#define REG_UNSET(e) (!(e))
1561 \f
/* Subroutine declarations and macros for regex_compile.  */

/* The pattern compiler.  The parameters between SIZE and BUFP differ
   between the Emacs build and the stand-alone build.  */
static reg_errcode_t regex_compile (re_char *pattern, size_t size,
#ifdef emacs
                                    bool posix_backtracking,
                                    const char *whitespace_regexp,
#else
                                    reg_syntax_t syntax,
#endif
                                    struct re_pattern_buffer *bufp);
/* Back ends of the STORE_JUMP and STORE_JUMP2 macros: they receive the
   opcode, the location LOC and the already computed operand(s).  */
static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
/* Back ends of the INSERT_JUMP and INSERT_JUMP2 macros, which pass the
   current end of the buffer (`b') as END.  */
static void insert_op1 (re_opcode_t op, unsigned char *loc,
                        int arg, unsigned char *end);
static void insert_op2 (re_opcode_t op, unsigned char *loc,
                        int arg1, int arg2, unsigned char *end);
/* Position predicates on the uncompiled pattern; presumably they tell
   whether P is at a place where a line anchor is meaningful --
   TODO confirm against the definitions.  */
static boolean at_begline_loc_p (re_char *pattern, re_char *p,
                                 reg_syntax_t syntax);
static boolean at_endline_loc_p (re_char *p, re_char *pend,
                                 reg_syntax_t syntax);
/* Helpers defined later in the file; see their definitions for the
   exact contracts.  */
static re_char *skip_one_char (re_char *p);
static int analyze_first (re_char *p, re_char *pend,
                          char *fastmap, const int multibyte);
1585
/* Read the next character of the uncompiled pattern into C, without
   translating it, and advance `p' past it.  When the pattern is already
   exhausted (p == pend) the enclosing function returns REG_EEND.
   Relies on `p', `pend' and `multibyte' from the enclosing scope.  */
#define PATFETCH(c)                                                     \
  do {                                                                  \
    int patfetch_nbytes;                                                \
    if (pend == p)                                                      \
      return REG_EEND;                                                  \
    c = RE_STRING_CHAR_AND_LENGTH (p, patfetch_nbytes, multibyte);      \
    p += patfetch_nbytes;                                               \
  } while (0)
1595
1596
/* Map D through the table `translate' when one is in effect (that is,
   when RE_TRANSLATE_P (translate) holds); otherwise yield D unchanged.
   A definition of TRANSLATE made earlier takes precedence over this
   one.  */
#ifndef TRANSLATE
# define TRANSLATE(d) \
  (!(RE_TRANSLATE_P (translate)) ? (d) : RE_TRANSLATE (translate, (d)))
#endif
1605
1606
/* Helpers for emitting the compiled pattern into `buffer'.  */

/* Allocation to start with when the buffer comes in unallocated.  */
#define INIT_BUF_SIZE  32

/* Keep calling EXTEND_BUFFER until at least N more bytes are available
   past the write pointer `b'.  */
#define GET_BUFFER_SPACE(n)                                             \
    while (bufp->allocated < (size_t) (b - bufp->buffer + (n)))         \
      EXTEND_BUFFER ()
1616
/* Append the single byte C to the buffer, after making sure there is
   room for it.  */
#define BUF_PUSH(c)                                                     \
  do {                                                                  \
    GET_BUFFER_SPACE (1);                                               \
    *b = (unsigned char) (c);                                           \
    ++b;                                                                \
  } while (0)
1623
1624
/* Append the two bytes C1 and C2, in that order, after making sure
   there is room for both.  */
#define BUF_PUSH_2(c1, c2)                                              \
  do {                                                                  \
    GET_BUFFER_SPACE (2);                                               \
    *b = (unsigned char) (c1);                                          \
    ++b;                                                                \
    *b = (unsigned char) (c2);                                          \
    ++b;                                                                \
  } while (0)
1632
1633
/* Operand of a jump placed at LOC whose target is TO: the distance
   between the two, offset by the three bytes the jump itself
   occupies.  */
#define JUMP_DISPLACEMENT(loc, to) ((to) - (loc) - 3)

/* Store a jump with opcode OP at LOC to location TO.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 (op, loc, JUMP_DISPLACEMENT (loc, to))

/* Same, for a jump that carries the extra argument ARG.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 (op, loc, JUMP_DISPLACEMENT (loc, to), arg)

/* Inserting counterpart of `STORE_JUMP'.  `b' is taken to be the end
   of the buffer.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 (op, loc, JUMP_DISPLACEMENT (loc, to), b)

/* Inserting counterpart of `STORE_JUMP2'.  `b' is taken to be the end
   of the buffer.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 (op, loc, JUMP_DISPLACEMENT (loc, to), arg, b)
1650
1651
/* Largest size a compiled pattern may reach.  The limit is not
   arbitrary: arguments that represent offsets into the pattern are two
   bytes long, so if 2^15 bytes turned out to be too small, many things
   would have to change.  */
#define MAX_BUF_SIZE (1L << 15)

/* Double the allocation of `bufp->buffer' (never going past
   MAX_BUF_SIZE) through RETALLOC, then re-aim every pointer into the
   old block -- `b', `begalt' and, when they are non-null,
   `fixup_alt_jump', `laststart' and `pending_exact' -- at the same
   offset inside the new block.

   The enclosing function returns REG_ESIZE when the buffer already has
   MAX_BUF_SIZE bytes, and REG_ESPACE when the reallocation yields a
   null buffer.  */
#define EXTEND_BUFFER()                                                 \
  do {                                                                  \
    unsigned char *const stale_base = bufp->buffer;                     \
    if (bufp->allocated == MAX_BUF_SIZE)                                \
      return REG_ESIZE;                                                 \
    bufp->allocated <<= 1;                                              \
    if (bufp->allocated > MAX_BUF_SIZE)                                 \
      bufp->allocated = MAX_BUF_SIZE;                                   \
    /* Remember every pointer as an offset while the old block is      \
       still valid.  The optional ones are tracked only when set.  */  \
    const ptrdiff_t b_rel = b - stale_base;                             \
    const ptrdiff_t begalt_rel = begalt - stale_base;                   \
    const bool fixup_live = fixup_alt_jump != NULL;                     \
    const bool laststart_live = laststart != NULL;                      \
    const bool exact_live = pending_exact != NULL;                      \
    const ptrdiff_t fixup_rel                                           \
      = fixup_live ? fixup_alt_jump - stale_base : 0;                   \
    const ptrdiff_t laststart_rel                                       \
      = laststart_live ? laststart - stale_base : 0;                    \
    const ptrdiff_t exact_rel                                           \
      = exact_live ? pending_exact - stale_base : 0;                    \
    RETALLOC (bufp->buffer, bufp->allocated, unsigned char);            \
    if (bufp->buffer == NULL)                                           \
      return REG_ESPACE;                                                \
    /* Rebase onto the (possibly moved) block.  */                      \
    b = bufp->buffer + b_rel;                                           \
    begalt = bufp->buffer + begalt_rel;                                 \
    if (fixup_live)                                                     \
      fixup_alt_jump = bufp->buffer + fixup_rel;                        \
    if (laststart_live)                                                 \
      laststart = bufp->buffer + laststart_rel;                         \
    if (exact_live)                                                     \
      pending_exact = bufp->buffer + exact_rel;                         \
  } while (0)
1688
1689
/* The register number argument of {start,stop}_memory occupies a single
   byte, so the highest group we can report things about is whatever
   fits in that byte.  */
#define MAX_REGNUM 255

/* Patterns may still contain more than `MAX_REGNUM' registers (the
   excess is simply ignored), hence a counter type wider than a byte.  */
typedef int regnum_t;


/* Types for the compile stack.  */

/* Offsets may point forwards or backwards, so this type has to hold
   every value from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  `int'
   might not be enough for that when sizeof (int) == 2.  */
typedef long pattern_offset_t;

/* One entry of the compile stack.  */
typedef struct
{
  pattern_offset_t begalt_offset, fixup_alt_jump, laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself: an array of SIZE entries, of which the
   first AVAIL are occupied.  */
typedef struct
{
  compile_stack_elt_t *stack;
  size_t size;
  size_t avail;                 /* Offset of next open position.  */
} compile_stack_type;
1722
1723
/* Number of entries the compile stack starts out with.  */
#define INIT_COMPILE_STACK_SIZE 32

/* Tests on the variable `compile_stack': no entry in use / every entry
   in use.  */
#define COMPILE_STACK_EMPTY  (!compile_stack.avail)
#define COMPILE_STACK_FULL  (compile_stack.size == compile_stack.avail)

/* The next available element (an lvalue).  */
#define COMPILE_STACK_TOP (*(compile_stack.stack + compile_stack.avail))
1731
/* Emacs uses polling to process input events and therefore needs
   explicit quit checking; any other build gets this do-nothing
   stand-in.  */
#ifndef emacs
static void
maybe_quit (void)
{
}
#endif
1737 \f
/* Bookkeeping for the work area used to build a range table.  */
struct range_table_work_area
{
  /* The work area proper.  */
  int *table;
  /* Size allocated for the work area, in bytes.  */
  int allocated;
  /* Size actually in use, in words.  */
  int used;
  /* Flags recording character classes.  */
  int bits;
};
1746
1747 #ifdef emacs
1748
/* Make sure that WORK_AREA can hold N more entries, growing it with
   extend_range_table_work_area when its allocation is too small.

   WORK_AREA must be an lvalue of the work-area structure itself, not a
   pointer to it: its members are accessed with `.' and its address is
   what gets handed to extend_range_table_work_area.

   If the table is null after the attempt to grow it, the surrounding
   function returns REG_ESPACE.  */

#define EXTEND_RANGE_TABLE(work_area, n)                                \
  do {                                                                  \
    if ((work_area).allocated < ((work_area).used + (n)) * sizeof (int)) \
      {                                                                 \
        extend_range_table_work_area (&(work_area));                    \
        if (!(work_area).table)                                         \
          return (REG_ESPACE);                                          \
      }                                                                 \
  } while (0)
1763
/* Turn on BIT in the class flags of WORK_AREA.  The expansion is fully
   parenthesized so that it behaves as a single expression wherever it
   is used; its value is the updated flag word.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)           \
  ((work_area).bits |= (bit))
1766
/* Append the range (RANGE_START, RANGE_END) to WORK_AREA, making room
   for the two new entries first.  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)    \
  do {                                                                  \
    EXTEND_RANGE_TABLE ((work_area), 2);                                \
    (work_area).table[(work_area).used] = (range_start);                \
    (work_area).used += 1;                                              \
    (work_area).table[(work_area).used] = (range_end);                  \
    (work_area).used += 1;                                              \
  } while (0)
1774
1775 #endif /* emacs */
1776
/* Release the memory held by WORK_AREA's table.  No null check is
   needed: freeing a null pointer is a no-op.  The table member itself
   is left untouched.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)   \
  do {                                          \
    free ((work_area).table);                   \
  } while (0)
1783
/* Forget the ranges and class flags collected so far in WORK_AREA; the
   allocation is kept.  */
#define CLEAR_RANGE_TABLE_WORK_USED(work_area) \
  ((work_area).bits = (work_area).used = 0)
/* Number of table entries in use.  */
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
/* The class flags.  */
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
/* Entry number I of the table (an lvalue).  */
#define RANGE_TABLE_WORK_ELT(work_area, i) (*((work_area).table + (i)))
1788
/* Flag bits stored in a charset's range table to implement the
   multibyte part of the character classes such as [:alnum:].  The code
   currently assumes that only the low 16 bits are used.  */
#define BIT_WORD        (1 << 0)
#define BIT_LOWER       (1 << 1)
#define BIT_PUNCT       (1 << 2)
#define BIT_SPACE       (1 << 3)
#define BIT_UPPER       (1 << 4)
#define BIT_MULTIBYTE   (1 << 5)
#define BIT_ALPHA       (1 << 6)
#define BIT_ALNUM       (1 << 7)
#define BIT_GRAPH       (1 << 8)
#define BIT_PRINT       (1 << 9)
#define BIT_BLANK       (1 << 10)

1803 \f
1804
/* Turn on the bit for character C in the bitmap that `b' points at.  */
#define SET_LIST_BIT(c) \
  (*(b + (c) / BYTEWIDTH) |= (1 << ((c) % BYTEWIDTH)))
1807
1808
1809 #ifdef emacs
1810
/* The SETUP_*_RANGE macros store the characters of the range FROM..TO
   in the bitmap at B (for ASCII and unibyte characters) and in
   WORK_AREA (for multibyte characters), translating them on the way
   and paying attention to the continuity of the translated characters.

   Implementation note: these fairly big macros would be better written
   as functions, but that is not easy because the macros they call
   assume various local variables to be declared already.  */

/* Variant for the case that both FROM and TO are ASCII characters.
   Each character is translated; a translation that is not ASCII is
   recorded in WORK_AREA as a one-character range and then converted
   with RE_CHAR_TO_UNIBYTE, falling back to the untranslated character
   when that yields a negative value.  The resulting character gets its
   bit set in the bitmap.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO)                          \
  do {                                                                  \
    int SRC_CH, DST_CH;                                                 \
                                                                        \
    for (SRC_CH = (FROM); SRC_CH <= (TO); SRC_CH++)                     \
      {                                                                 \
        DST_CH = TRANSLATE (SRC_CH);                                    \
        if (! ASCII_CHAR_P (DST_CH))                                    \
          {                                                             \
            SET_RANGE_TABLE_WORK_AREA ((work_area), DST_CH, DST_CH);    \
            DST_CH = RE_CHAR_TO_UNIBYTE (DST_CH);                       \
            if (DST_CH < 0)                                             \
              DST_CH = SRC_CH;                                          \
          }                                                             \
        SET_LIST_BIT (DST_CH);                                          \
      }                                                                 \
  } while (0)
1838
1839
/* Variant for the case that both FROM and TO are unibyte characters
   (0x80..0xFF).

   Each byte is converted with RE_CHAR_TO_MULTIBYTE.  If the result
   satisfies CHAR_BYTE8_P, only the byte's own bitmap bit is set.
   Otherwise the converted character is translated; the bitmap bit of
   the translation's unibyte form is set (or of the original byte, when
   the translation changes nothing or has no unibyte form), and the
   translated character is merged into WORK_AREA: a range added by this
   invocation that contains or directly adjoins it is widened as needed,
   else a new one-character range is appended.  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)                        \
  do {                                                                  \
    int BYTE_CH, MB_CH, XLATED_CH, MAP_CH, IDX;                         \
    /* Ranges below this index existed before; leave them alone.  */    \
    int FIRST_NEW = RANGE_TABLE_WORK_USED (work_area);                  \
                                                                        \
    for (BYTE_CH = (FROM); BYTE_CH <= (TO); BYTE_CH++)                  \
      {                                                                 \
        MB_CH = RE_CHAR_TO_MULTIBYTE (BYTE_CH);                         \
        if (CHAR_BYTE8_P (MB_CH))                                       \
          {                                                             \
            SET_LIST_BIT (BYTE_CH);                                     \
            continue;                                                   \
          }                                                             \
                                                                        \
        XLATED_CH = TRANSLATE (MB_CH);                                  \
        if (XLATED_CH == MB_CH)                                         \
          MAP_CH = BYTE_CH;                                             \
        else                                                            \
          {                                                             \
            MAP_CH = RE_CHAR_TO_UNIBYTE (XLATED_CH);                    \
            if (MAP_CH < 0)                                             \
              MAP_CH = BYTE_CH;                                         \
          }                                                             \
        SET_LIST_BIT (MAP_CH);                                          \
                                                                        \
        /* Look, newest first, for a range added by this invocation    \
           that covers or touches the translated character.  */        \
        for (IDX = RANGE_TABLE_WORK_USED (work_area) - 2;               \
             IDX >= FIRST_NEW; IDX -= 2)                                \
          {                                                             \
            int RANGE_LO = RANGE_TABLE_WORK_ELT (work_area, IDX);       \
            int RANGE_HI = RANGE_TABLE_WORK_ELT (work_area, IDX + 1);   \
                                                                        \
            if (XLATED_CH < RANGE_LO - 1 || XLATED_CH > RANGE_HI + 1)   \
              continue;                                                 \
            if (XLATED_CH == RANGE_LO - 1)                              \
              RANGE_TABLE_WORK_ELT (work_area, IDX)--;                  \
            else if (XLATED_CH == RANGE_HI + 1)                         \
              RANGE_TABLE_WORK_ELT (work_area, IDX + 1)++;              \
            break;                                                      \
          }                                                             \
        /* The search ran off the front: nothing suitable found.  */    \
        if (IDX < FIRST_NEW)                                            \
          SET_RANGE_TABLE_WORK_AREA ((work_area), XLATED_CH, XLATED_CH); \
      }                                                                 \
  } while (0)
1878
1879
/* Variant for the case that both FROM and TO are multibyte characters.

   The range FROM..TO itself is appended to WORK_AREA first.  Then, for
   every character in it: the bitmap bit is set for the unibyte form of
   its translation or, failing that and if translation changed the
   character, for the unibyte form of the character itself; and a
   translation that falls outside FROM..TO is merged into the ranges
   added by this invocation (widening a range that contains or directly
   adjoins it, or else appending a new one-character range).  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)                      \
  do {                                                                  \
    int SRC_CH, XLATED_CH, BYTE_CH, IDX;                                \
    /* Ranges below this index existed before; leave them alone.  */    \
    int FIRST_NEW = RANGE_TABLE_WORK_USED (work_area);                  \
                                                                        \
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));              \
    for (SRC_CH = (FROM); SRC_CH <= (TO); SRC_CH++)                     \
      {                                                                 \
        XLATED_CH = TRANSLATE (SRC_CH);                                 \
                                                                        \
        BYTE_CH = RE_CHAR_TO_UNIBYTE (XLATED_CH);                       \
        if (BYTE_CH < 0 && XLATED_CH != SRC_CH)                         \
          BYTE_CH = RE_CHAR_TO_UNIBYTE (SRC_CH);                        \
        if (BYTE_CH >= 0)                                               \
          SET_LIST_BIT (BYTE_CH);                                       \
                                                                        \
        /* A translation inside FROM..TO is covered already.  */        \
        if (XLATED_CH < (FROM) || XLATED_CH > (TO))                     \
          {                                                             \
            for (IDX = RANGE_TABLE_WORK_USED (work_area) - 2;           \
                 IDX >= FIRST_NEW; IDX -= 2)                            \
              {                                                         \
                int RANGE_LO = RANGE_TABLE_WORK_ELT (work_area, IDX);   \
                int RANGE_HI                                            \
                  = RANGE_TABLE_WORK_ELT (work_area, IDX + 1);          \
                                                                        \
                if (XLATED_CH < RANGE_LO - 1                            \
                    || XLATED_CH > RANGE_HI + 1)                        \
                  continue;                                             \
                if (XLATED_CH == RANGE_LO - 1)                          \
                  RANGE_TABLE_WORK_ELT (work_area, IDX)--;              \
                else if (XLATED_CH == RANGE_HI + 1)                     \
                  RANGE_TABLE_WORK_ELT (work_area, IDX + 1)++;          \
                break;                                                  \
              }                                                         \
            /* The search ran off the front: nothing suitable.  */      \
            if (IDX < FIRST_NEW)                                        \
              SET_RANGE_TABLE_WORK_AREA ((work_area),                   \
                                         XLATED_CH, XLATED_CH);         \
          }                                                             \
      }                                                                 \
  } while (0)
1913
1914 #endif /* emacs */
1915
/* Read the next unsigned decimal number of the uncompiled pattern into
   NUM.  On exit `c' holds the first character that is not a digit; NUM
   is left untouched when no digit was read at all.

   The surrounding function returns (through FREE_STACK_RETURN) with
   REG_EBRACE when the pattern ends before a non-digit is seen, and with
   REG_ESIZEBR when the number would exceed RE_DUP_MAX.  Relies on `c',
   `p' and `pend' from the enclosing scope.  */
#define GET_INTERVAL_COUNT(num)                                         \
  do {                                                                  \
    if (pend == p)                                                      \
      FREE_STACK_RETURN (REG_EBRACE);                                   \
    else                                                                \
      {                                                                 \
        PATFETCH (c);                                                   \
        while (c >= '0' && c <= '9')                                    \
          {                                                             \
            /* A negative NUM stands for "nothing read so far".  */     \
            if (num < 0)                                                \
              num = 0;                                                  \
            /* Refuse the digit if appending it would go beyond        \
               RE_DUP_MAX.  */                                          \
            if (num > RE_DUP_MAX / 10 - (c - '0' > RE_DUP_MAX % 10))    \
              FREE_STACK_RETURN (REG_ESIZEBR);                          \
            num = 10 * num + (c - '0');                                 \
            if (pend == p)                                              \
              FREE_STACK_RETURN (REG_EBRACE);                           \
            PATFETCH (c);                                               \
          }                                                             \
      }                                                                 \
  } while (0)
1937 \f
1938 #if ! WIDE_CHAR_SUPPORT
1939
1940 /* Parse a character class, i.e. string such as "[:name:]".  *strp
1941    points to the string to be parsed and limit is length, in bytes, of
1942    that string.
1943
1944    If *strp point to a string that begins with "[:name:]", where name is
1945    a non-empty sequence of lower case letters, *strp will be advanced past the
1946    closing square bracket and RECC_* constant which maps to the name will be
1947    returned.  If name is not a valid character class name zero, or RECC_ERROR,
1948    is returned.
1949
1950    Otherwise, if *strp doesn't begin with "[:name:]", -1 is returned.
1951
1952    The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
1953  */
1954 re_wctype_t
1955 re_wctype_parse (const unsigned char **strp, unsigned limit)
1956 {
1957   const char *beg = (const char *)*strp, *it;
1958
1959   if (limit < 4 || beg[0] != '[' || beg[1] != ':')
1960     return -1;
1961
1962   beg += 2;  /* skip opening "[:" */
1963   limit -= 3;  /* opening "[:" and half of closing ":]"; --limit handles rest */
1964   for (it = beg; it[0] != ':' || it[1] != ']'; ++it)
1965     if (!--limit)
1966       return -1;
1967
1968   *strp = (const unsigned char *)(it + 2);
1969
1970   /* Sort tests in the length=five case by frequency the classes to minimize
1971      number of times we fail the comparison.  The frequencies of character class
1972      names used in Emacs sources as of 2016-07-27:
1973
1974      $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
1975            sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
1976          213 [:alnum:]
1977          104 [:alpha:]
1978           62 [:space:]
1979           39 [:digit:]
1980           36 [:blank:]
1981           26 [:word:]
1982           26 [:upper:]
1983           21 [:lower:]
1984           10 [:xdigit:]
1985           10 [:punct:]
1986           10 [:ascii:]
1987            4 [:nonascii:]
1988            4 [:graph:]
1989            2 [:print:]
1990            2 [:cntrl:]
1991            1 [:ff:]
1992
1993      If you update this list, consider also updating chain of or'ed conditions
1994      in execute_charset function.
1995    */
1996
1997   switch (it - beg) {
1998   case 4:
1999     if (!memcmp (beg, "word", 4))      return RECC_WORD;
2000     break;
2001   case 5:
2002     if (!memcmp (beg, "alnum", 5))     return RECC_ALNUM;
2003     if (!memcmp (beg, "alpha", 5))     return RECC_ALPHA;
2004     if (!memcmp (beg, "space", 5))     return RECC_SPACE;
2005     if (!memcmp (beg, "digit", 5))     return RECC_DIGIT;
2006     if (!memcmp (beg, "blank", 5))     return RECC_BLANK;
2007     if (!memcmp (beg, "upper", 5))     return RECC_UPPER;
2008     if (!memcmp (beg, "lower", 5))     return RECC_LOWER;
2009     if (!memcmp (beg, "punct", 5))     return RECC_PUNCT;
2010     if (!memcmp (beg, "ascii", 5))     return RECC_ASCII;
2011     if (!memcmp (beg, "graph", 5))     return RECC_GRAPH;
2012     if (!memcmp (beg, "print", 5))     return RECC_PRINT;
2013     if (!memcmp (beg, "cntrl", 5))     return RECC_CNTRL;
2014     break;
2015   case 6:
2016     if (!memcmp (beg, "xdigit", 6))    return RECC_XDIGIT;
2017     break;
2018   case 7:
2019     if (!memcmp (beg, "unibyte", 7))   return RECC_UNIBYTE;
2020     break;
2021   case 8:
2022     if (!memcmp (beg, "nonascii", 8))  return RECC_NONASCII;
2023     break;
2024   case 9:
2025     if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
2026     break;
2027   }
2028
2029   return RECC_ERROR;
2030 }
2031
2032 /* True if CH is in the char class CC.  */
2033 boolean
2034 re_iswctype (int ch, re_wctype_t cc)
2035 {
2036   switch (cc)
2037     {
2038     case RECC_ALNUM: return ISALNUM (ch) != 0;
2039     case RECC_ALPHA: return ISALPHA (ch) != 0;
2040     case RECC_BLANK: return ISBLANK (ch) != 0;
2041     case RECC_CNTRL: return ISCNTRL (ch) != 0;
2042     case RECC_DIGIT: return ISDIGIT (ch) != 0;
2043     case RECC_GRAPH: return ISGRAPH (ch) != 0;
2044     case RECC_LOWER: return ISLOWER (ch) != 0;
2045     case RECC_PRINT: return ISPRINT (ch) != 0;
2046     case RECC_PUNCT: return ISPUNCT (ch) != 0;
2047     case RECC_SPACE: return ISSPACE (ch) != 0;
2048     case RECC_UPPER: return ISUPPER (ch) != 0;
2049     case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2050     case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
2051     case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2052     case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
2053     case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2054     case RECC_WORD: return ISWORD (ch) != 0;
2055     case RECC_ERROR: return false;
2056     default:
2057       abort ();
2058     }
2059 }
2060
2061 /* Return a bit-pattern to use in the range-table bits to match multibyte
2062    chars of class CC.  */
2063 static int
2064 re_wctype_to_bit (re_wctype_t cc)
2065 {
2066   switch (cc)
2067     {
2068     case RECC_NONASCII:
2069     case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2070     case RECC_ALPHA: return BIT_ALPHA;
2071     case RECC_ALNUM: return BIT_ALNUM;
2072     case RECC_WORD: return BIT_WORD;
2073     case RECC_LOWER: return BIT_LOWER;
2074     case RECC_UPPER: return BIT_UPPER;
2075     case RECC_PUNCT: return BIT_PUNCT;
2076     case RECC_SPACE: return BIT_SPACE;
2077     case RECC_GRAPH: return BIT_GRAPH;
2078     case RECC_PRINT: return BIT_PRINT;
2079     case RECC_BLANK: return BIT_BLANK;
2080     case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2081     case RECC_UNIBYTE: case RECC_ERROR: return 0;
2082     default:
2083       abort ();
2084     }
2085 }
2086 #endif
2087 \f
2088 /* Filling in the work area of a range.  */
2089
2090 /* Actually extend the space in WORK_AREA.  */
2091
2092 static void
2093 extend_range_table_work_area (struct range_table_work_area *work_area)
2094 {
2095   work_area->allocated += 16 * sizeof (int);
2096   work_area->table = realloc (work_area->table, work_area->allocated);
2097 }
2098
2099 #if 0
2100 #ifdef emacs
2101
2102 /* Carefully find the ranges of codes that are equivalent
2103    under case conversion to the range start..end when passed through
2104    TRANSLATE.  Handle the case where non-letters can come in between
2105    two upper-case letters (which happens in Latin-1).
2106    Also handle the case of groups of more than 2 case-equivalent chars.
2107
2108    The basic method is to look at consecutive characters and see
2109    if they can form a run that can be handled as one.
2110
2111    Returns -1 if successful, REG_ESPACE if ran out of space.  */
2112
2113 static int
2114 set_image_of_range_1 (struct range_table_work_area *work_area,
2115                       re_wchar_t start, re_wchar_t end,
2116                       RE_TRANSLATE_TYPE translate)
2117 {
2118   /* `one_case' indicates a character, or a run of characters,
2119      each of which is an isolate (no case-equivalents).
2120      This includes all ASCII non-letters.
2121
2122      `two_case' indicates a character, or a run of characters,
2123      each of which has two case-equivalent forms.
2124      This includes all ASCII letters.
2125
2126      `strange' indicates a character that has more than one
2127      case-equivalent.  */
2128
2129   enum case_type {one_case, two_case, strange};
2130
2131   /* Describe the run that is in progress,
2132      which the next character can try to extend.
2133      If run_type is strange, that means there really is no run.
2134      If run_type is one_case, then run_start...run_end is the run.
2135      If run_type is two_case, then the run is run_start...run_end,
2136      and the case-equivalents end at run_eqv_end.  */
2137
2138   enum case_type run_type = strange;
2139   int run_start, run_end, run_eqv_end;
2140
2141   Lisp_Object eqv_table;
2142
2143   if (!RE_TRANSLATE_P (translate))
2144     {
2145       EXTEND_RANGE_TABLE (work_area, 2);
2146       work_area->table[work_area->used++] = (start);
2147       work_area->table[work_area->used++] = (end);
2148       return -1;
2149     }
2150
2151   eqv_table = XCHAR_TABLE (translate)->extras[2];
2152
2153   for (; start <= end; start++)
2154     {
2155       enum case_type this_type;
2156       int eqv = RE_TRANSLATE (eqv_table, start);
2157       int minchar, maxchar;
2158
2159       /* Classify this character */
2160       if (eqv == start)
2161         this_type = one_case;
2162       else if (RE_TRANSLATE (eqv_table, eqv) == start)
2163         this_type = two_case;
2164       else
2165         this_type = strange;
2166
2167       if (start < eqv)
2168         minchar = start, maxchar = eqv;
2169       else
2170         minchar = eqv, maxchar = start;
2171
2172       /* Can this character extend the run in progress?  */
2173       if (this_type == strange || this_type != run_type
2174           || !(minchar == run_end + 1
2175                && (run_type == two_case
2176                    ? maxchar == run_eqv_end + 1 : 1)))
2177         {
2178           /* No, end the run.
2179              Record each of its equivalent ranges.  */
2180           if (run_type == one_case)
2181             {
2182               EXTEND_RANGE_TABLE (work_area, 2);
2183               work_area->table[work_area->used++] = run_start;
2184               work_area->table[work_area->used++] = run_end;
2185             }
2186           else if (run_type == two_case)
2187             {
2188               EXTEND_RANGE_TABLE (work_area, 4);
2189               work_area->table[work_area->used++] = run_start;
2190               work_area->table[work_area->used++] = run_end;
2191               work_area->table[work_area->used++]
2192                 = RE_TRANSLATE (eqv_table, run_start);
2193               work_area->table[work_area->used++]
2194                 = RE_TRANSLATE (eqv_table, run_end);
2195             }
2196           run_type = strange;
2197         }
2198
2199       if (this_type == strange)
2200         {
2201           /* For a strange character, add each of its equivalents, one
2202              by one.  Don't start a range.  */
2203           do
2204             {
2205               EXTEND_RANGE_TABLE (work_area, 2);
2206               work_area->table[work_area->used++] = eqv;
2207               work_area->table[work_area->used++] = eqv;
2208               eqv = RE_TRANSLATE (eqv_table, eqv);
2209             }
2210           while (eqv != start);
2211         }
2212
2213       /* Add this char to the run, or start a new run.  */
2214       else if (run_type == strange)
2215         {
2216           /* Initialize a new range.  */
2217           run_type = this_type;
2218           run_start = start;
2219           run_end = start;
2220           run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2221         }
2222       else
2223         {
2224           /* Extend a running range.  */
2225           run_end = minchar;
2226           run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2227         }
2228     }
2229
2230   /* If a run is still in progress at the end, finish it now
2231      by recording its equivalent ranges.  */
2232   if (run_type == one_case)
2233     {
2234       EXTEND_RANGE_TABLE (work_area, 2);
2235       work_area->table[work_area->used++] = run_start;
2236       work_area->table[work_area->used++] = run_end;
2237     }
2238   else if (run_type == two_case)
2239     {
2240       EXTEND_RANGE_TABLE (work_area, 4);
2241       work_area->table[work_area->used++] = run_start;
2242       work_area->table[work_area->used++] = run_end;
2243       work_area->table[work_area->used++]
2244         = RE_TRANSLATE (eqv_table, run_start);
2245       work_area->table[work_area->used++]
2246         = RE_TRANSLATE (eqv_table, run_end);
2247     }
2248
2249   return -1;
2250 }
2251
2252 #endif /* emacs */
2253
2254 /* Record the image of the range START..END when passed through
2255    TRANSLATE, by appending (first, last) pairs to WORK_AREA's table.
2256    The image is not necessarily TRANSLATE(start)..TRANSLATE(end)
2257    and is not even necessarily contiguous.  Normally we record
2258    START..END itself plus the smallest contiguous range containing
2259    every translated char that falls outside it.  However, for Latin-1
2260    we go to extra effort (set_image_of_range_1) to do a better job.
2261    This function is not called for ASCII ranges.
2262
2263    Returns -1 if successful, REG_ESPACE if it ran out of space.  */
2264
2265 static int
2266 set_image_of_range (struct range_table_work_area *work_area,
2267                     re_wchar_t start, re_wchar_t end,
2268                     RE_TRANSLATE_TYPE translate)
2269 {
2270   re_wchar_t cmin, cmax;  /* Bounds of the images lying outside START..END.  */
2271
2272 #ifdef emacs
2273   /* For Latin-1 ranges, use set_image_of_range_1
2274      to get proper handling of ranges that include letters and nonletters.
2275      For a range that includes the whole of Latin-1, this is not necessary.
2276      For other character sets, we don't bother to get this right.  */
2277   if (RE_TRANSLATE_P (translate) && start < 04400
2278       && !(start < 04200 && end >= 04377))
2279     {
2280       int newend;
2281       int tem;
2282       newend = end;
2283       if (newend > 04377)  /* Hand at most START..04377 to the helper.  */
2284         newend = 04377;
2285       tem = set_image_of_range_1 (work_area, start, newend, translate);
2286       if (tem > 0)  /* set_image_of_range_1 yields -1 on success.  */
2287         return tem;
2288
2289       start = 04400;  /* What remains, if anything, is handled below.  */
2290       if (end < 04400)
2291         return -1;
2292     }
2293 #endif
2294   /* Record START..END itself.  */
2295   EXTEND_RANGE_TABLE (work_area, 2);
2296   work_area->table[work_area->used++] = (start);
2297   work_area->table[work_area->used++] = (end);
2298
2299   cmin = -1, cmax = -1;  /* -1 means no outside image has been seen yet.  */
2300
2301   if (RE_TRANSLATE_P (translate))
2302     {
2303       int ch;
2304
2305       for (ch = start; ch <= end; ch++)
2306         {
2307           re_wchar_t c = TRANSLATE (ch);
2308           if (! (start <= c && c <= end))  /* Image falls outside the range.  */
2309             {
2310               if (cmin == -1)
2311                 cmin = c, cmax = c;
2312               else
2313                 {
2314                   cmin = min (cmin, c);
2315                   cmax = max (cmax, c);
2316                 }
2317             }
2318         }
2319
2320       if (cmin != -1)  /* Record one range spanning all outside images.  */
2321         {
2322           EXTEND_RANGE_TABLE (work_area, 2);
2323           work_area->table[work_area->used++] = (cmin);
2324           work_area->table[work_area->used++] = (cmax);
2325         }
2326     }
2327
2328   return -1;
2329 }
2330 #endif  /* 0 */
2331 \f
2332 #ifndef MATCH_MAY_ALLOCATE
2333
2334 /* If we cannot allocate large objects within re_match_2_internal,
2335    we make the fail stack and register vectors global.
2336    The fail stack is grown to its maximum size when a regexp
2337    is compiled.
2338    The register vectors are resized each time a regexp is compiled,
2339    according to the number of registers it needs.  */
2340
2341 static fail_stack_type fail_stack;
2342
2343 /* Size, as a register count, with which the following vectors are
2344    currently allocated.  Tracked so that regex_grow_registers can make
2345    them bigger as needed, but never smaller.  */
2346 static int regs_allocated_size;
2347
2348 static re_char **     regstart, **     regend;
2349 static re_char **best_regstart, **best_regend;
2350
2351 /* Make the four global register vectors big enough for NUM_REGS
2352    registers, but don't make them smaller.  Returns nothing.  */
2353
2354 static void
2355 regex_grow_registers (int num_regs)
2356 {
2357   if (num_regs > regs_allocated_size)  /* Grow only; never shrink.  */
2358     {
2359       RETALLOC_IF (regstart,     num_regs, re_char *);
2360       RETALLOC_IF (regend,       num_regs, re_char *);
2361       RETALLOC_IF (best_regstart, num_regs, re_char *);
2362       RETALLOC_IF (best_regend,  num_regs, re_char *);
2363       /* NOTE(review): no allocation-failure check is visible here -- confirm.  */
2364       regs_allocated_size = num_regs;
2365     }
2366 }
2367
2368 #endif /* not MATCH_MAY_ALLOCATE */
2369 \f
2370 static boolean group_in_compile_stack (compile_stack_type compile_stack,
2371                                        regnum_t regnum);
2372
2373 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2374    Returns one of the error codes defined in `regex.h', or zero for success.
2375
2376    If WHITESPACE_REGEXP is given (only #ifdef emacs), it is used instead of
2377    a space character in PATTERN.
2378
2379    Assumes the `allocated' (and perhaps `buffer') and `translate'
2380    fields are set in BUFP on entry.
2381
2382    If it succeeds, results are put in BUFP (if it returns an error, the
2383    contents of BUFP are undefined):
2384      `buffer' is the compiled pattern;
2385      `syntax' is set to SYNTAX;
2386      `used' is set to the length of the compiled pattern;
2387      `fastmap_accurate' is zero;
2388      `re_nsub' is the number of subexpressions in PATTERN;
2389      `not_bol' and `not_eol' are zero;
2390
2391    The `fastmap' field is neither examined nor set.  */
2392
2393 /* Store the `jump' from the end of the last alternative to "here" (B),
2394    if one is pending.  The space for the jump has already been allocated.  */
2395 #define FIXUP_ALT_JUMP()                                                \
2396 do {                                                                    \
2397   if (fixup_alt_jump)                                                   \
2398     STORE_JUMP (jump, fixup_alt_jump, b);                               \
2399 } while (0)
2400
2401
2402 /* Return VALUE after freeing the range-table work area and compile stack.  */
2403 #define FREE_STACK_RETURN(value)                \
2404   do {                                                  \
2405     FREE_RANGE_TABLE_WORK_AREA (range_table_work);      \
2406     free (compile_stack.stack);                         \
2407     return value;                                       \
2408   } while (0)
2409
2410 static reg_errcode_t
2411 regex_compile (re_char *pattern, size_t size,
2412 #ifdef emacs
2413 # define syntax RE_SYNTAX_EMACS
2414                bool posix_backtracking,
2415                const char *whitespace_regexp,
2416 #else
2417                reg_syntax_t syntax,
2418 # define posix_backtracking (!(syntax & RE_NO_POSIX_BACKTRACKING))
2419 #endif
2420                struct re_pattern_buffer *bufp)
2421 {
2422   /* We fetch characters from PATTERN here.  */
2423   register re_wchar_t c, c1;
2424
2425   /* Points to the end of the buffer, where we should append.  */
2426   register unsigned char *b;
2427
2428   /* Keeps track of unclosed groups.  */
2429   compile_stack_type compile_stack;
2430
2431   /* Points to the current (ending) position in the pattern.  */
2432 #ifdef AIX
2433   /* `const' makes AIX compiler fail.  */
2434   unsigned char *p = pattern;
2435 #else
2436   re_char *p = pattern;
2437 #endif
2438   re_char *pend = pattern + size;
2439
2440   /* How to translate the characters in the pattern.  */
2441   RE_TRANSLATE_TYPE translate = bufp->translate;
2442
2443   /* Address of the count-byte of the most recently inserted `exactn'
2444      command.  This makes it possible to tell if a new exact-match
2445      character can be added to that command or if the character requires
2446      a new `exactn' command.  */
2447   unsigned char *pending_exact = 0;
2448
2449   /* Address of start of the most recently finished expression.
2450      This tells, e.g., postfix * where to find the start of its
2451      operand.  Reset at the beginning of groups and alternatives.  */
2452   unsigned char *laststart = 0;
2453
2454   /* Address of beginning of regexp, or inside of last group.  */
2455   unsigned char *begalt;
2456
2457   /* Place in the uncompiled pattern (i.e., the {) to
2458      which to go back if the interval is invalid.  */
2459   re_char *beg_interval;
2460
2461   /* Address of the place where a forward jump should go to the end of
2462      the containing expression.  Each alternative of an `or' -- except the
2463      last -- ends with a forward jump of this sort.  */
2464   unsigned char *fixup_alt_jump = 0;
2465
2466   /* Work area for range table of charset.  */
2467   struct range_table_work_area range_table_work;
2468
2469   /* If the object matched can contain multibyte characters.  */
2470   const boolean multibyte = RE_MULTIBYTE_P (bufp);
2471
2472 #ifdef emacs
2473   /* Nonzero if we have pushed down into a subpattern.  */
2474   int in_subpattern = 0;
2475
2476   /* These hold the values of p, pattern, and pend from the main
2477      pattern when we have pushed into a subpattern.  */
2478   re_char *main_p;
2479   re_char *main_pattern;
2480   re_char *main_pend;
2481 #endif
2482
2483 #ifdef DEBUG
2484   debug++;
2485   DEBUG_PRINT ("\nCompiling pattern: ");
2486   if (debug > 0)
2487     {
2488       unsigned debug_count;
2489
2490       for (debug_count = 0; debug_count < size; debug_count++)
2491         putchar (pattern[debug_count]);
2492       putchar ('\n');
2493     }
2494 #endif /* DEBUG */
2495
2496   /* Initialize the compile stack.  */
2497   compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2498   if (compile_stack.stack == NULL)
2499     return REG_ESPACE;
2500
2501   compile_stack.size = INIT_COMPILE_STACK_SIZE;
2502   compile_stack.avail = 0;
2503
2504   range_table_work.table = 0;
2505   range_table_work.allocated = 0;
2506
2507   /* Initialize the pattern buffer.  */
2508 #ifndef emacs
2509   bufp->syntax = syntax;
2510 #endif
2511   bufp->fastmap_accurate = 0;
2512   bufp->not_bol = bufp->not_eol = 0;
2513   bufp->used_syntax = 0;
2514
2515   /* Set `used' to zero, so that if we return an error, the pattern
2516      printer (for debugging) will think there's no pattern.  We reset it
2517      at the end.  */
2518   bufp->used = 0;
2519
2520   /* Always count groups, whether or not bufp->no_sub is set.  */
2521   bufp->re_nsub = 0;
2522
2523 #if !defined emacs && !defined SYNTAX_TABLE
2524   /* Initialize the syntax table.  */
2525    init_syntax_once ();
2526 #endif
2527
2528   if (bufp->allocated == 0)
2529     {
2530       if (bufp->buffer)
2531         { /* If zero allocated, but buffer is non-null, try to realloc
2532              enough space.  This loses if buffer's address is bogus, but
2533              that is the user's responsibility.  */
2534           RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2535         }
2536       else
2537         { /* Caller did not allocate a buffer.  Do it for them.  */
2538           bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2539         }
2540       if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2541
2542       bufp->allocated = INIT_BUF_SIZE;
2543     }
2544
2545   begalt = b = bufp->buffer;
2546
2547   /* Loop through the uncompiled pattern until we're at the end.  */
2548   while (1)
2549     {
2550       if (p == pend)
2551         {
2552 #ifdef emacs
2553           /* If this is the end of an included regexp,
2554              pop back to the main regexp and try again.  */
2555           if (in_subpattern)
2556             {
2557               in_subpattern = 0;
2558               pattern = main_pattern;
2559               p = main_p;
2560               pend = main_pend;
2561               continue;
2562             }
2563 #endif
2564           /* If this is the end of the main regexp, we are done.  */
2565           break;
2566         }
2567
2568       PATFETCH (c);
2569
2570       switch (c)
2571         {
2572 #ifdef emacs
2573         case ' ':
2574           {
2575             re_char *p1 = p;
2576
2577             /* If there's no special whitespace regexp, treat
2578                spaces normally.  And don't try to do this recursively.  */
2579             if (!whitespace_regexp || in_subpattern)
2580               goto normal_char;
2581
2582             /* Peek past following spaces.  */
2583             while (p1 != pend)
2584               {
2585                 if (*p1 != ' ')
2586                   break;
2587                 p1++;
2588               }
2589             /* If the spaces are followed by a repetition op,
2590                treat them normally.  */
2591             if (p1 != pend
2592                 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
2593                     || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2594               goto normal_char;
2595
2596             /* Replace the spaces with the whitespace regexp.  */
2597             in_subpattern = 1;
2598             main_p = p1;
2599             main_pend = pend;
2600             main_pattern = pattern;
2601             p = pattern = (re_char *) whitespace_regexp;
2602             pend = p + strlen (whitespace_regexp);
2603             break;
2604           }
2605 #endif
2606
2607         case '^':
2608           {
2609             if (   /* If at start of pattern, it's an operator.  */
2610                    p == pattern + 1
2611                    /* If context independent, it's an operator.  */
2612                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2613                    /* Otherwise, depends on what's come before.  */
2614                 || at_begline_loc_p (pattern, p, syntax))
2615               BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
2616             else
2617               goto normal_char;
2618           }
2619           break;
2620
2621
2622         case '$':
2623           {
2624             if (   /* If at end of pattern, it's an operator.  */
2625                    p == pend
2626                    /* If context independent, it's an operator.  */
2627                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2628                    /* Otherwise, depends on what's next.  */
2629                 || at_endline_loc_p (p, pend, syntax))
2630                BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
2631              else
2632                goto normal_char;
2633            }
2634            break;
2635
2636
2637         case '+':
2638         case '?':
2639           if ((syntax & RE_BK_PLUS_QM)
2640               || (syntax & RE_LIMITED_OPS))
2641             goto normal_char;
2642           FALLTHROUGH;
2643         case '*':
2644         handle_plus:
2645           /* If there is no previous pattern...  */
2646           if (!laststart)
2647             {
2648               if (syntax & RE_CONTEXT_INVALID_OPS)
2649                 FREE_STACK_RETURN (REG_BADRPT);
2650               else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2651                 goto normal_char;
2652             }
2653
2654           {
2655             /* 1 means zero (many) matches is allowed.  */
2656             boolean zero_times_ok = 0, many_times_ok = 0;
2657             boolean greedy = 1;
2658
2659             /* If there is a sequence of repetition chars, collapse it
2660                down to just one (the right one).  We can't combine
2661                interval operators with these because of, e.g., `a{2}*',
2662                which should only match an even number of `a's.  */
2663
2664             for (;;)
2665               {
2666                 if ((syntax & RE_FRUGAL)
2667                     && c == '?' && (zero_times_ok || many_times_ok))
2668                   greedy = 0;
2669                 else
2670                   {
2671                     zero_times_ok |= c != '+';
2672                     many_times_ok |= c != '?';
2673                   }
2674
2675                 if (p == pend)
2676                   break;
2677                 else if (*p == '*'
2678                          || (!(syntax & RE_BK_PLUS_QM)
2679                              && (*p == '+' || *p == '?')))
2680                   ;
2681                 else if (syntax & RE_BK_PLUS_QM  && *p == '\\')
2682                   {
2683                     if (p+1 == pend)
2684                       FREE_STACK_RETURN (REG_EESCAPE);
2685                     if (p[1] == '+' || p[1] == '?')
2686                       PATFETCH (c); /* Gobble up the backslash.  */
2687                     else
2688                       break;
2689                   }
2690                 else
2691                   break;
2692                 /* If we get here, we found another repeat character.  */
2693                 PATFETCH (c);
2694                }
2695
2696             /* Star, etc. applied to an empty pattern is equivalent
2697                to an empty pattern.  */
2698             if (!laststart || laststart == b)
2699               break;
2700
2701             /* Now we know whether or not zero matches is allowed
2702                and also whether or not two or more matches is allowed.  */
2703             if (greedy)
2704               {
2705                 if (many_times_ok)
2706                   {
2707                     boolean simple = skip_one_char (laststart) == b;
2708                     size_t startoffset = 0;
2709                     re_opcode_t ofj =
2710                       /* Check if the loop can match the empty string.  */
2711                       (simple || !analyze_first (laststart, b, NULL, 0))
2712                       ? on_failure_jump : on_failure_jump_loop;
2713                     assert (skip_one_char (laststart) <= b);
2714
2715                     if (!zero_times_ok && simple)
2716                       { /* Since simple * loops can be made faster by using
2717                            on_failure_keep_string_jump, we turn simple P+
2718                            into PP* if P is simple.  */
2719                         unsigned char *p1, *p2;
2720                         startoffset = b - laststart;
2721                         GET_BUFFER_SPACE (startoffset);
2722                         p1 = b; p2 = laststart;
2723                         while (p2 < p1)
2724                           *b++ = *p2++;
2725                         zero_times_ok = 1;
2726                       }
2727
2728                     GET_BUFFER_SPACE (6);
2729                     if (!zero_times_ok)
2730                       /* A + loop.  */
2731                       STORE_JUMP (ofj, b, b + 6);
2732                     else
2733                       /* Simple * loops can use on_failure_keep_string_jump
2734                          depending on what follows.  But since we don't know
2735                          that yet, we leave the decision up to
2736                          on_failure_jump_smart.  */
2737                       INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2738                                    laststart + startoffset, b + 6);
2739                     b += 3;
2740                     STORE_JUMP (jump, b, laststart + startoffset);
2741                     b += 3;
2742                   }
2743                 else
2744                   {
2745                     /* A simple ? pattern.  */
2746                     assert (zero_times_ok);
2747                     GET_BUFFER_SPACE (3);
2748                     INSERT_JUMP (on_failure_jump, laststart, b + 3);
2749                     b += 3;
2750                   }
2751               }
2752             else                /* not greedy */
2753               { /* I wish the greedy and non-greedy cases could be merged.  */
2754
2755                 GET_BUFFER_SPACE (7); /* We might use less.  */
2756                 if (many_times_ok)
2757                   {
2758                     boolean emptyp = analyze_first (laststart, b, NULL, 0);
2759
2760                     /* The non-greedy multiple match looks like
2761                        a repeat..until: we only need a conditional jump
2762                        at the end of the loop.  */
2763                     if (emptyp) BUF_PUSH (no_op);
2764                     STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2765                                 : on_failure_jump, b, laststart);
2766                     b += 3;
2767                     if (zero_times_ok)
2768                       {
2769                         /* The repeat...until naturally matches one or more.
2770                            To also match zero times, we need to first jump to
2771                            the end of the loop (its conditional jump).  */
2772                         INSERT_JUMP (jump, laststart, b);
2773                         b += 3;
2774                       }
2775                   }
2776                 else
2777                   {
2778                     /* non-greedy a?? */
2779                     INSERT_JUMP (jump, laststart, b + 3);
2780                     b += 3;
2781                     INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2782                     b += 3;
2783                   }
2784               }
2785           }
2786           pending_exact = 0;
2787           break;
2788
2789
2790         case '.':
2791           laststart = b;
2792           BUF_PUSH (anychar);
2793           break;
2794
2795
2796         case '[':
2797           {
2798             re_char *p1;
2799
2800             CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2801
2802             if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2803
2804             /* Ensure that we have enough space to push a charset: the
2805                opcode, the length count, and the bitset; 34 bytes in all.  */
2806             GET_BUFFER_SPACE (34);
2807
2808             laststart = b;
2809
2810             /* We test `*p == '^' twice, instead of using an if
2811                statement, so we only need one BUF_PUSH.  */
2812             BUF_PUSH (*p == '^' ? charset_not : charset);
2813             if (*p == '^')
2814               p++;
2815
2816             /* Remember the first position in the bracket expression.  */
2817             p1 = p;
2818
2819             /* Push the number of bytes in the bitmap.  */
2820             BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2821
2822             /* Clear the whole map.  */
2823             memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2824
2825             /* charset_not matches newline according to a syntax bit.  */
2826             if ((re_opcode_t) b[-2] == charset_not
2827                 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2828               SET_LIST_BIT ('\n');
2829
2830             /* Read in characters and ranges, setting map bits.  */
2831             for (;;)
2832               {
2833                 boolean escaped_char = false;
2834                 const unsigned char *p2 = p;
2835                 re_wctype_t cc;
2836                 re_wchar_t ch;
2837
2838                 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2839
2840                 /* See if we're at the beginning of a possible character
2841                    class.  */
2842                 if (syntax & RE_CHAR_CLASSES &&
2843                     (cc = re_wctype_parse(&p, pend - p)) != -1)
2844                   {
2845                     if (cc == 0)
2846                       FREE_STACK_RETURN (REG_ECTYPE);
2847
2848                     if (p == pend)
2849                       FREE_STACK_RETURN (REG_EBRACK);
2850
2851 #ifndef emacs
2852                     for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2853                       if (re_iswctype (btowc (ch), cc))
2854                         {
2855                           c = TRANSLATE (ch);
2856                           if (c < (1 << BYTEWIDTH))
2857                             SET_LIST_BIT (c);
2858                         }
2859 #else  /* emacs */
2860                     /* Most character classes in a multibyte match just set
2861                        a flag.  Exceptions are is_blank, is_digit, is_cntrl, and
2862                        is_xdigit, since they can only match ASCII characters.
2863                        We don't need to handle them for multibyte.  */
2864
2865                     /* Setup the gl_state object to its buffer-defined value.
2866                        This hardcodes the buffer-global syntax-table for ASCII
2867                        chars, while the other chars will obey syntax-table
2868                        properties.  It's not ideal, but it's the way it's been
2869                        done until now.  */
2870                     SETUP_BUFFER_SYNTAX_TABLE ();
2871
2872                     for (c = 0; c < 0x80; ++c)
2873                       if (re_iswctype (c, cc))
2874                         {
2875                           SET_LIST_BIT (c);
2876                           c1 = TRANSLATE (c);
2877                           if (c1 == c)
2878                             continue;
2879                           if (ASCII_CHAR_P (c1))
2880                             SET_LIST_BIT (c1);
2881                           else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2882                             SET_LIST_BIT (c1);
2883                         }
2884                     SET_RANGE_TABLE_WORK_AREA_BIT
2885                       (range_table_work, re_wctype_to_bit (cc));
2886 #endif  /* emacs */
2887                     /* In most cases the matching rule for char classes only
2888                        uses the syntax table for multibyte chars, so that the
2889                        content of the syntax-table is not hardcoded in the
2890                        range_table.  SPACE and WORD are the two exceptions.  */
2891                     if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2892                       bufp->used_syntax = 1;
2893
2894                     /* Repeat the loop. */
2895                     continue;
2896                   }
2897
2898                 /* Don't translate yet.  The range TRANSLATE(X..Y) cannot
2899                    always be determined from TRANSLATE(X) and TRANSLATE(Y)
2900                    So the translation is done later in a loop.  Example:
2901                    (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
2902                 PATFETCH (c);
2903
2904                 /* \ might escape characters inside [...] and [^...].  */
2905                 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2906                   {
2907                     if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2908
2909                     PATFETCH (c);
2910                     escaped_char = true;
2911                   }
2912                 else
2913                   {
2914                     /* Could be the end of the bracket expression.  If it's
2915                        not (i.e., when the bracket expression is `[]' so
2916                        far), the ']' character bit gets set way below.  */
2917                     if (c == ']' && p2 != p1)
2918                       break;
2919                   }
2920
2921                 if (p < pend && p[0] == '-' && p[1] != ']')
2922                   {
2923
2924                     /* Discard the `-'. */
2925                     PATFETCH (c1);
2926
2927                     /* Fetch the character which ends the range. */
2928                     PATFETCH (c1);
2929 #ifdef emacs
2930                     if (CHAR_BYTE8_P (c1)
2931                         && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2932                       /* Treat the range from a multibyte character to
2933                          raw-byte character as empty.  */
2934                       c = c1 + 1;
2935 #endif  /* emacs */
2936                   }
2937                 else
2938                   /* Range from C to C. */
2939                   c1 = c;
2940
2941                 if (c > c1)
2942                   {
2943                     if (syntax & RE_NO_EMPTY_RANGES)
2944                       FREE_STACK_RETURN (REG_ERANGEX);
2945                     /* Else, repeat the loop.  */
2946                   }
2947                 else
2948                   {
2949 #ifndef emacs
2950                     /* Set the range into bitmap */
2951                     for (; c <= c1; c++)
2952                       {
2953                         ch = TRANSLATE (c);
2954                         if (ch < (1 << BYTEWIDTH))
2955                           SET_LIST_BIT (ch);
2956                       }
2957 #else  /* emacs */
2958                     if (c < 128)
2959                       {
2960                         ch = min (127, c1);
2961                         SETUP_ASCII_RANGE (range_table_work, c, ch);
2962                         c = ch + 1;
2963                         if (CHAR_BYTE8_P (c1))
2964                           c = BYTE8_TO_CHAR (128);
2965                       }
2966                     if (c <= c1)
2967                       {
2968                         if (CHAR_BYTE8_P (c))
2969                           {
2970                             c = CHAR_TO_BYTE8 (c);
2971                             c1 = CHAR_TO_BYTE8 (c1);
2972                             for (; c <= c1; c++)
2973                               SET_LIST_BIT (c);
2974                           }
2975                         else if (multibyte)
2976                           {
2977                             SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
2978                           }
2979                         else
2980                           {
2981                             SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
2982                           }
2983                       }
2984 #endif /* emacs */
2985                   }
2986               }
2987
2988             /* Discard any (non)matching list bytes that are all 0 at the
2989                end of the map.  Decrease the map-length byte too.  */
2990             while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2991               b[-1]--;
2992             b += b[-1];
2993
2994             /* Build real range table from work area.  */
2995             if (RANGE_TABLE_WORK_USED (range_table_work)
2996                 || RANGE_TABLE_WORK_BITS (range_table_work))
2997               {
2998                 int i;
2999                 int used = RANGE_TABLE_WORK_USED (range_table_work);
3000
3001                 /* Allocate space for COUNT + RANGE_TABLE.  Needs two
3002                    bytes for flags, two for COUNT, and three bytes for
3003                    each character.  */
3004                 GET_BUFFER_SPACE (4 + used * 3);
3005
3006                 /* Indicate the existence of range table.  */
3007                 laststart[1] |= 0x80;
3008
3009                 /* Store the character class flag bits into the range table.
3010                    If not in emacs, these flag bits are always 0.  */
3011                 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3012                 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3013
3014                 STORE_NUMBER_AND_INCR (b, used / 2);
3015                 for (i = 0; i < used; i++)
3016                   STORE_CHARACTER_AND_INCR
3017                     (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3018               }
3019           }
3020           break;
3021
3022
3023         case '(':
3024           if (syntax & RE_NO_BK_PARENS)
3025             goto handle_open;
3026           else
3027             goto normal_char;
3028
3029
3030         case ')':
3031           if (syntax & RE_NO_BK_PARENS)
3032             goto handle_close;
3033           else
3034             goto normal_char;
3035
3036
3037         case '\n':
3038           if (syntax & RE_NEWLINE_ALT)
3039             goto handle_alt;
3040           else
3041             goto normal_char;
3042
3043
3044         case '|':
3045           if (syntax & RE_NO_BK_VBAR)
3046             goto handle_alt;
3047           else
3048             goto normal_char;
3049
3050
3051         case '{':
3052            if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3053              goto handle_interval;
3054            else
3055              goto normal_char;
3056
3057
3058         case '\\':
3059           if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3060
3061           /* Do not translate the character after the \, so that we can
3062              distinguish, e.g., \B from \b, even if we normally would
3063              translate, e.g., B to b.  */
3064           PATFETCH (c);
3065
3066           switch (c)
3067             {
3068             case '(':
3069               if (syntax & RE_NO_BK_PARENS)
3070                 goto normal_backslash;
3071
3072             handle_open:
3073               {
3074                 int shy = 0;
3075                 regnum_t regnum = 0;
3076                 if (p+1 < pend)
3077                   {
3078                     /* Look for a special (?...) construct */
3079                     if ((syntax & RE_SHY_GROUPS) && *p == '?')
3080                       {
3081                         PATFETCH (c); /* Gobble up the '?'.  */
3082                         while (!shy)
3083                           {
3084                             PATFETCH (c);
3085                             switch (c)
3086                               {
3087                               case ':': shy = 1; break;
3088                               case '0':
3089                                 /* An explicitly specified regnum must start
3090                                    with non-0. */
3091                                 if (regnum == 0)
3092                                   FREE_STACK_RETURN (REG_BADPAT);
3093                                 FALLTHROUGH;
3094                               case '1': case '2': case '3': case '4':
3095                               case '5': case '6': case '7': case '8': case '9':
3096                                 regnum = 10*regnum + (c - '0'); break;
3097                               default:
3098                                 /* Only (?:...) is supported right now. */
3099                                 FREE_STACK_RETURN (REG_BADPAT);
3100                               }
3101                           }
3102                       }
3103                   }
3104
3105                 if (!shy)
3106                   regnum = ++bufp->re_nsub;
3107                 else if (regnum)
3108                   { /* It's actually not shy, but explicitly numbered.  */
3109                     shy = 0;
3110                     if (regnum > bufp->re_nsub)
3111                       bufp->re_nsub = regnum;
3112                     else if (regnum > bufp->re_nsub
3113                              /* Ideally, we'd want to check that the specified
3114                                 group can't have matched (i.e. all subgroups
3115                                 using the same regnum are in other branches of
3116                                 OR patterns), but we don't currently keep track
3117                                 of enough info to do that easily.  */
3118                              || group_in_compile_stack (compile_stack, regnum))
3119                       FREE_STACK_RETURN (REG_BADPAT);
3120                   }
3121                 else
3122                   /* It's really shy.  */
3123                   regnum = - bufp->re_nsub;
3124
3125                 if (COMPILE_STACK_FULL)
3126                   {
3127                     RETALLOC (compile_stack.stack, compile_stack.size << 1,
3128                               compile_stack_elt_t);
3129                     if (compile_stack.stack == NULL) return REG_ESPACE;
3130
3131                     compile_stack.size <<= 1;
3132                   }
3133
3134                 /* These are the values to restore when we hit end of this
3135                    group.  They are all relative offsets, so that if the
3136                    whole pattern moves because of realloc, they will still
3137                    be valid.  */
3138                 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3139                 COMPILE_STACK_TOP.fixup_alt_jump
3140                   = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3141                 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3142                 COMPILE_STACK_TOP.regnum = regnum;
3143
3144                 /* Do not push a start_memory for groups beyond the last one
3145                    we can represent in the compiled pattern.  */
3146                 if (regnum <= MAX_REGNUM && regnum > 0)
3147                   BUF_PUSH_2 (start_memory, regnum);
3148
3149                 compile_stack.avail++;
3150
3151                 fixup_alt_jump = 0;
3152                 laststart = 0;
3153                 begalt = b;
3154                 /* If we've reached MAX_REGNUM groups, then this open
3155                    won't actually generate any code, so we'll have to
3156                    clear pending_exact explicitly.  */
3157                 pending_exact = 0;
3158                 break;
3159               }
3160
3161             case ')':
3162               if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3163
3164               if (COMPILE_STACK_EMPTY)
3165                 {
3166                   if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3167                     goto normal_backslash;
3168                   else
3169                     FREE_STACK_RETURN (REG_ERPAREN);
3170                 }
3171
3172             handle_close:
3173               FIXUP_ALT_JUMP ();
3174
3175               /* See similar code for backslashed left paren above.  */
3176               if (COMPILE_STACK_EMPTY)
3177                 {
3178                   if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3179                     goto normal_char;
3180                   else
3181                     FREE_STACK_RETURN (REG_ERPAREN);
3182                 }
3183
3184               /* Since we just checked for an empty stack above, this
3185                  ``can't happen''.  */
3186               assert (compile_stack.avail != 0);
3187               {
3188                 /* We don't just want to restore into `regnum', because
3189                    later groups should continue to be numbered higher,
3190                    as in `(ab)c(de)' -- the second group is #2.  */
3191                 regnum_t regnum;
3192
3193                 compile_stack.avail--;
3194                 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3195                 fixup_alt_jump
3196                   = COMPILE_STACK_TOP.fixup_alt_jump
3197                     ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3198                     : 0;
3199                 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3200                 regnum = COMPILE_STACK_TOP.regnum;
3201                 /* If we've reached MAX_REGNUM groups, then this open
3202                    won't actually generate any code, so we'll have to
3203                    clear pending_exact explicitly.  */
3204                 pending_exact = 0;
3205
3206                 /* We're at the end of the group, so now we know how many
3207                    groups were inside this one.  */
3208                 if (regnum <= MAX_REGNUM && regnum > 0)
3209                   BUF_PUSH_2 (stop_memory, regnum);
3210               }
3211               break;
3212
3213
3214             case '|':                                   /* `\|'.  */
3215               if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3216                 goto normal_backslash;
3217             handle_alt:
3218               if (syntax & RE_LIMITED_OPS)
3219                 goto normal_char;
3220
3221               /* Insert before the previous alternative a jump which
3222                  jumps to this alternative if the former fails.  */
3223               GET_BUFFER_SPACE (3);
3224               INSERT_JUMP (on_failure_jump, begalt, b + 6);
3225               pending_exact = 0;
3226               b += 3;
3227
3228               /* The alternative before this one has a jump after it
3229                  which gets executed if it gets matched.  Adjust that
3230                  jump so it will jump to this alternative's analogous
3231                  jump (put in below, which in turn will jump to the next
3232                  (if any) alternative's such jump, etc.).  The last such
3233                  jump jumps to the correct final destination.  A picture:
3234                           _____ _____
3235                           |   | |   |
3236                           |   v |   v
3237                         a | b   | c
3238
3239                  If we are at `b', then fixup_alt_jump right now points to a
3240                  three-byte space after `a'.  We'll put in the jump, set
3241                  fixup_alt_jump to right after `b', and leave behind three
3242                  bytes which we'll fill in when we get to after `c'.  */
3243
3244               FIXUP_ALT_JUMP ();
3245
3246               /* Mark and leave space for a jump after this alternative,
3247                  to be filled in later either by next alternative or
3248                  when know we're at the end of a series of alternatives.  */
3249               fixup_alt_jump = b;
3250               GET_BUFFER_SPACE (3);
3251               b += 3;
3252
3253               laststart = 0;
3254               begalt = b;
3255               break;
3256
3257
3258             case '{':
3259               /* If \{ is a literal.  */
3260               if (!(syntax & RE_INTERVALS)
3261                      /* If we're at `\{' and it's not the open-interval
3262                         operator.  */
3263                   || (syntax & RE_NO_BK_BRACES))
3264                 goto normal_backslash;
3265
3266             handle_interval:
3267               {
3268                 /* If got here, then the syntax allows intervals.  */
3269
3270                 /* At least (most) this many matches must be made.  */
3271                 int lower_bound = 0, upper_bound = -1;
3272
3273                 beg_interval = p;
3274
3275                 GET_INTERVAL_COUNT (lower_bound);
3276
3277                 if (c == ',')
3278                   GET_INTERVAL_COUNT (upper_bound);
3279                 else
3280                   /* Interval such as `{1}' => match exactly once. */
3281                   upper_bound = lower_bound;
3282
3283                 if (lower_bound < 0
3284                     || (0 <= upper_bound && upper_bound < lower_bound))
3285                   FREE_STACK_RETURN (REG_BADBR);
3286
3287                 if (!(syntax & RE_NO_BK_BRACES))
3288                   {
3289                     if (c != '\\')
3290                       FREE_STACK_RETURN (REG_BADBR);
3291                     if (p == pend)
3292                       FREE_STACK_RETURN (REG_EESCAPE);
3293                     PATFETCH (c);
3294                   }
3295
3296                 if (c != '}')
3297                   FREE_STACK_RETURN (REG_BADBR);
3298
3299                 /* We just parsed a valid interval.  */
3300
3301                 /* If it's invalid to have no preceding re.  */
3302                 if (!laststart)
3303                   {
3304                     if (syntax & RE_CONTEXT_INVALID_OPS)
3305                       FREE_STACK_RETURN (REG_BADRPT);
3306                     else if (syntax & RE_CONTEXT_INDEP_OPS)
3307                       laststart = b;
3308                     else
3309                       goto unfetch_interval;
3310                   }
3311
3312                 if (upper_bound == 0)
3313                   /* If the upper bound is zero, just drop the sub pattern
3314                      altogether.  */
3315                   b = laststart;
3316                 else if (lower_bound == 1 && upper_bound == 1)
3317                   /* Just match it once: nothing to do here.  */
3318                   ;
3319
3320                 /* Otherwise, we have a nontrivial interval.  When
3321                    we're all done, the pattern will look like:
3322                    set_number_at <jump count> <upper bound>
3323                    set_number_at <succeed_n count> <lower bound>
3324                    succeed_n <after jump addr> <succeed_n count>
3325                    <body of loop>
3326                    jump_n <succeed_n addr> <jump count>
3327                    (The upper bound and `jump_n' are omitted if
3328                    `upper_bound' is 1, though.)  */
3329                 else
3330                   { /* If the upper bound is > 1, we need to insert
3331                        more at the end of the loop.  */
3332                     unsigned int nbytes = (upper_bound < 0 ? 3
3333                                            : upper_bound > 1 ? 5 : 0);
3334                     unsigned int startoffset = 0;
3335
3336                     GET_BUFFER_SPACE (20); /* We might use less.  */
3337
3338                     if (lower_bound == 0)
3339                       {
3340                         /* A succeed_n that starts with 0 is really a
3341                            a simple on_failure_jump_loop.  */
3342                         INSERT_JUMP (on_failure_jump_loop, laststart,
3343                                      b + 3 + nbytes);
3344                         b += 3;
3345                       }
3346                     else
3347                       {
3348                         /* Initialize lower bound of the `succeed_n', even
3349                            though it will be set during matching by its
3350                            attendant `set_number_at' (inserted next),
3351                            because `re_compile_fastmap' needs to know.
3352                            Jump to the `jump_n' we might insert below.  */
3353                         INSERT_JUMP2 (succeed_n, laststart,
3354                                       b + 5 + nbytes,
3355                                       lower_bound);
3356                         b += 5;
3357
3358                         /* Code to initialize the lower bound.  Insert
3359                            before the `succeed_n'.  The `5' is the last two
3360                            bytes of this `set_number_at', plus 3 bytes of
3361                            the following `succeed_n'.  */
3362                         insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3363                         b += 5;
3364                         startoffset += 5;
3365                       }
3366
3367                     if (upper_bound < 0)
3368                       {
3369                         /* A negative upper bound stands for infinity,
3370                            in which case it degenerates to a plain jump.  */
3371                         STORE_JUMP (jump, b, laststart + startoffset);
3372                         b += 3;
3373                       }
3374                     else if (upper_bound > 1)
3375                       { /* More than one repetition is allowed, so
3376                            append a backward jump to the `succeed_n'
3377                            that starts this interval.
3378
3379                            When we've reached this during matching,
3380                            we'll have matched the interval once, so
3381                            jump back only `upper_bound - 1' times.  */
3382                         STORE_JUMP2 (jump_n, b, laststart + startoffset,
3383                                      upper_bound - 1);
3384                         b += 5;
3385
3386                         /* The location we want to set is the second
3387                            parameter of the `jump_n'; that is `b-2' as
3388                            an absolute address.  `laststart' will be
3389                            the `set_number_at' we're about to insert;
3390                            `laststart+3' the number to set, the source
3391                            for the relative address.  But we are
3392                            inserting into the middle of the pattern --
3393                            so everything is getting moved up by 5.
3394                            Conclusion: (b - 2) - (laststart + 3) + 5,
3395                            i.e., b - laststart.
3396
3397                            We insert this at the beginning of the loop
3398                            so that if we fail during matching, we'll
3399                            reinitialize the bounds.  */
3400                         insert_op2 (set_number_at, laststart, b - laststart,
3401                                     upper_bound - 1, b);
3402                         b += 5;
3403                       }
3404                   }
3405                 pending_exact = 0;
3406                 beg_interval = NULL;
3407               }
3408               break;
3409
3410             unfetch_interval:
3411               /* If an invalid interval, match the characters as literals.  */
3412                assert (beg_interval);
3413                p = beg_interval;
3414                beg_interval = NULL;
3415
3416                /* normal_char and normal_backslash need `c'.  */
3417                c = '{';
3418
3419                if (!(syntax & RE_NO_BK_BRACES))
3420                  {
3421                    assert (p > pattern && p[-1] == '\\');
3422                    goto normal_backslash;
3423                  }
3424                else
3425                  goto normal_char;
3426
3427 #ifdef emacs
3428             case '=':
3429               laststart = b;
3430               BUF_PUSH (at_dot);
3431               break;
3432
3433             case 's':
3434               laststart = b;
3435               PATFETCH (c);
3436               BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3437               break;
3438
3439             case 'S':
3440               laststart = b;
3441               PATFETCH (c);
3442               BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3443               break;
3444
3445             case 'c':
3446               laststart = b;
3447               PATFETCH (c);
3448               BUF_PUSH_2 (categoryspec, c);
3449               break;
3450
3451             case 'C':
3452               laststart = b;
3453               PATFETCH (c);
3454               BUF_PUSH_2 (notcategoryspec, c);
3455               break;
3456 #endif /* emacs */
3457
3458
3459             case 'w':
3460               if (syntax & RE_NO_GNU_OPS)
3461                 goto normal_char;
3462               laststart = b;
3463               BUF_PUSH_2 (syntaxspec, Sword);
3464               break;
3465
3466
3467             case 'W':
3468               if (syntax & RE_NO_GNU_OPS)
3469                 goto normal_char;
3470               laststart = b;
3471               BUF_PUSH_2 (notsyntaxspec, Sword);
3472               break;
3473
3474
3475             case '<':
3476               if (syntax & RE_NO_GNU_OPS)
3477                 goto normal_char;
3478               laststart = b;
3479               BUF_PUSH (wordbeg);
3480               break;
3481
3482             case '>':
3483               if (syntax & RE_NO_GNU_OPS)
3484                 goto normal_char;
3485               laststart = b;
3486               BUF_PUSH (wordend);
3487               break;
3488
3489             case '_':
3490               if (syntax & RE_NO_GNU_OPS)
3491                 goto normal_char;
3492               laststart = b;
3493               PATFETCH (c);
3494               if (c == '<')
3495                 BUF_PUSH (symbeg);
3496               else if (c == '>')
3497                 BUF_PUSH (symend);
3498               else
3499                 FREE_STACK_RETURN (REG_BADPAT);
3500               break;
3501
3502             case 'b':
3503               if (syntax & RE_NO_GNU_OPS)
3504                 goto normal_char;
3505               BUF_PUSH (wordbound);
3506               break;
3507
3508             case 'B':
3509               if (syntax & RE_NO_GNU_OPS)
3510                 goto normal_char;
3511               BUF_PUSH (notwordbound);
3512               break;
3513
3514             case '`':
3515               if (syntax & RE_NO_GNU_OPS)
3516                 goto normal_char;
3517               BUF_PUSH (begbuf);
3518               break;
3519
3520             case '\'':
3521               if (syntax & RE_NO_GNU_OPS)
3522                 goto normal_char;
3523               BUF_PUSH (endbuf);
3524               break;
3525
3526             case '1': case '2': case '3': case '4': case '5':
3527             case '6': case '7': case '8': case '9':
3528               {
3529                 regnum_t reg;
3530
3531                 if (syntax & RE_NO_BK_REFS)
3532                   goto normal_backslash;
3533
3534                 reg = c - '0';
3535
3536                 if (reg > bufp->re_nsub || reg < 1
3537                     /* Can't back reference to a subexp before its end.  */
3538                     || group_in_compile_stack (compile_stack, reg))
3539                   FREE_STACK_RETURN (REG_ESUBREG);
3540
3541                 laststart = b;
3542                 BUF_PUSH_2 (duplicate, reg);
3543               }
3544               break;
3545
3546
3547             case '+':
3548             case '?':
3549               if (syntax & RE_BK_PLUS_QM)
3550                 goto handle_plus;
3551               else
3552                 goto normal_backslash;
3553
3554             default:
3555             normal_backslash:
3556               /* You might think it would be useful for \ to mean
3557                  not to translate; but if we don't translate it
3558                  it will never match anything.  */
3559               goto normal_char;
3560             }
3561           break;
3562
3563
3564         default:
3565         /* Expects the character in `c'.  */
3566         normal_char:
3567           /* If no exactn currently being built.  */
3568           if (!pending_exact
3569
3570               /* If last exactn not at current position.  */
3571               || pending_exact + *pending_exact + 1 != b
3572
3573               /* We have only one byte following the exactn for the count.  */
3574               || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3575
3576               /* If followed by a repetition operator.  */
3577               || (p != pend && (*p == '*' || *p == '^'))
3578               || ((syntax & RE_BK_PLUS_QM)
3579                   ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3580                   : p != pend && (*p == '+' || *p == '?'))
3581               || ((syntax & RE_INTERVALS)
3582                   && ((syntax & RE_NO_BK_BRACES)
3583                       ? p != pend && *p == '{'
3584                       : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3585             {
3586               /* Start building a new exactn.  */
3587
3588               laststart = b;
3589
3590               BUF_PUSH_2 (exactn, 0);
3591               pending_exact = b - 1;
3592             }
3593
3594           GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3595           {
3596             int len;
3597
3598             if (multibyte)
3599               {
3600                 c = TRANSLATE (c);
3601                 len = CHAR_STRING (c, b);
3602                 b += len;
3603               }
3604             else
3605               {
3606                 c1 = RE_CHAR_TO_MULTIBYTE (c);
3607                 if (! CHAR_BYTE8_P (c1))
3608                   {
3609                     re_wchar_t c2 = TRANSLATE (c1);
3610
3611                     if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3612                       c = c1;
3613                   }
3614                 *b++ = c;
3615                 len = 1;
3616               }
3617             (*pending_exact) += len;
3618           }
3619
3620           break;
3621         } /* switch (c) */
3622     } /* while p != pend */
3623
3624
3625   /* Through the pattern now.  */
3626
3627   FIXUP_ALT_JUMP ();
3628
3629   if (!COMPILE_STACK_EMPTY)
3630     FREE_STACK_RETURN (REG_EPAREN);
3631
3632   /* If we don't want backtracking, force success
3633      the first time we reach the end of the compiled pattern.  */
3634   if (!posix_backtracking)
3635     BUF_PUSH (succeed);
3636
3637   /* We have succeeded; set the length of the buffer.  */
3638   bufp->used = b - bufp->buffer;
3639
3640 #ifdef DEBUG
3641   if (debug > 0)
3642     {
3643       re_compile_fastmap (bufp);
3644       DEBUG_PRINT ("\nCompiled pattern: \n");
3645       print_compiled_pattern (bufp);
3646     }
3647   debug--;
3648 #endif /* DEBUG */
3649
3650 #ifndef MATCH_MAY_ALLOCATE
3651   /* Initialize the failure stack to the largest possible stack.  This
3652      isn't necessary unless we're trying to avoid calling alloca in
3653      the search and match routines.  */
3654   {
3655     int num_regs = bufp->re_nsub + 1;
3656
3657     if (fail_stack.size < emacs_re_max_failures * TYPICAL_FAILURE_SIZE)
3658       {
3659         fail_stack.size = emacs_re_max_failures * TYPICAL_FAILURE_SIZE;
3660         falk_stack.stack = realloc (fail_stack.stack,
3661                                     fail_stack.size * sizeof *falk_stack.stack);
3662       }
3663
3664     regex_grow_registers (num_regs);
3665   }
3666 #endif /* not MATCH_MAY_ALLOCATE */
3667
3668   FREE_STACK_RETURN (REG_NOERROR);
3669
3670 #ifdef emacs
3671 # undef syntax
3672 #else
3673 # undef posix_backtracking
3674 #endif
3675 } /* regex_compile */
3676 \f
3677 /* Subroutines for `regex_compile'.  */
3678
3679 /* Store OP at LOC followed by two-byte integer parameter ARG.  */
3680
3681 static void
3682 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3683 {
3684   *loc = (unsigned char) op;
3685   STORE_NUMBER (loc + 1, arg);
3686 }
3687
3688
3689 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2.  */
3690
3691 static void
3692 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3693 {
3694   *loc = (unsigned char) op;
3695   STORE_NUMBER (loc + 1, arg1);
3696   STORE_NUMBER (loc + 3, arg2);
3697 }
3698
3699
3700 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3701    for OP followed by two-byte integer parameter ARG.  */
3702
3703 static void
3704 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3705 {
3706   register unsigned char *pfrom = end;
3707   register unsigned char *pto = end + 3;
3708
3709   while (pfrom != loc)
3710     *--pto = *--pfrom;
3711
3712   store_op1 (op, loc, arg);
3713 }
3714
3715
3716 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2.  */
3717
3718 static void
3719 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
3720 {
3721   register unsigned char *pfrom = end;
3722   register unsigned char *pto = end + 5;
3723
3724   while (pfrom != loc)
3725     *--pto = *--pfrom;
3726
3727   store_op2 (op, loc, arg1, arg2);
3728 }
3729
3730
3731 /* P points to just after a ^ in PATTERN.  Return true if that ^ comes
3732    after an alternative or a begin-subexpression.  We assume there is at
3733    least one character before the ^.  */
3734
3735 static boolean
3736 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax)
3737 {
3738   re_char *prev = p - 2;
3739   boolean odd_backslashes;
3740
3741   /* After a subexpression?  */
3742   if (*prev == '(')
3743     odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3744
3745   /* After an alternative?  */
3746   else if (*prev == '|')
3747     odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3748
3749   /* After a shy subexpression?  */
3750   else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3751     {
3752       /* Skip over optional regnum.  */
3753       while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3754         --prev;
3755
3756       if (!(prev - 2 >= pattern
3757             && prev[-1] == '?' && prev[-2] == '('))
3758         return false;
3759       prev -= 2;
3760       odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3761     }
3762   else
3763     return false;
3764
3765   /* Count the number of preceding backslashes.  */
3766   p = prev;
3767   while (prev - 1 >= pattern && prev[-1] == '\\')
3768     --prev;
3769   return (p - prev) & odd_backslashes;
3770 }
3771
3772
3773 /* The dual of at_begline_loc_p.  This one is for $.  We assume there is
3774    at least one character after the $, i.e., `P < PEND'.  */
3775
3776 static boolean
3777 at_endline_loc_p (re_char *p, re_char *pend, reg_syntax_t syntax)
3778 {
3779   re_char *next = p;
3780   boolean next_backslash = *next == '\\';
3781   re_char *next_next = p + 1 < pend ? p + 1 : 0;
3782
3783   return
3784        /* Before a subexpression?  */
3785        (syntax & RE_NO_BK_PARENS ? *next == ')'
3786         : next_backslash && next_next && *next_next == ')')
3787        /* Before an alternative?  */
3788     || (syntax & RE_NO_BK_VBAR ? *next == '|'
3789         : next_backslash && next_next && *next_next == '|');
3790 }
3791
3792
3793 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3794    false if it's not.  */
3795
3796 static boolean
3797 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3798 {
3799   ssize_t this_element;
3800
3801   for (this_element = compile_stack.avail - 1;
3802        this_element >= 0;
3803        this_element--)
3804     if (compile_stack.stack[this_element].regnum == regnum)
3805       return true;
3806
3807   return false;
3808 }
3809 \f
3810 /* analyze_first.
3811    If fastmap is non-NULL, go through the pattern and fill fastmap
3812    with all the possible leading chars.  If fastmap is NULL, don't
3813    bother filling it up (obviously) and only return whether the
3814    pattern could potentially match the empty string.
3815
3816    Return 1  if p..pend might match the empty string.
3817    Return 0  if p..pend matches at least one char.
3818    Return -1 if fastmap was not updated accurately.  */
3819
3820 static int
3821 analyze_first (re_char *p, re_char *pend, char *fastmap,
3822                const int multibyte)
3823 {
3824   int j, k;
3825   boolean not;
3826
3827   /* If all elements for base leading-codes in fastmap is set, this
3828      flag is set true.  */
3829   boolean match_any_multibyte_characters = false;
3830
3831   assert (p);
3832
3833   /* The loop below works as follows:
3834      - It has a working-list kept in the PATTERN_STACK and which basically
3835        starts by only containing a pointer to the first operation.
3836      - If the opcode we're looking at is a match against some set of
3837        chars, then we add those chars to the fastmap and go on to the
3838        next work element from the worklist (done via `break').
3839      - If the opcode is a control operator on the other hand, we either
3840        ignore it (if it's meaningless at this point, such as `start_memory')
3841        or execute it (if it's a jump).  If the jump has several destinations
3842        (i.e. `on_failure_jump'), then we push the other destination onto the
3843        worklist.
3844      We guarantee termination by ignoring backward jumps (more or less),
3845      so that `p' is monotonically increasing.  More to the point, we
3846      never set `p' (or push) anything `<= p1'.  */
3847
3848   while (p < pend)
3849     {
3850       /* `p1' is used as a marker of how far back a `on_failure_jump'
3851          can go without being ignored.  It is normally equal to `p'
3852          (which prevents any backward `on_failure_jump') except right
3853          after a plain `jump', to allow patterns such as:
3854             0: jump 10
3855             3..9: <body>
3856             10: on_failure_jump 3
3857          as used for the *? operator.  */
3858       re_char *p1 = p;
3859
3860       switch (*p++)
3861         {
3862         case succeed:
3863           return 1;
3864
3865         case duplicate:
3866           /* If the first character has to match a backreference, that means
3867              that the group was empty (since it already matched).  Since this
3868              is the only case that interests us here, we can assume that the
3869              backreference must match the empty string.  */
3870           p++;
3871           continue;
3872
3873
3874       /* Following are the cases which match a character.  These end
3875          with `break'.  */
3876
3877         case exactn:
3878           if (fastmap)
3879             {
3880               /* If multibyte is nonzero, the first byte of each
3881                  character is an ASCII or a leading code.  Otherwise,
3882                  each byte is a character.  Thus, this works in both
3883                  cases. */
3884               fastmap[p[1]] = 1;
3885               if (! multibyte)
3886                 {
3887                   /* For the case of matching this unibyte regex
3888                      against multibyte, we must set a leading code of
3889                      the corresponding multibyte character.  */
3890                   int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3891
3892                   fastmap[CHAR_LEADING_CODE (c)] = 1;
3893                 }
3894             }
3895           break;
3896
3897
3898         case anychar:
3899           /* We could put all the chars except for \n (and maybe \0)
3900              but we don't bother since it is generally not worth it.  */
3901           if (!fastmap) break;
3902           return -1;
3903
3904
3905         case charset_not:
3906           if (!fastmap) break;
3907           {
3908             /* Chars beyond end of bitmap are possible matches.  */
3909             for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3910                  j < (1 << BYTEWIDTH); j++)
3911               fastmap[j] = 1;
3912           }
3913           FALLTHROUGH;
3914         case charset:
3915           if (!fastmap) break;
3916           not = (re_opcode_t) *(p - 1) == charset_not;
3917           for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3918                j >= 0; j--)
3919             if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
3920               fastmap[j] = 1;
3921
3922 #ifdef emacs
3923           if (/* Any leading code can possibly start a character
3924                  which doesn't match the specified set of characters.  */
3925               not
3926               ||
3927               /* If we can match a character class, we can match any
3928                  multibyte characters.  */
3929               (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3930                && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3931
3932             {
3933               if (match_any_multibyte_characters == false)
3934                 {
3935                   for (j = MIN_MULTIBYTE_LEADING_CODE;
3936                        j <= MAX_MULTIBYTE_LEADING_CODE; j++)
3937                     fastmap[j] = 1;
3938                   match_any_multibyte_characters = true;
3939                 }
3940             }
3941
3942           else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3943                    && match_any_multibyte_characters == false)
3944             {
3945               /* Set fastmap[I] to 1 where I is a leading code of each
3946                  multibyte character in the range table. */
3947               int c, count;
3948               unsigned char lc1, lc2;
3949
3950               /* Make P points the range table.  `+ 2' is to skip flag
3951                  bits for a character class.  */
3952               p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
3953
3954               /* Extract the number of ranges in range table into COUNT.  */
3955               EXTRACT_NUMBER_AND_INCR (count, p);
3956               for (; count > 0; count--, p += 3)
3957                 {
3958                   /* Extract the start and end of each range.  */
3959                   EXTRACT_CHARACTER (c, p);
3960                   lc1 = CHAR_LEADING_CODE (c);
3961                   p += 3;
3962                   EXTRACT_CHARACTER (c, p);
3963                   lc2 = CHAR_LEADING_CODE (c);
3964                   for (j = lc1; j <= lc2; j++)
3965                     fastmap[j] = 1;
3966                 }
3967             }
3968 #endif
3969           break;
3970
3971         case syntaxspec:
3972         case notsyntaxspec:
3973           if (!fastmap) break;
3974 #ifndef emacs
3975           not = (re_opcode_t)p[-1] == notsyntaxspec;
3976           k = *p++;
3977           for (j = 0; j < (1 << BYTEWIDTH); j++)
3978             if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
3979               fastmap[j] = 1;
3980           break;
3981 #else  /* emacs */
3982           /* This match depends on text properties.  These end with
3983              aborting optimizations.  */
3984           return -1;
3985
3986         case categoryspec:
3987         case notcategoryspec:
3988           if (!fastmap) break;
3989           not = (re_opcode_t)p[-1] == notcategoryspec;
3990           k = *p++;
3991           for (j = (1 << BYTEWIDTH); j >= 0; j--)
3992             if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
3993               fastmap[j] = 1;
3994
3995           /* Any leading code can possibly start a character which
3996              has or doesn't has the specified category.  */
3997           if (match_any_multibyte_characters == false)
3998             {
3999               for (j = MIN_MULTIBYTE_LEADING_CODE;
4000                    j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4001                 fastmap[j] = 1;
4002               match_any_multibyte_characters = true;
4003             }
4004           break;
4005
4006       /* All cases after this match the empty string.  These end with
4007          `continue'.  */
4008
4009         case at_dot:
4010 #endif /* !emacs */
4011         case no_op:
4012         case begline:
4013         case endline:
4014         case begbuf:
4015         case endbuf:
4016         case wordbound:
4017         case notwordbound:
4018         case wordbeg:
4019         case wordend:
4020         case symbeg:
4021         case symend:
4022           continue;
4023
4024
4025         case jump:
4026           EXTRACT_NUMBER_AND_INCR (j, p);
4027           if (j < 0)
4028             /* Backward jumps can only go back to code that we've already
4029                visited.  `re_compile' should make sure this is true.  */
4030             break;
4031           p += j;
4032           switch (*p)
4033             {
4034             case on_failure_jump:
4035             case on_failure_keep_string_jump:
4036             case on_failure_jump_loop:
4037             case on_failure_jump_nastyloop:
4038             case on_failure_jump_smart:
4039               p++;
4040               break;
4041             default:
4042               continue;
4043             };
4044           /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4045              to jump back to "just after here".  */
4046           FALLTHROUGH;
4047         case on_failure_jump:
4048         case on_failure_keep_string_jump:
4049         case on_failure_jump_nastyloop:
4050         case on_failure_jump_loop:
4051         case on_failure_jump_smart:
4052           EXTRACT_NUMBER_AND_INCR (j, p);
4053           if (p + j <= p1)
4054             ; /* Backward jump to be ignored.  */
4055           else
4056             { /* We have to look down both arms.
4057                  We first go down the "straight" path so as to minimize
4058                  stack usage when going through alternatives.  */
4059               int r = analyze_first (p, pend, fastmap, multibyte);
4060               if (r) return r;
4061               p += j;
4062             }
4063           continue;
4064
4065
4066         case jump_n:
4067           /* This code simply does not properly handle forward jump_n.  */
4068           DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4069           p += 4;
4070           /* jump_n can either jump or fall through.  The (backward) jump
4071              case has already been handled, so we only need to look at the
4072              fallthrough case.  */
4073           continue;
4074
4075         case succeed_n:
4076           /* If N == 0, it should be an on_failure_jump_loop instead.  */
4077           DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4078           p += 4;
4079           /* We only care about one iteration of the loop, so we don't
4080              need to consider the case where this behaves like an
4081              on_failure_jump.  */
4082           continue;
4083
4084
4085         case set_number_at:
4086           p += 4;
4087           continue;
4088
4089
4090         case start_memory:
4091         case stop_memory:
4092           p += 1;
4093           continue;
4094
4095
4096         default:
4097           abort (); /* We have listed all the cases.  */
4098         } /* switch *p++ */
4099
4100       /* Getting here means we have found the possible starting
4101          characters for one path of the pattern -- and that the empty
4102          string does not match.  We need not follow this path further.  */
4103       return 0;
4104     } /* while p */
4105
4106   /* We reached the end without matching anything.  */
4107   return 1;
4108
4109 } /* analyze_first */
4110 \f
4111 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4112    BUFP.  A fastmap records which of the (1 << BYTEWIDTH) possible
4113    characters can start a string that matches the pattern.  This fastmap
4114    is used by re_search to skip quickly over impossible starting points.
4115
4116    Character codes above (1 << BYTEWIDTH) are not represented in the
4117    fastmap, but the leading codes are represented.  Thus, the fastmap
4118    indicates which character sets could start a match.
4119
4120    The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4121    area as BUFP->fastmap.
4122
4123    We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4124    the pattern buffer.
4125
4126    Returns 0 if we succeed, -2 if an internal error.   */
4127
4128 int
4129 re_compile_fastmap (struct re_pattern_buffer *bufp)
4130 {
4131   char *fastmap = bufp->fastmap;
4132   int analysis;
4133
4134   assert (fastmap && bufp->buffer);
4135
4136   memset (fastmap, 0, 1 << BYTEWIDTH);  /* Assume nothing's valid.  */
4137   bufp->fastmap_accurate = 1;       /* It will be when we're done.  */
4138
4139   analysis = analyze_first (bufp->buffer, bufp->buffer + bufp->used,
4140                             fastmap, RE_MULTIBYTE_P (bufp));
4141   bufp->can_be_null = (analysis != 0);
4142   return 0;
4143 } /* re_compile_fastmap */
4144 \f
4145 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4146    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
4147    this memory for recording register information.  STARTS and ENDS
4148    must be allocated using the malloc library routine, and must each
4149    be at least NUM_REGS * sizeof (regoff_t) bytes long.
4150
4151    If NUM_REGS == 0, then subsequent matches should allocate their own
4152    register data.
4153
4154    Unless this function is called, the first search or match using
4155    PATTERN_BUFFER will allocate its own register data, without
4156    freeing the old data.  */
4157
4158 void
4159 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
4160 {
4161   if (num_regs)
4162     {
4163       bufp->regs_allocated = REGS_REALLOCATE;
4164       regs->num_regs = num_regs;
4165       regs->start = starts;
4166       regs->end = ends;
4167     }
4168   else
4169     {
4170       bufp->regs_allocated = REGS_UNALLOCATED;
4171       regs->num_regs = 0;
4172       regs->start = regs->end = 0;
4173     }
4174 }
4175 WEAK_ALIAS (__re_set_registers, re_set_registers)
4176 \f
4177 /* Searching routines.  */
4178
4179 /* Like re_search_2, below, but only one string is specified, and
4180    doesn't let you say where to stop matching. */
4181
4182 regoff_t
4183 re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4184            ssize_t startpos, ssize_t range, struct re_registers *regs)
4185 {
4186   return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4187                       regs, size);
4188 }
4189 WEAK_ALIAS (__re_search, re_search)
4190
4191 /* Head address of virtual concatenation of string: the start of
        string2 when offset P lies at or beyond size1, otherwise the start
        of string1.  Expects `string1', `string2' and `size1' to be in
        scope wherever the macro is expanded.  */
4192 #define HEAD_ADDR_VSTRING(P)            \
4193   (((P) >= size1 ? string2 : string1))
4194
4195 /* Address of POS in the concatenation of virtual string.  For
        POS >= size1 the base is `string2 - size1', so that adding POS
        lands POS - size1 bytes into string2; otherwise the result is
        POS bytes into string1.  */
4196 #define POS_ADDR_VSTRING(POS)                                   \
4197   (((POS) >= size1 ? string2 - size1 : string1) + (POS))
4198
4199 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4200    virtual concatenation of STRING1 and STRING2, starting first at index
4201    STARTPOS, then at STARTPOS + 1, and so on.
4202
4203    STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4204
4205    RANGE is how far to scan while trying to match.  RANGE = 0 means try
4206    only at STARTPOS; in general, the last start tried is STARTPOS +
4207    RANGE.
4208
4209    In REGS, return the indices of the virtual concatenation of STRING1
4210    and STRING2 that matched the entire BUFP->buffer and its contained
4211    subexpressions.
4212
4213    Do not consider matching one past the index STOP in the virtual
4214    concatenation of STRING1 and STRING2.
4215
4216    We return either the position in the strings at which the match was
4217    found, -1 if no match, or -2 if error (such as failure
4218    stack overflow).  */
4219
4220 regoff_t
4221 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4222              const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4223              struct re_registers *regs, ssize_t stop)
4224 {
4225   regoff_t val;
4226   re_char *string1 = (re_char *) str1;
4227   re_char *string2 = (re_char *) str2;
4228   register char *fastmap = bufp->fastmap;
4229   register RE_TRANSLATE_TYPE translate = bufp->translate;
4230   size_t total_size = size1 + size2;
4231   ssize_t endpos = startpos + range;
4232   boolean anchored_start;
4233   /* Nonzero if we are searching multibyte string.  */
4234   const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4235
4236   /* Check for out-of-range STARTPOS.  */
4237   if (startpos < 0 || startpos > total_size)
4238     return -1;
4239
4240   /* Fix up RANGE if it might eventually take us outside
4241      the virtual concatenation of STRING1 and STRING2.
4242      Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE.  */
4243   if (endpos < 0)
4244     range = 0 - startpos;
4245   else if (endpos > total_size)
4246     range = total_size - startpos;
4247
4248   /* If the search isn't to be a backwards one, don't waste time in a
4249      search for a pattern anchored at beginning of buffer.  */
4250   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4251     {
4252       if (startpos > 0)
4253         return -1;
4254       else
4255         range = 0;
4256     }
4257
4258 #ifdef emacs
4259   /* In a forward search for something that starts with \=.
4260      don't keep searching past point.  */
4261   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4262     {
4263       range = PT_BYTE - BEGV_BYTE - startpos;
4264       if (range < 0)
4265         return -1;
4266     }
4267 #endif /* emacs */
4268
4269   /* Update the fastmap now if not correct already.  */
4270   if (fastmap && !bufp->fastmap_accurate)
4271     re_compile_fastmap (bufp);
4272
4273   /* See whether the pattern is anchored.  */
4274   anchored_start = (bufp->buffer[0] == begline);
4275
4276 #ifdef emacs
4277   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4278   {
4279     ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
4280
4281     SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4282   }
4283 #endif
4284
4285   /* Loop through the string, looking for a place to start matching.  */
4286   for (;;)
4287     {
4288       /* If the pattern is anchored,
4289          skip quickly past places we cannot match.
4290          We don't bother to treat startpos == 0 specially
4291          because that case doesn't repeat.  */
4292       if (anchored_start && startpos > 0)
4293         {
4294           if (! ((startpos <= size1 ? string1[startpos - 1]
4295                   : string2[startpos - size1 - 1])
4296                  == '\n'))
4297             goto advance;
4298         }
4299
4300       /* If a fastmap is supplied, skip quickly over characters that
4301          cannot be the start of a match.  If the pattern can match the
4302          null string, however, we don't need to skip characters; we want
4303          the first null string.  */
4304       if (fastmap && startpos < total_size && !bufp->can_be_null)
4305         {
4306           register re_char *d;
4307           register re_wchar_t buf_ch;
4308
4309           d = POS_ADDR_VSTRING (startpos);
4310
4311           if (range > 0)        /* Searching forwards.  */
4312             {
4313               ssize_t irange = range, lim = 0;
4314
4315               if (startpos < size1 && startpos + range >= size1)
4316                 lim = range - (size1 - startpos);
4317
4318               /* Written out as an if-else to avoid testing `translate'
4319                  inside the loop.  */
4320               if (RE_TRANSLATE_P (translate))
4321                 {
4322                   if (multibyte)
4323                     while (range > lim)
4324                       {
4325                         int buf_charlen;
4326
4327                         buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4328                         buf_ch = RE_TRANSLATE (translate, buf_ch);
4329                         if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4330                           break;
4331
4332                         range -= buf_charlen;
4333                         d += buf_charlen;
4334                       }
4335                   else
4336                     while (range > lim)
4337                       {
4338                         register re_wchar_t ch, translated;
4339
4340                         buf_ch = *d;
4341                         ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4342                         translated = RE_TRANSLATE (translate, ch);
4343                         if (translated != ch
4344                             && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4345                           buf_ch = ch;
4346                         if (fastmap[buf_ch])
4347                           break;
4348                         d++;
4349                         range--;
4350                       }
4351                 }
4352               else
4353                 {
4354                   if (multibyte)
4355                     while (range > lim)
4356                       {
4357                         int buf_charlen;
4358
4359                         buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4360                         if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4361                           break;
4362                         range -= buf_charlen;
4363                         d += buf_charlen;
4364                       }
4365                   else
4366                     while (range > lim && !fastmap[*d])
4367                       {
4368                         d++;
4369                         range--;
4370                       }
4371                 }
4372               startpos += irange - range;
4373             }
4374           else                          /* Searching backwards.  */
4375             {
4376               if (multibyte)
4377                 {
4378                   buf_ch = STRING_CHAR (d);
4379                   buf_ch = TRANSLATE (buf_ch);
4380                   if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4381                     goto advance;
4382                 }
4383               else
4384                 {
4385                   register re_wchar_t ch, translated;
4386
4387                   buf_ch = *d;
4388                   ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4389                   translated = TRANSLATE (ch);
4390                   if (translated != ch
4391                       && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4392                     buf_ch = ch;
4393                   if (! fastmap[TRANSLATE (buf_ch)])
4394                     goto advance;
4395                 }
4396             }
4397         }
4398
4399       /* If can't match the null string, and that's all we have left, fail.  */
4400       if (range >= 0 && startpos == total_size && fastmap
4401           && !bufp->can_be_null)
4402         return -1;
4403
4404       val = re_match_2_internal (bufp, string1, size1, string2, size2,
4405                                  startpos, regs, stop);
4406
4407       if (val >= 0)
4408         return startpos;
4409
4410       if (val == -2)
4411         return -2;
4412
4413     advance:
4414       if (!range)
4415         break;
4416       else if (range > 0)
4417         {
4418           /* Update STARTPOS to the next character boundary.  */
4419           if (multibyte)
4420             {
4421               re_char *p = POS_ADDR_VSTRING (startpos);
4422               int len = BYTES_BY_CHAR_HEAD (*p);
4423
4424               range -= len;
4425               if (range < 0)
4426                 break;
4427               startpos += len;
4428             }
4429           else
4430             {
4431               range--;
4432               startpos++;
4433             }
4434         }
4435       else
4436         {
4437           range++;
4438           startpos--;
4439
4440           /* Update STARTPOS to the previous character boundary.  */
4441           if (multibyte)
4442             {
4443               re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4444               re_char *p0 = p;
4445               re_char *phead = HEAD_ADDR_VSTRING (startpos);
4446
4447               /* Find the head of multibyte form.  */
4448               PREV_CHAR_BOUNDARY (p, phead);
4449               range += p0 - 1 - p;
4450               if (range > 0)
4451                 break;
4452
4453               startpos -= p0 - 1 - p;
4454             }
4455         }
4456     }
4457   return -1;
4458 } /* re_search_2 */
4459 WEAK_ALIAS (__re_search_2, re_search_2)
4460 \f
4461 /* Declarations and macros for re_match_2.  */
4462
     /* Forward declaration; the definition is not in this part of the
        file.  */
4463 static int bcmp_translate (re_char *s1, re_char *s2,
4464                            register ssize_t len,
4465                            RE_TRANSLATE_TYPE translate,
4466                            const int multibyte);
4467
4468 /* This converts PTR, a pointer into one of the search strings `string1'
4469    and `string2', into an offset within the virtual concatenation of
        the two: a pointer into string1 yields its offset from string1, a
        pointer into string2 yields its offset from string2 plus size1.  */
4470 #define POINTER_TO_OFFSET(ptr)                  \
4471   (FIRST_STRING_P (ptr)                         \
4472    ? (ptr) - string1                            \
4473    : (ptr) - string2 + (ptrdiff_t) size1)
4474
4475 /* Call before fetching a character with *d.  This switches over to
4476    string2 if necessary.
4477    Check re_match_2_internal for a discussion of why end_match_2 might
4478    not be within string2 (but be equal to end_match_1 instead).
        The expansion uses `d', `dend', `string2' and `end_match_2' from
        the enclosing scope and jumps to the label `fail' when no more
        text is available.  It is a `while' rather than an `if', so the
        test is repeated after switching to string2.  */
4479 #define PREFETCH()                                                      \
4480   while (d == dend)                                                     \
4481     {                                                                   \
4482       /* End of string2 => fail.  */                                    \
4483       if (dend == end_match_2)                                          \
4484         goto fail;                                                      \
4485       /* End of string1 => advance to string2.  */                      \
4486       d = string2;                                                      \
4487       dend = end_match_2;                                               \
4488     }
4489
4490 /* Call before fetching a char with *d if you already checked other limits.
4491    This is meant for use in lookahead operations like wordend, etc..
4492    where we might need to look at parts of the string that might be
4493    outside of the LIMITs (i.e past `stop').
        Unlike PREFETCH, this never fails: it only moves `d' to string2
        when `d' is at `end1'.  The expansion is a bare `if' statement, and
        the definition ends with a line continuation, so the empty line
        after it belongs to the macro.  */
4494 #define PREFETCH_NOLIMIT()                                              \
4495   if (d == end1)                                                        \
4496      {                                                                  \
4497        d = string2;                                                     \
4498        dend = end_match_2;                                              \
4499      }                                                                  \
4500
4501 /* Test if at very beginning or at very end of the virtual concatenation
4502    of `string1' and `string2'.  If only one string, it's `string2'.
        AT_STRINGS_BEG is also true whenever `size2' is zero.  */
4503 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4504 #define AT_STRINGS_END(d) ((d) == end2)
4505
4506 /* Disabled due to a compiler bug -- see comment at case wordbound */
4507
4508 /* The comment at case wordbound is the following one; note that
4509    AT_WORD_BOUNDARY is no longer used, in order to support multibyte form.
4510
4511    The DEC Alpha C compiler 3.x generates incorrect code for the
4512    test  WORDCHAR_P (d - 1) != WORDCHAR_P (d)  in the expansion of
4513    AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
4514    macro and introducing temporary variables works around the bug.  */
4515
4516 #if 0
4517 /* Test if D points to a character which is word-constituent.  We have
4518    two special cases to check for: if past the end of string1, look at
4519    the first character in string2; and if before the beginning of
4520    string2, look at the last character in string1.  */
4521 #define WORDCHAR_P(d)                                                   \
4522   (SYNTAX ((d) == end1 ? *string2                                       \
4523            : (d) == string2 - 1 ? *(end1 - 1) : *(d))                   \
4524    == Sword)
4525
4526 /* Test if the character before D and the one at D differ with respect
4527    to being word-constituent.  */
4528 #define AT_WORD_BOUNDARY(d)                                             \
4529   (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)                             \
4530    || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4531 #endif
4532
4533 /* Free everything we malloc.
        FREE_VAR releases VAR through REGEX_FREE if it is non-null and
        then resets it to NULL.  FREE_VARIABLES releases the failure stack
        and the four register arrays named below; it expands to a no-op
        when MATCH_MAY_ALLOCATE is not defined.  */
4534 #ifdef MATCH_MAY_ALLOCATE
4535 # define FREE_VAR(var)                                                  \
4536   do {                                                                  \
4537     if (var)                                                            \
4538       {                                                                 \
4539         REGEX_FREE (var);                                               \
4540         var = NULL;                                                     \
4541       }                                                                 \
4542   } while (0)
4543 # define FREE_VARIABLES()                                               \
4544   do {                                                                  \
4545     REGEX_FREE_STACK (fail_stack.stack);                                \
4546     FREE_VAR (regstart);                                                \
4547     FREE_VAR (regend);                                                  \
4548     FREE_VAR (best_regstart);                                           \
4549     FREE_VAR (best_regend);                                             \
4550     REGEX_SAFE_FREE ();                                                 \
4551   } while (0)
4552 #else
4553 # define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
4554 #endif /* not MATCH_MAY_ALLOCATE */
4555
4556 \f
4557 /* Optimization routines.  */
4558
4559 /* If the operation is a match against one or more chars,
4560    return a pointer to the next operation, else return NULL.  */
4561 static re_char *
4562 skip_one_char (re_char *p)
4563 {
4564   switch (*p++)
4565     {
4566     case anychar:
4567       break;
4568
4569     case exactn:
4570       p += *p + 1;
4571       break;
4572
4573     case charset_not:
4574     case charset:
4575       if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4576         {
4577           int mcnt;
4578           p = CHARSET_RANGE_TABLE (p - 1);
4579           EXTRACT_NUMBER_AND_INCR (mcnt, p);
4580           p = CHARSET_RANGE_TABLE_END (p, mcnt);
4581         }
4582       else
4583         p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4584       break;
4585
4586     case syntaxspec:
4587     case notsyntaxspec:
4588 #ifdef emacs
4589     case categoryspec:
4590     case notcategoryspec:
4591 #endif /* emacs */
4592       p++;
4593       break;
4594
4595     default:
4596       p = NULL;
4597     }
4598   return p;
4599 }
4600
4601
4602 /* Jump over non-matching operations.  */
4603 static re_char *
4604 skip_noops (re_char *p, re_char *pend)
4605 {
4606   int mcnt;
4607   while (p < pend)
4608     {
4609       switch (*p)
4610         {
4611         case start_memory:
4612         case stop_memory:
4613           p += 2; break;
4614         case no_op:
4615           p += 1; break;
4616         case jump:
4617           p += 1;
4618           EXTRACT_NUMBER_AND_INCR (mcnt, p);
4619           p += mcnt;
4620           break;
4621         default:
4622           return p;
4623         }
4624     }
4625   assert (p == pend);
4626   return p;
4627 }
4628
4629 /* Test if C matches charset op.  *PP points to the charset or charset_not
4630    opcode.  When the function finishes, *PP will be advanced past that opcode.
4631    C is character to test (possibly after translations) and CORIG is original
4632    character (i.e. without any translations).  UNIBYTE denotes whether c is
4633    unibyte or multibyte character. */
4634 static bool
4635 execute_charset (re_char **pp, unsigned c, unsigned corig, bool unibyte)
4636 {
4637   re_char *p = *pp, *rtp = NULL;
4638   bool not = (re_opcode_t) *p == charset_not;
4639
4640   if (CHARSET_RANGE_TABLE_EXISTS_P (p))
4641     {
4642       int count;
4643       rtp = CHARSET_RANGE_TABLE (p);
4644       EXTRACT_NUMBER_AND_INCR (count, rtp);
4645       *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
4646     }
4647   else
4648     *pp += 2 + CHARSET_BITMAP_SIZE (p);
4649
4650   if (unibyte && c < (1 << BYTEWIDTH))
4651     {                   /* Lookup bitmap.  */
4652       /* Cast to `unsigned' instead of `unsigned char' in
4653          case the bit list is a full 32 bytes long.  */
4654       if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
4655           && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4656         return !not;
4657     }
4658 #ifdef emacs
4659   else if (rtp)
4660     {
4661       int class_bits = CHARSET_RANGE_TABLE_BITS (p);
4662       re_wchar_t range_start, range_end;
4663
4664   /* Sort tests by the most commonly used classes with some adjustment to which
4665      tests are easiest to perform.  Take a look at comment in re_wctype_parse
4666      for table with frequencies of character class names. */
4667
4668       if ((class_bits & BIT_MULTIBYTE) ||
4669           (class_bits & BIT_ALNUM && ISALNUM (c)) ||
4670           (class_bits & BIT_ALPHA && ISALPHA (c)) ||
4671           (class_bits & BIT_SPACE && ISSPACE (c)) ||
4672           (class_bits & BIT_BLANK && ISBLANK (c)) ||
4673           (class_bits & BIT_WORD  && ISWORD  (c)) ||
4674           ((class_bits & BIT_UPPER) &&
4675            (ISUPPER (c) || (corig != c &&
4676                             c == downcase (corig) && ISLOWER (c)))) ||
4677           ((class_bits & BIT_LOWER) &&
4678            (ISLOWER (c) || (corig != c &&
4679                             c == upcase (corig) && ISUPPER(c)))) ||
4680           (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
4681           (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
4682           (class_bits & BIT_PRINT && ISPRINT (c)))
4683         return !not;
4684
4685       for (p = *pp; rtp < p; rtp += 2 * 3)
4686         {
4687           EXTRACT_CHARACTER (range_start, rtp);
4688           EXTRACT_CHARACTER (range_end, rtp + 3);
4689           if (range_start <= c && c <= range_end)
4690             return !not;
4691         }
4692     }
4693 #endif /* emacs */
4694   return not;
4695 }
4696
4697 /* Non-zero if "p1 matches something" implies "p2 fails".  */
4698 static int
4699 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
4700                       re_char *p2)
4701 {
4702   re_opcode_t op2;
4703   const boolean multibyte = RE_MULTIBYTE_P (bufp);
4704   unsigned char *pend = bufp->buffer + bufp->used;
4705
4706   assert (p1 >= bufp->buffer && p1 < pend
4707           && p2 >= bufp->buffer && p2 <= pend);
4708
4709   /* Skip over open/close-group commands.
4710      If what follows this loop is a ...+ construct,
4711      look at what begins its body, since we will have to
4712      match at least one of that.  */
4713   p2 = skip_noops (p2, pend);
4714   /* The same skip can be done for p1, except that this function
4715      is only used in the case where p1 is a simple match operator.  */
4716   /* p1 = skip_noops (p1, pend); */
4717
4718   assert (p1 >= bufp->buffer && p1 < pend
4719           && p2 >= bufp->buffer && p2 <= pend);
4720
4721   op2 = p2 == pend ? succeed : *p2;
4722
4723   switch (op2)
4724     {
4725     case succeed:
4726     case endbuf:
4727       /* If we're at the end of the pattern, we can change.  */
4728       if (skip_one_char (p1))
4729         {
4730           DEBUG_PRINT ("  End of pattern: fast loop.\n");
4731           return 1;
4732         }
4733       break;
4734
4735     case endline:
4736     case exactn:
4737       {
4738         register re_wchar_t c
4739           = (re_opcode_t) *p2 == endline ? '\n'
4740           : RE_STRING_CHAR (p2 + 2, multibyte);
4741
4742         if ((re_opcode_t) *p1 == exactn)
4743           {
4744             if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4745               {
4746                 DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
4747                 return 1;
4748               }
4749           }
4750
4751         else if ((re_opcode_t) *p1 == charset
4752                  || (re_opcode_t) *p1 == charset_not)
4753           {
4754             if (!execute_charset (&p1, c, c, !multibyte || IS_REAL_ASCII (c)))
4755               {
4756                 DEBUG_PRINT ("   No match => fast loop.\n");
4757                 return 1;
4758               }
4759           }
4760         else if ((re_opcode_t) *p1 == anychar
4761                  && c == '\n')
4762           {
4763             DEBUG_PRINT ("   . != \\n => fast loop.\n");
4764             return 1;
4765           }
4766       }
4767       break;
4768
4769     case charset:
4770       {
4771         if ((re_opcode_t) *p1 == exactn)
4772           /* Reuse the code above.  */
4773           return mutually_exclusive_p (bufp, p2, p1);
4774
4775       /* It is hard to list up all the character in charset
4776          P2 if it includes multibyte character.  Give up in
4777          such case.  */
4778       else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4779         {
4780           /* Now, we are sure that P2 has no range table.
4781              So, for the size of bitmap in P2, `p2[1]' is
4782              enough.  But P1 may have range table, so the
4783              size of bitmap table of P1 is extracted by
4784              using macro `CHARSET_BITMAP_SIZE'.
4785
4786              In a multibyte case, we know that all the character
4787              listed in P2 is ASCII.  In a unibyte case, P1 has only a
4788              bitmap table.  So, in both cases, it is enough to test
4789              only the bitmap table of P1.  */
4790
4791           if ((re_opcode_t) *p1 == charset)
4792             {
4793               int idx;
4794               /* We win if the charset inside the loop
4795                  has no overlap with the one after the loop.  */
4796               for (idx = 0;
4797                    (idx < (int) p2[1]
4798                     && idx < CHARSET_BITMAP_SIZE (p1));
4799                    idx++)
4800                 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4801                   break;
4802
4803               if (idx == p2[1]
4804                   || idx == CHARSET_BITMAP_SIZE (p1))
4805                 {
4806                   DEBUG_PRINT ("         No match => fast loop.\n");
4807                   return 1;
4808                 }
4809             }
4810           else if ((re_opcode_t) *p1 == charset_not)
4811             {
4812               int idx;
4813               /* We win if the charset_not inside the loop lists
4814                  every character listed in the charset after.  */
4815               for (idx = 0; idx < (int) p2[1]; idx++)
4816                 if (! (p2[2 + idx] == 0
4817                        || (idx < CHARSET_BITMAP_SIZE (p1)
4818                            && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4819                   break;
4820
4821               if (idx == p2[1])
4822                 {
4823                   DEBUG_PRINT ("         No match => fast loop.\n");
4824                   return 1;
4825                 }
4826               }
4827           }
4828       }
4829       break;
4830
4831     case charset_not:
4832       switch (*p1)
4833         {
4834         case exactn:
4835         case charset:
4836           /* Reuse the code above.  */
4837           return mutually_exclusive_p (bufp, p2, p1);
4838         case charset_not:
4839           /* When we have two charset_not, it's very unlikely that
4840              they don't overlap.  The union of the two sets of excluded
4841              chars should cover all possible chars, which, as a matter of
4842              fact, is virtually impossible in multibyte buffers.  */
4843           break;
4844         }
4845       break;
4846
4847     case wordend:
4848       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4849     case symend:
4850       return ((re_opcode_t) *p1 == syntaxspec
4851               && (p1[1] == Ssymbol || p1[1] == Sword));
4852     case notsyntaxspec:
4853       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4854
4855     case wordbeg:
4856       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4857     case symbeg:
4858       return ((re_opcode_t) *p1 == notsyntaxspec
4859               && (p1[1] == Ssymbol || p1[1] == Sword));
4860     case syntaxspec:
4861       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4862
4863     case wordbound:
4864       return (((re_opcode_t) *p1 == notsyntaxspec
4865                || (re_opcode_t) *p1 == syntaxspec)
4866               && p1[1] == Sword);
4867
4868 #ifdef emacs
4869     case categoryspec:
4870       return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4871     case notcategoryspec:
4872       return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4873 #endif /* emacs */
4874
4875     default:
4876       ;
4877     }
4878
4879   /* Safe default.  */
4880   return 0;
4881 }
4882
4883 \f
4884 /* Matching routines.  */
4885
4886 #ifndef emacs   /* Emacs never uses this.  */
4887 /* re_match is like re_match_2 except it takes only a single string.  */
4888
4889 regoff_t
4890 re_match (struct re_pattern_buffer *bufp, const char *string,
4891           size_t size, ssize_t pos, struct re_registers *regs)
4892 {
4893   regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char *) string,
4894                                          size, pos, regs, size);
4895   return result;
4896 }
4897 WEAK_ALIAS (__re_match, re_match)
4898 #endif /* not emacs */
4899
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).  Otherwise, we return the length of the
   matched substring.  */
4912
4913 regoff_t
4914 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4915             size_t size1, const char *string2, size_t size2, ssize_t pos,
4916             struct re_registers *regs, ssize_t stop)
4917 {
4918   regoff_t result;
4919
4920 #ifdef emacs
4921   ssize_t charpos;
4922   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4923   charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
4924   SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4925 #endif
4926
4927   result = re_match_2_internal (bufp, (re_char *) string1, size1,
4928                                 (re_char *) string2, size2,
4929                                 pos, regs, stop);
4930   return result;
4931 }
4932 WEAK_ALIAS (__re_match_2, re_match_2)
4933
4934
4935 /* This is a separate function so that we can force an alloca cleanup
4936    afterwards.  */
4937 static regoff_t
4938 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
4939                      size_t size1, re_char *string2, size_t size2,
4940                      ssize_t pos, struct re_registers *regs, ssize_t stop)
4941 {
4942   /* General temporaries.  */
4943   int mcnt;
4944   size_t reg;
4945
4946   /* Just past the end of the corresponding string.  */
4947   re_char *end1, *end2;
4948
4949   /* Pointers into string1 and string2, just past the last characters in
4950      each to consider matching.  */
4951   re_char *end_match_1, *end_match_2;
4952
4953   /* Where we are in the data, and the end of the current string.  */
4954   re_char *d, *dend;
4955
4956   /* Used sometimes to remember where we were before starting matching
4957      an operator so that we can go back in case of failure.  This "atomic"
4958      behavior of matching opcodes is indispensable to the correctness
4959      of the on_failure_keep_string_jump optimization.  */
4960   re_char *dfail;
4961
4962   /* Where we are in the pattern, and the end of the pattern.  */
4963   re_char *p = bufp->buffer;
4964   re_char *pend = p + bufp->used;
4965
4966   /* We use this to map every character in the string.  */
4967   RE_TRANSLATE_TYPE translate = bufp->translate;
4968
4969   /* Nonzero if BUFP is setup from a multibyte regex.  */
4970   const boolean multibyte = RE_MULTIBYTE_P (bufp);
4971
4972   /* Nonzero if STRING1/STRING2 are multibyte.  */
4973   const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4974
4975   /* Failure point stack.  Each place that can handle a failure further
4976      down the line pushes a failure point on this stack.  It consists of
4977      regstart, and regend for all registers corresponding to
4978      the subexpressions we're currently inside, plus the number of such
4979      registers, and, finally, two char *'s.  The first char * is where
4980      to resume scanning the pattern; the second one is where to resume
4981      scanning the strings.  */
4982 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global.  */
4983   fail_stack_type fail_stack;
4984 #endif
4985 #ifdef DEBUG_COMPILES_ARGUMENTS
4986   unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4987 #endif
4988
4989 #if defined REL_ALLOC && defined REGEX_MALLOC
4990   /* This holds the pointer to the failure stack, when
4991      it is allocated relocatably.  */
4992   fail_stack_elt_t *failure_stack_ptr;
4993 #endif
4994
4995   /* We fill all the registers internally, independent of what we
4996      return, for use in backreferences.  The number here includes
4997      an element for register zero.  */
4998   size_t num_regs = bufp->re_nsub + 1;
4999
5000   /* Information on the contents of registers. These are pointers into
5001      the input strings; they record just what was matched (on this
5002      attempt) by a subexpression part of the pattern, that is, the
5003      regnum-th regstart pointer points to where in the pattern we began
5004      matching and the regnum-th regend points to right after where we
5005      stopped matching the regnum-th subexpression.  (The zeroth register
5006      keeps track of what the whole pattern matches.)  */
5007 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
5008   re_char **regstart, **regend;
5009 #endif
5010
5011   /* The following record the register info as found in the above
5012      variables when we find a match better than any we've seen before.
5013      This happens as we backtrack through the failure points, which in
5014      turn happens only if we have not yet matched the entire string. */
5015   unsigned best_regs_set = false;
5016 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
5017   re_char **best_regstart, **best_regend;
5018 #endif
5019
5020   /* Logically, this is `best_regend[0]'.  But we don't want to have to
5021      allocate space for that if we're not allocating space for anything
5022      else (see below).  Also, we never need info about register 0 for
5023      any of the other register vectors, and it seems rather a kludge to
5024      treat `best_regend' differently than the rest.  So we keep track of
5025      the end of the best match so far in a separate variable.  We
5026      initialize this to NULL so that when we backtrack the first time
5027      and need to test it, it's not garbage.  */
5028   re_char *match_end = NULL;
5029
5030 #ifdef DEBUG_COMPILES_ARGUMENTS
5031   /* Counts the total number of registers pushed.  */
5032   unsigned num_regs_pushed = 0;
5033 #endif
5034
5035   DEBUG_PRINT ("\n\nEntering re_match_2.\n");
5036
5037   REGEX_USE_SAFE_ALLOCA;
5038
5039   INIT_FAIL_STACK ();
5040
5041 #ifdef MATCH_MAY_ALLOCATE
5042   /* Do not bother to initialize all the register variables if there are
5043      no groups in the pattern, as it takes a fair amount of time.  If
5044      there are groups, we include space for register 0 (the whole
5045      pattern), even though we never use it, since it simplifies the
5046      array indexing.  We should fix this.  */
5047   if (bufp->re_nsub)
5048     {
5049       regstart = REGEX_TALLOC (num_regs, re_char *);
5050       regend = REGEX_TALLOC (num_regs, re_char *);
5051       best_regstart = REGEX_TALLOC (num_regs, re_char *);
5052       best_regend = REGEX_TALLOC (num_regs, re_char *);
5053
5054       if (!(regstart && regend && best_regstart && best_regend))
5055         {
5056           FREE_VARIABLES ();
5057           return -2;
5058         }
5059     }
5060   else
5061     {
5062       /* We must initialize all our variables to NULL, so that
5063          `FREE_VARIABLES' doesn't try to free them.  */
5064       regstart = regend = best_regstart = best_regend = NULL;
5065     }
5066 #endif /* MATCH_MAY_ALLOCATE */
5067
5068   /* The starting position is bogus.  */
5069   if (pos < 0 || pos > size1 + size2)
5070     {
5071       FREE_VARIABLES ();
5072       return -1;
5073     }
5074
5075   /* Initialize subexpression text positions to -1 to mark ones that no
5076      start_memory/stop_memory has been seen for. Also initialize the
5077      register information struct.  */
5078   for (reg = 1; reg < num_regs; reg++)
5079     regstart[reg] = regend[reg] = NULL;
5080
5081   /* We move `string1' into `string2' if the latter's empty -- but not if
5082      `string1' is null.  */
5083   if (size2 == 0 && string1 != NULL)
5084     {
5085       string2 = string1;
5086       size2 = size1;
5087       string1 = 0;
5088       size1 = 0;
5089     }
5090   end1 = string1 + size1;
5091   end2 = string2 + size2;
5092
5093   /* `p' scans through the pattern as `d' scans through the data.
5094      `dend' is the end of the input string that `d' points within.  `d'
5095      is advanced into the following input string whenever necessary, but
5096      this happens before fetching; therefore, at the beginning of the
5097      loop, `d' can be pointing at the end of a string, but it cannot
5098      equal `string2'.  */
5099   if (pos >= size1)
5100     {
5101       /* Only match within string2.  */
5102       d = string2 + pos - size1;
5103       dend = end_match_2 = string2 + stop - size1;
5104       end_match_1 = end1;       /* Just to give it a value.  */
5105     }
5106   else
5107     {
5108       if (stop < size1)
5109         {
5110           /* Only match within string1.  */
5111           end_match_1 = string1 + stop;
5112           /* BEWARE!
5113              When we reach end_match_1, PREFETCH normally switches to string2.
5114              But in the present case, this means that just doing a PREFETCH
5115              makes us jump from `stop' to `gap' within the string.
5116              What we really want here is for the search to stop as
5117              soon as we hit end_match_1.  That's why we set end_match_2
5118              to end_match_1 (since PREFETCH fails as soon as we hit
5119              end_match_2).  */
5120           end_match_2 = end_match_1;
5121         }
5122       else
5123         { /* It's important to use this code when stop == size so that
5124              moving `d' from end1 to string2 will not prevent the d == dend
5125              check from catching the end of string.  */
5126           end_match_1 = end1;
5127           end_match_2 = string2 + stop - size1;
5128         }
5129       d = string1 + pos;
5130       dend = end_match_1;
5131     }
5132
5133   DEBUG_PRINT ("The compiled pattern is: ");
5134   DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5135   DEBUG_PRINT ("The string to match is: \"");
5136   DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5137   DEBUG_PRINT ("\"\n");
5138
5139   /* This loops over pattern commands.  It exits by returning from the
5140      function if the match is complete, or it drops through if the match
5141      fails at this starting point in the input data.  */
5142   for (;;)
5143     {
5144       DEBUG_PRINT ("\n%p: ", p);
5145
5146       if (p == pend)
5147         {
5148           /* End of pattern means we might have succeeded.  */
5149           DEBUG_PRINT ("end of pattern ... ");
5150
5151           /* If we haven't matched the entire string, and we want the
5152              longest match, try backtracking.  */
5153           if (d != end_match_2)
5154             {
5155               /* True if this match is the best seen so far.  */
5156               bool best_match_p;
5157
5158               {
5159                 /* True if this match ends in the same string (string1
5160                    or string2) as the best previous match.  */
5161                 bool same_str_p = (FIRST_STRING_P (match_end)
5162                                    == FIRST_STRING_P (d));
5163
5164                 /* AIX compiler got confused when this was combined
5165                    with the previous declaration.  */
5166                 if (same_str_p)
5167                   best_match_p = d > match_end;
5168                 else
5169                   best_match_p = !FIRST_STRING_P (d);
5170               }
5171
5172               DEBUG_PRINT ("backtracking.\n");
5173
5174               if (!FAIL_STACK_EMPTY ())
5175                 { /* More failure points to try.  */
5176
5177                   /* If exceeds best match so far, save it.  */
5178                   if (!best_regs_set || best_match_p)
5179                     {
5180                       best_regs_set = true;
5181                       match_end = d;
5182
5183                       DEBUG_PRINT ("\nSAVING match as best so far.\n");
5184
5185                       for (reg = 1; reg < num_regs; reg++)
5186                         {
5187                           best_regstart[reg] = regstart[reg];
5188                           best_regend[reg] = regend[reg];
5189                         }
5190                     }
5191                   goto fail;
5192                 }
5193
5194               /* If no failure points, don't restore garbage.  And if
5195                  last match is real best match, don't restore second
5196                  best one. */
5197               else if (best_regs_set && !best_match_p)
5198                 {
5199                 restore_best_regs:
5200                   /* Restore best match.  It may happen that `dend ==
5201                      end_match_1' while the restored d is in string2.
5202                      For example, the pattern `x.*y.*z' against the
5203                      strings `x-' and `y-z-', if the two strings are
5204                      not consecutive in memory.  */
5205                   DEBUG_PRINT ("Restoring best registers.\n");
5206
5207                   d = match_end;
5208                   dend = ((d >= string1 && d <= end1)
5209                            ? end_match_1 : end_match_2);
5210
5211                   for (reg = 1; reg < num_regs; reg++)
5212                     {
5213                       regstart[reg] = best_regstart[reg];
5214                       regend[reg] = best_regend[reg];
5215                     }
5216                 }
5217             } /* d != end_match_2 */
5218
5219         succeed_label:
5220           DEBUG_PRINT ("Accepting match.\n");
5221
5222           /* If caller wants register contents data back, do it.  */
5223           if (regs && !bufp->no_sub)
5224             {
5225               /* Have the register data arrays been allocated?  */
5226               if (bufp->regs_allocated == REGS_UNALLOCATED)
5227                 { /* No.  So allocate them with malloc.  We need one
5228                      extra element beyond `num_regs' for the `-1' marker
5229                      GNU code uses.  */
5230                   regs->num_regs = max (RE_NREGS, num_regs + 1);
5231                   regs->start = TALLOC (regs->num_regs, regoff_t);
5232                   regs->end = TALLOC (regs->num_regs, regoff_t);
5233                   if (regs->start == NULL || regs->end == NULL)
5234                     {
5235                       FREE_VARIABLES ();
5236                       return -2;
5237                     }
5238                   bufp->regs_allocated = REGS_REALLOCATE;
5239                 }
5240               else if (bufp->regs_allocated == REGS_REALLOCATE)
5241                 { /* Yes.  If we need more elements than were already
5242                      allocated, reallocate them.  If we need fewer, just
5243                      leave it alone.  */
5244                   if (regs->num_regs < num_regs + 1)
5245                     {
5246                       regs->num_regs = num_regs + 1;
5247                       RETALLOC (regs->start, regs->num_regs, regoff_t);
5248                       RETALLOC (regs->end, regs->num_regs, regoff_t);
5249                       if (regs->start == NULL || regs->end == NULL)
5250                         {
5251                           FREE_VARIABLES ();
5252                           return -2;
5253                         }
5254                     }
5255                 }
5256               else
5257                 {
5258                   /* These braces fend off a "empty body in an else-statement"
5259                      warning under GCC when assert expands to nothing.  */
5260                   assert (bufp->regs_allocated == REGS_FIXED);
5261                 }
5262
5263               /* Convert the pointer data in `regstart' and `regend' to
5264                  indices.  Register zero has to be set differently,
5265                  since we haven't kept track of any info for it.  */
5266               if (regs->num_regs > 0)
5267                 {
5268                   regs->start[0] = pos;
5269                   regs->end[0] = POINTER_TO_OFFSET (d);
5270                 }
5271
5272               /* Go through the first `min (num_regs, regs->num_regs)'
5273                  registers, since that is all we initialized.  */
5274               for (reg = 1; reg < min (num_regs, regs->num_regs); reg++)
5275                 {
5276                   if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5277                     regs->start[reg] = regs->end[reg] = -1;
5278                   else
5279                     {
5280                       regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
5281                       regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
5282                     }
5283                 }
5284
5285               /* If the regs structure we return has more elements than
5286                  were in the pattern, set the extra elements to -1.  If
5287                  we (re)allocated the registers, this is the case,
5288                  because we always allocate enough to have at least one
5289                  -1 at the end.  */
5290               for (reg = num_regs; reg < regs->num_regs; reg++)
5291                 regs->start[reg] = regs->end[reg] = -1;
5292             } /* regs && !bufp->no_sub */
5293
5294           DEBUG_PRINT ("%u failure points pushed, %u popped (%u remain).\n",
5295                        nfailure_points_pushed, nfailure_points_popped,
5296                        nfailure_points_pushed - nfailure_points_popped);
5297           DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed);
5298
5299           ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos;
5300
5301           DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
5302
5303           FREE_VARIABLES ();
5304           return dcnt;
5305         }
5306
5307       /* Otherwise match next pattern command.  */
5308       switch (*p++)
5309         {
5310         /* Ignore these.  Used to ignore the n of succeed_n's which
5311            currently have n == 0.  */
5312         case no_op:
5313           DEBUG_PRINT ("EXECUTING no_op.\n");
5314           break;
5315
5316         case succeed:
5317           DEBUG_PRINT ("EXECUTING succeed.\n");
5318           goto succeed_label;
5319
5320         /* Match the next n pattern characters exactly.  The following
5321            byte in the pattern defines n, and the n bytes after that
5322            are the characters to match.  */
5323         case exactn:
5324           mcnt = *p++;
5325           DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
5326
5327           /* Remember the start point to rollback upon failure.  */
5328           dfail = d;
5329
5330 #ifndef emacs
5331           /* This is written out as an if-else so we don't waste time
5332              testing `translate' inside the loop.  */
5333           if (RE_TRANSLATE_P (translate))
5334             do
5335               {
5336                 PREFETCH ();
5337                 if (RE_TRANSLATE (translate, *d) != *p++)
5338                   {
5339                     d = dfail;
5340                     goto fail;
5341                   }
5342                 d++;
5343               }
5344             while (--mcnt);
5345           else
5346             do
5347               {
5348                 PREFETCH ();
5349                 if (*d++ != *p++)
5350                   {
5351                     d = dfail;
5352                     goto fail;
5353                   }
5354               }
5355             while (--mcnt);
5356 #else  /* emacs */
5357           /* The cost of testing `translate' is comparatively small.  */
5358           if (target_multibyte)
5359             do
5360               {
5361                 int pat_charlen, buf_charlen;
5362                 int pat_ch, buf_ch;
5363
5364                 PREFETCH ();
5365                 if (multibyte)
5366                   pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5367                 else
5368                   {
5369                     pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5370                     pat_charlen = 1;
5371                   }
5372                 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
5373
5374                 if (TRANSLATE (buf_ch) != pat_ch)
5375                   {
5376                     d = dfail;
5377                     goto fail;
5378                   }
5379
5380                 p += pat_charlen;
5381                 d += buf_charlen;
5382                 mcnt -= pat_charlen;
5383               }
5384             while (mcnt > 0);
5385           else
5386             do
5387               {
5388                 int pat_charlen;
5389                 int pat_ch, buf_ch;
5390
5391                 PREFETCH ();
5392                 if (multibyte)
5393                   {
5394                     pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5395                     pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5396                   }
5397                 else
5398                   {
5399                     pat_ch = *p;
5400                     pat_charlen = 1;
5401                   }
5402                 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5403                 if (! CHAR_BYTE8_P (buf_ch))
5404                   {
5405                     buf_ch = TRANSLATE (buf_ch);
5406                     buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5407                     if (buf_ch < 0)
5408                       buf_ch = *d;
5409                   }
5410                 else
5411                   buf_ch = *d;
5412                 if (buf_ch != pat_ch)
5413                   {
5414                     d = dfail;
5415                     goto fail;
5416                   }
5417                 p += pat_charlen;
5418                 d++;
5419               }
5420             while (--mcnt);
5421 #endif
5422           break;
5423
5424
5425         /* Match any character except possibly a newline or a null.  */
5426         case anychar:
5427           {
5428             int buf_charlen;
5429             re_wchar_t buf_ch;
5430             reg_syntax_t syntax;
5431
5432             DEBUG_PRINT ("EXECUTING anychar.\n");
5433
5434             PREFETCH ();
5435             buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
5436                                                 target_multibyte);
5437             buf_ch = TRANSLATE (buf_ch);
5438
5439 #ifdef emacs
5440             syntax = RE_SYNTAX_EMACS;
5441 #else
5442             syntax = bufp->syntax;
5443 #endif
5444
5445             if ((!(syntax & RE_DOT_NEWLINE) && buf_ch == '\n')
5446                 || ((syntax & RE_DOT_NOT_NULL) && buf_ch == '\000'))
5447               goto fail;
5448
5449             DEBUG_PRINT ("  Matched \"%d\".\n", *d);
5450             d += buf_charlen;
5451           }
5452           break;
5453
5454
5455         case charset:
5456         case charset_not:
5457           {
5458             register unsigned int c, corig;
5459             int len;
5460
5461             /* Whether matching against a unibyte character.  */
5462             boolean unibyte_char = false;
5463
5464             DEBUG_PRINT ("EXECUTING charset%s.\n",
5465                          (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
5466
5467             PREFETCH ();
5468             corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
5469             if (target_multibyte)
5470               {
5471                 int c1;
5472
5473                 c = TRANSLATE (c);
5474                 c1 = RE_CHAR_TO_UNIBYTE (c);
5475                 if (c1 >= 0)
5476                   {
5477                     unibyte_char = true;
5478                     c = c1;
5479                   }
5480               }
5481             else
5482               {
5483                 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5484
5485                 if (! CHAR_BYTE8_P (c1))
5486                   {
5487                     c1 = TRANSLATE (c1);
5488                     c1 = RE_CHAR_TO_UNIBYTE (c1);
5489                     if (c1 >= 0)
5490                       {
5491                         unibyte_char = true;
5492                         c = c1;
5493                       }
5494                   }
5495                 else
5496                   unibyte_char = true;
5497               }
5498
5499             p -= 1;
5500             if (!execute_charset (&p, c, corig, unibyte_char))
5501               goto fail;
5502
5503             d += len;
5504           }
5505           break;
5506
5507
5508         /* The beginning of a group is represented by start_memory.
5509            The argument is the register number.  The text
5510            matched within the group is recorded (in the internal
5511            registers data structure) under the register number.  */
5512         case start_memory:
5513           DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
5514
5515           /* In case we need to undo this operation (via backtracking).  */
5516           PUSH_FAILURE_REG (*p);
5517
5518           regstart[*p] = d;
5519           regend[*p] = NULL;    /* probably unnecessary.  -sm  */
5520           DEBUG_PRINT ("  regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
5521
5522           /* Move past the register number and inner group count.  */
5523           p += 1;
5524           break;
5525
5526
5527         /* The stop_memory opcode represents the end of a group.  Its
5528            argument is the same as start_memory's: the register number.  */
5529         case stop_memory:
5530           DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
5531
5532           assert (!REG_UNSET (regstart[*p]));
5533           /* Strictly speaking, there should be code such as:
5534
5535                 assert (REG_UNSET (regend[*p]));
5536                 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5537
5538              But the only info to be pushed is regend[*p] and it is known to
5539              be UNSET, so there really isn't anything to push.
5540              Not pushing anything, on the other hand deprives us from the
5541              guarantee that regend[*p] is UNSET since undoing this operation
5542              will not reset its value properly.  This is not important since
5543              the value will only be read on the next start_memory or at
5544              the very end and both events can only happen if this stop_memory
5545              is *not* undone.  */
5546
5547           regend[*p] = d;
5548           DEBUG_PRINT ("      regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
5549
5550           /* Move past the register number and the inner group count.  */
5551           p += 1;
5552           break;
5553
5554
5555         /* \<digit> has been turned into a `duplicate' command which is
5556            followed by the numeric value of <digit> as the register number.  */
5557         case duplicate:
5558           {
5559             register re_char *d2, *dend2;
5560             int regno = *p++;   /* Get which register to match against.  */
5561             DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
5562
5563             /* Can't back reference a group which we've never matched.  */
5564             if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5565               goto fail;
5566
5567             /* Where in input to try to start matching.  */
5568             d2 = regstart[regno];
5569
5570             /* Remember the start point to rollback upon failure.  */
5571             dfail = d;
5572
5573             /* Where to stop matching; if both the place to start and
5574                the place to stop matching are in the same string, then
5575                set to the place to stop, otherwise, for now have to use
5576                the end of the first string.  */
5577
5578             dend2 = ((FIRST_STRING_P (regstart[regno])
5579                       == FIRST_STRING_P (regend[regno]))
5580                      ? regend[regno] : end_match_1);
5581             for (;;)
5582               {
5583                 ptrdiff_t dcnt;
5584
5585                 /* If necessary, advance to next segment in register
5586                    contents.  */
5587                 while (d2 == dend2)
5588                   {
5589                     if (dend2 == end_match_2) break;
5590                     if (dend2 == regend[regno]) break;
5591
5592                     /* End of string1 => advance to string2. */
5593                     d2 = string2;
5594                     dend2 = regend[regno];
5595                   }
5596                 /* At end of register contents => success */
5597                 if (d2 == dend2) break;
5598
5599                 /* If necessary, advance to next segment in data.  */
5600                 PREFETCH ();
5601
5602                 /* How many characters left in this segment to match.  */
5603                 dcnt = dend - d;
5604
5605                 /* Want how many consecutive characters we can match in
5606                    one shot, so, if necessary, adjust the count.  */
5607                 if (dcnt > dend2 - d2)
5608                   dcnt = dend2 - d2;
5609
5610                 /* Compare that many; failure if mismatch, else move
5611                    past them.  */
5612                 if (RE_TRANSLATE_P (translate)
5613                     ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
5614                     : memcmp (d, d2, dcnt))
5615                   {
5616                     d = dfail;
5617                     goto fail;
5618                   }
5619                 d += dcnt, d2 += dcnt;
5620               }
5621           }
5622           break;
5623
5624
5625         /* begline matches the empty string at the beginning of the string
5626            (unless `not_bol' is set in `bufp'), and after newlines.  */
5627         case begline:
5628           DEBUG_PRINT ("EXECUTING begline.\n");
5629
5630           if (AT_STRINGS_BEG (d))
5631             {
5632               if (!bufp->not_bol) break;
5633             }
5634           else
5635             {
5636               unsigned c;
5637               GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5638               if (c == '\n')
5639                 break;
5640             }
5641           /* In all other cases, we fail.  */
5642           goto fail;
5643
5644
5645         /* endline is the dual of begline.  */
5646         case endline:
5647           DEBUG_PRINT ("EXECUTING endline.\n");
5648
5649           if (AT_STRINGS_END (d))
5650             {
5651               if (!bufp->not_eol) break;
5652             }
5653           else
5654             {
5655               PREFETCH_NOLIMIT ();
5656               if (*d == '\n')
5657                 break;
5658             }
5659           goto fail;
5660
5661
5662         /* Match at the very beginning of the data.  */
5663         case begbuf:
5664           DEBUG_PRINT ("EXECUTING begbuf.\n");
5665           if (AT_STRINGS_BEG (d))
5666             break;
5667           goto fail;
5668
5669
5670         /* Match at the very end of the data.  */
5671         case endbuf:
5672           DEBUG_PRINT ("EXECUTING endbuf.\n");
5673           if (AT_STRINGS_END (d))
5674             break;
5675           goto fail;
5676
5677
5678         /* on_failure_keep_string_jump is used to optimize `.*\n'.  It
5679            pushes NULL as the value for the string on the stack.  Then
5680            `POP_FAILURE_POINT' will keep the current value for the
5681            string, instead of restoring it.  To see why, consider
5682            matching `foo\nbar' against `.*\n'.  The .* matches the foo;
5683            then the . fails against the \n.  But the next thing we want
5684            to do is match the \n against the \n; if we restored the
5685            string value, we would be back at the foo.
5686
5687            Because this is used only in specific cases, we don't need to
5688            check all the things that `on_failure_jump' does, to make
5689            sure the right things get saved on the stack.  Hence we don't
5690            share its code.  The only reason to push anything on the
5691            stack at all is that otherwise we would have to change
5692            `anychar's code to do something besides goto fail in this
5693            case; that seems worse than this.  */
5694         case on_failure_keep_string_jump:
5695           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5696           DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5697                        mcnt, p + mcnt);
5698
5699           PUSH_FAILURE_POINT (p - 3, NULL);
5700           break;
5701
5702           /* A nasty loop is introduced by the non-greedy *? and +?.
5703              With such loops, the stack only ever contains one failure point
5704              at a time, so that a plain on_failure_jump_loop kind of
5705              cycle detection cannot work.  Worse yet, such a detection
5706              can not only fail to detect a cycle, but it can also wrongly
5707              detect a cycle (between different instantiations of the same
5708              loop).
5709              So the method used for those nasty loops is a little different:
5710              We use a special cycle-detection-stack-frame which is pushed
5711              when the on_failure_jump_nastyloop failure-point is *popped*.
5712              This special frame thus marks the beginning of one iteration
5713              through the loop and we can hence easily check right here
5714              whether something matched between the beginning and the end of
5715              the loop.  */
5716         case on_failure_jump_nastyloop:
5717           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5718           DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5719                        mcnt, p + mcnt);
5720
5721           assert ((re_opcode_t)p[-4] == no_op);
5722           {
5723             int cycle = 0;
5724             CHECK_INFINITE_LOOP (p - 4, d);
5725             if (!cycle)
5726               /* If there's a cycle, just continue without pushing
5727                  this failure point.  The failure point is the "try again"
5728                  option, which shouldn't be tried.
5729                  We want (x?)*?y\1z to match both xxyz and xxyxz.  */
5730               PUSH_FAILURE_POINT (p - 3, d);
5731           }
5732           break;
5733
5734           /* Simple loop detecting on_failure_jump:  just check on the
5735              failure stack if the same spot was already hit earlier.  */
5736         case on_failure_jump_loop:
5737         on_failure:
5738           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5739           DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5740                        mcnt, p + mcnt);
5741           {
5742             int cycle = 0;
5743             CHECK_INFINITE_LOOP (p - 3, d);
5744             if (cycle)
5745               /* If there's a cycle, get out of the loop, as if the matching
5746                  had failed.  We used to just `goto fail' here, but that was
5747                  aborting the search a bit too early: we want to keep the
5748                  empty-loop-match and keep matching after the loop.
5749                  We want (x?)*y\1z to match both xxyz and xxyxz.  */
5750               p += mcnt;
5751             else
5752               PUSH_FAILURE_POINT (p - 3, d);
5753           }
5754           break;
5755
5756
5757         /* Uses of on_failure_jump:
5758
5759            Each alternative starts with an on_failure_jump that points
5760            to the beginning of the next alternative.  Each alternative
5761            except the last ends with a jump that in effect jumps past
5762            the rest of the alternatives.  (They really jump to the
5763            ending jump of the following alternative, because tensioning
5764            these jumps is a hassle.)
5765
5766            Repeats start with an on_failure_jump that points past both
5767            the repetition text and either the following jump or
5768            pop_failure_jump back to this on_failure_jump.  */
5769         case on_failure_jump:
5770           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5771           DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
5772                        mcnt, p + mcnt);
5773
5774           PUSH_FAILURE_POINT (p -3, d);
5775           break;
5776
5777         /* This operation is used for greedy *.
5778            Compare the beginning of the repeat with what in the
5779            pattern follows its end. If we can establish that there
5780            is nothing that they would both match, i.e., that we
5781            would have to backtrack because of (as in, e.g., `a*a')
5782            then we can use a non-backtracking loop based on
5783            on_failure_keep_string_jump instead of on_failure_jump.  */
5784         case on_failure_jump_smart:
5785           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5786           DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5787                        mcnt, p + mcnt);
5788           {
5789             re_char *p1 = p; /* Next operation.  */
5790             /* Here, we discard `const', making re_match non-reentrant.  */
5791             unsigned char *p2 = (unsigned char *) p + mcnt; /* Jump dest.  */
5792             unsigned char *p3 = (unsigned char *) p - 3; /* opcode location.  */
5793
5794             p -= 3;             /* Reset so that we will re-execute the
5795                                    instruction once it's been changed. */
5796
5797             EXTRACT_NUMBER (mcnt, p2 - 2);
5798
5799             /* Ensure this is indeed the trivial kind of loop
5800                we are expecting.  */
5801             assert (skip_one_char (p1) == p2 - 3);
5802             assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
5803             DEBUG_STATEMENT (debug += 2);
5804             if (mutually_exclusive_p (bufp, p1, p2))
5805               {
5806                 /* Use a fast `on_failure_keep_string_jump' loop.  */
5807                 DEBUG_PRINT ("  smart exclusive => fast loop.\n");
5808                 *p3 = (unsigned char) on_failure_keep_string_jump;
5809                 STORE_NUMBER (p2 - 2, mcnt + 3);
5810               }
5811             else
5812               {
5813                 /* Default to a safe `on_failure_jump' loop.  */
5814                 DEBUG_PRINT ("  smart default => slow loop.\n");
5815                 *p3 = (unsigned char) on_failure_jump;
5816               }
5817             DEBUG_STATEMENT (debug -= 2);
5818           }
5819           break;
5820
5821         /* Unconditionally jump (without popping any failure points).  */
5822         case jump:
5823         unconditional_jump:
5824           maybe_quit ();
5825           EXTRACT_NUMBER_AND_INCR (mcnt, p);    /* Get the amount to jump.  */
5826           DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
5827           p += mcnt;                            /* Do the jump.  */
5828           DEBUG_PRINT ("(to %p).\n", p);
5829           break;
5830
5831
5832         /* Have to succeed matching what follows at least n times.
5833            After that, handle like `on_failure_jump'.  */
5834         case succeed_n:
5835           /* Signedness doesn't matter since we only compare MCNT to 0.  */
5836           EXTRACT_NUMBER (mcnt, p + 2);
5837           DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
5838
5839           /* Originally, mcnt is how many times we HAVE to succeed.  */
5840           if (mcnt != 0)
5841             {
5842               /* Here, we discard `const', making re_match non-reentrant.  */
5843               unsigned char *p2 = (unsigned char *) p + 2; /* counter loc.  */
5844               mcnt--;
5845               p += 4;
5846               PUSH_NUMBER (p2, mcnt);
5847             }
5848           else
5849             /* The two bytes encoding mcnt == 0 are two no_op opcodes.  */
5850             goto on_failure;
5851           break;
5852
5853         case jump_n:
5854           /* Signedness doesn't matter since we only compare MCNT to 0.  */
5855           EXTRACT_NUMBER (mcnt, p + 2);
5856           DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
5857
5858           /* Originally, this is how many times we CAN jump.  */
5859           if (mcnt != 0)
5860             {
5861                /* Here, we discard `const', making re_match non-reentrant.  */
5862               unsigned char *p2 = (unsigned char *) p + 2; /* counter loc.  */
5863               mcnt--;
5864               PUSH_NUMBER (p2, mcnt);
5865               goto unconditional_jump;
5866             }
5867           /* If don't have to jump any more, skip over the rest of command.  */
5868           else
5869             p += 4;
5870           break;
5871
5872         case set_number_at:
5873           {
5874             unsigned char *p2;  /* Location of the counter.  */
5875             DEBUG_PRINT ("EXECUTING set_number_at.\n");
5876
5877             EXTRACT_NUMBER_AND_INCR (mcnt, p);
5878             /* Here, we discard `const', making re_match non-reentrant.  */
5879             p2 = (unsigned char *) p + mcnt;
5880             /* Signedness doesn't matter since we only copy MCNT's bits.  */
5881             EXTRACT_NUMBER_AND_INCR (mcnt, p);
5882             DEBUG_PRINT ("  Setting %p to %d.\n", p2, mcnt);
5883             PUSH_NUMBER (p2, mcnt);
5884             break;
5885           }
5886
5887         case wordbound:
5888         case notwordbound:
5889           {
5890             boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5891             DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
5892
5893             /* We SUCCEED (or FAIL) in one of the following cases: */
5894
5895             /* Case 1: D is at the beginning or the end of string.  */
5896             if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5897               not = !not;
5898             else
5899               {
5900                 /* C1 is the character before D, S1 is the syntax of C1, C2
5901                    is the character at D, and S2 is the syntax of C2.  */
5902                 re_wchar_t c1, c2;
5903                 int s1, s2;
5904                 int dummy;
5905 #ifdef emacs
5906                 ssize_t offset = PTR_TO_OFFSET (d - 1);
5907                 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5908                 UPDATE_SYNTAX_TABLE (charpos);
5909 #endif
5910                 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5911                 s1 = SYNTAX (c1);
5912 #ifdef emacs
5913                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5914 #endif
5915                 PREFETCH_NOLIMIT ();
5916                 GET_CHAR_AFTER (c2, d, dummy);
5917                 s2 = SYNTAX (c2);
5918
5919                 if (/* Case 2: Only one of S1 and S2 is Sword.  */
5920                     ((s1 == Sword) != (s2 == Sword))
5921                     /* Case 3: Both of S1 and S2 are Sword, and macro
5922                        WORD_BOUNDARY_P (C1, C2) returns nonzero.  */
5923                     || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5924                   not = !not;
5925               }
5926             if (not)
5927               break;
5928             else
5929               goto fail;
5930           }
5931
5932         case wordbeg:
5933           DEBUG_PRINT ("EXECUTING wordbeg.\n");
5934
5935           /* We FAIL in one of the following cases: */
5936
5937           /* Case 1: D is at the end of string.  */
5938           if (AT_STRINGS_END (d))
5939             goto fail;
5940           else
5941             {
5942               /* C1 is the character before D, S1 is the syntax of C1, C2
5943                  is the character at D, and S2 is the syntax of C2.  */
5944               re_wchar_t c1, c2;
5945               int s1, s2;
5946               int dummy;
5947 #ifdef emacs
5948               ssize_t offset = PTR_TO_OFFSET (d);
5949               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5950               UPDATE_SYNTAX_TABLE (charpos);
5951 #endif
5952               PREFETCH ();
5953               GET_CHAR_AFTER (c2, d, dummy);
5954               s2 = SYNTAX (c2);
5955
5956               /* Case 2: S2 is not Sword. */
5957               if (s2 != Sword)
5958                 goto fail;
5959
5960               /* Case 3: D is not at the beginning of string ... */
5961               if (!AT_STRINGS_BEG (d))
5962                 {
5963                   GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5964 #ifdef emacs
5965                   UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5966 #endif
5967                   s1 = SYNTAX (c1);
5968
5969                   /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5970                      returns 0.  */
5971                   if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5972                     goto fail;
5973                 }
5974             }
5975           break;
5976
5977         case wordend:
5978           DEBUG_PRINT ("EXECUTING wordend.\n");
5979
5980           /* We FAIL in one of the following cases: */
5981
5982           /* Case 1: D is at the beginning of string.  */
5983           if (AT_STRINGS_BEG (d))
5984             goto fail;
5985           else
5986             {
5987               /* C1 is the character before D, S1 is the syntax of C1, C2
5988                  is the character at D, and S2 is the syntax of C2.  */
5989               re_wchar_t c1, c2;
5990               int s1, s2;
5991               int dummy;
5992 #ifdef emacs
5993               ssize_t offset = PTR_TO_OFFSET (d) - 1;
5994               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5995               UPDATE_SYNTAX_TABLE (charpos);
5996 #endif
5997               GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5998               s1 = SYNTAX (c1);
5999
6000               /* Case 2: S1 is not Sword.  */
6001               if (s1 != Sword)
6002                 goto fail;
6003
6004               /* Case 3: D is not at the end of string ... */
6005               if (!AT_STRINGS_END (d))
6006                 {
6007                   PREFETCH_NOLIMIT ();
6008                   GET_CHAR_AFTER (c2, d, dummy);
6009 #ifdef emacs
6010                   UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6011 #endif
6012                   s2 = SYNTAX (c2);
6013
6014                   /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
6015                      returns 0.  */
6016                   if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6017           goto fail;
6018                 }
6019             }
6020           break;
6021
6022         case symbeg:
6023           DEBUG_PRINT ("EXECUTING symbeg.\n");
6024
6025           /* We FAIL in one of the following cases: */
6026
6027           /* Case 1: D is at the end of string.  */
6028           if (AT_STRINGS_END (d))
6029             goto fail;
6030           else
6031             {
6032               /* C1 is the character before D, S1 is the syntax of C1, C2
6033                  is the character at D, and S2 is the syntax of C2.  */
6034               re_wchar_t c1, c2;
6035               int s1, s2;
6036 #ifdef emacs
6037               ssize_t offset = PTR_TO_OFFSET (d);
6038               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6039               UPDATE_SYNTAX_TABLE (charpos);
6040 #endif
6041               PREFETCH ();
6042               c2 = RE_STRING_CHAR (d, target_multibyte);
6043               s2 = SYNTAX (c2);
6044
6045               /* Case 2: S2 is neither Sword nor Ssymbol. */
6046               if (s2 != Sword && s2 != Ssymbol)
6047                 goto fail;
6048
6049               /* Case 3: D is not at the beginning of string ... */
6050               if (!AT_STRINGS_BEG (d))
6051                 {
6052                   GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6053 #ifdef emacs
6054                   UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6055 #endif
6056                   s1 = SYNTAX (c1);
6057
6058                   /* ... and S1 is Sword or Ssymbol.  */
6059                   if (s1 == Sword || s1 == Ssymbol)
6060                     goto fail;
6061                 }
6062             }
6063           break;
6064
6065         case symend:
6066           DEBUG_PRINT ("EXECUTING symend.\n");
6067
6068           /* We FAIL in one of the following cases: */
6069
6070           /* Case 1: D is at the beginning of string.  */
6071           if (AT_STRINGS_BEG (d))
6072             goto fail;
6073           else
6074             {
6075               /* C1 is the character before D, S1 is the syntax of C1, C2
6076                  is the character at D, and S2 is the syntax of C2.  */
6077               re_wchar_t c1, c2;
6078               int s1, s2;
6079 #ifdef emacs
6080               ssize_t offset = PTR_TO_OFFSET (d) - 1;
6081               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6082               UPDATE_SYNTAX_TABLE (charpos);
6083 #endif
6084               GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6085               s1 = SYNTAX (c1);
6086
6087               /* Case 2: S1 is neither Ssymbol nor Sword.  */
6088               if (s1 != Sword && s1 != Ssymbol)
6089                 goto fail;
6090
6091               /* Case 3: D is not at the end of string ... */
6092               if (!AT_STRINGS_END (d))
6093                 {
6094                   PREFETCH_NOLIMIT ();
6095                   c2 = RE_STRING_CHAR (d, target_multibyte);
6096 #ifdef emacs
6097                   UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
6098 #endif
6099                   s2 = SYNTAX (c2);
6100
6101                   /* ... and S2 is Sword or Ssymbol.  */
6102                   if (s2 == Sword || s2 == Ssymbol)
6103                     goto fail;
6104                 }
6105             }
6106           break;
6107
6108         case syntaxspec:
6109         case notsyntaxspec:
6110           {
6111             boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6112             mcnt = *p++;
6113             DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
6114                          mcnt);
6115             PREFETCH ();
6116 #ifdef emacs
6117             {
6118               ssize_t offset = PTR_TO_OFFSET (d);
6119               ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6120               UPDATE_SYNTAX_TABLE (pos1);
6121             }
6122 #endif
6123             {
6124               int len;
6125               re_wchar_t c;
6126
6127               GET_CHAR_AFTER (c, d, len);
6128               if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6129                 goto fail;
6130               d += len;
6131             }
6132           }
6133           break;
6134
6135 #ifdef emacs
6136         case at_dot:
6137           DEBUG_PRINT ("EXECUTING at_dot.\n");
6138           if (PTR_BYTE_POS (d) != PT_BYTE)
6139             goto fail;
6140           break;
6141
6142         case categoryspec:
6143         case notcategoryspec:
6144           {
6145             boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6146             mcnt = *p++;
6147             DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
6148                          not ? "not" : "", mcnt);
6149             PREFETCH ();
6150
6151             {
6152               int len;
6153               re_wchar_t c;
6154               GET_CHAR_AFTER (c, d, len);
6155               if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6156                 goto fail;
6157               d += len;
6158             }
6159           }
6160           break;
6161
6162 #endif /* emacs */
6163
6164         default:
6165           abort ();
6166         }
6167       continue;  /* Successfully executed one pattern command; keep going.  */
6168
6169
6170     /* We goto here if a matching operation fails. */
6171     fail:
6172       maybe_quit ();
6173       if (!FAIL_STACK_EMPTY ())
6174         {
6175           re_char *str, *pat;
6176           /* A restart point is known.  Restore to that state.  */
6177           DEBUG_PRINT ("\nFAIL:\n");
6178           POP_FAILURE_POINT (str, pat);
6179           switch (*pat++)
6180             {
6181             case on_failure_keep_string_jump:
6182               assert (str == NULL);
6183               goto continue_failure_jump;
6184
6185             case on_failure_jump_nastyloop:
6186               assert ((re_opcode_t)pat[-2] == no_op);
6187               PUSH_FAILURE_POINT (pat - 2, str);
6188               FALLTHROUGH;
6189             case on_failure_jump_loop:
6190             case on_failure_jump:
6191             case succeed_n:
6192               d = str;
6193             continue_failure_jump:
6194               EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6195               p = pat + mcnt;
6196               break;
6197
6198             case no_op:
6199               /* A special frame used for nastyloops. */
6200               goto fail;
6201
6202             default:
6203               abort ();
6204             }
6205
6206           assert (p >= bufp->buffer && p <= pend);
6207
6208           if (d >= string1 && d <= end1)
6209             dend = end_match_1;
6210         }
6211       else
6212         break;   /* Matching at this starting point really fails.  */
6213     } /* for (;;) */
6214
6215   if (best_regs_set)
6216     goto restore_best_regs;
6217
6218   FREE_VARIABLES ();
6219
6220   return -1;                            /* Failure to match.  */
6221 }
6222 \f
6223 /* Subroutine definitions for re_match_2.  */
6224
6225 /* Compare the LEN bytes at S1 and at S2 one character at a time,
6226    after mapping each character through TRANSLATE.  Return 0 if all
6227    pairs agree and both spans end together, 1 otherwise.  */
6228
6229 static int
6230 bcmp_translate (re_char *s1, re_char *s2, ssize_t len,
6231                 RE_TRANSLATE_TYPE translate, const int target_multibyte)
6232 {
6233   re_char *lhs = s1, *lhs_end = s1 + len;
6234   re_char *rhs = s2, *rhs_end = s2 + len;
6235
6236   /* FIXME: both cursors are bounds-checked as though the spans could
6237      differ in length, yet a single LEN describes them both.  -sm  */
6238   for (;;)
6239     {
6240       int lhs_len, rhs_len;
6241       re_wchar_t lhs_ch, rhs_ch;
6242
6243       if (lhs >= lhs_end || rhs >= rhs_end)
6244         break;
6245       /* NOTE(review): presumably this macro reads TARGET_MULTIBYTE.  */
6246       GET_CHAR_AFTER (lhs_ch, lhs, lhs_len);
6247       GET_CHAR_AFTER (rhs_ch, rhs, rhs_len);
6248       if (RE_TRANSLATE (translate, lhs_ch)
6249           != RE_TRANSLATE (translate, rhs_ch))
6250         return 1;
6251
6252       lhs += lhs_len;
6253       rhs += rhs_len;
6254     }
6255
6256   return lhs != lhs_end || rhs != rhs_end;
6257 }
6258 \f
6259 /* Entry points for GNU code.  */
6260
6261 /* re_compile_pattern is the GNU regular expression compiler: it
6262    compiles PATTERN (of length SIZE) and puts the result in BUFP.
6263    Returns 0 if the pattern was valid, otherwise an error string.
6264
6265    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6266    are set in BUFP on entry.
6267
6268    We call regex_compile to do the actual compilation.  */
6269
6270 const char *
6271 re_compile_pattern (const char *pattern, size_t length,
6272 #ifdef emacs
6273                     bool posix_backtracking, const char *whitespace_regexp,
6274 #endif
6275                     struct re_pattern_buffer *bufp)
6276 {
6277   reg_errcode_t ret;
6278
6279   /* GNU code is written to assume at least RE_NREGS registers will be set
6280      (and at least one extra will be -1).  */
6281   bufp->regs_allocated = REGS_UNALLOCATED;
6282
6283   /* And GNU code determines whether or not to get register information
6284      by passing null for the REGS argument to re_match, etc., not by
6285      setting no_sub.  */
6286   bufp->no_sub = 0;
6287
6288   ret = regex_compile ((re_char *) pattern, length,
6289 #ifdef emacs
6290                        posix_backtracking,
6291                        whitespace_regexp,
6292 #else
6293                        re_syntax_options,
6294 #endif
6295                        bufp);
6296
6297   if (!ret)
6298     return NULL;
6299   return gettext (re_error_msgid[(int) ret]);
6300 }
6301 WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
6302 \f
6303 /* Entry points compatible with 4.2 BSD regex library.  We don't define
6304    them unless specifically requested.  */
6305
6306 #if defined _REGEX_RE_COMP || defined _LIBC
6307
6308 /* BSD has one and only one pattern buffer.  */
6309 static struct re_pattern_buffer re_comp_buf;
6310
6311 char *
6312 # ifdef _LIBC
6313 /* Make these definitions weak in libc, so POSIX programs can redefine
6314    these names if they don't use our functions, and still use
6315    regcomp/regexec below without link errors.  */
6316 weak_function
6317 # endif
6318 re_comp (const char *s)
6319 {
6320   reg_errcode_t ret;
6321
6322   if (!s)
6323     {
6324       if (!re_comp_buf.buffer)
6325         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6326         return (char *) gettext ("No previous regular expression");
6327       return 0;
6328     }
6329
6330   if (!re_comp_buf.buffer)
6331     {
6332       re_comp_buf.buffer = malloc (200);
6333       if (re_comp_buf.buffer == NULL)
6334         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6335         return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6336       re_comp_buf.allocated = 200;
6337
6338       re_comp_buf.fastmap = malloc (1 << BYTEWIDTH);
6339       if (re_comp_buf.fastmap == NULL)
6340         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6341         return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6342     }
6343
6344   /* Since `re_exec' always passes NULL for the `regs' argument, we
6345      don't need to initialize the pattern buffer fields which affect it.  */
6346
6347   ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6348
6349   if (!ret)
6350     return NULL;
6351
6352   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6353   return (char *) gettext (re_error_msgid[(int) ret]);
6354 }
6355
6356
6357 int
6358 # ifdef _LIBC
6359 weak_function
6360 # endif
6361 re_exec (const char *s)
6362 {
6363   const size_t len = strlen (s);
6364   return re_search (&re_comp_buf, s, len, 0, len, 0) >= 0;
6365 }
6366 #endif /* _REGEX_RE_COMP */
6367 \f
6368 /* POSIX.2 functions.  Don't define these for Emacs.  */
6369
6370 #ifndef emacs
6371
6372 /* regcomp takes a regular expression as a string and compiles it.
6373
6374    PREG is a regex_t *.  We do not expect any fields to be initialized,
6375    since POSIX says we shouldn't.  Thus, we set
6376
6377      `buffer' to the compiled pattern;
6378      `used' to the length of the compiled pattern;
6379      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6380        REG_EXTENDED bit in CFLAGS is set; otherwise, to
6381        RE_SYNTAX_POSIX_BASIC;
6382      `fastmap' to an allocated space for the fastmap;
6383      `fastmap_accurate' to zero;
6384      `re_nsub' to the number of subexpressions in PATTERN.
6385
6386    PATTERN is the address of the pattern string.
6387
6388    CFLAGS is a series of bits which affect compilation.
6389
6390      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6391      use POSIX basic syntax.
6392
6393      If REG_NEWLINE is set, then . and [^...] don't match newline.
6394      Also, regexec will try a match beginning after every newline.
6395
6396      If REG_ICASE is set, then we considers upper- and lowercase
6397      versions of letters to be equivalent when matching.
6398
6399      If REG_NOSUB is set, then when PREG is passed to regexec, that
6400      routine will report only success or failure, and nothing about the
6401      registers.
6402
6403    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
6404    the return codes and their meanings.)  */
6405
6406 reg_errcode_t
6407 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern,
6408          int cflags)
6409 {
6410   reg_errcode_t ret;
6411   reg_syntax_t syntax
6412     = (cflags & REG_EXTENDED) ?
6413       RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6414
6415   /* regex_compile will allocate the space for the compiled pattern.  */
6416   preg->buffer = 0;
6417   preg->allocated = 0;
6418   preg->used = 0;
6419
6420   /* Try to allocate space for the fastmap.  */
6421   preg->fastmap = malloc (1 << BYTEWIDTH);
6422
6423   if (cflags & REG_ICASE)
6424     {
6425       unsigned i;
6426
6427       preg->translate = malloc (CHAR_SET_SIZE * sizeof *preg->translate);
6428       if (preg->translate == NULL)
6429         return (int) REG_ESPACE;
6430
6431       /* Map uppercase characters to corresponding lowercase ones.  */
6432       for (i = 0; i < CHAR_SET_SIZE; i++)
6433         preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
6434     }
6435   else
6436     preg->translate = NULL;
6437
6438   /* If REG_NEWLINE is set, newlines are treated differently.  */
6439   if (cflags & REG_NEWLINE)
6440     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
6441       syntax &= ~RE_DOT_NEWLINE;
6442       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6443     }
6444   else
6445     syntax |= RE_NO_NEWLINE_ANCHOR;
6446
6447   preg->no_sub = !!(cflags & REG_NOSUB);
6448
6449   /* POSIX says a null character in the pattern terminates it, so we
6450      can use strlen here in compiling the pattern.  */
6451   ret = regex_compile ((re_char *) pattern, strlen (pattern), syntax, preg);
6452
6453   /* POSIX doesn't distinguish between an unmatched open-group and an
6454      unmatched close-group: both are REG_EPAREN.  */
6455   if (ret == REG_ERPAREN)
6456     ret = REG_EPAREN;
6457
6458   if (ret == REG_NOERROR && preg->fastmap)
6459     { /* Compute the fastmap now, since regexec cannot modify the pattern
6460          buffer.  */
6461       re_compile_fastmap (preg);
6462       if (preg->can_be_null)
6463         { /* The fastmap can't be used anyway.  */
6464           free (preg->fastmap);
6465           preg->fastmap = NULL;
6466         }
6467     }
6468   return ret;
6469 }
6470 WEAK_ALIAS (__regcomp, regcomp)
6471
6472
6473 /* regexec searches for a given pattern, specified by PREG, in the
6474    string STRING.
6475
6476    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6477    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
6478    least NMATCH elements, and we set them to the offsets of the
6479    corresponding matched substrings.
6480
6481    EFLAGS specifies `execution flags' which affect matching: if
6482    REG_NOTBOL is set, then ^ does not match at the beginning of the
6483    string; if REG_NOTEOL is set, then $ does not match at the end.
6484
6485    We return 0 if we find a match and REG_NOMATCH if not.  */
6486
6487 reg_errcode_t
6488 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
6489          size_t nmatch, regmatch_t pmatch[_Restrict_arr_], int eflags)
6490 {
6491   regoff_t ret;
6492   struct re_registers regs;
6493   regex_t private_preg;
6494   size_t len = strlen (string);
6495   boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
6496
6497   private_preg = *preg;
6498
6499   private_preg.not_bol = !!(eflags & REG_NOTBOL);
6500   private_preg.not_eol = !!(eflags & REG_NOTEOL);
6501
6502   /* The user has told us exactly how many registers to return
6503      information about, via `nmatch'.  We have to pass that on to the
6504      matching routines.  */
6505   private_preg.regs_allocated = REGS_FIXED;
6506
6507   if (want_reg_info)
6508     {
6509       regs.num_regs = nmatch;
6510       regs.start = TALLOC (nmatch * 2, regoff_t);
6511       if (regs.start == NULL)
6512         return REG_NOMATCH;
6513       regs.end = regs.start + nmatch;
6514     }
6515
6516   /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6517      pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6518      was a little bit longer but still only matching the real part.
6519      This works because the `endline' will check for a '\n' and will find a
6520      '\0', correctly deciding that this is not the end of a line.
6521      But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6522      a convenient '\0' there.  For all we know, the string could be preceded
6523      by '\n' which would throw things off.  */
6524
6525   /* Perform the searching operation.  */
6526   ret = re_search (&private_preg, string, len,
6527                    /* start: */ 0, /* range: */ len,
6528                    want_reg_info ? &regs : 0);
6529
6530   /* Copy the register information to the POSIX structure.  */
6531   if (want_reg_info)
6532     {
6533       if (ret >= 0)
6534         {
6535           unsigned r;
6536
6537           for (r = 0; r < nmatch; r++)
6538             {
6539               pmatch[r].rm_so = regs.start[r];
6540               pmatch[r].rm_eo = regs.end[r];
6541             }
6542         }
6543
6544       /* If we needed the temporary register info, free the space now.  */
6545       free (regs.start);
6546     }
6547
6548   /* We want zero return to mean success, unlike `re_search'.  */
6549   return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
6550 }
6551 WEAK_ALIAS (__regexec, regexec)
6552
6553
6554 /* Returns a message corresponding to an error code, ERR_CODE, returned
6555    from either regcomp or regexec.   We don't use PREG here.
6556
6557    ERR_CODE was previously called ERRCODE, but that name causes an
6558    error with msvc8 compiler.  */
6559
6560 size_t
6561 regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
6562 {
6563   const char *msg;
6564   size_t msg_size;
6565
6566   if (err_code < 0
6567       || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6568     /* Only error codes returned by the rest of the code should be passed
6569        to this routine.  If we are given anything else, or if other regex
6570        code generates an invalid error code, then the program has a bug.
6571        Dump core so we can fix it.  */
6572     abort ();
6573
6574   msg = gettext (re_error_msgid[err_code]);
6575
6576   msg_size = strlen (msg) + 1; /* Includes the null.  */
6577
6578   if (errbuf_size != 0)
6579     {
6580       if (msg_size > errbuf_size)
6581         {
6582           memcpy (errbuf, msg, errbuf_size - 1);
6583           errbuf[errbuf_size - 1] = 0;
6584         }
6585       else
6586         strcpy (errbuf, msg);
6587     }
6588
6589   return msg_size;
6590 }
6591 WEAK_ALIAS (__regerror, regerror)
6592
6593
6594 /* Free dynamically allocated space used by PREG.  */
6595
6596 void
6597 regfree (regex_t *preg)
6598 {
6599   free (preg->buffer);
6600   preg->buffer = NULL;
6601
6602   preg->allocated = 0;
6603   preg->used = 0;
6604
6605   free (preg->fastmap);
6606   preg->fastmap = NULL;
6607   preg->fastmap_accurate = 0;
6608
6609   free (preg->translate);
6610   preg->translate = NULL;
6611 }
6612 WEAK_ALIAS (__regfree, regfree)
6613
6614 #endif /* not emacs  */