lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2017 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include <assert.h>
  28 #include <ctype.h>
  29 #include <stdint.h>
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <limits.h>
  33 #include <string.h>
  34 #include <locale.h>
  35
  36 static bool
  37 streq (char const *a, char const *b)
  38 {
  39   return strcmp (a, b) == 0;
  40 }
  41
  42 static bool
  43 isasciidigit (char c)
  44 {
  45   return '0' <= c && c <= '9';
  46 }
  47
  48 #include "gettext.h"
  49 #define _(str) gettext (str)
  50
  51 #include <wchar.h>
  52
  53 #include "intprops.h"
  54 #include "xalloc.h"
  55 #include "localeinfo.h"
  56
  57 #ifndef FALLTHROUGH
  58 # if __GNUC__ < 7
  59 #  define FALLTHROUGH ((void) 0)
  60 # else
  61 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  62 # endif
  63 #endif
  64
  65 #ifndef MIN
  66 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  67 #endif
  68
  69 /* HPUX defines these as macros in sys/param.h.  */
  70 #ifdef setbit
  71 # undef setbit
  72 #endif
  73 #ifdef clrbit
  74 # undef clrbit
  75 #endif
  76
  77 /* First integer value that is greater than any character code.  */
  78 enum { NOTCHAR = 1 << CHAR_BIT };
  79
  80 /* This represents part of a character class.  It must be unsigned and
  81    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
  82 typedef unsigned long int charclass_word;
  83
  84 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
  85    CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
  86    represents 64 bits' worth of a charclass, where LO and HI are the
  87    low and high-order 32 bits of the 64-bit quantity.  */
  88 #if ULONG_MAX >> 31 >> 31 < 3
  89 enum { CHARCLASS_WORD_BITS = 32 };
  90 # define CHARCLASS_PAIR(lo, hi) lo, hi
  91 #else
  92 enum { CHARCLASS_WORD_BITS = 64 };
  93 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
  94 #endif
  95
  96 /* An initializer for a charclass whose 32-bit words are A through H.  */
  97 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
  98    {{                                                   \
  99       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 100       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 101    }}
 102
 103 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 104 static charclass_word const CHARCLASS_WORD_MASK
 105   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 106
 107 /* Number of words required to hold a bit for every character.  */
 108 enum
 109 {
 110   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 111 };
 112
 113 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 114 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 115
 116 /* Convert a possibly-signed character to an unsigned character.  This is
 117    a bit safer than casting to unsigned char, since it catches some type
 118    errors that the cast doesn't.  */
 119 static unsigned char
 120 to_uchar (char ch)
 121 {
 122   return ch;
 123 }
 124
 125 /* Contexts tell us whether a character is a newline or a word constituent.
 126    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 127    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 128    a particular character class.
 129
 130    A state also stores a context value, which is a bitmask of CTX_* values.
 131    A state's context represents a set of characters that the state's
 132    predecessors must match.  For example, a state whose context does not
 133    include CTX_LETTER will never have transitions where the previous
 134    character is a word constituent.  A state whose context is CTX_ANY
 135    might have transitions from any character.  */
 136
 137 enum
 138   {
 139     CTX_NONE = 1,
 140     CTX_LETTER = 2,
 141     CTX_NEWLINE = 4,
 142     CTX_ANY = 7
 143   };
 144
 145 /* Sometimes characters can only be matched depending on the surrounding
 146    context.  Such context decisions depend on what the previous character
 147    was, and the value of the current (lookahead) character.  Context
 148    dependent constraints are encoded as 9-bit integers.  Each bit that
 149    is set indicates that the constraint succeeds in the corresponding
 150    context.
 151
 152    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 153    bit 3-5  - valid contexts when next character is CTX_LETTER
 154    bit 0-2  - valid contexts when next character is CTX_NONE
 155
 156    succeeds_in_context determines whether a given constraint
 157    succeeds in a particular context.  Prev is a bitmask of possible
 158    context values for the previous character, curr is the (single-bit)
 159    context value for the lookahead character.  */
 160 static int
 161 newline_constraint (int constraint)
 162 {
 163   return (constraint >> 6) & 7;
 164 }
 165 static int
 166 letter_constraint (int constraint)
 167 {
 168   return (constraint >> 3) & 7;
 169 }
 170 static int
 171 other_constraint (int constraint)
 172 {
 173   return constraint & 7;
 174 }
 175
 176 static bool
 177 succeeds_in_context (int constraint, int prev, int curr)
 178 {
 179   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 180               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 181               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 182              & prev);
 183 }
 184
 185 /* The following describe what a constraint depends on.  */
 186 static bool
 187 prev_newline_dependent (int constraint)
 188 {
 189   return ((constraint ^ constraint >> 2) & 0111) != 0;
 190 }
 191 static bool
 192 prev_letter_dependent (int constraint)
 193 {
 194   return ((constraint ^ constraint >> 1) & 0111) != 0;
 195 }
 196
 197 /* Tokens that match the empty string subject to some constraint actually
 198    work by applying that constraint to determine what may follow them,
 199    taking into account what has gone before.  The following values are
 200    the constraints corresponding to the special tokens previously defined.  */
 201 enum
 202   {
 203     NO_CONSTRAINT = 0777,
 204     BEGLINE_CONSTRAINT = 0444,
 205     ENDLINE_CONSTRAINT = 0700,
 206     BEGWORD_CONSTRAINT = 0050,
 207     ENDWORD_CONSTRAINT = 0202,
 208     LIMWORD_CONSTRAINT = 0252,
 209     NOTLIMWORD_CONSTRAINT = 0525
 210   };
 211
 212 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 213    are operators and others are terminal symbols.  Most (but not all) of these
 214    codes are returned by the lexical analyzer.  */
 215
 216 typedef ptrdiff_t token;
 217 static ptrdiff_t const TOKEN_MAX = PTRDIFF_MAX;
 218
 219 /* States are indexed by state_num values.  These are normally
 220    nonnegative but -1 is used as a special value.  */
 221 typedef ptrdiff_t state_num;
 222
 223 /* Predefined token values.  */
 224 enum
 225 {
 226   END = -1,                     /* END is a terminal symbol that matches the
 227                                    end of input; any value of END or less in
 228                                    the parse tree is such a symbol.  Accepting
 229                                    states of the DFA are those that would have
 230                                    a transition on END.  */
 231
 232   /* Ordinary character values are terminal symbols that match themselves.  */
 233
 234   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 235                                    the empty string.  */
 236
 237   BACKREF,                      /* BACKREF is generated by \<digit>
 238                                    or by any other construct that
 239                                    is not completely handled.  If the scanner
 240                                    detects a transition on backref, it returns
 241                                    a kind of "semi-success" indicating that
 242                                    the match will have to be verified with
 243                                    a backtracking matcher.  */
 244
 245   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 246                                    the empty string at the beginning of a
 247                                    line.  */
 248
 249   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 250                                    the empty string at the end of a line.  */
 251
 252   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 253                                    the empty string at the beginning of a
 254                                    word.  */
 255
 256   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 257                                    the empty string at the end of a word.  */
 258
 259   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 260                                    the empty string at the beginning or the
 261                                    end of a word.  */
 262
 263   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 264                                    matches the empty string not at
 265                                    the beginning or end of a word.  */
 266
 267   QMARK,                        /* QMARK is an operator of one argument that
 268                                    matches zero or one occurrences of its
 269                                    argument.  */
 270
 271   STAR,                         /* STAR is an operator of one argument that
 272                                    matches the Kleene closure (zero or more
 273                                    occurrences) of its argument.  */
 274
 275   PLUS,                         /* PLUS is an operator of one argument that
 276                                    matches the positive closure (one or more
 277                                    occurrences) of its argument.  */
 278
 279   REPMN,                        /* REPMN is a lexical token corresponding
 280                                    to the {m,n} construct.  REPMN never
 281                                    appears in the compiled token vector.  */
 282
 283   CAT,                          /* CAT is an operator of two arguments that
 284                                    matches the concatenation of its
 285                                    arguments.  CAT is never returned by the
 286                                    lexical analyzer.  */
 287
 288   OR,                           /* OR is an operator of two arguments that
 289                                    matches either of its arguments.  */
 290
 291   LPAREN,                       /* LPAREN never appears in the parse tree,
 292                                    it is only a lexeme.  */
 293
 294   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 295
 296   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 297                                    a valid multibyte (or single byte) character.
 298                                    It is used only if MB_CUR_MAX > 1.  */
 299
 300   MBCSET,                       /* MBCSET is similar to CSET, but for
 301                                    multibyte characters.  */
 302
 303   WCHAR,                        /* Only returned by lex.  wctok contains
 304                                    the wide character representation.  */
 305
 306   CSET                          /* CSET and (and any value greater) is a
 307                                    terminal symbol that matches any of a
 308                                    class of characters.  */
 309 };
 310
 311
 312 /* States of the recognizer correspond to sets of positions in the parse
 313    tree, together with the constraints under which they may be matched.
 314    So a position is encoded as an index into the parse tree together with
 315    a constraint.  */
 316 typedef struct
 317 {
 318   size_t index;                 /* Index into the parse array.  */
 319   unsigned int constraint;      /* Constraint for matching this position.  */
 320 } position;
 321
 322 /* Sets of positions are stored as arrays.  */
 323 typedef struct
 324 {
 325   position *elems;              /* Elements of this position set.  */
 326   ptrdiff_t nelem;              /* Number of elements in this set.  */
 327   ptrdiff_t alloc;              /* Number of elements allocated in ELEMS.  */
 328 } position_set;
 329
 330 /* Sets of leaves are also stored as arrays.  */
 331 typedef struct
 332 {
 333   size_t *elems;                /* Elements of this position set.  */
 334   size_t nelem;                 /* Number of elements in this set.  */
 335 } leaf_set;
 336
 337 /* A state of the dfa consists of a set of positions, some flags,
 338    and the token value of the lowest-numbered position of the state that
 339    contains an END token.  */
 340 typedef struct
 341 {
 342   size_t hash;                  /* Hash of the positions of this state.  */
 343   position_set elems;           /* Positions this state could match.  */
 344   unsigned char context;        /* Context from previous state.  */
 345   unsigned short constraint;    /* Constraint for this state to accept.  */
 346   token first_end;              /* Token value of the first END in elems.  */
 347   position_set mbps;            /* Positions which can match multibyte
 348                                    characters or the follows, e.g., period.
 349                                    Used only if MB_CUR_MAX > 1.  */
 350   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 351                                    negative if the state does not have
 352                                    ANYCHAR.  */
 353 } dfa_state;
 354
 355 /* Maximum for any transition table count.  This should be at least 3,
 356    for the initial state setup.  */
 357 enum { MAX_TRCOUNT = 1024 };
 358
 359 /* A bracket operator.
 360    e.g., [a-c], [[:alpha:]], etc.  */
 361 struct mb_char_classes
 362 {
 363   ptrdiff_t cset;
 364   bool invert;
 365   wchar_t *chars;               /* Normal characters.  */
 366   ptrdiff_t nchars;
 367   ptrdiff_t nchars_alloc;
 368 };
 369
 370 struct regex_syntax
 371 {
 372   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 373   reg_syntax_t syntax_bits;
 374   bool syntax_bits_set;
 375
 376   /* Flag for case-folding letters into sets.  */
 377   bool case_fold;
 378
 379   /* True if ^ and $ match only the start and end of data, and do not match
 380      end-of-line within data.  */
 381   bool anchor;
 382
 383   /* End-of-line byte in data.  */
 384   unsigned char eolbyte;
 385
 386   /* Cache of char-context values.  */
 387   char sbit[NOTCHAR];
 388
 389   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 390      multibyte character.  */
 391   bool never_trail[NOTCHAR];
 392
 393   /* Set of characters considered letters.  */
 394   charclass letters;
 395
 396   /* Set of characters that are newline.  */
 397   charclass newline;
 398 };
 399
 400 /* Lexical analyzer.  All the dross that deals with the obnoxious
 401    GNU Regex syntax bits is located here.  The poor, suffering
 402    reader is referred to the GNU Regex documentation for the
 403    meaning of the @#%!@#%^!@ syntax bits.  */
 404 struct lexer_state
 405 {
 406   char const *ptr;      /* Pointer to next input character.  */
 407   size_t left;          /* Number of characters remaining.  */
 408   token lasttok;        /* Previous token returned; initially END.  */
 409   size_t parens;        /* Count of outstanding left parens.  */
 410   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 411
 412   /* Wide character representation of the current multibyte character,
 413      or WEOF if there was an encoding error.  Used only if
 414      MB_CUR_MAX > 1.  */
 415   wint_t wctok;
 416
 417   /* Length of the multibyte representation of wctok.  */
 418   int cur_mb_len;
 419
 420   /* The most recently analyzed multibyte bracket expression.  */
 421   struct mb_char_classes brack;
 422
 423   /* We're separated from beginning or (, | only by zero-width characters.  */
 424   bool laststart;
 425 };
 426
 427 /* Recursive descent parser for regular expressions.  */
 428
 429 struct parser_state
 430 {
 431   token tok;               /* Lookahead token.  */
 432   size_t depth;            /* Current depth of a hypothetical stack
 433                               holding deferred productions.  This is
 434                               used to determine the depth that will be
 435                               required of the real stack later on in
 436                               dfaanalyze.  */
 437 };
 438
 439 /* A compiled regular expression.  */
 440 struct dfa
 441 {
 442   /* Syntax configuration */
 443   struct regex_syntax syntax;
 444
 445   /* Fields filled by the scanner.  */
 446   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 447   ptrdiff_t cindex;             /* Index for adding new charclasses.  */
 448   ptrdiff_t calloc;             /* Number of charclasses allocated.  */
 449   size_t canychar;              /* Index of anychar class, or (size_t) -1.  */
 450
 451   /* Scanner state */
 452   struct lexer_state lex;
 453
 454   /* Parser state */
 455   struct parser_state parse;
 456
 457   /* Fields filled by the parser.  */
 458   token *tokens;                /* Postfix parse array.  */
 459   size_t tindex;                /* Index for adding new tokens.  */
 460   size_t talloc;                /* Number of tokens currently allocated.  */
 461   size_t depth;                 /* Depth required of an evaluation stack
 462                                    used for depth-first traversal of the
 463                                    parse tree.  */
 464   size_t nleaves;               /* Number of leaves on the parse tree.  */
 465   size_t nregexps;              /* Count of parallel regexps being built
 466                                    with dfaparse.  */
 467   bool fast;                    /* The DFA is fast.  */
 468   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
 469   mbstate_t mbs;                /* Multibyte conversion state.  */
 470
 471   /* The following are valid only if MB_CUR_MAX > 1.  */
 472
 473   /* The value of multibyte_prop[i] is defined by following rule.
 474      if tokens[i] < NOTCHAR
 475      bit 0 : tokens[i] is the first byte of a character, including
 476      single-byte characters.
 477      bit 1 : tokens[i] is the last byte of a character, including
 478      single-byte characters.
 479
 480      e.g.
 481      tokens
 482      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 483      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 484      multibyte_prop
 485      = 3     , 1               ,  0              ,  2              , 3
 486    */
 487   char *multibyte_prop;
 488
 489   /* Fields filled by the superset.  */
 490   struct dfa *superset;             /* Hint of the dfa.  */
 491
 492   /* Fields filled by the state builder.  */
 493   dfa_state *states;            /* States of the dfa.  */
 494   state_num sindex;             /* Index for adding new states.  */
 495   ptrdiff_t salloc;             /* Number of states currently allocated.  */
 496
 497   /* Fields filled by the parse tree->NFA conversion.  */
 498   position_set *follows;        /* Array of follow sets, indexed by position
 499                                    index.  The follow of a position is the set
 500                                    of positions containing characters that
 501                                    could conceivably follow a character
 502                                    matching the given position in a string
 503                                    matching the regexp.  Allocated to the
 504                                    maximum possible position index.  */
 505   bool searchflag;              /* We are supposed to build a searching
 506                                    as opposed to an exact matcher.  A searching
 507                                    matcher finds the first and shortest string
 508                                    matching a regexp anywhere in the buffer,
 509                                    whereas an exact matcher finds the longest
 510                                    string matching, but anchored to the
 511                                    beginning of the buffer.  */
 512
 513   /* Fields filled by dfaexec.  */
 514   state_num tralloc;            /* Number of transition tables that have
 515                                    slots so far, not counting trans[-1] and
 516                                    trans[-2].  */
 517   int trcount;                  /* Number of transition tables that have
 518                                    been built, other than for initial
 519                                    states.  */
 520   int min_trcount;              /* Number of initial states.  Equivalently,
 521                                    the minimum state number for which trcount
 522                                    counts transitions.  */
 523   state_num **trans;            /* Transition tables for states that can
 524                                    never accept.  If the transitions for a
 525                                    state have not yet been computed, or the
 526                                    state could possibly accept, its entry in
 527                                    this table is NULL.  This points to two
 528                                    past the start of the allocated array,
 529                                    and trans[-1] and trans[-2] are always
 530                                    NULL.  */
 531   state_num **fails;            /* Transition tables after failing to accept
 532                                    on a state that potentially could do so.
 533                                    If trans[i] is non-null, fails[i] must
 534                                    be null.  */
 535   char *success;                /* Table of acceptance conditions used in
 536                                    dfaexec and computed in build_state.  */
 537   state_num *newlines;          /* Transitions on newlines.  The entry for a
 538                                    newline in any transition table is always
 539                                    -1 so we can count lines without wasting
 540                                    too many cycles.  The transition for a
 541                                    newline is stored separately and handled
 542                                    as a special case.  Newline is also used
 543                                    as a sentinel at the end of the buffer.  */
 544   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 545                                    context in multibyte locales, in which we
 546                                    do not distinguish between their contexts,
 547                                    as not supported word.  */
 548   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 549   state_num **mb_trans;         /* Transition tables for states with
 550                                    ANYCHAR.  */
 551   state_num mb_trcount;         /* Number of transition tables for states with
 552                                    ANYCHAR that have actually been built.  */
 553
 554   /* Information derived from the locale.  This is at the end so that
 555      a quick memset need not clear it specially.  */
 556
 557   /* dfaexec implementation.  */
 558   char *(*dfaexec) (struct dfa *, char const *, char *,
 559                     bool, size_t *, bool *);
 560
 561   /* The locale is simple, like the C locale.  These locales can be
 562      processed more efficiently, as they are single-byte, their native
 563      character set is in collating-sequence order, and they do not
 564      have multi-character collating elements.  */
 565   bool simple_locale;
 566
 567   /* Other cached information derived from the locale.  */
 568   struct localeinfo localeinfo;
 569 };
 570
 571 /* User access to dfa internals.  */
 572
 573 /* S could possibly be an accepting state of R.  */
 574 static bool
 575 accepting (state_num s, struct dfa const *r)
 576 {
 577   return r->states[s].constraint != 0;
 578 }
 579
 580 /* STATE accepts in the specified context.  */
 581 static bool
 582 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 583 {
 584   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 585 }
 586
 587 static void regexp (struct dfa *dfa);
 588
 589 /* Store into *PWC the result of converting the leading bytes of the
 590    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 591    and updating the conversion state in *D.  On conversion error,
 592    convert just a single byte, to WEOF.  Return the number of bytes
 593    converted.
 594
 595    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 596
 597    * PWC points to wint_t, not to wchar_t.
 598    * The last arg is a dfa *D instead of merely a multibyte conversion
 599      state D->mbs.
 600    * N must be at least 1.
 601    * S[N - 1] must be a sentinel byte.
 602    * Shift encodings are not supported.
 603    * The return value is always in the range 1..N.
 604    * D->mbs is always valid afterwards.
 605    * *PWC is always set to something.  */
 606 static size_t
 607 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 608 {
 609   unsigned char uc = s[0];
 610   wint_t wc = d->localeinfo.sbctowc[uc];
 611
 612   if (wc == WEOF)
 613     {
 614       wchar_t wch;
 615       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 616       if (0 < nbytes && nbytes < (size_t) -2)
 617         {
 618           *pwc = wch;
 619           return nbytes;
 620         }
 621       memset (&d->mbs, 0, sizeof d->mbs);
 622     }
 623
 624   *pwc = wc;
 625   return 1;
 626 }
 627
 628 #ifdef DEBUG
 629
 630 static void
 631 prtok (token t)
 632 {
 633   if (t < 0)
 634     fprintf (stderr, "END");
 635   else if (t < NOTCHAR)
 636     {
 637       unsigned int ch = t;
 638       fprintf (stderr, "0x%02x", ch);
 639     }
 640   else
 641     {
 642       char const *s;
 643       switch (t)
 644         {
 645         case EMPTY:
 646           s = "EMPTY";
 647           break;
 648         case BACKREF:
 649           s = "BACKREF";
 650           break;
 651         case BEGLINE:
 652           s = "BEGLINE";
 653           break;
 654         case ENDLINE:
 655           s = "ENDLINE";
 656           break;
 657         case BEGWORD:
 658           s = "BEGWORD";
 659           break;
 660         case ENDWORD:
 661           s = "ENDWORD";
 662           break;
 663         case LIMWORD:
 664           s = "LIMWORD";
 665           break;
 666         case NOTLIMWORD:
 667           s = "NOTLIMWORD";
 668           break;
 669         case QMARK:
 670           s = "QMARK";
 671           break;
 672         case STAR:
 673           s = "STAR";
 674           break;
 675         case PLUS:
 676           s = "PLUS";
 677           break;
 678         case CAT:
 679           s = "CAT";
 680           break;
 681         case OR:
 682           s = "OR";
 683           break;
 684         case LPAREN:
 685           s = "LPAREN";
 686           break;
 687         case RPAREN:
 688           s = "RPAREN";
 689           break;
 690         case ANYCHAR:
 691           s = "ANYCHAR";
 692           break;
 693         case MBCSET:
 694           s = "MBCSET";
 695           break;
 696         default:
 697           s = "CSET";
 698           break;
 699         }
 700       fprintf (stderr, "%s", s);
 701     }
 702 }
 703 #endif /* DEBUG */
 704
 705 /* Stuff pertaining to charclasses.  */
 706
 707 static bool
 708 tstbit (unsigned int b, charclass const *c)
 709 {
 710   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 711 }
 712
 713 static void
 714 setbit (unsigned int b, charclass *c)
 715 {
 716   charclass_word one = 1;
 717   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 718 }
 719
 720 static void
 721 clrbit (unsigned int b, charclass *c)
 722 {
 723   charclass_word one = 1;
 724   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 725 }
 726
 727 static void
 728 zeroset (charclass *s)
 729 {
 730   memset (s, 0, sizeof *s);
 731 }
 732
 733 static void
 734 fillset (charclass *s)
 735 {
 736   for (int i = 0; i < CHARCLASS_WORDS; i++)
 737     s->w[i] = CHARCLASS_WORD_MASK;
 738 }
 739
 740 static void
 741 notset (charclass *s)
 742 {
 743   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 744     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 745 }
 746
 747 static bool
 748 equal (charclass const *s1, charclass const *s2)
 749 {
 750   charclass_word w = 0;
 751   for (int i = 0; i < CHARCLASS_WORDS; i++)
 752     w |= s1->w[i] ^ s2->w[i];
 753   return w == 0;
 754 }
 755
 756 static bool
 757 emptyset (charclass const *s)
 758 {
 759   charclass_word w = 0;
 760   for (int i = 0; i < CHARCLASS_WORDS; i++)
 761     w |= s->w[i];
 762   return w == 0;
 763 }
 764
 765 /* Grow PA, which points to an array of *NITEMS items, and return the
 766    location of the reallocated array, updating *NITEMS to reflect its
 767    new size.  The new array will contain at least NITEMS_INCR_MIN more
 768    items, but will not contain more than NITEMS_MAX items total.
 769    ITEM_SIZE is the size of each item, in bytes.
 770
 771    ITEM_SIZE and NITEMS_INCR_MIN must be positive.  *NITEMS must be
 772    nonnegative.  If NITEMS_MAX is -1, it is treated as if it were
 773    infinity.
 774
 775    If PA is null, then allocate a new array instead of reallocating
 776    the old one.
 777
 778    Thus, to grow an array A without saving its old contents, do
 779    { free (A); A = xpalloc (NULL, &AITEMS, ...); }.  */
 780
 781 static void *
 782 xpalloc (void *pa, ptrdiff_t *nitems, ptrdiff_t nitems_incr_min,
 783          ptrdiff_t nitems_max, ptrdiff_t item_size)
 784 {
 785   ptrdiff_t n0 = *nitems;
 786
 787   /* The approximate size to use for initial small allocation
 788      requests.  This is the largest "small" request for the GNU C
 789      library malloc.  */
 790   enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
 791
 792   /* If the array is tiny, grow it to about (but no greater than)
 793      DEFAULT_MXFAST bytes.  Otherwise, grow it by about 50%.
 794      Adjust the growth according to three constraints: NITEMS_INCR_MIN,
 795      NITEMS_MAX, and what the C language can represent safely.  */
 796
 797   ptrdiff_t n, nbytes;
 798   if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
 799     n = PTRDIFF_MAX;
 800   if (0 <= nitems_max && nitems_max < n)
 801     n = nitems_max;
 802
 803   ptrdiff_t adjusted_nbytes
 804     = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
 805        ? MIN (PTRDIFF_MAX, SIZE_MAX)
 806        : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
 807   if (adjusted_nbytes)
 808     {
 809       n = adjusted_nbytes / item_size;
 810       nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
 811     }
 812
 813   if (! pa)
 814     *nitems = 0;
 815   if (n - n0 < nitems_incr_min
 816       && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
 817           || (0 <= nitems_max && nitems_max < n)
 818           || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
 819     xalloc_die ();
 820   pa = xrealloc (pa, nbytes);
 821   *nitems = n;
 822   return pa;
 823 }
 824
 825 /* Ensure that the array addressed by PA holds at least I + 1 items.
 826    Either return PA, or reallocate the array and return its new address.
 827    Although PA may be null, the returned value is never null.
 828
 829    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 830    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 831    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 832    ITEM_SIZE is the size of one item; it must be positive.
 833    Avoid O(N**2) behavior on arrays growing linearly.  */
 834 static void *
 835 maybe_realloc (void *pa, ptrdiff_t i, ptrdiff_t *nitems,
 836                ptrdiff_t nitems_max, ptrdiff_t item_size)
 837 {
 838   if (i < *nitems)
 839     return pa;
 840   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 841 }
 842
 843 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 844 static ptrdiff_t
 845 charclass_index (struct dfa *d, charclass *s)
 846 {
 847   ptrdiff_t i;
 848
 849   for (i = 0; i < d->cindex; ++i)
 850     if (equal (s, &d->charclasses[i]))
 851       return i;
 852   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 853                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 854   ++d->cindex;
 855   d->charclasses[i] = *s;
 856   return i;
 857 }
 858
 859 static bool
 860 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 861 {
 862   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 863 }
 864
 865 static int
 866 char_context (struct dfa const *dfa, unsigned char c)
 867 {
 868   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 869     return CTX_NEWLINE;
 870   if (unibyte_word_constituent (dfa, c))
 871     return CTX_LETTER;
 872   return CTX_NONE;
 873 }
 874
 875 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 876    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 877    this may happen when folding case in weird Turkish locales where
 878    dotless i/dotted I are not included in the chosen character set.
 879    Return whether a bit was set in the charclass.  */
 880 static bool
 881 setbit_wc (wint_t wc, charclass *c)
 882 {
 883   int b = wctob (wc);
 884   if (b < 0)
 885     return false;
 886
 887   setbit (b, c);
 888   return true;
 889 }
 890
 891 /* Set a bit for B and its case variants in the charclass C.
 892    MB_CUR_MAX must be 1.  */
 893 static void
 894 setbit_case_fold_c (int b, charclass *c)
 895 {
 896   int ub = toupper (b);
 897   for (int i = 0; i < NOTCHAR; i++)
 898     if (toupper (i) == ub)
 899       setbit (i, c);
 900 }
 901
 902 /* Return true if the locale compatible with the C locale.  */
 903
 904 static bool
 905 using_simple_locale (bool multibyte)
 906 {
 907   /* The native character set is known to be compatible with
 908      the C locale.  The following test isn't perfect, but it's good
 909      enough in practice, as only ASCII and EBCDIC are in common use
 910      and this test correctly accepts ASCII and rejects EBCDIC.  */
 911   enum { native_c_charset =
 912     ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
 913      && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
 914      && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
 915      && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
 916      && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
 917      && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
 918      && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
 919      && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
 920      && '}' == 125 && '~' == 126)
 921   };
 922
 923   if (!native_c_charset || multibyte)
 924     return false;
 925   else
 926     {
 927       /* Treat C and POSIX locales as being compatible.  Also, treat
 928          errors as compatible, as these are invariably from stubs.  */
 929       char const *loc = setlocale (LC_ALL, NULL);
 930       return !loc || streq (loc, "C") || streq (loc, "POSIX");
 931     }
 932 }
 933
 934 /* Fetch the next lexical input character from the pattern.  There
 935    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 936    value of the character or to WEOF depending on whether the input is
 937    a valid multibyte character (possibly of length 1).  Then return
 938    the next input byte value, except return EOF if the input is a
 939    multibyte character of length greater than 1.  */
 940 static int
 941 fetch_wc (struct dfa *dfa)
 942 {
 943   size_t nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 944                                 dfa);
 945   dfa->lex.cur_mb_len = nbytes;
 946   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 947   dfa->lex.ptr += nbytes;
 948   dfa->lex.left -= nbytes;
 949   return c;
 950 }
 951
 952 /* If there is no more input, report an error about unbalanced brackets.
 953    Otherwise, behave as with fetch_wc (DFA).  */
 954 static int
 955 bracket_fetch_wc (struct dfa *dfa)
 956 {
 957   if (! dfa->lex.left)
 958     dfaerror (_("unbalanced ["));
 959   return fetch_wc (dfa);
 960 }
 961
 962 typedef int predicate (int);
 963
 964 /* The following list maps the names of the Posix named character classes
 965    to predicate functions that determine whether a given character is in
 966    the class.  The leading [ has already been eaten by the lexical
 967    analyzer.  */
 968 struct dfa_ctype
 969 {
 970   const char *name;
 971   predicate *func;
 972   bool single_byte_only;
 973 };
 974
 975 static const struct dfa_ctype prednames[] = {
 976   {"alpha", isalpha, false},
 977   {"upper", isupper, false},
 978   {"lower", islower, false},
 979   {"digit", isdigit, true},
 980   {"xdigit", isxdigit, false},
 981   {"space", isspace, false},
 982   {"punct", ispunct, false},
 983   {"alnum", isalnum, false},
 984   {"print", isprint, false},
 985   {"graph", isgraph, false},
 986   {"cntrl", iscntrl, false},
 987   {"blank", isblank, false},
 988   {NULL, NULL, false}
 989 };
 990
 991 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 992 find_pred (const char *str)
 993 {
 994   for (unsigned int i = 0; prednames[i].name; ++i)
 995     if (streq (str, prednames[i].name))
 996       return &prednames[i];
 997   return NULL;
 998 }
 999
1000 /* Parse a bracket expression, which possibly includes multibyte
1001    characters.  */
1002 static token
1003 parse_bracket_exp (struct dfa *dfa)
1004 {
1005   /* This is a bracket expression that dfaexec is known to
1006      process correctly.  */
1007   bool known_bracket_exp = true;
1008
1009   /* Used to warn about [:space:].
1010      Bit 0 = first character is a colon.
1011      Bit 1 = last character is a colon.
1012      Bit 2 = includes any other character but a colon.
1013      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
1014   int colon_warning_state;
1015
1016   dfa->lex.brack.nchars = 0;
1017   charclass ccl;
1018   zeroset (&ccl);
1019   int c = bracket_fetch_wc (dfa);
1020   bool invert = c == '^';
1021   if (invert)
1022     {
1023       c = bracket_fetch_wc (dfa);
1024       known_bracket_exp = dfa->simple_locale;
1025     }
1026   wint_t wc = dfa->lex.wctok;
1027   int c1;
1028   wint_t wc1;
1029   colon_warning_state = (c == ':');
1030   do
1031     {
1032       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
1033       colon_warning_state &= ~2;
1034
1035       /* Note that if we're looking at some other [:...:] construct,
1036          we just treat it as a bunch of ordinary characters.  We can do
1037          this because we assume regex has checked for syntax errors before
1038          dfa is ever called.  */
1039       if (c == '[')
1040         {
1041           c1 = bracket_fetch_wc (dfa);
1042           wc1 = dfa->lex.wctok;
1043
1044           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1045               || c1 == '.' || c1 == '=')
1046             {
1047               enum { MAX_BRACKET_STRING_LEN = 32 };
1048               char str[MAX_BRACKET_STRING_LEN + 1];
1049               size_t len = 0;
1050               for (;;)
1051                 {
1052                   c = bracket_fetch_wc (dfa);
1053                   if (dfa->lex.left == 0
1054                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1055                     break;
1056                   if (len < MAX_BRACKET_STRING_LEN)
1057                     str[len++] = c;
1058                   else
1059                     /* This is in any case an invalid class name.  */
1060                     str[0] = '\0';
1061                 }
1062               str[len] = '\0';
1063
1064               /* Fetch bracket.  */
1065               c = bracket_fetch_wc (dfa);
1066               wc = dfa->lex.wctok;
1067               if (c1 == ':')
1068                 /* Build character class.  POSIX allows character
1069                    classes to match multicharacter collating elements,
1070                    but the regex code does not support that, so do not
1071                    worry about that possibility.  */
1072                 {
1073                   char const *class
1074                     = (dfa->syntax.case_fold && (streq (str, "upper")
1075                                                  || streq (str, "lower"))
1076                        ? "alpha" : str);
1077                   const struct dfa_ctype *pred = find_pred (class);
1078                   if (!pred)
1079                     dfaerror (_("invalid character class"));
1080
1081                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1082                     known_bracket_exp = false;
1083                   else
1084                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1085                       if (pred->func (c2))
1086                         setbit (c2, &ccl);
1087                 }
1088               else
1089                 known_bracket_exp = false;
1090
1091               colon_warning_state |= 8;
1092
1093               /* Fetch new lookahead character.  */
1094               c1 = bracket_fetch_wc (dfa);
1095               wc1 = dfa->lex.wctok;
1096               continue;
1097             }
1098
1099           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1100              are already set up.  */
1101         }
1102
1103       if (c == '\\'
1104           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1105         {
1106           c = bracket_fetch_wc (dfa);
1107           wc = dfa->lex.wctok;
1108         }
1109
1110       if (c1 == NOTCHAR)
1111         {
1112           c1 = bracket_fetch_wc (dfa);
1113           wc1 = dfa->lex.wctok;
1114         }
1115
1116       if (c1 == '-')
1117         /* build range characters.  */
1118         {
1119           int c2 = bracket_fetch_wc (dfa);
1120           wint_t wc2 = dfa->lex.wctok;
1121
1122           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1123              Treat it like [-a[.aa.]] while parsing it, and
1124              remember that the set is unknown.  */
1125           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1126             {
1127               known_bracket_exp = false;
1128               c2 = ']';
1129             }
1130
1131           if (c2 == ']')
1132             {
1133               /* In the case [x-], the - is an ordinary hyphen,
1134                  which is left in c1, the lookahead character.  */
1135               dfa->lex.ptr -= dfa->lex.cur_mb_len;
1136               dfa->lex.left += dfa->lex.cur_mb_len;
1137             }
1138           else
1139             {
1140               if (c2 == '\\' && (dfa->syntax.syntax_bits
1141                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1142                 {
1143                   c2 = bracket_fetch_wc (dfa);
1144                   wc2 = dfa->lex.wctok;
1145                 }
1146
1147               colon_warning_state |= 8;
1148               c1 = bracket_fetch_wc (dfa);
1149               wc1 = dfa->lex.wctok;
1150
1151               /* Treat [x-y] as a range if x != y.  */
1152               if (wc != wc2 || wc == WEOF)
1153                 {
1154                   if (dfa->simple_locale
1155                       || (isasciidigit (c) & isasciidigit (c2)))
1156                     {
1157                       for (int ci = c; ci <= c2; ci++)
1158                         if (dfa->syntax.case_fold && isalpha (ci))
1159                           setbit_case_fold_c (ci, &ccl);
1160                         else
1161                           setbit (ci, &ccl);
1162                     }
1163                   else
1164                     known_bracket_exp = false;
1165
1166                   continue;
1167                 }
1168             }
1169         }
1170
1171       colon_warning_state |= (c == ':') ? 2 : 4;
1172
1173       if (!dfa->localeinfo.multibyte)
1174         {
1175           if (dfa->syntax.case_fold && isalpha (c))
1176             setbit_case_fold_c (c, &ccl);
1177           else
1178             setbit (c, &ccl);
1179           continue;
1180         }
1181
1182       if (wc == WEOF)
1183         known_bracket_exp = false;
1184       else
1185         {
1186           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1187           unsigned int n = (dfa->syntax.case_fold
1188                             ? case_folded_counterparts (wc, folded + 1) + 1
1189                             : 1);
1190           folded[0] = wc;
1191           for (unsigned int i = 0; i < n; i++)
1192             if (!setbit_wc (folded[i], &ccl))
1193               {
1194                 dfa->lex.brack.chars
1195                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1196                                    &dfa->lex.brack.nchars_alloc, -1,
1197                                    sizeof *dfa->lex.brack.chars);
1198                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1199               }
1200         }
1201     }
1202   while ((wc = wc1, (c = c1) != ']'));
1203
1204   if (colon_warning_state == 7)
1205     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1206
1207   if (! known_bracket_exp)
1208     return BACKREF;
1209
1210   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1211     {
1212       dfa->lex.brack.invert = invert;
1213       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1214       return MBCSET;
1215     }
1216
1217   if (invert)
1218     {
1219       notset (&ccl);
1220       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1221         clrbit ('\n', &ccl);
1222     }
1223
1224   return CSET + charclass_index (dfa, &ccl);
1225 }
1226
1227 struct lexptr
1228 {
1229   char const *ptr;
1230   size_t left;
1231 };
1232
1233 static void
1234 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1235 {
1236   ls->ptr = dfa->lex.ptr;
1237   ls->left = dfa->lex.left;
1238   dfa->lex.ptr = s;
1239   dfa->lex.left = strlen (s);
1240 }
1241
1242 static void
1243 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1244 {
1245   dfa->lex.ptr = ls->ptr;
1246   dfa->lex.left = ls->left;
1247 }
1248
1249 static token
1250 lex (struct dfa *dfa)
1251 {
1252   bool backslash = false;
1253
1254   /* Basic plan: We fetch a character.  If it's a backslash,
1255      we set the backslash flag and go through the loop again.
1256      On the plus side, this avoids having a duplicate of the
1257      main switch inside the backslash case.  On the minus side,
1258      it means that just about every case begins with
1259      "if (backslash) ...".  */
1260   for (int i = 0; i < 2; ++i)
1261     {
1262       if (! dfa->lex.left)
1263         return dfa->lex.lasttok = END;
1264       int c = fetch_wc (dfa);
1265
1266       switch (c)
1267         {
1268         case '\\':
1269           if (backslash)
1270             goto normal_char;
1271           if (dfa->lex.left == 0)
1272             dfaerror (_("unfinished \\ escape"));
1273           backslash = true;
1274           break;
1275
1276         case '^':
1277           if (backslash)
1278             goto normal_char;
1279           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1280               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1281               || dfa->lex.lasttok == OR)
1282             return dfa->lex.lasttok = BEGLINE;
1283           goto normal_char;
1284
1285         case '$':
1286           if (backslash)
1287             goto normal_char;
1288           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1289               || dfa->lex.left == 0
1290               || ((dfa->lex.left
1291                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1292                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1293                                    & (dfa->lex.ptr[0] == '\\')]
1294                       == ')'))
1295               || ((dfa->lex.left
1296                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1297                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1298                                    & (dfa->lex.ptr[0] == '\\')]
1299                       == '|'))
1300               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1301                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1302             return dfa->lex.lasttok = ENDLINE;
1303           goto normal_char;
1304
1305         case '1':
1306         case '2':
1307         case '3':
1308         case '4':
1309         case '5':
1310         case '6':
1311         case '7':
1312         case '8':
1313         case '9':
1314           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1315             {
1316               dfa->lex.laststart = false;
1317               return dfa->lex.lasttok = BACKREF;
1318             }
1319           goto normal_char;
1320
1321         case '`':
1322           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1323             {
1324               /* FIXME: should be beginning of string */
1325               return dfa->lex.lasttok = BEGLINE;
1326             }
1327           goto normal_char;
1328
1329         case '\'':
1330           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1331             {
1332               /* FIXME: should be end of string */
1333               return dfa->lex.lasttok = ENDLINE;
1334             }
1335           goto normal_char;
1336
1337         case '<':
1338           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1339             return dfa->lex.lasttok = BEGWORD;
1340           goto normal_char;
1341
1342         case '>':
1343           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1344             return dfa->lex.lasttok = ENDWORD;
1345           goto normal_char;
1346
1347         case 'b':
1348           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1349             return dfa->lex.lasttok = LIMWORD;
1350           goto normal_char;
1351
1352         case 'B':
1353           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1354             return dfa->lex.lasttok = NOTLIMWORD;
1355           goto normal_char;
1356
1357         case '?':
1358           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1359             goto normal_char;
1360           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1361             goto normal_char;
1362           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1363               && dfa->lex.laststart)
1364             goto normal_char;
1365           return dfa->lex.lasttok = QMARK;
1366
1367         case '*':
1368           if (backslash)
1369             goto normal_char;
1370           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1371               && dfa->lex.laststart)
1372             goto normal_char;
1373           return dfa->lex.lasttok = STAR;
1374
1375         case '+':
1376           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1377             goto normal_char;
1378           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1379             goto normal_char;
1380           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1381               && dfa->lex.laststart)
1382             goto normal_char;
1383           return dfa->lex.lasttok = PLUS;
1384
1385         case '{':
1386           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1387             goto normal_char;
1388           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1389             goto normal_char;
1390           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1391               && dfa->lex.laststart)
1392             goto normal_char;
1393
1394           /* Cases:
1395              {M} - exact count
1396              {M,} - minimum count, maximum is infinity
1397              {,N} - 0 through N
1398              {,} - 0 to infinity (same as '*')
1399              {M,N} - M through N */
1400           {
1401             char const *p = dfa->lex.ptr;
1402             char const *lim = p + dfa->lex.left;
1403             dfa->lex.minrep = dfa->lex.maxrep = -1;
1404             for (; p != lim && isasciidigit (*p); p++)
1405               dfa->lex.minrep = (dfa->lex.minrep < 0
1406                                  ? *p - '0'
1407                                  : MIN (RE_DUP_MAX + 1,
1408                                         dfa->lex.minrep * 10 + *p - '0'));
1409             if (p != lim)
1410               {
1411                 if (*p != ',')
1412                   dfa->lex.maxrep = dfa->lex.minrep;
1413                 else
1414                   {
1415                     if (dfa->lex.minrep < 0)
1416                       dfa->lex.minrep = 0;
1417                     while (++p != lim && isasciidigit (*p))
1418                       dfa->lex.maxrep
1419                         = (dfa->lex.maxrep < 0
1420                            ? *p - '0'
1421                            : MIN (RE_DUP_MAX + 1,
1422                                   dfa->lex.maxrep * 10 + *p - '0'));
1423                   }
1424               }
1425             if (! ((! backslash || (p != lim && *p++ == '\\'))
1426                    && p != lim && *p++ == '}'
1427                    && 0 <= dfa->lex.minrep
1428                    && (dfa->lex.maxrep < 0
1429                        || dfa->lex.minrep <= dfa->lex.maxrep)))
1430               {
1431                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1432                   goto normal_char;
1433                 dfaerror (_("invalid content of \\{\\}"));
1434               }
1435             if (RE_DUP_MAX < dfa->lex.maxrep)
1436               dfaerror (_("regular expression too big"));
1437             dfa->lex.ptr = p;
1438             dfa->lex.left = lim - p;
1439           }
1440           dfa->lex.laststart = false;
1441           return dfa->lex.lasttok = REPMN;
1442
1443         case '|':
1444           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1445             goto normal_char;
1446           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1447             goto normal_char;
1448           dfa->lex.laststart = true;
1449           return dfa->lex.lasttok = OR;
1450
1451         case '\n':
1452           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1453               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1454             goto normal_char;
1455           dfa->lex.laststart = true;
1456           return dfa->lex.lasttok = OR;
1457
1458         case '(':
1459           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1460             goto normal_char;
1461           dfa->lex.parens++;
1462           dfa->lex.laststart = true;
1463           return dfa->lex.lasttok = LPAREN;
1464
1465         case ')':
1466           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1467             goto normal_char;
1468           if (dfa->lex.parens == 0
1469               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1470             goto normal_char;
1471           dfa->lex.parens--;
1472           dfa->lex.laststart = false;
1473           return dfa->lex.lasttok = RPAREN;
1474
1475         case '.':
1476           if (backslash)
1477             goto normal_char;
1478           if (dfa->canychar == (size_t) -1)
1479             {
1480               charclass ccl;
1481               fillset (&ccl);
1482               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1483                 clrbit ('\n', &ccl);
1484               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1485                 clrbit ('\0', &ccl);
1486               if (dfa->localeinfo.multibyte)
1487                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1488                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1489                     clrbit (c2, &ccl);
1490               dfa->canychar = charclass_index (dfa, &ccl);
1491             }
1492           dfa->lex.laststart = false;
1493           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1494                                      ? ANYCHAR
1495                                      : CSET + dfa->canychar);
1496
1497         case 's':
1498         case 'S':
1499           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1500             goto normal_char;
1501           if (!dfa->localeinfo.multibyte)
1502             {
1503               charclass ccl;
1504               zeroset (&ccl);
1505               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1506                 if (isspace (c2))
1507                   setbit (c2, &ccl);
1508               if (c == 'S')
1509                 notset (&ccl);
1510               dfa->lex.laststart = false;
1511               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1512             }
1513
1514           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1515              add_utf8_anychar, makes sense.  */
1516
1517           /* \s and \S are documented to be equivalent to [[:space:]] and
1518              [^[:space:]] respectively, so tell the lexer to process those
1519              strings, each minus its "already processed" '['.  */
1520           {
1521             struct lexptr ls;
1522             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1523             dfa->lex.lasttok = parse_bracket_exp (dfa);
1524             pop_lex_state (dfa, &ls);
1525           }
1526
1527           dfa->lex.laststart = false;
1528           return dfa->lex.lasttok;
1529
1530         case 'w':
1531         case 'W':
1532           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1533             goto normal_char;
1534
1535           if (!dfa->localeinfo.multibyte)
1536             {
1537               charclass ccl;
1538               zeroset (&ccl);
1539               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1540                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1541                   setbit (c2, &ccl);
1542               if (c == 'W')
1543                 notset (&ccl);
1544               dfa->lex.laststart = false;
1545               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1546             }
1547
1548           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1549              add_utf8_anychar, makes sense.  */
1550
1551           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1552              [^_[:alnum:]] respectively, so tell the lexer to process those
1553              strings, each minus its "already processed" '['.  */
1554           {
1555             struct lexptr ls;
1556             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1557             dfa->lex.lasttok = parse_bracket_exp (dfa);
1558             pop_lex_state (dfa, &ls);
1559           }
1560
1561           dfa->lex.laststart = false;
1562           return dfa->lex.lasttok;
1563
1564         case '[':
1565           if (backslash)
1566             goto normal_char;
1567           dfa->lex.laststart = false;
1568           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1569
1570         default:
1571         normal_char:
1572           dfa->lex.laststart = false;
1573           /* For multibyte character sets, folding is done in atom.  Always
1574              return WCHAR.  */
1575           if (dfa->localeinfo.multibyte)
1576             return dfa->lex.lasttok = WCHAR;
1577
1578           if (dfa->syntax.case_fold && isalpha (c))
1579             {
1580               charclass ccl;
1581               zeroset (&ccl);
1582               setbit_case_fold_c (c, &ccl);
1583               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1584             }
1585
1586           return dfa->lex.lasttok = c;
1587         }
1588     }
1589
1590   /* The above loop should consume at most a backslash
1591      and some other character.  */
1592   abort ();
1593   return END;                   /* keeps pedantic compilers happy.  */
1594 }
1595
1596 static void
1597 addtok_mb (struct dfa *dfa, token t, char mbprop)
1598 {
1599   if (dfa->talloc == dfa->tindex)
1600     {
1601       dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
1602                                 sizeof *dfa->tokens);
1603       if (dfa->localeinfo.multibyte)
1604         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1605                                          sizeof *dfa->multibyte_prop);
1606     }
1607   if (dfa->localeinfo.multibyte)
1608     dfa->multibyte_prop[dfa->tindex] = mbprop;
1609   dfa->tokens[dfa->tindex++] = t;
1610
1611   switch (t)
1612     {
1613     case QMARK:
1614     case STAR:
1615     case PLUS:
1616       break;
1617
1618     case CAT:
1619     case OR:
1620       dfa->parse.depth--;
1621       break;
1622
1623     case BACKREF:
1624       dfa->fast = false;
1625       FALLTHROUGH;
1626     default:
1627       dfa->nleaves++;
1628       FALLTHROUGH;
1629     case EMPTY:
1630       dfa->parse.depth++;
1631       break;
1632     }
1633   if (dfa->parse.depth > dfa->depth)
1634     dfa->depth = dfa->parse.depth;
1635 }
1636
1637 static void addtok_wc (struct dfa *dfa, wint_t wc);
1638
1639 /* Add the given token to the parse tree, maintaining the depth count and
1640    updating the maximum depth if necessary.  */
1641 static void
1642 addtok (struct dfa *dfa, token t)
1643 {
1644   if (dfa->localeinfo.multibyte && t == MBCSET)
1645     {
1646       bool need_or = false;
1647
1648       /* Extract wide characters into alternations for better performance.
1649          This does not require UTF-8.  */
1650       for (ptrdiff_t i = 0; i < dfa->lex.brack.nchars; i++)
1651         {
1652           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1653           if (need_or)
1654             addtok (dfa, OR);
1655           need_or = true;
1656         }
1657       dfa->lex.brack.nchars = 0;
1658
1659       /* Wide characters have been handled above, so it is possible
1660          that the set is empty now.  Do nothing in that case.  */
1661       if (dfa->lex.brack.cset != -1)
1662         {
1663           addtok (dfa, CSET + dfa->lex.brack.cset);
1664           if (need_or)
1665             addtok (dfa, OR);
1666         }
1667     }
1668   else
1669     {
1670       addtok_mb (dfa, t, 3);
1671     }
1672 }
1673
1674 /* We treat a multibyte character as a single atom, so that DFA
1675    can treat a multibyte character as a single expression.
1676
1677    e.g., we construct the following tree from "<mb1><mb2>".
1678    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1679    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1680 static void
1681 addtok_wc (struct dfa *dfa, wint_t wc)
1682 {
1683   unsigned char buf[MB_LEN_MAX];
1684   mbstate_t s = { 0 };
1685   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1686
1687   if (stored_bytes != (size_t) -1)
1688     dfa->lex.cur_mb_len = stored_bytes;
1689   else
1690     {
1691       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1692          the addtok_mb call altogether can corrupt the heap.  */
1693       dfa->lex.cur_mb_len = 1;
1694       buf[0] = 0;
1695     }
1696
1697   addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
1698   for (int i = 1; i < dfa->lex.cur_mb_len; i++)
1699     {
1700       addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
1701       addtok (dfa, CAT);
1702     }
1703 }
1704
1705 static void
1706 add_utf8_anychar (struct dfa *dfa)
1707 {
1708   static charclass const utf8_classes[5] = {
1709     /* 80-bf: non-leading bytes.  */
1710     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1711
1712     /* 00-7f: 1-byte sequence.  */
1713     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1714
1715     /* c2-df: 2-byte sequence.  */
1716     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1717
1718     /* e0-ef: 3-byte sequence.  */
1719     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1720
1721     /* f0-f7: 4-byte sequence.  */
1722     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1723   };
1724   const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1725
1726   /* Define the five character classes that are needed below.  */
1727   if (dfa->utf8_anychar_classes[0] == 0)
1728     for (unsigned int i = 0; i < n; i++)
1729       {
1730         charclass c = utf8_classes[i];
1731         if (i == 1)
1732           {
1733             if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1734               clrbit ('\n', &c);
1735             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1736               clrbit ('\0', &c);
1737           }
1738         dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, &c);
1739       }
1740
1741   /* A valid UTF-8 character is
1742
1743      ([0x00-0x7f]
1744      |[0xc2-0xdf][0x80-0xbf]
1745      |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1746      |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1747
1748      which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x00-0x7f]
1749      and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
1750      Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
1751   unsigned int i;
1752   for (i = 1; i < n; i++)
1753     addtok (dfa, dfa->utf8_anychar_classes[i]);
1754   while (--i > 1)
1755     {
1756       addtok (dfa, dfa->utf8_anychar_classes[0]);
1757       addtok (dfa, CAT);
1758       addtok (dfa, OR);
1759     }
1760 }
1761
1762 /* The grammar understood by the parser is as follows.
1763
1764    regexp:
1765      regexp OR branch
1766      branch
1767
1768    branch:
1769      branch closure
1770      closure
1771
1772    closure:
1773      closure QMARK
1774      closure STAR
1775      closure PLUS
1776      closure REPMN
1777      atom
1778
1779    atom:
1780      <normal character>
1781      <multibyte character>
1782      ANYCHAR
1783      MBCSET
1784      CSET
1785      BACKREF
1786      BEGLINE
1787      ENDLINE
1788      BEGWORD
1789      ENDWORD
1790      LIMWORD
1791      NOTLIMWORD
1792      LPAREN regexp RPAREN
1793      <empty>
1794
1795    The parser builds a parse tree in postfix form in an array of tokens.  */
1796
1797 static void
1798 atom (struct dfa *dfa)
1799 {
1800   if (dfa->parse.tok == WCHAR)
1801     {
1802       if (dfa->lex.wctok == WEOF)
1803         addtok (dfa, BACKREF);
1804       else
1805         {
1806           addtok_wc (dfa, dfa->lex.wctok);
1807
1808           if (dfa->syntax.case_fold)
1809             {
1810               wchar_t folded[CASE_FOLDED_BUFSIZE];
1811               unsigned int n = case_folded_counterparts (dfa->lex.wctok,
1812                                                          folded);
1813               for (unsigned int i = 0; i < n; i++)
1814                 {
1815                   addtok_wc (dfa, folded[i]);
1816                   addtok (dfa, OR);
1817                 }
1818             }
1819         }
1820
1821       dfa->parse.tok = lex (dfa);
1822     }
1823   else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1824     {
1825       /* For UTF-8 expand the period to a series of CSETs that define a valid
1826          UTF-8 character.  This avoids using the slow multibyte path.  I'm
1827          pretty sure it would be both profitable and correct to do it for
1828          any encoding; however, the optimization must be done manually as
1829          it is done above in add_utf8_anychar.  So, let's start with
1830          UTF-8: it is the most used, and the structure of the encoding
1831          makes the correctness more obvious.  */
1832       add_utf8_anychar (dfa);
1833       dfa->parse.tok = lex (dfa);
1834     }
1835   else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1836            || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
1837            || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1838            || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
1839            || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
1840            || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
1841     {
1842       addtok (dfa, dfa->parse.tok);
1843       dfa->parse.tok = lex (dfa);
1844     }
1845   else if (dfa->parse.tok == LPAREN)
1846     {
1847       dfa->parse.tok = lex (dfa);
1848       regexp (dfa);
1849       if (dfa->parse.tok != RPAREN)
1850         dfaerror (_("unbalanced ("));
1851       dfa->parse.tok = lex (dfa);
1852     }
1853   else
1854     addtok (dfa, EMPTY);
1855 }
1856
1857 /* Return the number of tokens in the given subexpression.  */
1858 static size_t _GL_ATTRIBUTE_PURE
1859 nsubtoks (struct dfa const *dfa, size_t tindex)
1860 {
1861   switch (dfa->tokens[tindex - 1])
1862     {
1863     default:
1864       return 1;
1865     case QMARK:
1866     case STAR:
1867     case PLUS:
1868       return 1 + nsubtoks (dfa, tindex - 1);
1869     case CAT:
1870     case OR:
1871       {
1872         size_t ntoks1 = nsubtoks (dfa, tindex - 1);
1873         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1874       }
1875     }
1876 }
1877
1878 /* Copy the given subexpression to the top of the tree.  */
1879 static void
1880 copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
1881 {
1882   if (dfa->localeinfo.multibyte)
1883     for (size_t i = 0; i < ntokens; ++i)
1884       addtok_mb (dfa, dfa->tokens[tindex + i],
1885                  dfa->multibyte_prop[tindex + i]);
1886   else
1887     for (size_t i = 0; i < ntokens; ++i)
1888       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1889 }
1890
1891 static void
1892 closure (struct dfa *dfa)
1893 {
1894   atom (dfa);
1895   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1896          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1897     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1898       {
1899         size_t ntokens = nsubtoks (dfa, dfa->tindex);
1900         size_t tindex = dfa->tindex - ntokens;
1901         if (dfa->lex.maxrep < 0)
1902           addtok (dfa, PLUS);
1903         if (dfa->lex.minrep == 0)
1904           addtok (dfa, QMARK);
1905         int i;
1906         for (i = 1; i < dfa->lex.minrep; i++)
1907           {
1908             copytoks (dfa, tindex, ntokens);
1909             addtok (dfa, CAT);
1910           }
1911         for (; i < dfa->lex.maxrep; i++)
1912           {
1913             copytoks (dfa, tindex, ntokens);
1914             addtok (dfa, QMARK);
1915             addtok (dfa, CAT);
1916           }
1917         dfa->parse.tok = lex (dfa);
1918       }
1919     else if (dfa->parse.tok == REPMN)
1920       {
1921         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1922         dfa->parse.tok = lex (dfa);
1923         closure (dfa);
1924       }
1925     else
1926       {
1927         addtok (dfa, dfa->parse.tok);
1928         dfa->parse.tok = lex (dfa);
1929       }
1930 }
1931
1932 static void
1933 branch (struct dfa* dfa)
1934 {
1935   closure (dfa);
1936   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1937          && dfa->parse.tok >= 0)
1938     {
1939       closure (dfa);
1940       addtok (dfa, CAT);
1941     }
1942 }
1943
1944 static void
1945 regexp (struct dfa *dfa)
1946 {
1947   branch (dfa);
1948   while (dfa->parse.tok == OR)
1949     {
1950       dfa->parse.tok = lex (dfa);
1951       branch (dfa);
1952       addtok (dfa, OR);
1953     }
1954 }
1955
1956 /* Main entry point for the parser.  S is a string to be parsed, len is the
1957    length of the string, so s can include NUL characters.  D is a pointer to
1958    the struct dfa to parse into.  */
1959 static void
1960 dfaparse (char const *s, size_t len, struct dfa *d)
1961 {
1962   d->lex.ptr = s;
1963   d->lex.left = len;
1964   d->lex.lasttok = END;
1965   d->lex.laststart = true;
1966
1967   if (!d->syntax.syntax_bits_set)
1968     dfaerror (_("no syntax specified"));
1969
1970   d->parse.tok = lex (d);
1971   d->parse.depth = d->depth;
1972
1973   regexp (d);
1974
1975   if (d->parse.tok != END)
1976     dfaerror (_("unbalanced )"));
1977
1978   addtok (d, END - d->nregexps);
1979   addtok (d, CAT);
1980
1981   if (d->nregexps)
1982     addtok (d, OR);
1983
1984   ++d->nregexps;
1985 }
1986
1987 /* Some primitives for operating on sets of positions.  */
1988
1989 /* Copy one set to another.  */
1990 static void
1991 copy (position_set const *src, position_set *dst)
1992 {
1993   if (dst->alloc < src->nelem)
1994     {
1995       free (dst->elems);
1996       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
1997                             sizeof *dst->elems);
1998     }
1999   dst->nelem = src->nelem;
2000   if (src->nelem != 0)
2001     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2002 }
2003
2004 static void
2005 alloc_position_set (position_set *s, size_t size)
2006 {
2007   s->elems = xnmalloc (size, sizeof *s->elems);
2008   s->alloc = size;
2009   s->nelem = 0;
2010 }
2011
2012 /* Insert position P in set S.  S is maintained in sorted order on
2013    decreasing index.  If there is already an entry in S with P.index
2014    then merge (logically-OR) P's constraints into the one in S.
2015    S->elems must point to an array large enough to hold the resulting set.  */
2016 static void
2017 insert (position p, position_set *s)
2018 {
2019   ptrdiff_t count = s->nelem;
2020   ptrdiff_t lo = 0, hi = count;
2021   while (lo < hi)
2022     {
2023       ptrdiff_t mid = (lo + hi) >> 1;
2024       if (s->elems[mid].index > p.index)
2025         lo = mid + 1;
2026       else if (s->elems[mid].index == p.index)
2027         {
2028           s->elems[mid].constraint |= p.constraint;
2029           return;
2030         }
2031       else
2032         hi = mid;
2033     }
2034
2035   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2036   for (ptrdiff_t i = count; i > lo; i--)
2037     s->elems[i] = s->elems[i - 1];
2038   s->elems[lo] = p;
2039   ++s->nelem;
2040 }
2041
2042 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2043    result is as if the positions of S1, and of S2 with the additional
2044    constraint C2, were inserted into an initially empty set.  */
2045 static void
2046 merge_constrained (position_set const *s1, position_set const *s2,
2047                    unsigned int c2, position_set *m)
2048 {
2049   ptrdiff_t i = 0, j = 0;
2050
2051   if (m->alloc - s1->nelem < s2->nelem)
2052     {
2053       free (m->elems);
2054       m->alloc = s1->nelem;
2055       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2056     }
2057   m->nelem = 0;
2058   while (i < s1->nelem || j < s2->nelem)
2059     if (! (j < s2->nelem)
2060         || (i < s1->nelem && s1->elems[i].index >= s2->elems[j].index))
2061       {
2062         unsigned int c = ((i < s1->nelem && j < s2->nelem
2063                            && s1->elems[i].index == s2->elems[j].index)
2064                           ? s2->elems[j++].constraint & c2
2065                           : 0);
2066         m->elems[m->nelem].index = s1->elems[i].index;
2067         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2068       }
2069     else
2070       {
2071         if (s2->elems[j].constraint & c2)
2072           {
2073             m->elems[m->nelem].index = s2->elems[j].index;
2074             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2075           }
2076         j++;
2077       }
2078 }
2079
2080 /* Merge two sets of positions into a third.  The result is exactly as if
2081    the positions of both sets were inserted into an initially empty set.  */
2082 static void
2083 merge (position_set const *s1, position_set const *s2, position_set *m)
2084 {
2085   merge_constrained (s1, s2, -1, m);
2086 }
2087
2088 /* Delete a position from a set.  Return the nonzero constraint of the
2089    deleted position, or zero if there was no such position.  */
2090 static unsigned int
2091 delete (size_t del, position_set *s)
2092 {
2093   size_t count = s->nelem;
2094   size_t lo = 0, hi = count;
2095   while (lo < hi)
2096     {
2097       size_t mid = (lo + hi) >> 1;
2098       if (s->elems[mid].index > del)
2099         lo = mid + 1;
2100       else if (s->elems[mid].index == del)
2101         {
2102           unsigned int c = s->elems[mid].constraint;
2103           size_t i;
2104           for (i = mid; i + 1 < count; i++)
2105             s->elems[i] = s->elems[i + 1];
2106           s->nelem = i;
2107           return c;
2108         }
2109       else
2110         hi = mid;
2111     }
2112   return 0;
2113 }
2114
2115 /* Replace a position with the followed set.  */
2116 static void
2117 replace (position_set *dst, size_t del, position_set *add,
2118          unsigned int constraint, position_set *tmp)
2119 {
2120   unsigned int c = delete (del, dst) & constraint;
2121
2122   if (c)
2123     {
2124       copy (dst, tmp);
2125       merge_constrained (tmp, add, c, dst);
2126     }
2127 }
2128
2129 /* Find the index of the state corresponding to the given position set with
2130    the given preceding context, or create a new state if there is no such
2131    state.  Context tells whether we got here on a newline or letter.  */
2132 static state_num
2133 state_index (struct dfa *d, position_set const *s, int context)
2134 {
2135   size_t hash = 0;
2136   int constraint = 0;
2137   state_num i;
2138   token first_end = 0;
2139
2140   for (i = 0; i < s->nelem; ++i)
2141     hash ^= s->elems[i].index + s->elems[i].constraint;
2142
2143   /* Try to find a state that exactly matches the proposed one.  */
2144   for (i = 0; i < d->sindex; ++i)
2145     {
2146       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2147           || context != d->states[i].context)
2148         continue;
2149       state_num j;
2150       for (j = 0; j < s->nelem; ++j)
2151         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2152             || s->elems[j].index != d->states[i].elems.elems[j].index)
2153           break;
2154       if (j == s->nelem)
2155         return i;
2156     }
2157
2158 #ifdef DEBUG
2159   fprintf (stderr, "new state %zd\n nextpos:", i);
2160   for (state_num j = 0; j < s->nelem; j++)
2161     {
2162       fprintf (stderr, " %zu:", s->elems[j].index);
2163       prtok (d->tokens[s->elems[j].index]);
2164     }
2165   fprintf (stderr, "\n context:");
2166   if (context ^ CTX_ANY)
2167     {
2168       if (context & CTX_NONE)
2169         fprintf (stderr, " CTX_NONE");
2170       if (context & CTX_LETTER)
2171         fprintf (stderr, " CTX_LETTER");
2172       if (context & CTX_NEWLINE)
2173         fprintf (stderr, " CTX_NEWLINE");
2174     }
2175   else
2176     fprintf (stderr, " CTX_ANY");
2177   fprintf (stderr, "\n");
2178 #endif
2179
2180   for (state_num j = 0; j < s->nelem; j++)
2181     {
2182       int c = s->elems[j].constraint;
2183       if (d->tokens[s->elems[j].index] < 0)
2184         {
2185           if (succeeds_in_context (c, context, CTX_ANY))
2186             constraint |= c;
2187           if (!first_end)
2188             first_end = d->tokens[s->elems[j].index];
2189         }
2190       else if (d->tokens[s->elems[j].index] == BACKREF)
2191         constraint = NO_CONSTRAINT;
2192     }
2193
2194
2195   /* Create a new state.  */
2196   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2197                              sizeof *d->states);
2198   d->states[i].hash = hash;
2199   alloc_position_set (&d->states[i].elems, s->nelem);
2200   copy (s, &d->states[i].elems);
2201   d->states[i].context = context;
2202   d->states[i].constraint = constraint;
2203   d->states[i].first_end = first_end;
2204   d->states[i].mbps.nelem = 0;
2205   d->states[i].mbps.elems = NULL;
2206   d->states[i].mb_trindex = -1;
2207
2208   ++d->sindex;
2209
2210   return i;
2211 }
2212
2213 /* Find the epsilon closure of a set of positions.  If any position of the set
2214    contains a symbol that matches the empty string in some context, replace
2215    that position with the elements of its follow labeled with an appropriate
2216    constraint.  Repeat exhaustively until no funny positions are left.
2217    S->elems must be large enough to hold the result.  */
2218 static void
2219 epsclosure (position_set *initial, struct dfa const *d)
2220 {
2221   position_set tmp;
2222   alloc_position_set (&tmp, d->nleaves);
2223   for (size_t i = 0; i < d->tindex; ++i)
2224     if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2225         && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2226         && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2227       {
2228         unsigned int constraint;
2229         switch (d->tokens[i])
2230           {
2231           case BEGLINE:
2232             constraint = BEGLINE_CONSTRAINT;
2233             break;
2234           case ENDLINE:
2235             constraint = ENDLINE_CONSTRAINT;
2236             break;
2237           case BEGWORD:
2238             constraint = BEGWORD_CONSTRAINT;
2239             break;
2240           case ENDWORD:
2241             constraint = ENDWORD_CONSTRAINT;
2242             break;
2243           case LIMWORD:
2244             constraint = LIMWORD_CONSTRAINT;
2245             break;
2246           case NOTLIMWORD:
2247             constraint = NOTLIMWORD_CONSTRAINT;
2248             break;
2249           default:
2250             constraint = NO_CONSTRAINT;
2251             break;
2252           }
2253
2254         delete (i, &d->follows[i]);
2255
2256         for (size_t j = 0; j < d->tindex; j++)
2257           if (i != j && d->follows[j].nelem > 0)
2258             replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2259
2260         replace (initial, i, &d->follows[i], constraint, &tmp);
2261       }
2262   free (tmp.elems);
2263 }
2264
2265 /* Returns the set of contexts for which there is at least one
2266    character included in C.  */
2267
2268 static int
2269 charclass_context (struct dfa const *dfa, charclass const *c)
2270 {
2271   int context = 0;
2272
2273   for (unsigned int j = 0; j < CHARCLASS_WORDS; ++j)
2274     {
2275       if (c->w[j] & dfa->syntax.newline.w[j])
2276         context |= CTX_NEWLINE;
2277       if (c->w[j] & dfa->syntax.letters.w[j])
2278         context |= CTX_LETTER;
2279       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2280         context |= CTX_NONE;
2281     }
2282
2283   return context;
2284 }
2285
2286 /* Returns the contexts on which the position set S depends.  Each context
2287    in the set of returned contexts (let's call it SC) may have a different
2288    follow set than other contexts in SC, and also different from the
2289    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2290    in the complement set will have the same follow set.  */
2291
2292 static int _GL_ATTRIBUTE_PURE
2293 state_separate_contexts (position_set const *s)
2294 {
2295   int separate_contexts = 0;
2296
2297   for (size_t j = 0; j < s->nelem; j++)
2298     {
2299       if (prev_newline_dependent (s->elems[j].constraint))
2300         separate_contexts |= CTX_NEWLINE;
2301       if (prev_letter_dependent (s->elems[j].constraint))
2302         separate_contexts |= CTX_LETTER;
2303     }
2304
2305   return separate_contexts;
2306 }
2307
2308
2309 /* Perform bottom-up analysis on the parse tree, computing various functions.
2310    Note that at this point, we're pretending constructs like \< are real
2311    characters rather than constraints on what can follow them.
2312
2313    Nullable:  A node is nullable if it is at the root of a regexp that can
2314    match the empty string.
2315    *  EMPTY leaves are nullable.
2316    * No other leaf is nullable.
2317    * A QMARK or STAR node is nullable.
2318    * A PLUS node is nullable if its argument is nullable.
2319    * A CAT node is nullable if both its arguments are nullable.
2320    * An OR node is nullable if either argument is nullable.
2321
2322    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2323    that could correspond to the first character of a string matching the
2324    regexp rooted at the given node.
2325    * EMPTY leaves have empty firstpos.
2326    * The firstpos of a nonempty leaf is that leaf itself.
2327    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2328      argument.
2329    * The firstpos of a CAT node is the firstpos of the left argument, union
2330      the firstpos of the right if the left argument is nullable.
2331    * The firstpos of an OR node is the union of firstpos of each argument.
2332
2333    Lastpos:  The lastpos of a node is the set of positions that could
2334    correspond to the last character of a string matching the regexp at
2335    the given node.
2336    * EMPTY leaves have empty lastpos.
2337    * The lastpos of a nonempty leaf is that leaf itself.
2338    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2339      argument.
2340    * The lastpos of a CAT node is the lastpos of its right argument, union
2341      the lastpos of the left if the right argument is nullable.
2342    * The lastpos of an OR node is the union of the lastpos of each argument.
2343
2344    Follow:  The follow of a position is the set of positions that could
2345    correspond to the character following a character matching the node in
2346    a string matching the regexp.  At this point we consider special symbols
2347    that match the empty string in some context to be just normal characters.
2348    Later, if we find that a special symbol is in a follow set, we will
2349    replace it with the elements of its follow, labeled with an appropriate
2350    constraint.
2351    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2352      the follow of every node in the lastpos.
2353    * Every node in the firstpos of the second argument of a CAT node is in
2354      the follow of every node in the lastpos of the first argument.
2355
2356    Because of the postfix representation of the parse tree, the depth-first
2357    analysis is conveniently done by a linear scan with the aid of a stack.
2358    Sets are stored as arrays of the elements, obeying a stack-like allocation
2359    scheme; the number of elements in each set deeper in the stack can be
2360    used to determine the address of a particular set's array.  */
2361 static void
2362 dfaanalyze (struct dfa *d, bool searchflag)
2363 {
2364   /* Array allocated to hold position sets.  */
2365   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2366   /* Firstpos and lastpos elements.  */
2367   position *firstpos = posalloc + d->nleaves;
2368   position *lastpos = firstpos + d->nleaves;
2369
2370   /* Stack for element counts and nullable flags.  */
2371   struct
2372   {
2373     /* Whether the entry is nullable.  */
2374     bool nullable;
2375
2376     /* Counts of firstpos and lastpos sets.  */
2377     size_t nfirstpos;
2378     size_t nlastpos;
2379   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2380
2381   position_set merged;          /* Result of merging sets.  */
2382
2383 #ifdef DEBUG
2384   fprintf (stderr, "dfaanalyze:\n");
2385   for (size_t i = 0; i < d->tindex; ++i)
2386     {
2387       fprintf (stderr, " %zu:", i);
2388       prtok (d->tokens[i]);
2389     }
2390   putc ('\n', stderr);
2391 #endif
2392
2393   d->searchflag = searchflag;
2394   alloc_position_set (&merged, d->nleaves);
2395   d->follows = xcalloc (d->tindex, sizeof *d->follows);
2396
2397   for (size_t i = 0; i < d->tindex; ++i)
2398     {
2399       switch (d->tokens[i])
2400         {
2401         case EMPTY:
2402           /* The empty set is nullable.  */
2403           stk->nullable = true;
2404
2405           /* The firstpos and lastpos of the empty leaf are both empty.  */
2406           stk->nfirstpos = stk->nlastpos = 0;
2407           stk++;
2408           break;
2409
2410         case STAR:
2411         case PLUS:
2412           /* Every element in the firstpos of the argument is in the follow
2413              of every element in the lastpos.  */
2414           {
2415             position_set tmp;
2416             tmp.nelem = stk[-1].nfirstpos;
2417             tmp.elems = firstpos;
2418             position *pos = lastpos;
2419             for (size_t j = 0; j < stk[-1].nlastpos; j++)
2420               {
2421                 merge (&tmp, &d->follows[pos[j].index], &merged);
2422                 copy (&merged, &d->follows[pos[j].index]);
2423               }
2424           }
2425           FALLTHROUGH;
2426         case QMARK:
2427           /* A QMARK or STAR node is automatically nullable.  */
2428           if (d->tokens[i] != PLUS)
2429             stk[-1].nullable = true;
2430           break;
2431
2432         case CAT:
2433           /* Every element in the firstpos of the second argument is in the
2434              follow of every element in the lastpos of the first argument.  */
2435           {
2436             position_set tmp;
2437             tmp.nelem = stk[-1].nfirstpos;
2438             tmp.elems = firstpos;
2439             position *pos = lastpos + stk[-1].nlastpos;
2440             for (size_t j = 0; j < stk[-2].nlastpos; j++)
2441               {
2442                 merge (&tmp, &d->follows[pos[j].index], &merged);
2443                 copy (&merged, &d->follows[pos[j].index]);
2444               }
2445           }
2446
2447           /* The firstpos of a CAT node is the firstpos of the first argument,
2448              union that of the second argument if the first is nullable.  */
2449           if (stk[-2].nullable)
2450             stk[-2].nfirstpos += stk[-1].nfirstpos;
2451           else
2452             firstpos += stk[-1].nfirstpos;
2453
2454           /* The lastpos of a CAT node is the lastpos of the second argument,
2455              union that of the first argument if the second is nullable.  */
2456           if (stk[-1].nullable)
2457             stk[-2].nlastpos += stk[-1].nlastpos;
2458           else
2459             {
2460               position *pos = lastpos + stk[-2].nlastpos;
2461               for (size_t j = stk[-1].nlastpos; j-- > 0;)
2462                 pos[j] = lastpos[j];
2463               lastpos += stk[-2].nlastpos;
2464               stk[-2].nlastpos = stk[-1].nlastpos;
2465             }
2466
2467           /* A CAT node is nullable if both arguments are nullable.  */
2468           stk[-2].nullable &= stk[-1].nullable;
2469           stk--;
2470           break;
2471
2472         case OR:
2473           /* The firstpos is the union of the firstpos of each argument.  */
2474           stk[-2].nfirstpos += stk[-1].nfirstpos;
2475
2476           /* The lastpos is the union of the lastpos of each argument.  */
2477           stk[-2].nlastpos += stk[-1].nlastpos;
2478
2479           /* An OR node is nullable if either argument is nullable.  */
2480           stk[-2].nullable |= stk[-1].nullable;
2481           stk--;
2482           break;
2483
2484         default:
2485           /* Anything else is a nonempty position.  (Note that special
2486              constructs like \< are treated as nonempty strings here;
2487              an "epsilon closure" effectively makes them nullable later.
2488              Backreferences have to get a real position so we can detect
2489              transitions on them later.  But they are nullable.  */
2490           stk->nullable = d->tokens[i] == BACKREF;
2491
2492           /* This position is in its own firstpos and lastpos.  */
2493           stk->nfirstpos = stk->nlastpos = 1;
2494           stk++;
2495
2496           --firstpos, --lastpos;
2497           firstpos->index = lastpos->index = i;
2498           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2499
2500           break;
2501         }
2502 #ifdef DEBUG
2503       /* ... balance the above nonsyntactic #ifdef goo...  */
2504       fprintf (stderr, "node %zu:", i);
2505       prtok (d->tokens[i]);
2506       putc ('\n', stderr);
2507       fprintf (stderr,
2508                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2509       fprintf (stderr, " firstpos:");
2510       for (size_t j = stk[-1].nfirstpos; j-- > 0;)
2511         {
2512           fprintf (stderr, " %zu:", firstpos[j].index);
2513           prtok (d->tokens[firstpos[j].index]);
2514         }
2515       fprintf (stderr, "\n lastpos:");
2516       for (size_t j = stk[-1].nlastpos; j-- > 0;)
2517         {
2518           fprintf (stderr, " %zu:", lastpos[j].index);
2519           prtok (d->tokens[lastpos[j].index]);
2520         }
2521       putc ('\n', stderr);
2522 #endif
2523     }
2524
2525 #ifdef DEBUG
2526   for (size_t i = 0; i < d->tindex; ++i)
2527     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2528         || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2529         || d->tokens[i] >= CSET)
2530       {
2531         fprintf (stderr, "follows(%zu:", i);
2532         prtok (d->tokens[i]);
2533         fprintf (stderr, "):");
2534         for (size_t j = d->follows[i].nelem; j-- > 0;)
2535           {
2536             fprintf (stderr, " %zu:", d->follows[i].elems[j].index);
2537             prtok (d->tokens[d->follows[i].elems[j].index]);
2538           }
2539         putc ('\n', stderr);
2540       }
2541 #endif
2542
2543   /* Get the epsilon closure of the firstpos of the regexp.  The result will
2544      be the set of positions of state 0.  */
2545   merged.nelem = 0;
2546   for (size_t i = 0; i < stk[-1].nfirstpos; ++i)
2547     insert (firstpos[i], &merged);
2548
2549   /* For each follow set that is the follow set of a real position, replace
2550      it with its epsilon closure.  */
2551   epsclosure (&merged, d);
2552
2553   /* Context wanted by some position.  */
2554   int separate_contexts = state_separate_contexts (&merged);
2555
2556   /* Build the initial state.  */
2557   if (separate_contexts & CTX_NEWLINE)
2558     state_index (d, &merged, CTX_NEWLINE);
2559   d->initstate_notbol = d->min_trcount
2560     = state_index (d, &merged, separate_contexts ^ CTX_ANY);
2561   if (separate_contexts & CTX_LETTER)
2562     d->min_trcount = state_index (d, &merged, CTX_LETTER);
2563   d->min_trcount++;
2564   d->trcount = 0;
2565
2566   free (posalloc);
2567   free (stkalloc);
2568   free (merged.elems);
2569 }
2570
2571 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2572 static void
2573 realloc_trans_if_necessary (struct dfa *d)
2574 {
2575   state_num oldalloc = d->tralloc;
2576   if (oldalloc < d->sindex)
2577     {
2578       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2579       ptrdiff_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2580       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2581                            -1, sizeof *realtrans);
2582       realtrans[0] = realtrans[1] = NULL;
2583       d->trans = realtrans + 2;
2584       ptrdiff_t newalloc = d->tralloc = newalloc1 - 2;
2585       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2586       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2587       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2588       if (d->localeinfo.multibyte)
2589         {
2590           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2591           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2592           if (oldalloc == 0)
2593             realtrans[0] = realtrans[1] = NULL;
2594           d->mb_trans = realtrans + 2;
2595         }
2596       for (; oldalloc < newalloc; oldalloc++)
2597         {
2598           d->trans[oldalloc] = NULL;
2599           d->fails[oldalloc] = NULL;
2600           if (d->localeinfo.multibyte)
2601             d->mb_trans[oldalloc] = NULL;
2602         }
2603     }
2604 }
2605
2606 /*
2607    Calculate the transition table for a new state derived from state s
2608    for a compiled dfa d after input character uc, and return the new
2609    state number.
2610
2611    Do not worry about all possible input characters; calculate just the group
2612    of positions that match uc.  Label it with the set of characters that
2613    every position in the group matches (taking into account, if necessary,
2614    preceding context information of s).  Then find the union
2615    of these positions' follows, i.e., the set of positions of the
2616    new state.  For each character in the group's label, set the transition
2617    on this character to be to a state corresponding to the set's positions,
2618    and its associated backward context information, if necessary.
2619
2620    When building a searching matcher, include the positions of state
2621    0 in every state.
2622
2623    The group is constructed by building an equivalence-class
2624    partition of the positions of s.
2625
2626    For each position, find the set of characters C that it matches.  Eliminate
2627    any characters from C that fail on grounds of backward context.
2628
2629    Check whether the group's label L has nonempty
2630    intersection with C.  If L - C is nonempty, create a new group labeled
2631    L - C and having the same positions as the current group, and set L to
2632    the intersection of L and C.  Insert the position in the group, set
2633    C = C - L, and resume scanning.
2634
2635    If after comparing with every group there are characters remaining in C,
2636    create a new group labeled with the characters of C and insert this
2637    position in that group.  */
2638
2639 static state_num
2640 build_state (state_num s, struct dfa *d, unsigned char uc)
2641 {
2642   position_set follows;         /* Union of the follows of the group.  */
2643   position_set tmp;             /* Temporary space for merging sets.  */
2644   state_num state;              /* New state.  */
2645   state_num state_newline;      /* New state on a newline transition.  */
2646   state_num state_letter;       /* New state on a letter transition.  */
2647
2648 #ifdef DEBUG
2649   fprintf (stderr, "build state %td\n", s);
2650 #endif
2651
2652   /* A pointer to the new transition table, and the table itself.  */
2653   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2654   state_num *trans = *ptrans;
2655
2656   if (!trans)
2657     {
2658       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2659          transition tables that can exist at once, other than for
2660          initial states.  Often-used transition tables are quickly
2661          rebuilt, whereas rarely-used ones are cleared away.  */
2662       if (MAX_TRCOUNT <= d->trcount)
2663         {
2664           for (state_num i = d->min_trcount; i < d->tralloc; i++)
2665             {
2666               free (d->trans[i]);
2667               free (d->fails[i]);
2668               d->trans[i] = d->fails[i] = NULL;
2669             }
2670           d->trcount = 0;
2671         }
2672
2673       d->trcount++;
2674       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2675
2676       /* Fill transition table with a default value which means that the
2677          transited state has not been calculated yet.  */
2678       for (int i = 0; i < NOTCHAR; i++)
2679         trans[i] = -2;
2680     }
2681
2682   /* Set up the success bits for this state.  */
2683   d->success[s] = 0;
2684   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2685     d->success[s] |= CTX_NEWLINE;
2686   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2687     d->success[s] |= CTX_LETTER;
2688   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2689     d->success[s] |= CTX_NONE;
2690
2691   /* Positions that match the input char.  */
2692   leaf_set group;
2693   group.elems = xnmalloc (d->nleaves, sizeof *group.elems);
2694   group.nelem = 0;
2695
2696   /* The group's label.  */
2697   charclass label;
2698   fillset (&label);
2699
2700   for (size_t i = 0; i < d->states[s].elems.nelem; ++i)
2701     {
2702       charclass matches;            /* Set of matching characters.  */
2703       position pos = d->states[s].elems.elems[i];
2704       bool matched = false;
2705       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2706         {
2707           zeroset (&matches);
2708           setbit (d->tokens[pos.index], &matches);
2709           if (d->tokens[pos.index] == uc)
2710             matched = true;
2711         }
2712       else if (d->tokens[pos.index] >= CSET)
2713         {
2714           matches = d->charclasses[d->tokens[pos.index] - CSET];
2715           if (tstbit (uc, &matches))
2716             matched = true;
2717         }
2718       else if (d->tokens[pos.index] == ANYCHAR)
2719         {
2720           matches = d->charclasses[d->canychar];
2721           if (tstbit (uc, &matches))
2722             matched = true;
2723
2724           /* ANYCHAR must match with a single character, so we must put
2725              it to D->states[s].mbps which contains the positions which
2726              can match with a single character not a byte.  If all
2727              positions which has ANYCHAR does not depend on context of
2728              next character, we put the follows instead of it to
2729              D->states[s].mbps to optimize.  */
2730           if (succeeds_in_context (pos.constraint, d->states[s].context,
2731                                    CTX_NONE))
2732             {
2733               if (d->states[s].mbps.nelem == 0)
2734                 alloc_position_set (&d->states[s].mbps,
2735                                     d->follows[pos.index].nelem);
2736               for (size_t j = 0; j < d->follows[pos.index].nelem; j++)
2737                 insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
2738             }
2739         }
2740       else
2741         continue;
2742
2743       /* Some characters may need to be eliminated from matches because
2744          they fail in the current context.  */
2745       if (pos.constraint != NO_CONSTRAINT)
2746         {
2747           if (!succeeds_in_context (pos.constraint,
2748                                     d->states[s].context, CTX_NEWLINE))
2749             for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
2750               matches.w[j] &= ~d->syntax.newline.w[j];
2751           if (!succeeds_in_context (pos.constraint,
2752                                     d->states[s].context, CTX_LETTER))
2753             for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
2754               matches.w[j] &= ~d->syntax.letters.w[j];
2755           if (!succeeds_in_context (pos.constraint,
2756                                     d->states[s].context, CTX_NONE))
2757             for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
2758               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
2759
2760           /* If there are no characters left, there's no point in going on.  */
2761           if (emptyset (&matches))
2762             continue;
2763
2764           /* If we have reset the bit that made us declare "matched", reset
2765              that indicator, too.  This is required to avoid an infinite loop
2766              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
2767           if (!tstbit (uc, &matches))
2768             matched = false;
2769         }
2770
2771 #ifdef DEBUG
2772       fprintf (stderr, " nextpos %zu:", pos.index);
2773       prtok (d->tokens[pos.index]);
2774       fprintf (stderr, " of");
2775       for (size_t j = 0; j < NOTCHAR; j++)
2776         if (tstbit (j, &matches))
2777           fprintf (stderr, " 0x%02zx", j);
2778       fprintf (stderr, "\n");
2779 #endif
2780
2781       if (matched)
2782         {
2783           for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
2784             label.w[k] &= matches.w[k];
2785           group.elems[group.nelem++] = pos.index;
2786         }
2787       else
2788         {
2789           for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
2790             label.w[k] &= ~matches.w[k];
2791         }
2792     }
2793
2794   alloc_position_set (&follows, d->nleaves);
2795   alloc_position_set (&tmp, d->nleaves);
2796
2797   if (group.nelem > 0)
2798     {
2799       follows.nelem = 0;
2800
2801       /* Find the union of the follows of the positions of the group.
2802          This is a hideously inefficient loop.  Fix it someday.  */
2803       for (size_t j = 0; j < group.nelem; ++j)
2804         for (size_t k = 0; k < d->follows[group.elems[j]].nelem; ++k)
2805           insert (d->follows[group.elems[j]].elems[k], &follows);
2806
2807       /* If we are building a searching matcher, throw in the positions
2808          of state 0 as well, if possible.  */
2809       if (d->searchflag)
2810         {
2811           /* If a token in follows.elems is not 1st byte of a multibyte
2812              character, or the states of follows must accept the bytes
2813              which are not 1st byte of the multibyte character.
2814              Then, if a state of follows encounters a byte, it must not be
2815              a 1st byte of a multibyte character nor a single byte character.
2816              In this case, do not add state[0].follows to next state, because
2817              state[0] must accept 1st-byte.
2818
2819              For example, suppose <sb a> is a certain single byte character,
2820              <mb A> is a certain multibyte character, and the codepoint of
2821              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
2822              state[0] accepts <sb a>, state[i] transits to state[i+1] by
2823              accepting the 1st byte of <mb A>, and state[i+1] accepts the
2824              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
2825              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
2826              not add state[0].  */
2827
2828           bool mergeit = !d->localeinfo.multibyte;
2829           if (!mergeit)
2830             {
2831               mergeit = true;
2832               for (size_t j = 0; mergeit && j < follows.nelem; j++)
2833                 mergeit &= d->multibyte_prop[follows.elems[j].index];
2834             }
2835           if (mergeit)
2836             {
2837               merge (&d->states[0].elems, &follows, &tmp);
2838               copy (&tmp, &follows);
2839             }
2840         }
2841
2842       /* Find out if the new state will want any context information,
2843          by calculating possible contexts that the group can match,
2844          and separate contexts that the new state wants to know.  */
2845       int possible_contexts = charclass_context (d, &label);
2846       int separate_contexts = state_separate_contexts (&follows);
2847
2848       /* Find the state(s) corresponding to the union of the follows.  */
2849       if (possible_contexts & ~separate_contexts)
2850         state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2851       else
2852         state = -1;
2853       if (separate_contexts & possible_contexts & CTX_NEWLINE)
2854         state_newline = state_index (d, &follows, CTX_NEWLINE);
2855       else
2856         state_newline = state;
2857       if (separate_contexts & possible_contexts & CTX_LETTER)
2858         state_letter = state_index (d, &follows, CTX_LETTER);
2859       else
2860         state_letter = state;
2861
2862       /* Reallocate now, to reallocate any newline transition properly.  */
2863       realloc_trans_if_necessary (d);
2864     }
2865
2866   /* If we are a searching matcher, the default transition is to a state
2867      containing the positions of state 0, otherwise the default transition
2868      is to fail miserably.  */
2869   else if (d->searchflag)
2870     {
2871       state_newline = 0;
2872       state_letter = d->min_trcount - 1;
2873       state = d->initstate_notbol;
2874     }
2875   else
2876     {
2877       state_newline = -1;
2878       state_letter = -1;
2879       state = -1;
2880     }
2881
2882   /* Set the transitions for each character in the label.  */
2883   for (size_t i = 0; i < NOTCHAR; i++)
2884     if (tstbit (i, &label))
2885       switch (d->syntax.sbit[i])
2886         {
2887         case CTX_NEWLINE:
2888           trans[i] = state_newline;
2889           break;
2890         case CTX_LETTER:
2891           trans[i] = state_letter;
2892           break;
2893         default:
2894           trans[i] = state;
2895           break;
2896         }
2897
2898 #ifdef DEBUG
2899   fprintf (stderr, "trans table %td", s);
2900   for (size_t i = 0; i < NOTCHAR; ++i)
2901     {
2902       if (!(i & 0xf))
2903         fprintf (stderr, "\n");
2904       fprintf (stderr, " %2td", trans[i]);
2905     }
2906   fprintf (stderr, "\n");
2907 #endif
2908
2909   free (group.elems);
2910   free (follows.elems);
2911   free (tmp.elems);
2912
2913   /* Keep the newline transition in a special place so we can use it as
2914      a sentinel.  */
2915   if (tstbit (d->syntax.eolbyte, &label))
2916     {
2917       d->newlines[s] = trans[d->syntax.eolbyte];
2918       trans[d->syntax.eolbyte] = -1;
2919     }
2920
2921   return trans[uc];
2922 }
2923
2924 /* Multibyte character handling sub-routines for dfaexec.  */
2925
2926 /* Consume a single byte and transit state from 's' to '*next_state'.
2927    This function is almost same as the state transition routin in dfaexec.
2928    But state transition is done just once, otherwise matching succeed or
2929    reach the end of the buffer.  */
2930 static state_num
2931 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
2932 {
2933   state_num *t;
2934
2935   if (d->trans[s])
2936     t = d->trans[s];
2937   else if (d->fails[s])
2938     t = d->fails[s];
2939   else
2940     {
2941       build_state (s, d, **pp);
2942       if (d->trans[s])
2943         t = d->trans[s];
2944       else
2945         {
2946           t = d->fails[s];
2947           assert (t);
2948         }
2949     }
2950
2951   if (t[**pp] == -2)
2952     build_state (s, d, **pp);
2953
2954   return t[*(*pp)++];
2955 }
2956
2957 /* Transit state from s, then return new state and update the pointer of
2958    the buffer.  This function is for a period operator which can match a
2959    multi-byte character.  */
2960 static state_num
2961 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
2962                unsigned char const *end)
2963 {
2964   wint_t wc;
2965
2966   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
2967
2968   /* This state has some operators which can match a multibyte character.  */
2969   d->mb_follows.nelem = 0;
2970
2971   /* Calculate the state which can be reached from the state 's' by
2972      consuming 'mbclen' single bytes from the buffer.  */
2973   state_num s1 = s;
2974   int mbci;
2975   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
2976     s = transit_state_singlebyte (d, s, pp);
2977   *pp += mbclen - mbci;
2978
2979   if (wc == WEOF)
2980     {
2981       /* It is an invalid character, so ANYCHAR is not accepted.  */
2982       return s;
2983     }
2984
2985   /* If all positions which have ANYCHAR do not depend on the context
2986      of the next character, calculate the next state with
2987      pre-calculated follows and cache the result.  */
2988   if (d->states[s1].mb_trindex < 0)
2989     {
2990       if (MAX_TRCOUNT <= d->mb_trcount)
2991         {
2992           state_num s3;
2993           for (s3 = -1; s3 < d->tralloc; s3++)
2994             {
2995               free (d->mb_trans[s3]);
2996               d->mb_trans[s3] = NULL;
2997             }
2998
2999           for (state_num i = 0; i < d->sindex; i++)
3000             d->states[i].mb_trindex = -1;
3001           d->mb_trcount = 0;
3002         }
3003       d->states[s1].mb_trindex = d->mb_trcount++;
3004     }
3005
3006   if (! d->mb_trans[s])
3007     {
3008       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3009       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3010       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3011       for (int i = 0; i < MAX_TRCOUNT; i++)
3012         d->mb_trans[s][i] = -1;
3013     }
3014   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3015     return d->mb_trans[s][d->states[s1].mb_trindex];
3016
3017   if (s == -1)
3018     copy (&d->states[s1].mbps, &d->mb_follows);
3019   else
3020     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3021
3022   int separate_contexts = state_separate_contexts (&d->mb_follows);
3023   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3024   realloc_trans_if_necessary (d);
3025
3026   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3027
3028   return s2;
3029 }
3030
3031 /* The initial state may encounter a byte which is not a single byte character
3032    nor the first byte of a multibyte character.  But it is incorrect for the
3033    initial state to accept such a byte.  For example, in Shift JIS the regular
3034    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3035    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3036    that are not a single byte character nor the first byte of a multibyte
3037    character.
3038
3039    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3040    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3041    the result is greater than P, set *WCP to the final wide character
3042    processed, or to WEOF if no wide character is processed.  Otherwise,
3043    if WCP is non-NULL, *WCP may or may not be updated.
3044
3045    Both P and MBP must be no larger than END.  */
3046 static unsigned char const *
3047 skip_remains_mb (struct dfa *d, unsigned char const *p,
3048                  unsigned char const *mbp, char const *end)
3049 {
3050   if (d->syntax.never_trail[*p])
3051     return p;
3052   while (mbp < p)
3053     {
3054       wint_t wc;
3055       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3056                            end - (char const *) mbp, d);
3057     }
3058   return mbp;
3059 }
3060
3061 /* Search through a buffer looking for a match to the struct dfa *D.
3062    Find the first occurrence of a string matching the regexp in the
3063    buffer, and the shortest possible version thereof.  Return a pointer to
3064    the first character after the match, or NULL if none is found.  BEGIN
3065    points to the beginning of the buffer, and END points to the first byte
3066    after its end.  Note however that we store a sentinel byte (usually
3067    newline) in *END, so the actual buffer must be one byte longer.
3068    When ALLOW_NL, newlines may appear in the matching string.
3069    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3070    If MULTIBYTE, the input consists of multibyte characters and/or
3071    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3072    Here is the list of features that make this DFA matcher punt:
3073     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3074     - [^...] in non-simple locale
3075     - [[=foo=]] or [[.foo.]]
3076     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3077     - back-reference: (.)\1
3078     - word-delimiter in multibyte locale: \<, \>, \b, \B
3079    See using_simple_locale for the definition of "simple locale".  */
3080
3081 static inline char *
3082 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3083               size_t *count, bool multibyte)
3084 {
3085   if (MAX_TRCOUNT <= d->sindex)
3086     {
3087       for (state_num s = d->min_trcount; s < d->sindex; s++)
3088         {
3089           free (d->states[s].elems.elems);
3090           free (d->states[s].mbps.elems);
3091         }
3092       d->sindex = d->min_trcount;
3093
3094       if (d->trans)
3095         {
3096           for (state_num s = 0; s < d->tralloc; s++)
3097             {
3098               free (d->trans[s]);
3099               free (d->fails[s]);
3100               d->trans[s] = d->fails[s] = NULL;
3101             }
3102           d->trcount = 0;
3103         }
3104
3105       if (d->localeinfo.multibyte && d->mb_trans)
3106         {
3107           for (state_num s = -1; s < d->tralloc; s++)
3108             {
3109               free (d->mb_trans[s]);
3110               d->mb_trans[s] = NULL;
3111             }
3112           for (state_num s = 0; s < d->min_trcount; s++)
3113             d->states[s].mb_trindex = -1;
3114           d->mb_trcount = 0;
3115         }
3116     }
3117
3118   if (!d->tralloc)
3119     realloc_trans_if_necessary (d);
3120
3121   /* Current state.  */
3122   state_num s = 0, s1 = 0;
3123
3124   /* Current input character.  */
3125   unsigned char const *p = (unsigned char const *) begin;
3126   unsigned char const *mbp = p;
3127
3128   /* Copy of d->trans so it can be optimized into a register.  */
3129   state_num **trans = d->trans;
3130   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3131   unsigned char saved_end = *(unsigned char *) end;
3132   *end = eol;
3133
3134   if (multibyte)
3135     {
3136       memset (&d->mbs, 0, sizeof d->mbs);
3137       if (d->mb_follows.alloc == 0)
3138         alloc_position_set (&d->mb_follows, d->nleaves);
3139     }
3140
3141   size_t nlcount = 0;
3142   for (;;)
3143     {
3144       state_num *t;
3145       while ((t = trans[s]) != NULL)
3146         {
3147           if (s < d->min_trcount)
3148             {
3149               if (!multibyte || d->states[s].mbps.nelem == 0)
3150                 {
3151                   while (t[*p] == s)
3152                     p++;
3153                 }
3154               if (multibyte)
3155                 p = mbp = skip_remains_mb (d, p, mbp, end);
3156             }
3157
3158           if (multibyte)
3159             {
3160               s1 = s;
3161
3162               if (d->states[s].mbps.nelem == 0
3163                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3164                 {
3165                   /* If an input character does not match ANYCHAR, do it
3166                      like a single-byte character.  */
3167                   s = t[*p++];
3168                 }
3169               else
3170                 {
3171                   s = transit_state (d, s, &p, (unsigned char *) end);
3172                   mbp = p;
3173                   trans = d->trans;
3174                 }
3175             }
3176           else
3177             {
3178               s1 = t[*p++];
3179               t = trans[s1];
3180               if (! t)
3181                 {
3182                   state_num tmp = s;
3183                   s = s1;
3184                   s1 = tmp;     /* swap */
3185                   break;
3186                 }
3187               if (s < d->min_trcount)
3188                 {
3189                   while (t[*p] == s1)
3190                     p++;
3191                 }
3192               s = t[*p++];
3193             }
3194         }
3195
3196       if (s < 0)
3197         {
3198           if (s == -2)
3199             {
3200               s = build_state (s1, d, p[-1]);
3201               trans = d->trans;
3202             }
3203           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3204             {
3205               /* The previous character was a newline.  Count it, and skip
3206                  checking of multibyte character boundary until here.  */
3207               nlcount++;
3208               mbp = p;
3209
3210               s = (allow_nl ? d->newlines[s1]
3211                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3212                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3213                    : d->initstate_notbol);
3214             }
3215           else
3216             {
3217               p = NULL;
3218               goto done;
3219             }
3220         }
3221       else if (d->fails[s])
3222         {
3223           if ((d->success[s] & d->syntax.sbit[*p])
3224               || ((char *) p == end
3225                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3226                                          d)))
3227             goto done;
3228
3229           if (multibyte && s < d->min_trcount)
3230             p = mbp = skip_remains_mb (d, p, mbp, end);
3231
3232           s1 = s;
3233           if (!multibyte || d->states[s].mbps.nelem == 0
3234               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3235             {
3236               /* If a input character does not match ANYCHAR, do it
3237                  like a single-byte character.  */
3238               s = d->fails[s][*p++];
3239             }
3240           else
3241             {
3242               s = transit_state (d, s, &p, (unsigned char *) end);
3243               mbp = p;
3244               trans = d->trans;
3245             }
3246         }
3247       else
3248         {
3249           build_state (s, d, p[0]);
3250           trans = d->trans;
3251         }
3252     }
3253
3254  done:
3255   if (count)
3256     *count += nlcount;
3257   *end = saved_end;
3258   return (char *) p;
3259 }
3260
3261 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3262    This is for performance, as dfaexec_main is an inline function.  */
3263
3264 static char *
3265 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3266             bool allow_nl, size_t *count, bool *backref)
3267 {
3268   return dfaexec_main (d, begin, end, allow_nl, count, true);
3269 }
3270
3271 static char *
3272 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3273             bool allow_nl, size_t *count, bool *backref)
3274 {
3275   return dfaexec_main (d, begin, end, allow_nl, count, false);
3276 }
3277
3278 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3279    any regexp that uses a construct not supported by this code.  */
3280 static char *
3281 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3282               bool allow_nl, size_t *count, bool *backref)
3283 {
3284   *backref = true;
3285   return (char *) begin;
3286 }
3287
3288 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3289    but faster and set *BACKREF if the DFA code does not support this
3290    regexp usage.  */
3291
3292 char *
3293 dfaexec (struct dfa *d, char const *begin, char *end,
3294          bool allow_nl, size_t *count, bool *backref)
3295 {
3296   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3297 }
3298
3299 struct dfa *
3300 dfasuperset (struct dfa const *d)
3301 {
3302   return d->superset;
3303 }
3304
3305 bool
3306 dfaisfast (struct dfa const *d)
3307 {
3308   return d->fast;
3309 }
3310
3311 static void
3312 free_mbdata (struct dfa *d)
3313 {
3314   free (d->multibyte_prop);
3315   free (d->lex.brack.chars);
3316   free (d->mb_follows.elems);
3317
3318   if (d->mb_trans)
3319     {
3320       state_num s;
3321       for (s = -1; s < d->tralloc; s++)
3322         free (d->mb_trans[s]);
3323       free (d->mb_trans - 2);
3324     }
3325 }
3326
3327 /* Return true if every construct in D is supported by this DFA matcher.  */
3328 static bool _GL_ATTRIBUTE_PURE
3329 dfa_supported (struct dfa const *d)
3330 {
3331   for (size_t i = 0; i < d->tindex; i++)
3332     {
3333       switch (d->tokens[i])
3334         {
3335         case BEGWORD:
3336         case ENDWORD:
3337         case LIMWORD:
3338         case NOTLIMWORD:
3339           if (!d->localeinfo.multibyte)
3340             continue;
3341           FALLTHROUGH;
3342         case BACKREF:
3343         case MBCSET:
3344           return false;
3345         }
3346     }
3347   return true;
3348 }
3349
3350 static void
3351 dfaoptimize (struct dfa *d)
3352 {
3353   if (!d->localeinfo.using_utf8)
3354     return;
3355
3356   bool have_backref = false;
3357   for (size_t i = 0; i < d->tindex; ++i)
3358     {
3359       switch (d->tokens[i])
3360         {
3361         case ANYCHAR:
3362           /* Lowered.  */
3363           abort ();
3364         case BACKREF:
3365           have_backref = true;
3366           break;
3367         case MBCSET:
3368           /* Requires multi-byte algorithm.  */
3369           return;
3370         default:
3371           break;
3372         }
3373     }
3374
3375   if (!have_backref && d->superset)
3376     {
3377       /* The superset DFA is not likely to be much faster, so remove it.  */
3378       dfafree (d->superset);
3379       free (d->superset);
3380       d->superset = NULL;
3381     }
3382
3383   free_mbdata (d);
3384   d->localeinfo.multibyte = false;
3385   d->dfaexec = dfaexec_sb;
3386   d->fast = true;
3387 }
3388
3389 static void
3390 dfassbuild (struct dfa *d)
3391 {
3392   struct dfa *sup = dfaalloc ();
3393
3394   *sup = *d;
3395   sup->localeinfo.multibyte = false;
3396   sup->dfaexec = dfaexec_sb;
3397   sup->multibyte_prop = NULL;
3398   sup->superset = NULL;
3399   sup->states = NULL;
3400   sup->sindex = 0;
3401   sup->follows = NULL;
3402   sup->tralloc = 0;
3403   sup->trans = NULL;
3404   sup->fails = NULL;
3405   sup->success = NULL;
3406   sup->newlines = NULL;
3407
3408   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3409   if (d->cindex)
3410     {
3411       memcpy (sup->charclasses, d->charclasses,
3412               d->cindex * sizeof *sup->charclasses);
3413     }
3414
3415   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3416   sup->talloc = d->tindex * 2;
3417
3418   bool have_achar = false;
3419   bool have_nchar = false;
3420   size_t j;
3421   for (size_t i = j = 0; i < d->tindex; i++)
3422     {
3423       switch (d->tokens[i])
3424         {
3425         case ANYCHAR:
3426         case MBCSET:
3427         case BACKREF:
3428           {
3429             charclass ccl;
3430             fillset (&ccl);
3431             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3432             sup->tokens[j++] = STAR;
3433             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3434                 || d->tokens[i + 1] == PLUS)
3435               i++;
3436             have_achar = true;
3437           }
3438           break;
3439         case BEGWORD:
3440         case ENDWORD:
3441         case LIMWORD:
3442         case NOTLIMWORD:
3443           if (d->localeinfo.multibyte)
3444             {
3445               /* These constraints aren't supported in a multibyte locale.
3446                  Ignore them in the superset DFA.  */
3447               sup->tokens[j++] = EMPTY;
3448               break;
3449             }
3450           FALLTHROUGH;
3451         default:
3452           sup->tokens[j++] = d->tokens[i];
3453           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3454               || d->tokens[i] >= CSET)
3455             have_nchar = true;
3456           break;
3457         }
3458     }
3459   sup->tindex = j;
3460
3461   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3462     d->superset = sup;
3463   else
3464     {
3465       dfafree (sup);
3466       free (sup);
3467     }
3468 }
3469
3470 /* Parse and analyze a single string of the given length.  */
3471 void
3472 dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
3473 {
3474   dfaparse (s, len, d);
3475   dfassbuild (d);
3476
3477   if (dfa_supported (d))
3478     {
3479       dfaoptimize (d);
3480       dfaanalyze (d, searchflag);
3481     }
3482   else
3483     {
3484       d->dfaexec = dfaexec_noop;
3485     }
3486
3487   if (d->superset)
3488     {
3489       d->fast = true;
3490       dfaanalyze (d->superset, searchflag);
3491     }
3492 }
3493
3494 /* Free the storage held by the components of a dfa.  */
3495 void
3496 dfafree (struct dfa *d)
3497 {
3498   free (d->charclasses);
3499   free (d->tokens);
3500
3501   if (d->localeinfo.multibyte)
3502     free_mbdata (d);
3503
3504   for (size_t i = 0; i < d->sindex; ++i)
3505     {
3506       free (d->states[i].elems.elems);
3507       free (d->states[i].mbps.elems);
3508     }
3509   free (d->states);
3510
3511   if (d->follows)
3512     {
3513       for (size_t i = 0; i < d->tindex; ++i)
3514         free (d->follows[i].elems);
3515       free (d->follows);
3516     }
3517
3518   if (d->trans)
3519     {
3520       for (size_t i = 0; i < d->tralloc; ++i)
3521         {
3522           free (d->trans[i]);
3523           free (d->fails[i]);
3524         }
3525
3526       free (d->trans - 2);
3527       free (d->fails);
3528       free (d->newlines);
3529       free (d->success);
3530     }
3531
3532   if (d->superset)
3533     dfafree (d->superset);
3534 }
3535
3536 /* Having found the postfix representation of the regular expression,
3537    try to find a long sequence of characters that must appear in any line
3538    containing the r.e.
3539    Finding a "longest" sequence is beyond the scope here;
3540    we take an easy way out and hope for the best.
3541    (Take "(ab|a)b"--please.)
3542
3543    We do a bottom-up calculation of sequences of characters that must appear
3544    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3545    representation:
3546         sequences that must appear at the left of the match ("left")
3547         sequences that must appear at the right of the match ("right")
3548         lists of sequences that must appear somewhere in the match ("in")
3549         sequences that must constitute the match ("is")
3550
3551    When we get to the root of the tree, we use one of the longest of its
3552    calculated "in" sequences as our answer.
3553
3554    The sequences calculated for the various types of node (in pseudo ANSI c)
3555    are shown below.  "p" is the operand of unary operators (and the left-hand
3556    operand of binary operators); "q" is the right-hand operand of binary
3557    operators.
3558
3559    "ZERO" means "a zero-length sequence" below.
3560
3561         Type    left            right           is              in
3562         ----    ----            -----           --              --
3563         char c  # c             # c             # c             # c
3564
3565         ANYCHAR ZERO            ZERO            ZERO            ZERO
3566
3567         MBCSET  ZERO            ZERO            ZERO            ZERO
3568
3569         CSET    ZERO            ZERO            ZERO            ZERO
3570
3571         STAR    ZERO            ZERO            ZERO            ZERO
3572
3573         QMARK   ZERO            ZERO            ZERO            ZERO
3574
3575         PLUS    p->left         p->right        ZERO            p->in
3576
3577         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3578                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3579                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3580                                                 ZERO
3581
3582         OR      longest common  longest common  (do p->is and substrings common
3583                 leading         trailing        to q->is have same p->in and
3584                 (sub)sequence   (sub)sequence   q->in length and content) ?
3585                 of p->left      of p->right
3586                 and q->left     and q->right    p->is : NULL
3587
3588    If there's anything else we recognize in the tree, all four sequences get set
3589    to zero-length sequences.  If there's something we don't recognize in the
3590    tree, we just return a zero-length sequence.
3591
3592    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3593    'aaa')?
3594
3595    And ... is it here or someplace that we might ponder "optimizations" such as
3596         egrep 'psi|epsilon'     ->      egrep 'psi'
3597         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3598                                         (Yes, we now find "epsi" as a "string
3599                                         that must occur", but we might also
3600                                         simplify the *entire* r.e. being sought)
3601         grep '[c]'              ->      grep 'c'
3602         grep '(ab|a)b'          ->      grep 'ab'
3603         grep 'ab*'              ->      grep 'a'
3604         grep 'a*b'              ->      grep 'b'
3605
3606    There are several issues:
3607
3608    Is optimization easy (enough)?
3609
3610    Does optimization actually accomplish anything,
3611    or is the automaton you get from "psi|epsilon" (for example)
3612    the same as the one you get from "psi" (for example)?
3613
3614    Are optimizable r.e.'s likely to be used in real-life situations
3615    (something like 'ab*' is probably unlikely; something like is
3616    'psi|epsilon' is likelier)?  */
3617
3618 static char *
3619 icatalloc (char *old, char const *new)
3620 {
3621   size_t newsize = strlen (new);
3622   if (newsize == 0)
3623     return old;
3624   size_t oldsize = strlen (old);
3625   char *result = xrealloc (old, oldsize + newsize + 1);
3626   memcpy (result + oldsize, new, newsize + 1);
3627   return result;
3628 }
3629
3630 static void
3631 freelist (char **cpp)
3632 {
3633   while (*cpp)
3634     free (*cpp++);
3635 }
3636
3637 static char **
3638 enlist (char **cpp, char *new, size_t len)
3639 {
3640   new = memcpy (xmalloc (len + 1), new, len);
3641   new[len] = '\0';
3642   /* Is there already something in the list that's new (or longer)?  */
3643   size_t i;
3644   for (i = 0; cpp[i] != NULL; ++i)
3645     if (strstr (cpp[i], new) != NULL)
3646       {
3647         free (new);
3648         return cpp;
3649       }
3650   /* Eliminate any obsoleted strings.  */
3651   for (size_t j = 0; cpp[j] != NULL; )
3652     if (strstr (new, cpp[j]) == NULL)
3653       ++j;
3654     else
3655       {
3656         free (cpp[j]);
3657         if (--i == j)
3658           break;
3659         cpp[j] = cpp[i];
3660         cpp[i] = NULL;
3661       }
3662   /* Add the new string.  */
3663   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3664   cpp[i] = new;
3665   cpp[i + 1] = NULL;
3666   return cpp;
3667 }
3668
3669 /* Given pointers to two strings, return a pointer to an allocated
3670    list of their distinct common substrings.  */
3671 static char **
3672 comsubs (char *left, char const *right)
3673 {
3674   char **cpp = xzalloc (sizeof *cpp);
3675
3676   for (char *lcp = left; *lcp != '\0'; lcp++)
3677     {
3678       size_t len = 0;
3679       char *rcp = strchr (right, *lcp);
3680       while (rcp != NULL)
3681         {
3682           size_t i;
3683           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3684             continue;
3685           if (i > len)
3686             len = i;
3687           rcp = strchr (rcp + 1, *lcp);
3688         }
3689       if (len != 0)
3690         cpp = enlist (cpp, lcp, len);
3691     }
3692   return cpp;
3693 }
3694
3695 static char **
3696 addlists (char **old, char **new)
3697 {
3698   for (; *new; new++)
3699     old = enlist (old, *new, strlen (*new));
3700   return old;
3701 }
3702
3703 /* Given two lists of substrings, return a new list giving substrings
3704    common to both.  */
3705 static char **
3706 inboth (char **left, char **right)
3707 {
3708   char **both = xzalloc (sizeof *both);
3709
3710   for (size_t lnum = 0; left[lnum] != NULL; ++lnum)
3711     {
3712       for (size_t rnum = 0; right[rnum] != NULL; ++rnum)
3713         {
3714           char **temp = comsubs (left[lnum], right[rnum]);
3715           both = addlists (both, temp);
3716           freelist (temp);
3717           free (temp);
3718         }
3719     }
3720   return both;
3721 }
3722
3723 typedef struct must must;
3724
3725 struct must
3726 {
3727   char **in;
3728   char *left;
3729   char *right;
3730   char *is;
3731   bool begline;
3732   bool endline;
3733   must *prev;
3734 };
3735
3736 static must *
3737 allocmust (must *mp, size_t size)
3738 {
3739   must *new_mp = xmalloc (sizeof *new_mp);
3740   new_mp->in = xzalloc (sizeof *new_mp->in);
3741   new_mp->left = xzalloc (size);
3742   new_mp->right = xzalloc (size);
3743   new_mp->is = xzalloc (size);
3744   new_mp->begline = false;
3745   new_mp->endline = false;
3746   new_mp->prev = mp;
3747   return new_mp;
3748 }
3749
3750 static void
3751 resetmust (must *mp)
3752 {
3753   freelist (mp->in);
3754   mp->in[0] = NULL;
3755   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3756   mp->begline = false;
3757   mp->endline = false;
3758 }
3759
3760 static void
3761 freemust (must *mp)
3762 {
3763   freelist (mp->in);
3764   free (mp->in);
3765   free (mp->left);
3766   free (mp->right);
3767   free (mp->is);
3768   free (mp);
3769 }
3770
3771 struct dfamust *
3772 dfamust (struct dfa const *d)
3773 {
3774   must *mp = NULL;
3775   char const *result = "";
3776   bool exact = false;
3777   bool begline = false;
3778   bool endline = false;
3779   bool need_begline = false;
3780   bool need_endline = false;
3781   bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
3782
3783   for (size_t ri = 0; ri < d->tindex; ++ri)
3784     {
3785       token t = d->tokens[ri];
3786       switch (t)
3787         {
3788         case BEGLINE:
3789           mp = allocmust (mp, 2);
3790           mp->begline = true;
3791           need_begline = true;
3792           break;
3793         case ENDLINE:
3794           mp = allocmust (mp, 2);
3795           mp->endline = true;
3796           need_endline = true;
3797           break;
3798         case LPAREN:
3799         case RPAREN:
3800           assert (!"neither LPAREN nor RPAREN may appear here");
3801
3802         case EMPTY:
3803         case BEGWORD:
3804         case ENDWORD:
3805         case LIMWORD:
3806         case NOTLIMWORD:
3807         case BACKREF:
3808         case ANYCHAR:
3809         case MBCSET:
3810           mp = allocmust (mp, 2);
3811           break;
3812
3813         case STAR:
3814         case QMARK:
3815           resetmust (mp);
3816           break;
3817
3818         case OR:
3819           {
3820             char **new;
3821             must *rmp = mp;
3822             must *lmp = mp = mp->prev;
3823             size_t j, ln, rn, n;
3824
3825             /* Guaranteed to be.  Unlikely, but ...  */
3826             if (streq (lmp->is, rmp->is))
3827               {
3828                 lmp->begline &= rmp->begline;
3829                 lmp->endline &= rmp->endline;
3830               }
3831             else
3832               {
3833                 lmp->is[0] = '\0';
3834                 lmp->begline = false;
3835                 lmp->endline = false;
3836               }
3837             /* Left side--easy */
3838             size_t i = 0;
3839             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3840               ++i;
3841             lmp->left[i] = '\0';
3842             /* Right side */
3843             ln = strlen (lmp->right);
3844             rn = strlen (rmp->right);
3845             n = ln;
3846             if (n > rn)
3847               n = rn;
3848             for (i = 0; i < n; ++i)
3849               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3850                 break;
3851             for (j = 0; j < i; ++j)
3852               lmp->right[j] = lmp->right[(ln - i) + j];
3853             lmp->right[j] = '\0';
3854             new = inboth (lmp->in, rmp->in);
3855             freelist (lmp->in);
3856             free (lmp->in);
3857             lmp->in = new;
3858             freemust (rmp);
3859           }
3860           break;
3861
3862         case PLUS:
3863           mp->is[0] = '\0';
3864           break;
3865
3866         case END:
3867           assert (!mp->prev);
3868           for (size_t i = 0; mp->in[i] != NULL; ++i)
3869             if (strlen (mp->in[i]) > strlen (result))
3870               result = mp->in[i];
3871           if (streq (result, mp->is))
3872             {
3873               if ((!need_begline || mp->begline) && (!need_endline
3874                                                      || mp->endline))
3875                 exact = true;
3876               begline = mp->begline;
3877               endline = mp->endline;
3878             }
3879           goto done;
3880
3881         case CAT:
3882           {
3883             must *rmp = mp;
3884             must *lmp = mp = mp->prev;
3885
3886             /* In.  Everything in left, plus everything in
3887                right, plus concatenation of
3888                left's right and right's left.  */
3889             lmp->in = addlists (lmp->in, rmp->in);
3890             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3891               {
3892                 size_t lrlen = strlen (lmp->right);
3893                 size_t rllen = strlen (rmp->left);
3894                 char *tp = xmalloc (lrlen + rllen);
3895                 memcpy (tp, lmp->right, lrlen);
3896                 memcpy (tp + lrlen, rmp->left, rllen);
3897                 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
3898                 free (tp);
3899               }
3900             /* Left-hand */
3901             if (lmp->is[0] != '\0')
3902               lmp->left = icatalloc (lmp->left, rmp->left);
3903             /* Right-hand */
3904             if (rmp->is[0] == '\0')
3905               lmp->right[0] = '\0';
3906             lmp->right = icatalloc (lmp->right, rmp->right);
3907             /* Guaranteed to be */
3908             if ((lmp->is[0] != '\0' || lmp->begline)
3909                 && (rmp->is[0] != '\0' || rmp->endline))
3910               {
3911                 lmp->is = icatalloc (lmp->is, rmp->is);
3912                 lmp->endline = rmp->endline;
3913               }
3914             else
3915               {
3916                 lmp->is[0] = '\0';
3917                 lmp->begline = false;
3918                 lmp->endline = false;
3919               }
3920             freemust (rmp);
3921           }
3922           break;
3923
3924         case '\0':
3925           /* Not on *my* shift.  */
3926           goto done;
3927
3928         default:
3929           if (CSET <= t)
3930             {
3931               /* If T is a singleton, or if case-folding in a unibyte
3932                  locale and T's members all case-fold to the same char,
3933                  convert T to one of its members.  Otherwise, do
3934                  nothing further with T.  */
3935               charclass *ccl = &d->charclasses[t - CSET];
3936               int j;
3937               for (j = 0; j < NOTCHAR; j++)
3938                 if (tstbit (j, ccl))
3939                   break;
3940               if (! (j < NOTCHAR))
3941                 {
3942                   mp = allocmust (mp, 2);
3943                   break;
3944                 }
3945               t = j;
3946               while (++j < NOTCHAR)
3947                 if (tstbit (j, ccl)
3948                     && ! (case_fold_unibyte
3949                           && toupper (j) == toupper (t)))
3950                   break;
3951               if (j < NOTCHAR)
3952                 {
3953                   mp = allocmust (mp, 2);
3954                   break;
3955                 }
3956             }
3957
3958           size_t rj = ri + 2;
3959           if (d->tokens[ri + 1] == CAT)
3960             {
3961               for (; rj < d->tindex - 1; rj += 2)
3962                 {
3963                   if ((rj != ri && (d->tokens[rj] <= 0
3964                                     || NOTCHAR <= d->tokens[rj]))
3965                       || d->tokens[rj + 1] != CAT)
3966                     break;
3967                 }
3968             }
3969           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
3970           mp->is[0] = mp->left[0] = mp->right[0]
3971             = case_fold_unibyte ? toupper (t) : t;
3972
3973           size_t i;
3974           for (i = 1; ri + 2 < rj; i++)
3975             {
3976               ri += 2;
3977               t = d->tokens[ri];
3978               mp->is[i] = mp->left[i] = mp->right[i]
3979                 = case_fold_unibyte ? toupper (t) : t;
3980             }
3981           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
3982           mp->in = enlist (mp->in, mp->is, i);
3983           break;
3984         }
3985     }
3986  done:;
3987
3988   struct dfamust *dm = NULL;
3989   if (*result)
3990     {
3991       dm = xmalloc (sizeof *dm);
3992       dm->exact = exact;
3993       dm->begline = begline;
3994       dm->endline = endline;
3995       dm->must = xstrdup (result);
3996     }
3997
3998   while (mp)
3999     {
4000       must *prev = mp->prev;
4001       freemust (mp);
4002       mp = prev;
4003     }
4004
4005   return dm;
4006 }
4007
4008 void
4009 dfamustfree (struct dfamust *dm)
4010 {
4011   free (dm->must);
4012   free (dm);
4013 }
4014
4015 struct dfa *
4016 dfaalloc (void)
4017 {
4018   return xmalloc (sizeof (struct dfa));
4019 }
4020
4021 /* Initialize DFA.  */
4022 void
4023 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4024            reg_syntax_t bits, int dfaopts)
4025 {
4026   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4027   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4028   dfa->simple_locale = using_simple_locale (linfo->multibyte);
4029   dfa->localeinfo = *linfo;
4030
4031   dfa->fast = !dfa->localeinfo.multibyte;
4032
4033   dfa->canychar = -1;
4034   dfa->lex.cur_mb_len = 1;
4035   dfa->syntax.syntax_bits_set = true;
4036   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4037   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4038   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4039   dfa->syntax.syntax_bits = bits;
4040
4041   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4042     {
4043       unsigned char uc = i;
4044
4045       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4046       switch (dfa->syntax.sbit[uc])
4047         {
4048         case CTX_LETTER:
4049           setbit (uc, &dfa->syntax.letters);
4050           break;
4051         case CTX_NEWLINE:
4052           setbit (uc, &dfa->syntax.newline);
4053           break;
4054         }
4055
4056       /* POSIX requires that the five bytes in "\n\r./" (including the
4057          terminating NUL) cannot occur inside a multibyte character.  */
4058       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4059                                      ? (uc & 0xc0) != 0x80
4060                                      : strchr ("\n\r./", uc) != NULL);
4061     }
4062 }
4063
4064 /* vim:set shiftwidth=2: */