lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include "flexmember.h"
  28
  29 #include <assert.h>
  30 #include <ctype.h>
  31 #include <stdint.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <limits.h>
  35 #include <string.h>
  36
  37 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
  38    indexes into objects.  It is signed to help catch integer overflow.
  39    It has its own name because it is for nonnegative values only.  */
  40 typedef ptrdiff_t idx_t;
  41 static idx_t const IDX_MAX = PTRDIFF_MAX;
  42
  43 static bool
  44 streq (char const *a, char const *b)
  45 {
  46   return strcmp (a, b) == 0;
  47 }
  48
  49 static bool
  50 isasciidigit (char c)
  51 {
  52   return '0' <= c && c <= '9';
  53 }
  54
  55 #include "gettext.h"
  56 #define _(str) gettext (str)
  57
  58 #include <wchar.h>
  59
  60 #include "intprops.h"
  61 #include "xalloc.h"
  62 #include "localeinfo.h"
  63
  64 #ifndef FALLTHROUGH
  65 # if 201710L < __STDC_VERSION__
  66 #  define FALLTHROUGH [[__fallthrough__]]
  67 # elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
  68 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  69 # else
  70 #  define FALLTHROUGH ((void) 0)
  71 # endif
  72 #endif
  73
  74 #ifndef MIN
  75 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  76 #endif
  77
  78 /* HPUX defines these as macros in sys/param.h.  */
  79 #ifdef setbit
  80 # undef setbit
  81 #endif
  82 #ifdef clrbit
  83 # undef clrbit
  84 #endif
  85
  86 /* For code that does not use Gnulib’s isblank module.  */
  87 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
  88 # define isblank dfa_isblank
  89 static int
  90 isblank (int c)
  91 {
  92   return c == ' ' || c == '\t';
  93 }
  94 #endif
  95
  96 /* First integer value that is greater than any character code.  */
  97 enum { NOTCHAR = 1 << CHAR_BIT };
  98
  99 #ifdef UINT_LEAST64_MAX
 100
 101 /* Number of bits used in a charclass word.  */
 102 enum { CHARCLASS_WORD_BITS = 64 };
 103
 104 /* This represents part of a character class.  It must be unsigned and
 105    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
 106 typedef uint_least64_t charclass_word;
 107
 108 /* Part of a charclass initializer that represents 64 bits' worth of a
 109    charclass, where LO and HI are the low and high-order 32 bits of
 110    the 64-bit quantity.  */
 111 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
 112
 113 #else
 114 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
 115 enum { CHARCLASS_WORD_BITS = 32 };
 116 typedef unsigned long charclass_word;
 117 # define CHARCLASS_PAIR(lo, hi) lo, hi
 118 #endif
 119
 120 /* An initializer for a charclass whose 32-bit words are A through H.  */
 121 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
 122    {{                                                   \
 123       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 124       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 125    }}
 126
 127 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 128 static charclass_word const CHARCLASS_WORD_MASK
 129   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 130
 131 /* Number of words required to hold a bit for every character.  */
 132 enum
 133 {
 134   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 135 };
 136
 137 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 138 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 139
 140 /* Convert a possibly-signed character to an unsigned character.  This is
 141    a bit safer than casting to unsigned char, since it catches some type
 142    errors that the cast doesn't.  */
 143 static unsigned char
 144 to_uchar (char ch)
 145 {
 146   return ch;
 147 }
 148
 149 /* Contexts tell us whether a character is a newline or a word constituent.
 150    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 151    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 152    a particular character class.
 153
 154    A state also stores a context value, which is a bitmask of CTX_* values.
 155    A state's context represents a set of characters that the state's
 156    predecessors must match.  For example, a state whose context does not
 157    include CTX_LETTER will never have transitions where the previous
 158    character is a word constituent.  A state whose context is CTX_ANY
 159    might have transitions from any character.  */
 160
 161 enum
 162   {
 163     CTX_NONE = 1,
 164     CTX_LETTER = 2,
 165     CTX_NEWLINE = 4,
 166     CTX_ANY = 7
 167   };
 168
 169 /* Sometimes characters can only be matched depending on the surrounding
 170    context.  Such context decisions depend on what the previous character
 171    was, and the value of the current (lookahead) character.  Context
 172    dependent constraints are encoded as 9-bit integers.  Each bit that
 173    is set indicates that the constraint succeeds in the corresponding
 174    context.
 175
 176    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 177    bit 3-5  - valid contexts when next character is CTX_LETTER
 178    bit 0-2  - valid contexts when next character is CTX_NONE
 179
 180    succeeds_in_context determines whether a given constraint
 181    succeeds in a particular context.  Prev is a bitmask of possible
 182    context values for the previous character, curr is the (single-bit)
 183    context value for the lookahead character.  */
 184 static int
 185 newline_constraint (int constraint)
 186 {
 187   return (constraint >> 6) & 7;
 188 }
 189 static int
 190 letter_constraint (int constraint)
 191 {
 192   return (constraint >> 3) & 7;
 193 }
 194 static int
 195 other_constraint (int constraint)
 196 {
 197   return constraint & 7;
 198 }
 199
 200 static bool
 201 succeeds_in_context (int constraint, int prev, int curr)
 202 {
 203   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 204               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 205               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 206              & prev);
 207 }
 208
 209 /* The following describe what a constraint depends on.  */
 210 static bool
 211 prev_newline_dependent (int constraint)
 212 {
 213   return ((constraint ^ constraint >> 2) & 0111) != 0;
 214 }
 215 static bool
 216 prev_letter_dependent (int constraint)
 217 {
 218   return ((constraint ^ constraint >> 1) & 0111) != 0;
 219 }
 220
 221 /* Tokens that match the empty string subject to some constraint actually
 222    work by applying that constraint to determine what may follow them,
 223    taking into account what has gone before.  The following values are
 224    the constraints corresponding to the special tokens previously defined.  */
 225 enum
 226   {
 227     NO_CONSTRAINT = 0777,
 228     BEGLINE_CONSTRAINT = 0444,
 229     ENDLINE_CONSTRAINT = 0700,
 230     BEGWORD_CONSTRAINT = 0050,
 231     ENDWORD_CONSTRAINT = 0202,
 232     LIMWORD_CONSTRAINT = 0252,
 233     NOTLIMWORD_CONSTRAINT = 0525
 234   };
 235
 236 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 237    are operators and others are terminal symbols.  Most (but not all) of these
 238    codes are returned by the lexical analyzer.  */
 239
 240 typedef ptrdiff_t token;
 241 static token const TOKEN_MAX = PTRDIFF_MAX;
 242
 243 /* States are indexed by state_num values.  These are normally
 244    nonnegative but -1 is used as a special value.  */
 245 typedef ptrdiff_t state_num;
 246
 247 /* Predefined token values.  */
 248 enum
 249 {
 250   END = -1,                     /* END is a terminal symbol that matches the
 251                                    end of input; any value of END or less in
 252                                    the parse tree is such a symbol.  Accepting
 253                                    states of the DFA are those that would have
 254                                    a transition on END.  This is -1, not some
 255                                    more-negative value, to tweak the speed of
 256                                    comparisons to END.  */
 257
 258   /* Ordinary character values are terminal symbols that match themselves.  */
 259
 260   /* CSET must come last in the following list of special tokens.  Otherwise,
 261      the list order matters only for performance.  Related special tokens
 262      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 263      || CSET <= t) can be done with a single machine-level comparison.  */
 264
 265   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 266                                    the empty string.  */
 267
 268   QMARK,                        /* QMARK is an operator of one argument that
 269                                    matches zero or one occurrences of its
 270                                    argument.  */
 271
 272   STAR,                         /* STAR is an operator of one argument that
 273                                    matches the Kleene closure (zero or more
 274                                    occurrences) of its argument.  */
 275
 276   PLUS,                         /* PLUS is an operator of one argument that
 277                                    matches the positive closure (one or more
 278                                    occurrences) of its argument.  */
 279
 280   REPMN,                        /* REPMN is a lexical token corresponding
 281                                    to the {m,n} construct.  REPMN never
 282                                    appears in the compiled token vector.  */
 283
 284   CAT,                          /* CAT is an operator of two arguments that
 285                                    matches the concatenation of its
 286                                    arguments.  CAT is never returned by the
 287                                    lexical analyzer.  */
 288
 289   OR,                           /* OR is an operator of two arguments that
 290                                    matches either of its arguments.  */
 291
 292   LPAREN,                       /* LPAREN never appears in the parse tree,
 293                                    it is only a lexeme.  */
 294
 295   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 296
 297   WCHAR,                        /* Only returned by lex.  wctok contains
 298                                    the wide character representation.  */
 299
 300   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 301                                    a valid multibyte (or single byte) character.
 302                                    It is used only if MB_CUR_MAX > 1.  */
 303
 304   BEG,                          /* BEG is an initial symbol that matches the
 305                                    beginning of input.  */
 306
 307   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 308                                    the empty string at the beginning of a
 309                                    line.  */
 310
 311   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 312                                    the empty string at the end of a line.  */
 313
 314   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 315                                    the empty string at the beginning of a
 316                                    word.  */
 317
 318   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 319                                    the empty string at the end of a word.  */
 320
 321   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 322                                    the empty string at the beginning or the
 323                                    end of a word.  */
 324
 325   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 326                                    matches the empty string not at
 327                                    the beginning or end of a word.  */
 328
 329   BACKREF,                      /* BACKREF is generated by \<digit>
 330                                    or by any other construct that
 331                                    is not completely handled.  If the scanner
 332                                    detects a transition on backref, it returns
 333                                    a kind of "semi-success" indicating that
 334                                    the match will have to be verified with
 335                                    a backtracking matcher.  */
 336
 337   MBCSET,                       /* MBCSET is similar to CSET, but for
 338                                    multibyte characters.  */
 339
 340   CSET                          /* CSET and (and any value greater) is a
 341                                    terminal symbol that matches any of a
 342                                    class of characters.  */
 343 };
 344
 345
 346 /* States of the recognizer correspond to sets of positions in the parse
 347    tree, together with the constraints under which they may be matched.
 348    So a position is encoded as an index into the parse tree together with
 349    a constraint.  */
 350 typedef struct
 351 {
 352   idx_t index;                  /* Index into the parse array.  */
 353   unsigned int constraint;      /* Constraint for matching this position.  */
 354 } position;
 355
 356 /* Sets of positions are stored as arrays.  */
 357 typedef struct
 358 {
 359   position *elems;              /* Elements of this position set.  */
 360   idx_t nelem;                  /* Number of elements in this set.  */
 361   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 362 } position_set;
 363
 364 /* A state of the dfa consists of a set of positions, some flags,
 365    and the token value of the lowest-numbered position of the state that
 366    contains an END token.  */
 367 typedef struct
 368 {
 369   size_t hash;                  /* Hash of the positions of this state.  */
 370   position_set elems;           /* Positions this state could match.  */
 371   unsigned char context;        /* Context from previous state.  */
 372   unsigned short constraint;    /* Constraint for this state to accept.  */
 373   position_set mbps;            /* Positions which can match multibyte
 374                                    characters or the follows, e.g., period.
 375                                    Used only if MB_CUR_MAX > 1.  */
 376   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 377                                    negative if the state does not have
 378                                    ANYCHAR.  */
 379 } dfa_state;
 380
 381 /* Maximum for any transition table count.  This should be at least 3,
 382    for the initial state setup.  */
 383 enum { MAX_TRCOUNT = 1024 };
 384
 385 /* A bracket operator.
 386    e.g., [a-c], [[:alpha:]], etc.  */
 387 struct mb_char_classes
 388 {
 389   ptrdiff_t cset;
 390   bool invert;
 391   wchar_t *chars;               /* Normal characters.  */
 392   idx_t nchars;
 393   idx_t nchars_alloc;
 394 };
 395
 396 struct regex_syntax
 397 {
 398   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 399   reg_syntax_t syntax_bits;
 400   bool syntax_bits_set;
 401
 402   /* Flag for case-folding letters into sets.  */
 403   bool case_fold;
 404
 405   /* True if ^ and $ match only the start and end of data, and do not match
 406      end-of-line within data.  */
 407   bool anchor;
 408
 409   /* End-of-line byte in data.  */
 410   unsigned char eolbyte;
 411
 412   /* Cache of char-context values.  */
 413   char sbit[NOTCHAR];
 414
 415   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 416      multibyte character.  */
 417   bool never_trail[NOTCHAR];
 418
 419   /* Set of characters considered letters.  */
 420   charclass letters;
 421
 422   /* Set of characters that are newline.  */
 423   charclass newline;
 424 };
 425
 426 /* Lexical analyzer.  All the dross that deals with the obnoxious
 427    GNU Regex syntax bits is located here.  The poor, suffering
 428    reader is referred to the GNU Regex documentation for the
 429    meaning of the @#%!@#%^!@ syntax bits.  */
 430 struct lexer_state
 431 {
 432   char const *ptr;      /* Pointer to next input character.  */
 433   idx_t left;           /* Number of characters remaining.  */
 434   token lasttok;        /* Previous token returned; initially END.  */
 435   idx_t parens;         /* Count of outstanding left parens.  */
 436   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 437
 438   /* Wide character representation of the current multibyte character,
 439      or WEOF if there was an encoding error.  Used only if
 440      MB_CUR_MAX > 1.  */
 441   wint_t wctok;
 442
 443   /* The most recently analyzed multibyte bracket expression.  */
 444   struct mb_char_classes brack;
 445
 446   /* We're separated from beginning or (, | only by zero-width characters.  */
 447   bool laststart;
 448 };
 449
 450 /* Recursive descent parser for regular expressions.  */
 451
 452 struct parser_state
 453 {
 454   token tok;               /* Lookahead token.  */
 455   idx_t depth;             /* Current depth of a hypothetical stack
 456                               holding deferred productions.  This is
 457                               used to determine the depth that will be
 458                               required of the real stack later on in
 459                               dfaanalyze.  */
 460 };
 461
 462 /* A compiled regular expression.  */
 463 struct dfa
 464 {
 465   /* Fields filled by the scanner.  */
 466   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 467   idx_t cindex;                 /* Index for adding new charclasses.  */
 468   idx_t calloc;                 /* Number of charclasses allocated.  */
 469   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 470
 471   /* Scanner state */
 472   struct lexer_state lex;
 473
 474   /* Parser state */
 475   struct parser_state parse;
 476
 477   /* Fields filled by the parser.  */
 478   token *tokens;                /* Postfix parse array.  */
 479   idx_t tindex;                 /* Index for adding new tokens.  */
 480   idx_t talloc;                 /* Number of tokens currently allocated.  */
 481   idx_t depth;                  /* Depth required of an evaluation stack
 482                                    used for depth-first traversal of the
 483                                    parse tree.  */
 484   idx_t nleaves;                /* Number of non-EMPTY leaves
 485                                    in the parse tree.  */
 486   idx_t nregexps;               /* Count of parallel regexps being built
 487                                    with dfaparse.  */
 488   bool fast;                    /* The DFA is fast.  */
 489   bool epsilon;                 /* Does a token match only the empty string?  */
 490   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 491   mbstate_t mbs;                /* Multibyte conversion state.  */
 492
 493   /* The following are valid only if MB_CUR_MAX > 1.  */
 494
 495   /* The value of multibyte_prop[i] is defined by following rule.
 496      if tokens[i] < NOTCHAR
 497      bit 0 : tokens[i] is the first byte of a character, including
 498      single-byte characters.
 499      bit 1 : tokens[i] is the last byte of a character, including
 500      single-byte characters.
 501
 502      e.g.
 503      tokens
 504      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 505      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 506      multibyte_prop
 507      = 3     , 1               ,  0              ,  2              , 3
 508    */
 509   char *multibyte_prop;
 510
 511   /* Fields filled by the superset.  */
 512   struct dfa *superset;             /* Hint of the dfa.  */
 513
 514   /* Fields filled by the state builder.  */
 515   dfa_state *states;            /* States of the dfa.  */
 516   state_num sindex;             /* Index for adding new states.  */
 517   idx_t salloc;                 /* Number of states currently allocated.  */
 518
 519   /* Fields filled by the parse tree->NFA conversion.  */
 520   position_set *follows;        /* Array of follow sets, indexed by position
 521                                    index.  The follow of a position is the set
 522                                    of positions containing characters that
 523                                    could conceivably follow a character
 524                                    matching the given position in a string
 525                                    matching the regexp.  Allocated to the
 526                                    maximum possible position index.  */
 527   bool searchflag;              /* We are supposed to build a searching
 528                                    as opposed to an exact matcher.  A searching
 529                                    matcher finds the first and shortest string
 530                                    matching a regexp anywhere in the buffer,
 531                                    whereas an exact matcher finds the longest
 532                                    string matching, but anchored to the
 533                                    beginning of the buffer.  */
 534
 535   /* Fields filled by dfaanalyze.  */
 536   int *constraints;             /* Array of union of accepting constraints
 537                                    in the follow of a position.  */
 538   int *separates;               /* Array of contexts on follow of a
 539                                    position.  */
 540
 541   /* Fields filled by dfaexec.  */
 542   state_num tralloc;            /* Number of transition tables that have
 543                                    slots so far, not counting trans[-1] and
 544                                    trans[-2].  */
 545   int trcount;                  /* Number of transition tables that have
 546                                    been built, other than for initial
 547                                    states.  */
 548   int min_trcount;              /* Number of initial states.  Equivalently,
 549                                    the minimum state number for which trcount
 550                                    counts transitions.  */
 551   state_num **trans;            /* Transition tables for states that can
 552                                    never accept.  If the transitions for a
 553                                    state have not yet been computed, or the
 554                                    state could possibly accept, its entry in
 555                                    this table is NULL.  This points to two
 556                                    past the start of the allocated array,
 557                                    and trans[-1] and trans[-2] are always
 558                                    NULL.  */
 559   state_num **fails;            /* Transition tables after failing to accept
 560                                    on a state that potentially could do so.
 561                                    If trans[i] is non-null, fails[i] must
 562                                    be null.  */
 563   char *success;                /* Table of acceptance conditions used in
 564                                    dfaexec and computed in build_state.  */
 565   state_num *newlines;          /* Transitions on newlines.  The entry for a
 566                                    newline in any transition table is always
 567                                    -1 so we can count lines without wasting
 568                                    too many cycles.  The transition for a
 569                                    newline is stored separately and handled
 570                                    as a special case.  Newline is also used
 571                                    as a sentinel at the end of the buffer.  */
 572   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 573                                    context in multibyte locales, in which we
 574                                    do not distinguish between their contexts,
 575                                    as not supported word.  */
 576   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 577   state_num **mb_trans;         /* Transition tables for states with
 578                                    ANYCHAR.  */
 579   state_num mb_trcount;         /* Number of transition tables for states with
 580                                    ANYCHAR that have actually been built.  */
 581
 582   /* Syntax configuration.  This is near the end so that dfacopysyntax
 583      can memset up to here.  */
 584   struct regex_syntax syntax;
 585
 586   /* Information derived from the locale.  This is at the end so that
 587      a quick memset need not clear it specially.  */
 588
 589   /* dfaexec implementation.  */
 590   char *(*dfaexec) (struct dfa *, char const *, char *,
 591                     bool, ptrdiff_t *, bool *);
 592
 593   /* Other cached information derived from the locale.  */
 594   struct localeinfo localeinfo;
 595 };
 596
 597 /* User access to dfa internals.  */
 598
 599 /* S could possibly be an accepting state of R.  */
 600 static bool
 601 accepting (state_num s, struct dfa const *r)
 602 {
 603   return r->states[s].constraint != 0;
 604 }
 605
 606 /* STATE accepts in the specified context.  */
 607 static bool
 608 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 609 {
 610   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 611 }
 612
 613 static void regexp (struct dfa *dfa);
 614
 615 /* Store into *PWC the result of converting the leading bytes of the
 616    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 617    and updating the conversion state in *D.  On conversion error,
 618    convert just a single byte, to WEOF.  Return the number of bytes
 619    converted.
 620
 621    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 622
 623    * PWC points to wint_t, not to wchar_t.
 624    * The last arg is a dfa *D instead of merely a multibyte conversion
 625      state D->mbs.
 626    * N must be at least 1.
 627    * S[N - 1] must be a sentinel byte.
 628    * Shift encodings are not supported.
 629    * The return value is always in the range 1..N.
 630    * D->mbs is always valid afterwards.
 631    * *PWC is always set to something.  */
 632 static int
 633 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 634 {
 635   unsigned char uc = s[0];
 636   wint_t wc = d->localeinfo.sbctowc[uc];
 637
 638   if (wc == WEOF)
 639     {
 640       wchar_t wch;
 641       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 642       if (0 < nbytes && nbytes < (size_t) -2)
 643         {
 644           *pwc = wch;
 645           return nbytes;
 646         }
 647       memset (&d->mbs, 0, sizeof d->mbs);
 648     }
 649
 650   *pwc = wc;
 651   return 1;
 652 }
 653
 654 #ifdef DEBUG
 655
 656 static void
 657 prtok (token t)
 658 {
 659   if (t <= END)
 660     fprintf (stderr, "END");
 661   else if (0 <= t && t < NOTCHAR)
 662     {
 663       unsigned int ch = t;
 664       fprintf (stderr, "0x%02x", ch);
 665     }
 666   else
 667     {
 668       char const *s;
 669       switch (t)
 670         {
 671         case BEG:
 672           s = "BEG";
 673           break;
 674         case EMPTY:
 675           s = "EMPTY";
 676           break;
 677         case BACKREF:
 678           s = "BACKREF";
 679           break;
 680         case BEGLINE:
 681           s = "BEGLINE";
 682           break;
 683         case ENDLINE:
 684           s = "ENDLINE";
 685           break;
 686         case BEGWORD:
 687           s = "BEGWORD";
 688           break;
 689         case ENDWORD:
 690           s = "ENDWORD";
 691           break;
 692         case LIMWORD:
 693           s = "LIMWORD";
 694           break;
 695         case NOTLIMWORD:
 696           s = "NOTLIMWORD";
 697           break;
 698         case QMARK:
 699           s = "QMARK";
 700           break;
 701         case STAR:
 702           s = "STAR";
 703           break;
 704         case PLUS:
 705           s = "PLUS";
 706           break;
 707         case CAT:
 708           s = "CAT";
 709           break;
 710         case OR:
 711           s = "OR";
 712           break;
 713         case LPAREN:
 714           s = "LPAREN";
 715           break;
 716         case RPAREN:
 717           s = "RPAREN";
 718           break;
 719         case ANYCHAR:
 720           s = "ANYCHAR";
 721           break;
 722         case MBCSET:
 723           s = "MBCSET";
 724           break;
 725         default:
 726           s = "CSET";
 727           break;
 728         }
 729       fprintf (stderr, "%s", s);
 730     }
 731 }
 732 #endif /* DEBUG */
 733
 734 /* Stuff pertaining to charclasses.  */
 735
 736 static bool
 737 tstbit (unsigned int b, charclass const *c)
 738 {
 739   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 740 }
 741
 742 static void
 743 setbit (unsigned int b, charclass *c)
 744 {
 745   charclass_word one = 1;
 746   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 747 }
 748
 749 static void
 750 clrbit (unsigned int b, charclass *c)
 751 {
 752   charclass_word one = 1;
 753   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 754 }
 755
 756 static void
 757 zeroset (charclass *s)
 758 {
 759   memset (s, 0, sizeof *s);
 760 }
 761
 762 static void
 763 fillset (charclass *s)
 764 {
 765   for (int i = 0; i < CHARCLASS_WORDS; i++)
 766     s->w[i] = CHARCLASS_WORD_MASK;
 767 }
 768
 769 static void
 770 notset (charclass *s)
 771 {
 772   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 773     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 774 }
 775
 776 static bool
 777 equal (charclass const *s1, charclass const *s2)
 778 {
 779   charclass_word w = 0;
 780   for (int i = 0; i < CHARCLASS_WORDS; i++)
 781     w |= s1->w[i] ^ s2->w[i];
 782   return w == 0;
 783 }
 784
 785 static bool
 786 emptyset (charclass const *s)
 787 {
 788   charclass_word w = 0;
 789   for (int i = 0; i < CHARCLASS_WORDS; i++)
 790     w |= s->w[i];
 791   return w == 0;
 792 }
 793
 794 /* Grow PA, which points to an array of *NITEMS items, and return the
 795    location of the reallocated array, updating *NITEMS to reflect its
 796    new size.  The new array will contain at least NITEMS_INCR_MIN more
 797    items, but will not contain more than NITEMS_MAX items total.
 798    ITEM_SIZE is the size of each item, in bytes.
 799
 800    ITEM_SIZE and NITEMS_INCR_MIN must be positive.  *NITEMS must be
 801    nonnegative.  If NITEMS_MAX is -1, it is treated as if it were
 802    infinity.
 803
 804    If PA is null, then allocate a new array instead of reallocating
 805    the old one.
 806
 807    Thus, to grow an array A without saving its old contents, do
 808    { free (A); A = xpalloc (NULL, &AITEMS, ...); }.  */
 809
 810 static void *
 811 xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
 812          ptrdiff_t nitems_max, idx_t item_size)
 813 {
 814   idx_t n0 = *nitems;
 815
 816   /* The approximate size to use for initial small allocation
 817      requests.  This is the largest "small" request for the GNU C
 818      library malloc.  */
 819   enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
 820
 821   /* If the array is tiny, grow it to about (but no greater than)
 822      DEFAULT_MXFAST bytes.  Otherwise, grow it by about 50%.
 823      Adjust the growth according to three constraints: NITEMS_INCR_MIN,
 824      NITEMS_MAX, and what the C language can represent safely.  */
 825
 826   idx_t n, nbytes;
 827   if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
 828     n = IDX_MAX;
 829   if (0 <= nitems_max && nitems_max < n)
 830     n = nitems_max;
 831
 832   idx_t adjusted_nbytes
 833     = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
 834        ? MIN (IDX_MAX, SIZE_MAX)
 835        : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
 836   if (adjusted_nbytes)
 837     {
 838       n = adjusted_nbytes / item_size;
 839       nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
 840     }
 841
 842   if (! pa)
 843     *nitems = 0;
 844   if (n - n0 < nitems_incr_min
 845       && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
 846           || (0 <= nitems_max && nitems_max < n)
 847           || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
 848     xalloc_die ();
 849   pa = xrealloc (pa, nbytes);
 850   *nitems = n;
 851   return pa;
 852 }
 853
 854 /* Ensure that the array addressed by PA holds at least I + 1 items.
 855    Either return PA, or reallocate the array and return its new address.
 856    Although PA may be null, the returned value is never null.
 857
 858    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 859    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 860    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 861    ITEM_SIZE is the size of one item; it must be positive.
 862    Avoid O(N**2) behavior on arrays growing linearly.  */
 863 static void *
 864 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 865                ptrdiff_t nitems_max, idx_t item_size)
 866 {
 867   if (i < *nitems)
 868     return pa;
 869   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 870 }
 871
 872 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 873 static idx_t
 874 charclass_index (struct dfa *d, charclass const *s)
 875 {
 876   idx_t i;
 877
 878   for (i = 0; i < d->cindex; ++i)
 879     if (equal (s, &d->charclasses[i]))
 880       return i;
 881   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 882                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 883   ++d->cindex;
 884   d->charclasses[i] = *s;
 885   return i;
 886 }
 887
 888 static bool
 889 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 890 {
 891   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 892 }
 893
 894 static int
 895 char_context (struct dfa const *dfa, unsigned char c)
 896 {
 897   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 898     return CTX_NEWLINE;
 899   if (unibyte_word_constituent (dfa, c))
 900     return CTX_LETTER;
 901   return CTX_NONE;
 902 }
 903
 904 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 905    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 906    this may happen when folding case in weird Turkish locales where
 907    dotless i/dotted I are not included in the chosen character set.
 908    Return whether a bit was set in the charclass.  */
 909 static bool
 910 setbit_wc (wint_t wc, charclass *c)
 911 {
 912   int b = wctob (wc);
 913   if (b < 0)
 914     return false;
 915
 916   setbit (b, c);
 917   return true;
 918 }
 919
 920 /* Set a bit for B and its case variants in the charclass C.
 921    MB_CUR_MAX must be 1.  */
 922 static void
 923 setbit_case_fold_c (int b, charclass *c)
 924 {
 925   int ub = toupper (b);
 926   for (int i = 0; i < NOTCHAR; i++)
 927     if (toupper (i) == ub)
 928       setbit (i, c);
 929 }
 930
 931 /* Fetch the next lexical input character from the pattern.  There
 932    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 933    value of the character or to WEOF depending on whether the input is
 934    a valid multibyte character (possibly of length 1).  Then return
 935    the next input byte value, except return EOF if the input is a
 936    multibyte character of length greater than 1.  */
 937 static int
 938 fetch_wc (struct dfa *dfa)
 939 {
 940   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 941                              dfa);
 942   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 943   dfa->lex.ptr += nbytes;
 944   dfa->lex.left -= nbytes;
 945   return c;
 946 }
 947
 948 /* If there is no more input, report an error about unbalanced brackets.
 949    Otherwise, behave as with fetch_wc (DFA).  */
 950 static int
 951 bracket_fetch_wc (struct dfa *dfa)
 952 {
 953   if (! dfa->lex.left)
 954     dfaerror (_("unbalanced ["));
 955   return fetch_wc (dfa);
 956 }
 957
 958 typedef int predicate (int);
 959
 960 /* The following list maps the names of the Posix named character classes
 961    to predicate functions that determine whether a given character is in
 962    the class.  The leading [ has already been eaten by the lexical
 963    analyzer.  */
 964 struct dfa_ctype
 965 {
 966   const char *name;
 967   predicate *func;
 968   bool single_byte_only;
 969 };
 970
 971 static const struct dfa_ctype prednames[] = {
 972   {"alpha", isalpha, false},
 973   {"upper", isupper, false},
 974   {"lower", islower, false},
 975   {"digit", isdigit, true},
 976   {"xdigit", isxdigit, false},
 977   {"space", isspace, false},
 978   {"punct", ispunct, false},
 979   {"alnum", isalnum, false},
 980   {"print", isprint, false},
 981   {"graph", isgraph, false},
 982   {"cntrl", iscntrl, false},
 983   {"blank", isblank, false},
 984   {NULL, NULL, false}
 985 };
 986
 987 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 988 find_pred (const char *str)
 989 {
 990   for (int i = 0; prednames[i].name; i++)
 991     if (streq (str, prednames[i].name))
 992       return &prednames[i];
 993   return NULL;
 994 }
 995
 996 /* Parse a bracket expression, which possibly includes multibyte
 997    characters.  */
 998 static token
 999 parse_bracket_exp (struct dfa *dfa)
1000 {
1001   /* This is a bracket expression that dfaexec is known to
1002      process correctly.  */
1003   bool known_bracket_exp = true;
1004
1005   /* Used to warn about [:space:].
1006      Bit 0 = first character is a colon.
1007      Bit 1 = last character is a colon.
1008      Bit 2 = includes any other character but a colon.
1009      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
1010   int colon_warning_state;
1011
1012   dfa->lex.brack.nchars = 0;
1013   charclass ccl;
1014   zeroset (&ccl);
1015   int c = bracket_fetch_wc (dfa);
1016   bool invert = c == '^';
1017   if (invert)
1018     {
1019       c = bracket_fetch_wc (dfa);
1020       known_bracket_exp = dfa->localeinfo.simple;
1021     }
1022   wint_t wc = dfa->lex.wctok;
1023   int c1;
1024   wint_t wc1;
1025   colon_warning_state = (c == ':');
1026   do
1027     {
1028       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
1029       colon_warning_state &= ~2;
1030
1031       /* Note that if we're looking at some other [:...:] construct,
1032          we just treat it as a bunch of ordinary characters.  We can do
1033          this because we assume regex has checked for syntax errors before
1034          dfa is ever called.  */
1035       if (c == '[')
1036         {
1037           c1 = bracket_fetch_wc (dfa);
1038           wc1 = dfa->lex.wctok;
1039
1040           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1041               || c1 == '.' || c1 == '=')
1042             {
1043               enum { MAX_BRACKET_STRING_LEN = 32 };
1044               char str[MAX_BRACKET_STRING_LEN + 1];
1045               int len = 0;
1046               for (;;)
1047                 {
1048                   c = bracket_fetch_wc (dfa);
1049                   if (dfa->lex.left == 0
1050                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1051                     break;
1052                   if (len < MAX_BRACKET_STRING_LEN)
1053                     str[len++] = c;
1054                   else
1055                     /* This is in any case an invalid class name.  */
1056                     str[0] = '\0';
1057                 }
1058               str[len] = '\0';
1059
1060               /* Fetch bracket.  */
1061               c = bracket_fetch_wc (dfa);
1062               wc = dfa->lex.wctok;
1063               if (c1 == ':')
1064                 /* Build character class.  POSIX allows character
1065                    classes to match multicharacter collating elements,
1066                    but the regex code does not support that, so do not
1067                    worry about that possibility.  */
1068                 {
1069                   char const *class
1070                     = (dfa->syntax.case_fold && (streq (str, "upper")
1071                                                  || streq (str, "lower"))
1072                        ? "alpha" : str);
1073                   const struct dfa_ctype *pred = find_pred (class);
1074                   if (!pred)
1075                     dfaerror (_("invalid character class"));
1076
1077                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1078                     known_bracket_exp = false;
1079                   else
1080                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1081                       if (pred->func (c2))
1082                         setbit (c2, &ccl);
1083                 }
1084               else
1085                 known_bracket_exp = false;
1086
1087               colon_warning_state |= 8;
1088
1089               /* Fetch new lookahead character.  */
1090               c1 = bracket_fetch_wc (dfa);
1091               wc1 = dfa->lex.wctok;
1092               continue;
1093             }
1094
1095           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1096              are already set up.  */
1097         }
1098
1099       if (c == '\\'
1100           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1101         {
1102           c = bracket_fetch_wc (dfa);
1103           wc = dfa->lex.wctok;
1104         }
1105
1106       if (c1 == NOTCHAR)
1107         {
1108           c1 = bracket_fetch_wc (dfa);
1109           wc1 = dfa->lex.wctok;
1110         }
1111
1112       if (c1 == '-')
1113         /* build range characters.  */
1114         {
1115           int c2 = bracket_fetch_wc (dfa);
1116           wint_t wc2 = dfa->lex.wctok;
1117
1118           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1119              Treat it like [-a[.aa.]] while parsing it, and
1120              remember that the set is unknown.  */
1121           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1122             {
1123               known_bracket_exp = false;
1124               c2 = ']';
1125             }
1126
1127           if (c2 == ']')
1128             {
1129               /* In the case [x-], the - is an ordinary hyphen,
1130                  which is left in c1, the lookahead character.  */
1131               dfa->lex.ptr--;
1132               dfa->lex.left++;
1133             }
1134           else
1135             {
1136               if (c2 == '\\' && (dfa->syntax.syntax_bits
1137                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1138                 {
1139                   c2 = bracket_fetch_wc (dfa);
1140                   wc2 = dfa->lex.wctok;
1141                 }
1142
1143               colon_warning_state |= 8;
1144               c1 = bracket_fetch_wc (dfa);
1145               wc1 = dfa->lex.wctok;
1146
1147               /* Treat [x-y] as a range if x != y.  */
1148               if (wc != wc2 || wc == WEOF)
1149                 {
1150                   if (dfa->localeinfo.simple
1151                       || (isasciidigit (c) & isasciidigit (c2)))
1152                     {
1153                       for (int ci = c; ci <= c2; ci++)
1154                         if (dfa->syntax.case_fold && isalpha (ci))
1155                           setbit_case_fold_c (ci, &ccl);
1156                         else
1157                           setbit (ci, &ccl);
1158                     }
1159                   else
1160                     known_bracket_exp = false;
1161
1162                   continue;
1163                 }
1164             }
1165         }
1166
1167       colon_warning_state |= (c == ':') ? 2 : 4;
1168
1169       if (!dfa->localeinfo.multibyte)
1170         {
1171           if (dfa->syntax.case_fold && isalpha (c))
1172             setbit_case_fold_c (c, &ccl);
1173           else
1174             setbit (c, &ccl);
1175           continue;
1176         }
1177
1178       if (wc == WEOF)
1179         known_bracket_exp = false;
1180       else
1181         {
1182           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1183           int n = (dfa->syntax.case_fold
1184                    ? case_folded_counterparts (wc, folded + 1) + 1
1185                    : 1);
1186           folded[0] = wc;
1187           for (int i = 0; i < n; i++)
1188             if (!setbit_wc (folded[i], &ccl))
1189               {
1190                 dfa->lex.brack.chars
1191                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1192                                    &dfa->lex.brack.nchars_alloc, -1,
1193                                    sizeof *dfa->lex.brack.chars);
1194                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1195               }
1196         }
1197     }
1198   while ((wc = wc1, (c = c1) != ']'));
1199
1200   if (colon_warning_state == 7)
1201     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1202
1203   if (! known_bracket_exp)
1204     return BACKREF;
1205
1206   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1207     {
1208       dfa->lex.brack.invert = invert;
1209       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1210       return MBCSET;
1211     }
1212
1213   if (invert)
1214     {
1215       notset (&ccl);
1216       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1217         clrbit ('\n', &ccl);
1218     }
1219
1220   return CSET + charclass_index (dfa, &ccl);
1221 }
1222
1223 struct lexptr
1224 {
1225   char const *ptr;
1226   idx_t left;
1227 };
1228
1229 static void
1230 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1231 {
1232   ls->ptr = dfa->lex.ptr;
1233   ls->left = dfa->lex.left;
1234   dfa->lex.ptr = s;
1235   dfa->lex.left = strlen (s);
1236 }
1237
1238 static void
1239 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1240 {
1241   dfa->lex.ptr = ls->ptr;
1242   dfa->lex.left = ls->left;
1243 }
1244
1245 static token
1246 lex (struct dfa *dfa)
1247 {
1248   bool backslash = false;
1249
1250   /* Basic plan: We fetch a character.  If it's a backslash,
1251      we set the backslash flag and go through the loop again.
1252      On the plus side, this avoids having a duplicate of the
1253      main switch inside the backslash case.  On the minus side,
1254      it means that just about every case begins with
1255      "if (backslash) ...".  */
1256   for (int i = 0; i < 2; ++i)
1257     {
1258       if (! dfa->lex.left)
1259         return dfa->lex.lasttok = END;
1260       int c = fetch_wc (dfa);
1261
1262       switch (c)
1263         {
1264         case '\\':
1265           if (backslash)
1266             goto normal_char;
1267           if (dfa->lex.left == 0)
1268             dfaerror (_("unfinished \\ escape"));
1269           backslash = true;
1270           break;
1271
1272         case '^':
1273           if (backslash)
1274             goto normal_char;
1275           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1276               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1277               || dfa->lex.lasttok == OR)
1278             return dfa->lex.lasttok = BEGLINE;
1279           goto normal_char;
1280
1281         case '$':
1282           if (backslash)
1283             goto normal_char;
1284           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1285               || dfa->lex.left == 0
1286               || ((dfa->lex.left
1287                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1288                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1289                                    & (dfa->lex.ptr[0] == '\\')]
1290                       == ')'))
1291               || ((dfa->lex.left
1292                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1293                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1294                                    & (dfa->lex.ptr[0] == '\\')]
1295                       == '|'))
1296               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1297                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1298             return dfa->lex.lasttok = ENDLINE;
1299           goto normal_char;
1300
1301         case '1':
1302         case '2':
1303         case '3':
1304         case '4':
1305         case '5':
1306         case '6':
1307         case '7':
1308         case '8':
1309         case '9':
1310           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1311             {
1312               dfa->lex.laststart = false;
1313               return dfa->lex.lasttok = BACKREF;
1314             }
1315           goto normal_char;
1316
1317         case '`':
1318           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1319             {
1320               /* FIXME: should be beginning of string */
1321               return dfa->lex.lasttok = BEGLINE;
1322             }
1323           goto normal_char;
1324
1325         case '\'':
1326           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1327             {
1328               /* FIXME: should be end of string */
1329               return dfa->lex.lasttok = ENDLINE;
1330             }
1331           goto normal_char;
1332
1333         case '<':
1334           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1335             return dfa->lex.lasttok = BEGWORD;
1336           goto normal_char;
1337
1338         case '>':
1339           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1340             return dfa->lex.lasttok = ENDWORD;
1341           goto normal_char;
1342
1343         case 'b':
1344           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1345             return dfa->lex.lasttok = LIMWORD;
1346           goto normal_char;
1347
1348         case 'B':
1349           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1350             return dfa->lex.lasttok = NOTLIMWORD;
1351           goto normal_char;
1352
1353         case '?':
1354           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1355             goto normal_char;
1356           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1357             goto normal_char;
1358           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1359               && dfa->lex.laststart)
1360             goto normal_char;
1361           return dfa->lex.lasttok = QMARK;
1362
1363         case '*':
1364           if (backslash)
1365             goto normal_char;
1366           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1367               && dfa->lex.laststart)
1368             goto normal_char;
1369           return dfa->lex.lasttok = STAR;
1370
1371         case '+':
1372           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1373             goto normal_char;
1374           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1375             goto normal_char;
1376           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1377               && dfa->lex.laststart)
1378             goto normal_char;
1379           return dfa->lex.lasttok = PLUS;
1380
1381         case '{':
1382           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1383             goto normal_char;
1384           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1385             goto normal_char;
1386           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1387               && dfa->lex.laststart)
1388             goto normal_char;
1389
1390           /* Cases:
1391              {M} - exact count
1392              {M,} - minimum count, maximum is infinity
1393              {,N} - 0 through N
1394              {,} - 0 to infinity (same as '*')
1395              {M,N} - M through N */
1396           {
1397             char const *p = dfa->lex.ptr;
1398             char const *lim = p + dfa->lex.left;
1399             dfa->lex.minrep = dfa->lex.maxrep = -1;
1400             for (; p != lim && isasciidigit (*p); p++)
1401               dfa->lex.minrep = (dfa->lex.minrep < 0
1402                                  ? *p - '0'
1403                                  : MIN (RE_DUP_MAX + 1,
1404                                         dfa->lex.minrep * 10 + *p - '0'));
1405             if (p != lim)
1406               {
1407                 if (*p != ',')
1408                   dfa->lex.maxrep = dfa->lex.minrep;
1409                 else
1410                   {
1411                     if (dfa->lex.minrep < 0)
1412                       dfa->lex.minrep = 0;
1413                     while (++p != lim && isasciidigit (*p))
1414                       dfa->lex.maxrep
1415                         = (dfa->lex.maxrep < 0
1416                            ? *p - '0'
1417                            : MIN (RE_DUP_MAX + 1,
1418                                   dfa->lex.maxrep * 10 + *p - '0'));
1419                   }
1420               }
1421             if (! ((! backslash || (p != lim && *p++ == '\\'))
1422                    && p != lim && *p++ == '}'
1423                    && 0 <= dfa->lex.minrep
1424                    && (dfa->lex.maxrep < 0
1425                        || dfa->lex.minrep <= dfa->lex.maxrep)))
1426               {
1427                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1428                   goto normal_char;
1429                 dfaerror (_("invalid content of \\{\\}"));
1430               }
1431             if (RE_DUP_MAX < dfa->lex.maxrep)
1432               dfaerror (_("regular expression too big"));
1433             dfa->lex.ptr = p;
1434             dfa->lex.left = lim - p;
1435           }
1436           dfa->lex.laststart = false;
1437           return dfa->lex.lasttok = REPMN;
1438
1439         case '|':
1440           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1441             goto normal_char;
1442           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1443             goto normal_char;
1444           dfa->lex.laststart = true;
1445           return dfa->lex.lasttok = OR;
1446
1447         case '\n':
1448           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1449               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1450             goto normal_char;
1451           dfa->lex.laststart = true;
1452           return dfa->lex.lasttok = OR;
1453
1454         case '(':
1455           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1456             goto normal_char;
1457           dfa->lex.parens++;
1458           dfa->lex.laststart = true;
1459           return dfa->lex.lasttok = LPAREN;
1460
1461         case ')':
1462           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1463             goto normal_char;
1464           if (dfa->lex.parens == 0
1465               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1466             goto normal_char;
1467           dfa->lex.parens--;
1468           dfa->lex.laststart = false;
1469           return dfa->lex.lasttok = RPAREN;
1470
1471         case '.':
1472           if (backslash)
1473             goto normal_char;
1474           if (dfa->canychar < 0)
1475             {
1476               charclass ccl;
1477               fillset (&ccl);
1478               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1479                 clrbit ('\n', &ccl);
1480               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1481                 clrbit ('\0', &ccl);
1482               if (dfa->localeinfo.multibyte)
1483                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1484                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1485                     clrbit (c2, &ccl);
1486               dfa->canychar = charclass_index (dfa, &ccl);
1487             }
1488           dfa->lex.laststart = false;
1489           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1490                                      ? ANYCHAR
1491                                      : CSET + dfa->canychar);
1492
1493         case 's':
1494         case 'S':
1495           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1496             goto normal_char;
1497           if (!dfa->localeinfo.multibyte)
1498             {
1499               charclass ccl;
1500               zeroset (&ccl);
1501               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1502                 if (isspace (c2))
1503                   setbit (c2, &ccl);
1504               if (c == 'S')
1505                 notset (&ccl);
1506               dfa->lex.laststart = false;
1507               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1508             }
1509
1510           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1511              add_utf8_anychar, makes sense.  */
1512
1513           /* \s and \S are documented to be equivalent to [[:space:]] and
1514              [^[:space:]] respectively, so tell the lexer to process those
1515              strings, each minus its "already processed" '['.  */
1516           {
1517             struct lexptr ls;
1518             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1519             dfa->lex.lasttok = parse_bracket_exp (dfa);
1520             pop_lex_state (dfa, &ls);
1521           }
1522
1523           dfa->lex.laststart = false;
1524           return dfa->lex.lasttok;
1525
1526         case 'w':
1527         case 'W':
1528           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1529             goto normal_char;
1530
1531           if (!dfa->localeinfo.multibyte)
1532             {
1533               charclass ccl;
1534               zeroset (&ccl);
1535               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1536                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1537                   setbit (c2, &ccl);
1538               if (c == 'W')
1539                 notset (&ccl);
1540               dfa->lex.laststart = false;
1541               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1542             }
1543
1544           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1545              add_utf8_anychar, makes sense.  */
1546
1547           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1548              [^_[:alnum:]] respectively, so tell the lexer to process those
1549              strings, each minus its "already processed" '['.  */
1550           {
1551             struct lexptr ls;
1552             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1553             dfa->lex.lasttok = parse_bracket_exp (dfa);
1554             pop_lex_state (dfa, &ls);
1555           }
1556
1557           dfa->lex.laststart = false;
1558           return dfa->lex.lasttok;
1559
1560         case '[':
1561           if (backslash)
1562             goto normal_char;
1563           dfa->lex.laststart = false;
1564           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1565
1566         default:
1567         normal_char:
1568           dfa->lex.laststart = false;
1569           /* For multibyte character sets, folding is done in atom.  Always
1570              return WCHAR.  */
1571           if (dfa->localeinfo.multibyte)
1572             return dfa->lex.lasttok = WCHAR;
1573
1574           if (dfa->syntax.case_fold && isalpha (c))
1575             {
1576               charclass ccl;
1577               zeroset (&ccl);
1578               setbit_case_fold_c (c, &ccl);
1579               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1580             }
1581
1582           return dfa->lex.lasttok = c;
1583         }
1584     }
1585
1586   /* The above loop should consume at most a backslash
1587      and some other character.  */
1588   abort ();
1589   return END;                   /* keeps pedantic compilers happy.  */
1590 }
1591
1592 static void
1593 addtok_mb (struct dfa *dfa, token t, char mbprop)
1594 {
1595   if (dfa->talloc == dfa->tindex)
1596     {
1597       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1598                              sizeof *dfa->tokens);
1599       if (dfa->localeinfo.multibyte)
1600         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1601                                          sizeof *dfa->multibyte_prop);
1602     }
1603   if (dfa->localeinfo.multibyte)
1604     dfa->multibyte_prop[dfa->tindex] = mbprop;
1605   dfa->tokens[dfa->tindex++] = t;
1606
1607   switch (t)
1608     {
1609     case QMARK:
1610     case STAR:
1611     case PLUS:
1612       break;
1613
1614     case CAT:
1615     case OR:
1616       dfa->parse.depth--;
1617       break;
1618
1619     case EMPTY:
1620       dfa->epsilon = true;
1621       goto increment_depth;
1622
1623     case BACKREF:
1624       dfa->fast = false;
1625       goto increment_nleaves;
1626
1627     case BEGLINE:
1628     case ENDLINE:
1629     case BEGWORD:
1630     case ENDWORD:
1631     case LIMWORD:
1632     case NOTLIMWORD:
1633       dfa->epsilon = true;
1634       FALLTHROUGH;
1635     default:
1636     increment_nleaves:
1637       dfa->nleaves++;
1638     increment_depth:
1639       dfa->parse.depth++;
1640       if (dfa->depth < dfa->parse.depth)
1641         dfa->depth = dfa->parse.depth;
1642       break;
1643     }
1644 }
1645
1646 static void addtok_wc (struct dfa *dfa, wint_t wc);
1647
1648 /* Add the given token to the parse tree, maintaining the depth count and
1649    updating the maximum depth if necessary.  */
1650 static void
1651 addtok (struct dfa *dfa, token t)
1652 {
1653   if (dfa->localeinfo.multibyte && t == MBCSET)
1654     {
1655       bool need_or = false;
1656
1657       /* Extract wide characters into alternations for better performance.
1658          This does not require UTF-8.  */
1659       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1660         {
1661           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1662           if (need_or)
1663             addtok (dfa, OR);
1664           need_or = true;
1665         }
1666       dfa->lex.brack.nchars = 0;
1667
1668       /* Wide characters have been handled above, so it is possible
1669          that the set is empty now.  Do nothing in that case.  */
1670       if (dfa->lex.brack.cset != -1)
1671         {
1672           addtok (dfa, CSET + dfa->lex.brack.cset);
1673           if (need_or)
1674             addtok (dfa, OR);
1675         }
1676     }
1677   else
1678     {
1679       addtok_mb (dfa, t, 3);
1680     }
1681 }
1682
1683 /* We treat a multibyte character as a single atom, so that DFA
1684    can treat a multibyte character as a single expression.
1685
1686    e.g., we construct the following tree from "<mb1><mb2>".
1687    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1688    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1689 static void
1690 addtok_wc (struct dfa *dfa, wint_t wc)
1691 {
1692   unsigned char buf[MB_LEN_MAX];
1693   mbstate_t s = { 0 };
1694   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1695   int buflen;
1696
1697   if (stored_bytes != (size_t) -1)
1698     buflen = stored_bytes;
1699   else
1700     {
1701       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1702          the addtok_mb call altogether can corrupt the heap.  */
1703       buflen = 1;
1704       buf[0] = 0;
1705     }
1706
1707   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1708   for (int i = 1; i < buflen; i++)
1709     {
1710       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1711       addtok (dfa, CAT);
1712     }
1713 }
1714
1715 static void
1716 add_utf8_anychar (struct dfa *dfa)
1717 {
1718   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1719      UTF-8 byte sequence has been defined as follows:
1720
1721      ([\x00-\x7f]
1722      |[\xc2-\xdf][\x80-\xbf]
1723      |[\xe0][\xa0-\xbf][\x80-\xbf]
1724      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1725      |[\xed][\x80-\x9f][\x80-\xbf]
1726      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1727      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1728      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1729
1730      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1731      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1732      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1733      H = [\x80-\x9f], I = [\xf0],
1734      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1735
1736      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1737
1738   /* Mnemonics for classes containing two or more bytes.  */
1739   enum { A, B, C, E, F, H, J, K, M };
1740
1741   /* Mnemonics for single-byte tokens.  */
1742   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1743
1744   static charclass const utf8_classes[] = {
1745     /* A. 00-7f: 1-byte sequence.  */
1746     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1747
1748     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1749     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1750
1751     /* C. 80-bf: non-leading bytes.  */
1752     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1753
1754     /* D. e0 (just a token).  */
1755
1756     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1757     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1758
1759     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1760     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1761
1762     /* G. ed (just a token).  */
1763
1764     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1765     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1766
1767     /* I. f0 (just a token).  */
1768
1769     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1770     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1771
1772     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1773     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1774
1775     /* L. f4 (just a token).  */
1776
1777     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1778     CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1779   };
1780
1781   /* Define the character classes that are needed below.  */
1782   if (dfa->utf8_anychar_classes[0] == 0)
1783     {
1784       charclass c = utf8_classes[0];
1785       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1786         clrbit ('\n', &c);
1787       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1788         clrbit ('\0', &c);
1789       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1790
1791       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1792         dfa->utf8_anychar_classes[i]
1793           = CSET + charclass_index (dfa, &utf8_classes[i]);
1794     }
1795
1796   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1797      The token buffer is in reverse Polish order, so we get
1798      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1799       C CAT OR C CAT OR C CAT OR".  */
1800   addtok (dfa, dfa->utf8_anychar_classes[A]);
1801   addtok (dfa, dfa->utf8_anychar_classes[B]);
1802   addtok (dfa, D_token);
1803   addtok (dfa, dfa->utf8_anychar_classes[E]);
1804   addtok (dfa, CAT);
1805   addtok (dfa, OR);
1806   addtok (dfa, G_token);
1807   addtok (dfa, dfa->utf8_anychar_classes[H]);
1808   addtok (dfa, CAT);
1809   addtok (dfa, OR);
1810   addtok (dfa, dfa->utf8_anychar_classes[F]);
1811   addtok (dfa, I_token);
1812   addtok (dfa, dfa->utf8_anychar_classes[J]);
1813   addtok (dfa, CAT);
1814   addtok (dfa, OR);
1815   addtok (dfa, L_token);
1816   addtok (dfa, dfa->utf8_anychar_classes[M]);
1817   addtok (dfa, CAT);
1818   addtok (dfa, OR);
1819   addtok (dfa, dfa->utf8_anychar_classes[K]);
1820   for (int i = 0; i < 3; i++)
1821     {
1822       addtok (dfa, dfa->utf8_anychar_classes[C]);
1823       addtok (dfa, CAT);
1824       addtok (dfa, OR);
1825     }
1826 }
1827
1828 /* The grammar understood by the parser is as follows.
1829
1830    regexp:
1831      regexp OR branch
1832      branch
1833
1834    branch:
1835      branch closure
1836      closure
1837
1838    closure:
1839      closure QMARK
1840      closure STAR
1841      closure PLUS
1842      closure REPMN
1843      atom
1844
1845    atom:
1846      <normal character>
1847      <multibyte character>
1848      ANYCHAR
1849      MBCSET
1850      CSET
1851      BACKREF
1852      BEGLINE
1853      ENDLINE
1854      BEGWORD
1855      ENDWORD
1856      LIMWORD
1857      NOTLIMWORD
1858      LPAREN regexp RPAREN
1859      <empty>
1860
1861    The parser builds a parse tree in postfix form in an array of tokens.  */
1862
1863 static void
1864 atom (struct dfa *dfa)
1865 {
1866   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1867       || dfa->parse.tok >= CSET
1868       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1869       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1870       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1871       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1872       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1873     {
1874       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1875         {
1876           /* For UTF-8 expand the period to a series of CSETs that define a
1877              valid UTF-8 character.  This avoids using the slow multibyte
1878              path.  I'm pretty sure it would be both profitable and correct to
1879              do it for any encoding; however, the optimization must be done
1880              manually as it is done above in add_utf8_anychar.  So, let's
1881              start with UTF-8: it is the most used, and the structure of the
1882              encoding makes the correctness more obvious.  */
1883           add_utf8_anychar (dfa);
1884         }
1885       else
1886         addtok (dfa, dfa->parse.tok);
1887       dfa->parse.tok = lex (dfa);
1888     }
1889   else if (dfa->parse.tok == WCHAR)
1890     {
1891       if (dfa->lex.wctok == WEOF)
1892         addtok (dfa, BACKREF);
1893       else
1894         {
1895           addtok_wc (dfa, dfa->lex.wctok);
1896
1897           if (dfa->syntax.case_fold)
1898             {
1899               wchar_t folded[CASE_FOLDED_BUFSIZE];
1900               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1901               for (int i = 0; i < n; i++)
1902                 {
1903                   addtok_wc (dfa, folded[i]);
1904                   addtok (dfa, OR);
1905                 }
1906             }
1907         }
1908
1909       dfa->parse.tok = lex (dfa);
1910     }
1911   else if (dfa->parse.tok == LPAREN)
1912     {
1913       dfa->parse.tok = lex (dfa);
1914       regexp (dfa);
1915       if (dfa->parse.tok != RPAREN)
1916         dfaerror (_("unbalanced ("));
1917       dfa->parse.tok = lex (dfa);
1918     }
1919   else
1920     addtok (dfa, EMPTY);
1921 }
1922
1923 /* Return the number of tokens in the given subexpression.  */
1924 static idx_t _GL_ATTRIBUTE_PURE
1925 nsubtoks (struct dfa const *dfa, idx_t tindex)
1926 {
1927   switch (dfa->tokens[tindex - 1])
1928     {
1929     default:
1930       return 1;
1931     case QMARK:
1932     case STAR:
1933     case PLUS:
1934       return 1 + nsubtoks (dfa, tindex - 1);
1935     case CAT:
1936     case OR:
1937       {
1938         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1939         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1940       }
1941     }
1942 }
1943
1944 /* Copy the given subexpression to the top of the tree.  */
1945 static void
1946 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1947 {
1948   if (dfa->localeinfo.multibyte)
1949     for (idx_t i = 0; i < ntokens; i++)
1950       addtok_mb (dfa, dfa->tokens[tindex + i],
1951                  dfa->multibyte_prop[tindex + i]);
1952   else
1953     for (idx_t i = 0; i < ntokens; i++)
1954       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1955 }
1956
1957 static void
1958 closure (struct dfa *dfa)
1959 {
1960   atom (dfa);
1961   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1962          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1963     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1964       {
1965         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1966         idx_t tindex = dfa->tindex - ntokens;
1967         if (dfa->lex.maxrep < 0)
1968           addtok (dfa, PLUS);
1969         if (dfa->lex.minrep == 0)
1970           addtok (dfa, QMARK);
1971         int i;
1972         for (i = 1; i < dfa->lex.minrep; i++)
1973           {
1974             copytoks (dfa, tindex, ntokens);
1975             addtok (dfa, CAT);
1976           }
1977         for (; i < dfa->lex.maxrep; i++)
1978           {
1979             copytoks (dfa, tindex, ntokens);
1980             addtok (dfa, QMARK);
1981             addtok (dfa, CAT);
1982           }
1983         dfa->parse.tok = lex (dfa);
1984       }
1985     else if (dfa->parse.tok == REPMN)
1986       {
1987         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1988         dfa->parse.tok = lex (dfa);
1989         closure (dfa);
1990       }
1991     else
1992       {
1993         addtok (dfa, dfa->parse.tok);
1994         dfa->parse.tok = lex (dfa);
1995       }
1996 }
1997
1998 static void
1999 branch (struct dfa* dfa)
2000 {
2001   closure (dfa);
2002   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2003          && dfa->parse.tok >= 0)
2004     {
2005       closure (dfa);
2006       addtok (dfa, CAT);
2007     }
2008 }
2009
2010 static void
2011 regexp (struct dfa *dfa)
2012 {
2013   branch (dfa);
2014   while (dfa->parse.tok == OR)
2015     {
2016       dfa->parse.tok = lex (dfa);
2017       branch (dfa);
2018       addtok (dfa, OR);
2019     }
2020 }
2021
2022 /* Parse a string S of length LEN into D.  S can include NUL characters.
2023    This is the main entry point for the parser.  */
2024 void
2025 dfaparse (char const *s, idx_t len, struct dfa *d)
2026 {
2027   d->lex.ptr = s;
2028   d->lex.left = len;
2029   d->lex.lasttok = END;
2030   d->lex.laststart = true;
2031
2032   if (!d->syntax.syntax_bits_set)
2033     dfaerror (_("no syntax specified"));
2034
2035   if (!d->nregexps)
2036     addtok (d, BEG);
2037
2038   d->parse.tok = lex (d);
2039   d->parse.depth = d->depth;
2040
2041   regexp (d);
2042
2043   if (d->parse.tok != END)
2044     dfaerror (_("unbalanced )"));
2045
2046   addtok (d, END - d->nregexps);
2047   addtok (d, CAT);
2048
2049   if (d->nregexps)
2050     addtok (d, OR);
2051
2052   ++d->nregexps;
2053 }
2054
2055 /* Some primitives for operating on sets of positions.  */
2056
2057 /* Copy one set to another.  */
2058 static void
2059 copy (position_set const *src, position_set *dst)
2060 {
2061   if (dst->alloc < src->nelem)
2062     {
2063       free (dst->elems);
2064       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2065                             sizeof *dst->elems);
2066     }
2067   dst->nelem = src->nelem;
2068   if (src->nelem != 0)
2069     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2070 }
2071
2072 static void
2073 alloc_position_set (position_set *s, idx_t size)
2074 {
2075   s->elems = xnmalloc (size, sizeof *s->elems);
2076   s->alloc = size;
2077   s->nelem = 0;
2078 }
2079
2080 /* Insert position P in set S.  S is maintained in sorted order on
2081    decreasing index.  If there is already an entry in S with P.index
2082    then merge (logically-OR) P's constraints into the one in S.
2083    S->elems must point to an array large enough to hold the resulting set.  */
2084 static void
2085 insert (position p, position_set *s)
2086 {
2087   idx_t count = s->nelem;
2088   idx_t lo = 0, hi = count;
2089   while (lo < hi)
2090     {
2091       idx_t mid = (lo + hi) >> 1;
2092       if (s->elems[mid].index < p.index)
2093         lo = mid + 1;
2094       else if (s->elems[mid].index == p.index)
2095         {
2096           s->elems[mid].constraint |= p.constraint;
2097           return;
2098         }
2099       else
2100         hi = mid;
2101     }
2102
2103   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2104   for (idx_t i = count; i > lo; i--)
2105     s->elems[i] = s->elems[i - 1];
2106   s->elems[lo] = p;
2107   ++s->nelem;
2108 }
2109
2110 static void
2111 append (position p, position_set *s)
2112 {
2113   idx_t count = s->nelem;
2114   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2115   s->elems[s->nelem++] = p;
2116 }
2117
2118 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2119    result is as if the positions of S1, and of S2 with the additional
2120    constraint C2, were inserted into an initially empty set.  */
2121 static void
2122 merge_constrained (position_set const *s1, position_set const *s2,
2123                    unsigned int c2, position_set *m)
2124 {
2125   idx_t i = 0, j = 0;
2126
2127   if (m->alloc - s1->nelem < s2->nelem)
2128     {
2129       free (m->elems);
2130       m->alloc = s1->nelem;
2131       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2132     }
2133   m->nelem = 0;
2134   while (i < s1->nelem || j < s2->nelem)
2135     if (! (j < s2->nelem)
2136         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2137       {
2138         unsigned int c = ((i < s1->nelem && j < s2->nelem
2139                            && s1->elems[i].index == s2->elems[j].index)
2140                           ? s2->elems[j++].constraint & c2
2141                           : 0);
2142         m->elems[m->nelem].index = s1->elems[i].index;
2143         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2144       }
2145     else
2146       {
2147         if (s2->elems[j].constraint & c2)
2148           {
2149             m->elems[m->nelem].index = s2->elems[j].index;
2150             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2151           }
2152         j++;
2153       }
2154 }
2155
2156 /* Merge two sets of positions into a third.  The result is exactly as if
2157    the positions of both sets were inserted into an initially empty set.  */
2158 static void
2159 merge (position_set const *s1, position_set const *s2, position_set *m)
2160 {
2161   merge_constrained (s1, s2, -1, m);
2162 }
2163
2164 /* Merge into DST all the elements of SRC, possibly destroying
2165    the contents of the temporary M.  */
2166 static void
2167 merge2 (position_set *dst, position_set const *src, position_set *m)
2168 {
2169   if (src->nelem < 4)
2170     {
2171       for (idx_t i = 0; i < src->nelem; i++)
2172         insert (src->elems[i], dst);
2173     }
2174    else
2175     {
2176       merge (src, dst, m);
2177       copy (m, dst);
2178     }
2179 }
2180
2181 /* Delete a position from a set.  Return the nonzero constraint of the
2182    deleted position, or zero if there was no such position.  */
2183 static unsigned int
2184 delete (idx_t del, position_set *s)
2185 {
2186   idx_t count = s->nelem;
2187   idx_t lo = 0, hi = count;
2188   while (lo < hi)
2189     {
2190       idx_t mid = (lo + hi) >> 1;
2191       if (s->elems[mid].index < del)
2192         lo = mid + 1;
2193       else if (s->elems[mid].index == del)
2194         {
2195           unsigned int c = s->elems[mid].constraint;
2196           idx_t i;
2197           for (i = mid; i + 1 < count; i++)
2198             s->elems[i] = s->elems[i + 1];
2199           s->nelem = i;
2200           return c;
2201         }
2202       else
2203         hi = mid;
2204     }
2205   return 0;
2206 }
2207
2208 /* Replace a position with the followed set.  */
2209 static void
2210 replace (position_set *dst, idx_t del, position_set *add,
2211          unsigned int constraint, position_set *tmp)
2212 {
2213   unsigned int c = delete (del, dst) & constraint;
2214
2215   if (c)
2216     {
2217       copy (dst, tmp);
2218       merge_constrained (tmp, add, c, dst);
2219     }
2220 }
2221
2222 /* Find the index of the state corresponding to the given position set with
2223    the given preceding context, or create a new state if there is no such
2224    state.  Context tells whether we got here on a newline or letter.  */
2225 static state_num
2226 state_index (struct dfa *d, position_set const *s, int context)
2227 {
2228   size_t hash = 0;
2229   int constraint = 0;
2230   state_num i;
2231
2232   for (i = 0; i < s->nelem; ++i)
2233     {
2234       size_t ind = s->elems[i].index;
2235       hash ^= ind + s->elems[i].constraint;
2236     }
2237
2238   /* Try to find a state that exactly matches the proposed one.  */
2239   for (i = 0; i < d->sindex; ++i)
2240     {
2241       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2242           || context != d->states[i].context)
2243         continue;
2244       state_num j;
2245       for (j = 0; j < s->nelem; ++j)
2246         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2247             || s->elems[j].index != d->states[i].elems.elems[j].index)
2248           break;
2249       if (j == s->nelem)
2250         return i;
2251     }
2252
2253 #ifdef DEBUG
2254   fprintf (stderr, "new state %td\n nextpos:", i);
2255   for (state_num j = 0; j < s->nelem; j++)
2256     {
2257       fprintf (stderr, " %td:", s->elems[j].index);
2258       prtok (d->tokens[s->elems[j].index]);
2259     }
2260   fprintf (stderr, "\n context:");
2261   if (context ^ CTX_ANY)
2262     {
2263       if (context & CTX_NONE)
2264         fprintf (stderr, " CTX_NONE");
2265       if (context & CTX_LETTER)
2266         fprintf (stderr, " CTX_LETTER");
2267       if (context & CTX_NEWLINE)
2268         fprintf (stderr, " CTX_NEWLINE");
2269     }
2270   else
2271     fprintf (stderr, " CTX_ANY");
2272   fprintf (stderr, "\n");
2273 #endif
2274
2275   for (state_num j = 0; j < s->nelem; j++)
2276     {
2277       int c = d->constraints[s->elems[j].index];
2278
2279       if (c != 0)
2280         {
2281           if (succeeds_in_context (c, context, CTX_ANY))
2282             constraint |= c;
2283         }
2284       else if (d->tokens[s->elems[j].index] == BACKREF)
2285         constraint = NO_CONSTRAINT;
2286     }
2287
2288
2289   /* Create a new state.  */
2290   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2291                              sizeof *d->states);
2292   d->states[i].hash = hash;
2293   alloc_position_set (&d->states[i].elems, s->nelem);
2294   copy (s, &d->states[i].elems);
2295   d->states[i].context = context;
2296   d->states[i].constraint = constraint;
2297   d->states[i].mbps.nelem = 0;
2298   d->states[i].mbps.elems = NULL;
2299   d->states[i].mb_trindex = -1;
2300
2301   ++d->sindex;
2302
2303   return i;
2304 }
2305
2306 /* Find the epsilon closure of D's set of positions.  If any position of the set
2307    contains a symbol that matches the empty string in some context, replace
2308    that position with the elements of its follow labeled with an appropriate
2309    constraint.  Repeat exhaustively until no funny positions are left.
2310    S->elems must be large enough to hold the result.  BACKWARD is D's
2311    backward set; use and update it too.  */
2312 static void
2313 epsclosure (struct dfa const *d, position_set *backward)
2314 {
2315   position_set tmp;
2316   alloc_position_set (&tmp, d->nleaves);
2317   for (idx_t i = 0; i < d->tindex; i++)
2318     if (0 < d->follows[i].nelem)
2319       {
2320         unsigned int constraint;
2321         switch (d->tokens[i])
2322           {
2323           default:
2324             continue;
2325
2326           case BEGLINE:
2327             constraint = BEGLINE_CONSTRAINT;
2328             break;
2329           case ENDLINE:
2330             constraint = ENDLINE_CONSTRAINT;
2331             break;
2332           case BEGWORD:
2333             constraint = BEGWORD_CONSTRAINT;
2334             break;
2335           case ENDWORD:
2336             constraint = ENDWORD_CONSTRAINT;
2337             break;
2338           case LIMWORD:
2339             constraint = LIMWORD_CONSTRAINT;
2340             break;
2341           case NOTLIMWORD:
2342             constraint = NOTLIMWORD_CONSTRAINT;
2343             break;
2344           case EMPTY:
2345             constraint = NO_CONSTRAINT;
2346             break;
2347           }
2348
2349         delete (i, &d->follows[i]);
2350
2351         for (idx_t j = 0; j < backward[i].nelem; j++)
2352           replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2353                    constraint, &tmp);
2354         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2355           replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2356                    NO_CONSTRAINT, &tmp);
2357       }
2358   free (tmp.elems);
2359 }
2360
2361 /* Returns the set of contexts for which there is at least one
2362    character included in C.  */
2363
2364 static int
2365 charclass_context (struct dfa const *dfa, charclass const *c)
2366 {
2367   int context = 0;
2368
2369   for (int j = 0; j < CHARCLASS_WORDS; j++)
2370     {
2371       if (c->w[j] & dfa->syntax.newline.w[j])
2372         context |= CTX_NEWLINE;
2373       if (c->w[j] & dfa->syntax.letters.w[j])
2374         context |= CTX_LETTER;
2375       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2376         context |= CTX_NONE;
2377     }
2378
2379   return context;
2380 }
2381
2382 /* Returns the contexts on which the position set S depends.  Each context
2383    in the set of returned contexts (let's call it SC) may have a different
2384    follow set than other contexts in SC, and also different from the
2385    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2386    in the complement set will have the same follow set.  */
2387
2388 static int _GL_ATTRIBUTE_PURE
2389 state_separate_contexts (struct dfa *d, position_set const *s)
2390 {
2391   int separate_contexts = 0;
2392
2393   for (idx_t j = 0; j < s->nelem; j++)
2394     separate_contexts |= d->separates[s->elems[j].index];
2395
2396   return separate_contexts;
2397 }
2398
2399 enum
2400 {
2401   /* Single token is repeated.  It is distinguished from non-repeated.  */
2402   OPT_REPEAT = (1 << 0),
2403
2404   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2405      node is not merged.  */
2406   OPT_LPAREN = (1 << 1),
2407
2408   /* Multiple branches are joined.  The node is not merged.  */
2409   OPT_RPAREN = (1 << 2),
2410
2411   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2412      flag is turned on. */
2413   OPT_WALKED = (1 << 3),
2414
2415   /* The node is queued.  The node is not queued again.  */
2416   OPT_QUEUED = (1 << 4)
2417 };
2418
2419 static void
2420 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2421                  position_set *merged)
2422 {
2423   position_set *follows = d->follows;
2424   idx_t nelem = 0;
2425
2426   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2427     {
2428       idx_t sindex = follows[tindex].elems[i].index;
2429
2430       /* Skip the node as pruned in future.  */
2431       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2432       if (iconstraint == 0)
2433         continue;
2434
2435       if (d->tokens[follows[tindex].elems[i].index] <= END)
2436         {
2437           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2438           continue;
2439         }
2440
2441       if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2442         {
2443           idx_t j;
2444
2445           for (j = 0; j < nelem; j++)
2446             {
2447               idx_t dindex = follows[tindex].elems[j].index;
2448
2449               if (dindex == tindex)
2450                 continue;
2451
2452               if (follows[tindex].elems[j].constraint != iconstraint)
2453                 continue;
2454
2455               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2456                 continue;
2457
2458               if (d->tokens[sindex] != d->tokens[dindex])
2459                 continue;
2460
2461               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2462                 continue;
2463
2464               if (flags[sindex] & OPT_REPEAT)
2465                 delete (sindex, &follows[sindex]);
2466
2467               merge2 (&follows[dindex], &follows[sindex], merged);
2468
2469               break;
2470             }
2471
2472           if (j < nelem)
2473             continue;
2474         }
2475
2476       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2477       flags[sindex] |= OPT_QUEUED;
2478     }
2479
2480   follows[tindex].nelem = nelem;
2481 }
2482
2483 static int
2484 compare (const void *a, const void *b)
2485 {
2486   position const *p = a, *q = b;
2487   return (p->index > q->index) - (p->index < q->index);
2488 }
2489
2490 static void
2491 reorder_tokens (struct dfa *d)
2492 {
2493   idx_t nleaves = 0;
2494   ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2495   map[0] = nleaves++;
2496   for (idx_t i = 1; i < d->tindex; i++)
2497     map[i] = -1;
2498
2499   token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2500   position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2501   int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2502   char *multibyte_prop = (d->localeinfo.multibyte
2503                           ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2504                           : NULL);
2505
2506   for (idx_t i = 0; i < d->tindex; i++)
2507     {
2508       if (map[i] < 0)
2509         {
2510           free (d->follows[i].elems);
2511           d->follows[i].elems = NULL;
2512           d->follows[i].nelem = 0;
2513           continue;
2514         }
2515
2516       tokens[map[i]] = d->tokens[i];
2517       follows[map[i]] = d->follows[i];
2518       constraints[map[i]] = d->constraints[i];
2519
2520       if (multibyte_prop != NULL)
2521         multibyte_prop[map[i]] = d->multibyte_prop[i];
2522
2523       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2524         {
2525           if (map[d->follows[i].elems[j].index] == -1)
2526             map[d->follows[i].elems[j].index] = nleaves++;
2527
2528           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2529         }
2530
2531       qsort (d->follows[i].elems, d->follows[i].nelem,
2532              sizeof *d->follows[i].elems, compare);
2533     }
2534
2535   for (idx_t i = 0; i < nleaves; i++)
2536     {
2537       d->tokens[i] = tokens[i];
2538       d->follows[i] = follows[i];
2539       d->constraints[i] = constraints[i];
2540
2541       if (multibyte_prop != NULL)
2542         d->multibyte_prop[i] = multibyte_prop[i];
2543     }
2544
2545   d->tindex = d->nleaves = nleaves;
2546
2547   free (tokens);
2548   free (follows);
2549   free (constraints);
2550   free (multibyte_prop);
2551   free (map);
2552 }
2553
2554 static void
2555 dfaoptimize (struct dfa *d)
2556 {
2557   char *flags = xzalloc (d->tindex);
2558
2559   for (idx_t i = 0; i < d->tindex; i++)
2560     {
2561       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2562         {
2563           if (d->follows[i].elems[j].index == i)
2564             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2565           else if (d->follows[i].elems[j].index < i)
2566             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2567           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2568             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2569           else
2570             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2571         }
2572     }
2573
2574   flags[0] |= OPT_QUEUED;
2575
2576   position_set merged0;
2577   position_set *merged = &merged0;
2578   alloc_position_set (merged, d->nleaves);
2579
2580   d->constraints = xcalloc (d->tindex, sizeof *d->constraints);
2581
2582   for (idx_t i = 0; i < d->tindex; i++)
2583     if (flags[i] & OPT_QUEUED)
2584       merge_nfa_state (d, i, flags, merged);
2585
2586   reorder_tokens (d);
2587
2588   free (merged->elems);
2589   free (flags);
2590 }
2591
2592 /* Perform bottom-up analysis on the parse tree, computing various functions.
2593    Note that at this point, we're pretending constructs like \< are real
2594    characters rather than constraints on what can follow them.
2595
2596    Nullable:  A node is nullable if it is at the root of a regexp that can
2597    match the empty string.
2598    *  EMPTY leaves are nullable.
2599    * No other leaf is nullable.
2600    * A QMARK or STAR node is nullable.
2601    * A PLUS node is nullable if its argument is nullable.
2602    * A CAT node is nullable if both its arguments are nullable.
2603    * An OR node is nullable if either argument is nullable.
2604
2605    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2606    that could correspond to the first character of a string matching the
2607    regexp rooted at the given node.
2608    * EMPTY leaves have empty firstpos.
2609    * The firstpos of a nonempty leaf is that leaf itself.
2610    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2611      argument.
2612    * The firstpos of a CAT node is the firstpos of the left argument, union
2613      the firstpos of the right if the left argument is nullable.
2614    * The firstpos of an OR node is the union of firstpos of each argument.
2615
2616    Lastpos:  The lastpos of a node is the set of positions that could
2617    correspond to the last character of a string matching the regexp at
2618    the given node.
2619    * EMPTY leaves have empty lastpos.
2620    * The lastpos of a nonempty leaf is that leaf itself.
2621    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2622      argument.
2623    * The lastpos of a CAT node is the lastpos of its right argument, union
2624      the lastpos of the left if the right argument is nullable.
2625    * The lastpos of an OR node is the union of the lastpos of each argument.
2626
2627    Follow:  The follow of a position is the set of positions that could
2628    correspond to the character following a character matching the node in
2629    a string matching the regexp.  At this point we consider special symbols
2630    that match the empty string in some context to be just normal characters.
2631    Later, if we find that a special symbol is in a follow set, we will
2632    replace it with the elements of its follow, labeled with an appropriate
2633    constraint.
2634    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2635      the follow of every node in the lastpos.
2636    * Every node in the firstpos of the second argument of a CAT node is in
2637      the follow of every node in the lastpos of the first argument.
2638
2639    Because of the postfix representation of the parse tree, the depth-first
2640    analysis is conveniently done by a linear scan with the aid of a stack.
2641    Sets are stored as arrays of the elements, obeying a stack-like allocation
2642    scheme; the number of elements in each set deeper in the stack can be
2643    used to determine the address of a particular set's array.  */
2644 static void
2645 dfaanalyze (struct dfa *d, bool searchflag)
2646 {
2647   /* Array allocated to hold position sets.  */
2648   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2649   /* Firstpos and lastpos elements.  */
2650   position *firstpos = posalloc;
2651   position *lastpos = firstpos + d->nleaves;
2652   position pos;
2653   position_set tmp;
2654
2655   /* Stack for element counts and nullable flags.  */
2656   struct
2657   {
2658     /* Whether the entry is nullable.  */
2659     bool nullable;
2660
2661     /* Counts of firstpos and lastpos sets.  */
2662     idx_t nfirstpos;
2663     idx_t nlastpos;
2664   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2665
2666   position_set merged;          /* Result of merging sets.  */
2667
2668   addtok (d, CAT);
2669   idx_t tindex = d->tindex;
2670
2671 #ifdef DEBUG
2672   fprintf (stderr, "dfaanalyze:\n");
2673   for (idx_t i = 0; i < tindex; i++)
2674     {
2675       fprintf (stderr, " %td:", i);
2676       prtok (d->tokens[i]);
2677     }
2678   putc ('\n', stderr);
2679 #endif
2680
2681   d->searchflag = searchflag;
2682   alloc_position_set (&merged, d->nleaves);
2683   d->follows = xcalloc (tindex, sizeof *d->follows);
2684   position_set *backward
2685     = d->epsilon ? xcalloc (tindex, sizeof *backward) : NULL;
2686
2687   for (idx_t i = 0; i < tindex; i++)
2688     {
2689       switch (d->tokens[i])
2690         {
2691         case EMPTY:
2692           /* The empty set is nullable.  */
2693           stk->nullable = true;
2694
2695           /* The firstpos and lastpos of the empty leaf are both empty.  */
2696           stk->nfirstpos = stk->nlastpos = 0;
2697           stk++;
2698           break;
2699
2700         case STAR:
2701         case PLUS:
2702           /* Every element in the lastpos of the argument is in the backward
2703              set of every element in the firstpos.  */
2704           if (d->epsilon)
2705             {
2706               tmp.elems = lastpos - stk[-1].nlastpos;
2707               tmp.nelem = stk[-1].nlastpos;
2708               for (position *p = firstpos - stk[-1].nfirstpos;
2709                    p < firstpos; p++)
2710                 merge2 (&backward[p->index], &tmp, &merged);
2711             }
2712
2713           /* Every element in the firstpos of the argument is in the follow
2714              of every element in the lastpos.  */
2715           {
2716             tmp.elems = firstpos - stk[-1].nfirstpos;
2717             tmp.nelem = stk[-1].nfirstpos;
2718             for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2719               merge2 (&d->follows[p->index], &tmp, &merged);
2720           }
2721           FALLTHROUGH;
2722         case QMARK:
2723           /* A QMARK or STAR node is automatically nullable.  */
2724           if (d->tokens[i] != PLUS)
2725             stk[-1].nullable = true;
2726           break;
2727
2728         case CAT:
2729           /* Every element in the lastpos of the first argument is in
2730              the backward set of every element in the firstpos of the
2731              second argument.  */
2732           if (backward)
2733             {
2734               tmp.nelem = stk[-2].nlastpos;
2735               tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2736               for (position *p = firstpos - stk[-1].nfirstpos;
2737                    p < firstpos; p++)
2738                 merge2 (&backward[p->index], &tmp, &merged);
2739             }
2740
2741           /* Every element in the firstpos of the second argument is in the
2742              follow of every element in the lastpos of the first argument.  */
2743           {
2744             tmp.nelem = stk[-1].nfirstpos;
2745             tmp.elems = firstpos - stk[-1].nfirstpos;
2746             for (position *plim = lastpos - stk[-1].nlastpos,
2747                    *p = plim - stk[-2].nlastpos;
2748                  p < plim; p++)
2749               merge2 (&d->follows[p->index], &tmp, &merged);
2750           }
2751
2752           /* The firstpos of a CAT node is the firstpos of the first argument,
2753              union that of the second argument if the first is nullable.  */
2754           if (stk[-2].nullable)
2755             stk[-2].nfirstpos += stk[-1].nfirstpos;
2756           else
2757             firstpos -= stk[-1].nfirstpos;
2758
2759           /* The lastpos of a CAT node is the lastpos of the second argument,
2760              union that of the first argument if the second is nullable.  */
2761           if (stk[-1].nullable)
2762             stk[-2].nlastpos += stk[-1].nlastpos;
2763           else
2764             {
2765               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2766               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2767                 p[j] = p[j + stk[-2].nlastpos];
2768               lastpos -= stk[-2].nlastpos;
2769               stk[-2].nlastpos = stk[-1].nlastpos;
2770             }
2771
2772           /* A CAT node is nullable if both arguments are nullable.  */
2773           stk[-2].nullable &= stk[-1].nullable;
2774           stk--;
2775           break;
2776
2777         case OR:
2778           /* The firstpos is the union of the firstpos of each argument.  */
2779           stk[-2].nfirstpos += stk[-1].nfirstpos;
2780
2781           /* The lastpos is the union of the lastpos of each argument.  */
2782           stk[-2].nlastpos += stk[-1].nlastpos;
2783
2784           /* An OR node is nullable if either argument is nullable.  */
2785           stk[-2].nullable |= stk[-1].nullable;
2786           stk--;
2787           break;
2788
2789         default:
2790           /* Anything else is a nonempty position.  (Note that special
2791              constructs like \< are treated as nonempty strings here;
2792              an "epsilon closure" effectively makes them nullable later.
2793              Backreferences have to get a real position so we can detect
2794              transitions on them later.  But they are nullable.  */
2795           stk->nullable = d->tokens[i] == BACKREF;
2796
2797           /* This position is in its own firstpos and lastpos.  */
2798           stk->nfirstpos = stk->nlastpos = 1;
2799           stk++;
2800
2801           firstpos->index = lastpos->index = i;
2802           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2803           firstpos++, lastpos++;
2804
2805           break;
2806         }
2807 #ifdef DEBUG
2808       /* ... balance the above nonsyntactic #ifdef goo...  */
2809       fprintf (stderr, "node %td:", i);
2810       prtok (d->tokens[i]);
2811       putc ('\n', stderr);
2812       fprintf (stderr,
2813                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2814       fprintf (stderr, " firstpos:");
2815       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2816         {
2817           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2818           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2819         }
2820       fprintf (stderr, "\n lastpos:");
2821       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2822         {
2823           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2824           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2825         }
2826       putc ('\n', stderr);
2827 #endif
2828     }
2829
2830   if (backward)
2831     {
2832       /* For each follow set that is the follow set of a real position,
2833          replace it with its epsilon closure.  */
2834       epsclosure (d, backward);
2835
2836       for (idx_t i = 0; i < tindex; i++)
2837         free (backward[i].elems);
2838       free (backward);
2839     }
2840
2841   dfaoptimize (d);
2842
2843 #ifdef DEBUG
2844   for (idx_t i = 0; i < tindex; i++)
2845     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2846         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2847         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2848       {
2849         fprintf (stderr, "follows(%td:", i);
2850         prtok (d->tokens[i]);
2851         fprintf (stderr, "):");
2852         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2853           {
2854             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2855             prtok (d->tokens[d->follows[i].elems[j].index]);
2856           }
2857         putc ('\n', stderr);
2858       }
2859 #endif
2860
2861   pos.index = 0;
2862   pos.constraint = NO_CONSTRAINT;
2863
2864   alloc_position_set (&tmp, 1);
2865
2866   append (pos, &tmp);
2867
2868   d->separates = xcalloc (tindex, sizeof *d->separates);
2869
2870   for (idx_t i = 0; i < tindex; i++)
2871     {
2872       if (prev_newline_dependent (d->constraints[i]))
2873         d->separates[i] |= CTX_NEWLINE;
2874       if (prev_letter_dependent (d->constraints[i]))
2875         d->separates[i] |= CTX_LETTER;
2876
2877       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2878         {
2879           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2880             d->separates[i] |= CTX_NEWLINE;
2881           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2882             d->separates[i] |= CTX_LETTER;
2883         }
2884     }
2885
2886   /* Context wanted by some position.  */
2887   int separate_contexts = state_separate_contexts (d, &tmp);
2888
2889   /* Build the initial state.  */
2890   if (separate_contexts & CTX_NEWLINE)
2891     state_index (d, &tmp, CTX_NEWLINE);
2892   d->initstate_notbol = d->min_trcount
2893     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2894   if (separate_contexts & CTX_LETTER)
2895     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2896   d->min_trcount++;
2897   d->trcount = 0;
2898
2899   free (posalloc);
2900   free (stkalloc);
2901   free (merged.elems);
2902   free (tmp.elems);
2903 }
2904
2905 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2906 static void
2907 realloc_trans_if_necessary (struct dfa *d)
2908 {
2909   state_num oldalloc = d->tralloc;
2910   if (oldalloc < d->sindex)
2911     {
2912       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2913       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2914       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2915                            -1, sizeof *realtrans);
2916       realtrans[0] = realtrans[1] = NULL;
2917       d->trans = realtrans + 2;
2918       idx_t newalloc = d->tralloc = newalloc1 - 2;
2919       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2920       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2921       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2922       if (d->localeinfo.multibyte)
2923         {
2924           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2925           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2926           if (oldalloc == 0)
2927             realtrans[0] = realtrans[1] = NULL;
2928           d->mb_trans = realtrans + 2;
2929         }
2930       for (; oldalloc < newalloc; oldalloc++)
2931         {
2932           d->trans[oldalloc] = NULL;
2933           d->fails[oldalloc] = NULL;
2934           if (d->localeinfo.multibyte)
2935             d->mb_trans[oldalloc] = NULL;
2936         }
2937     }
2938 }
2939
2940 /*
2941    Calculate the transition table for a new state derived from state s
2942    for a compiled dfa d after input character uc, and return the new
2943    state number.
2944
2945    Do not worry about all possible input characters; calculate just the group
2946    of positions that match uc.  Label it with the set of characters that
2947    every position in the group matches (taking into account, if necessary,
2948    preceding context information of s).  Then find the union
2949    of these positions' follows, i.e., the set of positions of the
2950    new state.  For each character in the group's label, set the transition
2951    on this character to be to a state corresponding to the set's positions,
2952    and its associated backward context information, if necessary.
2953
2954    When building a searching matcher, include the positions of state
2955    0 in every state.
2956
2957    The group is constructed by building an equivalence-class
2958    partition of the positions of s.
2959
2960    For each position, find the set of characters C that it matches.  Eliminate
2961    any characters from C that fail on grounds of backward context.
2962
2963    Check whether the group's label L has nonempty
2964    intersection with C.  If L - C is nonempty, create a new group labeled
2965    L - C and having the same positions as the current group, and set L to
2966    the intersection of L and C.  Insert the position in the group, set
2967    C = C - L, and resume scanning.
2968
2969    If after comparing with every group there are characters remaining in C,
2970    create a new group labeled with the characters of C and insert this
2971    position in that group.  */
2972
2973 static state_num
2974 build_state (state_num s, struct dfa *d, unsigned char uc)
2975 {
2976   position_set follows;         /* Union of the follows for each
2977                                    position of the current state.  */
2978   position_set group;           /* Positions that match the input char.  */
2979   position_set tmp;             /* Temporary space for merging sets.  */
2980   state_num state;              /* New state.  */
2981   state_num state_newline;      /* New state on a newline transition.  */
2982   state_num state_letter;       /* New state on a letter transition.  */
2983
2984 #ifdef DEBUG
2985   fprintf (stderr, "build state %td\n", s);
2986 #endif
2987
2988   /* A pointer to the new transition table, and the table itself.  */
2989   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2990   state_num *trans = *ptrans;
2991
2992   if (!trans)
2993     {
2994       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2995          transition tables that can exist at once, other than for
2996          initial states.  Often-used transition tables are quickly
2997          rebuilt, whereas rarely-used ones are cleared away.  */
2998       if (MAX_TRCOUNT <= d->trcount)
2999         {
3000           for (state_num i = d->min_trcount; i < d->tralloc; i++)
3001             {
3002               free (d->trans[i]);
3003               free (d->fails[i]);
3004               d->trans[i] = d->fails[i] = NULL;
3005             }
3006           d->trcount = 0;
3007         }
3008
3009       d->trcount++;
3010       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3011
3012       /* Fill transition table with a default value which means that the
3013          transited state has not been calculated yet.  */
3014       for (int i = 0; i < NOTCHAR; i++)
3015         trans[i] = -2;
3016     }
3017
3018   /* Set up the success bits for this state.  */
3019   d->success[s] = 0;
3020   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3021     d->success[s] |= CTX_NEWLINE;
3022   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3023     d->success[s] |= CTX_LETTER;
3024   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3025     d->success[s] |= CTX_NONE;
3026
3027   alloc_position_set (&follows, d->nleaves);
3028
3029   /* Find the union of the follows of the positions of the group.
3030      This is a hideously inefficient loop.  Fix it someday.  */
3031   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3032     for (idx_t k = 0;
3033          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3034       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3035               &follows);
3036
3037   /* Positions that match the input char.  */
3038   alloc_position_set (&group, d->nleaves);
3039
3040   /* The group's label.  */
3041   charclass label;
3042   fillset (&label);
3043
3044   for (idx_t i = 0; i < follows.nelem; i++)
3045     {
3046       charclass matches;            /* Set of matching characters.  */
3047       position pos = follows.elems[i];
3048       bool matched = false;
3049       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3050         {
3051           zeroset (&matches);
3052           setbit (d->tokens[pos.index], &matches);
3053           if (d->tokens[pos.index] == uc)
3054             matched = true;
3055         }
3056       else if (d->tokens[pos.index] >= CSET)
3057         {
3058           matches = d->charclasses[d->tokens[pos.index] - CSET];
3059           if (tstbit (uc, &matches))
3060             matched = true;
3061         }
3062       else if (d->tokens[pos.index] == ANYCHAR)
3063         {
3064           matches = d->charclasses[d->canychar];
3065           if (tstbit (uc, &matches))
3066             matched = true;
3067
3068           /* ANYCHAR must match with a single character, so we must put
3069              it to D->states[s].mbps which contains the positions which
3070              can match with a single character not a byte.  If all
3071              positions which has ANYCHAR does not depend on context of
3072              next character, we put the follows instead of it to
3073              D->states[s].mbps to optimize.  */
3074           if (succeeds_in_context (pos.constraint, d->states[s].context,
3075                                    CTX_NONE))
3076             {
3077               if (d->states[s].mbps.nelem == 0)
3078                 alloc_position_set (&d->states[s].mbps, 1);
3079               insert (pos, &d->states[s].mbps);
3080             }
3081         }
3082       else
3083         continue;
3084
3085       /* Some characters may need to be eliminated from matches because
3086          they fail in the current context.  */
3087       if (pos.constraint != NO_CONSTRAINT)
3088         {
3089           if (!succeeds_in_context (pos.constraint,
3090                                     d->states[s].context, CTX_NEWLINE))
3091             for (int j = 0; j < CHARCLASS_WORDS; j++)
3092               matches.w[j] &= ~d->syntax.newline.w[j];
3093           if (!succeeds_in_context (pos.constraint,
3094                                     d->states[s].context, CTX_LETTER))
3095             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3096               matches.w[j] &= ~d->syntax.letters.w[j];
3097           if (!succeeds_in_context (pos.constraint,
3098                                     d->states[s].context, CTX_NONE))
3099             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3100               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3101
3102           /* If there are no characters left, there's no point in going on.  */
3103           if (emptyset (&matches))
3104             continue;
3105
3106           /* If we have reset the bit that made us declare "matched", reset
3107              that indicator, too.  This is required to avoid an infinite loop
3108              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3109           if (!tstbit (uc, &matches))
3110             matched = false;
3111         }
3112
3113 #ifdef DEBUG
3114       fprintf (stderr, " nextpos %td:", pos.index);
3115       prtok (d->tokens[pos.index]);
3116       fprintf (stderr, " of");
3117       for (unsigned j = 0; j < NOTCHAR; j++)
3118         if (tstbit (j, &matches))
3119           fprintf (stderr, " 0x%02x", j);
3120       fprintf (stderr, "\n");
3121 #endif
3122
3123       if (matched)
3124         {
3125           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3126             label.w[k] &= matches.w[k];
3127           append (pos, &group);
3128         }
3129       else
3130         {
3131           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3132             label.w[k] &= ~matches.w[k];
3133         }
3134     }
3135
3136   alloc_position_set (&tmp, d->nleaves);
3137
3138   if (group.nelem > 0)
3139     {
3140       /* If we are building a searching matcher, throw in the positions
3141          of state 0 as well, if possible.  */
3142       if (d->searchflag)
3143         {
3144           /* If a token in follows.elems is not 1st byte of a multibyte
3145              character, or the states of follows must accept the bytes
3146              which are not 1st byte of the multibyte character.
3147              Then, if a state of follows encounters a byte, it must not be
3148              a 1st byte of a multibyte character nor a single byte character.
3149              In this case, do not add state[0].follows to next state, because
3150              state[0] must accept 1st-byte.
3151
3152              For example, suppose <sb a> is a certain single byte character,
3153              <mb A> is a certain multibyte character, and the codepoint of
3154              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3155              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3156              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3157              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3158              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3159              not add state[0].  */
3160
3161           bool mergeit = !d->localeinfo.multibyte;
3162           if (!mergeit)
3163             {
3164               mergeit = true;
3165               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3166                 mergeit &= d->multibyte_prop[group.elems[j].index];
3167             }
3168           if (mergeit)
3169             merge2 (&group, &d->states[0].elems, &tmp);
3170         }
3171
3172       /* Find out if the new state will want any context information,
3173          by calculating possible contexts that the group can match,
3174          and separate contexts that the new state wants to know.  */
3175       int possible_contexts = charclass_context (d, &label);
3176       int separate_contexts = state_separate_contexts (d, &group);
3177
3178       /* Find the state(s) corresponding to the union of the follows.  */
3179       if (possible_contexts & ~separate_contexts)
3180         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3181       else
3182         state = -1;
3183       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3184         state_newline = state_index (d, &group, CTX_NEWLINE);
3185       else
3186         state_newline = state;
3187       if (separate_contexts & possible_contexts & CTX_LETTER)
3188         state_letter = state_index (d, &group, CTX_LETTER);
3189       else
3190         state_letter = state;
3191
3192       /* Reallocate now, to reallocate any newline transition properly.  */
3193       realloc_trans_if_necessary (d);
3194     }
3195
3196   /* If we are a searching matcher, the default transition is to a state
3197      containing the positions of state 0, otherwise the default transition
3198      is to fail miserably.  */
3199   else if (d->searchflag)
3200     {
3201       state_newline = 0;
3202       state_letter = d->min_trcount - 1;
3203       state = d->initstate_notbol;
3204     }
3205   else
3206     {
3207       state_newline = -1;
3208       state_letter = -1;
3209       state = -1;
3210     }
3211
3212   /* Set the transitions for each character in the label.  */
3213   for (int i = 0; i < NOTCHAR; i++)
3214     if (tstbit (i, &label))
3215       switch (d->syntax.sbit[i])
3216         {
3217         case CTX_NEWLINE:
3218           trans[i] = state_newline;
3219           break;
3220         case CTX_LETTER:
3221           trans[i] = state_letter;
3222           break;
3223         default:
3224           trans[i] = state;
3225           break;
3226         }
3227
3228 #ifdef DEBUG
3229   fprintf (stderr, "trans table %td", s);
3230   for (int i = 0; i < NOTCHAR; ++i)
3231     {
3232       if (!(i & 0xf))
3233         fprintf (stderr, "\n");
3234       fprintf (stderr, " %2td", trans[i]);
3235     }
3236   fprintf (stderr, "\n");
3237 #endif
3238
3239   free (group.elems);
3240   free (follows.elems);
3241   free (tmp.elems);
3242
3243   /* Keep the newline transition in a special place so we can use it as
3244      a sentinel.  */
3245   if (tstbit (d->syntax.eolbyte, &label))
3246     {
3247       d->newlines[s] = trans[d->syntax.eolbyte];
3248       trans[d->syntax.eolbyte] = -1;
3249     }
3250
3251   return trans[uc];
3252 }
3253
3254 /* Multibyte character handling sub-routines for dfaexec.  */
3255
3256 /* Consume a single byte and transit state from 's' to '*next_state'.
3257    This function is almost same as the state transition routin in dfaexec.
3258    But state transition is done just once, otherwise matching succeed or
3259    reach the end of the buffer.  */
3260 static state_num
3261 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3262 {
3263   state_num *t;
3264
3265   if (d->trans[s])
3266     t = d->trans[s];
3267   else if (d->fails[s])
3268     t = d->fails[s];
3269   else
3270     {
3271       build_state (s, d, **pp);
3272       if (d->trans[s])
3273         t = d->trans[s];
3274       else
3275         {
3276           t = d->fails[s];
3277           assert (t);
3278         }
3279     }
3280
3281   if (t[**pp] == -2)
3282     build_state (s, d, **pp);
3283
3284   return t[*(*pp)++];
3285 }
3286
3287 /* Transit state from s, then return new state and update the pointer of
3288    the buffer.  This function is for a period operator which can match a
3289    multi-byte character.  */
3290 static state_num
3291 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3292                unsigned char const *end)
3293 {
3294   wint_t wc;
3295
3296   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3297
3298   /* This state has some operators which can match a multibyte character.  */
3299   d->mb_follows.nelem = 0;
3300
3301   /* Calculate the state which can be reached from the state 's' by
3302      consuming 'mbclen' single bytes from the buffer.  */
3303   state_num s1 = s;
3304   int mbci;
3305   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3306     s = transit_state_singlebyte (d, s, pp);
3307   *pp += mbclen - mbci;
3308
3309   if (wc == WEOF)
3310     {
3311       /* It is an invalid character, so ANYCHAR is not accepted.  */
3312       return s;
3313     }
3314
3315   /* If all positions which have ANYCHAR do not depend on the context
3316      of the next character, calculate the next state with
3317      pre-calculated follows and cache the result.  */
3318   if (d->states[s1].mb_trindex < 0)
3319     {
3320       if (MAX_TRCOUNT <= d->mb_trcount)
3321         {
3322           state_num s3;
3323           for (s3 = -1; s3 < d->tralloc; s3++)
3324             {
3325               free (d->mb_trans[s3]);
3326               d->mb_trans[s3] = NULL;
3327             }
3328
3329           for (state_num i = 0; i < d->sindex; i++)
3330             d->states[i].mb_trindex = -1;
3331           d->mb_trcount = 0;
3332         }
3333       d->states[s1].mb_trindex = d->mb_trcount++;
3334     }
3335
3336   if (! d->mb_trans[s])
3337     {
3338       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3339       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3340       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3341       for (int i = 0; i < MAX_TRCOUNT; i++)
3342         d->mb_trans[s][i] = -1;
3343     }
3344   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3345     return d->mb_trans[s][d->states[s1].mb_trindex];
3346
3347   if (s == -1)
3348     copy (&d->states[s1].mbps, &d->mb_follows);
3349   else
3350     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3351
3352   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3353   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3354   realloc_trans_if_necessary (d);
3355
3356   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3357
3358   return s2;
3359 }
3360
3361 /* The initial state may encounter a byte which is not a single byte character
3362    nor the first byte of a multibyte character.  But it is incorrect for the
3363    initial state to accept such a byte.  For example, in Shift JIS the regular
3364    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3365    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3366    that are not a single byte character nor the first byte of a multibyte
3367    character.
3368
3369    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3370    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3371    the result is greater than P, set *WCP to the final wide character
3372    processed, or to WEOF if no wide character is processed.  Otherwise,
3373    if WCP is non-NULL, *WCP may or may not be updated.
3374
3375    Both P and MBP must be no larger than END.  */
3376 static unsigned char const *
3377 skip_remains_mb (struct dfa *d, unsigned char const *p,
3378                  unsigned char const *mbp, char const *end)
3379 {
3380   if (d->syntax.never_trail[*p])
3381     return p;
3382   while (mbp < p)
3383     {
3384       wint_t wc;
3385       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3386                            end - (char const *) mbp, d);
3387     }
3388   return mbp;
3389 }
3390
3391 /* Search through a buffer looking for a match to the struct dfa *D.
3392    Find the first occurrence of a string matching the regexp in the
3393    buffer, and the shortest possible version thereof.  Return a pointer to
3394    the first character after the match, or NULL if none is found.  BEGIN
3395    points to the beginning of the buffer, and END points to the first byte
3396    after its end.  Note however that we store a sentinel byte (usually
3397    newline) in *END, so the actual buffer must be one byte longer.
3398    When ALLOW_NL, newlines may appear in the matching string.
3399    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3400    If MULTIBYTE, the input consists of multibyte characters and/or
3401    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3402    Here is the list of features that make this DFA matcher punt:
3403     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3404     - [^...] in non-simple locale
3405     - [[=foo=]] or [[.foo.]]
3406     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3407     - back-reference: (.)\1
3408     - word-delimiter in multibyte locale: \<, \>, \b, \B
3409    See struct localeinfo.simple for the definition of "simple locale".  */
3410
3411 static inline char *
3412 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3413               ptrdiff_t *count, bool multibyte)
3414 {
3415   if (MAX_TRCOUNT <= d->sindex)
3416     {
3417       for (state_num s = d->min_trcount; s < d->sindex; s++)
3418         {
3419           free (d->states[s].elems.elems);
3420           free (d->states[s].mbps.elems);
3421         }
3422       d->sindex = d->min_trcount;
3423
3424       if (d->trans)
3425         {
3426           for (state_num s = 0; s < d->tralloc; s++)
3427             {
3428               free (d->trans[s]);
3429               free (d->fails[s]);
3430               d->trans[s] = d->fails[s] = NULL;
3431             }
3432           d->trcount = 0;
3433         }
3434
3435       if (d->localeinfo.multibyte && d->mb_trans)
3436         {
3437           for (state_num s = -1; s < d->tralloc; s++)
3438             {
3439               free (d->mb_trans[s]);
3440               d->mb_trans[s] = NULL;
3441             }
3442           for (state_num s = 0; s < d->min_trcount; s++)
3443             d->states[s].mb_trindex = -1;
3444           d->mb_trcount = 0;
3445         }
3446     }
3447
3448   if (!d->tralloc)
3449     realloc_trans_if_necessary (d);
3450
3451   /* Current state.  */
3452   state_num s = 0, s1 = 0;
3453
3454   /* Current input character.  */
3455   unsigned char const *p = (unsigned char const *) begin;
3456   unsigned char const *mbp = p;
3457
3458   /* Copy of d->trans so it can be optimized into a register.  */
3459   state_num **trans = d->trans;
3460   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3461   unsigned char saved_end = *(unsigned char *) end;
3462   *end = eol;
3463
3464   if (multibyte)
3465     {
3466       memset (&d->mbs, 0, sizeof d->mbs);
3467       if (d->mb_follows.alloc == 0)
3468         alloc_position_set (&d->mb_follows, d->nleaves);
3469     }
3470
3471   idx_t nlcount = 0;
3472   for (;;)
3473     {
3474       state_num *t;
3475       while ((t = trans[s]) != NULL)
3476         {
3477           if (s < d->min_trcount)
3478             {
3479               if (!multibyte || d->states[s].mbps.nelem == 0)
3480                 {
3481                   while (t[*p] == s)
3482                     p++;
3483                 }
3484               if (multibyte)
3485                 p = mbp = skip_remains_mb (d, p, mbp, end);
3486             }
3487
3488           if (multibyte)
3489             {
3490               s1 = s;
3491
3492               if (d->states[s].mbps.nelem == 0
3493                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3494                 {
3495                   /* If an input character does not match ANYCHAR, do it
3496                      like a single-byte character.  */
3497                   s = t[*p++];
3498                 }
3499               else
3500                 {
3501                   s = transit_state (d, s, &p, (unsigned char *) end);
3502                   mbp = p;
3503                   trans = d->trans;
3504                 }
3505             }
3506           else
3507             {
3508               s1 = t[*p++];
3509               t = trans[s1];
3510               if (! t)
3511                 {
3512                   state_num tmp = s;
3513                   s = s1;
3514                   s1 = tmp;     /* swap */
3515                   break;
3516                 }
3517               if (s < d->min_trcount)
3518                 {
3519                   while (t[*p] == s1)
3520                     p++;
3521                 }
3522               s = t[*p++];
3523             }
3524         }
3525
3526       if (s < 0)
3527         {
3528           if (s == -2)
3529             {
3530               s = build_state (s1, d, p[-1]);
3531               trans = d->trans;
3532             }
3533           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3534             {
3535               /* The previous character was a newline.  Count it, and skip
3536                  checking of multibyte character boundary until here.  */
3537               nlcount++;
3538               mbp = p;
3539
3540               s = (allow_nl ? d->newlines[s1]
3541                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3542                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3543                    : d->initstate_notbol);
3544             }
3545           else
3546             {
3547               p = NULL;
3548               goto done;
3549             }
3550         }
3551       else if (d->fails[s])
3552         {
3553           if ((d->success[s] & d->syntax.sbit[*p])
3554               || ((char *) p == end
3555                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3556                                          d)))
3557             goto done;
3558
3559           if (multibyte && s < d->min_trcount)
3560             p = mbp = skip_remains_mb (d, p, mbp, end);
3561
3562           s1 = s;
3563           if (!multibyte || d->states[s].mbps.nelem == 0
3564               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3565             {
3566               /* If a input character does not match ANYCHAR, do it
3567                  like a single-byte character.  */
3568               s = d->fails[s][*p++];
3569             }
3570           else
3571             {
3572               s = transit_state (d, s, &p, (unsigned char *) end);
3573               mbp = p;
3574               trans = d->trans;
3575             }
3576         }
3577       else
3578         {
3579           build_state (s, d, p[0]);
3580           trans = d->trans;
3581         }
3582     }
3583
3584  done:
3585   if (count)
3586     *count += nlcount;
3587   *end = saved_end;
3588   return (char *) p;
3589 }
3590
3591 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3592    This is for performance, as dfaexec_main is an inline function.  */
3593
3594 static char *
3595 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3596             bool allow_nl, ptrdiff_t *count, bool *backref)
3597 {
3598   return dfaexec_main (d, begin, end, allow_nl, count, true);
3599 }
3600
3601 static char *
3602 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3603             bool allow_nl, ptrdiff_t *count, bool *backref)
3604 {
3605   return dfaexec_main (d, begin, end, allow_nl, count, false);
3606 }
3607
3608 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3609    any regexp that uses a construct not supported by this code.  */
3610 static char *
3611 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3612               bool allow_nl, ptrdiff_t *count, bool *backref)
3613 {
3614   *backref = true;
3615   return (char *) begin;
3616 }
3617
3618 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3619    but faster and set *BACKREF if the DFA code does not support this
3620    regexp usage.  */
3621
3622 char *
3623 dfaexec (struct dfa *d, char const *begin, char *end,
3624          bool allow_nl, ptrdiff_t *count, bool *backref)
3625 {
3626   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3627 }
3628
3629 struct dfa *
3630 dfasuperset (struct dfa const *d)
3631 {
3632   return d->superset;
3633 }
3634
3635 bool
3636 dfaisfast (struct dfa const *d)
3637 {
3638   return d->fast;
3639 }
3640
3641 static void
3642 free_mbdata (struct dfa *d)
3643 {
3644   free (d->multibyte_prop);
3645   free (d->lex.brack.chars);
3646   free (d->mb_follows.elems);
3647
3648   if (d->mb_trans)
3649     {
3650       state_num s;
3651       for (s = -1; s < d->tralloc; s++)
3652         free (d->mb_trans[s]);
3653       free (d->mb_trans - 2);
3654     }
3655 }
3656
3657 /* Return true if every construct in D is supported by this DFA matcher.  */
3658 bool
3659 dfasupported (struct dfa const *d)
3660 {
3661   for (idx_t i = 0; i < d->tindex; i++)
3662     {
3663       switch (d->tokens[i])
3664         {
3665         case BEGWORD:
3666         case ENDWORD:
3667         case LIMWORD:
3668         case NOTLIMWORD:
3669           if (!d->localeinfo.multibyte)
3670             continue;
3671           FALLTHROUGH;
3672         case BACKREF:
3673         case MBCSET:
3674           return false;
3675         }
3676     }
3677   return true;
3678 }
3679
3680 /* Disable use of the superset DFA if it is not likely to help
3681    performance.  */
3682 static void
3683 maybe_disable_superset_dfa (struct dfa *d)
3684 {
3685   if (!d->localeinfo.using_utf8)
3686     return;
3687
3688   bool have_backref = false;
3689   for (idx_t i = 0; i < d->tindex; i++)
3690     {
3691       switch (d->tokens[i])
3692         {
3693         case ANYCHAR:
3694           /* Lowered.  */
3695           abort ();
3696         case BACKREF:
3697           have_backref = true;
3698           break;
3699         case MBCSET:
3700           /* Requires multi-byte algorithm.  */
3701           return;
3702         default:
3703           break;
3704         }
3705     }
3706
3707   if (!have_backref && d->superset)
3708     {
3709       /* The superset DFA is not likely to be much faster, so remove it.  */
3710       dfafree (d->superset);
3711       free (d->superset);
3712       d->superset = NULL;
3713     }
3714
3715   free_mbdata (d);
3716   d->localeinfo.multibyte = false;
3717   d->dfaexec = dfaexec_sb;
3718   d->fast = true;
3719 }
3720
3721 static void
3722 dfassbuild (struct dfa *d)
3723 {
3724   struct dfa *sup = dfaalloc ();
3725
3726   *sup = *d;
3727   sup->localeinfo.multibyte = false;
3728   sup->dfaexec = dfaexec_sb;
3729   sup->multibyte_prop = NULL;
3730   sup->superset = NULL;
3731   sup->states = NULL;
3732   sup->sindex = 0;
3733   sup->constraints = NULL;
3734   sup->separates = NULL;
3735   sup->follows = NULL;
3736   sup->tralloc = 0;
3737   sup->trans = NULL;
3738   sup->fails = NULL;
3739   sup->success = NULL;
3740   sup->newlines = NULL;
3741
3742   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3743   if (d->cindex)
3744     {
3745       memcpy (sup->charclasses, d->charclasses,
3746               d->cindex * sizeof *sup->charclasses);
3747     }
3748
3749   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3750   sup->talloc = d->tindex * 2;
3751
3752   bool have_achar = false;
3753   bool have_nchar = false;
3754   idx_t j;
3755   for (idx_t i = j = 0; i < d->tindex; i++)
3756     {
3757       switch (d->tokens[i])
3758         {
3759         case ANYCHAR:
3760         case MBCSET:
3761         case BACKREF:
3762           {
3763             charclass ccl;
3764             fillset (&ccl);
3765             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3766             sup->tokens[j++] = STAR;
3767             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3768                 || d->tokens[i + 1] == PLUS)
3769               i++;
3770             have_achar = true;
3771           }
3772           break;
3773         case BEGWORD:
3774         case ENDWORD:
3775         case LIMWORD:
3776         case NOTLIMWORD:
3777           if (d->localeinfo.multibyte)
3778             {
3779               /* These constraints aren't supported in a multibyte locale.
3780                  Ignore them in the superset DFA.  */
3781               sup->tokens[j++] = EMPTY;
3782               break;
3783             }
3784           FALLTHROUGH;
3785         default:
3786           sup->tokens[j++] = d->tokens[i];
3787           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3788               || d->tokens[i] >= CSET)
3789             have_nchar = true;
3790           break;
3791         }
3792     }
3793   sup->tindex = j;
3794
3795   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3796     d->superset = sup;
3797   else
3798     {
3799       dfafree (sup);
3800       free (sup);
3801     }
3802 }
3803
3804 /* Parse a string S of length LEN into D (but skip this step if S is null).
3805    Then analyze D and build a matcher for it.
3806    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3807 void
3808 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3809 {
3810   if (s != NULL)
3811     dfaparse (s, len, d);
3812
3813   dfassbuild (d);
3814
3815   if (dfasupported (d))
3816     {
3817       maybe_disable_superset_dfa (d);
3818       dfaanalyze (d, searchflag);
3819     }
3820   else
3821     {
3822       d->dfaexec = dfaexec_noop;
3823     }
3824
3825   if (d->superset)
3826     {
3827       d->fast = true;
3828       dfaanalyze (d->superset, searchflag);
3829     }
3830 }
3831
3832 /* Free the storage held by the components of a dfa.  */
3833 void
3834 dfafree (struct dfa *d)
3835 {
3836   free (d->charclasses);
3837   free (d->tokens);
3838
3839   if (d->localeinfo.multibyte)
3840     free_mbdata (d);
3841
3842   free (d->constraints);
3843   free (d->separates);
3844
3845   for (idx_t i = 0; i < d->sindex; i++)
3846     {
3847       free (d->states[i].elems.elems);
3848       free (d->states[i].mbps.elems);
3849     }
3850   free (d->states);
3851
3852   if (d->follows)
3853     {
3854       for (idx_t i = 0; i < d->tindex; i++)
3855         free (d->follows[i].elems);
3856       free (d->follows);
3857     }
3858
3859   if (d->trans)
3860     {
3861       for (idx_t i = 0; i < d->tralloc; i++)
3862         {
3863           free (d->trans[i]);
3864           free (d->fails[i]);
3865         }
3866
3867       free (d->trans - 2);
3868       free (d->fails);
3869       free (d->newlines);
3870       free (d->success);
3871     }
3872
3873   if (d->superset)
3874     {
3875       dfafree (d->superset);
3876       free (d->superset);
3877     }
3878 }
3879
3880 /* Having found the postfix representation of the regular expression,
3881    try to find a long sequence of characters that must appear in any line
3882    containing the r.e.
3883    Finding a "longest" sequence is beyond the scope here;
3884    we take an easy way out and hope for the best.
3885    (Take "(ab|a)b"--please.)
3886
3887    We do a bottom-up calculation of sequences of characters that must appear
3888    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3889    representation:
3890         sequences that must appear at the left of the match ("left")
3891         sequences that must appear at the right of the match ("right")
3892         lists of sequences that must appear somewhere in the match ("in")
3893         sequences that must constitute the match ("is")
3894
3895    When we get to the root of the tree, we use one of the longest of its
3896    calculated "in" sequences as our answer.
3897
3898    The sequences calculated for the various types of node (in pseudo ANSI c)
3899    are shown below.  "p" is the operand of unary operators (and the left-hand
3900    operand of binary operators); "q" is the right-hand operand of binary
3901    operators.
3902
3903    "ZERO" means "a zero-length sequence" below.
3904
3905         Type    left            right           is              in
3906         ----    ----            -----           --              --
3907         char c  # c             # c             # c             # c
3908
3909         ANYCHAR ZERO            ZERO            ZERO            ZERO
3910
3911         MBCSET  ZERO            ZERO            ZERO            ZERO
3912
3913         CSET    ZERO            ZERO            ZERO            ZERO
3914
3915         STAR    ZERO            ZERO            ZERO            ZERO
3916
3917         QMARK   ZERO            ZERO            ZERO            ZERO
3918
3919         PLUS    p->left         p->right        ZERO            p->in
3920
3921         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3922                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3923                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3924                                                 ZERO
3925
3926         OR      longest common  longest common  (do p->is and substrings common
3927                 leading         trailing        to q->is have same p->in and
3928                 (sub)sequence   (sub)sequence   q->in length and content) ?
3929                 of p->left      of p->right
3930                 and q->left     and q->right    p->is : NULL
3931
3932    If there's anything else we recognize in the tree, all four sequences get set
3933    to zero-length sequences.  If there's something we don't recognize in the
3934    tree, we just return a zero-length sequence.
3935
3936    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3937    'aaa')?
3938
3939    And ... is it here or someplace that we might ponder "optimizations" such as
3940         egrep 'psi|epsilon'     ->      egrep 'psi'
3941         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3942                                         (Yes, we now find "epsi" as a "string
3943                                         that must occur", but we might also
3944                                         simplify the *entire* r.e. being sought)
3945         grep '[c]'              ->      grep 'c'
3946         grep '(ab|a)b'          ->      grep 'ab'
3947         grep 'ab*'              ->      grep 'a'
3948         grep 'a*b'              ->      grep 'b'
3949
3950    There are several issues:
3951
3952    Is optimization easy (enough)?
3953
3954    Does optimization actually accomplish anything,
3955    or is the automaton you get from "psi|epsilon" (for example)
3956    the same as the one you get from "psi" (for example)?
3957
3958    Are optimizable r.e.'s likely to be used in real-life situations
3959    (something like 'ab*' is probably unlikely; something like is
3960    'psi|epsilon' is likelier)?  */
3961
3962 static char *
3963 icatalloc (char *old, char const *new)
3964 {
3965   idx_t newsize = strlen (new);
3966   if (newsize == 0)
3967     return old;
3968   idx_t oldsize = strlen (old);
3969   char *result = xrealloc (old, oldsize + newsize + 1);
3970   memcpy (result + oldsize, new, newsize + 1);
3971   return result;
3972 }
3973
3974 static void
3975 freelist (char **cpp)
3976 {
3977   while (*cpp)
3978     free (*cpp++);
3979 }
3980
3981 static char **
3982 enlist (char **cpp, char *new, idx_t len)
3983 {
3984   new = memcpy (xmalloc (len + 1), new, len);
3985   new[len] = '\0';
3986   /* Is there already something in the list that's new (or longer)?  */
3987   idx_t i;
3988   for (i = 0; cpp[i] != NULL; i++)
3989     if (strstr (cpp[i], new) != NULL)
3990       {
3991         free (new);
3992         return cpp;
3993       }
3994   /* Eliminate any obsoleted strings.  */
3995   for (idx_t j = 0; cpp[j] != NULL; )
3996     if (strstr (new, cpp[j]) == NULL)
3997       ++j;
3998     else
3999       {
4000         free (cpp[j]);
4001         if (--i == j)
4002           break;
4003         cpp[j] = cpp[i];
4004         cpp[i] = NULL;
4005       }
4006   /* Add the new string.  */
4007   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
4008   cpp[i] = new;
4009   cpp[i + 1] = NULL;
4010   return cpp;
4011 }
4012
4013 /* Given pointers to two strings, return a pointer to an allocated
4014    list of their distinct common substrings.  */
4015 static char **
4016 comsubs (char *left, char const *right)
4017 {
4018   char **cpp = xzalloc (sizeof *cpp);
4019
4020   for (char *lcp = left; *lcp != '\0'; lcp++)
4021     {
4022       idx_t len = 0;
4023       char *rcp = strchr (right, *lcp);
4024       while (rcp != NULL)
4025         {
4026           idx_t i;
4027           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4028             continue;
4029           if (i > len)
4030             len = i;
4031           rcp = strchr (rcp + 1, *lcp);
4032         }
4033       if (len != 0)
4034         cpp = enlist (cpp, lcp, len);
4035     }
4036   return cpp;
4037 }
4038
4039 static char **
4040 addlists (char **old, char **new)
4041 {
4042   for (; *new; new++)
4043     old = enlist (old, *new, strlen (*new));
4044   return old;
4045 }
4046
4047 /* Given two lists of substrings, return a new list giving substrings
4048    common to both.  */
4049 static char **
4050 inboth (char **left, char **right)
4051 {
4052   char **both = xzalloc (sizeof *both);
4053
4054   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4055     {
4056       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4057         {
4058           char **temp = comsubs (left[lnum], right[rnum]);
4059           both = addlists (both, temp);
4060           freelist (temp);
4061           free (temp);
4062         }
4063     }
4064   return both;
4065 }
4066
4067 typedef struct must must;
4068
4069 struct must
4070 {
4071   char **in;
4072   char *left;
4073   char *right;
4074   char *is;
4075   bool begline;
4076   bool endline;
4077   must *prev;
4078 };
4079
4080 static must *
4081 allocmust (must *mp, idx_t size)
4082 {
4083   must *new_mp = xmalloc (sizeof *new_mp);
4084   new_mp->in = xzalloc (sizeof *new_mp->in);
4085   new_mp->left = xzalloc (size);
4086   new_mp->right = xzalloc (size);
4087   new_mp->is = xzalloc (size);
4088   new_mp->begline = false;
4089   new_mp->endline = false;
4090   new_mp->prev = mp;
4091   return new_mp;
4092 }
4093
4094 static void
4095 resetmust (must *mp)
4096 {
4097   freelist (mp->in);
4098   mp->in[0] = NULL;
4099   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4100   mp->begline = false;
4101   mp->endline = false;
4102 }
4103
4104 static void
4105 freemust (must *mp)
4106 {
4107   freelist (mp->in);
4108   free (mp->in);
4109   free (mp->left);
4110   free (mp->right);
4111   free (mp->is);
4112   free (mp);
4113 }
4114
4115 struct dfamust *
4116 dfamust (struct dfa const *d)
4117 {
4118   must *mp = NULL;
4119   char const *result = "";
4120   bool exact = false;
4121   bool begline = false;
4122   bool endline = false;
4123   bool need_begline = false;
4124   bool need_endline = false;
4125   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4126
4127   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4128     {
4129       token t = d->tokens[ri];
4130       switch (t)
4131         {
4132         case BEGLINE:
4133           mp = allocmust (mp, 2);
4134           mp->begline = true;
4135           need_begline = true;
4136           break;
4137         case ENDLINE:
4138           mp = allocmust (mp, 2);
4139           mp->endline = true;
4140           need_endline = true;
4141           break;
4142         case LPAREN:
4143         case RPAREN:
4144           assert (!"neither LPAREN nor RPAREN may appear here");
4145
4146         case EMPTY:
4147         case BEGWORD:
4148         case ENDWORD:
4149         case LIMWORD:
4150         case NOTLIMWORD:
4151         case BACKREF:
4152         case ANYCHAR:
4153         case MBCSET:
4154           mp = allocmust (mp, 2);
4155           break;
4156
4157         case STAR:
4158         case QMARK:
4159           resetmust (mp);
4160           break;
4161
4162         case OR:
4163           {
4164             char **new;
4165             must *rmp = mp;
4166             must *lmp = mp = mp->prev;
4167             idx_t j, ln, rn, n;
4168
4169             /* Guaranteed to be.  Unlikely, but ...  */
4170             if (streq (lmp->is, rmp->is))
4171               {
4172                 lmp->begline &= rmp->begline;
4173                 lmp->endline &= rmp->endline;
4174               }
4175             else
4176               {
4177                 lmp->is[0] = '\0';
4178                 lmp->begline = false;
4179                 lmp->endline = false;
4180               }
4181             /* Left side--easy */
4182             idx_t i = 0;
4183             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4184               ++i;
4185             lmp->left[i] = '\0';
4186             /* Right side */
4187             ln = strlen (lmp->right);
4188             rn = strlen (rmp->right);
4189             n = ln;
4190             if (n > rn)
4191               n = rn;
4192             for (i = 0; i < n; ++i)
4193               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4194                 break;
4195             for (j = 0; j < i; ++j)
4196               lmp->right[j] = lmp->right[(ln - i) + j];
4197             lmp->right[j] = '\0';
4198             new = inboth (lmp->in, rmp->in);
4199             freelist (lmp->in);
4200             free (lmp->in);
4201             lmp->in = new;
4202             freemust (rmp);
4203           }
4204           break;
4205
4206         case PLUS:
4207           mp->is[0] = '\0';
4208           break;
4209
4210         case END:
4211           assert (!mp->prev);
4212           for (idx_t i = 0; mp->in[i] != NULL; i++)
4213             if (strlen (mp->in[i]) > strlen (result))
4214               result = mp->in[i];
4215           if (streq (result, mp->is))
4216             {
4217               if ((!need_begline || mp->begline) && (!need_endline
4218                                                      || mp->endline))
4219                 exact = true;
4220               begline = mp->begline;
4221               endline = mp->endline;
4222             }
4223           goto done;
4224
4225         case CAT:
4226           {
4227             must *rmp = mp;
4228             must *lmp = mp = mp->prev;
4229
4230             /* In.  Everything in left, plus everything in
4231                right, plus concatenation of
4232                left's right and right's left.  */
4233             lmp->in = addlists (lmp->in, rmp->in);
4234             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4235               {
4236                 idx_t lrlen = strlen (lmp->right);
4237                 idx_t rllen = strlen (rmp->left);
4238                 char *tp = xmalloc (lrlen + rllen);
4239                 memcpy (tp, lmp->right, lrlen);
4240                 memcpy (tp + lrlen, rmp->left, rllen);
4241                 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4242                 free (tp);
4243               }
4244             /* Left-hand */
4245             if (lmp->is[0] != '\0')
4246               lmp->left = icatalloc (lmp->left, rmp->left);
4247             /* Right-hand */
4248             if (rmp->is[0] == '\0')
4249               lmp->right[0] = '\0';
4250             lmp->right = icatalloc (lmp->right, rmp->right);
4251             /* Guaranteed to be */
4252             if ((lmp->is[0] != '\0' || lmp->begline)
4253                 && (rmp->is[0] != '\0' || rmp->endline))
4254               {
4255                 lmp->is = icatalloc (lmp->is, rmp->is);
4256                 lmp->endline = rmp->endline;
4257               }
4258             else
4259               {
4260                 lmp->is[0] = '\0';
4261                 lmp->begline = false;
4262                 lmp->endline = false;
4263               }
4264             freemust (rmp);
4265           }
4266           break;
4267
4268         case '\0':
4269           /* Not on *my* shift.  */
4270           goto done;
4271
4272         default:
4273           if (CSET <= t)
4274             {
4275               /* If T is a singleton, or if case-folding in a unibyte
4276                  locale and T's members all case-fold to the same char,
4277                  convert T to one of its members.  Otherwise, do
4278                  nothing further with T.  */
4279               charclass *ccl = &d->charclasses[t - CSET];
4280               int j;
4281               for (j = 0; j < NOTCHAR; j++)
4282                 if (tstbit (j, ccl))
4283                   break;
4284               if (! (j < NOTCHAR))
4285                 {
4286                   mp = allocmust (mp, 2);
4287                   break;
4288                 }
4289               t = j;
4290               while (++j < NOTCHAR)
4291                 if (tstbit (j, ccl)
4292                     && ! (case_fold_unibyte
4293                           && toupper (j) == toupper (t)))
4294                   break;
4295               if (j < NOTCHAR)
4296                 {
4297                   mp = allocmust (mp, 2);
4298                   break;
4299                 }
4300             }
4301
4302           idx_t rj = ri + 2;
4303           if (d->tokens[ri + 1] == CAT)
4304             {
4305               for (; rj < d->tindex - 1; rj += 2)
4306                 {
4307                   if ((rj != ri && (d->tokens[rj] <= 0
4308                                     || NOTCHAR <= d->tokens[rj]))
4309                       || d->tokens[rj + 1] != CAT)
4310                     break;
4311                 }
4312             }
4313           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4314           mp->is[0] = mp->left[0] = mp->right[0]
4315             = case_fold_unibyte ? toupper (t) : t;
4316
4317           idx_t i;
4318           for (i = 1; ri + 2 < rj; i++)
4319             {
4320               ri += 2;
4321               t = d->tokens[ri];
4322               mp->is[i] = mp->left[i] = mp->right[i]
4323                 = case_fold_unibyte ? toupper (t) : t;
4324             }
4325           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4326           mp->in = enlist (mp->in, mp->is, i);
4327           break;
4328         }
4329     }
4330  done:;
4331
4332   struct dfamust *dm = NULL;
4333   if (*result)
4334     {
4335       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4336       dm->exact = exact;
4337       dm->begline = begline;
4338       dm->endline = endline;
4339       strcpy (dm->must, result);
4340     }
4341
4342   while (mp)
4343     {
4344       must *prev = mp->prev;
4345       freemust (mp);
4346       mp = prev;
4347     }
4348
4349   return dm;
4350 }
4351
4352 void
4353 dfamustfree (struct dfamust *dm)
4354 {
4355   free (dm);
4356 }
4357
4358 struct dfa *
4359 dfaalloc (void)
4360 {
4361   return xmalloc (sizeof (struct dfa));
4362 }
4363
4364 /* Initialize DFA.  */
4365 void
4366 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4367            reg_syntax_t bits, int dfaopts)
4368 {
4369   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4370   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4371   dfa->localeinfo = *linfo;
4372
4373   dfa->fast = !dfa->localeinfo.multibyte;
4374
4375   dfa->canychar = -1;
4376   dfa->syntax.syntax_bits_set = true;
4377   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4378   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4379   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4380   dfa->syntax.syntax_bits = bits;
4381
4382   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4383     {
4384       unsigned char uc = i;
4385
4386       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4387       switch (dfa->syntax.sbit[uc])
4388         {
4389         case CTX_LETTER:
4390           setbit (uc, &dfa->syntax.letters);
4391           break;
4392         case CTX_NEWLINE:
4393           setbit (uc, &dfa->syntax.newline);
4394           break;
4395         }
4396
4397       /* POSIX requires that the five bytes in "\n\r./" (including the
4398          terminating NUL) cannot occur inside a multibyte character.  */
4399       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4400                                      ? (uc & 0xc0) != 0x80
4401                                      : strchr ("\n\r./", uc) != NULL);
4402     }
4403 }
4404
4405 /* Initialize TO by copying FROM's syntax settings.  */
4406 void
4407 dfacopysyntax (struct dfa *to, struct dfa const *from)
4408 {
4409   memset (to, 0, offsetof (struct dfa, syntax));
4410   to->canychar = -1;
4411   to->fast = from->fast;
4412   to->syntax = from->syntax;
4413   to->dfaexec = from->dfaexec;
4414   to->localeinfo = from->localeinfo;
4415 }
4416
4417 /* vim:set shiftwidth=2: */