lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include "flexmember.h"
  28
  29 #include <assert.h>
  30 #include <ctype.h>
  31 #include <stdint.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <limits.h>
  35 #include <string.h>
  36
  37 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
  38    indexes into objects.  It is signed to help catch integer overflow.
  39    It has its own name because it is for nonnegative values only.  */
  40 typedef ptrdiff_t idx_t;
     /* Largest idx_t value; equal to PTRDIFF_MAX since idx_t is ptrdiff_t.  */
  41 static idx_t const IDX_MAX = PTRDIFF_MAX;
  42
  /* Return true if the strings A and B have identical contents.  */
  static bool
  streq (char const *a, char const *b)
  {
    int order = strcmp (a, b);
    return !order;
  }
  48
  /* Return true if C is one of the ASCII decimal digits '0' through '9'.
     Unlike isdigit, this takes a plain char.  */
  static bool
  isasciidigit (char c)
  {
    if (c < '0')
      return false;
    return c <= '9';
  }
  54
  55 #include "gettext.h"
  56 #define _(str) gettext (str)
  57
  58 #include <wchar.h>
  59
  60 #include "intprops.h"
  61 #include "xalloc.h"
  62 #include "localeinfo.h"
  63
  64 #ifndef FALLTHROUGH
     /* FALLTHROUGH expands to a marker for a deliberate fall-through from
        one switch case into the next.  Pick the spelling the compiler
        understands: the [[...]] attribute syntax when __STDC_VERSION__ is
        newer than 201710L, the GNU __attribute__ form for GCC 7+ or
        clang 10+, and otherwise a do-nothing expression.  A definition
        supplied before this point takes precedence.  */
  65 # if 201710L < __STDC_VERSION__
  66 #  define FALLTHROUGH [[__fallthrough__]]
  67 # elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
  68 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  69 # else
  70 #  define FALLTHROUGH ((void) 0)
  71 # endif
  72 #endif
  73
  74 #ifndef MIN
     /* The lesser of A and B.  Whichever argument is selected is evaluated
        twice, so arguments should not have side effects.  */
  75 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  76 #endif
  77
  /* HPUX defines setbit and clrbit as macros in sys/param.h.  Discard
     any such macro definitions so the names can be reused below; #undef
     is harmless when the name is not currently a macro.  */
  #undef setbit
  #undef clrbit
  85
  /* Fallback for code that does not use Gnulib's isblank module and has
     no isblank of its own: supply a private replacement.  */
  #if !(defined isblank || defined HAVE_ISBLANK || defined GNULIB_ISBLANK)
  # define isblank dfa_isblank
  /* Return 1 if C is a space or a horizontal tab, 0 otherwise.  */
  static int
  isblank (int c)
  {
    switch (c)
      {
      case ' ':
      case '\t':
        return 1;
      default:
        return 0;
      }
  }
  #endif
  95
  96 /* First integer value that is greater than any character code, i.e.,
        one more than the largest value that fits in CHAR_BIT bits.  */
  97 enum { NOTCHAR = 1 << CHAR_BIT };
  98
  99 #ifdef UINT_LEAST64_MAX
     /* A 64-bit (or wider) unsigned type is available: store charclasses
        in 64-bit words.  */
 100
 101 /* Number of bits used in a charclass word.  */
 102 enum { CHARCLASS_WORD_BITS = 64 };
 103
 104 /* This represents part of a character class.  It must be unsigned and
 105    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
 106 typedef uint_least64_t charclass_word;
 107
 108 /* Part of a charclass initializer that represents 64 bits' worth of a
 109    charclass, where LO and HI are the low and high-order 32 bits of
 110    the 64-bit quantity.  */
 111 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
 112
 113 #else
 114 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
 115 enum { CHARCLASS_WORD_BITS = 32 };
 116 typedef unsigned long charclass_word;
     /* With 32-bit words each half is a word of its own, so a pair expands
        to two consecutive initializers (deliberately unparenthesized).  */
 117 # define CHARCLASS_PAIR(lo, hi) lo, hi
 118 #endif
 119
 120 /* An initializer for a charclass whose 32-bit words are A through H.
        The arguments are in order of increasing bit position: each
        CHARCLASS_PAIR puts its first argument in the lower-order place.
        The double braces initialize the array member of the charclass
        struct.  */
 121 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
 122    {{                                                   \
 123       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 124       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 125    }}
 126
 127 /* The maximum useful value of a charclass_word; all used bits are 1.
        The shift is split into two steps because a single shift by
        CHARCLASS_WORD_BITS would be undefined when charclass_word is
        exactly that wide.  */
 128 static charclass_word const CHARCLASS_WORD_MASK
 129   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 130
 131 /* Number of words required to hold a bit for every character, i.e.,
        NOTCHAR / CHARCLASS_WORD_BITS rounded up.  */
 132 enum
 133 {
 134   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 135 };
 136
 137 /* Sets of unsigned characters are stored as bit vectors in arrays of
        charclass_word, wrapped in a struct so that a whole set can be
        assigned or passed as a unit.  */
 138 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 139
 /* Return CH converted to unsigned char.  Going through a function
    with a char parameter, rather than writing a cast, lets the
    compiler flag some arguments of the wrong type that a cast would
    silently accept.  */
 static unsigned char
 to_uchar (char ch)
 {
   unsigned char converted = ch;
   return converted;
 }
 148
 149 /* Contexts tell us whether a character is a newline or a word constituent.
 150    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 151    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 152    a particular character class.
 153
 154    A state also stores a context value, which is a bitmask of CTX_* values.
 155    A state's context represents a set of characters that the state's
 156    predecessors must match.  For example, a state whose context does not
 157    include CTX_LETTER will never have transitions where the previous
 158    character is a word constituent.  A state whose context is CTX_ANY
 159    might have transitions from any character.  */
 160
 161 enum
 162   {
 163     CTX_NONE = 1,               /* Presumably: neither of the below.  */
 164     CTX_LETTER = 2,             /* Word constituent.  */
 165     CTX_NEWLINE = 4,            /* Newline.  */
 166     CTX_ANY = 7                 /* Union of the three bits above.  */
 167   };
 168
 169 /* Sometimes characters can only be matched depending on the surrounding
 170    context.  Such context decisions depend on what the previous character
 171    was, and the value of the current (lookahead) character.  Context
 172    dependent constraints are encoded as 9-bit integers.  Each bit that
 173    is set indicates that the constraint succeeds in the corresponding
 174    context.
 175
 176    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 177    bit 3-5  - valid contexts when next character is CTX_LETTER
 178    bit 0-2  - valid contexts when next character is CTX_NONE
 179
 180    succeeds_in_context determines whether a given constraint
 181    succeeds in a particular context.  Prev is a bitmask of possible
 182    context values for the previous character, curr is the (single-bit)
 183    context value for the lookahead character.  */
 /* Return bits 6-8 of CONSTRAINT: the previous-character contexts that
    are valid when the next character is CTX_NEWLINE.  */
 static int
 newline_constraint (int constraint)
 {
   int shifted = constraint >> 6;
   return shifted & 7;
 }
 /* Return bits 3-5 of CONSTRAINT: the previous-character contexts that
    are valid when the next character is CTX_LETTER.  */
 static int
 letter_constraint (int constraint)
 {
   int shifted = constraint >> 3;
   return shifted & 7;
 }
 /* Return bits 0-2 of CONSTRAINT: the previous-character contexts that
    are valid when the next character is CTX_NONE.  */
 static int
 other_constraint (int constraint)
 {
   int const low_bits = 7;
   return constraint & low_bits;
 }
 199
 200 static bool
 201 succeeds_in_context (int constraint, int prev, int curr)
 202 {
 203   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 204               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 205               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 206              & prev);
 207 }
 208
 /* The following describe what a constraint depends on.  */

 /* Return true if, for at least one lookahead context, CONSTRAINT gives
    a different answer for a previous CTX_NEWLINE than for a previous
    CTX_NONE.  Shifting right by 2 lines each group's CTX_NEWLINE bit up
    with its CTX_NONE bit; 0111 then selects that bit in every group.  */
 static bool
 prev_newline_dependent (int constraint)
 {
   int shifted = constraint >> 2;
   int differing = constraint ^ shifted;
   return (differing & 0111) ? true : false;
 }
 /* Return true if, for at least one lookahead context, CONSTRAINT gives
    a different answer for a previous CTX_LETTER than for a previous
    CTX_NONE.  Shifting right by 1 lines each group's CTX_LETTER bit up
    with its CTX_NONE bit; 0111 then selects that bit in every group.  */
 static bool
 prev_letter_dependent (int constraint)
 {
   int shifted = constraint >> 1;
   int differing = constraint ^ shifted;
   return (differing & 0111) ? true : false;
 }
 220
 221 /* Tokens that match the empty string subject to some constraint actually
 222    work by applying that constraint to determine what may follow them,
 223    taking into account what has gone before.  The following values are
 224    the constraints corresponding to the special tokens previously defined.
        Each octal digit is one 3-bit group: from the left, the groups apply
        when the next character is CTX_NEWLINE, CTX_LETTER, and CTX_NONE,
        and within a group the bits 4, 2, 1 stand for a previous
        CTX_NEWLINE, CTX_LETTER, and CTX_NONE respectively.  */
 225 enum
 226   {
 227     NO_CONSTRAINT = 0777,       /* Every combination succeeds.  */
 228     BEGLINE_CONSTRAINT = 0444,  /* Previous is newline; any next.  */
 229     ENDLINE_CONSTRAINT = 0700,  /* Next is newline; any previous.  */
 230     BEGWORD_CONSTRAINT = 0050,  /* Next is letter; previous is not.  */
 231     ENDWORD_CONSTRAINT = 0202,  /* Previous is letter; next is not.  */
 232     LIMWORD_CONSTRAINT = 0252,  /* BEGWORD or ENDWORD.  */
 233     NOTLIMWORD_CONSTRAINT = 0525 /* Complement of LIMWORD in 0777.  */
 234   };
 235
 236 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 237    are operators and others are terminal symbols.  Most (but not all) of these
 238    codes are returned by the lexical analyzer.  */
 239
 240 typedef ptrdiff_t token;
     /* Largest value a token can hold (token is ptrdiff_t).  */
 241 static token const TOKEN_MAX = PTRDIFF_MAX;
 242
 243 /* States are indexed by state_num values.  These are normally
 244    nonnegative but -1 is used as a special value.  The type is signed
        so that the -1 value is representable.  */
 245 typedef ptrdiff_t state_num;
 246
 247 /* Predefined token values.  Values below NOTCHAR (other than END) are
        ordinary characters; the special tokens start at NOTCHAR.  */
 248 enum
 249 {
 250   END = -1,                     /* END is a terminal symbol that matches the
 251                                    end of input; any value of END or less in
 252                                    the parse tree is such a symbol.  Accepting
 253                                    states of the DFA are those that would have
 254                                    a transition on END.  This is -1, not some
 255                                    more-negative value, to tweak the speed of
 256                                    comparisons to END.  */
 257
 258   /* Ordinary character values are terminal symbols that match themselves.  */
 259
 260   /* CSET must come last in the following list of special tokens.  Otherwise,
 261      the list order matters only for performance.  Related special tokens
 262      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 263      || CSET <= t) can be done with a single machine-level comparison.  */
 264
 265   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 266                                    the empty string.  */
 267
 268   QMARK,                        /* QMARK is an operator of one argument that
 269                                    matches zero or one occurrences of its
 270                                    argument.  */
 271
 272   STAR,                         /* STAR is an operator of one argument that
 273                                    matches the Kleene closure (zero or more
 274                                    occurrences) of its argument.  */
 275
 276   PLUS,                         /* PLUS is an operator of one argument that
 277                                    matches the positive closure (one or more
 278                                    occurrences) of its argument.  */
 279
 280   REPMN,                        /* REPMN is a lexical token corresponding
 281                                    to the {m,n} construct.  REPMN never
 282                                    appears in the compiled token vector.  */
 283
 284   CAT,                          /* CAT is an operator of two arguments that
 285                                    matches the concatenation of its
 286                                    arguments.  CAT is never returned by the
 287                                    lexical analyzer.  */
 288
 289   OR,                           /* OR is an operator of two arguments that
 290                                    matches either of its arguments.  */
 291
 292   LPAREN,                       /* LPAREN never appears in the parse tree,
 293                                    it is only a lexeme.  */
 294
 295   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 296
 297   WCHAR,                        /* Only returned by lex.  wctok contains
 298                                    the wide character representation.  */
 299
 300   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 301                                    a valid multibyte (or single byte) character.
 302                                    It is used only if MB_CUR_MAX > 1.  */
 303
 304   BEG,                          /* BEG is an initial symbol that matches the
 305                                    beginning of input.  */
 306
 307   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 308                                    the empty string at the beginning of a
 309                                    line.  */
 310
 311   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 312                                    the empty string at the end of a line.  */
 313
 314   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 315                                    the empty string at the beginning of a
 316                                    word.  */
 317
 318   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 319                                    the empty string at the end of a word.  */
 320
 321   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 322                                    the empty string at the beginning or the
 323                                    end of a word.  */
 324
 325   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 326                                    matches the empty string not at
 327                                    the beginning or end of a word.  */
 328
 329   BACKREF,                      /* BACKREF is generated by \<digit>
 330                                    or by any other construct that
 331                                    is not completely handled.  If the scanner
 332                                    detects a transition on backref, it returns
 333                                    a kind of "semi-success" indicating that
 334                                    the match will have to be verified with
 335                                    a backtracking matcher.  */
 336
 337   MBCSET,                       /* MBCSET is similar to CSET, but for
 338                                    multibyte characters.  */
 339
 340   CSET                          /* CSET (and any value greater) is a
 341                                    terminal symbol that matches any of a
 342                                    class of characters.  */
 343 };
 344
 345
 346 /* States of the recognizer correspond to sets of positions in the parse
 347    tree, together with the constraints under which they may be matched.
 348    So a position is encoded as an index into the parse tree together with
 349    a constraint.  */
 350 typedef struct
 351 {
 352   idx_t index;                  /* Index into the parse array.  */
 353   unsigned int constraint;      /* Constraint for matching this position;
                                      presumably one of the 9-bit context
                                      constraints described above.  */
 354 } position;
 355
 356 /* Sets of positions are stored as arrays.  ELEMS has room for ALLOC
        positions, of which NELEM are currently members of the set.  */
 357 typedef struct
 358 {
 359   position *elems;              /* Elements of this position set.  */
 360   idx_t nelem;                  /* Number of elements in this set.  */
 361   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 362 } position_set;
 363
 364 /* A state of the dfa consists of a set of positions, some flags,
 365    and the token value of the lowest-numbered position of the state that
 366    contains an END token.  */
 367 typedef struct
 368 {
 369   size_t hash;                  /* Hash of the positions of this state.  */
 370   position_set elems;           /* Positions this state could match.  */
 371   unsigned char context;        /* Context from previous state; a bitmask
                                      of CTX_* values.  */
 372   unsigned short constraint;    /* Constraint for this state to accept.
                                      Zero means the state can never
                                      accept (see accepting).  */
 373   position_set mbps;            /* Positions which can match multibyte
 374                                    characters or the follows, e.g., period.
 375                                    Used only if MB_CUR_MAX > 1.  */
 376   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 377                                    negative if the state does not have
 378                                    ANYCHAR.  */
 379 } dfa_state;
 380
 381 /* Maximum for any transition table count.  This should be at least 3,
 382    for the initial state setup.  (Presumably this bounds the trcount
        and mb_trcount members of struct dfa; confirm against the code
        that builds transition tables.)  */
 383 enum { MAX_TRCOUNT = 1024 };
 384
 385 /* A bracket operator.
 386    e.g., [a-c], [[:alpha:]], etc.  */
 387 struct mb_char_classes
 388 {
 389   ptrdiff_t cset;               /* Presumably the index of an associated
                                      charclass, if any -- TODO confirm.  */
 390   bool invert;                  /* Presumably true for a negated
                                      bracket such as [^a] -- TODO confirm.  */
 391   wchar_t *chars;               /* Normal characters.  */
 392   idx_t nchars;                 /* Number of entries used in CHARS.  */
 393   idx_t nchars_alloc;           /* Number of entries allocated in CHARS.  */
 394 };
 395
 396 struct regex_syntax
 397 {
 398   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 399   reg_syntax_t syntax_bits;
       /* Presumably true once syntax_bits has been assigned.  */
 400   bool syntax_bits_set;
 401
 402   /* Flag for case-folding letters into sets.  */
 403   bool case_fold;
 404
 405   /* True if ^ and $ match only the start and end of data, and do not match
 406      end-of-line within data.  */
 407   bool anchor;
 408
 409   /* End-of-line byte in data.  */
 410   unsigned char eolbyte;
 411
 412   /* Cache of char-context values, one entry per possible byte value.  */
 413   char sbit[NOTCHAR];
 414
 415   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 416      multibyte character.  */
 417   bool never_trail[NOTCHAR];
 418
 419   /* Set of characters considered letters.  */
 420   charclass letters;
 421
 422   /* Set of characters that are newline.  */
 423   charclass newline;
 424 };
 425
 426 /* Lexical analyzer.  All the dross that deals with the obnoxious
 427    GNU Regex syntax bits is located here.  The poor, suffering
 428    reader is referred to the GNU Regex documentation for the
 429    meaning of the @#%!@#%^!@ syntax bits.  */
 430 struct lexer_state
 431 {
 432   char const *ptr;      /* Pointer to next input character.  */
 433   idx_t left;           /* Number of characters remaining.  */
 434   token lasttok;        /* Previous token returned; initially END.  */
 435   idx_t parens;         /* Count of outstanding left parens.  */
 436   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 437
 438   /* Wide character representation of the current multibyte character,
 439      or WEOF if there was an encoding error.  Used only if
 440      MB_CUR_MAX > 1.  */
 441   wint_t wctok;
 442
 443   /* The most recently analyzed multibyte bracket expression.  */
 444   struct mb_char_classes brack;
 445
 446   /* True if we're separated from the beginning, or from the preceding
          '(' or '|', only by zero-width characters.  */
 447   bool laststart;
 448 };
 449
 450 /* Recursive descent parser for regular expressions.  The structure
        below holds the parser's working state.  */
 451
 452 struct parser_state
 453 {
 454   token tok;               /* Lookahead token.  */
 455   idx_t depth;             /* Current depth of a hypothetical stack
 456                               holding deferred productions.  This is
 457                               used to determine the depth that will be
 458                               required of the real stack later on in
 459                               dfaanalyze.  */
 460 };
 461
 462 /* A compiled regular expression.  The order of the members matters:
        per the comments near the end, the syntax and locale members are
        placed last so that the members before them can be cleared with a
        single memset.  Do not reorder members casually.  */
 463 struct dfa
 464 {
 465   /* Fields filled by the scanner.  */
 466   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 467   idx_t cindex;                 /* Index for adding new charclasses.  */
 468   idx_t calloc;                 /* Number of charclasses allocated.  */
 469   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 470
 471   /* Scanner state */
 472   struct lexer_state lex;
 473
 474   /* Parser state */
 475   struct parser_state parse;
 476
 477   /* Fields filled by the parser.  */
 478   token *tokens;                /* Postfix parse array.  */
 479   idx_t tindex;                 /* Index for adding new tokens.  */
 480   idx_t talloc;                 /* Number of tokens currently allocated.  */
 481   idx_t depth;                  /* Depth required of an evaluation stack
 482                                    used for depth-first traversal of the
 483                                    parse tree.  */
 484   idx_t nleaves;                /* Number of non-EMPTY leaves
 485                                    in the parse tree.  */
 486   idx_t nregexps;               /* Count of parallel regexps being built
 487                                    with dfaparse.  */
 488   bool fast;                    /* The DFA is fast.  */
 489   bool epsilon;                 /* Does a token match only the empty string?  */
 490   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 491   mbstate_t mbs;                /* Multibyte conversion state.  */
 492
 493   /* The following are valid only if MB_CUR_MAX > 1.  */
 494
 495   /* The value of multibyte_prop[i] is defined by following rule.
 496      if tokens[i] < NOTCHAR
 497      bit 0 : tokens[i] is the first byte of a character, including
 498      single-byte characters.
 499      bit 1 : tokens[i] is the last byte of a character, including
 500      single-byte characters.
 501
 502      e.g.
 503      tokens
 504      = 'single_byte_a', 'multi_byte_A', 'single_byte_b'
 505      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 506      multibyte_prop
 507      = 3     , 1               ,  0              ,  2              , 3
 508    */
 509   char *multibyte_prop;
 510
 511   /* Fields filled by the superset.  */
 512   struct dfa *superset;             /* Hint of the dfa.  */
 513
 514   /* Fields filled by the state builder.  */
 515   dfa_state *states;            /* States of the dfa.  */
 516   state_num sindex;             /* Index for adding new states.  */
 517   idx_t salloc;                 /* Number of states currently allocated.  */
 518
 519   /* Fields filled by the parse tree->NFA conversion.  */
 520   position_set *follows;        /* Array of follow sets, indexed by position
 521                                    index.  The follow of a position is the set
 522                                    of positions containing characters that
 523                                    could conceivably follow a character
 524                                    matching the given position in a string
 525                                    matching the regexp.  Allocated to the
 526                                    maximum possible position index.  */
 527   bool searchflag;              /* We are supposed to build a searching
 528                                    as opposed to an exact matcher.  A searching
 529                                    matcher finds the first and shortest string
 530                                    matching a regexp anywhere in the buffer,
 531                                    whereas an exact matcher finds the longest
 532                                    string matching, but anchored to the
 533                                    beginning of the buffer.  */
 534
 535   /* Fields filled by dfaanalyze.  */
 536   int *constraints;             /* Array of union of accepting constraints
 537                                    in the follow of a position.  */
 538   int *separates;               /* Array of contexts on follow of a
 539                                    position.  */
 540
 541   /* Fields filled by dfaexec.  */
 542   state_num tralloc;            /* Number of transition tables that have
 543                                    slots so far, not counting trans[-1] and
 544                                    trans[-2].  */
 545   int trcount;                  /* Number of transition tables that have
 546                                    been built, other than for initial
 547                                    states.  */
 548   int min_trcount;              /* Number of initial states.  Equivalently,
 549                                    the minimum state number for which trcount
 550                                    counts transitions.  */
 551   state_num **trans;            /* Transition tables for states that can
 552                                    never accept.  If the transitions for a
 553                                    state have not yet been computed, or the
 554                                    state could possibly accept, its entry in
 555                                    this table is NULL.  This points to two
 556                                    past the start of the allocated array,
 557                                    and trans[-1] and trans[-2] are always
 558                                    NULL.  */
 559   state_num **fails;            /* Transition tables after failing to accept
 560                                    on a state that potentially could do so.
 561                                    If trans[i] is non-null, fails[i] must
 562                                    be null.  */
 563   char *success;                /* Table of acceptance conditions used in
 564                                    dfaexec and computed in build_state.  */
 565   state_num *newlines;          /* Transitions on newlines.  The entry for a
 566                                    newline in any transition table is always
 567                                    -1 so we can count lines without wasting
 568                                    too many cycles.  The transition for a
 569                                    newline is stored separately and handled
 570                                    as a special case.  Newline is also used
 571                                    as a sentinel at the end of the buffer.  */
 572   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 573                                    context in multibyte locales, in which we
 574                                    do not distinguish between their contexts,
 575                                    since words are not supported there.  */
 576   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 577   state_num **mb_trans;         /* Transition tables for states with
 578                                    ANYCHAR.  */
 579   state_num mb_trcount;         /* Number of transition tables for states with
 580                                    ANYCHAR that have actually been built.  */
 581
 582   /* Syntax configuration.  This is near the end so that dfacopysyntax
 583      can memset up to here.  */
 584   struct regex_syntax syntax;
 585
 586   /* Information derived from the locale.  This is at the end so that
 587      a quick memset need not clear it specially.  */
 588
 589   /* dfaexec implementation.  */
 590   char *(*dfaexec) (struct dfa *, char const *, char *,
 591                     bool, ptrdiff_t *, bool *);
 592
 593   /* Other cached information derived from the locale.  */
 594   struct localeinfo localeinfo;
 595 };
 596
 597 /* User access to dfa internals.  */
 598
 599 /* S could possibly be an accepting state of R.  */
 600 static bool
 601 accepting (state_num s, struct dfa const *r)
 602 {
 603   return r->states[s].constraint != 0;
 604 }
 605
 606 /* STATE accepts in the specified context.  */
 607 static bool
 608 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 609 {
 610   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 611 }
 612
 613 static void regexp (struct dfa *dfa);
 614
 615 /* Store into *PWC the result of converting the leading bytes of the
 616    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 617    and updating the conversion state in *D.  On conversion error,
 618    convert just a single byte, to WEOF.  Return the number of bytes
 619    converted.
 620
 621    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 622
 623    * PWC points to wint_t, not to wchar_t.
 624    * The last arg is a dfa *D instead of merely a multibyte conversion
 625      state D->mbs.
 626    * N must be at least 1.
 627    * S[N - 1] must be a sentinel byte.
 628    * Shift encodings are not supported.
 629    * The return value is always in the range 1..N.
 630    * D->mbs is always valid afterwards.
 631    * *PWC is always set to something.  */
 632 static int
 633 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 634 {
 635   unsigned char uc = s[0];
 636   wint_t wc = d->localeinfo.sbctowc[uc];
 637
 638   if (wc == WEOF)
 639     {
 640       wchar_t wch;
 641       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 642       if (0 < nbytes && nbytes < (size_t) -2)
 643         {
 644           *pwc = wch;
 645           return nbytes;
 646         }
 647       memset (&d->mbs, 0, sizeof d->mbs);
 648     }
 649
 650   *pwc = wc;
 651   return 1;
 652 }
 653
 654 #ifdef DEBUG
 655
 656 static void
 657 prtok (token t)
 658 {
 659   if (t <= END)
 660     fprintf (stderr, "END");
 661   else if (0 <= t && t < NOTCHAR)
 662     {
 663       unsigned int ch = t;
 664       fprintf (stderr, "0x%02x", ch);
 665     }
 666   else
 667     {
 668       char const *s;
 669       switch (t)
 670         {
 671         case BEG:
 672           s = "BEG";
 673           break;
 674         case EMPTY:
 675           s = "EMPTY";
 676           break;
 677         case BACKREF:
 678           s = "BACKREF";
 679           break;
 680         case BEGLINE:
 681           s = "BEGLINE";
 682           break;
 683         case ENDLINE:
 684           s = "ENDLINE";
 685           break;
 686         case BEGWORD:
 687           s = "BEGWORD";
 688           break;
 689         case ENDWORD:
 690           s = "ENDWORD";
 691           break;
 692         case LIMWORD:
 693           s = "LIMWORD";
 694           break;
 695         case NOTLIMWORD:
 696           s = "NOTLIMWORD";
 697           break;
 698         case QMARK:
 699           s = "QMARK";
 700           break;
 701         case STAR:
 702           s = "STAR";
 703           break;
 704         case PLUS:
 705           s = "PLUS";
 706           break;
 707         case CAT:
 708           s = "CAT";
 709           break;
 710         case OR:
 711           s = "OR";
 712           break;
 713         case LPAREN:
 714           s = "LPAREN";
 715           break;
 716         case RPAREN:
 717           s = "RPAREN";
 718           break;
 719         case ANYCHAR:
 720           s = "ANYCHAR";
 721           break;
 722         case MBCSET:
 723           s = "MBCSET";
 724           break;
 725         default:
 726           s = "CSET";
 727           break;
 728         }
 729       fprintf (stderr, "%s", s);
 730     }
 731 }
 732 #endif /* DEBUG */
 733
 734 /* Stuff pertaining to charclasses.  */
 735
 736 static bool
 737 tstbit (unsigned int b, charclass const *c)
 738 {
 739   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 740 }
 741
 742 static void
 743 setbit (unsigned int b, charclass *c)
 744 {
 745   charclass_word one = 1;
 746   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 747 }
 748
 749 static void
 750 clrbit (unsigned int b, charclass *c)
 751 {
 752   charclass_word one = 1;
 753   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 754 }
 755
 756 static void
 757 zeroset (charclass *s)
 758 {
 759   memset (s, 0, sizeof *s);
 760 }
 761
 762 static void
 763 fillset (charclass *s)
 764 {
 765   for (int i = 0; i < CHARCLASS_WORDS; i++)
 766     s->w[i] = CHARCLASS_WORD_MASK;
 767 }
 768
 769 static void
 770 notset (charclass *s)
 771 {
 772   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 773     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 774 }
 775
 776 static bool
 777 equal (charclass const *s1, charclass const *s2)
 778 {
 779   charclass_word w = 0;
 780   for (int i = 0; i < CHARCLASS_WORDS; i++)
 781     w |= s1->w[i] ^ s2->w[i];
 782   return w == 0;
 783 }
 784
 785 static bool
 786 emptyset (charclass const *s)
 787 {
 788   charclass_word w = 0;
 789   for (int i = 0; i < CHARCLASS_WORDS; i++)
 790     w |= s->w[i];
 791   return w == 0;
 792 }
 793
/* Grow PA, which points to an array of *NITEMS items, and return the
   location of the reallocated array, updating *NITEMS to reflect its
   new size.  The new array will contain at least NITEMS_INCR_MIN more
   items, but will not contain more than NITEMS_MAX items total.
   ITEM_SIZE is the size of each item, in bytes.

   ITEM_SIZE and NITEMS_INCR_MIN must be positive.  *NITEMS must be
   nonnegative.  If NITEMS_MAX is -1, it is treated as if it were
   infinity.

   If PA is null, then allocate a new array instead of reallocating
   the old one.

   Thus, to grow an array A without saving its old contents, do
   { free (A); A = xpalloc (NULL, &AITEMS, ...); }.

   If the requested growth cannot be represented or would exceed
   NITEMS_MAX, call xalloc_die.  The memory itself is obtained with
   xrealloc.  */

static void *
xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
         ptrdiff_t nitems_max, idx_t item_size)
{
  /* The item count on entry; all growth is computed relative to it.  */
  idx_t n0 = *nitems;

  /* The approximate size to use for initial small allocation
     requests.  This is the largest "small" request for the GNU C
     library malloc.  */
  enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };

  /* If the array is tiny, grow it to about (but no greater than)
     DEFAULT_MXFAST bytes.  Otherwise, grow it by about 50%.
     Adjust the growth according to three constraints: NITEMS_INCR_MIN,
     NITEMS_MAX, and what the C language can represent safely.  */

  /* First guess: N0 + N0/2, saturating at IDX_MAX on overflow and
     then clamped to NITEMS_MAX when that limit is in effect.  */
  idx_t n, nbytes;
  if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
    n = IDX_MAX;
  if (0 <= nitems_max && nitems_max < n)
    n = nitems_max;

  /* ADJUSTED_NBYTES is nonzero when the byte count for the first
     guess must be replaced: either because N * ITEM_SIZE is not
     representable (use the largest representable size), or because
     it is below DEFAULT_MXFAST (round up to DEFAULT_MXFAST).  */
  idx_t adjusted_nbytes
    = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
       ? MIN (IDX_MAX, SIZE_MAX)
       : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
  if (adjusted_nbytes)
    {
      /* Recompute the item count from the adjusted byte count, and
         trim the byte count to a whole number of items.  */
      n = adjusted_nbytes / item_size;
      nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
    }

  /* With a null PA there is no existing array; N0 was captured
     above, and *NITEMS is overwritten again below on success.  */
  if (! pa)
    *nitems = 0;

  /* If the guess grows the array by fewer than NITEMS_INCR_MIN items,
     fall back to exactly N0 + NITEMS_INCR_MIN, and give up if that
     overflows, exceeds NITEMS_MAX, or has an unrepresentable size.  */
  if (n - n0 < nitems_incr_min
      && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
          || (0 <= nitems_max && nitems_max < n)
          || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
    xalloc_die ();
  pa = xrealloc (pa, nbytes);
  *nitems = n;
  return pa;
}
 853
 854 /* Ensure that the array addressed by PA holds at least I + 1 items.
 855    Either return PA, or reallocate the array and return its new address.
 856    Although PA may be null, the returned value is never null.
 857
 858    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 859    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 860    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 861    ITEM_SIZE is the size of one item; it must be positive.
 862    Avoid O(N**2) behavior on arrays growing linearly.  */
 863 static void *
 864 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 865                ptrdiff_t nitems_max, idx_t item_size)
 866 {
 867   if (i < *nitems)
 868     return pa;
 869   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 870 }
 871
 872 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 873 static idx_t
 874 charclass_index (struct dfa *d, charclass const *s)
 875 {
 876   idx_t i;
 877
 878   for (i = 0; i < d->cindex; ++i)
 879     if (equal (s, &d->charclasses[i]))
 880       return i;
 881   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 882                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 883   ++d->cindex;
 884   d->charclasses[i] = *s;
 885   return i;
 886 }
 887
 888 static bool
 889 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 890 {
 891   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 892 }
 893
 894 static int
 895 char_context (struct dfa const *dfa, unsigned char c)
 896 {
 897   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 898     return CTX_NEWLINE;
 899   if (unibyte_word_constituent (dfa, c))
 900     return CTX_LETTER;
 901   return CTX_NONE;
 902 }
 903
 904 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 905    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 906    this may happen when folding case in weird Turkish locales where
 907    dotless i/dotted I are not included in the chosen character set.
 908    Return whether a bit was set in the charclass.  */
 909 static bool
 910 setbit_wc (wint_t wc, charclass *c)
 911 {
 912   int b = wctob (wc);
 913   if (b < 0)
 914     return false;
 915
 916   setbit (b, c);
 917   return true;
 918 }
 919
 920 /* Set a bit for B and its case variants in the charclass C.
 921    MB_CUR_MAX must be 1.  */
 922 static void
 923 setbit_case_fold_c (int b, charclass *c)
 924 {
 925   int ub = toupper (b);
 926   for (int i = 0; i < NOTCHAR; i++)
 927     if (toupper (i) == ub)
 928       setbit (i, c);
 929 }
 930
 931 /* Fetch the next lexical input character from the pattern.  There
 932    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 933    value of the character or to WEOF depending on whether the input is
 934    a valid multibyte character (possibly of length 1).  Then return
 935    the next input byte value, except return EOF if the input is a
 936    multibyte character of length greater than 1.  */
 937 static int
 938 fetch_wc (struct dfa *dfa)
 939 {
 940   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 941                              dfa);
 942   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 943   dfa->lex.ptr += nbytes;
 944   dfa->lex.left -= nbytes;
 945   return c;
 946 }
 947
 948 /* If there is no more input, report an error about unbalanced brackets.
 949    Otherwise, behave as with fetch_wc (DFA).  */
 950 static int
 951 bracket_fetch_wc (struct dfa *dfa)
 952 {
 953   if (! dfa->lex.left)
 954     dfaerror (_("unbalanced ["));
 955   return fetch_wc (dfa);
 956 }
 957
/* Type of a character-classification function with the same
   signature as the <ctype.h> functions (isalpha and friends).  */
typedef int predicate (int);

/* The following list maps the names of the Posix named character classes
   to predicate functions that determine whether a given character is in
   the class.  The leading [ has already been eaten by the lexical
   analyzer.  */
struct dfa_ctype
{
  /* Class name as written between "[:" and ":]", e.g., "alpha".  */
  const char *name;

  /* Function that tests whether a byte value belongs to the class.  */
  predicate *func;

  /* If true, parse_bracket_exp builds the class from byte values even
     in a multibyte locale; if false, it does that only in unibyte
     locales and treats the class as unknown in multibyte ones.  */
  bool single_byte_only;
};
 970
 971 static const struct dfa_ctype prednames[] = {
 972   {"alpha", isalpha, false},
 973   {"upper", isupper, false},
 974   {"lower", islower, false},
 975   {"digit", isdigit, true},
 976   {"xdigit", isxdigit, false},
 977   {"space", isspace, false},
 978   {"punct", ispunct, false},
 979   {"alnum", isalnum, false},
 980   {"print", isprint, false},
 981   {"graph", isgraph, false},
 982   {"cntrl", iscntrl, false},
 983   {"blank", isblank, false},
 984   {NULL, NULL, false}
 985 };
 986
 987 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 988 find_pred (const char *str)
 989 {
 990   for (int i = 0; prednames[i].name; i++)
 991     if (streq (str, prednames[i].name))
 992       return &prednames[i];
 993   return NULL;
 994 }
 995
 996 /* Parse a bracket expression, which possibly includes multibyte
 997    characters.  */
 998 static token
 999 parse_bracket_exp (struct dfa *dfa)
1000 {
1001   /* This is a bracket expression that dfaexec is known to
1002      process correctly.  */
1003   bool known_bracket_exp = true;
1004
1005   /* Used to warn about [:space:].
1006      Bit 0 = first character is a colon.
1007      Bit 1 = last character is a colon.
1008      Bit 2 = includes any other character but a colon.
1009      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
1010   int colon_warning_state;
1011
1012   dfa->lex.brack.nchars = 0;
1013   charclass ccl;
1014   zeroset (&ccl);
1015   int c = bracket_fetch_wc (dfa);
1016   bool invert = c == '^';
1017   if (invert)
1018     {
1019       c = bracket_fetch_wc (dfa);
1020       known_bracket_exp = dfa->localeinfo.simple;
1021     }
1022   wint_t wc = dfa->lex.wctok;
1023   int c1;
1024   wint_t wc1;
1025   colon_warning_state = (c == ':');
1026   do
1027     {
1028       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
1029       colon_warning_state &= ~2;
1030
1031       /* Note that if we're looking at some other [:...:] construct,
1032          we just treat it as a bunch of ordinary characters.  We can do
1033          this because we assume regex has checked for syntax errors before
1034          dfa is ever called.  */
1035       if (c == '[')
1036         {
1037           c1 = bracket_fetch_wc (dfa);
1038           wc1 = dfa->lex.wctok;
1039
1040           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1041               || c1 == '.' || c1 == '=')
1042             {
1043               enum { MAX_BRACKET_STRING_LEN = 32 };
1044               char str[MAX_BRACKET_STRING_LEN + 1];
1045               int len = 0;
1046               for (;;)
1047                 {
1048                   c = bracket_fetch_wc (dfa);
1049                   if (dfa->lex.left == 0
1050                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1051                     break;
1052                   if (len < MAX_BRACKET_STRING_LEN)
1053                     str[len++] = c;
1054                   else
1055                     /* This is in any case an invalid class name.  */
1056                     str[0] = '\0';
1057                 }
1058               str[len] = '\0';
1059
1060               /* Fetch bracket.  */
1061               c = bracket_fetch_wc (dfa);
1062               wc = dfa->lex.wctok;
1063               if (c1 == ':')
1064                 /* Build character class.  POSIX allows character
1065                    classes to match multicharacter collating elements,
1066                    but the regex code does not support that, so do not
1067                    worry about that possibility.  */
1068                 {
1069                   char const *class
1070                     = (dfa->syntax.case_fold && (streq (str, "upper")
1071                                                  || streq (str, "lower"))
1072                        ? "alpha" : str);
1073                   const struct dfa_ctype *pred = find_pred (class);
1074                   if (!pred)
1075                     dfaerror (_("invalid character class"));
1076
1077                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1078                     known_bracket_exp = false;
1079                   else
1080                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1081                       if (pred->func (c2))
1082                         setbit (c2, &ccl);
1083                 }
1084               else
1085                 known_bracket_exp = false;
1086
1087               colon_warning_state |= 8;
1088
1089               /* Fetch new lookahead character.  */
1090               c1 = bracket_fetch_wc (dfa);
1091               wc1 = dfa->lex.wctok;
1092               continue;
1093             }
1094
1095           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1096              are already set up.  */
1097         }
1098
1099       if (c == '\\'
1100           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1101         {
1102           c = bracket_fetch_wc (dfa);
1103           wc = dfa->lex.wctok;
1104         }
1105
1106       if (c1 == NOTCHAR)
1107         {
1108           c1 = bracket_fetch_wc (dfa);
1109           wc1 = dfa->lex.wctok;
1110         }
1111
1112       if (c1 == '-')
1113         /* build range characters.  */
1114         {
1115           int c2 = bracket_fetch_wc (dfa);
1116           wint_t wc2 = dfa->lex.wctok;
1117
1118           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1119              Treat it like [-a[.aa.]] while parsing it, and
1120              remember that the set is unknown.  */
1121           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1122             {
1123               known_bracket_exp = false;
1124               c2 = ']';
1125             }
1126
1127           if (c2 == ']')
1128             {
1129               /* In the case [x-], the - is an ordinary hyphen,
1130                  which is left in c1, the lookahead character.  */
1131               dfa->lex.ptr--;
1132               dfa->lex.left++;
1133             }
1134           else
1135             {
1136               if (c2 == '\\' && (dfa->syntax.syntax_bits
1137                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1138                 {
1139                   c2 = bracket_fetch_wc (dfa);
1140                   wc2 = dfa->lex.wctok;
1141                 }
1142
1143               colon_warning_state |= 8;
1144               c1 = bracket_fetch_wc (dfa);
1145               wc1 = dfa->lex.wctok;
1146
1147               /* Treat [x-y] as a range if x != y.  */
1148               if (wc != wc2 || wc == WEOF)
1149                 {
1150                   if (dfa->localeinfo.simple
1151                       || (isasciidigit (c) & isasciidigit (c2)))
1152                     {
1153                       for (int ci = c; ci <= c2; ci++)
1154                         if (dfa->syntax.case_fold && isalpha (ci))
1155                           setbit_case_fold_c (ci, &ccl);
1156                         else
1157                           setbit (ci, &ccl);
1158                     }
1159                   else
1160                     known_bracket_exp = false;
1161
1162                   continue;
1163                 }
1164             }
1165         }
1166
1167       colon_warning_state |= (c == ':') ? 2 : 4;
1168
1169       if (!dfa->localeinfo.multibyte)
1170         {
1171           if (dfa->syntax.case_fold && isalpha (c))
1172             setbit_case_fold_c (c, &ccl);
1173           else
1174             setbit (c, &ccl);
1175           continue;
1176         }
1177
1178       if (wc == WEOF)
1179         known_bracket_exp = false;
1180       else
1181         {
1182           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1183           int n = (dfa->syntax.case_fold
1184                    ? case_folded_counterparts (wc, folded + 1) + 1
1185                    : 1);
1186           folded[0] = wc;
1187           for (int i = 0; i < n; i++)
1188             if (!setbit_wc (folded[i], &ccl))
1189               {
1190                 dfa->lex.brack.chars
1191                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1192                                    &dfa->lex.brack.nchars_alloc, -1,
1193                                    sizeof *dfa->lex.brack.chars);
1194                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1195               }
1196         }
1197     }
1198   while ((wc = wc1, (c = c1) != ']'));
1199
1200   if (colon_warning_state == 7)
1201     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1202
1203   if (! known_bracket_exp)
1204     return BACKREF;
1205
1206   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1207     {
1208       dfa->lex.brack.invert = invert;
1209       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1210       return MBCSET;
1211     }
1212
1213   if (invert)
1214     {
1215       notset (&ccl);
1216       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1217         clrbit ('\n', &ccl);
1218     }
1219
1220   return CSET + charclass_index (dfa, &ccl);
1221 }
1222
/* A saved lexer input position, used by push_lex_state and
   pop_lex_state to temporarily redirect the lexer to another
   string.  */
struct lexptr
{
  /* Saved value of the lexer's input pointer (dfa->lex.ptr).  */
  char const *ptr;

  /* Saved value of the lexer's remaining byte count (dfa->lex.left).  */
  idx_t left;
};
1228
1229 static void
1230 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1231 {
1232   ls->ptr = dfa->lex.ptr;
1233   ls->left = dfa->lex.left;
1234   dfa->lex.ptr = s;
1235   dfa->lex.left = strlen (s);
1236 }
1237
1238 static void
1239 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1240 {
1241   dfa->lex.ptr = ls->ptr;
1242   dfa->lex.left = ls->left;
1243 }
1244
/* Lexical analyzer.  Read the next token from the pattern text at
   DFA->lex.ptr (DFA->lex.left bytes remain), store it in
   DFA->lex.lasttok, and return it.  Return END when no input remains.

   Which characters act as operators depends on DFA->syntax.syntax_bits
   and on whether the character was preceded by a backslash.  A
   character that is not acting as an operator is handled at the
   normal_char label.

   Side effects besides consuming input:
   - DFA->lex.laststart is set to true after OR and LPAREN and to
     false after most other tokens.  While it is true and
     RE_CONTEXT_INDEP_OPS is off, the repetition operators
     '?', '*', '+' and '{' are treated as ordinary characters.
   - DFA->lex.parens counts currently open parentheses.
   - For an interval, DFA->lex.minrep and DFA->lex.maxrep receive the
     bounds (maxrep is negative when there is no upper bound).
   - DFA->canychar is filled in the first time '.' is seen.

   Syntax errors are reported through dfaerror.  */
static token
lex (struct dfa *dfa)
{
  bool backslash = false;

  /* Basic plan: We fetch a character.  If it's a backslash,
     we set the backslash flag and go through the loop again.
     On the plus side, this avoids having a duplicate of the
     main switch inside the backslash case.  On the minus side,
     it means that just about every case begins with
     "if (backslash) ...".  */
  for (int i = 0; i < 2; ++i)
    {
      if (! dfa->lex.left)
        return dfa->lex.lasttok = END;
      int c = fetch_wc (dfa);

      switch (c)
        {
        case '\\':
          if (backslash)
            goto normal_char;
          if (dfa->lex.left == 0)
            dfaerror (_("unfinished \\ escape"));
          backslash = true;
          break;

        case '^':
          if (backslash)
            goto normal_char;
          /* '^' is an anchor everywhere if the syntax says so;
             otherwise only at the start of the pattern, after an
             open parenthesis, or after an alternation operator.  */
          if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
              || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
              || dfa->lex.lasttok == OR)
            return dfa->lex.lasttok = BEGLINE;
          goto normal_char;

        case '$':
          if (backslash)
            goto normal_char;
          /* '$' is an anchor everywhere if the syntax says so;
             otherwise only at the end of the pattern, just before a
             close parenthesis or alternation operator, or just before
             a newline that acts as alternation.  In the subscripts
             below, the index is 1 when the operator needs a backslash
             and the next byte is a backslash, and 0 otherwise; the
             preceding comparison with lex.left ensures that enough
             bytes remain.  */
          if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
              || dfa->lex.left == 0
              || ((dfa->lex.left
                   > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
                  && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
                                   & (dfa->lex.ptr[0] == '\\')]
                      == ')'))
              || ((dfa->lex.left
                   > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
                  && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
                                   & (dfa->lex.ptr[0] == '\\')]
                      == '|'))
              || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
                  && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
            return dfa->lex.lasttok = ENDLINE;
          goto normal_char;

        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          /* \1 through \9 are back-references unless the syntax
             disables them.  */
          if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
            {
              dfa->lex.laststart = false;
              return dfa->lex.lasttok = BACKREF;
            }
          goto normal_char;

        case '`':
          if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            {
              /* FIXME: should be beginning of string */
              return dfa->lex.lasttok = BEGLINE;
            }
          goto normal_char;

        case '\'':
          if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            {
              /* FIXME: should be end of string */
              return dfa->lex.lasttok = ENDLINE;
            }
          goto normal_char;

        case '<':
          if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            return dfa->lex.lasttok = BEGWORD;
          goto normal_char;

        case '>':
          if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            return dfa->lex.lasttok = ENDWORD;
          goto normal_char;

        case 'b':
          if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            return dfa->lex.lasttok = LIMWORD;
          goto normal_char;

        case 'B':
          if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            return dfa->lex.lasttok = NOTLIMWORD;
          goto normal_char;

        case '?':
          if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
            goto normal_char;
          /* The operator is written with a backslash exactly when
             RE_BK_PLUS_QM is set.  */
          if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
            goto normal_char;
          if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
              && dfa->lex.laststart)
            goto normal_char;
          return dfa->lex.lasttok = QMARK;

        case '*':
          if (backslash)
            goto normal_char;
          if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
              && dfa->lex.laststart)
            goto normal_char;
          return dfa->lex.lasttok = STAR;

        case '+':
          if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
            goto normal_char;
          /* The operator is written with a backslash exactly when
             RE_BK_PLUS_QM is set.  */
          if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
            goto normal_char;
          if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
              && dfa->lex.laststart)
            goto normal_char;
          return dfa->lex.lasttok = PLUS;

        case '{':
          if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
            goto normal_char;
          /* The operator is written with a backslash exactly when
             RE_NO_BK_BRACES is clear.  */
          if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
            goto normal_char;
          if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
              && dfa->lex.laststart)
            goto normal_char;

          /* Cases:
             {M} - exact count
             {M,} - minimum count, maximum is infinity
             {,N} - 0 through N
             {,} - 0 to infinity (same as '*')
             {M,N} - M through N */
          {
            /* Scan with a local pointer; the lexer position is only
               updated once the interval has been accepted.  */
            char const *p = dfa->lex.ptr;
            char const *lim = p + dfa->lex.left;
            dfa->lex.minrep = dfa->lex.maxrep = -1;
            /* Accumulate M, capping it at RE_DUP_MAX + 1 so that long
               digit strings cannot overflow.  */
            for (; p != lim && isasciidigit (*p); p++)
              dfa->lex.minrep = (dfa->lex.minrep < 0
                                 ? *p - '0'
                                 : MIN (RE_DUP_MAX + 1,
                                        dfa->lex.minrep * 10 + *p - '0'));
            if (p != lim)
              {
                if (*p != ',')
                  dfa->lex.maxrep = dfa->lex.minrep;
                else
                  {
                    if (dfa->lex.minrep < 0)
                      dfa->lex.minrep = 0;
                    /* Accumulate N with the same cap.  */
                    while (++p != lim && isasciidigit (*p))
                      dfa->lex.maxrep
                        = (dfa->lex.maxrep < 0
                           ? *p - '0'
                           : MIN (RE_DUP_MAX + 1,
                                  dfa->lex.maxrep * 10 + *p - '0'));
                  }
              }
            /* The interval must be closed by '}' (preceded by a
               backslash if the opening brace was), have a minimum,
               and have minimum <= maximum when a maximum is given.  */
            if (! ((! backslash || (p != lim && *p++ == '\\'))
                   && p != lim && *p++ == '}'
                   && 0 <= dfa->lex.minrep
                   && (dfa->lex.maxrep < 0
                       || dfa->lex.minrep <= dfa->lex.maxrep)))
              {
                if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
                  goto normal_char;
                dfaerror (_("invalid content of \\{\\}"));
              }
            if (RE_DUP_MAX < dfa->lex.maxrep)
              dfaerror (_("regular expression too big"));
            dfa->lex.ptr = p;
            dfa->lex.left = lim - p;
          }
          dfa->lex.laststart = false;
          return dfa->lex.lasttok = REPMN;

        case '|':
          if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
            goto normal_char;
          /* The operator is written with a backslash exactly when
             RE_NO_BK_VBAR is clear.  */
          if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
            goto normal_char;
          dfa->lex.laststart = true;
          return dfa->lex.lasttok = OR;

        case '\n':
          /* An unescaped newline is alternation only when
             RE_NEWLINE_ALT is set and RE_LIMITED_OPS is not.  */
          if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
              || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
            goto normal_char;
          dfa->lex.laststart = true;
          return dfa->lex.lasttok = OR;

        case '(':
          /* The operator is written with a backslash exactly when
             RE_NO_BK_PARENS is clear.  */
          if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
            goto normal_char;
          dfa->lex.parens++;
          dfa->lex.laststart = true;
          return dfa->lex.lasttok = LPAREN;

        case ')':
          if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
            goto normal_char;
          /* With no parenthesis open, ')' may be an ordinary
             character, depending on the syntax.  */
          if (dfa->lex.parens == 0
              && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
            goto normal_char;
          dfa->lex.parens--;
          dfa->lex.laststart = false;
          return dfa->lex.lasttok = RPAREN;

        case '.':
          if (backslash)
            goto normal_char;
          /* Build the "any character" charclass the first time it is
             needed, and remember its index in DFA->canychar.  */
          if (dfa->canychar < 0)
            {
              charclass ccl;
              fillset (&ccl);
              if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
                clrbit ('\n', &ccl);
              if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
                clrbit ('\0', &ccl);
              /* In a multibyte locale, drop the bytes that are not
                 characters by themselves.  */
              if (dfa->localeinfo.multibyte)
                for (int c2 = 0; c2 < NOTCHAR; c2++)
                  if (dfa->localeinfo.sbctowc[c2] == WEOF)
                    clrbit (c2, &ccl);
              dfa->canychar = charclass_index (dfa, &ccl);
            }
          dfa->lex.laststart = false;
          return dfa->lex.lasttok = (dfa->localeinfo.multibyte
                                     ? ANYCHAR
                                     : CSET + dfa->canychar);

        case 's':
        case 'S':
          if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            goto normal_char;
          if (!dfa->localeinfo.multibyte)
            {
              /* Unibyte locale: build the class directly from
                 isspace, complementing it for \S.  */
              charclass ccl;
              zeroset (&ccl);
              for (int c2 = 0; c2 < NOTCHAR; ++c2)
                if (isspace (c2))
                  setbit (c2, &ccl);
              if (c == 'S')
                notset (&ccl);
              dfa->lex.laststart = false;
              return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
            }

          /* FIXME: see if optimizing this, as is done with ANYCHAR and
             add_utf8_anychar, makes sense.  */

          /* \s and \S are documented to be equivalent to [[:space:]] and
             [^[:space:]] respectively, so tell the lexer to process those
             strings, each minus its "already processed" '['.  */
          {
            struct lexptr ls;
            /* The subscript skips the leading '^' for lowercase 's'.  */
            push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
            dfa->lex.lasttok = parse_bracket_exp (dfa);
            pop_lex_state (dfa, &ls);
          }

          dfa->lex.laststart = false;
          return dfa->lex.lasttok;

        case 'w':
        case 'W':
          if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
            goto normal_char;

          if (!dfa->localeinfo.multibyte)
            {
              /* Unibyte locale: build the class from the bytes whose
                 syntax table entry is CTX_LETTER, complementing it
                 for \W.  */
              charclass ccl;
              zeroset (&ccl);
              for (int c2 = 0; c2 < NOTCHAR; ++c2)
                if (dfa->syntax.sbit[c2] == CTX_LETTER)
                  setbit (c2, &ccl);
              if (c == 'W')
                notset (&ccl);
              dfa->lex.laststart = false;
              return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
            }

          /* FIXME: see if optimizing this, as is done with ANYCHAR and
             add_utf8_anychar, makes sense.  */

          /* \w and \W are documented to be equivalent to [_[:alnum:]] and
             [^_[:alnum:]] respectively, so tell the lexer to process those
             strings, each minus its "already processed" '['.  */
          {
            struct lexptr ls;
            /* The subscript skips the leading '^' for lowercase 'w'.  */
            push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
            dfa->lex.lasttok = parse_bracket_exp (dfa);
            pop_lex_state (dfa, &ls);
          }

          dfa->lex.laststart = false;
          return dfa->lex.lasttok;

        case '[':
          if (backslash)
            goto normal_char;
          dfa->lex.laststart = false;
          return dfa->lex.lasttok = parse_bracket_exp (dfa);

        default:
        normal_char:
          dfa->lex.laststart = false;
          /* For multibyte character sets, folding is done in atom.  Always
             return WCHAR.  */
          if (dfa->localeinfo.multibyte)
            return dfa->lex.lasttok = WCHAR;

          /* When folding case in a unibyte locale, a letter becomes a
             charclass holding all of its case variants.  */
          if (dfa->syntax.case_fold && isalpha (c))
            {
              charclass ccl;
              zeroset (&ccl);
              setbit_case_fold_c (c, &ccl);
              return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
            }

          return dfa->lex.lasttok = c;
        }
    }

  /* The above loop should consume at most a backslash
     and some other character.  */
  abort ();
  return END;                   /* keeps pedantic compilers happy.  */
}
1591
1592 static void
1593 addtok_mb (struct dfa *dfa, token t, char mbprop)
1594 {
1595   if (dfa->talloc == dfa->tindex)
1596     {
1597       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1598                              sizeof *dfa->tokens);
1599       if (dfa->localeinfo.multibyte)
1600         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1601                                          sizeof *dfa->multibyte_prop);
1602     }
1603   if (dfa->localeinfo.multibyte)
1604     dfa->multibyte_prop[dfa->tindex] = mbprop;
1605   dfa->tokens[dfa->tindex++] = t;
1606
1607   switch (t)
1608     {
1609     case QMARK:
1610     case STAR:
1611     case PLUS:
1612       break;
1613
1614     case CAT:
1615     case OR:
1616       dfa->parse.depth--;
1617       break;
1618
1619     case EMPTY:
1620       dfa->epsilon = true;
1621       goto increment_depth;
1622
1623     case BACKREF:
1624       dfa->fast = false;
1625       goto increment_nleaves;
1626
1627     case BEGLINE:
1628     case ENDLINE:
1629     case BEGWORD:
1630     case ENDWORD:
1631     case LIMWORD:
1632     case NOTLIMWORD:
1633       dfa->epsilon = true;
1634       FALLTHROUGH;
1635     default:
1636     increment_nleaves:
1637       dfa->nleaves++;
1638     increment_depth:
1639       dfa->parse.depth++;
1640       if (dfa->depth < dfa->parse.depth)
1641         dfa->depth = dfa->parse.depth;
1642       break;
1643     }
1644 }
1645
1646 static void addtok_wc (struct dfa *dfa, wint_t wc);
1647
1648 /* Add the given token to the parse tree, maintaining the depth count and
1649    updating the maximum depth if necessary.  */
1650 static void
1651 addtok (struct dfa *dfa, token t)
1652 {
1653   if (dfa->localeinfo.multibyte && t == MBCSET)
1654     {
1655       bool need_or = false;
1656
1657       /* Extract wide characters into alternations for better performance.
1658          This does not require UTF-8.  */
1659       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1660         {
1661           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1662           if (need_or)
1663             addtok (dfa, OR);
1664           need_or = true;
1665         }
1666       dfa->lex.brack.nchars = 0;
1667
1668       /* Wide characters have been handled above, so it is possible
1669          that the set is empty now.  Do nothing in that case.  */
1670       if (dfa->lex.brack.cset != -1)
1671         {
1672           addtok (dfa, CSET + dfa->lex.brack.cset);
1673           if (need_or)
1674             addtok (dfa, OR);
1675         }
1676     }
1677   else
1678     {
1679       addtok_mb (dfa, t, 3);
1680     }
1681 }
1682
1683 /* We treat a multibyte character as a single atom, so that DFA
1684    can treat a multibyte character as a single expression.
1685
1686    e.g., we construct the following tree from "<mb1><mb2>".
1687    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1688    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1689 static void
1690 addtok_wc (struct dfa *dfa, wint_t wc)
1691 {
1692   unsigned char buf[MB_LEN_MAX];
1693   mbstate_t s = { 0 };
1694   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1695   int buflen;
1696
1697   if (stored_bytes != (size_t) -1)
1698     buflen = stored_bytes;
1699   else
1700     {
1701       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1702          the addtok_mb call altogether can corrupt the heap.  */
1703       buflen = 1;
1704       buf[0] = 0;
1705     }
1706
1707   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1708   for (int i = 1; i < buflen; i++)
1709     {
1710       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1711       addtok (dfa, CAT);
1712     }
1713 }
1714
1715 static void
1716 add_utf8_anychar (struct dfa *dfa)
1717 {
1718   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1719      UTF-8 byte sequence has been defined as follows:
1720
1721      ([\x00-\x7f]
1722      |[\xc2-\xdf][\x80-\xbf]
1723      |[\xe0][\xa0-\xbf][\x80-\xbf]
1724      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1725      |[\xed][\x80-\x9f][\x80-\xbf]
1726      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1727      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1728      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1729
1730      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1731      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1732      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1733      H = [\x80-\x9f], I = [\xf0],
1734      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1735
1736      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1737
1738   /* Mnemonics for classes containing two or more bytes.  */
1739   enum { A, B, C, E, F, H, J, K, M };
1740
1741   /* Mnemonics for single-byte tokens.  */
1742   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1743
1744   static charclass const utf8_classes[] = {
1745     /* A. 00-7f: 1-byte sequence.  */
1746     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1747
1748     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1749     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1750
1751     /* C. 80-bf: non-leading bytes.  */
1752     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1753
1754     /* D. e0 (just a token).  */
1755
1756     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1757     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1758
1759     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1760     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1761
1762     /* G. ed (just a token).  */
1763
1764     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1765     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1766
1767     /* I. f0 (just a token).  */
1768
1769     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1770     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1771
1772     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1773     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1774
1775     /* L. f4 (just a token).  */
1776
1777     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1778     CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1779   };
1780
1781   /* Define the character classes that are needed below.  */
1782   if (dfa->utf8_anychar_classes[0] == 0)
1783     {
1784       charclass c = utf8_classes[0];
1785       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1786         clrbit ('\n', &c);
1787       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1788         clrbit ('\0', &c);
1789       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1790
1791       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1792         dfa->utf8_anychar_classes[i]
1793           = CSET + charclass_index (dfa, &utf8_classes[i]);
1794     }
1795
1796   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1797      The token buffer is in reverse Polish order, so we get
1798      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1799       C CAT OR C CAT OR C CAT OR".  */
1800   addtok (dfa, dfa->utf8_anychar_classes[A]);
1801   addtok (dfa, dfa->utf8_anychar_classes[B]);
1802   addtok (dfa, D_token);
1803   addtok (dfa, dfa->utf8_anychar_classes[E]);
1804   addtok (dfa, CAT);
1805   addtok (dfa, OR);
1806   addtok (dfa, G_token);
1807   addtok (dfa, dfa->utf8_anychar_classes[H]);
1808   addtok (dfa, CAT);
1809   addtok (dfa, OR);
1810   addtok (dfa, dfa->utf8_anychar_classes[F]);
1811   addtok (dfa, I_token);
1812   addtok (dfa, dfa->utf8_anychar_classes[J]);
1813   addtok (dfa, CAT);
1814   addtok (dfa, OR);
1815   addtok (dfa, L_token);
1816   addtok (dfa, dfa->utf8_anychar_classes[M]);
1817   addtok (dfa, CAT);
1818   addtok (dfa, OR);
1819   addtok (dfa, dfa->utf8_anychar_classes[K]);
1820   for (int i = 0; i < 3; i++)
1821     {
1822       addtok (dfa, dfa->utf8_anychar_classes[C]);
1823       addtok (dfa, CAT);
1824       addtok (dfa, OR);
1825     }
1826 }
1827
1828 /* The grammar understood by the parser is as follows.
1829
1830    regexp:
1831      regexp OR branch
1832      branch
1833
1834    branch:
1835      branch closure
1836      closure
1837
1838    closure:
1839      closure QMARK
1840      closure STAR
1841      closure PLUS
1842      closure REPMN
1843      atom
1844
1845    atom:
1846      <normal character>
1847      <multibyte character>
1848      ANYCHAR
1849      MBCSET
1850      CSET
1851      BACKREF
1852      BEGLINE
1853      ENDLINE
1854      BEGWORD
1855      ENDWORD
1856      LIMWORD
1857      NOTLIMWORD
1858      LPAREN regexp RPAREN
1859      <empty>
1860
1861    The parser builds a parse tree in postfix form in an array of tokens.  */
1862
1863 static void
1864 atom (struct dfa *dfa)
1865 {
1866   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1867       || dfa->parse.tok >= CSET
1868       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1869       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1870       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1871       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1872       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1873     {
1874       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1875         {
1876           /* For UTF-8 expand the period to a series of CSETs that define a
1877              valid UTF-8 character.  This avoids using the slow multibyte
1878              path.  I'm pretty sure it would be both profitable and correct to
1879              do it for any encoding; however, the optimization must be done
1880              manually as it is done above in add_utf8_anychar.  So, let's
1881              start with UTF-8: it is the most used, and the structure of the
1882              encoding makes the correctness more obvious.  */
1883           add_utf8_anychar (dfa);
1884         }
1885       else
1886         addtok (dfa, dfa->parse.tok);
1887       dfa->parse.tok = lex (dfa);
1888     }
1889   else if (dfa->parse.tok == WCHAR)
1890     {
1891       if (dfa->lex.wctok == WEOF)
1892         addtok (dfa, BACKREF);
1893       else
1894         {
1895           addtok_wc (dfa, dfa->lex.wctok);
1896
1897           if (dfa->syntax.case_fold)
1898             {
1899               wchar_t folded[CASE_FOLDED_BUFSIZE];
1900               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1901               for (int i = 0; i < n; i++)
1902                 {
1903                   addtok_wc (dfa, folded[i]);
1904                   addtok (dfa, OR);
1905                 }
1906             }
1907         }
1908
1909       dfa->parse.tok = lex (dfa);
1910     }
1911   else if (dfa->parse.tok == LPAREN)
1912     {
1913       dfa->parse.tok = lex (dfa);
1914       regexp (dfa);
1915       if (dfa->parse.tok != RPAREN)
1916         dfaerror (_("unbalanced ("));
1917       dfa->parse.tok = lex (dfa);
1918     }
1919   else
1920     addtok (dfa, EMPTY);
1921 }
1922
1923 /* Return the number of tokens in the given subexpression.  */
1924 static idx_t _GL_ATTRIBUTE_PURE
1925 nsubtoks (struct dfa const *dfa, idx_t tindex)
1926 {
1927   switch (dfa->tokens[tindex - 1])
1928     {
1929     default:
1930       return 1;
1931     case QMARK:
1932     case STAR:
1933     case PLUS:
1934       return 1 + nsubtoks (dfa, tindex - 1);
1935     case CAT:
1936     case OR:
1937       {
1938         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1939         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1940       }
1941     }
1942 }
1943
1944 /* Copy the given subexpression to the top of the tree.  */
1945 static void
1946 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1947 {
1948   if (dfa->localeinfo.multibyte)
1949     for (idx_t i = 0; i < ntokens; i++)
1950       addtok_mb (dfa, dfa->tokens[tindex + i],
1951                  dfa->multibyte_prop[tindex + i]);
1952   else
1953     for (idx_t i = 0; i < ntokens; i++)
1954       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1955 }
1956
1957 static void
1958 closure (struct dfa *dfa)
1959 {
1960   atom (dfa);
1961   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1962          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1963     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1964       {
1965         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1966         idx_t tindex = dfa->tindex - ntokens;
1967         if (dfa->lex.maxrep < 0)
1968           addtok (dfa, PLUS);
1969         if (dfa->lex.minrep == 0)
1970           addtok (dfa, QMARK);
1971         int i;
1972         for (i = 1; i < dfa->lex.minrep; i++)
1973           {
1974             copytoks (dfa, tindex, ntokens);
1975             addtok (dfa, CAT);
1976           }
1977         for (; i < dfa->lex.maxrep; i++)
1978           {
1979             copytoks (dfa, tindex, ntokens);
1980             addtok (dfa, QMARK);
1981             addtok (dfa, CAT);
1982           }
1983         dfa->parse.tok = lex (dfa);
1984       }
1985     else if (dfa->parse.tok == REPMN)
1986       {
1987         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1988         dfa->parse.tok = lex (dfa);
1989         closure (dfa);
1990       }
1991     else
1992       {
1993         addtok (dfa, dfa->parse.tok);
1994         dfa->parse.tok = lex (dfa);
1995       }
1996 }
1997
1998 static void
1999 branch (struct dfa* dfa)
2000 {
2001   closure (dfa);
2002   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2003          && dfa->parse.tok >= 0)
2004     {
2005       closure (dfa);
2006       addtok (dfa, CAT);
2007     }
2008 }
2009
2010 static void
2011 regexp (struct dfa *dfa)
2012 {
2013   branch (dfa);
2014   while (dfa->parse.tok == OR)
2015     {
2016       dfa->parse.tok = lex (dfa);
2017       branch (dfa);
2018       addtok (dfa, OR);
2019     }
2020 }
2021
2022 /* Parse a string S of length LEN into D.  S can include NUL characters.
2023    This is the main entry point for the parser.  */
2024 void
2025 dfaparse (char const *s, idx_t len, struct dfa *d)
2026 {
2027   d->lex.ptr = s;
2028   d->lex.left = len;
2029   d->lex.lasttok = END;
2030   d->lex.laststart = true;
2031
2032   if (!d->syntax.syntax_bits_set)
2033     dfaerror (_("no syntax specified"));
2034
2035   if (!d->nregexps)
2036     addtok (d, BEG);
2037
2038   d->parse.tok = lex (d);
2039   d->parse.depth = d->depth;
2040
2041   regexp (d);
2042
2043   if (d->parse.tok != END)
2044     dfaerror (_("unbalanced )"));
2045
2046   addtok (d, END - d->nregexps);
2047   addtok (d, CAT);
2048
2049   if (d->nregexps)
2050     addtok (d, OR);
2051
2052   ++d->nregexps;
2053 }
2054
2055 /* Some primitives for operating on sets of positions.  */
2056
2057 /* Copy one set to another.  */
2058 static void
2059 copy (position_set const *src, position_set *dst)
2060 {
2061   if (dst->alloc < src->nelem)
2062     {
2063       free (dst->elems);
2064       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2065                             sizeof *dst->elems);
2066     }
2067   dst->nelem = src->nelem;
2068   if (src->nelem != 0)
2069     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2070 }
2071
2072 static void
2073 alloc_position_set (position_set *s, idx_t size)
2074 {
2075   s->elems = xnmalloc (size, sizeof *s->elems);
2076   s->alloc = size;
2077   s->nelem = 0;
2078 }
2079
2080 /* Insert position P in set S.  S is maintained in sorted order on
2081    decreasing index.  If there is already an entry in S with P.index
2082    then merge (logically-OR) P's constraints into the one in S.
2083    S->elems must point to an array large enough to hold the resulting set.  */
2084 static void
2085 insert (position p, position_set *s)
2086 {
2087   idx_t count = s->nelem;
2088   idx_t lo = 0, hi = count;
2089   while (lo < hi)
2090     {
2091       idx_t mid = (lo + hi) >> 1;
2092       if (s->elems[mid].index < p.index)
2093         lo = mid + 1;
2094       else if (s->elems[mid].index == p.index)
2095         {
2096           s->elems[mid].constraint |= p.constraint;
2097           return;
2098         }
2099       else
2100         hi = mid;
2101     }
2102
2103   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2104   for (idx_t i = count; i > lo; i--)
2105     s->elems[i] = s->elems[i - 1];
2106   s->elems[lo] = p;
2107   ++s->nelem;
2108 }
2109
2110 static void
2111 append (position p, position_set *s)
2112 {
2113   idx_t count = s->nelem;
2114   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2115   s->elems[s->nelem++] = p;
2116 }
2117
2118 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2119    result is as if the positions of S1, and of S2 with the additional
2120    constraint C2, were inserted into an initially empty set.  */
2121 static void
2122 merge_constrained (position_set const *s1, position_set const *s2,
2123                    unsigned int c2, position_set *m)
2124 {
2125   idx_t i = 0, j = 0;
2126
2127   if (m->alloc - s1->nelem < s2->nelem)
2128     {
2129       free (m->elems);
2130       m->alloc = s1->nelem;
2131       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2132     }
2133   m->nelem = 0;
2134   while (i < s1->nelem || j < s2->nelem)
2135     if (! (j < s2->nelem)
2136         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2137       {
2138         unsigned int c = ((i < s1->nelem && j < s2->nelem
2139                            && s1->elems[i].index == s2->elems[j].index)
2140                           ? s2->elems[j++].constraint & c2
2141                           : 0);
2142         m->elems[m->nelem].index = s1->elems[i].index;
2143         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2144       }
2145     else
2146       {
2147         if (s2->elems[j].constraint & c2)
2148           {
2149             m->elems[m->nelem].index = s2->elems[j].index;
2150             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2151           }
2152         j++;
2153       }
2154 }
2155
2156 /* Merge two sets of positions into a third.  The result is exactly as if
2157    the positions of both sets were inserted into an initially empty set.  */
2158 static void
2159 merge (position_set const *s1, position_set const *s2, position_set *m)
2160 {
2161   merge_constrained (s1, s2, -1, m);
2162 }
2163
2164 /* Merge into DST all the elements of SRC, possibly destroying
2165    the contents of the temporary M.  */
2166 static void
2167 merge2 (position_set *dst, position_set const *src, position_set *m)
2168 {
2169   if (src->nelem < 4)
2170     {
2171       for (idx_t i = 0; i < src->nelem; i++)
2172         insert (src->elems[i], dst);
2173     }
2174    else
2175     {
2176       merge (src, dst, m);
2177       copy (m, dst);
2178     }
2179 }
2180
2181 /* Delete a position from a set.  Return the nonzero constraint of the
2182    deleted position, or zero if there was no such position.  */
2183 static unsigned int
2184 delete (idx_t del, position_set *s)
2185 {
2186   idx_t count = s->nelem;
2187   idx_t lo = 0, hi = count;
2188   while (lo < hi)
2189     {
2190       idx_t mid = (lo + hi) >> 1;
2191       if (s->elems[mid].index < del)
2192         lo = mid + 1;
2193       else if (s->elems[mid].index == del)
2194         {
2195           unsigned int c = s->elems[mid].constraint;
2196           idx_t i;
2197           for (i = mid; i + 1 < count; i++)
2198             s->elems[i] = s->elems[i + 1];
2199           s->nelem = i;
2200           return c;
2201         }
2202       else
2203         hi = mid;
2204     }
2205   return 0;
2206 }
2207
2208 /* Replace a position with the followed set.  */
2209 static void
2210 replace (position_set *dst, idx_t del, position_set *add,
2211          unsigned int constraint, position_set *tmp)
2212 {
2213   unsigned int c = delete (del, dst) & constraint;
2214
2215   if (c)
2216     {
2217       copy (dst, tmp);
2218       merge_constrained (tmp, add, c, dst);
2219     }
2220 }
2221
2222 /* Find the index of the state corresponding to the given position set with
2223    the given preceding context, or create a new state if there is no such
2224    state.  Context tells whether we got here on a newline or letter.  */
2225 static state_num
2226 state_index (struct dfa *d, position_set const *s, int context)
2227 {
2228   size_t hash = 0;
2229   int constraint = 0;
2230   state_num i;
2231
2232   for (i = 0; i < s->nelem; ++i)
2233     {
2234       size_t ind = s->elems[i].index;
2235       hash ^= ind + s->elems[i].constraint;
2236     }
2237
2238   /* Try to find a state that exactly matches the proposed one.  */
2239   for (i = 0; i < d->sindex; ++i)
2240     {
2241       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2242           || context != d->states[i].context)
2243         continue;
2244       state_num j;
2245       for (j = 0; j < s->nelem; ++j)
2246         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2247             || s->elems[j].index != d->states[i].elems.elems[j].index)
2248           break;
2249       if (j == s->nelem)
2250         return i;
2251     }
2252
2253 #ifdef DEBUG
2254   fprintf (stderr, "new state %td\n nextpos:", i);
2255   for (state_num j = 0; j < s->nelem; j++)
2256     {
2257       fprintf (stderr, " %td:", s->elems[j].index);
2258       prtok (d->tokens[s->elems[j].index]);
2259     }
2260   fprintf (stderr, "\n context:");
2261   if (context ^ CTX_ANY)
2262     {
2263       if (context & CTX_NONE)
2264         fprintf (stderr, " CTX_NONE");
2265       if (context & CTX_LETTER)
2266         fprintf (stderr, " CTX_LETTER");
2267       if (context & CTX_NEWLINE)
2268         fprintf (stderr, " CTX_NEWLINE");
2269     }
2270   else
2271     fprintf (stderr, " CTX_ANY");
2272   fprintf (stderr, "\n");
2273 #endif
2274
2275   for (state_num j = 0; j < s->nelem; j++)
2276     {
2277       int c = d->constraints[s->elems[j].index];
2278
2279       if (c != 0)
2280         {
2281           if (succeeds_in_context (c, context, CTX_ANY))
2282             constraint |= c;
2283         }
2284       else if (d->tokens[s->elems[j].index] == BACKREF)
2285         constraint = NO_CONSTRAINT;
2286     }
2287
2288
2289   /* Create a new state.  */
2290   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2291                              sizeof *d->states);
2292   d->states[i].hash = hash;
2293   alloc_position_set (&d->states[i].elems, s->nelem);
2294   copy (s, &d->states[i].elems);
2295   d->states[i].context = context;
2296   d->states[i].constraint = constraint;
2297   d->states[i].mbps.nelem = 0;
2298   d->states[i].mbps.elems = NULL;
2299   d->states[i].mb_trindex = -1;
2300
2301   ++d->sindex;
2302
2303   return i;
2304 }
2305
2306 /* Find the epsilon closure of D's set of positions.  If any position of the set
2307    contains a symbol that matches the empty string in some context, replace
2308    that position with the elements of its follow labeled with an appropriate
2309    constraint.  Repeat exhaustively until no funny positions are left.
2310    S->elems must be large enough to hold the result.  BACKWARD is D's
2311    backward set; use and update it too.  */
2312 static void
2313 epsclosure (struct dfa const *d, position_set *backward)
2314 {
2315   position_set tmp;
2316   alloc_position_set (&tmp, d->nleaves);
2317   for (idx_t i = 0; i < d->tindex; i++)
2318     if (0 < d->follows[i].nelem)
2319       {
2320         unsigned int constraint;
2321         switch (d->tokens[i])
2322           {
2323           default:
2324             continue;
2325
2326           case BEGLINE:
2327             constraint = BEGLINE_CONSTRAINT;
2328             break;
2329           case ENDLINE:
2330             constraint = ENDLINE_CONSTRAINT;
2331             break;
2332           case BEGWORD:
2333             constraint = BEGWORD_CONSTRAINT;
2334             break;
2335           case ENDWORD:
2336             constraint = ENDWORD_CONSTRAINT;
2337             break;
2338           case LIMWORD:
2339             constraint = LIMWORD_CONSTRAINT;
2340             break;
2341           case NOTLIMWORD:
2342             constraint = NOTLIMWORD_CONSTRAINT;
2343             break;
2344           case EMPTY:
2345             constraint = NO_CONSTRAINT;
2346             break;
2347           }
2348
2349         delete (i, &d->follows[i]);
2350
2351         for (idx_t j = 0; j < backward[i].nelem; j++)
2352           replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2353                    constraint, &tmp);
2354         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2355           replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2356                    NO_CONSTRAINT, &tmp);
2357       }
2358   free (tmp.elems);
2359 }
2360
2361 /* Returns the set of contexts for which there is at least one
2362    character included in C.  */
2363
2364 static int
2365 charclass_context (struct dfa const *dfa, charclass const *c)
2366 {
2367   int context = 0;
2368
2369   for (int j = 0; j < CHARCLASS_WORDS; j++)
2370     {
2371       if (c->w[j] & dfa->syntax.newline.w[j])
2372         context |= CTX_NEWLINE;
2373       if (c->w[j] & dfa->syntax.letters.w[j])
2374         context |= CTX_LETTER;
2375       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2376         context |= CTX_NONE;
2377     }
2378
2379   return context;
2380 }
2381
2382 /* Returns the contexts on which the position set S depends.  Each context
2383    in the set of returned contexts (let's call it SC) may have a different
2384    follow set than other contexts in SC, and also different from the
2385    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2386    in the complement set will have the same follow set.  */
2387
2388 static int _GL_ATTRIBUTE_PURE
2389 state_separate_contexts (struct dfa *d, position_set const *s)
2390 {
2391   int separate_contexts = 0;
2392
2393   for (idx_t j = 0; j < s->nelem; j++)
2394     separate_contexts |= d->separates[s->elems[j].index];
2395
2396   return separate_contexts;
2397 }
2398
/* Per-token flag bits kept in the FLAGS array that dfaoptimize builds
   and merge_nfa_state consults.  */
enum
{
  /* The token is in its own follow set, i.e., a single token is
     repeated.  Repeated and non-repeated tokens are never merged with
     each other.  */
  OPT_REPEAT = 1 << 0,

  /* The token is in the follow set of a later token: it is the head
     of a repeated run of several tokens.  Such a node is not
     merged.  */
  OPT_LPAREN = 1 << 1,

  /* Several branches join at the token.  Such a node is not
     merged.  */
  OPT_RPAREN = 1 << 2,

  /* The token has been seen once while walking the follow sets of
     earlier tokens.  Seeing it again turns OPT_RPAREN on.  */
  OPT_WALKED = 1 << 3,

  /* The token has been queued for merge_nfa_state, so it is not
     queued again.  */
  OPT_QUEUED = 1 << 4
};
2418
2419 static void
2420 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2421                  position_set *merged)
2422 {
2423   position_set *follows = d->follows;
2424   idx_t nelem = 0;
2425
2426   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2427     {
2428       idx_t sindex = follows[tindex].elems[i].index;
2429
2430       /* Skip the node as pruned in future.  */
2431       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2432       if (iconstraint == 0)
2433         continue;
2434
2435       if (d->tokens[follows[tindex].elems[i].index] <= END)
2436         {
2437           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2438           continue;
2439         }
2440
2441       if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2442         {
2443           idx_t j;
2444
2445           for (j = 0; j < nelem; j++)
2446             {
2447               idx_t dindex = follows[tindex].elems[j].index;
2448
2449               if (follows[tindex].elems[j].constraint != iconstraint)
2450                 continue;
2451
2452               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2453                 continue;
2454
2455               if (d->tokens[sindex] != d->tokens[dindex])
2456                 continue;
2457
2458               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2459                 continue;
2460
2461               if (flags[sindex] & OPT_REPEAT)
2462                 delete (sindex, &follows[sindex]);
2463
2464               merge2 (&follows[dindex], &follows[sindex], merged);
2465
2466               break;
2467             }
2468
2469           if (j < nelem)
2470             continue;
2471         }
2472
2473       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2474       flags[sindex] |= OPT_QUEUED;
2475     }
2476
2477   follows[tindex].nelem = nelem;
2478 }
2479
2480 static int
2481 compare (const void *a, const void *b)
2482 {
2483   position const *p = a, *q = b;
2484   return (p->index > q->index) - (p->index < q->index);
2485 }
2486
2487 static void
2488 reorder_tokens (struct dfa *d)
2489 {
2490   idx_t nleaves = 0;
2491   ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2492   map[0] = nleaves++;
2493   for (idx_t i = 1; i < d->tindex; i++)
2494     map[i] = -1;
2495
2496   token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2497   position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2498   int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2499   char *multibyte_prop = (d->localeinfo.multibyte
2500                           ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2501                           : NULL);
2502
2503   for (idx_t i = 0; i < d->tindex; i++)
2504     {
2505       if (map[i] < 0)
2506         {
2507           free (d->follows[i].elems);
2508           d->follows[i].elems = NULL;
2509           d->follows[i].nelem = 0;
2510           continue;
2511         }
2512
2513       tokens[map[i]] = d->tokens[i];
2514       follows[map[i]] = d->follows[i];
2515       constraints[map[i]] = d->constraints[i];
2516
2517       if (multibyte_prop != NULL)
2518         multibyte_prop[map[i]] = d->multibyte_prop[i];
2519
2520       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2521         {
2522           if (map[d->follows[i].elems[j].index] == -1)
2523             map[d->follows[i].elems[j].index] = nleaves++;
2524
2525           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2526         }
2527
2528       qsort (d->follows[i].elems, d->follows[i].nelem,
2529              sizeof *d->follows[i].elems, compare);
2530     }
2531
2532   for (idx_t i = 0; i < nleaves; i++)
2533     {
2534       d->tokens[i] = tokens[i];
2535       d->follows[i] = follows[i];
2536       d->constraints[i] = constraints[i];
2537
2538       if (multibyte_prop != NULL)
2539         d->multibyte_prop[i] = multibyte_prop[i];
2540     }
2541
2542   d->tindex = d->nleaves = nleaves;
2543
2544   free (tokens);
2545   free (follows);
2546   free (constraints);
2547   free (multibyte_prop);
2548   free (map);
2549 }
2550
2551 static void
2552 dfaoptimize (struct dfa *d)
2553 {
2554   char *flags = xzalloc (d->tindex);
2555
2556   for (idx_t i = 0; i < d->tindex; i++)
2557     {
2558       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2559         {
2560           if (d->follows[i].elems[j].index == i)
2561             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2562           else if (d->follows[i].elems[j].index < i)
2563             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2564           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2565             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2566           else
2567             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2568         }
2569     }
2570
2571   flags[0] |= OPT_QUEUED;
2572
2573   position_set merged0;
2574   position_set *merged = &merged0;
2575   alloc_position_set (merged, d->nleaves);
2576
2577   d->constraints = xcalloc (d->tindex, sizeof *d->constraints);
2578
2579   for (idx_t i = 0; i < d->tindex; i++)
2580     if (flags[i] & OPT_QUEUED)
2581       merge_nfa_state (d, i, flags, merged);
2582
2583   reorder_tokens (d);
2584
2585   free (merged->elems);
2586   free (flags);
2587 }
2588
2589 /* Perform bottom-up analysis on the parse tree, computing various functions.
2590    Note that at this point, we're pretending constructs like \< are real
2591    characters rather than constraints on what can follow them.
2592
2593    Nullable:  A node is nullable if it is at the root of a regexp that can
2594    match the empty string.
2595    *  EMPTY leaves are nullable.
2596    * No other leaf is nullable.
2597    * A QMARK or STAR node is nullable.
2598    * A PLUS node is nullable if its argument is nullable.
2599    * A CAT node is nullable if both its arguments are nullable.
2600    * An OR node is nullable if either argument is nullable.
2601
2602    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2603    that could correspond to the first character of a string matching the
2604    regexp rooted at the given node.
2605    * EMPTY leaves have empty firstpos.
2606    * The firstpos of a nonempty leaf is that leaf itself.
2607    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2608      argument.
2609    * The firstpos of a CAT node is the firstpos of the left argument, union
2610      the firstpos of the right if the left argument is nullable.
2611    * The firstpos of an OR node is the union of firstpos of each argument.
2612
2613    Lastpos:  The lastpos of a node is the set of positions that could
2614    correspond to the last character of a string matching the regexp at
2615    the given node.
2616    * EMPTY leaves have empty lastpos.
2617    * The lastpos of a nonempty leaf is that leaf itself.
2618    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2619      argument.
2620    * The lastpos of a CAT node is the lastpos of its right argument, union
2621      the lastpos of the left if the right argument is nullable.
2622    * The lastpos of an OR node is the union of the lastpos of each argument.
2623
2624    Follow:  The follow of a position is the set of positions that could
2625    correspond to the character following a character matching the node in
2626    a string matching the regexp.  At this point we consider special symbols
2627    that match the empty string in some context to be just normal characters.
2628    Later, if we find that a special symbol is in a follow set, we will
2629    replace it with the elements of its follow, labeled with an appropriate
2630    constraint.
2631    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2632      the follow of every node in the lastpos.
2633    * Every node in the firstpos of the second argument of a CAT node is in
2634      the follow of every node in the lastpos of the first argument.
2635
2636    Because of the postfix representation of the parse tree, the depth-first
2637    analysis is conveniently done by a linear scan with the aid of a stack.
2638    Sets are stored as arrays of the elements, obeying a stack-like allocation
2639    scheme; the number of elements in each set deeper in the stack can be
2640    used to determine the address of a particular set's array.  */
2641 static void
2642 dfaanalyze (struct dfa *d, bool searchflag)
2643 {
2644   /* Array allocated to hold position sets.  */
2645   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2646   /* Firstpos and lastpos elements.  */
2647   position *firstpos = posalloc;
2648   position *lastpos = firstpos + d->nleaves;
2649   position pos;
2650   position_set tmp;
2651
2652   /* Stack for element counts and nullable flags.  */
2653   struct
2654   {
2655     /* Whether the entry is nullable.  */
2656     bool nullable;
2657
2658     /* Counts of firstpos and lastpos sets.  */
2659     idx_t nfirstpos;
2660     idx_t nlastpos;
2661   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2662
2663   position_set merged;          /* Result of merging sets.  */
2664
2665   addtok (d, CAT);
2666   idx_t tindex = d->tindex;
2667
2668 #ifdef DEBUG
2669   fprintf (stderr, "dfaanalyze:\n");
2670   for (idx_t i = 0; i < tindex; i++)
2671     {
2672       fprintf (stderr, " %td:", i);
2673       prtok (d->tokens[i]);
2674     }
2675   putc ('\n', stderr);
2676 #endif
2677
2678   d->searchflag = searchflag;
2679   alloc_position_set (&merged, d->nleaves);
2680   d->follows = xcalloc (tindex, sizeof *d->follows);
2681   position_set *backward
2682     = d->epsilon ? xcalloc (tindex, sizeof *backward) : NULL;
2683
2684   for (idx_t i = 0; i < tindex; i++)
2685     {
2686       switch (d->tokens[i])
2687         {
2688         case EMPTY:
2689           /* The empty set is nullable.  */
2690           stk->nullable = true;
2691
2692           /* The firstpos and lastpos of the empty leaf are both empty.  */
2693           stk->nfirstpos = stk->nlastpos = 0;
2694           stk++;
2695           break;
2696
2697         case STAR:
2698         case PLUS:
2699           /* Every element in the lastpos of the argument is in the backward
2700              set of every element in the firstpos.  */
2701           if (d->epsilon)
2702             {
2703               tmp.elems = lastpos - stk[-1].nlastpos;
2704               tmp.nelem = stk[-1].nlastpos;
2705               for (position *p = firstpos - stk[-1].nfirstpos;
2706                    p < firstpos; p++)
2707                 merge2 (&backward[p->index], &tmp, &merged);
2708             }
2709
2710           /* Every element in the firstpos of the argument is in the follow
2711              of every element in the lastpos.  */
2712           {
2713             tmp.elems = firstpos - stk[-1].nfirstpos;
2714             tmp.nelem = stk[-1].nfirstpos;
2715             for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2716               merge2 (&d->follows[p->index], &tmp, &merged);
2717           }
2718           FALLTHROUGH;
2719         case QMARK:
2720           /* A QMARK or STAR node is automatically nullable.  */
2721           if (d->tokens[i] != PLUS)
2722             stk[-1].nullable = true;
2723           break;
2724
2725         case CAT:
2726           /* Every element in the lastpos of the first argument is in
2727              the backward set of every element in the firstpos of the
2728              second argument.  */
2729           if (backward)
2730             {
2731               tmp.nelem = stk[-2].nlastpos;
2732               tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2733               for (position *p = firstpos - stk[-1].nfirstpos;
2734                    p < firstpos; p++)
2735                 merge2 (&backward[p->index], &tmp, &merged);
2736             }
2737
2738           /* Every element in the firstpos of the second argument is in the
2739              follow of every element in the lastpos of the first argument.  */
2740           {
2741             tmp.nelem = stk[-1].nfirstpos;
2742             tmp.elems = firstpos - stk[-1].nfirstpos;
2743             for (position *plim = lastpos - stk[-1].nlastpos,
2744                    *p = plim - stk[-2].nlastpos;
2745                  p < plim; p++)
2746               merge2 (&d->follows[p->index], &tmp, &merged);
2747           }
2748
2749           /* The firstpos of a CAT node is the firstpos of the first argument,
2750              union that of the second argument if the first is nullable.  */
2751           if (stk[-2].nullable)
2752             stk[-2].nfirstpos += stk[-1].nfirstpos;
2753           else
2754             firstpos -= stk[-1].nfirstpos;
2755
2756           /* The lastpos of a CAT node is the lastpos of the second argument,
2757              union that of the first argument if the second is nullable.  */
2758           if (stk[-1].nullable)
2759             stk[-2].nlastpos += stk[-1].nlastpos;
2760           else
2761             {
2762               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2763               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2764                 p[j] = p[j + stk[-2].nlastpos];
2765               lastpos -= stk[-2].nlastpos;
2766               stk[-2].nlastpos = stk[-1].nlastpos;
2767             }
2768
2769           /* A CAT node is nullable if both arguments are nullable.  */
2770           stk[-2].nullable &= stk[-1].nullable;
2771           stk--;
2772           break;
2773
2774         case OR:
2775           /* The firstpos is the union of the firstpos of each argument.  */
2776           stk[-2].nfirstpos += stk[-1].nfirstpos;
2777
2778           /* The lastpos is the union of the lastpos of each argument.  */
2779           stk[-2].nlastpos += stk[-1].nlastpos;
2780
2781           /* An OR node is nullable if either argument is nullable.  */
2782           stk[-2].nullable |= stk[-1].nullable;
2783           stk--;
2784           break;
2785
2786         default:
2787           /* Anything else is a nonempty position.  (Note that special
2788              constructs like \< are treated as nonempty strings here;
2789              an "epsilon closure" effectively makes them nullable later.
2790              Backreferences have to get a real position so we can detect
2791              transitions on them later.  But they are nullable.  */
2792           stk->nullable = d->tokens[i] == BACKREF;
2793
2794           /* This position is in its own firstpos and lastpos.  */
2795           stk->nfirstpos = stk->nlastpos = 1;
2796           stk++;
2797
2798           firstpos->index = lastpos->index = i;
2799           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2800           firstpos++, lastpos++;
2801
2802           break;
2803         }
2804 #ifdef DEBUG
2805       /* ... balance the above nonsyntactic #ifdef goo...  */
2806       fprintf (stderr, "node %td:", i);
2807       prtok (d->tokens[i]);
2808       putc ('\n', stderr);
2809       fprintf (stderr,
2810                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2811       fprintf (stderr, " firstpos:");
2812       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2813         {
2814           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2815           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2816         }
2817       fprintf (stderr, "\n lastpos:");
2818       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2819         {
2820           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2821           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2822         }
2823       putc ('\n', stderr);
2824 #endif
2825     }
2826
2827   if (backward)
2828     {
2829       /* For each follow set that is the follow set of a real position,
2830          replace it with its epsilon closure.  */
2831       epsclosure (d, backward);
2832
2833       for (idx_t i = 0; i < tindex; i++)
2834         free (backward[i].elems);
2835       free (backward);
2836     }
2837
2838   dfaoptimize (d);
2839
2840 #ifdef DEBUG
2841   for (idx_t i = 0; i < tindex; i++)
2842     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2843         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2844         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2845       {
2846         fprintf (stderr, "follows(%td:", i);
2847         prtok (d->tokens[i]);
2848         fprintf (stderr, "):");
2849         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2850           {
2851             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2852             prtok (d->tokens[d->follows[i].elems[j].index]);
2853           }
2854         putc ('\n', stderr);
2855       }
2856 #endif
2857
2858   pos.index = 0;
2859   pos.constraint = NO_CONSTRAINT;
2860
2861   alloc_position_set (&tmp, 1);
2862
2863   append (pos, &tmp);
2864
2865   d->separates = xcalloc (tindex, sizeof *d->separates);
2866
2867   for (idx_t i = 0; i < tindex; i++)
2868     {
2869       if (prev_newline_dependent (d->constraints[i]))
2870         d->separates[i] |= CTX_NEWLINE;
2871       if (prev_letter_dependent (d->constraints[i]))
2872         d->separates[i] |= CTX_LETTER;
2873
2874       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2875         {
2876           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2877             d->separates[i] |= CTX_NEWLINE;
2878           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2879             d->separates[i] |= CTX_LETTER;
2880         }
2881     }
2882
2883   /* Context wanted by some position.  */
2884   int separate_contexts = state_separate_contexts (d, &tmp);
2885
2886   /* Build the initial state.  */
2887   if (separate_contexts & CTX_NEWLINE)
2888     state_index (d, &tmp, CTX_NEWLINE);
2889   d->initstate_notbol = d->min_trcount
2890     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2891   if (separate_contexts & CTX_LETTER)
2892     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2893   d->min_trcount++;
2894   d->trcount = 0;
2895
2896   free (posalloc);
2897   free (stkalloc);
2898   free (merged.elems);
2899   free (tmp.elems);
2900 }
2901
2902 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2903 static void
2904 realloc_trans_if_necessary (struct dfa *d)
2905 {
2906   state_num oldalloc = d->tralloc;
2907   if (oldalloc < d->sindex)
2908     {
2909       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2910       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2911       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2912                            -1, sizeof *realtrans);
2913       realtrans[0] = realtrans[1] = NULL;
2914       d->trans = realtrans + 2;
2915       idx_t newalloc = d->tralloc = newalloc1 - 2;
2916       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2917       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2918       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2919       if (d->localeinfo.multibyte)
2920         {
2921           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2922           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2923           if (oldalloc == 0)
2924             realtrans[0] = realtrans[1] = NULL;
2925           d->mb_trans = realtrans + 2;
2926         }
2927       for (; oldalloc < newalloc; oldalloc++)
2928         {
2929           d->trans[oldalloc] = NULL;
2930           d->fails[oldalloc] = NULL;
2931           if (d->localeinfo.multibyte)
2932             d->mb_trans[oldalloc] = NULL;
2933         }
2934     }
2935 }
2936
2937 /*
2938    Calculate the transition table for a new state derived from state s
2939    for a compiled dfa d after input character uc, and return the new
2940    state number.
2941
2942    Do not worry about all possible input characters; calculate just the group
2943    of positions that match uc.  Label it with the set of characters that
2944    every position in the group matches (taking into account, if necessary,
2945    preceding context information of s).  Then find the union
2946    of these positions' follows, i.e., the set of positions of the
2947    new state.  For each character in the group's label, set the transition
2948    on this character to be to a state corresponding to the set's positions,
2949    and its associated backward context information, if necessary.
2950
2951    When building a searching matcher, include the positions of state
2952    0 in every state.
2953
2954    The group is constructed by building an equivalence-class
2955    partition of the positions of s.
2956
2957    For each position, find the set of characters C that it matches.  Eliminate
2958    any characters from C that fail on grounds of backward context.
2959
2960    Check whether the group's label L has nonempty
2961    intersection with C.  If L - C is nonempty, create a new group labeled
2962    L - C and having the same positions as the current group, and set L to
2963    the intersection of L and C.  Insert the position in the group, set
2964    C = C - L, and resume scanning.
2965
2966    If after comparing with every group there are characters remaining in C,
2967    create a new group labeled with the characters of C and insert this
2968    position in that group.  */
2969
2970 static state_num
2971 build_state (state_num s, struct dfa *d, unsigned char uc)
2972 {
2973   position_set follows;         /* Union of the follows for each
2974                                    position of the current state.  */
2975   position_set group;           /* Positions that match the input char.  */
2976   position_set tmp;             /* Temporary space for merging sets.  */
2977   state_num state;              /* New state.  */
2978   state_num state_newline;      /* New state on a newline transition.  */
2979   state_num state_letter;       /* New state on a letter transition.  */
2980
2981 #ifdef DEBUG
2982   fprintf (stderr, "build state %td\n", s);
2983 #endif
2984
2985   /* A pointer to the new transition table, and the table itself.  */
2986   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2987   state_num *trans = *ptrans;
2988
2989   if (!trans)
2990     {
2991       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2992          transition tables that can exist at once, other than for
2993          initial states.  Often-used transition tables are quickly
2994          rebuilt, whereas rarely-used ones are cleared away.  */
2995       if (MAX_TRCOUNT <= d->trcount)
2996         {
2997           for (state_num i = d->min_trcount; i < d->tralloc; i++)
2998             {
2999               free (d->trans[i]);
3000               free (d->fails[i]);
3001               d->trans[i] = d->fails[i] = NULL;
3002             }
3003           d->trcount = 0;
3004         }
3005
3006       d->trcount++;
3007       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3008
3009       /* Fill transition table with a default value which means that the
3010          transited state has not been calculated yet.  */
3011       for (int i = 0; i < NOTCHAR; i++)
3012         trans[i] = -2;
3013     }
3014
3015   /* Set up the success bits for this state.  */
3016   d->success[s] = 0;
3017   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3018     d->success[s] |= CTX_NEWLINE;
3019   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3020     d->success[s] |= CTX_LETTER;
3021   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3022     d->success[s] |= CTX_NONE;
3023
3024   alloc_position_set (&follows, d->nleaves);
3025
3026   /* Find the union of the follows of the positions of the group.
3027      This is a hideously inefficient loop.  Fix it someday.  */
3028   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3029     for (idx_t k = 0;
3030          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3031       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3032               &follows);
3033
3034   /* Positions that match the input char.  */
3035   alloc_position_set (&group, d->nleaves);
3036
3037   /* The group's label.  */
3038   charclass label;
3039   fillset (&label);
3040
3041   for (idx_t i = 0; i < follows.nelem; i++)
3042     {
3043       charclass matches;            /* Set of matching characters.  */
3044       position pos = follows.elems[i];
3045       bool matched = false;
3046       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3047         {
3048           zeroset (&matches);
3049           setbit (d->tokens[pos.index], &matches);
3050           if (d->tokens[pos.index] == uc)
3051             matched = true;
3052         }
3053       else if (d->tokens[pos.index] >= CSET)
3054         {
3055           matches = d->charclasses[d->tokens[pos.index] - CSET];
3056           if (tstbit (uc, &matches))
3057             matched = true;
3058         }
3059       else if (d->tokens[pos.index] == ANYCHAR)
3060         {
3061           matches = d->charclasses[d->canychar];
3062           if (tstbit (uc, &matches))
3063             matched = true;
3064
3065           /* ANYCHAR must match with a single character, so we must put
3066              it to D->states[s].mbps which contains the positions which
3067              can match with a single character not a byte.  If all
3068              positions which has ANYCHAR does not depend on context of
3069              next character, we put the follows instead of it to
3070              D->states[s].mbps to optimize.  */
3071           if (succeeds_in_context (pos.constraint, d->states[s].context,
3072                                    CTX_NONE))
3073             {
3074               if (d->states[s].mbps.nelem == 0)
3075                 alloc_position_set (&d->states[s].mbps, 1);
3076               insert (pos, &d->states[s].mbps);
3077             }
3078         }
3079       else
3080         continue;
3081
3082       /* Some characters may need to be eliminated from matches because
3083          they fail in the current context.  */
3084       if (pos.constraint != NO_CONSTRAINT)
3085         {
3086           if (!succeeds_in_context (pos.constraint,
3087                                     d->states[s].context, CTX_NEWLINE))
3088             for (int j = 0; j < CHARCLASS_WORDS; j++)
3089               matches.w[j] &= ~d->syntax.newline.w[j];
3090           if (!succeeds_in_context (pos.constraint,
3091                                     d->states[s].context, CTX_LETTER))
3092             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3093               matches.w[j] &= ~d->syntax.letters.w[j];
3094           if (!succeeds_in_context (pos.constraint,
3095                                     d->states[s].context, CTX_NONE))
3096             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3097               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3098
3099           /* If there are no characters left, there's no point in going on.  */
3100           if (emptyset (&matches))
3101             continue;
3102
3103           /* If we have reset the bit that made us declare "matched", reset
3104              that indicator, too.  This is required to avoid an infinite loop
3105              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3106           if (!tstbit (uc, &matches))
3107             matched = false;
3108         }
3109
3110 #ifdef DEBUG
3111       fprintf (stderr, " nextpos %td:", pos.index);
3112       prtok (d->tokens[pos.index]);
3113       fprintf (stderr, " of");
3114       for (unsigned j = 0; j < NOTCHAR; j++)
3115         if (tstbit (j, &matches))
3116           fprintf (stderr, " 0x%02x", j);
3117       fprintf (stderr, "\n");
3118 #endif
3119
3120       if (matched)
3121         {
3122           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3123             label.w[k] &= matches.w[k];
3124           append (pos, &group);
3125         }
3126       else
3127         {
3128           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3129             label.w[k] &= ~matches.w[k];
3130         }
3131     }
3132
3133   alloc_position_set (&tmp, d->nleaves);
3134
3135   if (group.nelem > 0)
3136     {
3137       /* If we are building a searching matcher, throw in the positions
3138          of state 0 as well, if possible.  */
3139       if (d->searchflag)
3140         {
3141           /* If a token in follows.elems is not 1st byte of a multibyte
3142              character, or the states of follows must accept the bytes
3143              which are not 1st byte of the multibyte character.
3144              Then, if a state of follows encounters a byte, it must not be
3145              a 1st byte of a multibyte character nor a single byte character.
3146              In this case, do not add state[0].follows to next state, because
3147              state[0] must accept 1st-byte.
3148
3149              For example, suppose <sb a> is a certain single byte character,
3150              <mb A> is a certain multibyte character, and the codepoint of
3151              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3152              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3153              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3154              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3155              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3156              not add state[0].  */
3157
3158           bool mergeit = !d->localeinfo.multibyte;
3159           if (!mergeit)
3160             {
3161               mergeit = true;
3162               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3163                 mergeit &= d->multibyte_prop[group.elems[j].index];
3164             }
3165           if (mergeit)
3166             merge2 (&group, &d->states[0].elems, &tmp);
3167         }
3168
3169       /* Find out if the new state will want any context information,
3170          by calculating possible contexts that the group can match,
3171          and separate contexts that the new state wants to know.  */
3172       int possible_contexts = charclass_context (d, &label);
3173       int separate_contexts = state_separate_contexts (d, &group);
3174
3175       /* Find the state(s) corresponding to the union of the follows.  */
3176       if (possible_contexts & ~separate_contexts)
3177         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3178       else
3179         state = -1;
3180       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3181         state_newline = state_index (d, &group, CTX_NEWLINE);
3182       else
3183         state_newline = state;
3184       if (separate_contexts & possible_contexts & CTX_LETTER)
3185         state_letter = state_index (d, &group, CTX_LETTER);
3186       else
3187         state_letter = state;
3188
3189       /* Reallocate now, to reallocate any newline transition properly.  */
3190       realloc_trans_if_necessary (d);
3191     }
3192
3193   /* If we are a searching matcher, the default transition is to a state
3194      containing the positions of state 0, otherwise the default transition
3195      is to fail miserably.  */
3196   else if (d->searchflag)
3197     {
3198       state_newline = 0;
3199       state_letter = d->min_trcount - 1;
3200       state = d->initstate_notbol;
3201     }
3202   else
3203     {
3204       state_newline = -1;
3205       state_letter = -1;
3206       state = -1;
3207     }
3208
3209   /* Set the transitions for each character in the label.  */
3210   for (int i = 0; i < NOTCHAR; i++)
3211     if (tstbit (i, &label))
3212       switch (d->syntax.sbit[i])
3213         {
3214         case CTX_NEWLINE:
3215           trans[i] = state_newline;
3216           break;
3217         case CTX_LETTER:
3218           trans[i] = state_letter;
3219           break;
3220         default:
3221           trans[i] = state;
3222           break;
3223         }
3224
3225 #ifdef DEBUG
3226   fprintf (stderr, "trans table %td", s);
3227   for (int i = 0; i < NOTCHAR; ++i)
3228     {
3229       if (!(i & 0xf))
3230         fprintf (stderr, "\n");
3231       fprintf (stderr, " %2td", trans[i]);
3232     }
3233   fprintf (stderr, "\n");
3234 #endif
3235
3236   free (group.elems);
3237   free (follows.elems);
3238   free (tmp.elems);
3239
3240   /* Keep the newline transition in a special place so we can use it as
3241      a sentinel.  */
3242   if (tstbit (d->syntax.eolbyte, &label))
3243     {
3244       d->newlines[s] = trans[d->syntax.eolbyte];
3245       trans[d->syntax.eolbyte] = -1;
3246     }
3247
3248   return trans[uc];
3249 }
3250
3251 /* Multibyte character handling sub-routines for dfaexec.  */
3252
3253 /* Consume a single byte and transit state from 's' to '*next_state'.
3254    This function is almost same as the state transition routin in dfaexec.
3255    But state transition is done just once, otherwise matching succeed or
3256    reach the end of the buffer.  */
3257 static state_num
3258 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3259 {
3260   state_num *t;
3261
3262   if (d->trans[s])
3263     t = d->trans[s];
3264   else if (d->fails[s])
3265     t = d->fails[s];
3266   else
3267     {
3268       build_state (s, d, **pp);
3269       if (d->trans[s])
3270         t = d->trans[s];
3271       else
3272         {
3273           t = d->fails[s];
3274           assert (t);
3275         }
3276     }
3277
3278   if (t[**pp] == -2)
3279     build_state (s, d, **pp);
3280
3281   return t[*(*pp)++];
3282 }
3283
3284 /* Transit state from s, then return new state and update the pointer of
3285    the buffer.  This function is for a period operator which can match a
3286    multi-byte character.  */
3287 static state_num
3288 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3289                unsigned char const *end)
3290 {
3291   wint_t wc;
3292
3293   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3294
3295   /* This state has some operators which can match a multibyte character.  */
3296   d->mb_follows.nelem = 0;
3297
3298   /* Calculate the state which can be reached from the state 's' by
3299      consuming 'mbclen' single bytes from the buffer.  */
3300   state_num s1 = s;
3301   int mbci;
3302   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3303     s = transit_state_singlebyte (d, s, pp);
3304   *pp += mbclen - mbci;
3305
3306   if (wc == WEOF)
3307     {
3308       /* It is an invalid character, so ANYCHAR is not accepted.  */
3309       return s;
3310     }
3311
3312   /* If all positions which have ANYCHAR do not depend on the context
3313      of the next character, calculate the next state with
3314      pre-calculated follows and cache the result.  */
3315   if (d->states[s1].mb_trindex < 0)
3316     {
3317       if (MAX_TRCOUNT <= d->mb_trcount)
3318         {
3319           state_num s3;
3320           for (s3 = -1; s3 < d->tralloc; s3++)
3321             {
3322               free (d->mb_trans[s3]);
3323               d->mb_trans[s3] = NULL;
3324             }
3325
3326           for (state_num i = 0; i < d->sindex; i++)
3327             d->states[i].mb_trindex = -1;
3328           d->mb_trcount = 0;
3329         }
3330       d->states[s1].mb_trindex = d->mb_trcount++;
3331     }
3332
3333   if (! d->mb_trans[s])
3334     {
3335       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3336       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3337       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3338       for (int i = 0; i < MAX_TRCOUNT; i++)
3339         d->mb_trans[s][i] = -1;
3340     }
3341   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3342     return d->mb_trans[s][d->states[s1].mb_trindex];
3343
3344   if (s == -1)
3345     copy (&d->states[s1].mbps, &d->mb_follows);
3346   else
3347     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3348
3349   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3350   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3351   realloc_trans_if_necessary (d);
3352
3353   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3354
3355   return s2;
3356 }
3357
3358 /* The initial state may encounter a byte which is not a single byte character
3359    nor the first byte of a multibyte character.  But it is incorrect for the
3360    initial state to accept such a byte.  For example, in Shift JIS the regular
3361    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3362    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3363    that are not a single byte character nor the first byte of a multibyte
3364    character.
3365
3366    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3367    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3368    the result is greater than P, set *WCP to the final wide character
3369    processed, or to WEOF if no wide character is processed.  Otherwise,
3370    if WCP is non-NULL, *WCP may or may not be updated.
3371
3372    Both P and MBP must be no larger than END.  */
3373 static unsigned char const *
3374 skip_remains_mb (struct dfa *d, unsigned char const *p,
3375                  unsigned char const *mbp, char const *end)
3376 {
3377   if (d->syntax.never_trail[*p])
3378     return p;
3379   while (mbp < p)
3380     {
3381       wint_t wc;
3382       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3383                            end - (char const *) mbp, d);
3384     }
3385   return mbp;
3386 }
3387
3388 /* Search through a buffer looking for a match to the struct dfa *D.
3389    Find the first occurrence of a string matching the regexp in the
3390    buffer, and the shortest possible version thereof.  Return a pointer to
3391    the first character after the match, or NULL if none is found.  BEGIN
3392    points to the beginning of the buffer, and END points to the first byte
3393    after its end.  Note however that we store a sentinel byte (usually
3394    newline) in *END, so the actual buffer must be one byte longer.
3395    When ALLOW_NL, newlines may appear in the matching string.
3396    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3397    If MULTIBYTE, the input consists of multibyte characters and/or
3398    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3399    Here is the list of features that make this DFA matcher punt:
3400     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3401     - [^...] in non-simple locale
3402     - [[=foo=]] or [[.foo.]]
3403     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3404     - back-reference: (.)\1
3405     - word-delimiter in multibyte locale: \<, \>, \b, \B
3406    See struct localeinfo.simple for the definition of "simple locale".  */
3407
static inline char *
dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
              ptrdiff_t *count, bool multibyte)
{
  /* Once the number of states reaches MAX_TRCOUNT, discard every state
     from min_trcount on, together with all cached transition tables,
     so that they get rebuilt on demand by the loop below.  */
  if (MAX_TRCOUNT <= d->sindex)
    {
      for (state_num s = d->min_trcount; s < d->sindex; s++)
        {
          free (d->states[s].elems.elems);
          free (d->states[s].mbps.elems);
        }
      d->sindex = d->min_trcount;

      if (d->trans)
        {
          for (state_num s = 0; s < d->tralloc; s++)
            {
              free (d->trans[s]);
              free (d->fails[s]);
              d->trans[s] = d->fails[s] = NULL;
            }
          d->trcount = 0;
        }

      if (d->localeinfo.multibyte && d->mb_trans)
        {
          /* This loop starts at index -1.  NOTE(review): free_mbdata
             releases the array through mb_trans - 2, so mb_trans
             presumably points two slots into its allocation.  */
          for (state_num s = -1; s < d->tralloc; s++)
            {
              free (d->mb_trans[s]);
              d->mb_trans[s] = NULL;
            }
          for (state_num s = 0; s < d->min_trcount; s++)
            d->states[s].mb_trindex = -1;
          d->mb_trcount = 0;
        }
    }

  /* No transition-table slots exist yet (tralloc is zero).  */
  if (!d->tralloc)
    realloc_trans_if_necessary (d);

  /* Current state.  S1 holds the state that S was reached from; it is
     consulted below when S turns out to be negative.  */
  state_num s = 0, s1 = 0;

  /* Current input character.  MBP is the position handed to
     skip_remains_mb as the point from which to rescan.  */
  unsigned char const *p = (unsigned char const *) begin;
  unsigned char const *mbp = p;

  /* Copy of d->trans so it can be optimized into a register.  */
  state_num **trans = d->trans;
  unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
  /* Plant the end-of-line byte at *END as a sentinel, remembering the
     byte it replaces so that it can be restored at "done".  */
  unsigned char saved_end = *(unsigned char *) end;
  *end = eol;

  if (multibyte)
    {
      /* Clear the multibyte conversion state kept in D, and make sure
         the mb_follows position set has storage.  */
      memset (&d->mbs, 0, sizeof d->mbs);
      if (d->mb_follows.alloc == 0)
        alloc_position_set (&d->mb_follows, d->nleaves);
    }

  /* Number of newlines processed; added to *COUNT at "done".  */
  idx_t nlcount = 0;
  for (;;)
    {
      state_num *t;
      /* Inner loop: run for as long as the current state has a
         transition table in TRANS.  */
      while ((t = trans[s]) != NULL)
        {
          if (s < d->min_trcount)
            {
              /* Skip input bytes on which this state transitions to
                 itself.  */
              if (!multibyte || d->states[s].mbps.nelem == 0)
                {
                  while (t[*p] == s)
                    p++;
                }
              if (multibyte)
                p = mbp = skip_remains_mb (d, p, mbp, end);
            }

          if (multibyte)
            {
              s1 = s;

              if (d->states[s].mbps.nelem == 0
                  || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
                {
                  /* If an input character does not match ANYCHAR, do it
                     like a single-byte character.  */
                  s = t[*p++];
                }
              else
                {
                  s = transit_state (d, s, &p, (unsigned char *) end);
                  mbp = p;
                  /* Reload the cached pointer after transit_state;
                     presumably d->trans may have been reallocated.  */
                  trans = d->trans;
                }
            }
          else
            {
              /* Single-byte case: take two transitions per iteration,
                 with S1 as the intermediate state.  */
              s1 = t[*p++];
              t = trans[s1];
              if (! t)
                {
                  /* The intermediate state has no table.  Make it the
                     current state and keep the old one in S1.  */
                  state_num tmp = s;
                  s = s1;
                  s1 = tmp;     /* swap */
                  break;
                }
              if (s < d->min_trcount)
                {
                  while (t[*p] == s1)
                    p++;
                }
              s = t[*p++];
            }
        }

      if (s < 0)
        {
          if (s == -2)
            {
              /* Have build_state compute the successor of S1 on the
                 byte just consumed.  */
              s = build_state (s1, d, p[-1]);
              trans = d->trans;
            }
          else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
            {
              /* The previous character was a newline.  Count it, and skip
                 checking of multibyte character boundary until here.  */
              nlcount++;
              mbp = p;

              /* Pick the state in which to resume after the newline.  */
              s = (allow_nl ? d->newlines[s1]
                   : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
                   : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
                   : d->initstate_notbol);
            }
          else
            {
              /* No match.  */
              p = NULL;
              goto done;
            }
        }
      else if (d->fails[s])
        {
          /* S has a table in d->fails rather than d->trans.  Stop here
             if S succeeds in the context of the next byte, or at END
             in newline context.  */
          if ((d->success[s] & d->syntax.sbit[*p])
              || ((char *) p == end
                  && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
                                         d)))
            goto done;

          if (multibyte && s < d->min_trcount)
            p = mbp = skip_remains_mb (d, p, mbp, end);

          s1 = s;
          if (!multibyte || d->states[s].mbps.nelem == 0
              || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
            {
              /* If an input character does not match ANYCHAR, do it
                 like a single-byte character.  */
              s = d->fails[s][*p++];
            }
          else
            {
              s = transit_state (d, s, &p, (unsigned char *) end);
              mbp = p;
              trans = d->trans;
            }
        }
      else
        {
          /* S has neither a trans nor a fails table yet; build one for
             the next input byte and retry.  */
          build_state (s, d, p[0]);
          trans = d->trans;
        }
    }

 done:
  /* Report the newline count, restore the byte overwritten by the
     sentinel, and return the final position (NULL if no match).  */
  if (count)
    *count += nlcount;
  *end = saved_end;
  return (char *) p;
}
3587
3588 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3589    This is for performance, as dfaexec_main is an inline function.  */
3590
/* Multibyte entry point: run dfaexec_main with its MULTIBYTE argument
   fixed to true.  BACKREF is accepted only so that the signature
   matches the other dfaexec_* functions; it is not touched here.  */
static char *
dfaexec_mb (struct dfa *d, char const *begin, char *end,
            bool allow_nl, ptrdiff_t *count, bool *backref)
{
  bool const multibyte = true;
  return dfaexec_main (d, begin, end, allow_nl, count, multibyte);
}
3597
/* Single-byte entry point: run dfaexec_main with its MULTIBYTE
   argument fixed to false.  BACKREF is accepted only so that the
   signature matches the other dfaexec_* functions; it is not touched
   here.  */
static char *
dfaexec_sb (struct dfa *d, char const *begin, char *end,
            bool allow_nl, ptrdiff_t *count, bool *backref)
{
  bool const multibyte = false;
  return dfaexec_main (d, begin, end, allow_nl, count, multibyte);
}
3604
/* Matcher used for any regexp containing a construct this code does
   not support: do no matching at all, unconditionally set *BACKREF,
   and return BEGIN.  D, END, ALLOW_NL and COUNT are ignored.  */
static char *
dfaexec_noop (struct dfa *d, char const *begin, char *end,
              bool allow_nl, ptrdiff_t *count, bool *backref)
{
  char *start = (char *) begin;
  *backref = true;
  return start;
}
3614
3615 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3616    but faster and set *BACKREF if the DFA code does not support this
3617    regexp usage.  */
3618
3619 char *
3620 dfaexec (struct dfa *d, char const *begin, char *end,
3621          bool allow_nl, ptrdiff_t *count, bool *backref)
3622 {
3623   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3624 }
3625
3626 struct dfa *
3627 dfasuperset (struct dfa const *d)
3628 {
3629   return d->superset;
3630 }
3631
3632 bool
3633 dfaisfast (struct dfa const *d)
3634 {
3635   return d->fast;
3636 }
3637
3638 static void
3639 free_mbdata (struct dfa *d)
3640 {
3641   free (d->multibyte_prop);
3642   free (d->lex.brack.chars);
3643   free (d->mb_follows.elems);
3644
3645   if (d->mb_trans)
3646     {
3647       state_num s;
3648       for (s = -1; s < d->tralloc; s++)
3649         free (d->mb_trans[s]);
3650       free (d->mb_trans - 2);
3651     }
3652 }
3653
3654 /* Return true if every construct in D is supported by this DFA matcher.  */
3655 bool
3656 dfasupported (struct dfa const *d)
3657 {
3658   for (idx_t i = 0; i < d->tindex; i++)
3659     {
3660       switch (d->tokens[i])
3661         {
3662         case BEGWORD:
3663         case ENDWORD:
3664         case LIMWORD:
3665         case NOTLIMWORD:
3666           if (!d->localeinfo.multibyte)
3667             continue;
3668           FALLTHROUGH;
3669         case BACKREF:
3670         case MBCSET:
3671           return false;
3672         }
3673     }
3674   return true;
3675 }
3676
3677 /* Disable use of the superset DFA if it is not likely to help
3678    performance.  */
3679 static void
3680 maybe_disable_superset_dfa (struct dfa *d)
3681 {
3682   if (!d->localeinfo.using_utf8)
3683     return;
3684
3685   bool have_backref = false;
3686   for (idx_t i = 0; i < d->tindex; i++)
3687     {
3688       switch (d->tokens[i])
3689         {
3690         case ANYCHAR:
3691           /* Lowered.  */
3692           abort ();
3693         case BACKREF:
3694           have_backref = true;
3695           break;
3696         case MBCSET:
3697           /* Requires multi-byte algorithm.  */
3698           return;
3699         default:
3700           break;
3701         }
3702     }
3703
3704   if (!have_backref && d->superset)
3705     {
3706       /* The superset DFA is not likely to be much faster, so remove it.  */
3707       dfafree (d->superset);
3708       free (d->superset);
3709       d->superset = NULL;
3710     }
3711
3712   free_mbdata (d);
3713   d->localeinfo.multibyte = false;
3714   d->dfaexec = dfaexec_sb;
3715   d->fast = true;
3716 }
3717
3718 static void
3719 dfassbuild (struct dfa *d)
3720 {
3721   struct dfa *sup = dfaalloc ();
3722
3723   *sup = *d;
3724   sup->localeinfo.multibyte = false;
3725   sup->dfaexec = dfaexec_sb;
3726   sup->multibyte_prop = NULL;
3727   sup->superset = NULL;
3728   sup->states = NULL;
3729   sup->sindex = 0;
3730   sup->constraints = NULL;
3731   sup->separates = NULL;
3732   sup->follows = NULL;
3733   sup->tralloc = 0;
3734   sup->trans = NULL;
3735   sup->fails = NULL;
3736   sup->success = NULL;
3737   sup->newlines = NULL;
3738
3739   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3740   if (d->cindex)
3741     {
3742       memcpy (sup->charclasses, d->charclasses,
3743               d->cindex * sizeof *sup->charclasses);
3744     }
3745
3746   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3747   sup->talloc = d->tindex * 2;
3748
3749   bool have_achar = false;
3750   bool have_nchar = false;
3751   idx_t j;
3752   for (idx_t i = j = 0; i < d->tindex; i++)
3753     {
3754       switch (d->tokens[i])
3755         {
3756         case ANYCHAR:
3757         case MBCSET:
3758         case BACKREF:
3759           {
3760             charclass ccl;
3761             fillset (&ccl);
3762             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3763             sup->tokens[j++] = STAR;
3764             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3765                 || d->tokens[i + 1] == PLUS)
3766               i++;
3767             have_achar = true;
3768           }
3769           break;
3770         case BEGWORD:
3771         case ENDWORD:
3772         case LIMWORD:
3773         case NOTLIMWORD:
3774           if (d->localeinfo.multibyte)
3775             {
3776               /* These constraints aren't supported in a multibyte locale.
3777                  Ignore them in the superset DFA.  */
3778               sup->tokens[j++] = EMPTY;
3779               break;
3780             }
3781           FALLTHROUGH;
3782         default:
3783           sup->tokens[j++] = d->tokens[i];
3784           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3785               || d->tokens[i] >= CSET)
3786             have_nchar = true;
3787           break;
3788         }
3789     }
3790   sup->tindex = j;
3791
3792   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3793     d->superset = sup;
3794   else
3795     {
3796       dfafree (sup);
3797       free (sup);
3798     }
3799 }
3800
3801 /* Parse a string S of length LEN into D (but skip this step if S is null).
3802    Then analyze D and build a matcher for it.
3803    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3804 void
3805 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3806 {
3807   if (s != NULL)
3808     dfaparse (s, len, d);
3809
3810   dfassbuild (d);
3811
3812   if (dfasupported (d))
3813     {
3814       maybe_disable_superset_dfa (d);
3815       dfaanalyze (d, searchflag);
3816     }
3817   else
3818     {
3819       d->dfaexec = dfaexec_noop;
3820     }
3821
3822   if (d->superset)
3823     {
3824       d->fast = true;
3825       dfaanalyze (d->superset, searchflag);
3826     }
3827 }
3828
3829 /* Free the storage held by the components of a dfa.  */
3830 void
3831 dfafree (struct dfa *d)
3832 {
3833   free (d->charclasses);
3834   free (d->tokens);
3835
3836   if (d->localeinfo.multibyte)
3837     free_mbdata (d);
3838
3839   free (d->constraints);
3840   free (d->separates);
3841
3842   for (idx_t i = 0; i < d->sindex; i++)
3843     {
3844       free (d->states[i].elems.elems);
3845       free (d->states[i].mbps.elems);
3846     }
3847   free (d->states);
3848
3849   if (d->follows)
3850     {
3851       for (idx_t i = 0; i < d->tindex; i++)
3852         free (d->follows[i].elems);
3853       free (d->follows);
3854     }
3855
3856   if (d->trans)
3857     {
3858       for (idx_t i = 0; i < d->tralloc; i++)
3859         {
3860           free (d->trans[i]);
3861           free (d->fails[i]);
3862         }
3863
3864       free (d->trans - 2);
3865       free (d->fails);
3866       free (d->newlines);
3867       free (d->success);
3868     }
3869
3870   if (d->superset)
3871     {
3872       dfafree (d->superset);
3873       free (d->superset);
3874     }
3875 }
3876
3877 /* Having found the postfix representation of the regular expression,
3878    try to find a long sequence of characters that must appear in any line
3879    containing the r.e.
3880    Finding a "longest" sequence is beyond the scope here;
3881    we take an easy way out and hope for the best.
3882    (Take "(ab|a)b"--please.)
3883
3884    We do a bottom-up calculation of sequences of characters that must appear
3885    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3886    representation:
3887         sequences that must appear at the left of the match ("left")
3888         sequences that must appear at the right of the match ("right")
3889         lists of sequences that must appear somewhere in the match ("in")
3890         sequences that must constitute the match ("is")
3891
3892    When we get to the root of the tree, we use one of the longest of its
3893    calculated "in" sequences as our answer.
3894
3895    The sequences calculated for the various types of node (in pseudo ANSI c)
3896    are shown below.  "p" is the operand of unary operators (and the left-hand
3897    operand of binary operators); "q" is the right-hand operand of binary
3898    operators.
3899
3900    "ZERO" means "a zero-length sequence" below.
3901
3902         Type    left            right           is              in
3903         ----    ----            -----           --              --
3904         char c  # c             # c             # c             # c
3905
3906         ANYCHAR ZERO            ZERO            ZERO            ZERO
3907
3908         MBCSET  ZERO            ZERO            ZERO            ZERO
3909
3910         CSET    ZERO            ZERO            ZERO            ZERO
3911
3912         STAR    ZERO            ZERO            ZERO            ZERO
3913
3914         QMARK   ZERO            ZERO            ZERO            ZERO
3915
3916         PLUS    p->left         p->right        ZERO            p->in
3917
3918         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3919                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3920                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3921                                                 ZERO
3922
3923         OR      longest common  longest common  (do p->is and substrings common
3924                 leading         trailing        to q->is have same p->in and
3925                 (sub)sequence   (sub)sequence   q->in length and content) ?
3926                 of p->left      of p->right
3927                 and q->left     and q->right    p->is : NULL
3928
3929    If there's anything else we recognize in the tree, all four sequences get set
3930    to zero-length sequences.  If there's something we don't recognize in the
3931    tree, we just return a zero-length sequence.
3932
3933    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3934    'aaa')?
3935
3936    And ... is it here or someplace that we might ponder "optimizations" such as
3937         egrep 'psi|epsilon'     ->      egrep 'psi'
3938         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3939                                         (Yes, we now find "epsi" as a "string
3940                                         that must occur", but we might also
3941                                         simplify the *entire* r.e. being sought)
3942         grep '[c]'              ->      grep 'c'
3943         grep '(ab|a)b'          ->      grep 'ab'
3944         grep 'ab*'              ->      grep 'a'
3945         grep 'a*b'              ->      grep 'b'
3946
3947    There are several issues:
3948
3949    Is optimization easy (enough)?
3950
3951    Does optimization actually accomplish anything,
3952    or is the automaton you get from "psi|epsilon" (for example)
3953    the same as the one you get from "psi" (for example)?
3954
   Are optimizable r.e.'s likely to be used in real-life situations
   (something like 'ab*' is probably unlikely; something like
   'psi|epsilon' is likelier)?  */
3958
3959 static char *
3960 icatalloc (char *old, char const *new)
3961 {
3962   idx_t newsize = strlen (new);
3963   if (newsize == 0)
3964     return old;
3965   idx_t oldsize = strlen (old);
3966   char *result = xrealloc (old, oldsize + newsize + 1);
3967   memcpy (result + oldsize, new, newsize + 1);
3968   return result;
3969 }
3970
/* Free every string in the null-terminated list CPP.  The list array
   itself is neither freed nor modified.  */
static void
freelist (char **cpp)
{
  for (char **item = cpp; *item != NULL; item++)
    free (*item);
}
3977
3978 static char **
3979 enlist (char **cpp, char *new, idx_t len)
3980 {
3981   new = memcpy (xmalloc (len + 1), new, len);
3982   new[len] = '\0';
3983   /* Is there already something in the list that's new (or longer)?  */
3984   idx_t i;
3985   for (i = 0; cpp[i] != NULL; i++)
3986     if (strstr (cpp[i], new) != NULL)
3987       {
3988         free (new);
3989         return cpp;
3990       }
3991   /* Eliminate any obsoleted strings.  */
3992   for (idx_t j = 0; cpp[j] != NULL; )
3993     if (strstr (new, cpp[j]) == NULL)
3994       ++j;
3995     else
3996       {
3997         free (cpp[j]);
3998         if (--i == j)
3999           break;
4000         cpp[j] = cpp[i];
4001         cpp[i] = NULL;
4002       }
4003   /* Add the new string.  */
4004   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
4005   cpp[i] = new;
4006   cpp[i + 1] = NULL;
4007   return cpp;
4008 }
4009
4010 /* Given pointers to two strings, return a pointer to an allocated
4011    list of their distinct common substrings.  */
4012 static char **
4013 comsubs (char *left, char const *right)
4014 {
4015   char **cpp = xzalloc (sizeof *cpp);
4016
4017   for (char *lcp = left; *lcp != '\0'; lcp++)
4018     {
4019       idx_t len = 0;
4020       char *rcp = strchr (right, *lcp);
4021       while (rcp != NULL)
4022         {
4023           idx_t i;
4024           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4025             continue;
4026           if (i > len)
4027             len = i;
4028           rcp = strchr (rcp + 1, *lcp);
4029         }
4030       if (len != 0)
4031         cpp = enlist (cpp, lcp, len);
4032     }
4033   return cpp;
4034 }
4035
/* Enlist every string of the null-terminated list NEW into OLD, and
   return the resulting list.  NEW itself is left unchanged.  */
static char **
addlists (char **old, char **new)
{
  for (char **item = new; *item != NULL; item++)
    old = enlist (old, *item, strlen (*item));
  return old;
}
4043
/* Given two null-terminated lists of substrings, LEFT and RIGHT,
   return a newly allocated list of the substrings common to both,
   gathered over every pairing of a LEFT entry with a RIGHT entry.  */
static char **
inboth (char **left, char **right)
{
  char **common = xzalloc (sizeof *common);

  for (char **lp = left; *lp != NULL; lp++)
    for (char **rp = right; *rp != NULL; rp++)
      {
        char **shared = comsubs (*lp, *rp);
        common = addlists (common, shared);
        freelist (shared);
        free (shared);
      }
  return common;
}
4063
/* One entry of the stack that dfamust uses while walking the postfix
   token array.  Each entry describes the character sequences required
   by one subexpression.  */
typedef struct must must;

struct must
{
  /* Null-terminated list of sequences that must appear somewhere in
     the match.  */
  char **in;
  /* Sequence that must appear at the left of the match.  */
  char *left;
  /* Sequence that must appear at the right of the match.  */
  char *right;
  /* Sequence that must constitute the entire match.  */
  char *is;
  /* Set by dfamust for a BEGLINE token; combined by OR and CAT.  */
  bool begline;
  /* Set by dfamust for an ENDLINE token; combined by OR and CAT.  */
  bool endline;
  /* Next entry down the stack, or NULL at the bottom.  */
  must *prev;
};
4076
4077 static must *
4078 allocmust (must *mp, idx_t size)
4079 {
4080   must *new_mp = xmalloc (sizeof *new_mp);
4081   new_mp->in = xzalloc (sizeof *new_mp->in);
4082   new_mp->left = xzalloc (size);
4083   new_mp->right = xzalloc (size);
4084   new_mp->is = xzalloc (size);
4085   new_mp->begline = false;
4086   new_mp->endline = false;
4087   new_mp->prev = mp;
4088   return new_mp;
4089 }
4090
4091 static void
4092 resetmust (must *mp)
4093 {
4094   freelist (mp->in);
4095   mp->in[0] = NULL;
4096   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4097   mp->begline = false;
4098   mp->endline = false;
4099 }
4100
4101 static void
4102 freemust (must *mp)
4103 {
4104   freelist (mp->in);
4105   free (mp->in);
4106   free (mp->left);
4107   free (mp->right);
4108   free (mp->is);
4109   free (mp);
4110 }
4111
/* Walk D's postfix token array and compute a string that must appear
   in any match.  Return a newly allocated struct dfamust holding a
   copy of the longest such string found, together with the exact,
   begline and endline flags, or return NULL if that string is empty.
   The result is allocated with xmalloc; dfamustfree releases it.  */
struct dfamust *
dfamust (struct dfa const *d)
{
  /* Top of the stack of partial results, one entry per operand not
     yet consumed by an operator.  */
  must *mp = NULL;
  char const *result = "";
  bool exact = false;
  bool begline = false;
  bool endline = false;
  /* Whether any BEGLINE or ENDLINE token was seen at all.  */
  bool need_begline = false;
  bool need_endline = false;
  /* When set, characters are recorded in their toupper form.  */
  bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;

  /* NOTE(review): the first and the last token are not visited;
     presumably they are wrappers added around the parsed regexp.
     Confirm against the parser.  */
  for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
    {
      token t = d->tokens[ri];
      switch (t)
        {
        case BEGLINE:
          mp = allocmust (mp, 2);
          mp->begline = true;
          need_begline = true;
          break;
        case ENDLINE:
          mp = allocmust (mp, 2);
          mp->endline = true;
          need_endline = true;
          break;
        case LPAREN:
        case RPAREN:
          assert (!"neither LPAREN nor RPAREN may appear here");

        /* Operands about which nothing is known: push an entry whose
           sequences are all empty.  */
        case EMPTY:
        case BEGWORD:
        case ENDWORD:
        case LIMWORD:
        case NOTLIMWORD:
        case BACKREF:
        case ANYCHAR:
        case MBCSET:
          mp = allocmust (mp, 2);
          break;

        /* The operand may match the empty string, so nothing is
           required of it.  */
        case STAR:
        case QMARK:
          resetmust (mp);
          break;

        case OR:
          {
            /* Pop the right operand and merge it into the left one,
               which becomes the new top of stack.  */
            char **new;
            must *rmp = mp;
            must *lmp = mp = mp->prev;
            idx_t j, ln, rn, n;

            /* Guaranteed to be.  Unlikely, but ...  */
            if (streq (lmp->is, rmp->is))
              {
                lmp->begline &= rmp->begline;
                lmp->endline &= rmp->endline;
              }
            else
              {
                lmp->is[0] = '\0';
                lmp->begline = false;
                lmp->endline = false;
              }
            /* Left side--easy */
            /* Keep only the common prefix of the two "left" strings.  */
            idx_t i = 0;
            while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
              ++i;
            lmp->left[i] = '\0';
            /* Right side */
            /* Keep only the common suffix of the two "right" strings,
               moved to the front of the buffer.  */
            ln = strlen (lmp->right);
            rn = strlen (rmp->right);
            n = ln;
            if (n > rn)
              n = rn;
            for (i = 0; i < n; ++i)
              if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
                break;
            for (j = 0; j < i; ++j)
              lmp->right[j] = lmp->right[(ln - i) + j];
            lmp->right[j] = '\0';
            /* The new "in" list holds the substrings common to both
               operands' lists.  */
            new = inboth (lmp->in, rmp->in);
            freelist (lmp->in);
            free (lmp->in);
            lmp->in = new;
            freemust (rmp);
          }
          break;

        case PLUS:
          /* Only "is" is cleared; left, right and in are kept.  */
          mp->is[0] = '\0';
          break;

        case END:
          assert (!mp->prev);
          /* Choose the first longest string on the "in" list.  */
          for (idx_t i = 0; mp->in[i] != NULL; i++)
            if (strlen (mp->in[i]) > strlen (result))
              result = mp->in[i];
          /* The result is exact only if it equals "is" and every line
             anchor that was seen is still in force.  */
          if (streq (result, mp->is))
            {
              if ((!need_begline || mp->begline) && (!need_endline
                                                     || mp->endline))
                exact = true;
              begline = mp->begline;
              endline = mp->endline;
            }
          goto done;

        case CAT:
          {
            /* Pop the right operand and merge it into the left one,
               which becomes the new top of stack.  */
            must *rmp = mp;
            must *lmp = mp = mp->prev;

            /* In.  Everything in left, plus everything in
               right, plus concatenation of
               left's right and right's left.  */
            lmp->in = addlists (lmp->in, rmp->in);
            if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
              {
                idx_t lrlen = strlen (lmp->right);
                idx_t rllen = strlen (rmp->left);
                /* TP is not NUL-terminated; enlist is given its exact
                   length and makes its own copy.  */
                char *tp = xmalloc (lrlen + rllen);
                memcpy (tp, lmp->right, lrlen);
                memcpy (tp + lrlen, rmp->left, rllen);
                lmp->in = enlist (lmp->in, tp, lrlen + rllen);
                free (tp);
              }
            /* Left-hand */
            if (lmp->is[0] != '\0')
              lmp->left = icatalloc (lmp->left, rmp->left);
            /* Right-hand */
            if (rmp->is[0] == '\0')
              lmp->right[0] = '\0';
            lmp->right = icatalloc (lmp->right, rmp->right);
            /* Guaranteed to be */
            if ((lmp->is[0] != '\0' || lmp->begline)
                && (rmp->is[0] != '\0' || rmp->endline))
              {
                lmp->is = icatalloc (lmp->is, rmp->is);
                lmp->endline = rmp->endline;
              }
            else
              {
                lmp->is[0] = '\0';
                lmp->begline = false;
                lmp->endline = false;
              }
            freemust (rmp);
          }
          break;

        case '\0':
          /* Not on *my* shift.  */
          goto done;

        default:
          if (CSET <= t)
            {
              /* If T is a singleton, or if case-folding in a unibyte
                 locale and T's members all case-fold to the same char,
                 convert T to one of its members.  Otherwise, do
                 nothing further with T.  */
              charclass *ccl = &d->charclasses[t - CSET];
              int j;
              /* Find the first member of the class, if any.  */
              for (j = 0; j < NOTCHAR; j++)
                if (tstbit (j, ccl))
                  break;
              if (! (j < NOTCHAR))
                {
                  /* Empty class: push an entry with no requirements.  */
                  mp = allocmust (mp, 2);
                  break;
                }
              t = j;
              /* Look for a further member that does not case-fold to
                 the same character as the first.  */
              while (++j < NOTCHAR)
                if (tstbit (j, ccl)
                    && ! (case_fold_unibyte
                          && toupper (j) == toupper (t)))
                  break;
              if (j < NOTCHAR)
                {
                  /* More than one distinct member: no requirements.  */
                  mp = allocmust (mp, 2);
                  break;
                }
            }

          /* T is now a single character.  Find the end of the run of
             "char CAT" pairs that follows it, so that the whole run
             can be pushed as one string.  */
          idx_t rj = ri + 2;
          if (d->tokens[ri + 1] == CAT)
            {
              for (; rj < d->tindex - 1; rj += 2)
                {
                  if ((rj != ri && (d->tokens[rj] <= 0
                                    || NOTCHAR <= d->tokens[rj]))
                      || d->tokens[rj + 1] != CAT)
                    break;
                }
            }
          /* One byte per character of the run, plus the NUL.  */
          mp = allocmust (mp, ((rj - ri) >> 1) + 1);
          mp->is[0] = mp->left[0] = mp->right[0]
            = case_fold_unibyte ? toupper (t) : t;

          /* Append the rest of the run, advancing RI past the tokens
             consumed.  */
          idx_t i;
          for (i = 1; ri + 2 < rj; i++)
            {
              ri += 2;
              t = d->tokens[ri];
              mp->is[i] = mp->left[i] = mp->right[i]
                = case_fold_unibyte ? toupper (t) : t;
            }
          mp->is[i] = mp->left[i] = mp->right[i] = '\0';
          mp->in = enlist (mp->in, mp->is, i);
          break;
        }
    }
 done:;

  /* Copy the result out before the stack that owns it is freed.  */
  struct dfamust *dm = NULL;
  if (*result)
    {
      dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
      dm->exact = exact;
      dm->begline = begline;
      dm->endline = endline;
      strcpy (dm->must, result);
    }

  /* Free whatever is left on the stack.  */
  while (mp)
    {
      must *prev = mp->prev;
      freemust (mp);
      mp = prev;
    }

  return dm;
}
4348
/* Free DM, a value returned by dfamust.  A null DM is accepted and
   ignored.  */
void
dfamustfree (struct dfamust *dm)
{
  void *block = dm;
  free (block);
}
4354
4355 struct dfa *
4356 dfaalloc (void)
4357 {
4358   return xmalloc (sizeof (struct dfa));
4359 }
4360
4361 /* Initialize DFA.  */
4362 void
4363 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4364            reg_syntax_t bits, int dfaopts)
4365 {
4366   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4367   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4368   dfa->localeinfo = *linfo;
4369
4370   dfa->fast = !dfa->localeinfo.multibyte;
4371
4372   dfa->canychar = -1;
4373   dfa->syntax.syntax_bits_set = true;
4374   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4375   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4376   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4377   dfa->syntax.syntax_bits = bits;
4378
4379   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4380     {
4381       unsigned char uc = i;
4382
4383       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4384       switch (dfa->syntax.sbit[uc])
4385         {
4386         case CTX_LETTER:
4387           setbit (uc, &dfa->syntax.letters);
4388           break;
4389         case CTX_NEWLINE:
4390           setbit (uc, &dfa->syntax.newline);
4391           break;
4392         }
4393
4394       /* POSIX requires that the five bytes in "\n\r./" (including the
4395          terminating NUL) cannot occur inside a multibyte character.  */
4396       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4397                                      ? (uc & 0xc0) != 0x80
4398                                      : strchr ("\n\r./", uc) != NULL);
4399     }
4400 }
4401
4402 /* Initialize TO by copying FROM's syntax settings.  */
4403 void
4404 dfacopysyntax (struct dfa *to, struct dfa const *from)
4405 {
4406   memset (to, 0, offsetof (struct dfa, syntax));
4407   to->canychar = -1;
4408   to->fast = from->fast;
4409   to->syntax = from->syntax;
4410   to->dfaexec = from->dfaexec;
4411   to->localeinfo = from->localeinfo;
4412 }
4413
4414 /* vim:set shiftwidth=2: */