lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include "flexmember.h"
  28
  29 #include <assert.h>
  30 #include <ctype.h>
  31 #include <stdint.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <limits.h>
  35 #include <string.h>
  36
  37 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
  38    indexes into objects.  It is signed to help catch integer overflow.
  39    It has its own name because it is for nonnegative values only.  */
  40 typedef ptrdiff_t idx_t;
  41 static idx_t const IDX_MAX = PTRDIFF_MAX;
  42
  43 static bool
  44 streq (char const *a, char const *b)
  45 {
  46   return strcmp (a, b) == 0;
  47 }
  48
  49 static bool
  50 isasciidigit (char c)
  51 {
  52   return '0' <= c && c <= '9';
  53 }
  54
  55 #include "gettext.h"
  56 #define _(str) gettext (str)
  57
  58 #include <wchar.h>
  59
  60 #include "intprops.h"
  61 #include "xalloc.h"
  62 #include "localeinfo.h"
  63
  64 #ifndef FALLTHROUGH
  65 # if __GNUC__ < 7
  66 #  define FALLTHROUGH ((void) 0)
  67 # else
  68 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  69 # endif
  70 #endif
  71
  72 #ifndef MIN
  73 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  74 #endif
  75
  76 /* HPUX defines these as macros in sys/param.h.  */
  77 #ifdef setbit
  78 # undef setbit
  79 #endif
  80 #ifdef clrbit
  81 # undef clrbit
  82 #endif
  83
  84 /* First integer value that is greater than any character code.  */
  85 enum { NOTCHAR = 1 << CHAR_BIT };
  86
  87 /* Number of bits used in a charclass word.  */
  88 enum { CHARCLASS_WORD_BITS = 64 };
  89
  90 /* This represents part of a character class.  It must be unsigned and
  91    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
  92 typedef uint_least64_t charclass_word;
  93
  94 /* An initializer for a charclass whose 64-bit words are A through D.  */
  95 #define CHARCLASS_INIT(a, b, c, d) {{a, b, c, d}}
  96
  97 /* The maximum useful value of a charclass_word; all used bits are 1.  */
  98 static charclass_word const CHARCLASS_WORD_MASK
  99   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 100
 101 /* Number of words required to hold a bit for every character.  */
 102 enum
 103 {
 104   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 105 };
 106
 107 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 108 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 109
 110 /* Convert a possibly-signed character to an unsigned character.  This is
 111    a bit safer than casting to unsigned char, since it catches some type
 112    errors that the cast doesn't.  */
 113 static unsigned char
 114 to_uchar (char ch)
 115 {
 116   return ch;
 117 }
 118
 119 /* Contexts tell us whether a character is a newline or a word constituent.
 120    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 121    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 122    a particular character class.
 123
 124    A state also stores a context value, which is a bitmask of CTX_* values.
 125    A state's context represents a set of characters that the state's
 126    predecessors must match.  For example, a state whose context does not
 127    include CTX_LETTER will never have transitions where the previous
 128    character is a word constituent.  A state whose context is CTX_ANY
 129    might have transitions from any character.  */
 130
 131 enum
 132   {
 133     CTX_NONE = 1,
 134     CTX_LETTER = 2,
 135     CTX_NEWLINE = 4,
 136     CTX_ANY = 7
 137   };
 138
 139 /* Sometimes characters can only be matched depending on the surrounding
 140    context.  Such context decisions depend on what the previous character
 141    was, and the value of the current (lookahead) character.  Context
 142    dependent constraints are encoded as 9-bit integers.  Each bit that
 143    is set indicates that the constraint succeeds in the corresponding
 144    context.
 145
 146    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 147    bit 3-5  - valid contexts when next character is CTX_LETTER
 148    bit 0-2  - valid contexts when next character is CTX_NONE
 149
 150    succeeds_in_context determines whether a given constraint
 151    succeeds in a particular context.  Prev is a bitmask of possible
 152    context values for the previous character, curr is the (single-bit)
 153    context value for the lookahead character.  */
 154 static int
 155 newline_constraint (int constraint)
 156 {
 157   return (constraint >> 6) & 7;
 158 }
 159 static int
 160 letter_constraint (int constraint)
 161 {
 162   return (constraint >> 3) & 7;
 163 }
 164 static int
 165 other_constraint (int constraint)
 166 {
 167   return constraint & 7;
 168 }
 169
 170 static bool
 171 succeeds_in_context (int constraint, int prev, int curr)
 172 {
 173   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 174               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 175               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 176              & prev);
 177 }
 178
 179 /* The following describe what a constraint depends on.  */
 180 static bool
 181 prev_newline_dependent (int constraint)
 182 {
 183   return ((constraint ^ constraint >> 2) & 0111) != 0;
 184 }
 185 static bool
 186 prev_letter_dependent (int constraint)
 187 {
 188   return ((constraint ^ constraint >> 1) & 0111) != 0;
 189 }
 190
 191 /* Tokens that match the empty string subject to some constraint actually
 192    work by applying that constraint to determine what may follow them,
 193    taking into account what has gone before.  The following values are
 194    the constraints corresponding to the special tokens previously defined.  */
 195 enum
 196   {
 197     NO_CONSTRAINT = 0777,
 198     BEGLINE_CONSTRAINT = 0444,
 199     ENDLINE_CONSTRAINT = 0700,
 200     BEGWORD_CONSTRAINT = 0050,
 201     ENDWORD_CONSTRAINT = 0202,
 202     LIMWORD_CONSTRAINT = 0252,
 203     NOTLIMWORD_CONSTRAINT = 0525
 204   };
 205
 206 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 207    are operators and others are terminal symbols.  Most (but not all) of these
 208    codes are returned by the lexical analyzer.  */
 209
 210 typedef ptrdiff_t token;
 211 static token const TOKEN_MAX = PTRDIFF_MAX;
 212
 213 /* States are indexed by state_num values.  These are normally
 214    nonnegative but -1 is used as a special value.  */
 215 typedef ptrdiff_t state_num;
 216
 217 /* Predefined token values.  */
 218 enum
 219 {
 220   END = -1,                     /* END is a terminal symbol that matches the
 221                                    end of input; any value of END or less in
 222                                    the parse tree is such a symbol.  Accepting
 223                                    states of the DFA are those that would have
 224                                    a transition on END.  This is -1, not some
 225                                    more-negative value, to tweak the speed of
 226                                    comparisons to END.  */
 227
 228   /* Ordinary character values are terminal symbols that match themselves.  */
 229
 230   /* CSET must come last in the following list of special tokens.  Otherwise,
 231      the list order matters only for performance.  Related special tokens
 232      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 233      || CSET <= t) can be done with a single machine-level comparison.  */
 234
 235   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 236                                    the empty string.  */
 237
 238   QMARK,                        /* QMARK is an operator of one argument that
 239                                    matches zero or one occurrences of its
 240                                    argument.  */
 241
 242   STAR,                         /* STAR is an operator of one argument that
 243                                    matches the Kleene closure (zero or more
 244                                    occurrences) of its argument.  */
 245
 246   PLUS,                         /* PLUS is an operator of one argument that
 247                                    matches the positive closure (one or more
 248                                    occurrences) of its argument.  */
 249
 250   REPMN,                        /* REPMN is a lexical token corresponding
 251                                    to the {m,n} construct.  REPMN never
 252                                    appears in the compiled token vector.  */
 253
 254   CAT,                          /* CAT is an operator of two arguments that
 255                                    matches the concatenation of its
 256                                    arguments.  CAT is never returned by the
 257                                    lexical analyzer.  */
 258
 259   OR,                           /* OR is an operator of two arguments that
 260                                    matches either of its arguments.  */
 261
 262   LPAREN,                       /* LPAREN never appears in the parse tree,
 263                                    it is only a lexeme.  */
 264
 265   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 266
 267   WCHAR,                        /* Only returned by lex.  wctok contains
 268                                    the wide character representation.  */
 269
 270   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 271                                    a valid multibyte (or single byte) character.
 272                                    It is used only if MB_CUR_MAX > 1.  */
 273
 274   BEG,                          /* BEG is an initial symbol that matches the
 275                                    beginning of input.  */
 276
 277   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 278                                    the empty string at the beginning of a
 279                                    line.  */
 280
 281   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 282                                    the empty string at the end of a line.  */
 283
 284   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 285                                    the empty string at the beginning of a
 286                                    word.  */
 287
 288   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 289                                    the empty string at the end of a word.  */
 290
 291   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 292                                    the empty string at the beginning or the
 293                                    end of a word.  */
 294
 295   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 296                                    matches the empty string not at
 297                                    the beginning or end of a word.  */
 298
 299   BACKREF,                      /* BACKREF is generated by \<digit>
 300                                    or by any other construct that
 301                                    is not completely handled.  If the scanner
 302                                    detects a transition on backref, it returns
 303                                    a kind of "semi-success" indicating that
 304                                    the match will have to be verified with
 305                                    a backtracking matcher.  */
 306
 307   MBCSET,                       /* MBCSET is similar to CSET, but for
 308                                    multibyte characters.  */
 309
 310   CSET                          /* CSET and (and any value greater) is a
 311                                    terminal symbol that matches any of a
 312                                    class of characters.  */
 313 };
 314
 315
 316 /* States of the recognizer correspond to sets of positions in the parse
 317    tree, together with the constraints under which they may be matched.
 318    So a position is encoded as an index into the parse tree together with
 319    a constraint.  */
 320 typedef struct
 321 {
 322   idx_t index;                  /* Index into the parse array.  */
 323   unsigned int constraint;      /* Constraint for matching this position.  */
 324 } position;
 325
 326 /* Sets of positions are stored as arrays.  */
 327 typedef struct
 328 {
 329   position *elems;              /* Elements of this position set.  */
 330   idx_t nelem;                  /* Number of elements in this set.  */
 331   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 332 } position_set;
 333
 334 /* A state of the dfa consists of a set of positions, some flags,
 335    and the token value of the lowest-numbered position of the state that
 336    contains an END token.  */
 337 typedef struct
 338 {
 339   size_t hash;                  /* Hash of the positions of this state.  */
 340   position_set elems;           /* Positions this state could match.  */
 341   unsigned char context;        /* Context from previous state.  */
 342   unsigned short constraint;    /* Constraint for this state to accept.  */
 343   token first_end;              /* Token value of the first END in elems.  */
 344   position_set mbps;            /* Positions which can match multibyte
 345                                    characters or the follows, e.g., period.
 346                                    Used only if MB_CUR_MAX > 1.  */
 347   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 348                                    negative if the state does not have
 349                                    ANYCHAR.  */
 350 } dfa_state;
 351
 352 /* Maximum for any transition table count.  This should be at least 3,
 353    for the initial state setup.  */
 354 enum { MAX_TRCOUNT = 1024 };
 355
 356 /* A bracket operator.
 357    e.g., [a-c], [[:alpha:]], etc.  */
 358 struct mb_char_classes
 359 {
 360   ptrdiff_t cset;
 361   bool invert;
 362   wchar_t *chars;               /* Normal characters.  */
 363   idx_t nchars;
 364   idx_t nchars_alloc;
 365 };
 366
 367 struct regex_syntax
 368 {
 369   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 370   reg_syntax_t syntax_bits;
 371   bool syntax_bits_set;
 372
 373   /* Flag for case-folding letters into sets.  */
 374   bool case_fold;
 375
 376   /* True if ^ and $ match only the start and end of data, and do not match
 377      end-of-line within data.  */
 378   bool anchor;
 379
 380   /* End-of-line byte in data.  */
 381   unsigned char eolbyte;
 382
 383   /* Cache of char-context values.  */
 384   char sbit[NOTCHAR];
 385
 386   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 387      multibyte character.  */
 388   bool never_trail[NOTCHAR];
 389
 390   /* Set of characters considered letters.  */
 391   charclass letters;
 392
 393   /* Set of characters that are newline.  */
 394   charclass newline;
 395 };
 396
 397 /* Lexical analyzer.  All the dross that deals with the obnoxious
 398    GNU Regex syntax bits is located here.  The poor, suffering
 399    reader is referred to the GNU Regex documentation for the
 400    meaning of the @#%!@#%^!@ syntax bits.  */
 401 struct lexer_state
 402 {
 403   char const *ptr;      /* Pointer to next input character.  */
 404   idx_t left;           /* Number of characters remaining.  */
 405   token lasttok;        /* Previous token returned; initially END.  */
 406   idx_t parens;         /* Count of outstanding left parens.  */
 407   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 408
 409   /* Wide character representation of the current multibyte character,
 410      or WEOF if there was an encoding error.  Used only if
 411      MB_CUR_MAX > 1.  */
 412   wint_t wctok;
 413
 414   /* The most recently analyzed multibyte bracket expression.  */
 415   struct mb_char_classes brack;
 416
 417   /* We're separated from beginning or (, | only by zero-width characters.  */
 418   bool laststart;
 419 };
 420
 421 /* Recursive descent parser for regular expressions.  */
 422
 423 struct parser_state
 424 {
 425   token tok;               /* Lookahead token.  */
 426   idx_t depth;             /* Current depth of a hypothetical stack
 427                               holding deferred productions.  This is
 428                               used to determine the depth that will be
 429                               required of the real stack later on in
 430                               dfaanalyze.  */
 431 };
 432
 433 /* A compiled regular expression.  */
 434 struct dfa
 435 {
 436   /* Fields filled by the scanner.  */
 437   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 438   idx_t cindex;                 /* Index for adding new charclasses.  */
 439   idx_t calloc;                 /* Number of charclasses allocated.  */
 440   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 441
 442   /* Scanner state */
 443   struct lexer_state lex;
 444
 445   /* Parser state */
 446   struct parser_state parse;
 447
 448   /* Fields filled by the parser.  */
 449   token *tokens;                /* Postfix parse array.  */
 450   idx_t tindex;                 /* Index for adding new tokens.  */
 451   idx_t talloc;                 /* Number of tokens currently allocated.  */
 452   idx_t depth;                  /* Depth required of an evaluation stack
 453                                    used for depth-first traversal of the
 454                                    parse tree.  */
 455   idx_t nleaves;                /* Number of leaves on the parse tree.  */
 456   idx_t nregexps;               /* Count of parallel regexps being built
 457                                    with dfaparse.  */
 458   bool fast;                    /* The DFA is fast.  */
 459   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 460   mbstate_t mbs;                /* Multibyte conversion state.  */
 461
 462   /* The following are valid only if MB_CUR_MAX > 1.  */
 463
 464   /* The value of multibyte_prop[i] is defined by following rule.
 465      if tokens[i] < NOTCHAR
 466      bit 0 : tokens[i] is the first byte of a character, including
 467      single-byte characters.
 468      bit 1 : tokens[i] is the last byte of a character, including
 469      single-byte characters.
 470
 471      e.g.
 472      tokens
 473      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 474      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 475      multibyte_prop
 476      = 3     , 1               ,  0              ,  2              , 3
 477    */
 478   char *multibyte_prop;
 479
 480   /* Fields filled by the superset.  */
 481   struct dfa *superset;             /* Hint of the dfa.  */
 482
 483   /* Fields filled by the state builder.  */
 484   dfa_state *states;            /* States of the dfa.  */
 485   state_num sindex;             /* Index for adding new states.  */
 486   idx_t salloc;                 /* Number of states currently allocated.  */
 487
 488   /* Fields filled by the parse tree->NFA conversion.  */
 489   position_set *follows;        /* Array of follow sets, indexed by position
 490                                    index.  The follow of a position is the set
 491                                    of positions containing characters that
 492                                    could conceivably follow a character
 493                                    matching the given position in a string
 494                                    matching the regexp.  Allocated to the
 495                                    maximum possible position index.  */
 496   bool searchflag;              /* We are supposed to build a searching
 497                                    as opposed to an exact matcher.  A searching
 498                                    matcher finds the first and shortest string
 499                                    matching a regexp anywhere in the buffer,
 500                                    whereas an exact matcher finds the longest
 501                                    string matching, but anchored to the
 502                                    beginning of the buffer.  */
 503
 504   /* Fields filled by dfaanalyze.  */
 505   int *constraints;             /* Array of union of accepting constraints
 506                                    in the follow of a position.  */
 507   int *separates;               /* Array of contexts on follow of a
 508                                    position.  */
 509
 510   /* Fields filled by dfaexec.  */
 511   state_num tralloc;            /* Number of transition tables that have
 512                                    slots so far, not counting trans[-1] and
 513                                    trans[-2].  */
 514   int trcount;                  /* Number of transition tables that have
 515                                    been built, other than for initial
 516                                    states.  */
 517   int min_trcount;              /* Number of initial states.  Equivalently,
 518                                    the minimum state number for which trcount
 519                                    counts transitions.  */
 520   state_num **trans;            /* Transition tables for states that can
 521                                    never accept.  If the transitions for a
 522                                    state have not yet been computed, or the
 523                                    state could possibly accept, its entry in
 524                                    this table is NULL.  This points to two
 525                                    past the start of the allocated array,
 526                                    and trans[-1] and trans[-2] are always
 527                                    NULL.  */
 528   state_num **fails;            /* Transition tables after failing to accept
 529                                    on a state that potentially could do so.
 530                                    If trans[i] is non-null, fails[i] must
 531                                    be null.  */
 532   char *success;                /* Table of acceptance conditions used in
 533                                    dfaexec and computed in build_state.  */
 534   state_num *newlines;          /* Transitions on newlines.  The entry for a
 535                                    newline in any transition table is always
 536                                    -1 so we can count lines without wasting
 537                                    too many cycles.  The transition for a
 538                                    newline is stored separately and handled
 539                                    as a special case.  Newline is also used
 540                                    as a sentinel at the end of the buffer.  */
 541   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 542                                    context in multibyte locales, in which we
 543                                    do not distinguish between their contexts,
 544                                    as not supported word.  */
 545   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 546   state_num **mb_trans;         /* Transition tables for states with
 547                                    ANYCHAR.  */
 548   state_num mb_trcount;         /* Number of transition tables for states with
 549                                    ANYCHAR that have actually been built.  */
 550
 551   /* Syntax configuration.  This is near the end so that dfacopysyntax
 552      can memset up to here.  */
 553   struct regex_syntax syntax;
 554
 555   /* Information derived from the locale.  This is at the end so that
 556      a quick memset need not clear it specially.  */
 557
 558   /* dfaexec implementation.  */
 559   char *(*dfaexec) (struct dfa *, char const *, char *,
 560                     bool, ptrdiff_t *, bool *);
 561
 562   /* Other cached information derived from the locale.  */
 563   struct localeinfo localeinfo;
 564 };
 565
 566 /* User access to dfa internals.  */
 567
 568 /* S could possibly be an accepting state of R.  */
 569 static bool
 570 accepting (state_num s, struct dfa const *r)
 571 {
 572   return r->states[s].constraint != 0;
 573 }
 574
 575 /* STATE accepts in the specified context.  */
 576 static bool
 577 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 578 {
 579   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 580 }
 581
 582 static void regexp (struct dfa *dfa);
 583
 584 /* Store into *PWC the result of converting the leading bytes of the
 585    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 586    and updating the conversion state in *D.  On conversion error,
 587    convert just a single byte, to WEOF.  Return the number of bytes
 588    converted.
 589
 590    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 591
 592    * PWC points to wint_t, not to wchar_t.
 593    * The last arg is a dfa *D instead of merely a multibyte conversion
 594      state D->mbs.
 595    * N must be at least 1.
 596    * S[N - 1] must be a sentinel byte.
 597    * Shift encodings are not supported.
 598    * The return value is always in the range 1..N.
 599    * D->mbs is always valid afterwards.
 600    * *PWC is always set to something.  */
 601 static int
 602 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 603 {
 604   unsigned char uc = s[0];
 605   wint_t wc = d->localeinfo.sbctowc[uc];
 606
 607   if (wc == WEOF)
 608     {
 609       wchar_t wch;
 610       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 611       if (0 < nbytes && nbytes < (size_t) -2)
 612         {
 613           *pwc = wch;
 614           return nbytes;
 615         }
 616       memset (&d->mbs, 0, sizeof d->mbs);
 617     }
 618
 619   *pwc = wc;
 620   return 1;
 621 }
 622
 623 #ifdef DEBUG
 624
 625 static void
 626 prtok (token t)
 627 {
 628   if (t <= END)
 629     fprintf (stderr, "END");
 630   else if (0 <= t && t < NOTCHAR)
 631     {
 632       unsigned int ch = t;
 633       fprintf (stderr, "0x%02x", ch);
 634     }
 635   else
 636     {
 637       char const *s;
 638       switch (t)
 639         {
 640         case BEG:
 641           s = "BEG";
 642           break;
 643         case EMPTY:
 644           s = "EMPTY";
 645           break;
 646         case BACKREF:
 647           s = "BACKREF";
 648           break;
 649         case BEGLINE:
 650           s = "BEGLINE";
 651           break;
 652         case ENDLINE:
 653           s = "ENDLINE";
 654           break;
 655         case BEGWORD:
 656           s = "BEGWORD";
 657           break;
 658         case ENDWORD:
 659           s = "ENDWORD";
 660           break;
 661         case LIMWORD:
 662           s = "LIMWORD";
 663           break;
 664         case NOTLIMWORD:
 665           s = "NOTLIMWORD";
 666           break;
 667         case QMARK:
 668           s = "QMARK";
 669           break;
 670         case STAR:
 671           s = "STAR";
 672           break;
 673         case PLUS:
 674           s = "PLUS";
 675           break;
 676         case CAT:
 677           s = "CAT";
 678           break;
 679         case OR:
 680           s = "OR";
 681           break;
 682         case LPAREN:
 683           s = "LPAREN";
 684           break;
 685         case RPAREN:
 686           s = "RPAREN";
 687           break;
 688         case ANYCHAR:
 689           s = "ANYCHAR";
 690           break;
 691         case MBCSET:
 692           s = "MBCSET";
 693           break;
 694         default:
 695           s = "CSET";
 696           break;
 697         }
 698       fprintf (stderr, "%s", s);
 699     }
 700 }
 701 #endif /* DEBUG */
 702
 703 /* Stuff pertaining to charclasses.  */
 704
 705 static bool
 706 tstbit (unsigned int b, charclass const *c)
 707 {
 708   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 709 }
 710
 711 static void
 712 setbit (unsigned int b, charclass *c)
 713 {
 714   charclass_word one = 1;
 715   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 716 }
 717
 718 static void
 719 clrbit (unsigned int b, charclass *c)
 720 {
 721   charclass_word one = 1;
 722   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 723 }
 724
 725 static void
 726 zeroset (charclass *s)
 727 {
 728   memset (s, 0, sizeof *s);
 729 }
 730
 731 static void
 732 fillset (charclass *s)
 733 {
 734   for (int i = 0; i < CHARCLASS_WORDS; i++)
 735     s->w[i] = CHARCLASS_WORD_MASK;
 736 }
 737
 738 static void
 739 notset (charclass *s)
 740 {
 741   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 742     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 743 }
 744
 745 static bool
 746 equal (charclass const *s1, charclass const *s2)
 747 {
 748   charclass_word w = 0;
 749   for (int i = 0; i < CHARCLASS_WORDS; i++)
 750     w |= s1->w[i] ^ s2->w[i];
 751   return w == 0;
 752 }
 753
 754 static bool
 755 emptyset (charclass const *s)
 756 {
 757   charclass_word w = 0;
 758   for (int i = 0; i < CHARCLASS_WORDS; i++)
 759     w |= s->w[i];
 760   return w == 0;
 761 }
 762
 763 /* Grow PA, which points to an array of *NITEMS items, and return the
 764    location of the reallocated array, updating *NITEMS to reflect its
 765    new size.  The new array will contain at least NITEMS_INCR_MIN more
 766    items, but will not contain more than NITEMS_MAX items total.
 767    ITEM_SIZE is the size of each item, in bytes.
 768
 769    ITEM_SIZE and NITEMS_INCR_MIN must be positive.  *NITEMS must be
 770    nonnegative.  If NITEMS_MAX is -1, it is treated as if it were
 771    infinity.
 772
 773    If PA is null, then allocate a new array instead of reallocating
 774    the old one.
 775
 776    Thus, to grow an array A without saving its old contents, do
 777    { free (A); A = xpalloc (NULL, &AITEMS, ...); }.  */
 778
 779 static void *
 780 xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
 781          ptrdiff_t nitems_max, idx_t item_size)
 782 {
 783   idx_t n0 = *nitems;
 784
 785   /* The approximate size to use for initial small allocation
 786      requests.  This is the largest "small" request for the GNU C
 787      library malloc.  */
 788   enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
 789
 790   /* If the array is tiny, grow it to about (but no greater than)
 791      DEFAULT_MXFAST bytes.  Otherwise, grow it by about 50%.
 792      Adjust the growth according to three constraints: NITEMS_INCR_MIN,
 793      NITEMS_MAX, and what the C language can represent safely.  */
 794
 795   idx_t n, nbytes;
 796   if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
 797     n = IDX_MAX;
 798   if (0 <= nitems_max && nitems_max < n)
 799     n = nitems_max;
 800
 801   idx_t adjusted_nbytes
 802     = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
 803        ? MIN (IDX_MAX, SIZE_MAX)
 804        : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
 805   if (adjusted_nbytes)
 806     {
 807       n = adjusted_nbytes / item_size;
 808       nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
 809     }
 810
 811   if (! pa)
 812     *nitems = 0;
 813   if (n - n0 < nitems_incr_min
 814       && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
 815           || (0 <= nitems_max && nitems_max < n)
 816           || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
 817     xalloc_die ();
 818   pa = xrealloc (pa, nbytes);
 819   *nitems = n;
 820   return pa;
 821 }
 822
 823 /* Ensure that the array addressed by PA holds at least I + 1 items.
 824    Either return PA, or reallocate the array and return its new address.
 825    Although PA may be null, the returned value is never null.
 826
 827    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 828    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 829    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 830    ITEM_SIZE is the size of one item; it must be positive.
 831    Avoid O(N**2) behavior on arrays growing linearly.  */
 832 static void *
 833 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 834                ptrdiff_t nitems_max, idx_t item_size)
 835 {
 836   if (i < *nitems)
 837     return pa;
 838   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 839 }
 840
 841 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 842 static idx_t
 843 charclass_index (struct dfa *d, charclass const *s)
 844 {
 845   idx_t i;
 846
 847   for (i = 0; i < d->cindex; ++i)
 848     if (equal (s, &d->charclasses[i]))
 849       return i;
 850   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 851                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 852   ++d->cindex;
 853   d->charclasses[i] = *s;
 854   return i;
 855 }
 856
 857 static bool
 858 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 859 {
 860   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 861 }
 862
 863 static int
 864 char_context (struct dfa const *dfa, unsigned char c)
 865 {
 866   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 867     return CTX_NEWLINE;
 868   if (unibyte_word_constituent (dfa, c))
 869     return CTX_LETTER;
 870   return CTX_NONE;
 871 }
 872
 873 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 874    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 875    this may happen when folding case in weird Turkish locales where
 876    dotless i/dotted I are not included in the chosen character set.
 877    Return whether a bit was set in the charclass.  */
 878 static bool
 879 setbit_wc (wint_t wc, charclass *c)
 880 {
 881   int b = wctob (wc);
 882   if (b < 0)
 883     return false;
 884
 885   setbit (b, c);
 886   return true;
 887 }
 888
 889 /* Set a bit for B and its case variants in the charclass C.
 890    MB_CUR_MAX must be 1.  */
 891 static void
 892 setbit_case_fold_c (int b, charclass *c)
 893 {
 894   int ub = toupper (b);
 895   for (int i = 0; i < NOTCHAR; i++)
 896     if (toupper (i) == ub)
 897       setbit (i, c);
 898 }
 899
 900 /* Fetch the next lexical input character from the pattern.  There
 901    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 902    value of the character or to WEOF depending on whether the input is
 903    a valid multibyte character (possibly of length 1).  Then return
 904    the next input byte value, except return EOF if the input is a
 905    multibyte character of length greater than 1.  */
 906 static int
 907 fetch_wc (struct dfa *dfa)
 908 {
 909   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 910                              dfa);
 911   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 912   dfa->lex.ptr += nbytes;
 913   dfa->lex.left -= nbytes;
 914   return c;
 915 }
 916
 917 /* If there is no more input, report an error about unbalanced brackets.
 918    Otherwise, behave as with fetch_wc (DFA).  */
 919 static int
 920 bracket_fetch_wc (struct dfa *dfa)
 921 {
 922   if (! dfa->lex.left)
 923     dfaerror (_("unbalanced ["));
 924   return fetch_wc (dfa);
 925 }
 926
 927 typedef int predicate (int);
 928
 929 /* The following list maps the names of the Posix named character classes
 930    to predicate functions that determine whether a given character is in
 931    the class.  The leading [ has already been eaten by the lexical
 932    analyzer.  */
 933 struct dfa_ctype
 934 {
 935   const char *name;
 936   predicate *func;
 937   bool single_byte_only;
 938 };
 939
 940 static const struct dfa_ctype prednames[] = {
 941   {"alpha", isalpha, false},
 942   {"upper", isupper, false},
 943   {"lower", islower, false},
 944   {"digit", isdigit, true},
 945   {"xdigit", isxdigit, false},
 946   {"space", isspace, false},
 947   {"punct", ispunct, false},
 948   {"alnum", isalnum, false},
 949   {"print", isprint, false},
 950   {"graph", isgraph, false},
 951   {"cntrl", iscntrl, false},
 952   {"blank", isblank, false},
 953   {NULL, NULL, false}
 954 };
 955
 956 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 957 find_pred (const char *str)
 958 {
 959   for (int i = 0; prednames[i].name; i++)
 960     if (streq (str, prednames[i].name))
 961       return &prednames[i];
 962   return NULL;
 963 }
 964
 965 /* Parse a bracket expression, which possibly includes multibyte
 966    characters.  */
 967 static token
 968 parse_bracket_exp (struct dfa *dfa)
 969 {
 970   /* This is a bracket expression that dfaexec is known to
 971      process correctly.  */
 972   bool known_bracket_exp = true;
 973
 974   /* Used to warn about [:space:].
 975      Bit 0 = first character is a colon.
 976      Bit 1 = last character is a colon.
 977      Bit 2 = includes any other character but a colon.
 978      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
 979   int colon_warning_state;
 980
 981   dfa->lex.brack.nchars = 0;
 982   charclass ccl;
 983   zeroset (&ccl);
 984   int c = bracket_fetch_wc (dfa);
 985   bool invert = c == '^';
 986   if (invert)
 987     {
 988       c = bracket_fetch_wc (dfa);
 989       known_bracket_exp = dfa->localeinfo.simple;
 990     }
 991   wint_t wc = dfa->lex.wctok;
 992   int c1;
 993   wint_t wc1;
 994   colon_warning_state = (c == ':');
 995   do
 996     {
 997       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
 998       colon_warning_state &= ~2;
 999
1000       /* Note that if we're looking at some other [:...:] construct,
1001          we just treat it as a bunch of ordinary characters.  We can do
1002          this because we assume regex has checked for syntax errors before
1003          dfa is ever called.  */
1004       if (c == '[')
1005         {
1006           c1 = bracket_fetch_wc (dfa);
1007           wc1 = dfa->lex.wctok;
1008
1009           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1010               || c1 == '.' || c1 == '=')
1011             {
1012               enum { MAX_BRACKET_STRING_LEN = 32 };
1013               char str[MAX_BRACKET_STRING_LEN + 1];
1014               int len = 0;
1015               for (;;)
1016                 {
1017                   c = bracket_fetch_wc (dfa);
1018                   if (dfa->lex.left == 0
1019                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1020                     break;
1021                   if (len < MAX_BRACKET_STRING_LEN)
1022                     str[len++] = c;
1023                   else
1024                     /* This is in any case an invalid class name.  */
1025                     str[0] = '\0';
1026                 }
1027               str[len] = '\0';
1028
1029               /* Fetch bracket.  */
1030               c = bracket_fetch_wc (dfa);
1031               wc = dfa->lex.wctok;
1032               if (c1 == ':')
1033                 /* Build character class.  POSIX allows character
1034                    classes to match multicharacter collating elements,
1035                    but the regex code does not support that, so do not
1036                    worry about that possibility.  */
1037                 {
1038                   char const *class
1039                     = (dfa->syntax.case_fold && (streq (str, "upper")
1040                                                  || streq (str, "lower"))
1041                        ? "alpha" : str);
1042                   const struct dfa_ctype *pred = find_pred (class);
1043                   if (!pred)
1044                     dfaerror (_("invalid character class"));
1045
1046                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1047                     known_bracket_exp = false;
1048                   else
1049                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1050                       if (pred->func (c2))
1051                         setbit (c2, &ccl);
1052                 }
1053               else
1054                 known_bracket_exp = false;
1055
1056               colon_warning_state |= 8;
1057
1058               /* Fetch new lookahead character.  */
1059               c1 = bracket_fetch_wc (dfa);
1060               wc1 = dfa->lex.wctok;
1061               continue;
1062             }
1063
1064           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1065              are already set up.  */
1066         }
1067
1068       if (c == '\\'
1069           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1070         {
1071           c = bracket_fetch_wc (dfa);
1072           wc = dfa->lex.wctok;
1073         }
1074
1075       if (c1 == NOTCHAR)
1076         {
1077           c1 = bracket_fetch_wc (dfa);
1078           wc1 = dfa->lex.wctok;
1079         }
1080
1081       if (c1 == '-')
1082         /* build range characters.  */
1083         {
1084           int c2 = bracket_fetch_wc (dfa);
1085           wint_t wc2 = dfa->lex.wctok;
1086
1087           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1088              Treat it like [-a[.aa.]] while parsing it, and
1089              remember that the set is unknown.  */
1090           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1091             {
1092               known_bracket_exp = false;
1093               c2 = ']';
1094             }
1095
1096           if (c2 == ']')
1097             {
1098               /* In the case [x-], the - is an ordinary hyphen,
1099                  which is left in c1, the lookahead character.  */
1100               dfa->lex.ptr--;
1101               dfa->lex.left++;
1102             }
1103           else
1104             {
1105               if (c2 == '\\' && (dfa->syntax.syntax_bits
1106                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1107                 {
1108                   c2 = bracket_fetch_wc (dfa);
1109                   wc2 = dfa->lex.wctok;
1110                 }
1111
1112               colon_warning_state |= 8;
1113               c1 = bracket_fetch_wc (dfa);
1114               wc1 = dfa->lex.wctok;
1115
1116               /* Treat [x-y] as a range if x != y.  */
1117               if (wc != wc2 || wc == WEOF)
1118                 {
1119                   if (dfa->localeinfo.simple
1120                       || (isasciidigit (c) & isasciidigit (c2)))
1121                     {
1122                       for (int ci = c; ci <= c2; ci++)
1123                         if (dfa->syntax.case_fold && isalpha (ci))
1124                           setbit_case_fold_c (ci, &ccl);
1125                         else
1126                           setbit (ci, &ccl);
1127                     }
1128                   else
1129                     known_bracket_exp = false;
1130
1131                   continue;
1132                 }
1133             }
1134         }
1135
1136       colon_warning_state |= (c == ':') ? 2 : 4;
1137
1138       if (!dfa->localeinfo.multibyte)
1139         {
1140           if (dfa->syntax.case_fold && isalpha (c))
1141             setbit_case_fold_c (c, &ccl);
1142           else
1143             setbit (c, &ccl);
1144           continue;
1145         }
1146
1147       if (wc == WEOF)
1148         known_bracket_exp = false;
1149       else
1150         {
1151           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1152           int n = (dfa->syntax.case_fold
1153                    ? case_folded_counterparts (wc, folded + 1) + 1
1154                    : 1);
1155           folded[0] = wc;
1156           for (int i = 0; i < n; i++)
1157             if (!setbit_wc (folded[i], &ccl))
1158               {
1159                 dfa->lex.brack.chars
1160                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1161                                    &dfa->lex.brack.nchars_alloc, -1,
1162                                    sizeof *dfa->lex.brack.chars);
1163                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1164               }
1165         }
1166     }
1167   while ((wc = wc1, (c = c1) != ']'));
1168
1169   if (colon_warning_state == 7)
1170     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1171
1172   if (! known_bracket_exp)
1173     return BACKREF;
1174
1175   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1176     {
1177       dfa->lex.brack.invert = invert;
1178       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1179       return MBCSET;
1180     }
1181
1182   if (invert)
1183     {
1184       notset (&ccl);
1185       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1186         clrbit ('\n', &ccl);
1187     }
1188
1189   return CSET + charclass_index (dfa, &ccl);
1190 }
1191
1192 struct lexptr
1193 {
1194   char const *ptr;
1195   idx_t left;
1196 };
1197
1198 static void
1199 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1200 {
1201   ls->ptr = dfa->lex.ptr;
1202   ls->left = dfa->lex.left;
1203   dfa->lex.ptr = s;
1204   dfa->lex.left = strlen (s);
1205 }
1206
1207 static void
1208 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1209 {
1210   dfa->lex.ptr = ls->ptr;
1211   dfa->lex.left = ls->left;
1212 }
1213
1214 static token
1215 lex (struct dfa *dfa)
1216 {
1217   bool backslash = false;
1218
1219   /* Basic plan: We fetch a character.  If it's a backslash,
1220      we set the backslash flag and go through the loop again.
1221      On the plus side, this avoids having a duplicate of the
1222      main switch inside the backslash case.  On the minus side,
1223      it means that just about every case begins with
1224      "if (backslash) ...".  */
1225   for (int i = 0; i < 2; ++i)
1226     {
1227       if (! dfa->lex.left)
1228         return dfa->lex.lasttok = END;
1229       int c = fetch_wc (dfa);
1230
1231       switch (c)
1232         {
1233         case '\\':
1234           if (backslash)
1235             goto normal_char;
1236           if (dfa->lex.left == 0)
1237             dfaerror (_("unfinished \\ escape"));
1238           backslash = true;
1239           break;
1240
1241         case '^':
1242           if (backslash)
1243             goto normal_char;
1244           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1245               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1246               || dfa->lex.lasttok == OR)
1247             return dfa->lex.lasttok = BEGLINE;
1248           goto normal_char;
1249
1250         case '$':
1251           if (backslash)
1252             goto normal_char;
1253           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1254               || dfa->lex.left == 0
1255               || ((dfa->lex.left
1256                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1257                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1258                                    & (dfa->lex.ptr[0] == '\\')]
1259                       == ')'))
1260               || ((dfa->lex.left
1261                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1262                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1263                                    & (dfa->lex.ptr[0] == '\\')]
1264                       == '|'))
1265               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1266                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1267             return dfa->lex.lasttok = ENDLINE;
1268           goto normal_char;
1269
1270         case '1':
1271         case '2':
1272         case '3':
1273         case '4':
1274         case '5':
1275         case '6':
1276         case '7':
1277         case '8':
1278         case '9':
1279           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1280             {
1281               dfa->lex.laststart = false;
1282               return dfa->lex.lasttok = BACKREF;
1283             }
1284           goto normal_char;
1285
1286         case '`':
1287           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1288             {
1289               /* FIXME: should be beginning of string */
1290               return dfa->lex.lasttok = BEGLINE;
1291             }
1292           goto normal_char;
1293
1294         case '\'':
1295           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1296             {
1297               /* FIXME: should be end of string */
1298               return dfa->lex.lasttok = ENDLINE;
1299             }
1300           goto normal_char;
1301
1302         case '<':
1303           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1304             return dfa->lex.lasttok = BEGWORD;
1305           goto normal_char;
1306
1307         case '>':
1308           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1309             return dfa->lex.lasttok = ENDWORD;
1310           goto normal_char;
1311
1312         case 'b':
1313           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1314             return dfa->lex.lasttok = LIMWORD;
1315           goto normal_char;
1316
1317         case 'B':
1318           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1319             return dfa->lex.lasttok = NOTLIMWORD;
1320           goto normal_char;
1321
1322         case '?':
1323           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1324             goto normal_char;
1325           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1326             goto normal_char;
1327           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1328               && dfa->lex.laststart)
1329             goto normal_char;
1330           return dfa->lex.lasttok = QMARK;
1331
1332         case '*':
1333           if (backslash)
1334             goto normal_char;
1335           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1336               && dfa->lex.laststart)
1337             goto normal_char;
1338           return dfa->lex.lasttok = STAR;
1339
1340         case '+':
1341           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1342             goto normal_char;
1343           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1344             goto normal_char;
1345           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1346               && dfa->lex.laststart)
1347             goto normal_char;
1348           return dfa->lex.lasttok = PLUS;
1349
1350         case '{':
1351           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1352             goto normal_char;
1353           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1354             goto normal_char;
1355           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1356               && dfa->lex.laststart)
1357             goto normal_char;
1358
1359           /* Cases:
1360              {M} - exact count
1361              {M,} - minimum count, maximum is infinity
1362              {,N} - 0 through N
1363              {,} - 0 to infinity (same as '*')
1364              {M,N} - M through N */
1365           {
1366             char const *p = dfa->lex.ptr;
1367             char const *lim = p + dfa->lex.left;
1368             dfa->lex.minrep = dfa->lex.maxrep = -1;
1369             for (; p != lim && isasciidigit (*p); p++)
1370               dfa->lex.minrep = (dfa->lex.minrep < 0
1371                                  ? *p - '0'
1372                                  : MIN (RE_DUP_MAX + 1,
1373                                         dfa->lex.minrep * 10 + *p - '0'));
1374             if (p != lim)
1375               {
1376                 if (*p != ',')
1377                   dfa->lex.maxrep = dfa->lex.minrep;
1378                 else
1379                   {
1380                     if (dfa->lex.minrep < 0)
1381                       dfa->lex.minrep = 0;
1382                     while (++p != lim && isasciidigit (*p))
1383                       dfa->lex.maxrep
1384                         = (dfa->lex.maxrep < 0
1385                            ? *p - '0'
1386                            : MIN (RE_DUP_MAX + 1,
1387                                   dfa->lex.maxrep * 10 + *p - '0'));
1388                   }
1389               }
1390             if (! ((! backslash || (p != lim && *p++ == '\\'))
1391                    && p != lim && *p++ == '}'
1392                    && 0 <= dfa->lex.minrep
1393                    && (dfa->lex.maxrep < 0
1394                        || dfa->lex.minrep <= dfa->lex.maxrep)))
1395               {
1396                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1397                   goto normal_char;
1398                 dfaerror (_("invalid content of \\{\\}"));
1399               }
1400             if (RE_DUP_MAX < dfa->lex.maxrep)
1401               dfaerror (_("regular expression too big"));
1402             dfa->lex.ptr = p;
1403             dfa->lex.left = lim - p;
1404           }
1405           dfa->lex.laststart = false;
1406           return dfa->lex.lasttok = REPMN;
1407
1408         case '|':
1409           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1410             goto normal_char;
1411           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1412             goto normal_char;
1413           dfa->lex.laststart = true;
1414           return dfa->lex.lasttok = OR;
1415
1416         case '\n':
1417           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1418               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1419             goto normal_char;
1420           dfa->lex.laststart = true;
1421           return dfa->lex.lasttok = OR;
1422
1423         case '(':
1424           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1425             goto normal_char;
1426           dfa->lex.parens++;
1427           dfa->lex.laststart = true;
1428           return dfa->lex.lasttok = LPAREN;
1429
1430         case ')':
1431           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1432             goto normal_char;
1433           if (dfa->lex.parens == 0
1434               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1435             goto normal_char;
1436           dfa->lex.parens--;
1437           dfa->lex.laststart = false;
1438           return dfa->lex.lasttok = RPAREN;
1439
1440         case '.':
1441           if (backslash)
1442             goto normal_char;
1443           if (dfa->canychar < 0)
1444             {
1445               charclass ccl;
1446               fillset (&ccl);
1447               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1448                 clrbit ('\n', &ccl);
1449               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1450                 clrbit ('\0', &ccl);
1451               if (dfa->localeinfo.multibyte)
1452                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1453                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1454                     clrbit (c2, &ccl);
1455               dfa->canychar = charclass_index (dfa, &ccl);
1456             }
1457           dfa->lex.laststart = false;
1458           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1459                                      ? ANYCHAR
1460                                      : CSET + dfa->canychar);
1461
1462         case 's':
1463         case 'S':
1464           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1465             goto normal_char;
1466           if (!dfa->localeinfo.multibyte)
1467             {
1468               charclass ccl;
1469               zeroset (&ccl);
1470               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1471                 if (isspace (c2))
1472                   setbit (c2, &ccl);
1473               if (c == 'S')
1474                 notset (&ccl);
1475               dfa->lex.laststart = false;
1476               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1477             }
1478
1479           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1480              add_utf8_anychar, makes sense.  */
1481
1482           /* \s and \S are documented to be equivalent to [[:space:]] and
1483              [^[:space:]] respectively, so tell the lexer to process those
1484              strings, each minus its "already processed" '['.  */
1485           {
1486             struct lexptr ls;
1487             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1488             dfa->lex.lasttok = parse_bracket_exp (dfa);
1489             pop_lex_state (dfa, &ls);
1490           }
1491
1492           dfa->lex.laststart = false;
1493           return dfa->lex.lasttok;
1494
1495         case 'w':
1496         case 'W':
1497           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1498             goto normal_char;
1499
1500           if (!dfa->localeinfo.multibyte)
1501             {
1502               charclass ccl;
1503               zeroset (&ccl);
1504               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1505                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1506                   setbit (c2, &ccl);
1507               if (c == 'W')
1508                 notset (&ccl);
1509               dfa->lex.laststart = false;
1510               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1511             }
1512
1513           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1514              add_utf8_anychar, makes sense.  */
1515
1516           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1517              [^_[:alnum:]] respectively, so tell the lexer to process those
1518              strings, each minus its "already processed" '['.  */
1519           {
1520             struct lexptr ls;
1521             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1522             dfa->lex.lasttok = parse_bracket_exp (dfa);
1523             pop_lex_state (dfa, &ls);
1524           }
1525
1526           dfa->lex.laststart = false;
1527           return dfa->lex.lasttok;
1528
1529         case '[':
1530           if (backslash)
1531             goto normal_char;
1532           dfa->lex.laststart = false;
1533           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1534
1535         default:
1536         normal_char:
1537           dfa->lex.laststart = false;
1538           /* For multibyte character sets, folding is done in atom.  Always
1539              return WCHAR.  */
1540           if (dfa->localeinfo.multibyte)
1541             return dfa->lex.lasttok = WCHAR;
1542
1543           if (dfa->syntax.case_fold && isalpha (c))
1544             {
1545               charclass ccl;
1546               zeroset (&ccl);
1547               setbit_case_fold_c (c, &ccl);
1548               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1549             }
1550
1551           return dfa->lex.lasttok = c;
1552         }
1553     }
1554
1555   /* The above loop should consume at most a backslash
1556      and some other character.  */
1557   abort ();
1558   return END;                   /* keeps pedantic compilers happy.  */
1559 }
1560
1561 static void
1562 addtok_mb (struct dfa *dfa, token t, char mbprop)
1563 {
1564   if (dfa->talloc == dfa->tindex)
1565     {
1566       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1567                              sizeof *dfa->tokens);
1568       if (dfa->localeinfo.multibyte)
1569         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1570                                          sizeof *dfa->multibyte_prop);
1571     }
1572   if (dfa->localeinfo.multibyte)
1573     dfa->multibyte_prop[dfa->tindex] = mbprop;
1574   dfa->tokens[dfa->tindex++] = t;
1575
1576   switch (t)
1577     {
1578     case QMARK:
1579     case STAR:
1580     case PLUS:
1581       break;
1582
1583     case CAT:
1584     case OR:
1585       dfa->parse.depth--;
1586       break;
1587
1588     case BACKREF:
1589       dfa->fast = false;
1590       FALLTHROUGH;
1591     default:
1592       dfa->nleaves++;
1593       FALLTHROUGH;
1594     case EMPTY:
1595       dfa->parse.depth++;
1596       break;
1597     }
1598   if (dfa->parse.depth > dfa->depth)
1599     dfa->depth = dfa->parse.depth;
1600 }
1601
1602 static void addtok_wc (struct dfa *dfa, wint_t wc);
1603
1604 /* Add the given token to the parse tree, maintaining the depth count and
1605    updating the maximum depth if necessary.  */
1606 static void
1607 addtok (struct dfa *dfa, token t)
1608 {
1609   if (dfa->localeinfo.multibyte && t == MBCSET)
1610     {
1611       bool need_or = false;
1612
1613       /* Extract wide characters into alternations for better performance.
1614          This does not require UTF-8.  */
1615       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1616         {
1617           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1618           if (need_or)
1619             addtok (dfa, OR);
1620           need_or = true;
1621         }
1622       dfa->lex.brack.nchars = 0;
1623
1624       /* Wide characters have been handled above, so it is possible
1625          that the set is empty now.  Do nothing in that case.  */
1626       if (dfa->lex.brack.cset != -1)
1627         {
1628           addtok (dfa, CSET + dfa->lex.brack.cset);
1629           if (need_or)
1630             addtok (dfa, OR);
1631         }
1632     }
1633   else
1634     {
1635       addtok_mb (dfa, t, 3);
1636     }
1637 }
1638
1639 /* We treat a multibyte character as a single atom, so that DFA
1640    can treat a multibyte character as a single expression.
1641
1642    e.g., we construct the following tree from "<mb1><mb2>".
1643    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1644    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1645 static void
1646 addtok_wc (struct dfa *dfa, wint_t wc)
1647 {
1648   unsigned char buf[MB_LEN_MAX];
1649   mbstate_t s = { 0 };
1650   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1651   int buflen;
1652
1653   if (stored_bytes != (size_t) -1)
1654     buflen = stored_bytes;
1655   else
1656     {
1657       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1658          the addtok_mb call altogether can corrupt the heap.  */
1659       buflen = 1;
1660       buf[0] = 0;
1661     }
1662
1663   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1664   for (int i = 1; i < buflen; i++)
1665     {
1666       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1667       addtok (dfa, CAT);
1668     }
1669 }
1670
1671 static void
1672 add_utf8_anychar (struct dfa *dfa)
1673 {
1674   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1675      UTF-8 byte sequence has been defined as follows:
1676
1677      ([\x00-\x7f]
1678      |[\xc2-\xdf][\x80-\xbf]
1679      |[\xe0][\xa0-\xbf][\x80-\xbf]
1680      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1681      |[\xed][\x80-\x9f][\x80-\xbf]
1682      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1683      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1684      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1685
1686      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1687      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1688      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1689      H = [\x80-\x9f], I = [\xf0],
1690      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1691
1692      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1693
1694   /* Mnemonics for classes containing two or more bytes.  */
1695   enum { A, B, C, E, F, H, J, K, M };
1696
1697   /* Mnemonics for single-byte tokens.  */
1698   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1699
1700   static charclass const utf8_classes[] = {
1701     /* A. 00-7f: 1-byte sequence.  */
1702     CHARCLASS_INIT (0xffffffffffffffff, 0xffffffffffffffff, 0, 0),
1703
1704     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1705     CHARCLASS_INIT (0, 0, 0, 0x00000000fffffffc),
1706
1707     /* C. 80-bf: non-leading bytes.  */
1708     CHARCLASS_INIT (0, 0, 0xffffffffffffffff, 0),
1709
1710     /* D. e0 (just a token).  */
1711
1712     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1713     CHARCLASS_INIT (0, 0, 0xffffffff00000000, 0),
1714
1715     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1716     CHARCLASS_INIT (0, 0, 0, 0x0000dffe00000000),
1717
1718     /* G. ed (just a token).  */
1719
1720     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1721     CHARCLASS_INIT (0, 0, 0x000000000000ffff, 0),
1722
1723     /* I. f0 (just a token).  */
1724
1725     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1726     CHARCLASS_INIT (0, 0, 0xffffffffffff0000, 0),
1727
1728     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1729     CHARCLASS_INIT (0, 0, 0, 0x000e000000000000),
1730
1731     /* L. f4 (just a token).  */
1732
1733     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1734     CHARCLASS_INIT (0, 0, 0x00000000000000ff, 0),
1735   };
1736
1737   /* Define the character classes that are needed below.  */
1738   if (dfa->utf8_anychar_classes[0] == 0)
1739     {
1740       charclass c = utf8_classes[0];
1741       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1742         clrbit ('\n', &c);
1743       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1744         clrbit ('\0', &c);
1745       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1746
1747       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1748         dfa->utf8_anychar_classes[i]
1749           = CSET + charclass_index (dfa, &utf8_classes[i]);
1750     }
1751
1752   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1753      The token buffer is in reverse Polish order, so we get
1754      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1755       C CAT OR C CAT OR C CAT OR".  */
1756   addtok (dfa, dfa->utf8_anychar_classes[A]);
1757   addtok (dfa, dfa->utf8_anychar_classes[B]);
1758   addtok (dfa, D_token);
1759   addtok (dfa, dfa->utf8_anychar_classes[E]);
1760   addtok (dfa, CAT);
1761   addtok (dfa, OR);
1762   addtok (dfa, G_token);
1763   addtok (dfa, dfa->utf8_anychar_classes[H]);
1764   addtok (dfa, CAT);
1765   addtok (dfa, OR);
1766   addtok (dfa, dfa->utf8_anychar_classes[F]);
1767   addtok (dfa, I_token);
1768   addtok (dfa, dfa->utf8_anychar_classes[J]);
1769   addtok (dfa, CAT);
1770   addtok (dfa, OR);
1771   addtok (dfa, L_token);
1772   addtok (dfa, dfa->utf8_anychar_classes[M]);
1773   addtok (dfa, CAT);
1774   addtok (dfa, OR);
1775   addtok (dfa, dfa->utf8_anychar_classes[K]);
1776   for (int i = 0; i < 3; i++)
1777     {
1778       addtok (dfa, dfa->utf8_anychar_classes[C]);
1779       addtok (dfa, CAT);
1780       addtok (dfa, OR);
1781     }
1782 }
1783
1784 /* The grammar understood by the parser is as follows.
1785
1786    regexp:
1787      regexp OR branch
1788      branch
1789
1790    branch:
1791      branch closure
1792      closure
1793
1794    closure:
1795      closure QMARK
1796      closure STAR
1797      closure PLUS
1798      closure REPMN
1799      atom
1800
1801    atom:
1802      <normal character>
1803      <multibyte character>
1804      ANYCHAR
1805      MBCSET
1806      CSET
1807      BACKREF
1808      BEGLINE
1809      ENDLINE
1810      BEGWORD
1811      ENDWORD
1812      LIMWORD
1813      NOTLIMWORD
1814      LPAREN regexp RPAREN
1815      <empty>
1816
1817    The parser builds a parse tree in postfix form in an array of tokens.  */
1818
1819 static void
1820 atom (struct dfa *dfa)
1821 {
1822   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1823       || dfa->parse.tok >= CSET
1824       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1825       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1826       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1827       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1828       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1829     {
1830       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1831         {
1832           /* For UTF-8 expand the period to a series of CSETs that define a
1833              valid UTF-8 character.  This avoids using the slow multibyte
1834              path.  I'm pretty sure it would be both profitable and correct to
1835              do it for any encoding; however, the optimization must be done
1836              manually as it is done above in add_utf8_anychar.  So, let's
1837              start with UTF-8: it is the most used, and the structure of the
1838              encoding makes the correctness more obvious.  */
1839           add_utf8_anychar (dfa);
1840         }
1841       else
1842         addtok (dfa, dfa->parse.tok);
1843       dfa->parse.tok = lex (dfa);
1844     }
1845   else if (dfa->parse.tok == WCHAR)
1846     {
1847       if (dfa->lex.wctok == WEOF)
1848         addtok (dfa, BACKREF);
1849       else
1850         {
1851           addtok_wc (dfa, dfa->lex.wctok);
1852
1853           if (dfa->syntax.case_fold)
1854             {
1855               wchar_t folded[CASE_FOLDED_BUFSIZE];
1856               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1857               for (int i = 0; i < n; i++)
1858                 {
1859                   addtok_wc (dfa, folded[i]);
1860                   addtok (dfa, OR);
1861                 }
1862             }
1863         }
1864
1865       dfa->parse.tok = lex (dfa);
1866     }
1867   else if (dfa->parse.tok == LPAREN)
1868     {
1869       dfa->parse.tok = lex (dfa);
1870       regexp (dfa);
1871       if (dfa->parse.tok != RPAREN)
1872         dfaerror (_("unbalanced ("));
1873       dfa->parse.tok = lex (dfa);
1874     }
1875   else
1876     addtok (dfa, EMPTY);
1877 }
1878
1879 /* Return the number of tokens in the given subexpression.  */
1880 static idx_t _GL_ATTRIBUTE_PURE
1881 nsubtoks (struct dfa const *dfa, idx_t tindex)
1882 {
1883   switch (dfa->tokens[tindex - 1])
1884     {
1885     default:
1886       return 1;
1887     case QMARK:
1888     case STAR:
1889     case PLUS:
1890       return 1 + nsubtoks (dfa, tindex - 1);
1891     case CAT:
1892     case OR:
1893       {
1894         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1895         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1896       }
1897     }
1898 }
1899
1900 /* Copy the given subexpression to the top of the tree.  */
1901 static void
1902 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1903 {
1904   if (dfa->localeinfo.multibyte)
1905     for (idx_t i = 0; i < ntokens; i++)
1906       addtok_mb (dfa, dfa->tokens[tindex + i],
1907                  dfa->multibyte_prop[tindex + i]);
1908   else
1909     for (idx_t i = 0; i < ntokens; i++)
1910       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1911 }
1912
1913 static void
1914 closure (struct dfa *dfa)
1915 {
1916   atom (dfa);
1917   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1918          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1919     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1920       {
1921         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1922         idx_t tindex = dfa->tindex - ntokens;
1923         if (dfa->lex.maxrep < 0)
1924           addtok (dfa, PLUS);
1925         if (dfa->lex.minrep == 0)
1926           addtok (dfa, QMARK);
1927         int i;
1928         for (i = 1; i < dfa->lex.minrep; i++)
1929           {
1930             copytoks (dfa, tindex, ntokens);
1931             addtok (dfa, CAT);
1932           }
1933         for (; i < dfa->lex.maxrep; i++)
1934           {
1935             copytoks (dfa, tindex, ntokens);
1936             addtok (dfa, QMARK);
1937             addtok (dfa, CAT);
1938           }
1939         dfa->parse.tok = lex (dfa);
1940       }
1941     else if (dfa->parse.tok == REPMN)
1942       {
1943         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1944         dfa->parse.tok = lex (dfa);
1945         closure (dfa);
1946       }
1947     else
1948       {
1949         addtok (dfa, dfa->parse.tok);
1950         dfa->parse.tok = lex (dfa);
1951       }
1952 }
1953
1954 static void
1955 branch (struct dfa* dfa)
1956 {
1957   closure (dfa);
1958   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1959          && dfa->parse.tok >= 0)
1960     {
1961       closure (dfa);
1962       addtok (dfa, CAT);
1963     }
1964 }
1965
1966 static void
1967 regexp (struct dfa *dfa)
1968 {
1969   branch (dfa);
1970   while (dfa->parse.tok == OR)
1971     {
1972       dfa->parse.tok = lex (dfa);
1973       branch (dfa);
1974       addtok (dfa, OR);
1975     }
1976 }
1977
1978 /* Parse a string S of length LEN into D.  S can include NUL characters.
1979    This is the main entry point for the parser.  */
1980 void
1981 dfaparse (char const *s, idx_t len, struct dfa *d)
1982 {
1983   d->lex.ptr = s;
1984   d->lex.left = len;
1985   d->lex.lasttok = END;
1986   d->lex.laststart = true;
1987
1988   if (!d->syntax.syntax_bits_set)
1989     dfaerror (_("no syntax specified"));
1990
1991   if (!d->nregexps)
1992     addtok (d, BEG);
1993
1994   d->parse.tok = lex (d);
1995   d->parse.depth = d->depth;
1996
1997   regexp (d);
1998
1999   if (d->parse.tok != END)
2000     dfaerror (_("unbalanced )"));
2001
2002   addtok (d, END - d->nregexps);
2003   addtok (d, CAT);
2004
2005   if (d->nregexps)
2006     addtok (d, OR);
2007
2008   ++d->nregexps;
2009 }
2010
2011 /* Some primitives for operating on sets of positions.  */
2012
2013 /* Copy one set to another.  */
2014 static void
2015 copy (position_set const *src, position_set *dst)
2016 {
2017   if (dst->alloc < src->nelem)
2018     {
2019       free (dst->elems);
2020       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2021                             sizeof *dst->elems);
2022     }
2023   dst->nelem = src->nelem;
2024   if (src->nelem != 0)
2025     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2026 }
2027
2028 static void
2029 alloc_position_set (position_set *s, idx_t size)
2030 {
2031   s->elems = xnmalloc (size, sizeof *s->elems);
2032   s->alloc = size;
2033   s->nelem = 0;
2034 }
2035
2036 /* Insert position P in set S.  S is maintained in sorted order on
2037    decreasing index.  If there is already an entry in S with P.index
2038    then merge (logically-OR) P's constraints into the one in S.
2039    S->elems must point to an array large enough to hold the resulting set.  */
2040 static void
2041 insert (position p, position_set *s)
2042 {
2043   idx_t count = s->nelem;
2044   idx_t lo = 0, hi = count;
2045   while (lo < hi)
2046     {
2047       idx_t mid = (lo + hi) >> 1;
2048       if (s->elems[mid].index < p.index)
2049         lo = mid + 1;
2050       else if (s->elems[mid].index == p.index)
2051         {
2052           s->elems[mid].constraint |= p.constraint;
2053           return;
2054         }
2055       else
2056         hi = mid;
2057     }
2058
2059   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2060   for (idx_t i = count; i > lo; i--)
2061     s->elems[i] = s->elems[i - 1];
2062   s->elems[lo] = p;
2063   ++s->nelem;
2064 }
2065
2066 static void
2067 append (position p, position_set *s)
2068 {
2069   idx_t count = s->nelem;
2070   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2071   s->elems[s->nelem++] = p;
2072 }
2073
2074 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2075    result is as if the positions of S1, and of S2 with the additional
2076    constraint C2, were inserted into an initially empty set.  */
2077 static void
2078 merge_constrained (position_set const *s1, position_set const *s2,
2079                    unsigned int c2, position_set *m)
2080 {
2081   idx_t i = 0, j = 0;
2082
2083   if (m->alloc - s1->nelem < s2->nelem)
2084     {
2085       free (m->elems);
2086       m->alloc = s1->nelem;
2087       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2088     }
2089   m->nelem = 0;
2090   while (i < s1->nelem || j < s2->nelem)
2091     if (! (j < s2->nelem)
2092         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2093       {
2094         unsigned int c = ((i < s1->nelem && j < s2->nelem
2095                            && s1->elems[i].index == s2->elems[j].index)
2096                           ? s2->elems[j++].constraint & c2
2097                           : 0);
2098         m->elems[m->nelem].index = s1->elems[i].index;
2099         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2100       }
2101     else
2102       {
2103         if (s2->elems[j].constraint & c2)
2104           {
2105             m->elems[m->nelem].index = s2->elems[j].index;
2106             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2107           }
2108         j++;
2109       }
2110 }
2111
2112 /* Merge two sets of positions into a third.  The result is exactly as if
2113    the positions of both sets were inserted into an initially empty set.  */
2114 static void
2115 merge (position_set const *s1, position_set const *s2, position_set *m)
2116 {
2117   merge_constrained (s1, s2, -1, m);
2118 }
2119
2120 static void
2121 merge2 (position_set *dst, position_set const *src, position_set *m)
2122 {
2123   if (src->nelem < 4)
2124     {
2125       for (idx_t i = 0; i < src->nelem; i++)
2126         insert (src->elems[i], dst);
2127     }
2128    else
2129     {
2130       merge (src, dst, m);
2131       copy (m, dst);
2132     }
2133 }
2134
2135 /* Delete a position from a set.  Return the nonzero constraint of the
2136    deleted position, or zero if there was no such position.  */
2137 static unsigned int
2138 delete (idx_t del, position_set *s)
2139 {
2140   idx_t count = s->nelem;
2141   idx_t lo = 0, hi = count;
2142   while (lo < hi)
2143     {
2144       idx_t mid = (lo + hi) >> 1;
2145       if (s->elems[mid].index < del)
2146         lo = mid + 1;
2147       else if (s->elems[mid].index == del)
2148         {
2149           unsigned int c = s->elems[mid].constraint;
2150           idx_t i;
2151           for (i = mid; i + 1 < count; i++)
2152             s->elems[i] = s->elems[i + 1];
2153           s->nelem = i;
2154           return c;
2155         }
2156       else
2157         hi = mid;
2158     }
2159   return 0;
2160 }
2161
2162 /* Replace a position with the followed set.  */
2163 static void
2164 replace (position_set *dst, idx_t del, position_set *add,
2165          unsigned int constraint, position_set *tmp)
2166 {
2167   unsigned int c = delete (del, dst) & constraint;
2168
2169   if (c)
2170     {
2171       copy (dst, tmp);
2172       merge_constrained (tmp, add, c, dst);
2173     }
2174 }
2175
2176 /* Find the index of the state corresponding to the given position set with
2177    the given preceding context, or create a new state if there is no such
2178    state.  Context tells whether we got here on a newline or letter.  */
2179 static state_num
2180 state_index (struct dfa *d, position_set const *s, int context)
2181 {
2182   size_t hash = 0;
2183   int constraint = 0;
2184   state_num i;
2185   token first_end = 0;
2186
2187   for (i = 0; i < s->nelem; ++i)
2188     {
2189       size_t ind = s->elems[i].index;
2190       hash ^= ind + s->elems[i].constraint;
2191     }
2192
2193   /* Try to find a state that exactly matches the proposed one.  */
2194   for (i = 0; i < d->sindex; ++i)
2195     {
2196       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2197           || context != d->states[i].context)
2198         continue;
2199       state_num j;
2200       for (j = 0; j < s->nelem; ++j)
2201         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2202             || s->elems[j].index != d->states[i].elems.elems[j].index)
2203           break;
2204       if (j == s->nelem)
2205         return i;
2206     }
2207
2208 #ifdef DEBUG
2209   fprintf (stderr, "new state %td\n nextpos:", i);
2210   for (state_num j = 0; j < s->nelem; j++)
2211     {
2212       fprintf (stderr, " %td:", s->elems[j].index);
2213       prtok (d->tokens[s->elems[j].index]);
2214     }
2215   fprintf (stderr, "\n context:");
2216   if (context ^ CTX_ANY)
2217     {
2218       if (context & CTX_NONE)
2219         fprintf (stderr, " CTX_NONE");
2220       if (context & CTX_LETTER)
2221         fprintf (stderr, " CTX_LETTER");
2222       if (context & CTX_NEWLINE)
2223         fprintf (stderr, " CTX_NEWLINE");
2224     }
2225   else
2226     fprintf (stderr, " CTX_ANY");
2227   fprintf (stderr, "\n");
2228 #endif
2229
2230   for (state_num j = 0; j < s->nelem; j++)
2231     {
2232       int c = d->constraints[s->elems[j].index];
2233
2234       if (c != 0)
2235         {
2236           if (succeeds_in_context (c, context, CTX_ANY))
2237             constraint |= c;
2238           if (!first_end)
2239             first_end = d->tokens[s->elems[j].index];
2240         }
2241       else if (d->tokens[s->elems[j].index] == BACKREF)
2242         constraint = NO_CONSTRAINT;
2243     }
2244
2245
2246   /* Create a new state.  */
2247   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2248                              sizeof *d->states);
2249   d->states[i].hash = hash;
2250   alloc_position_set (&d->states[i].elems, s->nelem);
2251   copy (s, &d->states[i].elems);
2252   d->states[i].context = context;
2253   d->states[i].constraint = constraint;
2254   d->states[i].first_end = first_end;
2255   d->states[i].mbps.nelem = 0;
2256   d->states[i].mbps.elems = NULL;
2257   d->states[i].mb_trindex = -1;
2258
2259   ++d->sindex;
2260
2261   return i;
2262 }
2263
2264 /* Find the epsilon closure of a set of positions.  If any position of the set
2265    contains a symbol that matches the empty string in some context, replace
2266    that position with the elements of its follow labeled with an appropriate
2267    constraint.  Repeat exhaustively until no funny positions are left.
2268    S->elems must be large enough to hold the result.  */
2269 static void
2270 epsclosure (struct dfa const *d)
2271 {
2272   position_set tmp;
2273   alloc_position_set (&tmp, d->nleaves);
2274   for (idx_t i = 0; i < d->tindex; i++)
2275     if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2276         && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2277         && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2278       {
2279         unsigned int constraint;
2280         switch (d->tokens[i])
2281           {
2282           case BEGLINE:
2283             constraint = BEGLINE_CONSTRAINT;
2284             break;
2285           case ENDLINE:
2286             constraint = ENDLINE_CONSTRAINT;
2287             break;
2288           case BEGWORD:
2289             constraint = BEGWORD_CONSTRAINT;
2290             break;
2291           case ENDWORD:
2292             constraint = ENDWORD_CONSTRAINT;
2293             break;
2294           case LIMWORD:
2295             constraint = LIMWORD_CONSTRAINT;
2296             break;
2297           case NOTLIMWORD:
2298             constraint = NOTLIMWORD_CONSTRAINT;
2299             break;
2300           default:
2301             constraint = NO_CONSTRAINT;
2302             break;
2303           }
2304
2305         delete (i, &d->follows[i]);
2306
2307         for (idx_t j = 0; j < d->tindex; j++)
2308           if (i != j && d->follows[j].nelem > 0)
2309             replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2310       }
2311   free (tmp.elems);
2312 }
2313
2314 /* Returns the set of contexts for which there is at least one
2315    character included in C.  */
2316
2317 static int
2318 charclass_context (struct dfa const *dfa, charclass const *c)
2319 {
2320   int context = 0;
2321
2322   for (int j = 0; j < CHARCLASS_WORDS; j++)
2323     {
2324       if (c->w[j] & dfa->syntax.newline.w[j])
2325         context |= CTX_NEWLINE;
2326       if (c->w[j] & dfa->syntax.letters.w[j])
2327         context |= CTX_LETTER;
2328       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2329         context |= CTX_NONE;
2330     }
2331
2332   return context;
2333 }
2334
2335 /* Returns the contexts on which the position set S depends.  Each context
2336    in the set of returned contexts (let's call it SC) may have a different
2337    follow set than other contexts in SC, and also different from the
2338    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2339    in the complement set will have the same follow set.  */
2340
2341 static int _GL_ATTRIBUTE_PURE
2342 state_separate_contexts (struct dfa *d, position_set const *s)
2343 {
2344   int separate_contexts = 0;
2345
2346   for (idx_t j = 0; j < s->nelem; j++)
2347     separate_contexts |= d->separates[s->elems[j].index];
2348
2349   return separate_contexts;
2350 }
2351
2352 enum
2353 {
2354   /* Single token is repeated.  It is distinguished from non-repeated.  */
2355   OPT_REPEAT = (1 << 0),
2356
2357   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2358      node is not merged.  */
2359   OPT_LPAREN = (1 << 1),
2360
2361   /* Multiple branches are joined.  The node is not merged.  */
2362   OPT_RPAREN = (1 << 2),
2363
2364   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2365      flag is turned on. */
2366   OPT_WALKED = (1 << 3),
2367
2368   /* The node is queued.  The node is not queued again.  */
2369   OPT_QUEUED = (1 << 4)
2370 };
2371
2372 static void
2373 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2374                  position_set *merged)
2375 {
2376   position_set *follows = d->follows;
2377   idx_t nelem = 0;
2378
2379   d->constraints[tindex] = 0;
2380
2381   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2382     {
2383       idx_t sindex = follows[tindex].elems[i].index;
2384
2385       /* Skip the node as pruned in future.  */
2386       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2387       if (iconstraint == 0)
2388         continue;
2389
2390       if (d->tokens[follows[tindex].elems[i].index] <= END)
2391         {
2392           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2393           continue;
2394         }
2395
2396       if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2397         {
2398           idx_t j;
2399
2400           for (j = 0; j < nelem; j++)
2401             {
2402               idx_t dindex = follows[tindex].elems[j].index;
2403
2404               if (follows[tindex].elems[j].constraint != iconstraint)
2405                 continue;
2406
2407               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2408                 continue;
2409
2410               if (d->tokens[sindex] != d->tokens[dindex])
2411                 continue;
2412
2413               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2414                 continue;
2415
2416               if (flags[sindex] & OPT_REPEAT)
2417                 delete (sindex, &follows[sindex]);
2418
2419               merge2 (&follows[dindex], &follows[sindex], merged);
2420
2421               break;
2422             }
2423
2424           if (j < nelem)
2425             continue;
2426         }
2427
2428       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2429       flags[sindex] |= OPT_QUEUED;
2430     }
2431
2432   follows[tindex].nelem = nelem;
2433 }
2434
2435 static int
2436 compare (const void *a, const void *b)
2437 {
2438   position const *p = a, *q = b;
2439   return p->index < q->index ? -1 : p->index > q->index;
2440 }
2441
2442 static void
2443 reorder_tokens (struct dfa *d)
2444 {
2445   idx_t nleaves;
2446   ptrdiff_t *map;
2447   token *tokens;
2448   position_set *follows;
2449   int *constraints;
2450   char *multibyte_prop;
2451
2452   nleaves = 0;
2453
2454   map = xnmalloc (d->tindex, sizeof *map);
2455
2456   map[0] = nleaves++;
2457
2458   for (idx_t i = 1; i < d->tindex; i++)
2459     map[i] = -1;
2460
2461   tokens = xnmalloc (d->nleaves, sizeof *tokens);
2462   follows = xnmalloc (d->nleaves, sizeof *follows);
2463   constraints = xnmalloc (d->nleaves, sizeof *constraints);
2464
2465   if (d->localeinfo.multibyte)
2466     multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
2467   else
2468     multibyte_prop = NULL;
2469
2470   for (idx_t i = 0; i < d->tindex; i++)
2471     {
2472       if (map[i] == -1)
2473         {
2474           free (d->follows[i].elems);
2475           d->follows[i].elems = NULL;
2476           d->follows[i].nelem = 0;
2477           continue;
2478         }
2479
2480       tokens[map[i]] = d->tokens[i];
2481       follows[map[i]] = d->follows[i];
2482       constraints[map[i]] = d->constraints[i];
2483
2484       if (multibyte_prop != NULL)
2485         multibyte_prop[map[i]] = d->multibyte_prop[i];
2486
2487       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2488         {
2489           if (map[d->follows[i].elems[j].index] == -1)
2490             map[d->follows[i].elems[j].index] = nleaves++;
2491
2492           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2493         }
2494
2495       qsort (d->follows[i].elems, d->follows[i].nelem,
2496              sizeof *d->follows[i].elems, compare);
2497     }
2498
2499   for (idx_t i = 0; i < nleaves; i++)
2500     {
2501       d->tokens[i] = tokens[i];
2502       d->follows[i] = follows[i];
2503       d->constraints[i] = constraints[i];
2504
2505       if (multibyte_prop != NULL)
2506         d->multibyte_prop[i] = multibyte_prop[i];
2507     }
2508
2509   d->tindex = d->nleaves = nleaves;
2510
2511   free (tokens);
2512   free (follows);
2513   free (constraints);
2514   free (multibyte_prop);
2515   free (map);
2516 }
2517
2518 static void
2519 dfaoptimize (struct dfa *d)
2520 {
2521   char *flags = xzalloc (d->tindex);
2522
2523   for (idx_t i = 0; i < d->tindex; i++)
2524     {
2525       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2526         {
2527           if (d->follows[i].elems[j].index == i)
2528             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2529           else if (d->follows[i].elems[j].index < i)
2530             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2531           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2532             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2533           else
2534             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2535         }
2536     }
2537
2538   flags[0] |= OPT_QUEUED;
2539
2540   position_set merged0;
2541   position_set *merged = &merged0;
2542   alloc_position_set (merged, d->nleaves);
2543
2544   d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
2545
2546   for (idx_t i = 0; i < d->tindex; i++)
2547     if (flags[i] & OPT_QUEUED)
2548       merge_nfa_state (d, i, flags, merged);
2549
2550   reorder_tokens (d);
2551
2552   free (merged->elems);
2553   free (flags);
2554 }
2555
2556 /* Perform bottom-up analysis on the parse tree, computing various functions.
2557    Note that at this point, we're pretending constructs like \< are real
2558    characters rather than constraints on what can follow them.
2559
2560    Nullable:  A node is nullable if it is at the root of a regexp that can
2561    match the empty string.
2562    *  EMPTY leaves are nullable.
2563    * No other leaf is nullable.
2564    * A QMARK or STAR node is nullable.
2565    * A PLUS node is nullable if its argument is nullable.
2566    * A CAT node is nullable if both its arguments are nullable.
2567    * An OR node is nullable if either argument is nullable.
2568
2569    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2570    that could correspond to the first character of a string matching the
2571    regexp rooted at the given node.
2572    * EMPTY leaves have empty firstpos.
2573    * The firstpos of a nonempty leaf is that leaf itself.
2574    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2575      argument.
2576    * The firstpos of a CAT node is the firstpos of the left argument, union
2577      the firstpos of the right if the left argument is nullable.
2578    * The firstpos of an OR node is the union of firstpos of each argument.
2579
2580    Lastpos:  The lastpos of a node is the set of positions that could
2581    correspond to the last character of a string matching the regexp at
2582    the given node.
2583    * EMPTY leaves have empty lastpos.
2584    * The lastpos of a nonempty leaf is that leaf itself.
2585    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2586      argument.
2587    * The lastpos of a CAT node is the lastpos of its right argument, union
2588      the lastpos of the left if the right argument is nullable.
2589    * The lastpos of an OR node is the union of the lastpos of each argument.
2590
2591    Follow:  The follow of a position is the set of positions that could
2592    correspond to the character following a character matching the node in
2593    a string matching the regexp.  At this point we consider special symbols
2594    that match the empty string in some context to be just normal characters.
2595    Later, if we find that a special symbol is in a follow set, we will
2596    replace it with the elements of its follow, labeled with an appropriate
2597    constraint.
2598    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2599      the follow of every node in the lastpos.
2600    * Every node in the firstpos of the second argument of a CAT node is in
2601      the follow of every node in the lastpos of the first argument.
2602
2603    Because of the postfix representation of the parse tree, the depth-first
2604    analysis is conveniently done by a linear scan with the aid of a stack.
2605    Sets are stored as arrays of the elements, obeying a stack-like allocation
2606    scheme; the number of elements in each set deeper in the stack can be
2607    used to determine the address of a particular set's array.  */
2608 static void
2609 dfaanalyze (struct dfa *d, bool searchflag)
2610 {
2611   /* Array allocated to hold position sets.  */
2612   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2613   /* Firstpos and lastpos elements.  */
2614   position *firstpos = posalloc;
2615   position *lastpos = firstpos + d->nleaves;
2616   position pos;
2617   position_set tmp;
2618
2619   /* Stack for element counts and nullable flags.  */
2620   struct
2621   {
2622     /* Whether the entry is nullable.  */
2623     bool nullable;
2624
2625     /* Counts of firstpos and lastpos sets.  */
2626     idx_t nfirstpos;
2627     idx_t nlastpos;
2628   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2629
2630   position_set merged;          /* Result of merging sets.  */
2631
2632   addtok (d, CAT);
2633
2634 #ifdef DEBUG
2635   fprintf (stderr, "dfaanalyze:\n");
2636   for (idx_t i = 0; i < d->tindex; i++)
2637     {
2638       fprintf (stderr, " %td:", i);
2639       prtok (d->tokens[i]);
2640     }
2641   putc ('\n', stderr);
2642 #endif
2643
2644   d->searchflag = searchflag;
2645   alloc_position_set (&merged, d->nleaves);
2646   d->follows = xcalloc (d->tindex, sizeof *d->follows);
2647
2648   for (idx_t i = 0; i < d->tindex; i++)
2649     {
2650       switch (d->tokens[i])
2651         {
2652         case EMPTY:
2653           /* The empty set is nullable.  */
2654           stk->nullable = true;
2655
2656           /* The firstpos and lastpos of the empty leaf are both empty.  */
2657           stk->nfirstpos = stk->nlastpos = 0;
2658           stk++;
2659           break;
2660
2661         case STAR:
2662         case PLUS:
2663           /* Every element in the firstpos of the argument is in the follow
2664              of every element in the lastpos.  */
2665           {
2666             tmp.elems = firstpos - stk[-1].nfirstpos;
2667             tmp.nelem = stk[-1].nfirstpos;
2668             position *p = lastpos - stk[-1].nlastpos;
2669             for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2670               {
2671                 merge (&tmp, &d->follows[p[j].index], &merged);
2672                 copy (&merged, &d->follows[p[j].index]);
2673               }
2674           }
2675           FALLTHROUGH;
2676         case QMARK:
2677           /* A QMARK or STAR node is automatically nullable.  */
2678           if (d->tokens[i] != PLUS)
2679             stk[-1].nullable = true;
2680           break;
2681
2682         case CAT:
2683           /* Every element in the firstpos of the second argument is in the
2684              follow of every element in the lastpos of the first argument.  */
2685           {
2686             tmp.nelem = stk[-1].nfirstpos;
2687             tmp.elems = firstpos - stk[-1].nfirstpos;
2688             position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2689             for (idx_t j = 0; j < stk[-2].nlastpos; j++)
2690               {
2691                 merge (&tmp, &d->follows[p[j].index], &merged);
2692                 copy (&merged, &d->follows[p[j].index]);
2693               }
2694           }
2695
2696           /* The firstpos of a CAT node is the firstpos of the first argument,
2697              union that of the second argument if the first is nullable.  */
2698           if (stk[-2].nullable)
2699             stk[-2].nfirstpos += stk[-1].nfirstpos;
2700           else
2701             firstpos -= stk[-1].nfirstpos;
2702
2703           /* The lastpos of a CAT node is the lastpos of the second argument,
2704              union that of the first argument if the second is nullable.  */
2705           if (stk[-1].nullable)
2706             stk[-2].nlastpos += stk[-1].nlastpos;
2707           else
2708             {
2709               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2710               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2711                 p[j] = p[j + stk[-2].nlastpos];
2712               lastpos -= stk[-2].nlastpos;
2713               stk[-2].nlastpos = stk[-1].nlastpos;
2714             }
2715
2716           /* A CAT node is nullable if both arguments are nullable.  */
2717           stk[-2].nullable &= stk[-1].nullable;
2718           stk--;
2719           break;
2720
2721         case OR:
2722           /* The firstpos is the union of the firstpos of each argument.  */
2723           stk[-2].nfirstpos += stk[-1].nfirstpos;
2724
2725           /* The lastpos is the union of the lastpos of each argument.  */
2726           stk[-2].nlastpos += stk[-1].nlastpos;
2727
2728           /* An OR node is nullable if either argument is nullable.  */
2729           stk[-2].nullable |= stk[-1].nullable;
2730           stk--;
2731           break;
2732
2733         default:
2734           /* Anything else is a nonempty position.  (Note that special
2735              constructs like \< are treated as nonempty strings here;
2736              an "epsilon closure" effectively makes them nullable later.
2737              Backreferences have to get a real position so we can detect
2738              transitions on them later.  But they are nullable.  */
2739           stk->nullable = d->tokens[i] == BACKREF;
2740
2741           /* This position is in its own firstpos and lastpos.  */
2742           stk->nfirstpos = stk->nlastpos = 1;
2743           stk++;
2744
2745           firstpos->index = lastpos->index = i;
2746           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2747           firstpos++, lastpos++;
2748
2749           break;
2750         }
2751 #ifdef DEBUG
2752       /* ... balance the above nonsyntactic #ifdef goo...  */
2753       fprintf (stderr, "node %td:", i);
2754       prtok (d->tokens[i]);
2755       putc ('\n', stderr);
2756       fprintf (stderr,
2757                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2758       fprintf (stderr, " firstpos:");
2759       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2760         {
2761           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2762           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2763         }
2764       fprintf (stderr, "\n lastpos:");
2765       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2766         {
2767           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2768           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2769         }
2770       putc ('\n', stderr);
2771 #endif
2772     }
2773
2774   /* For each follow set that is the follow set of a real position, replace
2775      it with its epsilon closure.  */
2776   epsclosure (d);
2777
2778   dfaoptimize (d);
2779
2780 #ifdef DEBUG
2781   for (idx_t i = 0; i < d->tindex; i++)
2782     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2783         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2784         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2785       {
2786         fprintf (stderr, "follows(%td:", i);
2787         prtok (d->tokens[i]);
2788         fprintf (stderr, "):");
2789         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2790           {
2791             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2792             prtok (d->tokens[d->follows[i].elems[j].index]);
2793           }
2794         putc ('\n', stderr);
2795       }
2796 #endif
2797
2798   pos.index = 0;
2799   pos.constraint = NO_CONSTRAINT;
2800
2801   alloc_position_set (&tmp, 1);
2802
2803   append (pos, &tmp);
2804
2805   d->separates = xnmalloc (d->tindex, sizeof *d->separates);
2806
2807   for (idx_t i = 0; i < d->tindex; i++)
2808     {
2809       d->separates[i] = 0;
2810
2811       if (prev_newline_dependent (d->constraints[i]))
2812         d->separates[i] |= CTX_NEWLINE;
2813       if (prev_letter_dependent (d->constraints[i]))
2814         d->separates[i] |= CTX_LETTER;
2815
2816       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2817         {
2818           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2819             d->separates[i] |= CTX_NEWLINE;
2820           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2821             d->separates[i] |= CTX_LETTER;
2822         }
2823     }
2824
2825   /* Context wanted by some position.  */
2826   int separate_contexts = state_separate_contexts (d, &tmp);
2827
2828   /* Build the initial state.  */
2829   if (separate_contexts & CTX_NEWLINE)
2830     state_index (d, &tmp, CTX_NEWLINE);
2831   d->initstate_notbol = d->min_trcount
2832     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2833   if (separate_contexts & CTX_LETTER)
2834     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2835   d->min_trcount++;
2836   d->trcount = 0;
2837
2838   free (posalloc);
2839   free (stkalloc);
2840   free (merged.elems);
2841   free (tmp.elems);
2842 }
2843
2844 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2845 static void
2846 realloc_trans_if_necessary (struct dfa *d)
2847 {
2848   state_num oldalloc = d->tralloc;
2849   if (oldalloc < d->sindex)
2850     {
2851       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2852       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2853       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2854                            -1, sizeof *realtrans);
2855       realtrans[0] = realtrans[1] = NULL;
2856       d->trans = realtrans + 2;
2857       idx_t newalloc = d->tralloc = newalloc1 - 2;
2858       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2859       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2860       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2861       if (d->localeinfo.multibyte)
2862         {
2863           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2864           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2865           if (oldalloc == 0)
2866             realtrans[0] = realtrans[1] = NULL;
2867           d->mb_trans = realtrans + 2;
2868         }
2869       for (; oldalloc < newalloc; oldalloc++)
2870         {
2871           d->trans[oldalloc] = NULL;
2872           d->fails[oldalloc] = NULL;
2873           if (d->localeinfo.multibyte)
2874             d->mb_trans[oldalloc] = NULL;
2875         }
2876     }
2877 }
2878
2879 /*
2880    Calculate the transition table for a new state derived from state s
2881    for a compiled dfa d after input character uc, and return the new
2882    state number.
2883
2884    Do not worry about all possible input characters; calculate just the group
2885    of positions that match uc.  Label it with the set of characters that
2886    every position in the group matches (taking into account, if necessary,
2887    preceding context information of s).  Then find the union
2888    of these positions' follows, i.e., the set of positions of the
2889    new state.  For each character in the group's label, set the transition
2890    on this character to be to a state corresponding to the set's positions,
2891    and its associated backward context information, if necessary.
2892
2893    When building a searching matcher, include the positions of state
2894    0 in every state.
2895
2896    The group is constructed by building an equivalence-class
2897    partition of the positions of s.
2898
2899    For each position, find the set of characters C that it matches.  Eliminate
2900    any characters from C that fail on grounds of backward context.
2901
2902    Check whether the group's label L has nonempty
2903    intersection with C.  If L - C is nonempty, create a new group labeled
2904    L - C and having the same positions as the current group, and set L to
2905    the intersection of L and C.  Insert the position in the group, set
2906    C = C - L, and resume scanning.
2907
2908    If after comparing with every group there are characters remaining in C,
2909    create a new group labeled with the characters of C and insert this
2910    position in that group.  */
2911
2912 static state_num
2913 build_state (state_num s, struct dfa *d, unsigned char uc)
2914 {
2915   position_set follows;         /* Union of the follows for each
2916                                    position of the current state.  */
2917   position_set group;           /* Positions that match the input char.  */
2918   position_set tmp;             /* Temporary space for merging sets.  */
2919   state_num state;              /* New state.  */
2920   state_num state_newline;      /* New state on a newline transition.  */
2921   state_num state_letter;       /* New state on a letter transition.  */
2922
2923 #ifdef DEBUG
2924   fprintf (stderr, "build state %td\n", s);
2925 #endif
2926
2927   /* A pointer to the new transition table, and the table itself.  */
2928   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2929   state_num *trans = *ptrans;
2930
2931   if (!trans)
2932     {
2933       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2934          transition tables that can exist at once, other than for
2935          initial states.  Often-used transition tables are quickly
2936          rebuilt, whereas rarely-used ones are cleared away.  */
2937       if (MAX_TRCOUNT <= d->trcount)
2938         {
2939           for (state_num i = d->min_trcount; i < d->tralloc; i++)
2940             {
2941               free (d->trans[i]);
2942               free (d->fails[i]);
2943               d->trans[i] = d->fails[i] = NULL;
2944             }
2945           d->trcount = 0;
2946         }
2947
2948       d->trcount++;
2949       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2950
2951       /* Fill transition table with a default value which means that the
2952          transited state has not been calculated yet.  */
2953       for (int i = 0; i < NOTCHAR; i++)
2954         trans[i] = -2;
2955     }
2956
2957   /* Set up the success bits for this state.  */
2958   d->success[s] = 0;
2959   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2960     d->success[s] |= CTX_NEWLINE;
2961   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2962     d->success[s] |= CTX_LETTER;
2963   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2964     d->success[s] |= CTX_NONE;
2965
2966   alloc_position_set (&follows, d->nleaves);
2967
2968   /* Find the union of the follows of the positions of the group.
2969      This is a hideously inefficient loop.  Fix it someday.  */
2970   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2971     for (idx_t k = 0;
2972          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2973       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2974               &follows);
2975
2976   /* Positions that match the input char.  */
2977   alloc_position_set (&group, d->nleaves);
2978
2979   /* The group's label.  */
2980   charclass label;
2981   fillset (&label);
2982
2983   for (idx_t i = 0; i < follows.nelem; i++)
2984     {
2985       charclass matches;            /* Set of matching characters.  */
2986       position pos = follows.elems[i];
2987       bool matched = false;
2988       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2989         {
2990           zeroset (&matches);
2991           setbit (d->tokens[pos.index], &matches);
2992           if (d->tokens[pos.index] == uc)
2993             matched = true;
2994         }
2995       else if (d->tokens[pos.index] >= CSET)
2996         {
2997           matches = d->charclasses[d->tokens[pos.index] - CSET];
2998           if (tstbit (uc, &matches))
2999             matched = true;
3000         }
3001       else if (d->tokens[pos.index] == ANYCHAR)
3002         {
3003           matches = d->charclasses[d->canychar];
3004           if (tstbit (uc, &matches))
3005             matched = true;
3006
3007           /* ANYCHAR must match with a single character, so we must put
3008              it to D->states[s].mbps which contains the positions which
3009              can match with a single character not a byte.  If all
3010              positions which has ANYCHAR does not depend on context of
3011              next character, we put the follows instead of it to
3012              D->states[s].mbps to optimize.  */
3013           if (succeeds_in_context (pos.constraint, d->states[s].context,
3014                                    CTX_NONE))
3015             {
3016               if (d->states[s].mbps.nelem == 0)
3017                 alloc_position_set (&d->states[s].mbps, 1);
3018               insert (pos, &d->states[s].mbps);
3019             }
3020         }
3021       else
3022         continue;
3023
3024       /* Some characters may need to be eliminated from matches because
3025          they fail in the current context.  */
3026       if (pos.constraint != NO_CONSTRAINT)
3027         {
3028           if (!succeeds_in_context (pos.constraint,
3029                                     d->states[s].context, CTX_NEWLINE))
3030             for (int j = 0; j < CHARCLASS_WORDS; j++)
3031               matches.w[j] &= ~d->syntax.newline.w[j];
3032           if (!succeeds_in_context (pos.constraint,
3033                                     d->states[s].context, CTX_LETTER))
3034             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3035               matches.w[j] &= ~d->syntax.letters.w[j];
3036           if (!succeeds_in_context (pos.constraint,
3037                                     d->states[s].context, CTX_NONE))
3038             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3039               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3040
3041           /* If there are no characters left, there's no point in going on.  */
3042           if (emptyset (&matches))
3043             continue;
3044
3045           /* If we have reset the bit that made us declare "matched", reset
3046              that indicator, too.  This is required to avoid an infinite loop
3047              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3048           if (!tstbit (uc, &matches))
3049             matched = false;
3050         }
3051
3052 #ifdef DEBUG
3053       fprintf (stderr, " nextpos %td:", pos.index);
3054       prtok (d->tokens[pos.index]);
3055       fprintf (stderr, " of");
3056       for (unsigned j = 0; j < NOTCHAR; j++)
3057         if (tstbit (j, &matches))
3058           fprintf (stderr, " 0x%02x", j);
3059       fprintf (stderr, "\n");
3060 #endif
3061
3062       if (matched)
3063         {
3064           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3065             label.w[k] &= matches.w[k];
3066           append (pos, &group);
3067         }
3068       else
3069         {
3070           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3071             label.w[k] &= ~matches.w[k];
3072         }
3073     }
3074
3075   alloc_position_set (&tmp, d->nleaves);
3076
3077   if (group.nelem > 0)
3078     {
3079       /* If we are building a searching matcher, throw in the positions
3080          of state 0 as well, if possible.  */
3081       if (d->searchflag)
3082         {
3083           /* If a token in follows.elems is not 1st byte of a multibyte
3084              character, or the states of follows must accept the bytes
3085              which are not 1st byte of the multibyte character.
3086              Then, if a state of follows encounters a byte, it must not be
3087              a 1st byte of a multibyte character nor a single byte character.
3088              In this case, do not add state[0].follows to next state, because
3089              state[0] must accept 1st-byte.
3090
3091              For example, suppose <sb a> is a certain single byte character,
3092              <mb A> is a certain multibyte character, and the codepoint of
3093              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3094              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3095              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3096              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3097              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3098              not add state[0].  */
3099
3100           bool mergeit = !d->localeinfo.multibyte;
3101           if (!mergeit)
3102             {
3103               mergeit = true;
3104               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3105                 mergeit &= d->multibyte_prop[group.elems[j].index];
3106             }
3107           if (mergeit)
3108             {
3109               merge (&d->states[0].elems, &group, &tmp);
3110               copy (&tmp, &group);
3111             }
3112         }
3113
3114       /* Find out if the new state will want any context information,
3115          by calculating possible contexts that the group can match,
3116          and separate contexts that the new state wants to know.  */
3117       int possible_contexts = charclass_context (d, &label);
3118       int separate_contexts = state_separate_contexts (d, &group);
3119
3120       /* Find the state(s) corresponding to the union of the follows.  */
3121       if (possible_contexts & ~separate_contexts)
3122         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3123       else
3124         state = -1;
3125       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3126         state_newline = state_index (d, &group, CTX_NEWLINE);
3127       else
3128         state_newline = state;
3129       if (separate_contexts & possible_contexts & CTX_LETTER)
3130         state_letter = state_index (d, &group, CTX_LETTER);
3131       else
3132         state_letter = state;
3133
3134       /* Reallocate now, to reallocate any newline transition properly.  */
3135       realloc_trans_if_necessary (d);
3136     }
3137
3138   /* If we are a searching matcher, the default transition is to a state
3139      containing the positions of state 0, otherwise the default transition
3140      is to fail miserably.  */
3141   else if (d->searchflag)
3142     {
3143       state_newline = 0;
3144       state_letter = d->min_trcount - 1;
3145       state = d->initstate_notbol;
3146     }
3147   else
3148     {
3149       state_newline = -1;
3150       state_letter = -1;
3151       state = -1;
3152     }
3153
3154   /* Set the transitions for each character in the label.  */
3155   for (int i = 0; i < NOTCHAR; i++)
3156     if (tstbit (i, &label))
3157       switch (d->syntax.sbit[i])
3158         {
3159         case CTX_NEWLINE:
3160           trans[i] = state_newline;
3161           break;
3162         case CTX_LETTER:
3163           trans[i] = state_letter;
3164           break;
3165         default:
3166           trans[i] = state;
3167           break;
3168         }
3169
3170 #ifdef DEBUG
3171   fprintf (stderr, "trans table %td", s);
3172   for (int i = 0; i < NOTCHAR; ++i)
3173     {
3174       if (!(i & 0xf))
3175         fprintf (stderr, "\n");
3176       fprintf (stderr, " %2td", trans[i]);
3177     }
3178   fprintf (stderr, "\n");
3179 #endif
3180
3181   free (group.elems);
3182   free (follows.elems);
3183   free (tmp.elems);
3184
3185   /* Keep the newline transition in a special place so we can use it as
3186      a sentinel.  */
3187   if (tstbit (d->syntax.eolbyte, &label))
3188     {
3189       d->newlines[s] = trans[d->syntax.eolbyte];
3190       trans[d->syntax.eolbyte] = -1;
3191     }
3192
3193   return trans[uc];
3194 }
3195
3196 /* Multibyte character handling sub-routines for dfaexec.  */
3197
3198 /* Consume a single byte and transit state from 's' to '*next_state'.
3199    This function is almost same as the state transition routin in dfaexec.
3200    But state transition is done just once, otherwise matching succeed or
3201    reach the end of the buffer.  */
3202 static state_num
3203 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3204 {
3205   state_num *t;
3206
3207   if (d->trans[s])
3208     t = d->trans[s];
3209   else if (d->fails[s])
3210     t = d->fails[s];
3211   else
3212     {
3213       build_state (s, d, **pp);
3214       if (d->trans[s])
3215         t = d->trans[s];
3216       else
3217         {
3218           t = d->fails[s];
3219           assert (t);
3220         }
3221     }
3222
3223   if (t[**pp] == -2)
3224     build_state (s, d, **pp);
3225
3226   return t[*(*pp)++];
3227 }
3228
3229 /* Transit state from s, then return new state and update the pointer of
3230    the buffer.  This function is for a period operator which can match a
3231    multi-byte character.  */
3232 static state_num
3233 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3234                unsigned char const *end)
3235 {
3236   wint_t wc;
3237
3238   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3239
3240   /* This state has some operators which can match a multibyte character.  */
3241   d->mb_follows.nelem = 0;
3242
3243   /* Calculate the state which can be reached from the state 's' by
3244      consuming 'mbclen' single bytes from the buffer.  */
3245   state_num s1 = s;
3246   int mbci;
3247   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3248     s = transit_state_singlebyte (d, s, pp);
3249   *pp += mbclen - mbci;
3250
3251   if (wc == WEOF)
3252     {
3253       /* It is an invalid character, so ANYCHAR is not accepted.  */
3254       return s;
3255     }
3256
3257   /* If all positions which have ANYCHAR do not depend on the context
3258      of the next character, calculate the next state with
3259      pre-calculated follows and cache the result.  */
3260   if (d->states[s1].mb_trindex < 0)
3261     {
3262       if (MAX_TRCOUNT <= d->mb_trcount)
3263         {
3264           state_num s3;
3265           for (s3 = -1; s3 < d->tralloc; s3++)
3266             {
3267               free (d->mb_trans[s3]);
3268               d->mb_trans[s3] = NULL;
3269             }
3270
3271           for (state_num i = 0; i < d->sindex; i++)
3272             d->states[i].mb_trindex = -1;
3273           d->mb_trcount = 0;
3274         }
3275       d->states[s1].mb_trindex = d->mb_trcount++;
3276     }
3277
3278   if (! d->mb_trans[s])
3279     {
3280       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3281       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3282       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3283       for (int i = 0; i < MAX_TRCOUNT; i++)
3284         d->mb_trans[s][i] = -1;
3285     }
3286   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3287     return d->mb_trans[s][d->states[s1].mb_trindex];
3288
3289   if (s == -1)
3290     copy (&d->states[s1].mbps, &d->mb_follows);
3291   else
3292     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3293
3294   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3295   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3296   realloc_trans_if_necessary (d);
3297
3298   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3299
3300   return s2;
3301 }
3302
3303 /* The initial state may encounter a byte which is not a single byte character
3304    nor the first byte of a multibyte character.  But it is incorrect for the
3305    initial state to accept such a byte.  For example, in Shift JIS the regular
3306    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3307    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3308    that are not a single byte character nor the first byte of a multibyte
3309    character.
3310
3311    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3312    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3313    the result is greater than P, set *WCP to the final wide character
3314    processed, or to WEOF if no wide character is processed.  Otherwise,
3315    if WCP is non-NULL, *WCP may or may not be updated.
3316
3317    Both P and MBP must be no larger than END.  */
3318 static unsigned char const *
3319 skip_remains_mb (struct dfa *d, unsigned char const *p,
3320                  unsigned char const *mbp, char const *end)
3321 {
3322   if (d->syntax.never_trail[*p])
3323     return p;
3324   while (mbp < p)
3325     {
3326       wint_t wc;
3327       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3328                            end - (char const *) mbp, d);
3329     }
3330   return mbp;
3331 }
3332
3333 /* Search through a buffer looking for a match to the struct dfa *D.
3334    Find the first occurrence of a string matching the regexp in the
3335    buffer, and the shortest possible version thereof.  Return a pointer to
3336    the first character after the match, or NULL if none is found.  BEGIN
3337    points to the beginning of the buffer, and END points to the first byte
3338    after its end.  Note however that we store a sentinel byte (usually
3339    newline) in *END, so the actual buffer must be one byte longer.
3340    When ALLOW_NL, newlines may appear in the matching string.
3341    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3342    If MULTIBYTE, the input consists of multibyte characters and/or
3343    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3344    Here is the list of features that make this DFA matcher punt:
3345     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3346     - [^...] in non-simple locale
3347     - [[=foo=]] or [[.foo.]]
3348     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3349     - back-reference: (.)\1
3350     - word-delimiter in multibyte locale: \<, \>, \b, \B
3351    See struct localeinfo.simple for the definition of "simple locale".  */
3352
3353 static inline char *
3354 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3355               ptrdiff_t *count, bool multibyte)
3356 {
3357   if (MAX_TRCOUNT <= d->sindex)
3358     {
3359       for (state_num s = d->min_trcount; s < d->sindex; s++)
3360         {
3361           free (d->states[s].elems.elems);
3362           free (d->states[s].mbps.elems);
3363         }
3364       d->sindex = d->min_trcount;
3365
3366       if (d->trans)
3367         {
3368           for (state_num s = 0; s < d->tralloc; s++)
3369             {
3370               free (d->trans[s]);
3371               free (d->fails[s]);
3372               d->trans[s] = d->fails[s] = NULL;
3373             }
3374           d->trcount = 0;
3375         }
3376
3377       if (d->localeinfo.multibyte && d->mb_trans)
3378         {
3379           for (state_num s = -1; s < d->tralloc; s++)
3380             {
3381               free (d->mb_trans[s]);
3382               d->mb_trans[s] = NULL;
3383             }
3384           for (state_num s = 0; s < d->min_trcount; s++)
3385             d->states[s].mb_trindex = -1;
3386           d->mb_trcount = 0;
3387         }
3388     }
3389
3390   if (!d->tralloc)
3391     realloc_trans_if_necessary (d);
3392
3393   /* Current state.  */
3394   state_num s = 0, s1 = 0;
3395
3396   /* Current input character.  */
3397   unsigned char const *p = (unsigned char const *) begin;
3398   unsigned char const *mbp = p;
3399
3400   /* Copy of d->trans so it can be optimized into a register.  */
3401   state_num **trans = d->trans;
3402   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3403   unsigned char saved_end = *(unsigned char *) end;
3404   *end = eol;
3405
3406   if (multibyte)
3407     {
3408       memset (&d->mbs, 0, sizeof d->mbs);
3409       if (d->mb_follows.alloc == 0)
3410         alloc_position_set (&d->mb_follows, d->nleaves);
3411     }
3412
3413   idx_t nlcount = 0;
3414   for (;;)
3415     {
3416       state_num *t;
3417       while ((t = trans[s]) != NULL)
3418         {
3419           if (s < d->min_trcount)
3420             {
3421               if (!multibyte || d->states[s].mbps.nelem == 0)
3422                 {
3423                   while (t[*p] == s)
3424                     p++;
3425                 }
3426               if (multibyte)
3427                 p = mbp = skip_remains_mb (d, p, mbp, end);
3428             }
3429
3430           if (multibyte)
3431             {
3432               s1 = s;
3433
3434               if (d->states[s].mbps.nelem == 0
3435                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3436                 {
3437                   /* If an input character does not match ANYCHAR, do it
3438                      like a single-byte character.  */
3439                   s = t[*p++];
3440                 }
3441               else
3442                 {
3443                   s = transit_state (d, s, &p, (unsigned char *) end);
3444                   mbp = p;
3445                   trans = d->trans;
3446                 }
3447             }
3448           else
3449             {
3450               s1 = t[*p++];
3451               t = trans[s1];
3452               if (! t)
3453                 {
3454                   state_num tmp = s;
3455                   s = s1;
3456                   s1 = tmp;     /* swap */
3457                   break;
3458                 }
3459               if (s < d->min_trcount)
3460                 {
3461                   while (t[*p] == s1)
3462                     p++;
3463                 }
3464               s = t[*p++];
3465             }
3466         }
3467
3468       if (s < 0)
3469         {
3470           if (s == -2)
3471             {
3472               s = build_state (s1, d, p[-1]);
3473               trans = d->trans;
3474             }
3475           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3476             {
3477               /* The previous character was a newline.  Count it, and skip
3478                  checking of multibyte character boundary until here.  */
3479               nlcount++;
3480               mbp = p;
3481
3482               s = (allow_nl ? d->newlines[s1]
3483                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3484                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3485                    : d->initstate_notbol);
3486             }
3487           else
3488             {
3489               p = NULL;
3490               goto done;
3491             }
3492         }
3493       else if (d->fails[s])
3494         {
3495           if ((d->success[s] & d->syntax.sbit[*p])
3496               || ((char *) p == end
3497                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3498                                          d)))
3499             goto done;
3500
3501           if (multibyte && s < d->min_trcount)
3502             p = mbp = skip_remains_mb (d, p, mbp, end);
3503
3504           s1 = s;
3505           if (!multibyte || d->states[s].mbps.nelem == 0
3506               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3507             {
3508               /* If a input character does not match ANYCHAR, do it
3509                  like a single-byte character.  */
3510               s = d->fails[s][*p++];
3511             }
3512           else
3513             {
3514               s = transit_state (d, s, &p, (unsigned char *) end);
3515               mbp = p;
3516               trans = d->trans;
3517             }
3518         }
3519       else
3520         {
3521           build_state (s, d, p[0]);
3522           trans = d->trans;
3523         }
3524     }
3525
3526  done:
3527   if (count)
3528     *count += nlcount;
3529   *end = saved_end;
3530   return (char *) p;
3531 }
3532
3533 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3534    This is for performance, as dfaexec_main is an inline function.  */
3535
3536 static char *
3537 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3538             bool allow_nl, ptrdiff_t *count, bool *backref)
3539 {
3540   return dfaexec_main (d, begin, end, allow_nl, count, true);
3541 }
3542
3543 static char *
3544 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3545             bool allow_nl, ptrdiff_t *count, bool *backref)
3546 {
3547   return dfaexec_main (d, begin, end, allow_nl, count, false);
3548 }
3549
3550 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3551    any regexp that uses a construct not supported by this code.  */
3552 static char *
3553 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3554               bool allow_nl, ptrdiff_t *count, bool *backref)
3555 {
3556   *backref = true;
3557   return (char *) begin;
3558 }
3559
3560 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3561    but faster and set *BACKREF if the DFA code does not support this
3562    regexp usage.  */
3563
3564 char *
3565 dfaexec (struct dfa *d, char const *begin, char *end,
3566          bool allow_nl, ptrdiff_t *count, bool *backref)
3567 {
3568   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3569 }
3570
3571 struct dfa *
3572 dfasuperset (struct dfa const *d)
3573 {
3574   return d->superset;
3575 }
3576
3577 bool
3578 dfaisfast (struct dfa const *d)
3579 {
3580   return d->fast;
3581 }
3582
3583 static void
3584 free_mbdata (struct dfa *d)
3585 {
3586   free (d->multibyte_prop);
3587   free (d->lex.brack.chars);
3588   free (d->mb_follows.elems);
3589
3590   if (d->mb_trans)
3591     {
3592       state_num s;
3593       for (s = -1; s < d->tralloc; s++)
3594         free (d->mb_trans[s]);
3595       free (d->mb_trans - 2);
3596     }
3597 }
3598
3599 /* Return true if every construct in D is supported by this DFA matcher.  */
3600 static bool _GL_ATTRIBUTE_PURE
3601 dfa_supported (struct dfa const *d)
3602 {
3603   for (idx_t i = 0; i < d->tindex; i++)
3604     {
3605       switch (d->tokens[i])
3606         {
3607         case BEGWORD:
3608         case ENDWORD:
3609         case LIMWORD:
3610         case NOTLIMWORD:
3611           if (!d->localeinfo.multibyte)
3612             continue;
3613           FALLTHROUGH;
3614         case BACKREF:
3615         case MBCSET:
3616           return false;
3617         }
3618     }
3619   return true;
3620 }
3621
3622 /* Disable use of the superset DFA if it is not likely to help
3623    performance.  */
3624 static void
3625 maybe_disable_superset_dfa (struct dfa *d)
3626 {
3627   if (!d->localeinfo.using_utf8)
3628     return;
3629
3630   bool have_backref = false;
3631   for (idx_t i = 0; i < d->tindex; i++)
3632     {
3633       switch (d->tokens[i])
3634         {
3635         case ANYCHAR:
3636           /* Lowered.  */
3637           abort ();
3638         case BACKREF:
3639           have_backref = true;
3640           break;
3641         case MBCSET:
3642           /* Requires multi-byte algorithm.  */
3643           return;
3644         default:
3645           break;
3646         }
3647     }
3648
3649   if (!have_backref && d->superset)
3650     {
3651       /* The superset DFA is not likely to be much faster, so remove it.  */
3652       dfafree (d->superset);
3653       free (d->superset);
3654       d->superset = NULL;
3655     }
3656
3657   free_mbdata (d);
3658   d->localeinfo.multibyte = false;
3659   d->dfaexec = dfaexec_sb;
3660   d->fast = true;
3661 }
3662
3663 static void
3664 dfassbuild (struct dfa *d)
3665 {
3666   struct dfa *sup = dfaalloc ();
3667
3668   *sup = *d;
3669   sup->localeinfo.multibyte = false;
3670   sup->dfaexec = dfaexec_sb;
3671   sup->multibyte_prop = NULL;
3672   sup->superset = NULL;
3673   sup->states = NULL;
3674   sup->sindex = 0;
3675   sup->constraints = NULL;
3676   sup->separates = NULL;
3677   sup->follows = NULL;
3678   sup->tralloc = 0;
3679   sup->trans = NULL;
3680   sup->fails = NULL;
3681   sup->success = NULL;
3682   sup->newlines = NULL;
3683
3684   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3685   if (d->cindex)
3686     {
3687       memcpy (sup->charclasses, d->charclasses,
3688               d->cindex * sizeof *sup->charclasses);
3689     }
3690
3691   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3692   sup->talloc = d->tindex * 2;
3693
3694   bool have_achar = false;
3695   bool have_nchar = false;
3696   idx_t j;
3697   for (idx_t i = j = 0; i < d->tindex; i++)
3698     {
3699       switch (d->tokens[i])
3700         {
3701         case ANYCHAR:
3702         case MBCSET:
3703         case BACKREF:
3704           {
3705             charclass ccl;
3706             fillset (&ccl);
3707             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3708             sup->tokens[j++] = STAR;
3709             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3710                 || d->tokens[i + 1] == PLUS)
3711               i++;
3712             have_achar = true;
3713           }
3714           break;
3715         case BEGWORD:
3716         case ENDWORD:
3717         case LIMWORD:
3718         case NOTLIMWORD:
3719           if (d->localeinfo.multibyte)
3720             {
3721               /* These constraints aren't supported in a multibyte locale.
3722                  Ignore them in the superset DFA.  */
3723               sup->tokens[j++] = EMPTY;
3724               break;
3725             }
3726           FALLTHROUGH;
3727         default:
3728           sup->tokens[j++] = d->tokens[i];
3729           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3730               || d->tokens[i] >= CSET)
3731             have_nchar = true;
3732           break;
3733         }
3734     }
3735   sup->tindex = j;
3736
3737   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3738     d->superset = sup;
3739   else
3740     {
3741       dfafree (sup);
3742       free (sup);
3743     }
3744 }
3745
3746 /* Parse a string S of length LEN into D (but skip this step if S is null).
3747    Then analyze D and build a matcher for it.
3748    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3749 void
3750 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3751 {
3752   if (s != NULL)
3753     dfaparse (s, len, d);
3754
3755   dfassbuild (d);
3756
3757   if (dfa_supported (d))
3758     {
3759       maybe_disable_superset_dfa (d);
3760       dfaanalyze (d, searchflag);
3761     }
3762   else
3763     {
3764       d->dfaexec = dfaexec_noop;
3765     }
3766
3767   if (d->superset)
3768     {
3769       d->fast = true;
3770       dfaanalyze (d->superset, searchflag);
3771     }
3772 }
3773
3774 /* Free the storage held by the components of a dfa.  */
3775 void
3776 dfafree (struct dfa *d)
3777 {
3778   free (d->charclasses);
3779   free (d->tokens);
3780
3781   if (d->localeinfo.multibyte)
3782     free_mbdata (d);
3783
3784   free (d->constraints);
3785   free (d->separates);
3786
3787   for (idx_t i = 0; i < d->sindex; i++)
3788     {
3789       free (d->states[i].elems.elems);
3790       free (d->states[i].mbps.elems);
3791     }
3792   free (d->states);
3793
3794   if (d->follows)
3795     {
3796       for (idx_t i = 0; i < d->tindex; i++)
3797         free (d->follows[i].elems);
3798       free (d->follows);
3799     }
3800
3801   if (d->trans)
3802     {
3803       for (idx_t i = 0; i < d->tralloc; i++)
3804         {
3805           free (d->trans[i]);
3806           free (d->fails[i]);
3807         }
3808
3809       free (d->trans - 2);
3810       free (d->fails);
3811       free (d->newlines);
3812       free (d->success);
3813     }
3814
3815   if (d->superset)
3816     {
3817       dfafree (d->superset);
3818       free (d->superset);
3819     }
3820 }
3821
3822 /* Having found the postfix representation of the regular expression,
3823    try to find a long sequence of characters that must appear in any line
3824    containing the r.e.
3825    Finding a "longest" sequence is beyond the scope here;
3826    we take an easy way out and hope for the best.
3827    (Take "(ab|a)b"--please.)
3828
3829    We do a bottom-up calculation of sequences of characters that must appear
3830    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3831    representation:
3832         sequences that must appear at the left of the match ("left")
3833         sequences that must appear at the right of the match ("right")
3834         lists of sequences that must appear somewhere in the match ("in")
3835         sequences that must constitute the match ("is")
3836
3837    When we get to the root of the tree, we use one of the longest of its
3838    calculated "in" sequences as our answer.
3839
3840    The sequences calculated for the various types of node (in pseudo ANSI c)
3841    are shown below.  "p" is the operand of unary operators (and the left-hand
3842    operand of binary operators); "q" is the right-hand operand of binary
3843    operators.
3844
3845    "ZERO" means "a zero-length sequence" below.
3846
3847         Type    left            right           is              in
3848         ----    ----            -----           --              --
3849         char c  # c             # c             # c             # c
3850
3851         ANYCHAR ZERO            ZERO            ZERO            ZERO
3852
3853         MBCSET  ZERO            ZERO            ZERO            ZERO
3854
3855         CSET    ZERO            ZERO            ZERO            ZERO
3856
3857         STAR    ZERO            ZERO            ZERO            ZERO
3858
3859         QMARK   ZERO            ZERO            ZERO            ZERO
3860
3861         PLUS    p->left         p->right        ZERO            p->in
3862
3863         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3864                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3865                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3866                                                 ZERO
3867
3868         OR      longest common  longest common  (do p->is and substrings common
3869                 leading         trailing        to q->is have same p->in and
3870                 (sub)sequence   (sub)sequence   q->in length and content) ?
3871                 of p->left      of p->right
3872                 and q->left     and q->right    p->is : NULL
3873
3874    If there's anything else we recognize in the tree, all four sequences get set
3875    to zero-length sequences.  If there's something we don't recognize in the
3876    tree, we just return a zero-length sequence.
3877
3878    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3879    'aaa')?
3880
3881    And ... is it here or someplace that we might ponder "optimizations" such as
3882         egrep 'psi|epsilon'     ->      egrep 'psi'
3883         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3884                                         (Yes, we now find "epsi" as a "string
3885                                         that must occur", but we might also
3886                                         simplify the *entire* r.e. being sought)
3887         grep '[c]'              ->      grep 'c'
3888         grep '(ab|a)b'          ->      grep 'ab'
3889         grep 'ab*'              ->      grep 'a'
3890         grep 'a*b'              ->      grep 'b'
3891
3892    There are several issues:
3893
3894    Is optimization easy (enough)?
3895
3896    Does optimization actually accomplish anything,
3897    or is the automaton you get from "psi|epsilon" (for example)
3898    the same as the one you get from "psi" (for example)?
3899
3900    Are optimizable r.e.'s likely to be used in real-life situations
3901    (something like 'ab*' is probably unlikely; something like is
3902    'psi|epsilon' is likelier)?  */
3903
3904 static char *
3905 icatalloc (char *old, char const *new)
3906 {
3907   idx_t newsize = strlen (new);
3908   if (newsize == 0)
3909     return old;
3910   idx_t oldsize = strlen (old);
3911   char *result = xrealloc (old, oldsize + newsize + 1);
3912   memcpy (result + oldsize, new, newsize + 1);
3913   return result;
3914 }
3915
3916 static void
3917 freelist (char **cpp)
3918 {
3919   while (*cpp)
3920     free (*cpp++);
3921 }
3922
3923 static char **
3924 enlist (char **cpp, char *new, idx_t len)
3925 {
3926   new = memcpy (xmalloc (len + 1), new, len);
3927   new[len] = '\0';
3928   /* Is there already something in the list that's new (or longer)?  */
3929   idx_t i;
3930   for (i = 0; cpp[i] != NULL; i++)
3931     if (strstr (cpp[i], new) != NULL)
3932       {
3933         free (new);
3934         return cpp;
3935       }
3936   /* Eliminate any obsoleted strings.  */
3937   for (idx_t j = 0; cpp[j] != NULL; )
3938     if (strstr (new, cpp[j]) == NULL)
3939       ++j;
3940     else
3941       {
3942         free (cpp[j]);
3943         if (--i == j)
3944           break;
3945         cpp[j] = cpp[i];
3946         cpp[i] = NULL;
3947       }
3948   /* Add the new string.  */
3949   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3950   cpp[i] = new;
3951   cpp[i + 1] = NULL;
3952   return cpp;
3953 }
3954
3955 /* Given pointers to two strings, return a pointer to an allocated
3956    list of their distinct common substrings.  */
3957 static char **
3958 comsubs (char *left, char const *right)
3959 {
3960   char **cpp = xzalloc (sizeof *cpp);
3961
3962   for (char *lcp = left; *lcp != '\0'; lcp++)
3963     {
3964       idx_t len = 0;
3965       char *rcp = strchr (right, *lcp);
3966       while (rcp != NULL)
3967         {
3968           idx_t i;
3969           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3970             continue;
3971           if (i > len)
3972             len = i;
3973           rcp = strchr (rcp + 1, *lcp);
3974         }
3975       if (len != 0)
3976         cpp = enlist (cpp, lcp, len);
3977     }
3978   return cpp;
3979 }
3980
3981 static char **
3982 addlists (char **old, char **new)
3983 {
3984   for (; *new; new++)
3985     old = enlist (old, *new, strlen (*new));
3986   return old;
3987 }
3988
3989 /* Given two lists of substrings, return a new list giving substrings
3990    common to both.  */
3991 static char **
3992 inboth (char **left, char **right)
3993 {
3994   char **both = xzalloc (sizeof *both);
3995
3996   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
3997     {
3998       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
3999         {
4000           char **temp = comsubs (left[lnum], right[rnum]);
4001           both = addlists (both, temp);
4002           freelist (temp);
4003           free (temp);
4004         }
4005     }
4006   return both;
4007 }
4008
4009 typedef struct must must;
4010
4011 struct must
4012 {
4013   char **in;
4014   char *left;
4015   char *right;
4016   char *is;
4017   bool begline;
4018   bool endline;
4019   must *prev;
4020 };
4021
4022 static must *
4023 allocmust (must *mp, idx_t size)
4024 {
4025   must *new_mp = xmalloc (sizeof *new_mp);
4026   new_mp->in = xzalloc (sizeof *new_mp->in);
4027   new_mp->left = xzalloc (size);
4028   new_mp->right = xzalloc (size);
4029   new_mp->is = xzalloc (size);
4030   new_mp->begline = false;
4031   new_mp->endline = false;
4032   new_mp->prev = mp;
4033   return new_mp;
4034 }
4035
4036 static void
4037 resetmust (must *mp)
4038 {
4039   freelist (mp->in);
4040   mp->in[0] = NULL;
4041   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4042   mp->begline = false;
4043   mp->endline = false;
4044 }
4045
4046 static void
4047 freemust (must *mp)
4048 {
4049   freelist (mp->in);
4050   free (mp->in);
4051   free (mp->left);
4052   free (mp->right);
4053   free (mp->is);
4054   free (mp);
4055 }
4056
4057 struct dfamust *
4058 dfamust (struct dfa const *d)
4059 {
4060   must *mp = NULL;
4061   char const *result = "";
4062   bool exact = false;
4063   bool begline = false;
4064   bool endline = false;
4065   bool need_begline = false;
4066   bool need_endline = false;
4067   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4068
4069   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4070     {
4071       token t = d->tokens[ri];
4072       switch (t)
4073         {
4074         case BEGLINE:
4075           mp = allocmust (mp, 2);
4076           mp->begline = true;
4077           need_begline = true;
4078           break;
4079         case ENDLINE:
4080           mp = allocmust (mp, 2);
4081           mp->endline = true;
4082           need_endline = true;
4083           break;
4084         case LPAREN:
4085         case RPAREN:
4086           assert (!"neither LPAREN nor RPAREN may appear here");
4087
4088         case EMPTY:
4089         case BEGWORD:
4090         case ENDWORD:
4091         case LIMWORD:
4092         case NOTLIMWORD:
4093         case BACKREF:
4094         case ANYCHAR:
4095         case MBCSET:
4096           mp = allocmust (mp, 2);
4097           break;
4098
4099         case STAR:
4100         case QMARK:
4101           resetmust (mp);
4102           break;
4103
4104         case OR:
4105           {
4106             char **new;
4107             must *rmp = mp;
4108             must *lmp = mp = mp->prev;
4109             idx_t j, ln, rn, n;
4110
4111             /* Guaranteed to be.  Unlikely, but ...  */
4112             if (streq (lmp->is, rmp->is))
4113               {
4114                 lmp->begline &= rmp->begline;
4115                 lmp->endline &= rmp->endline;
4116               }
4117             else
4118               {
4119                 lmp->is[0] = '\0';
4120                 lmp->begline = false;
4121                 lmp->endline = false;
4122               }
4123             /* Left side--easy */
4124             idx_t i = 0;
4125             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4126               ++i;
4127             lmp->left[i] = '\0';
4128             /* Right side */
4129             ln = strlen (lmp->right);
4130             rn = strlen (rmp->right);
4131             n = ln;
4132             if (n > rn)
4133               n = rn;
4134             for (i = 0; i < n; ++i)
4135               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4136                 break;
4137             for (j = 0; j < i; ++j)
4138               lmp->right[j] = lmp->right[(ln - i) + j];
4139             lmp->right[j] = '\0';
4140             new = inboth (lmp->in, rmp->in);
4141             freelist (lmp->in);
4142             free (lmp->in);
4143             lmp->in = new;
4144             freemust (rmp);
4145           }
4146           break;
4147
4148         case PLUS:
4149           mp->is[0] = '\0';
4150           break;
4151
4152         case END:
4153           assert (!mp->prev);
4154           for (idx_t i = 0; mp->in[i] != NULL; i++)
4155             if (strlen (mp->in[i]) > strlen (result))
4156               result = mp->in[i];
4157           if (streq (result, mp->is))
4158             {
4159               if ((!need_begline || mp->begline) && (!need_endline
4160                                                      || mp->endline))
4161                 exact = true;
4162               begline = mp->begline;
4163               endline = mp->endline;
4164             }
4165           goto done;
4166
4167         case CAT:
4168           {
4169             must *rmp = mp;
4170             must *lmp = mp = mp->prev;
4171
4172             /* In.  Everything in left, plus everything in
4173                right, plus concatenation of
4174                left's right and right's left.  */
4175             lmp->in = addlists (lmp->in, rmp->in);
4176             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4177               {
4178                 idx_t lrlen = strlen (lmp->right);
4179                 idx_t rllen = strlen (rmp->left);
4180                 char *tp = xmalloc (lrlen + rllen);
4181                 memcpy (tp, lmp->right, lrlen);
4182                 memcpy (tp + lrlen, rmp->left, rllen);
4183                 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4184                 free (tp);
4185               }
4186             /* Left-hand */
4187             if (lmp->is[0] != '\0')
4188               lmp->left = icatalloc (lmp->left, rmp->left);
4189             /* Right-hand */
4190             if (rmp->is[0] == '\0')
4191               lmp->right[0] = '\0';
4192             lmp->right = icatalloc (lmp->right, rmp->right);
4193             /* Guaranteed to be */
4194             if ((lmp->is[0] != '\0' || lmp->begline)
4195                 && (rmp->is[0] != '\0' || rmp->endline))
4196               {
4197                 lmp->is = icatalloc (lmp->is, rmp->is);
4198                 lmp->endline = rmp->endline;
4199               }
4200             else
4201               {
4202                 lmp->is[0] = '\0';
4203                 lmp->begline = false;
4204                 lmp->endline = false;
4205               }
4206             freemust (rmp);
4207           }
4208           break;
4209
4210         case '\0':
4211           /* Not on *my* shift.  */
4212           goto done;
4213
4214         default:
4215           if (CSET <= t)
4216             {
4217               /* If T is a singleton, or if case-folding in a unibyte
4218                  locale and T's members all case-fold to the same char,
4219                  convert T to one of its members.  Otherwise, do
4220                  nothing further with T.  */
4221               charclass *ccl = &d->charclasses[t - CSET];
4222               int j;
4223               for (j = 0; j < NOTCHAR; j++)
4224                 if (tstbit (j, ccl))
4225                   break;
4226               if (! (j < NOTCHAR))
4227                 {
4228                   mp = allocmust (mp, 2);
4229                   break;
4230                 }
4231               t = j;
4232               while (++j < NOTCHAR)
4233                 if (tstbit (j, ccl)
4234                     && ! (case_fold_unibyte
4235                           && toupper (j) == toupper (t)))
4236                   break;
4237               if (j < NOTCHAR)
4238                 {
4239                   mp = allocmust (mp, 2);
4240                   break;
4241                 }
4242             }
4243
4244           idx_t rj = ri + 2;
4245           if (d->tokens[ri + 1] == CAT)
4246             {
4247               for (; rj < d->tindex - 1; rj += 2)
4248                 {
4249                   if ((rj != ri && (d->tokens[rj] <= 0
4250                                     || NOTCHAR <= d->tokens[rj]))
4251                       || d->tokens[rj + 1] != CAT)
4252                     break;
4253                 }
4254             }
4255           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4256           mp->is[0] = mp->left[0] = mp->right[0]
4257             = case_fold_unibyte ? toupper (t) : t;
4258
4259           idx_t i;
4260           for (i = 1; ri + 2 < rj; i++)
4261             {
4262               ri += 2;
4263               t = d->tokens[ri];
4264               mp->is[i] = mp->left[i] = mp->right[i]
4265                 = case_fold_unibyte ? toupper (t) : t;
4266             }
4267           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4268           mp->in = enlist (mp->in, mp->is, i);
4269           break;
4270         }
4271     }
4272  done:;
4273
4274   struct dfamust *dm = NULL;
4275   if (*result)
4276     {
4277       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4278       dm->exact = exact;
4279       dm->begline = begline;
4280       dm->endline = endline;
4281       strcpy (dm->must, result);
4282     }
4283
4284   while (mp)
4285     {
4286       must *prev = mp->prev;
4287       freemust (mp);
4288       mp = prev;
4289     }
4290
4291   return dm;
4292 }
4293
4294 void
4295 dfamustfree (struct dfamust *dm)
4296 {
4297   free (dm);
4298 }
4299
4300 struct dfa *
4301 dfaalloc (void)
4302 {
4303   return xmalloc (sizeof (struct dfa));
4304 }
4305
4306 /* Initialize DFA.  */
4307 void
4308 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4309            reg_syntax_t bits, int dfaopts)
4310 {
4311   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4312   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4313   dfa->localeinfo = *linfo;
4314
4315   dfa->fast = !dfa->localeinfo.multibyte;
4316
4317   dfa->canychar = -1;
4318   dfa->syntax.syntax_bits_set = true;
4319   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4320   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4321   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4322   dfa->syntax.syntax_bits = bits;
4323
4324   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4325     {
4326       unsigned char uc = i;
4327
4328       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4329       switch (dfa->syntax.sbit[uc])
4330         {
4331         case CTX_LETTER:
4332           setbit (uc, &dfa->syntax.letters);
4333           break;
4334         case CTX_NEWLINE:
4335           setbit (uc, &dfa->syntax.newline);
4336           break;
4337         }
4338
4339       /* POSIX requires that the five bytes in "\n\r./" (including the
4340          terminating NUL) cannot occur inside a multibyte character.  */
4341       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4342                                      ? (uc & 0xc0) != 0x80
4343                                      : strchr ("\n\r./", uc) != NULL);
4344     }
4345 }
4346
4347 /* Initialize TO by copying FROM's syntax settings.  */
4348 void
4349 dfacopysyntax (struct dfa *to, struct dfa const *from)
4350 {
4351   memset (to, 0, offsetof (struct dfa, syntax));
4352   to->canychar = -1;
4353   to->fast = from->fast;
4354   to->syntax = from->syntax;
4355   to->dfaexec = from->dfaexec;
4356   to->localeinfo = from->localeinfo;
4357 }
4358
4359 /* vim:set shiftwidth=2: */