1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2015 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
25 /* Whether -w considers WC to be a word constituent. */
29 return wc
== L
'_' || iswalnum (wc
);
32 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
33 a list of strings, at least one of which is known to occur in
34 any string matching the regexp. */
37 /* DFA compiled regexp. */
38 static struct dfa
*dfa
;
40 /* The Regex compiled patterns. */
41 static struct patterns
43 /* Regex compiled regexp. */
44 struct re_pattern_buffer regexbuf
;
45 struct re_registers regs
; /* This is here on account of a BRAIN-DEAD
46 Q@#%!# library interface in regex.c. */
49 static struct patterns
*patterns
;
52 /* Number of compiled fixed strings known to exactly match the regexp.
53 If kwsexec returns < kwset_exact_matches, then we don't need to
54 call the regexp matcher at all. */
55 static size_t kwset_exact_matches
;
60 dfaerror (char const *mesg
)
62 error (EXIT_TROUBLE
, 0, "%s", mesg
);
65 /* Tell static analyzers that this function does not return. */
69 /* For now, the sole dfawarn-eliciting condition (use of a regexp
70 like '[:lower:]') is unequivocally an error, so treat it as such,
73 dfawarn (char const *mesg
)
75 static enum { DW_NONE
= 0, DW_POSIX
, DW_GNU
} mode
;
77 mode
= (getenv ("POSIXLY_CORRECT") ? DW_POSIX
: DW_GNU
);
82 /* If the DFA turns out to have some set of fixed strings one of
83 which must occur in the match, then we build a kwset matcher
84 to find those strings, and thus quickly filter out impossible
89 struct dfamust
*dm
= dfamust (dfa
);
95 /* Prepare a substring whose presence implies a match.
96 The kwset matcher will return the index of the matching
97 string that it chooses. */
98 ++kwset_exact_matches
;
99 size_t old_len
= strlen (dm
->must
);
100 size_t new_len
= old_len
+ dm
->begline
+ dm
->endline
;
101 char *must
= xmalloc (new_len
);
105 begline
|= dm
->begline
;
106 memcpy (mp
, dm
->must
, old_len
);
108 mp
[old_len
] = eolbyte
;
109 kwsincr (kwset
, must
, new_len
);
114 /* Otherwise, filtering with this substring should help reduce the
115 search space, but we'll still have to use the regexp matcher. */
116 kwsincr (kwset
, dm
->must
, strlen (dm
->must
));
123 GEAcompile (char const *pattern
, size_t size
, reg_syntax_t syntax_bits
)
129 syntax_bits
|= RE_ICASE
;
130 re_set_syntax (syntax_bits
);
131 dfasyntax (syntax_bits
, match_icase
, eolbyte
);
133 /* For GNU regex, pass the patterns separately to detect errors like
134 "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
135 this should be a syntax error. The same for backref, where the
136 backref should be local to each pattern. */
137 char const *p
= pattern
;
141 char const *sep
= memchr (p
, '\n', total
);
154 patterns
= xnrealloc (patterns
, pcount
+ 1, sizeof *patterns
);
155 patterns
[pcount
] = patterns0
;
157 char const *err
= re_compile_pattern (p
, len
,
158 &(patterns
[pcount
].regexbuf
));
160 error (EXIT_TROUBLE
, 0, "%s", err
);
166 /* In the match_words and match_lines cases, we use a different pattern
167 for the DFA matcher that will quickly throw out cases that won't work.
168 Then if DFA succeeds we do some hairy stuff using the regex matcher
169 to decide whether the match should really count. */
170 if (match_words
|| match_lines
)
172 static char const line_beg_no_bk
[] = "^(";
173 static char const line_end_no_bk
[] = ")$";
174 static char const word_beg_no_bk
[] = "(^|[^[:alnum:]_])(";
175 static char const word_end_no_bk
[] = ")([^[:alnum:]_]|$)";
176 static char const line_beg_bk
[] = "^\\(";
177 static char const line_end_bk
[] = "\\)$";
178 static char const word_beg_bk
[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
179 static char const word_end_bk
[] = "\\)\\([^[:alnum:]_]\\|$\\)";
180 int bk
= !(syntax_bits
& RE_NO_BK_PARENS
);
181 char *n
= xmalloc (sizeof word_beg_bk
- 1 + size
+ sizeof word_end_bk
);
183 strcpy (n
, match_lines
? (bk
? line_beg_bk
: line_beg_no_bk
)
184 : (bk
? word_beg_bk
: word_beg_no_bk
));
186 memcpy (n
+ total
, pattern
, size
);
188 strcpy (n
+ total
, match_lines
? (bk
? line_end_bk
: line_end_no_bk
)
189 : (bk
? word_end_bk
: word_end_no_bk
));
190 total
+= strlen (n
+ total
);
198 dfacomp (pattern
, size
, dfa
, 1);
205 EGexecute (char const *buf
, size_t size
, size_t *match_size
,
206 char const *start_ptr
)
208 char const *buflim
, *beg
, *end
, *ptr
, *match
, *best_match
, *mb_start
;
211 size_t len
, best_len
;
212 struct kwsmatch kwsm
;
214 struct dfa
*superset
= dfasuperset (dfa
);
215 bool dfafast
= dfaisfast (dfa
);
220 for (beg
= end
= buf
; end
< buflim
; beg
= end
)
226 char const *next_beg
, *dfa_beg
= beg
;
228 bool exact_kwset_match
= false;
231 /* Try matching with KWset, if it's defined. */
234 char const *prev_beg
;
236 /* Find a possible match using the KWset matcher. */
237 size_t offset
= kwsexec (kwset
, beg
- begline
,
238 buflim
- beg
+ begline
, &kwsm
);
239 if (offset
== (size_t) -1)
241 match
= beg
+ offset
;
244 /* Narrow down to the line containing the possible match. */
245 beg
= memrchr (buf
, eol
, match
- buf
);
246 beg
= beg
? beg
+ 1 : buf
;
249 /* Determine the end pointer to give the DFA next. Typically
250 this is after the first newline after MATCH; but if the KWset
251 match is not exact, the DFA is fast, and the offset from
252 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
253 greater of the latter two values; this temporarily prefers
255 exact_kwset_match
= kwsm
.index
< kwset_exact_matches
;
256 end
= ((exact_kwset_match
|| !dfafast
257 || MAX (16, match
- beg
) < (match
- prev_beg
) >> 2)
259 : MAX (16, match
- beg
) < (buflim
- prev_beg
) >> 2
260 ? prev_beg
+ 4 * MAX (16, match
- beg
)
262 end
= memchr (end
, eol
, buflim
- end
);
263 end
= end
? end
+ 1 : buflim
;
265 if (exact_kwset_match
)
267 if (MB_CUR_MAX
== 1 || using_utf8 ())
271 if (mb_goback (&mb_start
, match
, buflim
) == 0)
273 /* The matched line starts in the middle of a multibyte
274 character. Perform the DFA search starting from the
275 beginning of the next character. */
280 /* Try matching with the superset of DFA, if it's defined. */
281 if (superset
&& !exact_kwset_match
)
283 /* Keep using the superset while it reports multiline
284 potential matches; this is more likely to be fast
285 than falling back to KWset would be. */
286 while ((next_beg
= dfaexec (superset
, dfa_beg
, (char *) end
, 1,
291 /* Try to match in just one line. */
293 beg
= memrchr (buf
, eol
, next_beg
- buf
);
297 if (next_beg
== NULL
|| next_beg
== end
)
300 /* Narrow down to the line we've found. */
301 end
= memchr (next_beg
, eol
, buflim
- next_beg
);
302 end
= end
? end
+ 1 : buflim
;
305 /* Try matching with DFA. */
306 next_beg
= dfaexec (dfa
, dfa_beg
, (char *) end
, 0, &count
, &backref
);
308 /* If there's no match, or if we've matched the sentinel,
310 if (next_beg
== NULL
|| next_beg
== end
)
313 /* Narrow down to the line we've found. */
316 beg
= memrchr (buf
, eol
, next_beg
- buf
);
319 end
= memchr (next_beg
, eol
, buflim
- next_beg
);
320 end
= end
? end
+ 1 : buflim
;
322 /* Successful, no backreferences encountered! */
329 /* We are looking for the leftmost (then longest) exact match.
330 We will go through the outer loop only once. */
334 /* If the "line" is longer than the maximum regexp offset,
335 die as if we've run out of memory. */
336 if (TYPE_MAXIMUM (regoff_t
) < end
- beg
- 1)
339 /* Run the possible match through Regex. */
342 for (i
= 0; i
< pcount
; i
++)
344 patterns
[i
].regexbuf
.not_eol
= 0;
345 start
= re_search (&(patterns
[i
].regexbuf
),
347 ptr
- beg
, end
- ptr
- 1,
348 &(patterns
[i
].regs
));
353 len
= patterns
[i
].regs
.end
[0] - start
;
355 if (match
> best_match
)
357 if (start_ptr
&& !match_words
)
358 goto assess_pattern_match
;
359 if ((!match_lines
&& !match_words
)
360 || (match_lines
&& len
== end
- ptr
- 1))
364 goto assess_pattern_match
;
366 /* If -w, check if the match aligns with word boundaries.
367 We do this iteratively because:
368 (a) the line may contain more than one occurrence of the
370 (b) Several alternatives in the pattern might be valid at a
371 given point, and we may need to consider a shorter one to
372 find a word boundary. */
374 while (match
<= best_match
)
376 regoff_t shorter_len
= 0;
377 if (!wordchar (mb_prev_wc (beg
, match
, end
- 1))
378 && !wordchar (mb_next_wc (match
+ len
, end
- 1)))
379 goto assess_pattern_match
;
382 /* Try a shorter length anchored at the same place. */
384 patterns
[i
].regexbuf
.not_eol
= 1;
385 shorter_len
= re_match (&(patterns
[i
].regexbuf
),
386 beg
, match
+ len
- ptr
,
388 &(patterns
[i
].regs
));
389 if (shorter_len
< -1)
396 /* Try looking further on. */
397 if (match
== end
- 1)
400 patterns
[i
].regexbuf
.not_eol
= 0;
401 start
= re_search (&(patterns
[i
].regexbuf
),
403 match
- beg
, end
- match
- 1,
404 &(patterns
[i
].regs
));
411 len
= patterns
[i
].regs
.end
[0] - start
;
414 } /* while (match <= best_match) */
416 assess_pattern_match
:
419 /* Good enough for a non-exact match.
420 No need to look at further patterns, if any. */
423 if (match
< best_match
|| (match
== best_match
&& len
> best_len
))
425 /* Best exact match: leftmost, then longest. */
429 } /* if re_search >= 0 */
430 } /* for Regex patterns. */
431 if (best_match
< end
)
433 /* We have found an exact match. We were just
434 waiting for the best one (leftmost then longest). */
439 } /* for (beg = end ..) */
447 size_t off
= beg
- buf
;