drm/i915: Update to Linux 4.6
[dragonfly.git] / contrib / grep / src / dfasearch.c
blobde513213cdcbb31a13bc62d727eb2b7125f83007
1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2015 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
19 /* Written August 1992 by Mike Haertel. */
21 #include <config.h>
22 #include "intprops.h"
23 #include "search.h"
25 /* Whether -w considers WC to be a word constituent. */
26 static bool
27 wordchar (wint_t wc)
29 return wc == L'_' || iswalnum (wc);
32 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
33 a list of strings, at least one of which is known to occur in
34 any string matching the regexp. */
35 static kwset_t kwset;
37 /* DFA compiled regexp. */
38 static struct dfa *dfa;
40 /* The Regex compiled patterns. */
41 static struct patterns
43 /* Regex compiled regexp. */
44 struct re_pattern_buffer regexbuf;
45 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
46 Q@#%!# library interface in regex.c. */
47 } patterns0;
49 static struct patterns *patterns;
50 static size_t pcount;
52 /* Number of compiled fixed strings known to exactly match the regexp.
53 If kwsexec returns < kwset_exact_matches, then we don't need to
54 call the regexp matcher at all. */
55 static size_t kwset_exact_matches;
57 static bool begline;
59 void
60 dfaerror (char const *mesg)
62 error (EXIT_TROUBLE, 0, "%s", mesg);
64 /* notreached */
65 /* Tell static analyzers that this function does not return. */
66 abort ();
69 /* For now, the sole dfawarn-eliciting condition (use of a regexp
70 like '[:lower:]') is unequivocally an error, so treat it as such,
71 when possible. */
72 void
73 dfawarn (char const *mesg)
75 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode;
76 if (mode == DW_NONE)
77 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU);
78 if (mode == DW_GNU)
79 dfaerror (mesg);
82 /* If the DFA turns out to have some set of fixed strings one of
83 which must occur in the match, then we build a kwset matcher
84 to find those strings, and thus quickly filter out impossible
85 matches. */
86 static void
87 kwsmusts (void)
89 struct dfamust *dm = dfamust (dfa);
90 if (!dm)
91 return;
92 kwsinit (&kwset);
93 if (dm->exact)
95 /* Prepare a substring whose presence implies a match.
96 The kwset matcher will return the index of the matching
97 string that it chooses. */
98 ++kwset_exact_matches;
99 size_t old_len = strlen (dm->must);
100 size_t new_len = old_len + dm->begline + dm->endline;
101 char *must = xmalloc (new_len);
102 char *mp = must;
103 *mp = eolbyte;
104 mp += dm->begline;
105 begline |= dm->begline;
106 memcpy (mp, dm->must, old_len);
107 if (dm->endline)
108 mp[old_len] = eolbyte;
109 kwsincr (kwset, must, new_len);
110 free (must);
112 else
114 /* Otherwise, filtering with this substring should help reduce the
115 search space, but we'll still have to use the regexp matcher. */
116 kwsincr (kwset, dm->must, strlen (dm->must));
118 kwsprep (kwset);
119 dfamustfree (dm);
122 void
123 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
125 size_t total = size;
126 char *motif;
128 if (match_icase)
129 syntax_bits |= RE_ICASE;
130 re_set_syntax (syntax_bits);
131 dfasyntax (syntax_bits, match_icase, eolbyte);
133 /* For GNU regex, pass the patterns separately to detect errors like
134 "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
135 this should be a syntax error. The same for backref, where the
136 backref should be local to each pattern. */
137 char const *p = pattern;
140 size_t len;
141 char const *sep = memchr (p, '\n', total);
142 if (sep)
144 len = sep - p;
145 sep++;
146 total -= (len + 1);
148 else
150 len = total;
151 total = 0;
154 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
155 patterns[pcount] = patterns0;
157 char const *err = re_compile_pattern (p, len,
158 &(patterns[pcount].regexbuf));
159 if (err)
160 error (EXIT_TROUBLE, 0, "%s", err);
161 pcount++;
162 p = sep;
164 while (p);
166 /* In the match_words and match_lines cases, we use a different pattern
167 for the DFA matcher that will quickly throw out cases that won't work.
168 Then if DFA succeeds we do some hairy stuff using the regex matcher
169 to decide whether the match should really count. */
170 if (match_words || match_lines)
172 static char const line_beg_no_bk[] = "^(";
173 static char const line_end_no_bk[] = ")$";
174 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
175 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
176 static char const line_beg_bk[] = "^\\(";
177 static char const line_end_bk[] = "\\)$";
178 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
179 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
180 int bk = !(syntax_bits & RE_NO_BK_PARENS);
181 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
183 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
184 : (bk ? word_beg_bk : word_beg_no_bk));
185 total = strlen(n);
186 memcpy (n + total, pattern, size);
187 total += size;
188 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
189 : (bk ? word_end_bk : word_end_no_bk));
190 total += strlen (n + total);
191 pattern = motif = n;
192 size = total;
194 else
195 motif = NULL;
197 dfa = dfaalloc ();
198 dfacomp (pattern, size, dfa, 1);
199 kwsmusts ();
201 free(motif);
204 size_t
205 EGexecute (char const *buf, size_t size, size_t *match_size,
206 char const *start_ptr)
208 char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
209 char eol = eolbyte;
210 regoff_t start;
211 size_t len, best_len;
212 struct kwsmatch kwsm;
213 size_t i;
214 struct dfa *superset = dfasuperset (dfa);
215 bool dfafast = dfaisfast (dfa);
217 mb_start = buf;
218 buflim = buf + size;
220 for (beg = end = buf; end < buflim; beg = end)
222 end = buflim;
224 if (!start_ptr)
226 char const *next_beg, *dfa_beg = beg;
227 size_t count = 0;
228 bool exact_kwset_match = false;
229 int backref = 0;
231 /* Try matching with KWset, if it's defined. */
232 if (kwset)
234 char const *prev_beg;
236 /* Find a possible match using the KWset matcher. */
237 size_t offset = kwsexec (kwset, beg - begline,
238 buflim - beg + begline, &kwsm);
239 if (offset == (size_t) -1)
240 goto failure;
241 match = beg + offset;
242 prev_beg = beg;
244 /* Narrow down to the line containing the possible match. */
245 beg = memrchr (buf, eol, match - buf);
246 beg = beg ? beg + 1 : buf;
247 dfa_beg = beg;
249 /* Determine the end pointer to give the DFA next. Typically
250 this is after the first newline after MATCH; but if the KWset
251 match is not exact, the DFA is fast, and the offset from
252 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
253 greater of the latter two values; this temporarily prefers
254 the DFA to KWset. */
255 exact_kwset_match = kwsm.index < kwset_exact_matches;
256 end = ((exact_kwset_match || !dfafast
257 || MAX (16, match - beg) < (match - prev_beg) >> 2)
258 ? match
259 : MAX (16, match - beg) < (buflim - prev_beg) >> 2
260 ? prev_beg + 4 * MAX (16, match - beg)
261 : buflim);
262 end = memchr (end, eol, buflim - end);
263 end = end ? end + 1 : buflim;
265 if (exact_kwset_match)
267 if (MB_CUR_MAX == 1 || using_utf8 ())
268 goto success;
269 if (mb_start < beg)
270 mb_start = beg;
271 if (mb_goback (&mb_start, match, buflim) == 0)
272 goto success;
273 /* The matched line starts in the middle of a multibyte
274 character. Perform the DFA search starting from the
275 beginning of the next character. */
276 dfa_beg = mb_start;
280 /* Try matching with the superset of DFA, if it's defined. */
281 if (superset && !exact_kwset_match)
283 /* Keep using the superset while it reports multiline
284 potential matches; this is more likely to be fast
285 than falling back to KWset would be. */
286 while ((next_beg = dfaexec (superset, dfa_beg, (char *) end, 1,
287 &count, NULL))
288 && next_beg != end
289 && count != 0)
291 /* Try to match in just one line. */
292 count = 0;
293 beg = memrchr (buf, eol, next_beg - buf);
294 beg++;
295 dfa_beg = beg;
297 if (next_beg == NULL || next_beg == end)
298 continue;
300 /* Narrow down to the line we've found. */
301 end = memchr (next_beg, eol, buflim - next_beg);
302 end = end ? end + 1 : buflim;
305 /* Try matching with DFA. */
306 next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref);
308 /* If there's no match, or if we've matched the sentinel,
309 we're done. */
310 if (next_beg == NULL || next_beg == end)
311 continue;
313 /* Narrow down to the line we've found. */
314 if (count != 0)
316 beg = memrchr (buf, eol, next_beg - buf);
317 beg++;
319 end = memchr (next_beg, eol, buflim - next_beg);
320 end = end ? end + 1 : buflim;
322 /* Successful, no backreferences encountered! */
323 if (!backref)
324 goto success;
325 ptr = beg;
327 else
329 /* We are looking for the leftmost (then longest) exact match.
330 We will go through the outer loop only once. */
331 ptr = start_ptr;
334 /* If the "line" is longer than the maximum regexp offset,
335 die as if we've run out of memory. */
336 if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
337 xalloc_die ();
339 /* Run the possible match through Regex. */
340 best_match = end;
341 best_len = 0;
342 for (i = 0; i < pcount; i++)
344 patterns[i].regexbuf.not_eol = 0;
345 start = re_search (&(patterns[i].regexbuf),
346 beg, end - beg - 1,
347 ptr - beg, end - ptr - 1,
348 &(patterns[i].regs));
349 if (start < -1)
350 xalloc_die ();
351 else if (0 <= start)
353 len = patterns[i].regs.end[0] - start;
354 match = beg + start;
355 if (match > best_match)
356 continue;
357 if (start_ptr && !match_words)
358 goto assess_pattern_match;
359 if ((!match_lines && !match_words)
360 || (match_lines && len == end - ptr - 1))
362 match = ptr;
363 len = end - ptr;
364 goto assess_pattern_match;
366 /* If -w, check if the match aligns with word boundaries.
367 We do this iteratively because:
368 (a) the line may contain more than one occurrence of the
369 pattern, and
370 (b) Several alternatives in the pattern might be valid at a
371 given point, and we may need to consider a shorter one to
372 find a word boundary. */
373 if (match_words)
374 while (match <= best_match)
376 regoff_t shorter_len = 0;
377 if (!wordchar (mb_prev_wc (beg, match, end - 1))
378 && !wordchar (mb_next_wc (match + len, end - 1)))
379 goto assess_pattern_match;
380 if (len > 0)
382 /* Try a shorter length anchored at the same place. */
383 --len;
384 patterns[i].regexbuf.not_eol = 1;
385 shorter_len = re_match (&(patterns[i].regexbuf),
386 beg, match + len - ptr,
387 match - beg,
388 &(patterns[i].regs));
389 if (shorter_len < -1)
390 xalloc_die ();
392 if (0 < shorter_len)
393 len = shorter_len;
394 else
396 /* Try looking further on. */
397 if (match == end - 1)
398 break;
399 match++;
400 patterns[i].regexbuf.not_eol = 0;
401 start = re_search (&(patterns[i].regexbuf),
402 beg, end - beg - 1,
403 match - beg, end - match - 1,
404 &(patterns[i].regs));
405 if (start < 0)
407 if (start < -1)
408 xalloc_die ();
409 break;
411 len = patterns[i].regs.end[0] - start;
412 match = beg + start;
414 } /* while (match <= best_match) */
415 continue;
416 assess_pattern_match:
417 if (!start_ptr)
419 /* Good enough for a non-exact match.
420 No need to look at further patterns, if any. */
421 goto success;
423 if (match < best_match || (match == best_match && len > best_len))
425 /* Best exact match: leftmost, then longest. */
426 best_match = match;
427 best_len = len;
429 } /* if re_search >= 0 */
430 } /* for Regex patterns. */
431 if (best_match < end)
433 /* We have found an exact match. We were just
434 waiting for the best one (leftmost then longest). */
435 beg = best_match;
436 len = best_len;
437 goto success_in_len;
439 } /* for (beg = end ..) */
441 failure:
442 return -1;
444 success:
445 len = end - beg;
446 success_in_len:;
447 size_t off = beg - buf;
448 *match_size = len;
449 return off;