From FreeBSD:
[dragonfly/vkernel-mp.git] / gnu / usr.bin / grep / search.c
blobb53d209406a0ba26d77fb60e827821d677c40aa2
1 /* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright (C) 1992, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 02111-1307, USA. */
19 /* Written August 1992 by Mike Haertel. */
21 /* $FreeBSD: src/gnu/usr.bin/grep/search.c,v 1.10 2000/01/31 13:28:57 ru Exp $ */
22 /* $DragonFly: src/gnu/usr.bin/grep/search.c,v 1.3 2004/02/03 20:22:57 dillon Exp $ */
24 #ifdef HAVE_CONFIG_H
25 # include <config.h>
26 #endif
27 #include <sys/types.h>
28 #include "system.h"
29 #include "grep.h"
30 #include <gnuregex.h>
31 #include "dfa.h"
32 #include "kwset.h"
34 #define NCHAR (UCHAR_MAX + 1)
36 static void Gcompile PARAMS((char *, size_t));
37 static void Ecompile PARAMS((char *, size_t));
38 static char *EGexecute PARAMS((char *, size_t, char **));
39 static void Fcompile PARAMS((char *, size_t));
40 static char *Fexecute PARAMS((char *, size_t, char **));
41 static void kwsinit PARAMS((void));
43 /* Here is the matchers vector for the main program. */
44 struct matcher matchers[] = {
45 { "default", Gcompile, EGexecute },
46 { "grep", Gcompile, EGexecute },
47 { "egrep", Ecompile, EGexecute },
48 { "awk", Ecompile, EGexecute },
49 { "fgrep", Fcompile, Fexecute },
50 { 0, 0, 0 },
53 /* For -w, we also consider _ to be word constituent. */
54 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
56 /* DFA compiled regexp. */
57 static struct dfa dfa;
59 /* Regex compiled regexp. */
60 static struct re_pattern_buffer regexbuf;
62 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
63 a list of strings, at least one of which is known to occur in
64 any string matching the regexp. */
65 static kwset_t kwset;
67 /* Last compiled fixed string known to exactly match the regexp.
68 If kwsexec() returns < lastexact, then we don't need to
69 call the regexp matcher at all. */
70 static int lastexact;
72 void
73 dfaerror (char const *mesg)
75 fatal(mesg, 0);
78 static void
79 kwsinit (void)
81 static char trans[NCHAR];
82 int i;
84 if (match_icase)
85 for (i = 0; i < NCHAR; ++i)
86 trans[i] = TOLOWER(i);
88 if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
89 fatal("memory exhausted", 0);
92 /* If the DFA turns out to have some set of fixed strings one of
93 which must occur in the match, then we build a kwset matcher
94 to find those strings, and thus quickly filter out impossible
95 matches. */
96 static void
97 kwsmusts (void)
99 struct dfamust *dm;
100 char *err;
102 if (dfa.musts)
104 kwsinit();
105 /* First, we compile in the substrings known to be exact
106 matches. The kwset matcher will return the index
107 of the matching string that it chooses. */
108 for (dm = dfa.musts; dm; dm = dm->next)
110 if (!dm->exact)
111 continue;
112 ++lastexact;
113 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
114 fatal(err, 0);
116 /* Now, we compile the substrings that will require
117 the use of the regexp matcher. */
118 for (dm = dfa.musts; dm; dm = dm->next)
120 if (dm->exact)
121 continue;
122 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
123 fatal(err, 0);
125 if ((err = kwsprep(kwset)) != 0)
126 fatal(err, 0);
130 static void
131 Gcompile (char *pattern, size_t size)
133 const char *err;
135 re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
136 dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
138 if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
139 fatal(err, 0);
141 /* In the match_words and match_lines cases, we use a different pattern
142 for the DFA matcher that will quickly throw out cases that won't work.
143 Then if DFA succeeds we do some hairy stuff using the regex matcher
144 to decide whether the match should really count. */
145 if (match_words || match_lines)
147 /* In the whole-word case, we use the pattern:
148 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
149 In the whole-line case, we use the pattern:
150 ^(userpattern)$.
151 BUG: Using [A-Za-z_] is locale-dependent!
152 So will use [:alnum:] */
154 char *n = malloc(size + 50);
155 int i = 0;
157 strcpy(n, "");
159 if (match_lines)
160 strcpy(n, "^\\(");
161 if (match_words)
162 strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\(");
164 i = strlen(n);
165 memcpy(n + i, pattern, size);
166 i += size;
168 if (match_words)
169 strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)");
170 if (match_lines)
171 strcpy(n + i, "\\)$");
173 i += strlen(n + i);
174 dfacomp(n, i, &dfa, 1);
176 else
177 dfacomp(pattern, size, &dfa, 1);
179 kwsmusts();
182 static void
183 Ecompile (char *pattern, size_t size)
185 const char *err;
187 if (strcmp(matcher, "awk") == 0)
189 re_set_syntax(RE_SYNTAX_AWK);
190 dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte);
192 else
194 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
195 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
198 if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
199 fatal(err, 0);
201 /* In the match_words and match_lines cases, we use a different pattern
202 for the DFA matcher that will quickly throw out cases that won't work.
203 Then if DFA succeeds we do some hairy stuff using the regex matcher
204 to decide whether the match should really count. */
205 if (match_words || match_lines)
207 /* In the whole-word case, we use the pattern:
208 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
209 In the whole-line case, we use the pattern:
210 ^(userpattern)$.
211 BUG: Using [A-Za-z_] is locale-dependent!
212 so will use the char class */
214 char *n = malloc(size + 50);
215 int i = 0;
217 strcpy(n, "");
219 if (match_lines)
220 strcpy(n, "^(");
221 if (match_words)
222 strcpy(n, "(^|[^[:alnum:]_])(");
224 i = strlen(n);
225 memcpy(n + i, pattern, size);
226 i += size;
228 if (match_words)
229 strcpy(n + i, ")([^[:alnum:]_]|$)");
230 if (match_lines)
231 strcpy(n + i, ")$");
233 i += strlen(n + i);
234 dfacomp(n, i, &dfa, 1);
236 else
237 dfacomp(pattern, size, &dfa, 1);
239 kwsmusts();
242 static char *
243 EGexecute (char *buf, size_t size, char **endp)
245 register char *buflim, *beg, *end, save;
246 char eol = eolbyte;
247 int backref, start, len;
248 struct kwsmatch kwsm;
249 static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
250 Q@#%!# library interface in regex.c. */
252 buflim = buf + size;
254 for (beg = end = buf; end < buflim; beg = end + 1)
256 if (kwset)
258 /* Find a possible match using the KWset matcher. */
259 beg = kwsexec(kwset, beg, buflim - beg, &kwsm);
260 if (!beg)
261 goto failure;
262 /* Narrow down to the line containing the candidate, and
263 run it through DFA. */
264 end = memchr(beg, eol, buflim - beg);
265 if (!end)
266 end = buflim;
267 while (beg > buf && beg[-1] != eol)
268 --beg;
269 save = *end;
270 if (kwsm.index < lastexact)
271 goto success;
272 if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref))
274 *end = save;
275 continue;
277 *end = save;
278 /* Successful, no backreferences encountered. */
279 if (!backref)
280 goto success;
282 else
284 /* No good fixed strings; start with DFA. */
285 save = *buflim;
286 beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref);
287 *buflim = save;
288 if (!beg)
289 goto failure;
290 /* Narrow down to the line we've found. */
291 end = memchr(beg, eol, buflim - beg);
292 if (!end)
293 end = buflim;
294 while (beg > buf && beg[-1] != eol)
295 --beg;
296 /* Successful, no backreferences encountered! */
297 if (!backref)
298 goto success;
300 /* If we've made it to this point, this means DFA has seen
301 a probable match, and we need to run it through Regex. */
302 regexbuf.not_eol = 0;
303 if ((start = re_search(&regexbuf, beg, end - beg, 0, end - beg, &regs)) >= 0)
305 len = regs.end[0] - start;
306 if ((!match_lines && !match_words)
307 || (match_lines && len == end - beg))
308 goto success;
309 /* If -w, check if the match aligns with word boundaries.
310 We do this iteratively because:
311 (a) the line may contain more than one occurence of the pattern, and
312 (b) Several alternatives in the pattern might be valid at a given
313 point, and we may need to consider a shorter one to find a word
314 boundary. */
315 if (match_words)
316 while (start >= 0)
318 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
319 && (len == end - beg
320 || !WCHAR ((unsigned char) beg[start + len])))
321 goto success;
322 if (len > 0)
324 /* Try a shorter length anchored at the same place. */
325 --len;
326 regexbuf.not_eol = 1;
327 len = re_match(&regexbuf, beg, start + len, start, &regs);
329 if (len <= 0)
331 /* Try looking further on. */
332 if (start == end - beg)
333 break;
334 ++start;
335 regexbuf.not_eol = 0;
336 start = re_search(&regexbuf, beg, end - beg,
337 start, end - beg - start, &regs);
338 len = regs.end[0] - start;
344 failure:
345 return 0;
347 success:
348 *endp = end < buflim ? end + 1 : end;
349 return beg;
352 static void
353 Fcompile (char *pattern, size_t size)
355 char *beg, *lim, *err;
357 kwsinit();
358 beg = pattern;
361 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
363 if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
364 fatal(err, 0);
365 if (lim < pattern + size)
366 ++lim;
367 beg = lim;
369 while (beg < pattern + size);
371 if ((err = kwsprep(kwset)) != 0)
372 fatal(err, 0);
375 static char *
376 Fexecute (char *buf, size_t size, char **endp)
378 register char *beg, *try, *end;
379 register size_t len;
380 char eol = eolbyte;
381 struct kwsmatch kwsmatch;
383 for (beg = buf; beg <= buf + size; ++beg)
385 if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch)))
386 return 0;
387 len = kwsmatch.size[0];
388 if (match_lines)
390 if (beg > buf && beg[-1] != eol)
391 continue;
392 if (beg + len < buf + size && beg[len] != eol)
393 continue;
394 goto success;
396 else if (match_words)
397 for (try = beg; len && try;)
399 if (try > buf && WCHAR((unsigned char) try[-1]))
400 break;
401 if (try + len < buf + size && WCHAR((unsigned char) try[len]))
403 try = kwsexec(kwset, beg, --len, &kwsmatch);
404 len = kwsmatch.size[0];
406 else
407 goto success;
409 else
410 goto success;
413 return 0;
415 success:
416 if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0)
417 ++end;
418 else
419 end = buf + size;
420 *endp = end;
421 while (beg > buf && beg[-1] != '\n')
422 --beg;
423 return beg;