1 /* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright (C) 1992, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
21 /* $FreeBSD: src/gnu/usr.bin/grep/search.c,v 1.10 2000/01/31 13:28:57 ru Exp $ */
22 /* $DragonFly: src/gnu/usr.bin/grep/search.c,v 1.3 2004/02/03 20:22:57 dillon Exp $ */
27 #include <sys/types.h>
34 #define NCHAR (UCHAR_MAX + 1)
36 static void Gcompile
PARAMS((char *, size_t));
37 static void Ecompile
PARAMS((char *, size_t));
38 static char *EGexecute
PARAMS((char *, size_t, char **));
39 static void Fcompile
PARAMS((char *, size_t));
40 static char *Fexecute
PARAMS((char *, size_t, char **));
41 static void kwsinit
PARAMS((void));
43 /* Here is the matchers vector for the main program. */
44 struct matcher matchers
[] = {
45 { "default", Gcompile
, EGexecute
},
46 { "grep", Gcompile
, EGexecute
},
47 { "egrep", Ecompile
, EGexecute
},
48 { "awk", Ecompile
, EGexecute
},
49 { "fgrep", Fcompile
, Fexecute
},
53 /* For -w, we also consider _ to be word constituent. */
54 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
56 /* DFA compiled regexp. */
57 static struct dfa dfa
;
59 /* Regex compiled regexp. */
60 static struct re_pattern_buffer regexbuf
;
62 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
63 a list of strings, at least one of which is known to occur in
64 any string matching the regexp. */
67 /* Last compiled fixed string known to exactly match the regexp.
68 If kwsexec() returns < lastexact, then we don't need to
69 call the regexp matcher at all. */
73 dfaerror (char const *mesg
)
81 static char trans
[NCHAR
];
85 for (i
= 0; i
< NCHAR
; ++i
)
86 trans
[i
] = TOLOWER(i
);
88 if (!(kwset
= kwsalloc(match_icase
? trans
: (char *) 0)))
89 fatal("memory exhausted", 0);
92 /* If the DFA turns out to have some set of fixed strings one of
93 which must occur in the match, then we build a kwset matcher
94 to find those strings, and thus quickly filter out impossible
105 /* First, we compile in the substrings known to be exact
106 matches. The kwset matcher will return the index
107 of the matching string that it chooses. */
108 for (dm
= dfa
.musts
; dm
; dm
= dm
->next
)
113 if ((err
= kwsincr(kwset
, dm
->must
, strlen(dm
->must
))) != 0)
116 /* Now, we compile the substrings that will require
117 the use of the regexp matcher. */
118 for (dm
= dfa
.musts
; dm
; dm
= dm
->next
)
122 if ((err
= kwsincr(kwset
, dm
->must
, strlen(dm
->must
))) != 0)
125 if ((err
= kwsprep(kwset
)) != 0)
131 Gcompile (char *pattern
, size_t size
)
135 re_set_syntax(RE_SYNTAX_GREP
| RE_HAT_LISTS_NOT_NEWLINE
);
136 dfasyntax(RE_SYNTAX_GREP
| RE_HAT_LISTS_NOT_NEWLINE
, match_icase
, eolbyte
);
138 if ((err
= re_compile_pattern(pattern
, size
, ®exbuf
)) != 0)
141 /* In the match_words and match_lines cases, we use a different pattern
142 for the DFA matcher that will quickly throw out cases that won't work.
143 Then if DFA succeeds we do some hairy stuff using the regex matcher
144 to decide whether the match should really count. */
145 if (match_words
|| match_lines
)
147 /* In the whole-word case, we use the pattern:
148 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
149 In the whole-line case, we use the pattern:
151 BUG: Using [A-Za-z_] is locale-dependent!
152 So will use [:alnum:] */
154 char *n
= malloc(size
+ 50);
162 strcpy(n
, "\\(^\\|[^[:alnum:]_]\\)\\(");
165 memcpy(n
+ i
, pattern
, size
);
169 strcpy(n
+ i
, "\\)\\([^[:alnum:]_]\\|$\\)");
171 strcpy(n
+ i
, "\\)$");
174 dfacomp(n
, i
, &dfa
, 1);
177 dfacomp(pattern
, size
, &dfa
, 1);
183 Ecompile (char *pattern
, size_t size
)
187 if (strcmp(matcher
, "awk") == 0)
189 re_set_syntax(RE_SYNTAX_AWK
);
190 dfasyntax(RE_SYNTAX_AWK
, match_icase
, eolbyte
);
194 re_set_syntax (RE_SYNTAX_POSIX_EGREP
);
195 dfasyntax (RE_SYNTAX_POSIX_EGREP
, match_icase
, eolbyte
);
198 if ((err
= re_compile_pattern(pattern
, size
, ®exbuf
)) != 0)
201 /* In the match_words and match_lines cases, we use a different pattern
202 for the DFA matcher that will quickly throw out cases that won't work.
203 Then if DFA succeeds we do some hairy stuff using the regex matcher
204 to decide whether the match should really count. */
205 if (match_words
|| match_lines
)
207 /* In the whole-word case, we use the pattern:
208 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
209 In the whole-line case, we use the pattern:
211 BUG: Using [A-Za-z_] is locale-dependent!
212 so will use the char class */
214 char *n
= malloc(size
+ 50);
222 strcpy(n
, "(^|[^[:alnum:]_])(");
225 memcpy(n
+ i
, pattern
, size
);
229 strcpy(n
+ i
, ")([^[:alnum:]_]|$)");
234 dfacomp(n
, i
, &dfa
, 1);
237 dfacomp(pattern
, size
, &dfa
, 1);
243 EGexecute (char *buf
, size_t size
, char **endp
)
245 register char *buflim
, *beg
, *end
, save
;
247 int backref
, start
, len
;
248 struct kwsmatch kwsm
;
249 static struct re_registers regs
; /* This is static on account of a BRAIN-DEAD
250 Q@#%!# library interface in regex.c. */
254 for (beg
= end
= buf
; end
< buflim
; beg
= end
+ 1)
258 /* Find a possible match using the KWset matcher. */
259 beg
= kwsexec(kwset
, beg
, buflim
- beg
, &kwsm
);
262 /* Narrow down to the line containing the candidate, and
263 run it through DFA. */
264 end
= memchr(beg
, eol
, buflim
- beg
);
267 while (beg
> buf
&& beg
[-1] != eol
)
270 if (kwsm
.index
< lastexact
)
272 if (!dfaexec(&dfa
, beg
, end
, 0, (int *) 0, &backref
))
278 /* Successful, no backreferences encountered. */
284 /* No good fixed strings; start with DFA. */
286 beg
= dfaexec(&dfa
, beg
, buflim
, 0, (int *) 0, &backref
);
290 /* Narrow down to the line we've found. */
291 end
= memchr(beg
, eol
, buflim
- beg
);
294 while (beg
> buf
&& beg
[-1] != eol
)
296 /* Successful, no backreferences encountered! */
300 /* If we've made it to this point, this means DFA has seen
301 a probable match, and we need to run it through Regex. */
302 regexbuf
.not_eol
= 0;
303 if ((start
= re_search(®exbuf
, beg
, end
- beg
, 0, end
- beg
, ®s
)) >= 0)
305 len
= regs
.end
[0] - start
;
306 if ((!match_lines
&& !match_words
)
307 || (match_lines
&& len
== end
- beg
))
309 /* If -w, check if the match aligns with word boundaries.
310 We do this iteratively because:
311 (a) the line may contain more than one occurence of the pattern, and
312 (b) Several alternatives in the pattern might be valid at a given
313 point, and we may need to consider a shorter one to find a word
318 if ((start
== 0 || !WCHAR ((unsigned char) beg
[start
- 1]))
320 || !WCHAR ((unsigned char) beg
[start
+ len
])))
324 /* Try a shorter length anchored at the same place. */
326 regexbuf
.not_eol
= 1;
327 len
= re_match(®exbuf
, beg
, start
+ len
, start
, ®s
);
331 /* Try looking further on. */
332 if (start
== end
- beg
)
335 regexbuf
.not_eol
= 0;
336 start
= re_search(®exbuf
, beg
, end
- beg
,
337 start
, end
- beg
- start
, ®s
);
338 len
= regs
.end
[0] - start
;
348 *endp
= end
< buflim
? end
+ 1 : end
;
353 Fcompile (char *pattern
, size_t size
)
355 char *beg
, *lim
, *err
;
361 for (lim
= beg
; lim
< pattern
+ size
&& *lim
!= '\n'; ++lim
)
363 if ((err
= kwsincr(kwset
, beg
, lim
- beg
)) != 0)
365 if (lim
< pattern
+ size
)
369 while (beg
< pattern
+ size
);
371 if ((err
= kwsprep(kwset
)) != 0)
376 Fexecute (char *buf
, size_t size
, char **endp
)
378 register char *beg
, *try, *end
;
381 struct kwsmatch kwsmatch
;
383 for (beg
= buf
; beg
<= buf
+ size
; ++beg
)
385 if (!(beg
= kwsexec(kwset
, beg
, buf
+ size
- beg
, &kwsmatch
)))
387 len
= kwsmatch
.size
[0];
390 if (beg
> buf
&& beg
[-1] != eol
)
392 if (beg
+ len
< buf
+ size
&& beg
[len
] != eol
)
396 else if (match_words
)
397 for (try = beg
; len
&& try;)
399 if (try > buf
&& WCHAR((unsigned char) try[-1]))
401 if (try + len
< buf
+ size
&& WCHAR((unsigned char) try[len
]))
403 try = kwsexec(kwset
, beg
, --len
, &kwsmatch
);
404 len
= kwsmatch
.size
[0];
416 if ((end
= memchr(beg
+ len
, eol
, (buf
+ size
) - (beg
+ len
))) != 0)
421 while (beg
> buf
&& beg
[-1] != '\n')