kernel - support dummy reallocblks in devfs
[dragonfly.git] / contrib / grep / src / pcresearch.c
blobb1f831048692e5905115c06065e392b4899bdd28
1 /* pcresearch.c - searching subroutines using PCRE for grep.
2 Copyright 2000, 2007, 2009-2015 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
19 /* Written August 1992 by Mike Haertel. */
21 #include <config.h>
22 #include "search.h"
24 #if HAVE_LIBPCRE
25 # include <pcre.h>
27 /* This must be at least 2; everything after that is for performance
28 in pcre_exec. */
29 enum { NSUB = 300 };
31 /* Compiled internal form of a Perl regular expression. */
32 static pcre *cre;
34 /* Additional information about the pattern. */
35 static pcre_extra *extra;
37 # ifndef PCRE_STUDY_JIT_COMPILE
38 # define PCRE_STUDY_JIT_COMPILE 0
39 # endif
41 # if PCRE_STUDY_JIT_COMPILE
42 /* Maximum size of the JIT stack. */
43 static int jit_stack_size;
44 # endif
46 /* Match the already-compiled PCRE pattern against the data in SUBJECT,
47 of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
48 options OPTIONS, and storing resulting matches into SUB. Return
49 the (nonnegative) match location or a (negative) error number. */
50 static int
51 jit_exec (char const *subject, int search_bytes, int search_offset,
52 int options, int *sub)
54 while (true)
56 int e = pcre_exec (cre, extra, subject, search_bytes, search_offset,
57 options, sub, NSUB);
59 # if PCRE_STUDY_JIT_COMPILE
60 if (e == PCRE_ERROR_JIT_STACKLIMIT
61 && 0 < jit_stack_size && jit_stack_size <= INT_MAX / 2)
63 int old_size = jit_stack_size;
64 int new_size = jit_stack_size = old_size * 2;
65 static pcre_jit_stack *jit_stack;
66 if (jit_stack)
67 pcre_jit_stack_free (jit_stack);
68 jit_stack = pcre_jit_stack_alloc (old_size, new_size);
69 if (!jit_stack)
70 error (EXIT_TROUBLE, 0,
71 _("failed to allocate memory for the PCRE JIT stack"));
72 pcre_assign_jit_stack (extra, NULL, jit_stack);
73 continue;
75 # endif
77 return e;
81 #endif
83 #if HAVE_LIBPCRE
84 /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
85 string matches when that flag is used. */
86 static int empty_match[2];
87 #endif
89 void
90 Pcompile (char const *pattern, size_t size)
92 #if !HAVE_LIBPCRE
93 error (EXIT_TROUBLE, 0, "%s",
94 _("support for the -P option is not compiled into "
95 "this --disable-perl-regexp binary"));
96 #else
97 int e;
98 char const *ep;
99 char *re = xnmalloc (4, size + 7);
100 int flags = (PCRE_MULTILINE
101 | (match_icase ? PCRE_CASELESS : 0));
102 char const *patlim = pattern + size;
103 char *n = re;
104 char const *p;
105 char const *pnul;
107 if (using_utf8 ())
108 flags |= PCRE_UTF8;
109 else if (MB_CUR_MAX != 1)
110 error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
112 /* FIXME: Remove these restrictions. */
113 if (memchr (pattern, '\n', size))
114 error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
116 *n = '\0';
117 if (match_lines)
118 strcpy (n, "^(?:");
119 if (match_words)
120 strcpy (n, "(?<!\\w)(?:");
121 n += strlen (n);
123 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
124 replace each NUL byte in the pattern with the four characters
125 "\000", removing a preceding backslash if there are an odd
126 number of backslashes before the NUL.
128 FIXME: This method does not work with some multibyte character
129 encodings, notably Shift-JIS, where a multibyte character can end
130 in a backslash byte. */
131 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
133 memcpy (n, p, pnul - p);
134 n += pnul - p;
135 for (p = pnul; pattern < p && p[-1] == '\\'; p--)
136 continue;
137 n -= (pnul - p) & 1;
138 strcpy (n, "\\000");
139 n += 4;
142 memcpy (n, p, patlim - p);
143 n += patlim - p;
144 *n = '\0';
145 if (match_words)
146 strcpy (n, ")(?!\\w)");
147 if (match_lines)
148 strcpy (n, ")$");
150 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
151 if (!cre)
152 error (EXIT_TROUBLE, 0, "%s", ep);
154 extra = pcre_study (cre, PCRE_STUDY_JIT_COMPILE, &ep);
155 if (ep)
156 error (EXIT_TROUBLE, 0, "%s", ep);
158 # if PCRE_STUDY_JIT_COMPILE
159 if (pcre_fullinfo (cre, extra, PCRE_INFO_JIT, &e))
160 error (EXIT_TROUBLE, 0, _("internal error (should never happen)"));
162 /* The PCRE documentation says that a 32 KiB stack is the default. */
163 if (e)
164 jit_stack_size = 32 << 10;
165 # endif
167 free (re);
169 int sub[NSUB];
170 empty_match[false] = pcre_exec (cre, extra, "", 0, 0,
171 PCRE_NOTBOL, sub, NSUB);
172 empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB);
173 #endif /* HAVE_LIBPCRE */
176 size_t
177 Pexecute (char const *buf, size_t size, size_t *match_size,
178 char const *start_ptr)
180 #if !HAVE_LIBPCRE
181 /* We can't get here, because Pcompile would have been called earlier. */
182 error (EXIT_TROUBLE, 0, _("internal error"));
183 return -1;
184 #else
185 int sub[NSUB];
186 char const *p = start_ptr ? start_ptr : buf;
187 bool bol = p[-1] == eolbyte;
188 char const *line_start = buf;
189 int e = PCRE_ERROR_NOMATCH;
190 char const *line_end;
192 /* The search address to pass to pcre_exec. This is the start of
193 the buffer, or just past the most-recently discovered encoding
194 error. */
195 char const *subject = buf;
197 /* If the input type is unknown, the caller is still testing the
198 input, which means the current buffer cannot contain encoding
199 errors and a multiline search is typically more efficient.
200 Otherwise, a single-line search is typically faster, so that
201 pcre_exec doesn't waste time validating the entire input
202 buffer. */
203 bool multiline = input_textbin == TEXTBIN_UNKNOWN;
205 for (; p < buf + size; p = line_start = line_end + 1)
207 bool too_big;
209 if (multiline)
211 size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
212 size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
213 line_end = memrchr (p, eolbyte, scan_size);
214 too_big = ! line_end;
216 else
218 line_end = memchr (p, eolbyte, buf + size - p);
219 too_big = INT_MAX < line_end - p;
222 if (too_big)
223 error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
225 for (;;)
227 /* Skip past bytes that are easily determined to be encoding
228 errors, treating them as data that cannot match. This is
229 faster than having pcre_exec check them. */
230 while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
232 p++;
233 bol = false;
236 int search_offset = p - subject;
238 /* Check for an empty match; this is faster than letting
239 pcre_exec do it. */
240 if (p == line_end)
242 sub[0] = sub[1] = search_offset;
243 e = empty_match[bol];
244 break;
247 int options = 0;
248 if (!bol)
249 options |= PCRE_NOTBOL;
250 if (multiline)
251 options |= PCRE_NO_UTF8_CHECK;
253 e = jit_exec (subject, line_end - subject, search_offset,
254 options, sub);
255 if (e != PCRE_ERROR_BADUTF8)
257 if (0 < e && multiline && sub[1] - sub[0] != 0)
259 char const *nl = memchr (subject + sub[0], eolbyte,
260 sub[1] - sub[0]);
261 if (nl)
263 /* This match crosses a line boundary; reject it. */
264 p = subject + sub[0];
265 line_end = nl;
266 continue;
269 break;
271 int valid_bytes = sub[0];
273 /* Try to match the string before the encoding error. */
274 if (valid_bytes < search_offset)
275 e = PCRE_ERROR_NOMATCH;
276 else if (valid_bytes == 0)
278 /* Handle the empty-match case specially, for speed.
279 This optimization is valid if VALID_BYTES is zero,
280 which means SEARCH_OFFSET is also zero. */
281 sub[1] = 0;
282 e = empty_match[bol];
284 else
285 e = jit_exec (subject, valid_bytes, search_offset,
286 options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
288 if (e != PCRE_ERROR_NOMATCH)
289 break;
291 /* Treat the encoding error as data that cannot match. */
292 p = subject += valid_bytes + 1;
293 bol = false;
296 if (e != PCRE_ERROR_NOMATCH)
297 break;
298 bol = true;
301 if (e <= 0)
303 switch (e)
305 case PCRE_ERROR_NOMATCH:
306 break;
308 case PCRE_ERROR_NOMEMORY:
309 error (EXIT_TROUBLE, 0, _("memory exhausted"));
311 # if PCRE_STUDY_JIT_COMPILE
312 case PCRE_ERROR_JIT_STACKLIMIT:
313 error (EXIT_TROUBLE, 0, _("exhausted PCRE JIT stack"));
314 # endif
316 case PCRE_ERROR_MATCHLIMIT:
317 error (EXIT_TROUBLE, 0, _("exceeded PCRE's backtracking limit"));
319 default:
320 /* For now, we lump all remaining PCRE failures into this basket.
321 If anyone cares to provide sample grep usage that can trigger
322 particular PCRE errors, we can add to the list (above) of more
323 detailed diagnostics. */
324 error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
327 return -1;
329 else
331 char const *matchbeg = subject + sub[0];
332 char const *matchend = subject + sub[1];
333 char const *beg;
334 char const *end;
335 if (start_ptr)
337 beg = matchbeg;
338 end = matchend;
340 else if (multiline)
342 char const *prev_nl = memrchr (line_start - 1, eolbyte,
343 matchbeg - (line_start - 1));
344 char const *next_nl = memchr (matchend, eolbyte,
345 line_end + 1 - matchend);
346 beg = prev_nl + 1;
347 end = next_nl + 1;
349 else
351 beg = line_start;
352 end = line_end + 1;
354 *match_size = end - beg;
355 return beg - buf;
357 #endif