1 /* pcresearch.c - searching subroutines using PCRE for grep.
2 Copyright 2000, 2007, 2009-2015 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
27 /* This must be at least 2; everything after that is for performance
31 /* Compiled internal form of a Perl regular expression. */
34 /* Additional information about the pattern. */
35 static pcre_extra
*extra
;
37 # ifndef PCRE_STUDY_JIT_COMPILE
38 # define PCRE_STUDY_JIT_COMPILE 0
41 # if PCRE_STUDY_JIT_COMPILE
42 /* Maximum size of the JIT stack. */
43 static int jit_stack_size
;
46 /* Match the already-compiled PCRE pattern against the data in SUBJECT,
47 of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
48 options OPTIONS, and storing resulting matches into SUB. Return
49 the (nonnegative) match location or a (negative) error number. */
51 jit_exec (char const *subject
, int search_bytes
, int search_offset
,
52 int options
, int *sub
)
56 int e
= pcre_exec (cre
, extra
, subject
, search_bytes
, search_offset
,
59 # if PCRE_STUDY_JIT_COMPILE
60 if (e
== PCRE_ERROR_JIT_STACKLIMIT
61 && 0 < jit_stack_size
&& jit_stack_size
<= INT_MAX
/ 2)
63 int old_size
= jit_stack_size
;
64 int new_size
= jit_stack_size
= old_size
* 2;
65 static pcre_jit_stack
*jit_stack
;
67 pcre_jit_stack_free (jit_stack
);
68 jit_stack
= pcre_jit_stack_alloc (old_size
, new_size
);
70 error (EXIT_TROUBLE
, 0,
71 _("failed to allocate memory for the PCRE JIT stack"));
72 pcre_assign_jit_stack (extra
, NULL
, jit_stack
);
84 /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
85 string matches when that flag is used. */
86 static int empty_match
[2];
90 Pcompile (char const *pattern
, size_t size
)
93 error (EXIT_TROUBLE
, 0, "%s",
94 _("support for the -P option is not compiled into "
95 "this --disable-perl-regexp binary"));
99 char *re
= xnmalloc (4, size
+ 7);
100 int flags
= (PCRE_MULTILINE
101 | (match_icase
? PCRE_CASELESS
: 0));
102 char const *patlim
= pattern
+ size
;
109 else if (MB_CUR_MAX
!= 1)
110 error (EXIT_TROUBLE
, 0, _("-P supports only unibyte and UTF-8 locales"));
112 /* FIXME: Remove these restrictions. */
113 if (memchr (pattern
, '\n', size
))
114 error (EXIT_TROUBLE
, 0, _("the -P option only supports a single pattern"));
120 strcpy (n
, "(?<!\\w)(?:");
123 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
124 replace each NUL byte in the pattern with the four characters
125 "\000", removing a preceding backslash if there are an odd
126 number of backslashes before the NUL.
128 FIXME: This method does not work with some multibyte character
129 encodings, notably Shift-JIS, where a multibyte character can end
130 in a backslash byte. */
131 for (p
= pattern
; (pnul
= memchr (p
, '\0', patlim
- p
)); p
= pnul
+ 1)
133 memcpy (n
, p
, pnul
- p
);
135 for (p
= pnul
; pattern
< p
&& p
[-1] == '\\'; p
--)
142 memcpy (n
, p
, patlim
- p
);
146 strcpy (n
, ")(?!\\w)");
150 cre
= pcre_compile (re
, flags
, &ep
, &e
, pcre_maketables ());
152 error (EXIT_TROUBLE
, 0, "%s", ep
);
154 extra
= pcre_study (cre
, PCRE_STUDY_JIT_COMPILE
, &ep
);
156 error (EXIT_TROUBLE
, 0, "%s", ep
);
158 # if PCRE_STUDY_JIT_COMPILE
159 if (pcre_fullinfo (cre
, extra
, PCRE_INFO_JIT
, &e
))
160 error (EXIT_TROUBLE
, 0, _("internal error (should never happen)"));
162 /* The PCRE documentation says that a 32 KiB stack is the default. */
164 jit_stack_size
= 32 << 10;
170 empty_match
[false] = pcre_exec (cre
, extra
, "", 0, 0,
171 PCRE_NOTBOL
, sub
, NSUB
);
172 empty_match
[true] = pcre_exec (cre
, extra
, "", 0, 0, 0, sub
, NSUB
);
173 #endif /* HAVE_LIBPCRE */
177 Pexecute (char const *buf
, size_t size
, size_t *match_size
,
178 char const *start_ptr
)
181 /* We can't get here, because Pcompile would have been called earlier. */
182 error (EXIT_TROUBLE
, 0, _("internal error"));
186 char const *p
= start_ptr
? start_ptr
: buf
;
187 bool bol
= p
[-1] == eolbyte
;
188 char const *line_start
= buf
;
189 int e
= PCRE_ERROR_NOMATCH
;
190 char const *line_end
;
192 /* The search address to pass to pcre_exec. This is the start of
193 the buffer, or just past the most-recently discovered encoding
195 char const *subject
= buf
;
197 /* If the input type is unknown, the caller is still testing the
198 input, which means the current buffer cannot contain encoding
199 errors and a multiline search is typically more efficient.
200 Otherwise, a single-line search is typically faster, so that
201 pcre_exec doesn't waste time validating the entire input
203 bool multiline
= input_textbin
== TEXTBIN_UNKNOWN
;
205 for (; p
< buf
+ size
; p
= line_start
= line_end
+ 1)
211 size_t pcre_size_max
= MIN (INT_MAX
, SIZE_MAX
- 1);
212 size_t scan_size
= MIN (pcre_size_max
+ 1, buf
+ size
- p
);
213 line_end
= memrchr (p
, eolbyte
, scan_size
);
214 too_big
= ! line_end
;
218 line_end
= memchr (p
, eolbyte
, buf
+ size
- p
);
219 too_big
= INT_MAX
< line_end
- p
;
223 error (EXIT_TROUBLE
, 0, _("exceeded PCRE's line length limit"));
227 /* Skip past bytes that are easily determined to be encoding
228 errors, treating them as data that cannot match. This is
229 faster than having pcre_exec check them. */
230 while (mbclen_cache
[to_uchar (*p
)] == (size_t) -1)
236 int search_offset
= p
- subject
;
238 /* Check for an empty match; this is faster than letting
242 sub
[0] = sub
[1] = search_offset
;
243 e
= empty_match
[bol
];
249 options
|= PCRE_NOTBOL
;
251 options
|= PCRE_NO_UTF8_CHECK
;
253 e
= jit_exec (subject
, line_end
- subject
, search_offset
,
255 if (e
!= PCRE_ERROR_BADUTF8
)
257 if (0 < e
&& multiline
&& sub
[1] - sub
[0] != 0)
259 char const *nl
= memchr (subject
+ sub
[0], eolbyte
,
263 /* This match crosses a line boundary; reject it. */
264 p
= subject
+ sub
[0];
271 int valid_bytes
= sub
[0];
273 /* Try to match the string before the encoding error. */
274 if (valid_bytes
< search_offset
)
275 e
= PCRE_ERROR_NOMATCH
;
276 else if (valid_bytes
== 0)
278 /* Handle the empty-match case specially, for speed.
279 This optimization is valid if VALID_BYTES is zero,
280 which means SEARCH_OFFSET is also zero. */
282 e
= empty_match
[bol
];
285 e
= jit_exec (subject
, valid_bytes
, search_offset
,
286 options
| PCRE_NO_UTF8_CHECK
| PCRE_NOTEOL
, sub
);
288 if (e
!= PCRE_ERROR_NOMATCH
)
291 /* Treat the encoding error as data that cannot match. */
292 p
= subject
+= valid_bytes
+ 1;
296 if (e
!= PCRE_ERROR_NOMATCH
)
305 case PCRE_ERROR_NOMATCH
:
308 case PCRE_ERROR_NOMEMORY
:
309 error (EXIT_TROUBLE
, 0, _("memory exhausted"));
311 # if PCRE_STUDY_JIT_COMPILE
312 case PCRE_ERROR_JIT_STACKLIMIT
:
313 error (EXIT_TROUBLE
, 0, _("exhausted PCRE JIT stack"));
316 case PCRE_ERROR_MATCHLIMIT
:
317 error (EXIT_TROUBLE
, 0, _("exceeded PCRE's backtracking limit"));
320 /* For now, we lump all remaining PCRE failures into this basket.
321 If anyone cares to provide sample grep usage that can trigger
322 particular PCRE errors, we can add to the list (above) of more
323 detailed diagnostics. */
324 error (EXIT_TROUBLE
, 0, _("internal PCRE error: %d"), e
);
331 char const *matchbeg
= subject
+ sub
[0];
332 char const *matchend
= subject
+ sub
[1];
342 char const *prev_nl
= memrchr (line_start
- 1, eolbyte
,
343 matchbeg
- (line_start
- 1));
344 char const *next_nl
= memchr (matchend
, eolbyte
,
345 line_end
+ 1 - matchend
);
354 *match_size
= end
- beg
;