Merge commit 'a058d1cc571af5fbcfe7f1d719df1abbfdb722f3' into merges
[unleashed.git] / usr / src / cmd / expr / compile.c
blobd3a0bf138523cfeccd947da78720fa5dc7522ff8
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
30 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4
31 * porting aid. switches out to libgen compile/step if collation
32 * table not present.
34 * Goal is to work with vi and sed/ed.
35 * Returns expbuf in dhl format (encoding of first two bytes).
36 * Note also that this is profoundly single threaded. You
37 * cannot call compile twice with two separate search strings
38 * because the second call will wipe out the earlier stored string.
39 * This must be fixed, plus a general cleanup should be performed
40 * if this is to be integrated into libc.
44 #include <stdio.h>
45 #include <widec.h>
46 #include <sys/types.h>
47 #include <regex.h>
48 #include <locale.h>
49 #include <stdlib.h>
50 #include <locale.h>
51 #include <string.h>
52 #include <unistd.h>
53 #include <regexpr.h>
/* cflags handed to regcomp() by this file; set by the caller beforehand */
int regcomp_flags;

/* Public cleanup hook for a buffer filled in by compile(). */
void regex_comp_free(void *a);

/* File-local helpers. */
static char *dhl_compile(const char *instr, char *expbuf, char *endbuf);
static int dhl_step(const char *str, const char *expbuf);
static int dhl_advance(const char *str, const char *expbuf);
static int dhl_doit(const char *str, const regex_t *rep, const int flags);
static int map_errnos(int rc);	/* Convert regcomp error */
65 * # of sub re's: NOTE: For now limit on bra list defined here
66 * but fix is to add maxbra define to regex.h
67 * One problem is that a bigger number is a performance hit since
68 * regexec() has a slow initialization loop that goes around SEPSIZE times
#define	SEPSIZE 20
static regmatch_t rm[SEPSIZE];	/* list of RE matches filled by regexec() */

/*
 * Structure to contain dl encoded first two bytes for vi, plus hold two
 * regex structures, one for advance and one for step.
 */
static struct regex_comp {
	char	r_head[2];	/* Header for DL encoding for vi */
	regex_t	r_stp;		/* For use by step */
	regex_t	r_adv;		/* For use by advance */
} reg_comp;

/*
 * global value for the size of a regex_comp structure:
 */
size_t regexc_size = sizeof (struct regex_comp);
/*
 * compile: public entry point; hands its arguments to dhl_compile() and
 * returns that routine's result unchanged.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *compiled = dhl_compile(instr, expbuf, endbuf);

	return (compiled);
}
/*
 * step: public entry point; hands its arguments to dhl_step() and returns
 * that routine's result unchanged.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched = dhl_step(instr, expbuf);

	return (matched);
}
/*
 * advance: public entry point; hands its arguments to dhl_advance() and
 * returns that routine's result unchanged.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched = dhl_advance(instr, expbuf);

	return (matched);
}
109 * the compile and step routines here simulate the old libgen routines of
110 * compile/step Re: regexpr(3G). in order to do this, we must assume
111 * that expbuf[] consists of the following format:
112 * 1) the first two bytes consist of a special encoding - see below.
113 * 2) the next part is a regex_t used by regexec()/regcomp() for step
114 * 3) the final part is a regex_t used by regexec()/regcomp() for advance
116 * the special encoding of the first two bytes is referenced throughout
117 * vi. apparently expbuf[0] is set to:
118 * = 0 upon initialization
119 * = 1 if the first char of the RE is a ^
120 * = 0 if the first char of the RE isn't a ^
121 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression.
122 * this is apparently 0 if there's no RE.
123 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
124 * if there's at least 1 RE in the string.
125 * I say "apparently" as the code to compile()/step() is poorly written.
127 static char *
128 dhl_compile(instr, expbuf, endbuf)
129 const char *instr; /* the regular expression */
130 char *expbuf; /* where the compiled RE gets placed */
131 char *endbuf; /* ending addr of expbuf */
133 int rv;
134 int alloc = 0;
135 char adv_instr[4096]; /* PLENTY big temp buffer */
136 char *instrp; /* PLENTY big temp buffer */
138 if (*instr == '\0') {
139 regerrno = 41;
140 return (NULL);
144 * Check values of expbuf and endbuf
146 if (expbuf == NULL) {
147 if ((expbuf = malloc(regexc_size)) == NULL) {
148 regerrno = 50;
149 return (NULL);
151 memset(&reg_comp, 0, regexc_size);
152 alloc = 1;
153 endbuf = expbuf + regexc_size;
154 } else { /* Check if enough memory was allocated */
155 if (expbuf + regexc_size > endbuf) {
156 regerrno = 50;
157 return (NULL);
159 memcpy(&reg_comp, expbuf, regexc_size);
163 * Clear global flags
165 nbra = 0;
166 regerrno = 0;
169 * Free any data being held for previous search strings
171 regex_comp_free(&reg_comp);
174 * We call regcomp twice, once to get a regex_t for use by step()
175 * and then again with for use by advance()
177 if ((rv = regcomp(&reg_comp.r_stp, instr, regcomp_flags)) != 0) {
178 regerrno = map_errnos(rv); /* Convert regcomp error */
179 goto out;
182 * To support advance, which assumes an implicit ^ to match at start
183 * of line we prepend a ^ to the pattern by copying to a temp buffer
186 if (instr[0] == '^')
187 instrp = (char *)instr; /* String already has leading ^ */
188 else {
189 adv_instr[0] = '^';
190 strncpy(&adv_instr[1], instr, 2048);
191 instrp = adv_instr;
194 if ((rv = regcomp(&reg_comp.r_adv, instrp, regcomp_flags)) != 0) {
195 regerrno = map_errnos(rv); /* Convert regcomp error */
196 goto out;
200 * update global variables
202 nbra = (int)reg_comp.r_adv.re_nsub > 0 ?
203 (int)reg_comp.r_adv.re_nsub : 0;
204 regerrno = 0;
207 * Set the header flags for use by vi
209 if (instr[0] == '^') /* if beginning of string, */
210 reg_comp.r_head[0] = 1; /* set special flag */
211 else
212 reg_comp.r_head[0] = 0; /* clear special flag */
214 * note that for a single BRE, nbra will be 0 here.
215 * we're guaranteed that, at this point, a RE has been found.
217 reg_comp.r_head[1] = 1; /* set special flag */
219 * Copy our reg_comp structure to expbuf
221 (void) memcpy(expbuf, (char *)&reg_comp, regexc_size);
223 out:
225 * Return code from libgen regcomp with mods. Note weird return
226 * value - if space is malloc'd return pointer to start of space,
227 * if user provided their own space, return pointer to 1+last byte
228 * of that space.
230 if (regerrno != 0) {
231 if (alloc)
232 free(expbuf);
233 return (NULL);
235 reglength = regexc_size;
237 if (alloc)
238 return (expbuf);
239 else
240 return (expbuf + regexc_size);
245 * dhl_step: step through a string until a RE match is found, or end of str
247 static int
248 dhl_step(str, ep)
249 const char *str; /* characters to be checked for a match */
250 const char *ep; /* compiled RE from dhl_compile() */
253 * Check if we're passed a null ep
255 if (ep == NULL) {
256 regerrno = 41; /* No remembered search string error */
257 return (0);
260 * Call common routine with r_stp (step) structure
262 return (dhl_doit(str, &(((struct regex_comp *)ep)->r_stp),
263 ((locs != NULL) ? REG_NOTBOL : 0)));
267 * dhl_advance: implement advance
269 static int
270 dhl_advance(str, ep)
271 const char *str; /* characters to be checked for a match */
272 const char *ep; /* compiled RE from dhl_compile() */
274 int rv;
276 * Check if we're passed a null ep
278 if (ep == NULL) {
279 regerrno = 41; /* No remembered search string error */
280 return (0);
283 * Call common routine with r_adv (advance) structure
285 rv = dhl_doit(str, &(((struct regex_comp *)ep)->r_adv), 0);
286 loc1 = NULL; /* Clear it per the compile man page */
287 return (rv);
291 * dhl_doit - common code for step and advance
293 static int
294 dhl_doit(str, rep, flags)
295 const char *str; /* characters to be checked for a match */
296 const regex_t *rep;
297 const int flags; /* flags to be passed to regexec directly */
299 int rv;
300 int i;
301 regmatch_t *prm; /* ptr to current regmatch_t */
304 * Check if we're passed a null regex_t
306 if (rep == NULL) {
307 regerrno = 41; /* No remembered search string error */
308 return (0);
311 regerrno = 0;
312 prm = &rm[0];
314 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
315 if (rv == REG_NOMATCH)
316 return (0);
317 regerrno = map_errnos(rv);
318 return (0);
321 loc1 = (char *)str + prm->rm_so;
322 loc2 = (char *)str + prm->rm_eo;
325 * Now we need to fill up the bra lists with all of the sub re's
326 * Note we subtract nsub -1, and preincrement prm.
328 for (i = 0; i <= rep->re_nsub; i++) {
329 prm++; /* XXX inc past first subexp */
330 braslist[i] = (char *)str + prm->rm_so;
331 braelist[i] = (char *)str + prm->rm_eo;
332 if (i >= SEPSIZE) {
333 regerrno = 50; /* regex overflow */
334 return (0);
339 * Inverse logic, a zero from regexec - success, is a 1
340 * from advance/step.
343 return (rv == 0);
348 * regerrno to compile/step error mapping:
349 * This is really a big compromise. Some errors don't map at all
350 * like regcomp error 15 is generated by both compile() error types
351 * 44 & 46. So which one should we map to?
352 * Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions
353 * To do your errors right use xregerr() to get the regcomp error
354 * string and print that.
356 * | regcomp/regexec | Compile/step/advance |
357 * +---------------------------------+--------------------------------------+
358 * 0 REG_OK Pattern matched 1 - Pattern matched
359 * 1 REG_NOMATCH No match 0 - Pattern didn't match
360 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err
361 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \.
362 * 4 REG_ENEWLINE \n before end pattrn 36 - Illegal or missing delimiter.
363 * 5 REG_ENSUB Over 9 \( \) pairs 43 - Too many \(
364 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range.
365 * 7 REG_EBRACK [ ] imbalance 49 - [ ] imbalance.
366 * 8 REG_EPAREN ( ) imbalance 42 - \(~\) imbalance.
367 * 9 REG_EBRACE \{ \} imbalance 45 - } expected after \.
368 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large.
369 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow.
370 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter.
371 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence
372 * 14 REG_BADPAT syntax error 50 - Regular expression overflow.
373 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\}
374 * 16 REG_EFATAL internal error 50 - Regular expression overflow.
375 * 17 REG_ECHAR bad multibyte char 67 - illegal byte sequence
376 * 18 REG_STACK stack overflow 50 - Regular expression overflow.
377 * 19 REG_ENOSYS function not supported 50- Regular expression overflow.
379 * For reference here's the compile/step errno's. We don't generate
380 * 41 here - it's done earlier, nor 44 since we can't tell it from 46.
382 * 11 - Range endpoint too large.
383 * 16 - Bad number.
384 * 25 - ``\digit'' out of range.
385 * 36 - Illegal or missing delimiter.
386 * 41 - No remembered search string.
387 * 42 - \(~\) imbalance.
388 * 43 - Too many \(.
389 * 44 - More than 2 numbers given in "\{~\}"
390 * 45 - } expected after \.
391 * 46 - First number exceeds 2nd in "\{~\}"
392 * 49 - [ ] imbalance.
393 * 50 - Regular expression overflow.
396 static int
397 map_errnos(int Errno)
399 switch (Errno) {
400 case REG_ECOLLATE:
401 regerrno = 67;
402 break;
403 case REG_EESCAPE:
404 regerrno = 45;
405 break;
406 case REG_ENEWLINE:
407 regerrno = 36;
408 break;
409 case REG_ENSUB:
410 regerrno = 43;
411 break;
412 case REG_ESUBREG:
413 regerrno = 25;
414 break;
415 case REG_EBRACK:
416 regerrno = 49;
417 break;
418 case REG_EPAREN:
419 regerrno = 42;
420 break;
421 case REG_EBRACE:
422 regerrno = 45;
423 break;
424 case REG_ERANGE:
425 regerrno = 11;
426 break;
427 case REG_ESPACE:
428 regerrno = 50;
429 break;
430 case REG_BADRPT:
431 regerrno = 36;
432 break;
433 case REG_ECTYPE:
434 regerrno = 67;
435 break;
436 case REG_BADPAT:
437 regerrno = 50;
438 break;
439 case REG_BADBR:
440 regerrno = 46;
441 break;
442 case REG_EFATAL:
443 regerrno = 50;
444 break;
445 case REG_ECHAR:
446 regerrno = 67;
447 break;
448 case REG_STACK:
449 regerrno = 50;
450 break;
451 case REG_ENOSYS:
452 regerrno = 50;
453 break;
454 default:
455 regerrno = 50;
456 break;
458 return (regerrno);
462 * This is a routine to clean up the subtle substructure of the struct
463 * regex_comp type for use by clients of this module. Since the struct
464 * type is private, we use a generic interface, and trust the
465 * application to be damn sure that this operation is valid for the
466 * named memory.
469 void
470 regex_comp_free(void * a)
473 * Free any data being held for previous search strings
476 if (((struct regex_comp *)a) == NULL) {
477 return;
480 regfree(&((struct regex_comp *)a)->r_stp);
481 regfree(&((struct regex_comp *)a)->r_adv);