More updates towards 2.0.
[mpdm.git] / mpdm_r.c
blobba4d4588d4d9480251d5f824811cf13664e42cd7
1 /*
3 MPDM - Minimum Profit Data Manager
4 Copyright (C) 2003/2010 Angel Ortega <angel@triptico.com>
6 mpdm_r.c - Regular expressions
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License
10 as published by the Free Software Foundation; either version 2
11 of the License, or (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 http://www.triptico.com
26 #include "config.h"
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "mpdm.h"
35 #ifdef CONFOPT_PCRE
36 #include <pcreposix.h>
37 #endif
39 #ifdef CONFOPT_SYSTEM_REGEX
40 #include <regex.h>
41 #endif
43 #ifdef CONFOPT_INCLUDED_REGEX
44 #include "gnu_regex.h"
45 #endif
/** data **/

/* State left behind by the most recent regex match: the character
   offset where the match started (-1 means "no successful match")
   and the number of characters that were matched. */
int mpdm_regex_offset = -1;
int mpdm_regex_size = 0;

/* How many substitutions the most recent mpdm_sregex() call made. */
int mpdm_sregex_count = 0;
60 /** code **/
62 static wchar_t *regex_flags(const mpdm_t r)
64 return wcsrchr((wchar_t *) r->data, *(wchar_t *) r->data);
68 static mpdm_t mpdm_regcomp(mpdm_t r)
70 mpdm_t c = NULL;
71 mpdm_t regex_cache = NULL;
73 /* if cache does not exist, create it */
74 if ((regex_cache = mpdm_hget_s(mpdm_root(), L"__REGEX_CACHE__")) == NULL) {
75 regex_cache = MPDM_H(0);
76 mpdm_hset_s(mpdm_root(), L"__REGEX_CACHE__", regex_cache);
79 /* search the regex in the cache */
80 if ((c = mpdm_hget(regex_cache, r)) == NULL) {
81 mpdm_t rmb;
82 regex_t re;
83 char *regex;
84 char *flags;
85 int f = REG_EXTENDED;
87 /* not found; regex must be compiled */
89 /* convert to mbs */
90 rmb = MPDM_2MBS(r->data);
92 regex = (char *) rmb->data;
93 if ((flags = strrchr(regex, *regex)) == NULL)
94 return NULL;
96 if (strchr(flags, 'i') != NULL)
97 f |= REG_ICASE;
98 if (strchr(flags, 'm') != NULL)
99 f |= REG_NEWLINE;
101 regex++;
102 *flags = '\0';
104 if (!regcomp(&re, regex, f)) {
105 void *ptr;
107 if ((ptr = malloc(sizeof(regex_t))) != NULL) {
108 /* copies */
109 memcpy(ptr, &re, sizeof(regex_t));
111 /* create value */
112 c = mpdm_new(MPDM_FREE, ptr, sizeof(regex_t));
114 /* stores */
115 mpdm_hset(regex_cache, r, c);
120 return c;
124 static mpdm_t regex1(mpdm_t r, const mpdm_t v, int offset)
125 /* test for one regex */
127 mpdm_t w = NULL;
128 mpdm_t cr;
130 /* no matching yet */
131 mpdm_regex_offset = -1;
133 /* compile the regex */
134 if ((cr = mpdm_regcomp(r)) != NULL) {
135 regmatch_t rm;
136 char *ptr;
137 wchar_t *last;
138 int o = 0;
139 int f = 0;
141 /* takes pointer to 'last' flag */
142 if ((last = regex_flags(r)) != NULL)
143 last = wcschr(last, 'l');
145 /* convert to mbs */
146 ptr = mpdm_wcstombs((wchar_t *) v->data + offset, NULL);
148 /* match? */
149 while (regexec((regex_t *) cr->data, ptr + o, 1,
150 &rm, offset > 0 ? REG_NOTBOL : 0) == 0) {
151 f++;
153 /* if 'last' is not set, it's done */
154 if (last == NULL)
155 break;
157 rm.rm_so += o;
158 rm.rm_eo += o;
159 o = rm.rm_eo;
162 if (f) {
163 /* converts to mbs the string from the beginning
164 to the start of the match, just to know
165 the size (and immediately frees it) */
166 free(mpdm_mbstowcs(ptr, &mpdm_regex_offset, rm.rm_so));
168 /* add the offset */
169 mpdm_regex_offset += offset;
171 /* create now the matching string */
172 w = MPDM_NMBS(ptr + rm.rm_so, rm.rm_eo - rm.rm_so);
174 /* and store the size */
175 mpdm_regex_size = mpdm_size(w);
178 free(ptr);
181 return w;
186 * mpdm_regex - Matches a regular expression.
187 * @r: the regular expression
188 * @v: the value to be matched
189 * @offset: offset from the start of v->data
191 * Matches a regular expression against a value. Valid flags are 'i',
192 * for case-insensitive matching, 'm', to treat the string as a
193 * multiline string (i.e., one containing newline characters), so
194 * that ^ and $ match the boundaries of each line instead of the
195 * whole string, 'l', to return the last matching instead of the
196 * first one, or 'g', to match globally; in that last case, an array
197 * containing all matches is returned instead of a string scalar.
199 * If @r is a string, an ordinary regular expression matching is tried
200 * over the @v string. If the matching is possible, the match result
201 * is returned, or NULL otherwise.
203 * If @r is an array (of strings), each element is tried sequentially
204 * as an individual regular expression over the @v string, each one using
205 * the offset returned by the previous match. All regular expressions
206 * must match to be successful. If this is the case, an array (with
207 * the same number of arguments) is returned containing the matched
208 * strings, or NULL otherwise.
210 * If @r is NULL, the result of the previous regex matching
211 * is returned as a two element array. The first element will contain
212 * the character offset of the matching and the second the number of
213 * characters matched. If the previous regex was unsuccessful, NULL
214 * is returned.
215 * [Regular Expressions]
217 mpdm_t mpdm_regex(mpdm_t r, const mpdm_t v, int offset)
219 mpdm_t w = NULL;
221 /* special case: if r is NULL, return previous match */
222 if (r == NULL) {
223 /* if previous regex was successful... */
224 if (mpdm_regex_offset != -1) {
225 w = MPDM_A(2);
227 mpdm_aset(w, MPDM_I(mpdm_regex_offset), 0);
228 mpdm_aset(w, MPDM_I(mpdm_regex_size), 1);
231 return w;
234 /* if the string to be tested is NULL, return NULL */
235 if (v == NULL)
236 return NULL;
238 if (r->flags & MPDM_MULTIPLE) {
239 int n;
240 mpdm_t t;
242 /* multiple value; try sequentially all regexes,
243 moving the offset forward */
245 w = MPDM_A(mpdm_size(r));
247 for (n = 0; n < mpdm_size(r); n++) {
248 t = mpdm_regex(mpdm_aget(r, n), v, offset);
250 /* if not found, invalid all search and exit */
251 if (t == NULL) {
252 w = NULL;
253 break;
256 /* found; store and move forward */
257 mpdm_aset(w, t, n);
258 offset = mpdm_regex_offset + mpdm_regex_size;
261 else {
262 wchar_t *global;
264 /* takes pointer to 'global' flag */
265 if ((global = regex_flags(r)) != NULL)
266 global = wcschr(global, 'g');
268 if (global != NULL) {
269 mpdm_t t;
271 /* match sequentially until done */
272 w = MPDM_A(0);
274 while ((t = regex1(r, v, offset)) != NULL) {
275 mpdm_push(w, t);
277 offset = mpdm_regex_offset + mpdm_regex_size;
280 /* no matches? convert to NULL */
281 if (mpdm_size(w) == 0)
282 w = NULL;
284 else
285 w = regex1(r, v, offset);
288 return w;
292 static mpdm_t expand_ampersands(const mpdm_t s, const mpdm_t t)
293 /* substitutes all unescaped ampersands in s with t */
295 const wchar_t *sptr = mpdm_string(s);
296 wchar_t *wptr;
297 mpdm_t r = NULL;
299 if (s == NULL)
300 return s;
302 while ((wptr = wcschr(sptr, L'\\')) != NULL ||
303 (wptr = wcschr(sptr, L'&')) != NULL) {
304 int n = wptr - sptr;
306 /* add the leading part */
307 r = mpdm_strcat_sn(r, sptr, n);
309 if (*wptr == L'\\') {
310 if (*(wptr + 1) == L'&' || *(wptr + 1) == L'\\')
311 wptr++;
313 r = mpdm_strcat_sn(r, wptr, 1);
315 else
316 if (*wptr == '&')
317 r = mpdm_strcat(r, t);
319 sptr = wptr + 1;
322 if (r == NULL)
323 r = s;
324 else {
325 /* add the rest of the string */
326 r = mpdm_strcat_s(r, sptr);
329 return r;
334 * mpdm_sregex - Matches and substitutes a regular expression.
335 * @r: the regular expression
336 * @v: the value to be matched
337 * @s: the substitution string, hash or code
338 * @offset: offset from the start of v->data
340 * Matches a regular expression against a value, and substitutes the
341 * found substring with @s. Valid flags are 'i', for case-insensitive
342 * matching, and 'g', for global replacements (all ocurrences in @v
343 * will be replaced, instead of just the first found one).
345 * If @s is executable, it's executed with the matched part as
346 * the only argument and its return value is used as the
347 * substitution string.
349 * If @s is a hash, the matched string is used as a key to it and
350 * its value used as the substitution. If this value itself is
351 * executable, it's executed with the matched string as its only
352 * argument and its return value used as the substitution.
354 * If @r is NULL, returns the number of substitutions made in the
355 * previous call to mpdm_sregex() (can be zero if none was done).
357 * The global variables @mpdm_regex_offset and @mpdm_regex_size are
358 * set to the offset of the matched string and the size of the
359 * replaced string, respectively.
361 * Returns the modified string, or the original one if no substitutions
362 * were done.
363 * [Regular Expressions]
365 mpdm_t mpdm_sregex(mpdm_t r, const mpdm_t v, const mpdm_t s, int offset)
367 mpdm_t cr;
368 mpdm_t o = v;
370 if (r == NULL) {
371 /* return last count */
372 return MPDM_I(mpdm_sregex_count);
375 if (v == NULL)
376 return NULL;
378 /* compile the regex */
379 if ((cr = mpdm_regcomp(r)) != NULL) {
380 char *ptr;
381 int f, i = 0;
382 wchar_t *global;
383 mpdm_t t;
385 /* takes pointer to global flag */
386 if ((global = regex_flags(r)) !=NULL)
387 global = wcschr(global, 'g');
389 /* store the first part */
390 o = MPDM_NS(v->data, offset);
392 /* convert to mbs */
393 if ((ptr = mpdm_wcstombs((wchar_t *) v->data + offset, NULL)) == NULL)
394 return NULL;
396 /* reset count */
397 mpdm_sregex_count = 0;
398 mpdm_regex_offset = -1;
400 do {
401 regmatch_t rm;
403 /* try match */
404 f = !regexec((regex_t *) cr->data, ptr + i,
405 1, &rm, offset > 0 ? REG_NOTBOL : 0);
407 if (f) {
408 /* creates a string from the beginning
409 to the start of the match */
410 t = MPDM_NMBS(ptr + i, rm.rm_so);
411 o = mpdm_strcat(o, t);
413 /* store offset of substitution */
414 mpdm_regex_offset = mpdm_size(t) + offset;
416 /* get the matched part */
417 t = MPDM_NMBS(ptr + i + rm.rm_so, rm.rm_eo - rm.rm_so);
419 /* is s an executable value? */
420 if (MPDM_IS_EXEC(s)) {
421 /* protect o from sweeping */
422 mpdm_ref(o);
424 /* execute s, with t as argument */
425 t = mpdm_exec_1(s, t);
427 mpdm_unref(o);
429 else
430 /* is s a hash? use match as key */
431 if (MPDM_IS_HASH(s)) {
432 mpdm_t v = mpdm_hget(s, t);
434 /* is the value executable? */
435 if (MPDM_IS_EXEC(v))
436 v = mpdm_exec_1(v, t);
438 t = v;
440 else
441 t = expand_ampersands(s, t);
443 /* appends the substitution string */
444 o = mpdm_strcat(o, t);
446 /* store size of substitution */
447 mpdm_regex_size = mpdm_size(t);
449 i += rm.rm_eo;
451 /* one more substitution */
452 mpdm_sregex_count++;
455 } while (f && global);
457 /* no (more) matches; convert and append the rest */
458 t = MPDM_MBS(ptr + i);
459 o = mpdm_strcat(o, t);
461 free(ptr);
464 return o;