Updated TODO.
[mpdm.git] / mpdm_r.c
blobefad3ca5b351a7a2fe4c8d3d4b001124c25705a0
1 /*
3 MPDM - Minimum Profit Data Manager
4 Copyright (C) 2003/2010 Angel Ortega <angel@triptico.com>
6 mpdm_r.c - Regular expressions
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License
10 as published by the Free Software Foundation; either version 2
11 of the License, or (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 http://www.triptico.com
26 #include "config.h"
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "mpdm.h"
35 #ifdef CONFOPT_PCRE
36 #include <pcreposix.h>
37 #endif
39 #ifdef CONFOPT_SYSTEM_REGEX
40 #include <regex.h>
41 #endif
43 #ifdef CONFOPT_INCLUDED_REGEX
44 #include "gnu_regex.h"
45 #endif
/** data **/

/* Result of the last regex matching: offset (in characters) of the
   match, or -1 if there was no successful match, and the size (in
   characters) of the matched string. Both are written by the matching
   functions in this file and read back when a NULL regex is given. */

int mpdm_regex_offset = -1;
int mpdm_regex_size = 0;

/* number of substitutions done in the last call to mpdm_sregex() */

int mpdm_sregex_count = 0;
60 /** code **/
62 static wchar_t *regex_flags(const mpdm_t r)
64 wchar_t *ptr = mpdm_string(r);
65 return wcsrchr(ptr, *ptr);
69 static mpdm_t mpdm_regcomp(mpdm_t r)
71 mpdm_t c = NULL;
72 mpdm_t regex_cache = NULL;
74 mpdm_ref(r);
76 /* if cache does not exist, create it */
77 if ((regex_cache =
78 mpdm_hget_s(mpdm_root(), L"__REGEX_CACHE__")) == NULL) {
79 regex_cache = MPDM_H(0);
80 mpdm_hset_s(mpdm_root(), L"__REGEX_CACHE__", regex_cache);
83 /* search the regex in the cache */
84 if ((c = mpdm_hget(regex_cache, r)) == NULL) {
85 mpdm_t rmb;
86 regex_t re;
87 char *regex;
88 char *flags;
89 int f = REG_EXTENDED;
91 /* not found; regex must be compiled */
93 /* convert to mbs */
94 rmb = mpdm_ref(MPDM_2MBS(r->data));
95 regex = (char *) rmb->data;
97 if ((flags = strrchr(regex, *regex)) != NULL) {
99 if (strchr(flags, 'i') != NULL)
100 f |= REG_ICASE;
101 if (strchr(flags, 'm') != NULL)
102 f |= REG_NEWLINE;
104 regex++;
105 *flags = '\0';
107 if (!regcomp(&re, regex, f)) {
108 c = MPDM_C(MPDM_REGEX, &re, sizeof(regex_t));
109 mpdm_hset(regex_cache, r, c);
113 mpdm_unref(rmb);
116 mpdm_unref(r);
118 return c;
122 static mpdm_t regex1(mpdm_t r, const mpdm_t v, int offset)
123 /* test for one regex */
125 mpdm_t w = NULL;
126 mpdm_t cr;
128 mpdm_ref(r);
129 mpdm_ref(v);
131 /* no matching yet */
132 mpdm_regex_offset = -1;
134 /* compile the regex */
135 if ((cr = mpdm_regcomp(r)) != NULL) {
136 regmatch_t rm;
137 char *ptr;
138 wchar_t *last;
139 int o = 0;
140 int f = 0;
142 /* takes pointer to 'last' flag */
143 if ((last = regex_flags(r)) != NULL)
144 last = wcschr(last, 'l');
146 /* convert to mbs */
147 ptr = mpdm_wcstombs((wchar_t *) mpdm_string(v) + offset, NULL);
149 /* match? */
150 while (regexec((regex_t *) cr->data, ptr + o, 1,
151 &rm, offset > 0 ? REG_NOTBOL : 0) == 0) {
152 f++;
154 /* if 'last' is not set, it's done */
155 if (last == NULL)
156 break;
158 rm.rm_so += o;
159 rm.rm_eo += o;
160 o = rm.rm_eo;
163 if (f) {
164 /* converts to mbs the string from the beginning
165 to the start of the match, just to know
166 the size (and immediately frees it) */
167 free(mpdm_mbstowcs(ptr, &mpdm_regex_offset, rm.rm_so));
169 /* add the offset */
170 mpdm_regex_offset += offset;
172 /* create now the matching string */
173 w = MPDM_NMBS(ptr + rm.rm_so, rm.rm_eo - rm.rm_so);
175 /* and store the size */
176 mpdm_regex_size = mpdm_size(w);
179 free(ptr);
182 mpdm_unref(v);
183 mpdm_unref(r);
185 return w;
190 * mpdm_regex - Matches a regular expression.
191 * @v: the value to be matched
192 * @r: the regular expression
193 * @offset: offset from the start of v->data
195 * Matches a regular expression against a value. Valid flags are 'i',
196 * for case-insensitive matching, 'm', to treat the string as a
197 * multiline string (i.e., one containing newline characters), so
198 * that ^ and $ match the boundaries of each line instead of the
199 * whole string, 'l', to return the last matching instead of the
200 * first one, or 'g', to match globally; in that last case, an array
201 * containing all matches is returned instead of a string scalar.
203 * If @r is a string, an ordinary regular expression matching is tried
204 * over the @v string. If the matching is possible, the match result
205 * is returned, or NULL otherwise.
207 * If @r is an array (of strings), each element is tried sequentially
208 * as an individual regular expression over the @v string, each one using
209 * the offset returned by the previous match. All regular expressions
210 * must match to be successful. If this is the case, an array (with
211 * the same number of arguments) is returned containing the matched
212 * strings, or NULL otherwise.
214 * If @r is NULL, the result of the previous regex matching
215 * is returned as a two element array. The first element will contain
216 * the character offset of the matching and the second the number of
217 * characters matched. If the previous regex was unsuccessful, NULL
218 * is returned.
219 * [Regular Expressions]
221 mpdm_t mpdm_regex(const mpdm_t v, const mpdm_t r, int offset)
223 mpdm_t w = NULL;
225 mpdm_ref(r);
226 mpdm_ref(v);
228 /* special case: if r is NULL, return previous match */
229 if (r == NULL) {
230 /* if previous regex was successful... */
231 if (mpdm_regex_offset != -1) {
232 w = MPDM_A(2);
234 mpdm_ref(w);
235 mpdm_aset(w, MPDM_I(mpdm_regex_offset), 0);
236 mpdm_aset(w, MPDM_I(mpdm_regex_size), 1);
237 mpdm_unrefnd(w);
240 else
241 if (v != NULL) {
242 if (r->flags & MPDM_MULTIPLE) {
243 int n;
244 mpdm_t t;
246 /* multiple value; try sequentially all regexes,
247 moving the offset forward */
249 w = MPDM_A(0);
250 mpdm_ref(w);
252 for (n = 0; n < mpdm_size(r); n++) {
253 t = mpdm_regex(v, mpdm_aget(r, n), offset);
255 if (t == NULL)
256 break;
258 /* found; store and move forward */
259 mpdm_push(w, t);
260 offset = mpdm_regex_offset + mpdm_regex_size;
263 mpdm_unrefnd(w);
265 else {
266 wchar_t *global;
268 /* takes pointer to 'global' flag */
269 if ((global = regex_flags(r)) !=NULL)
270 global = wcschr(global, 'g');
272 if (global !=NULL) {
273 mpdm_t t;
275 /* match sequentially until done */
276 w = MPDM_A(0);
277 mpdm_ref(w);
279 while ((t = regex1(r, v, offset)) != NULL) {
280 mpdm_push(w, t);
282 offset = mpdm_regex_offset + mpdm_regex_size;
285 mpdm_unrefnd(w);
287 else
288 w = regex1(r, v, offset);
292 mpdm_unref(v);
293 mpdm_unref(r);
295 return w;
299 static mpdm_t expand_ampersands(const mpdm_t s, const mpdm_t t)
300 /* substitutes all unescaped ampersands in s with t */
302 const wchar_t *sptr = mpdm_string(s);
303 wchar_t *wptr;
304 wchar_t *optr = NULL;
305 int osize = 0;
306 mpdm_t r = NULL;
308 mpdm_ref(s);
309 mpdm_ref(t);
311 if (s != NULL) {
312 while ((wptr = wcschr(sptr, L'\\')) != NULL ||
313 (wptr = wcschr(sptr, L'&')) != NULL) {
314 int n = wptr - sptr;
316 /* add the leading part */
317 optr = mpdm_pokewsn(optr, &osize, sptr, n);
319 if (*wptr == L'\\') {
320 if (*(wptr + 1) == L'&' || *(wptr + 1) == L'\\')
321 wptr++;
323 optr = mpdm_pokewsn(optr, &osize, wptr, 1);
325 else
326 if (*wptr == '&')
327 optr = mpdm_pokev(optr, &osize, t);
329 sptr = wptr + 1;
332 /* add the rest of the string */
333 optr = mpdm_pokews(optr, &osize, sptr);
334 optr = mpdm_pokewsn(optr, &osize, L"", 1);
335 r = MPDM_ENS(optr, osize - 1);
338 mpdm_unref(t);
339 mpdm_unref(s);
341 return r;
346 * mpdm_sregex - Matches and substitutes a regular expression.
347 * @v: the value to be matched
348 * @r: the regular expression
349 * @s: the substitution string, hash or code
350 * @offset: offset from the start of v->data
352 * Matches a regular expression against a value, and substitutes the
353 * found substring with @s. Valid flags are 'i', for case-insensitive
354 * matching, and 'g', for global replacements (all ocurrences in @v
355 * will be replaced, instead of just the first found one).
357 * If @s is executable, it's executed with the matched part as
358 * the only argument and its return value is used as the
359 * substitution string.
361 * If @s is a hash, the matched string is used as a key to it and
362 * its value used as the substitution. If this value itself is
363 * executable, it's executed with the matched string as its only
364 * argument and its return value used as the substitution.
366 * If @r is NULL, returns the number of substitutions made in the
367 * previous call to mpdm_sregex() (can be zero if none was done).
369 * The global variables @mpdm_regex_offset and @mpdm_regex_size are
370 * set to the offset of the matched string and the size of the
371 * replaced string, respectively.
373 * Always returns a new string (either modified or an exact copy).
374 * [Regular Expressions]
376 mpdm_t mpdm_sregex(mpdm_t v, const mpdm_t r, const mpdm_t s, int offset)
378 mpdm_t cr;
379 wchar_t *optr = NULL;
380 int osize = 0;
381 mpdm_t o = NULL;
383 mpdm_ref(r);
384 mpdm_ref(v);
385 mpdm_ref(s);
387 if (r == NULL) {
388 /* return last count */
389 o = MPDM_I(mpdm_sregex_count);
391 else
392 if (v != NULL) {
393 /* compile the regex */
394 if ((cr = mpdm_regcomp(r)) != NULL) {
395 char *ptr;
396 int f, i = 0;
397 wchar_t *global;
398 mpdm_t t;
400 /* takes pointer to global flag */
401 if ((global = regex_flags(r)) !=NULL)
402 global = wcschr(global, 'g');
404 /* store the first part */
405 optr = mpdm_pokewsn(optr, &osize, v->data, offset);
407 /* convert to mbs */
408 ptr = mpdm_wcstombs((wchar_t *) v->data + offset, NULL);
410 /* reset count */
411 mpdm_sregex_count = 0;
412 mpdm_regex_offset = -1;
414 do {
415 regmatch_t rm;
417 /* try match */
418 f = !regexec((regex_t *) cr->data, ptr + i,
419 1, &rm, offset > 0 ? REG_NOTBOL : 0);
421 if (f) {
422 /* creates a string from the beginning
423 to the start of the match */
424 t = mpdm_ref(MPDM_NMBS(ptr + i, rm.rm_so));
425 optr = mpdm_pokev(optr, &osize, t);
427 /* store offset of substitution */
428 mpdm_regex_offset = mpdm_size(t) + offset;
430 mpdm_unref(t);
432 /* get the matched part */
433 t = MPDM_NMBS(ptr + i + rm.rm_so, rm.rm_eo - rm.rm_so);
435 /* is s an executable value? */
436 if (MPDM_IS_EXEC(s)) {
437 /* execute s, with t as argument */
438 t = mpdm_exec_1(s, t, NULL);
440 else
441 /* is s a hash? use match as key */
442 if (MPDM_IS_HASH(s)) {
443 mpdm_t v;
445 mpdm_ref(t);
446 v = mpdm_hget(s, t);
448 /* is the value executable? */
449 if (MPDM_IS_EXEC(v)) {
450 mpdm_t w = mpdm_ref(v);
452 v = mpdm_exec_1(w, t, NULL);
454 mpdm_unref(w);
457 mpdm_unref(t);
459 t = v;
461 else
462 t = expand_ampersands(s, t);
464 /* appends the substitution string */
465 mpdm_ref(t);
467 optr = mpdm_pokev(optr, &osize, t);
469 /* store size of substitution */
470 mpdm_regex_size = mpdm_size(t);
472 mpdm_unref(t);
474 i += rm.rm_eo;
476 /* one more substitution */
477 mpdm_sregex_count++;
479 } while (f && global);
481 /* no (more) matches; convert and append the rest */
482 t = MPDM_MBS(ptr + i);
483 optr = mpdm_pokev(optr, &osize, t);
485 free(ptr);
488 /* NULL-terminate */
489 optr = mpdm_pokewsn(optr, &osize, L"", 1);
491 o = MPDM_ENS(optr, osize - 1);
494 mpdm_unref(s);
495 mpdm_unref(v);
496 mpdm_unref(r);
498 return o;