Convert xml-protected spaces to real spaces
[odt2txt.git] / regex.c
blobc26eb563d12f399e53804f8a28033e377099d07b
/*
 * regex.c: String and regex operations for odt2txt
 *
 * Copyright (c) 2006-2009 Dennis Stosberg <dennis@stosberg.net>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.
 */
11 #include "mem.h"
12 #include "regex.h"
14 #define BUF_SZ 4096
/* Prototypes for file-local helpers that are used before their definition. */
static char *headline(char line, const char *buf, regmatch_t matches[],
		      size_t nmatch, size_t off);

static size_t charlen_utf8(const char *s);
20 static void print_regexp_err(int reg_errno, const regex_t *rx)
22 char *buf = ymalloc(BUF_SZ);
24 regerror(reg_errno, rx, buf, BUF_SZ);
25 fprintf(stderr, "%s\n", buf);
27 yfree(buf);
30 int regex_subst(STRBUF *buf,
31 const char *regex, int regopt,
32 const void *subst)
34 int r;
35 const char *bufp;
36 size_t off = 0;
37 const int i = 0;
38 int match_count = 0;
40 regex_t rx;
41 const size_t nmatches = 10;
42 regmatch_t matches[10];
44 r = regcomp(&rx, regex, REG_EXTENDED);
45 if (r) {
46 print_regexp_err(r, &rx);
47 exit(EXIT_FAILURE);
50 do {
51 if (off > strbuf_len(buf))
52 break;
54 bufp = strbuf_get(buf) + off;
56 #ifdef REG_STARTEND
57 matches[0].rm_so = 0;
58 matches[0].rm_eo = strbuf_len(buf) - off;
60 if (0 != regexec(&rx, bufp, nmatches, matches, REG_STARTEND))
61 #else
62 if (0 != regexec(&rx, bufp, nmatches, matches, 0))
63 #endif
64 break;
66 if (matches[i].rm_so != -1) {
67 char *s;
68 int subst_len;
70 if (regopt & _REG_EXEC) {
71 s = (*(char *(*)
72 (const char *buf, regmatch_t matches[],
73 size_t nmatch, size_t off))subst)
74 (strbuf_get(buf), matches, nmatches, off);
75 } else
76 s = (char*)subst;
78 subst_len = strbuf_subst(buf,
79 matches[i].rm_so + off,
80 matches[i].rm_eo + off,
81 s);
82 match_count++;
84 if (regopt & _REG_EXEC)
85 yfree(s);
87 off += matches[i].rm_so;
88 if (subst_len >= 0)
89 off += subst_len + 1;
91 } while (regopt & _REG_GLOBAL);
93 regfree(&rx);
94 return match_count;
97 int regex_rm(STRBUF *buf,
98 const char *regex, int regopt)
100 return regex_subst(buf, regex, regopt, "");
103 char *underline(char linechar, const char *str)
105 size_t i;
106 char *tmp;
107 STRBUF *line;
108 size_t charlen = charlen_utf8(str);
110 if (str[0] == '\0') {
111 tmp = ymalloc(1);
112 tmp[0] = '\0';
113 return tmp;
116 line = strbuf_new();
117 strbuf_append(line, str);
118 strbuf_append(line, "\n");
120 tmp = ymalloc(charlen);
121 for (i = 0; i < charlen; i++) {
122 tmp[i] = linechar;
124 strbuf_append_n(line, tmp, charlen);
125 yfree(tmp);
127 strbuf_append(line, "\n\n");
128 return strbuf_spit(line);
/*
 * regex_subst() callback helper: extract capture group 1 and return it
 * underlined with the given character.
 *
 * line:    character used for the underline
 * buf:     start of the buffer being searched
 * matches: match array; offsets are relative to buf + off
 * nmatch:  size of matches (unused)
 * off:     offset into buf at which the search started
 *
 * Returns the string produced by underline().
 */
static char *headline(char line, const char *buf, regmatch_t matches[],
		      size_t nmatch, size_t off)
{
	const regmatch_t *title = &matches[1];
	const size_t title_len = title->rm_eo - title->rm_so;
	char *text = ymalloc(title_len + 1);
	char *formatted;

	(void) nmatch;

	/* underline() needs a NUL-terminated copy of the group. */
	memcpy(text, buf + title->rm_so + off, title_len);
	text[title_len] = '\0';

	formatted = underline(line, text);
	yfree(text);

	return formatted;
}
/*
 * regex_subst() callback: format capture group 1 as a heading
 * underlined with '='.  All arguments are forwarded to headline().
 */
char *h1(const char *buf, regmatch_t matches[], size_t nmatch, size_t off)
{
	const char rule = '=';

	return headline(rule, buf, matches, nmatch, off);
}
/*
 * regex_subst() callback: format capture group 1 as a heading
 * underlined with '-'.  All arguments are forwarded to headline().
 */
char *h2(const char *buf, regmatch_t matches[], size_t nmatch, size_t off)
{
	const char rule = '-';

	return headline(rule, buf, matches, nmatch, off);
}
/*
 * regex_subst() callback: turn capture group 1 into an image
 * placeholder of the form "[-- Image: <group 1> --]".
 *
 * buf:     start of the buffer being searched
 * matches: match array; offsets are relative to buf + off
 * nmatch:  size of matches (unused)
 * off:     offset into buf at which the search started
 *
 * Returns a NUL-terminated string allocated with ymalloc().
 */
char *image(const char *buf, regmatch_t matches[], size_t nmatch, size_t off)
{
	const int i = 1;
	const char *prefix = "[-- Image: ";
	const char *postfix = " --]";
	size_t pr_len, po_len, len;
	char *match;

	(void) nmatch;

	pr_len = strlen(prefix);
	len = matches[i].rm_eo - matches[i].rm_so;
	/*
	 * Measure the postfix itself.  Using the prefix length here made
	 * the memcpy() below read past the end of the postfix literal.
	 */
	po_len = strlen(postfix);

	match = ymalloc(pr_len + len + po_len + 1);
	memcpy(match, prefix, pr_len);
	memcpy(match + pr_len, buf + matches[i].rm_so + off, len);
	memcpy(match + pr_len + len, postfix, po_len);
	match[pr_len + len + po_len] = '\0';

	return match;
}
182 static size_t charlen_utf8(const char *s)
184 size_t count = 0;
185 unsigned char *t = (unsigned char*) s;
186 while (*t != '\0') {
187 if (*t > 0x80)
188 t += utf8_length[*t - 0x80];
189 count++;
190 t++;
192 return count;
195 STRBUF *wrap(STRBUF *buf, int width)
197 const char *lf = "\n";
198 const size_t lflen = strlen(lf);
199 const char *bufp;
200 const char *last;
201 const char *lastspace = 0;
202 size_t linelen = 0;
203 STRBUF *out = strbuf_new();
205 bufp = strbuf_get(buf);
206 last = bufp;
208 if (width == -1) {
209 strbuf_append_n(out, strbuf_get(buf), strbuf_len(buf));
210 return out;
213 strbuf_append_n(out, lf, lflen);
214 while(bufp - strbuf_get(buf) < (ptrdiff_t)strbuf_len(buf)) {
215 if (*bufp == ' ')
216 lastspace = bufp;
217 else if (*bufp == '\n') {
218 strbuf_append_n(out, last, (size_t)(bufp - last));
219 do {
220 strbuf_append_n(out, lf, lflen);
221 } while (*++bufp == '\n');
222 lastspace = NULL;
224 while(*bufp == ' ') {
225 bufp++;
227 last = bufp;
228 linelen = 0;
231 if (NULL != lastspace && (int)linelen > width) {
232 strbuf_append_n(out, last, (size_t)(lastspace - last));
233 strbuf_append_n(out, lf, lflen);
234 last = lastspace;
235 lastspace = NULL;
236 linelen = (size_t)(bufp - last);
238 while(*last == ' ') {
239 last++;
241 if(last > bufp)
242 bufp = last;
245 bufp++;
246 linelen++;
247 if ((unsigned char)*bufp > 0x80)
248 bufp += utf8_length[(unsigned char)*bufp - 0x80];
250 strbuf_append_n(out, "\n", 1);
251 return out;