Fix false resource release <-> double free (Bob Tennent)..
[s-mailx.git] / filter.c
blob12080ffc7bf635d9120093b57484493438610a03
/*@ S-nail - a mail user agent derived from Berkeley Mail.
 *@ Filter objects.
 *
 * Copyright (c) 2013 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18 #undef n_FILE
19 #define n_FILE filter
21 #ifndef HAVE_AMALGAMATION
22 # include "nail.h"
23 #endif
/*
 * Quotation filter
 */

/*
 * TODO quotation filter: anticipate in future data: don't break if only WS
 * TODO or a LF escaping \ follows on the line (simply reuse the latter).
 */
34 #ifdef HAVE_QUOTE_FOLD
35 CTA(QUOTE_MAX > 3);
/* Parser states of the *quote-fold* variant of the quotation filter.
 * quoteflt_reset() selects _QF_CLEAN, which quoteflt_push() dispatches exactly
 * like _QF_PREFIX */
enum qf_state {
   _QF_CLEAN,     /* Freshly reset, nothing seen yet */
   _QF_PREFIX,    /* At the beginning of a line, collecting the quote prefix */
   _QF_DATA       /* Within the data part of a line */
};
/* Value carrier that is passed to the state machine handlers: .buf and .len
 * describe the input that has not been consumed yet; the handlers store the
 * advanced position back before they return */
struct qf_vc {
   struct quoteflt *self;
   char const *buf;
   size_t len;
};
/* Print out prefix and current quote */
static ssize_t _qf_dump_prefix(struct quoteflt *self);

/* Add one data character */
static ssize_t _qf_add_data(struct quoteflt *self, wchar_t wc);

/* State machine handlers */
static ssize_t _qf_state_prefix(struct qf_vc *vc);
static ssize_t _qf_state_data(struct qf_vc *vc);
59 static ssize_t
60 _qf_dump_prefix(struct quoteflt *self)
62 ssize_t rv;
63 size_t i;
64 NYD_ENTER;
66 if ((i = self->qf_pfix_len) > 0 && i != fwrite(self->qf_pfix, 1, i,
67 self->qf_os))
68 goto jerr;
69 rv = i;
71 if ((i = self->qf_currq.l) > 0 && i != fwrite(self->qf_currq.s, 1, i,
72 self->qf_os))
73 goto jerr;
74 rv += i;
75 jleave:
76 NYD_LEAVE;
77 return rv;
78 jerr:
79 rv = -1;
80 goto jleave;
83 static ssize_t
84 _qf_add_data(struct quoteflt *self, wchar_t wc)
86 char *save_b;
87 ui32_t save_l, save_w;
88 ssize_t rv = 0;
89 int w, l;
90 NYD_ENTER;
92 save_l = save_w = 0; /* silence cc */
93 save_b = NULL;
94 /* <newline> ends state */
95 if (wc == L'\n')
96 goto jflush;
97 if (wc == L'\r') /* TODO CR should be stripped in lower level!! */
98 goto jleave;
100 /* Unroll <tab> to spaces */
101 if (wc == L'\t') {
102 save_l = self->qf_datw;
103 save_w = (save_l + QUOTE_TAB_SPACES) & ~(QUOTE_TAB_SPACES - 1);
104 save_w -= save_l;
105 while (save_w-- > 0) {
106 ssize_t j = _qf_add_data(self, L' ');
107 if (j < 0) {
108 rv = j;
109 break;
111 rv += j;
113 goto jleave;
116 w = wcwidth(wc);
117 if (w == -1) {
118 jbad:
119 ++self->qf_datw;
120 self->qf_dat.s[self->qf_dat.l++] = '?';
121 } else {
122 l = wctomb(self->qf_dat.s + self->qf_dat.l, wc);
123 if (l < 0)
124 goto jbad;
125 self->qf_datw += (ui32_t)w;
126 self->qf_dat.l += (size_t)l;
129 /* TODO The last visual may excess (adjusted!) *qfold-max* if it's a wide;
130 * TODO place it on the next line, break before */
131 if (self->qf_datw >= self->qf_qfold_max) {
132 /* If we have seen a nice breakpoint during traversal, shuffle data
133 * around a bit so as to restore the trailing part after flushing */
134 if (self->qf_brkl > 0) {
135 save_w = self->qf_datw - self->qf_brkw;
136 save_l = self->qf_dat.l - self->qf_brkl;
137 save_b = self->qf_dat.s + self->qf_brkl + 2;
138 memmove(save_b, save_b - 2, save_l);
139 self->qf_dat.l = self->qf_brkl;
142 self->qf_dat.s[self->qf_dat.l++] = '\\';
143 jflush:
144 self->qf_dat.s[self->qf_dat.l++] = '\n';
145 rv = quoteflt_flush(self);
147 /* Restore takeovers, if any */
148 if (save_b != NULL) {
149 self->qf_brk_isws = FAL0;
150 self->qf_datw += save_w;
151 self->qf_dat.l = save_l;
152 memmove(self->qf_dat.s, save_b, save_l);
154 } else if (self->qf_datw >= self->qf_qfold_min && !self->qf_brk_isws) {
155 bool_t isws = iswspace(wc);
157 if ((isws && !self->qf_brk_isws) || self->qf_brkl == 0) {
158 self->qf_brkl = self->qf_dat.l;
159 self->qf_brkw = self->qf_datw;
160 self->qf_brk_isws = isws;
164 /* If state changed to prefix, perform full reset (note this implies that
165 * quoteflt_flush() performs too much work..) */
166 if (wc == '\n') {
167 self->qf_state = _QF_PREFIX;
168 self->qf_wscnt = self->qf_datw = 0;
169 self->qf_currq.l = 0;
171 jleave:
172 NYD_LEAVE;
173 return rv;
176 static ssize_t
177 _qf_state_prefix(struct qf_vc *vc)
179 struct quoteflt *self;
180 ssize_t rv;
181 char const *buf;
182 size_t len, i;
183 wchar_t wc;
184 NYD_ENTER;
186 self = vc->self;
187 rv = 0;
189 for (buf = vc->buf, len = vc->len; len > 0;) {
190 /* xxx NULL BYTE! */
191 i = mbrtowc(&wc, buf, len, self->qf_mbps);
192 if (i == (size_t)-1) {
193 /* On hard error, don't modify mbstate_t and step one byte */
194 self->qf_mbps[0] = self->qf_mbps[1];
195 ++buf;
196 --len;
197 self->qf_wscnt = 0;
198 continue;
200 self->qf_mbps[1] = self->qf_mbps[0];
201 if (i == (size_t)-2) {
202 /* Redundant shift sequence, out of buffer */
203 len = 0;
204 break;
206 buf += i;
207 len -= i;
209 if (wc == L'\n')
210 goto jfin;
211 if (iswspace(wc)) {
212 ++self->qf_wscnt;
213 continue;
215 if (i == 1 && ISQUOTE(wc)) {
216 self->qf_wscnt = 0;
217 if (self->qf_currq.l >= QUOTE_MAX - 3) {
218 self->qf_currq.s[QUOTE_MAX - 3] = '.';
219 self->qf_currq.s[QUOTE_MAX - 2] = '.';
220 self->qf_currq.s[QUOTE_MAX - 1] = '.';
221 self->qf_currq.l = QUOTE_MAX;
222 } else
223 self->qf_currq.s[self->qf_currq.l++] = buf[-1];
224 continue;
227 /* The quote is parsed and compressed; dump it */
228 jfin:
229 self->qf_state = _QF_DATA;
230 /* Overtake WS to the current quote in order to preserve it for eventual
231 * necessary follow lines, too */
232 /* TODO we de-facto "normalize" to ASCII SP here which MESSES tabs!! */
233 while (self->qf_wscnt-- > 0 && self->qf_currq.l < QUOTE_MAX)
234 self->qf_currq.s[self->qf_currq.l++] = ' ';
235 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
236 self->qf_wscnt = 0;
237 rv = _qf_add_data(self, wc);
238 break;
241 vc->buf = buf;
242 vc->len = len;
243 NYD_LEAVE;
244 return rv;
247 static ssize_t
248 _qf_state_data(struct qf_vc *vc)
250 struct quoteflt *self;
251 ssize_t rv;
252 char const *buf;
253 size_t len, i;
254 wchar_t wc;
255 NYD_ENTER;
257 self = vc->self;
258 rv = 0;
260 for (buf = vc->buf, len = vc->len; len > 0;) {
261 /* xxx NULL BYTE! */
262 i = mbrtowc(&wc, buf, len, self->qf_mbps);
263 if (i == (size_t)-1) {
264 /* On hard error, don't modify mbstate_t and step one byte */
265 self->qf_mbps[0] = self->qf_mbps[1];
266 ++buf;
267 --len;
268 continue;
270 self->qf_mbps[1] = self->qf_mbps[0];
271 if (i == (size_t)-2) {
272 /* Redundant shift sequence, out of buffer */
273 len = 0;
274 break;
276 buf += i;
277 len -= i;
279 { ssize_t j = _qf_add_data(self, wc);
280 if (j < 0) {
281 rv = j;
282 break;
284 rv += j;
287 if (self->qf_state != _QF_DATA)
288 break;
291 vc->buf = buf;
292 vc->len = len;
293 NYD_LEAVE;
294 return rv;
296 #endif /* HAVE_QUOTE_FOLD */
298 FL struct quoteflt *
299 quoteflt_dummy(void) /* TODO LEGACY (until filters are plugged when needed) */
301 static struct quoteflt qf_i;
303 return &qf_i;
306 FL void
307 quoteflt_init(struct quoteflt *self, char const *prefix)
309 #ifdef HAVE_QUOTE_FOLD
310 char *xcp, *cp;
311 #endif
312 NYD_ENTER;
314 memset(self, 0, sizeof *self);
316 if ((self->qf_pfix = prefix) != NULL)
317 self->qf_pfix_len = (ui32_t)strlen(prefix);
319 /* Check wether the user wants the more fancy quoting algorithm */
320 /* TODO *quote-fold*: QUOTE_MAX may excess it! */
321 #ifdef HAVE_QUOTE_FOLD
322 if (self->qf_pfix_len > 0 && (cp = ok_vlook(quote_fold)) != NULL) {
323 ui32_t qmin, qmax = (ui32_t)strtol(cp, &xcp, 10);
324 /* These magic values ensure we don't bail :) */
325 if (qmax < self->qf_pfix_len + 6)
326 qmax = self->qf_pfix_len + 6;
327 --qmax; /* The newline escape */
328 if (cp == xcp || *xcp == '\0')
329 qmin = (qmax >> 1) + (qmax >> 2) + (qmax >> 5);
330 else {
331 qmin = (ui32_t)strtol(xcp + 1, NULL, 10);
332 if (qmin < qmax >> 1)
333 qmin = qmax >> 1;
334 else if (qmin > qmax - 2)
335 qmin = qmax - 2;
337 self->qf_qfold_min = qmin;
338 self->qf_qfold_max = qmax;
340 /* Add pad for takeover copies, backslash and newline */
341 self->qf_dat.s = salloc((qmax + 3) * mb_cur_max);
342 self->qf_currq.s = salloc((QUOTE_MAX + 1) * mb_cur_max);
344 #endif
345 NYD_LEAVE;
348 FL void
349 quoteflt_destroy(struct quoteflt *self) /* xxx inline */
351 NYD_ENTER;
352 UNUSED(self);
353 NYD_LEAVE;
356 FL void
357 quoteflt_reset(struct quoteflt *self, FILE *f) /* xxx inline */
359 NYD_ENTER;
360 self->qf_os = f;
361 #ifdef HAVE_QUOTE_FOLD
362 self->qf_state = _QF_CLEAN;
363 self->qf_dat.l =
364 self->qf_currq.l = 0;
365 memset(self->qf_mbps, 0, sizeof self->qf_mbps);
366 #endif
367 NYD_LEAVE;
370 FL ssize_t
371 quoteflt_push(struct quoteflt *self, char const *dat, size_t len)
373 /* (xxx Ideally the actual push() [and flush()] would be functions on their
374 * xxx own, via indirect vtbl call ..) */
375 ssize_t rv = 0;
376 NYD_ENTER;
378 if (len == 0)
379 goto jleave;
381 /* Bypass? XXX Finally, this filter simply should not be used, then */
382 if (self->qf_pfix_len == 0) {
383 if (len != fwrite(dat, 1, len, self->qf_os))
384 goto jerr;
385 rv = len;
387 /* Normal: place *indentprefix* at every BOL */
388 else
389 #ifdef HAVE_QUOTE_FOLD
390 if (self->qf_qfold_max == 0)
391 #endif
393 void *vp;
394 size_t ll;
395 bool_t pxok = (self->qf_qfold_min != 0);
397 for (;;) {
398 if (!pxok) {
399 ll = self->qf_pfix_len;
400 if (ll != fwrite(self->qf_pfix, 1, ll, self->qf_os))
401 goto jerr;
402 rv += ll;
403 pxok = TRU1;
406 /* xxx Strictly speaking this is invalid, because only `/' and `.' are
407 * xxx mandated by POSIX.1-2008 as "invariant across all locales
408 * xxx supported"; though there is no charset known which uses this
409 * xxx control char as part of a multibyte character; note that S-nail
410 * XXX (and the Mail codebase as such) do not support EBCDIC */
411 if ((vp = memchr(dat, '\n', len)) == NULL)
412 ll = len;
413 else {
414 pxok = FAL0;
415 ll = PTR2SIZE((char*)vp - dat) + 1;
418 if (ll != fwrite(dat, sizeof *dat, ll, self->qf_os))
419 goto jerr;
420 rv += ll;
421 if ((len -= ll) == 0)
422 break;
423 dat += ll;
426 self->qf_qfold_min = pxok;
428 /* Overly complicated, though still only line-per-line: *quote-fold*.
429 * - If .qf_currq.l is 0, then we are in a clean state. Reset .qf_mbps;
430 * TODO note this means we assume that lines start with reset escape seq,
431 * TODO but i don't think this is any worse than what we currently do;
432 * TODO in 15.0, with the value carrier, we should carry conversion states
433 * TODO all along, only resetting on error (or at words for header =???=);
434 * TODO this still is weird for error handling, but we need to act more
435 * TODO stream-alike (though in practice i don't think cross-line states
436 * TODO can be found, because of compatibility reasons; however, being
437 * TODO a problem rather than a solution is not a good thing (tm))
438 * - Lookout for a newline */
439 #ifdef HAVE_QUOTE_FOLD
440 else {
441 struct qf_vc vc;
442 ssize_t i;
444 vc.self = self;
445 vc.buf = dat;
446 vc.len = len;
447 while (vc.len > 0) {
448 switch (self->qf_state) {
449 case _QF_CLEAN:
450 case _QF_PREFIX:
451 i = _qf_state_prefix(&vc);
452 break;
453 default: /* silence cc (`i' unused) */
454 case _QF_DATA:
455 i = _qf_state_data(&vc);
456 break;
458 if (i < 0)
459 goto jerr;
460 rv += i;
463 #endif /* HAVE_QUOTE_FOLD */
465 jleave:
466 NYD_LEAVE;
467 return rv;
468 jerr:
469 rv = -1;
470 goto jleave;
473 FL ssize_t
474 quoteflt_flush(struct quoteflt *self)
476 ssize_t rv = 0;
477 NYD_ENTER;
478 UNUSED(self);
480 #ifdef HAVE_QUOTE_FOLD
481 if (self->qf_dat.l > 0) {
482 rv = _qf_dump_prefix(self);
483 if (rv >= 0) {
484 size_t i = self->qf_dat.l;
485 if (i == fwrite(self->qf_dat.s, 1, i, self->qf_os))
486 rv += i;
487 else
488 rv = -1;
489 self->qf_dat.l = 0;
490 self->qf_brk_isws = FAL0;
491 self->qf_wscnt = self->qf_brkl = self->qf_brkw = 0;
492 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
495 #endif
496 NYD_LEAVE;
497 return rv;
/*
 * HTML tagsoup filter
 * TODO . Numeric &#NO; entities should also be treated by struct hf_ent
 * TODO . Yes, we COULD support CSS based quoting when we'd check type="quote"
 * TODO   (nonstandard) and watch out for style="gmail_quote" (or so, VERY
 * TODO   nonstandard) and tracking a stack of such elements (to be popped
 * TODO   once the closing element is seen).  Then, after writing a newline,
 * TODO   place sizeof(stack) ">"s first.  But aren't these HTML mails rude?
 * TODO Interlocking and non-well-formed data will break us down
 */
510 #ifdef HAVE_FILTER_HTML_TAGSOUP
/* Line length limits of the HTML filter */
enum hf_limits {
   _HF_MINLEN = 10,  /* Minimum line length (can't really be smaller) */
   _HF_BRKSUB = 8    /* Start considering line break MAX - BRKSUB */
};
/* State bits of the HTML filter (struct htmlflt.hf_flags) */
enum hf_flags {
   _HF_UTF8 = 1<<0,     /* Data is in UTF-8 */
   _HF_ERROR = 1<<1,    /* A hard error occurred, bail as soon as possible */
   _HF_NOPUT = 1<<2,    /* (In a tag,) Don't generate output */
   _HF_IGN = 1<<3,      /* Ignore mode on */
   _HF_ANY = 1<<4,      /* Yet seen just any output */
   _HF_PRE = 1<<5,      /* In <pre>formatted mode */
   _HF_ENT = 1<<6,      /* Currently parsing an entity */
   _HF_BLANK = 1<<7,    /* Whitespace last */
   _HF_HREF = 1<<8,     /* External <a href=> was the last href seen */

   _HF_NL_1 = 1<<9,     /* One \n seen */
   _HF_NL_2 = 2<<9,     /* We have produced an all empty line */
   _HF_NL_MASK = _HF_NL_1 | _HF_NL_2
};
/* Negative values of struct htmlflt_tag.hft_act; all other values are plain
 * characters that are to be output for the tag */
enum hf_special_actions {
   _HFSA_NEEDSEP = -1,  /* Need an empty line (paragraph separator) */
   _HFSA_NEEDNL = -2,   /* Need a new line start (table row) */
   _HFSA_IGN = -3,      /* Things like <style>..</style>, <script>.. */
   _HFSA_PRE = -4,      /* <pre>.. */
   _HFSA_PRE_END = -5,
   _HFSA_IMG = -6,      /* <img> */
   _HFSA_HREF = -7,     /* <a>.. */
   _HFSA_HREF_END = -8
};
/* Bits of struct hf_ent.hfe_flags */
enum hf_entity_flags {
   _HFE_HAVE_UNI = 1<<6,   /* Have a Unicode replacement character */
   _HFE_HAVE_CSTR = 1<<7,  /* Have a string replacement */
   /* We store the length of the entity name in the flags, too */
   _HFE_LENGTH_MASK = (1<<6) - 1
};
551 struct htmlflt_href {
552 struct htmlflt_href *hfh_next;
553 ui32_t hfh_no; /* Running sequence */
554 ui32_t hfh_len; /* of .hfh_dat */
555 char hfh_dat[VFIELD_SIZE(0)];
558 struct htmlflt_tag {
559 si32_t hft_act; /* char or hf_special_actions */
560 ui8_t hft_len; /* Useful bytes in (NUL terminated) .hft_tag */
561 char const hft_tag[11]; /* Tag less < and > surroundings (TR, /TR, ..) */
563 CTA(SIZEOF_FIELD(struct htmlflt_tag, hft_tag) < LINESIZE); /* .hf_ign_tag */
565 struct hf_ent {
566 ui8_t hfe_flags; /* enum hf_entity_flags plus length of .hfe_ent */
567 char hfe_c; /* Plain replacement character */
568 ui16_t hfe_uni; /* Unicode codepoint if _HFE_HAVE_UNI */
569 char hfe_cstr[5]; /* _HFE_HAVE_CSTR (e.g., &hellip; -> ...) */
570 char const hfe_ent[7]; /* Entity less & and ; surroundings */
573 /* Tag list; not binary searched :(, so try to take care a bit */
574 static struct htmlflt_tag const _hf_tags[] = {
575 # undef _X
576 # define _X(S,A) { A, sizeof(S) -1, S }
578 _X("P", _HFSA_NEEDSEP), _X("/P", _HFSA_NEEDNL),
579 _X("DIV", _HFSA_NEEDSEP), _X("/DIV", _HFSA_NEEDNL),
580 _X("TR", _HFSA_NEEDNL),
581 _X("/TH", '\t'),
582 _X("/TD", '\t'),
583 _X("A", _HFSA_HREF), _X("/A", _HFSA_HREF_END),
584 _X("IMG", _HFSA_IMG),
585 _X("IT", _HFSA_NEEDNL),
586 _X("BR", '\n'),
587 _X("PRE", _HFSA_PRE), _X("/PRE", _HFSA_PRE_END),
588 _X("DL", _HFSA_NEEDSEP),
589 _X("DT", _HFSA_NEEDNL),
590 _X("TITLE", _HFSA_NEEDSEP), /*_X("/TITLE", '\n'),*/
591 _X("H1", _HFSA_NEEDSEP), /*_X("/H1", '\n'),*/
592 _X("H2", _HFSA_NEEDSEP), /*_X("/H2", '\n'),*/
593 _X("H3", _HFSA_NEEDSEP), /*_X("/H3", '\n'),*/
594 _X("H4", _HFSA_NEEDSEP), /*_X("/H4", '\n'),*/
595 _X("H5", _HFSA_NEEDSEP), /*_X("/H5", '\n'),*/
596 _X("H6", _HFSA_NEEDSEP), /*_X("/H6", '\n'),*/
598 _X("STYLE", _HFSA_IGN),
599 _X("SCRIPT", _HFSA_IGN),
601 # undef _X
604 /* Entity list; not binary searched.. */
605 static struct hf_ent const _hf_ents[] = {
606 # undef _X
607 # undef _XU
608 # undef _XS
609 # undef _XUS
610 # define _X(E,C) {(sizeof(E) -1), C, 0x0u, "", E}
611 # define _XU(E,C,U) {(sizeof(E) -1) | _HFE_HAVE_UNI, C, U, "", E}
612 # define _XS(E,S) {(sizeof(E) -1) | _HFE_HAVE_CSTR, '\0', 0x0u, S "\0", E}
613 # define _XSU(E,S,U) \
614 {(sizeof(E) -1) | _HFE_HAVE_UNI | _HFE_HAVE_CSTR, '\0', U, S "\0", E}
616 _X("quot", '"'),
617 _X("amp", '&'),
618 _X("lt", '<'), _X("gt", '>'),
620 _XU("nbsp", ' ', 0x0020 /* Note: not 0x00A0 seems to be better for us */),
621 _XSU("hellip", "...", 0x2026),
622 _XSU("mdash", "---", 0x2014), _XSU("ndash", "--", 0x2013),
623 _XSU("laquo", "<<", 0x00AB), _XSU("raquo", ">>", 0x00BB),
624 _XSU("lsaquo", "<", 0x2039), _XSU("rsaquo", ">", 0x203A),
625 _XSU("lsquo", "'", 0x2018), _XSU("rsquo", "'", 0x2019),
626 _XSU("ldquo", "\"", 0x201C), _XSU("rdquo", "\"", 0x201D),
627 _XSU("uarr", "^|", 0x2191), _XSU("darr", "|v", 0x2193),
629 _XSU("cent", "CENT", 0x00A2),
630 _XSU("copy", "(C)", 0x00A9),
631 _XSU("euro", "EUR", 0x20AC),
632 _XSU("infin", "INFY", 0x221E),
633 _XSU("pound", "GBP", 0x00A3),
634 _XSU("reg", "(R)", 0x00AE),
635 _XSU("sect", "S:", 0x00A7),
636 _XSU("yen", "JPY", 0x00A5),
638 /* German umlauts */
639 _XSU("Auml", "Ae", 0x00C4), _XSU("auml", "ae", 0x00E4),
640 _XSU("Ouml", "Oe", 0x00D6), _XSU("ouml", "oe", 0x00F6),
641 _XSU("Uuml", "Ue", 0x00DC), _XSU("uuml", "ue", 0x00FC),
642 _XSU("szlig", "ss", 0x00DF)
644 # undef _X
645 # undef _XU
646 # undef _XS
647 # undef _XSU
/* Real output */
static struct htmlflt * _hf_dump_hrefs(struct htmlflt *self);
static struct htmlflt * _hf_dump(struct htmlflt *self);
static struct htmlflt * _hf_store(struct htmlflt *self, char c);
# ifdef HAVE_NATCH_CHAR
static struct htmlflt * __hf_sync_mbstuff(struct htmlflt *self);
# endif

/* Virtual output */
static struct htmlflt * _hf_nl(struct htmlflt *self);
static struct htmlflt * _hf_nl_force(struct htmlflt *self);
static struct htmlflt * _hf_putc(struct htmlflt *self, char c);
static struct htmlflt * _hf_putc_premode(struct htmlflt *self, char c);
static struct htmlflt * _hf_puts(struct htmlflt *self, char const *cp);
static struct htmlflt * _hf_putbuf(struct htmlflt *self,
                           char const *cp, size_t len);

/* Try to locate a param'eter in >hf_bdat, store it (non-terminated!) or NULL */
static struct htmlflt * _hf_param(struct htmlflt *self, struct str *store,
                           char const *param);

/* Expand all entities in the given parameter */
static struct htmlflt * _hf_expand_all_ents(struct htmlflt *self,
                           struct str const *param);

/* Completely parsed over a tag / an entity, interpret that */
static struct htmlflt * _hf_check_tag(struct htmlflt *self, char const *s);
static struct htmlflt * _hf_check_ent(struct htmlflt *self, char const *s,
                           size_t l);

/* Input handler */
static ssize_t _hf_add_data(struct htmlflt *self,
                  char const *dat, size_t len);
684 static struct htmlflt *
685 _hf_dump_hrefs(struct htmlflt *self)
687 struct htmlflt_href *hhp;
688 NYD2_ENTER;
690 if (!(self->hf_flags & _HF_NL_2) && fputc('\n', self->hf_os) == EOF) {
691 self->hf_flags |= _HF_ERROR;
692 goto jleave;
695 /* Reverse the list */
696 for (hhp = self->hf_hrefs, self->hf_hrefs = NULL; hhp != NULL;) {
697 struct htmlflt_href *tmp = hhp->hfh_next;
698 hhp->hfh_next = self->hf_hrefs;
699 self->hf_hrefs = hhp;
700 hhp = tmp;
703 /* Then dump it */
704 while ((hhp = self->hf_hrefs) != NULL) {
705 self->hf_hrefs = hhp->hfh_next;
707 if (!(self->hf_flags & _HF_ERROR)) {
708 int w = fprintf(self->hf_os, " [%u] %.*s\n",
709 hhp->hfh_no, (int)hhp->hfh_len, hhp->hfh_dat);
710 if (w < 0)
711 self->hf_flags |= _HF_ERROR;
713 free(hhp);
716 self->hf_flags |= (fputc('\n', self->hf_os) == EOF)
717 ? _HF_ERROR : _HF_NL_1 | _HF_NL_2;
718 self->hf_href_dist = (ui32_t)realscreenheight >> 1;
719 jleave:
720 NYD2_LEAVE;
721 return self;
724 static struct htmlflt *
725 _hf_dump(struct htmlflt *self)
727 ui32_t f, l;
728 char c, *cp;
729 NYD2_ENTER;
731 f = self->hf_flags & ~_HF_BLANK;
732 l = self->hf_len;
733 cp = self->hf_line;
734 self->hf_mbwidth = self->hf_mboff = self->hf_last_ws = self->hf_len = 0;
736 for (c = '\0'; l > 0; --l) {
737 c = *cp++;
738 jput:
739 if (fputc(c, self->hf_os) == EOF) {
740 self->hf_flags = (f |= _HF_ERROR);
741 goto jleave;
745 if (c != '\n') {
746 f |= (f & _HF_NL_1) ? _HF_NL_2 : _HF_NL_1;
747 l = 1;
748 c = '\n';
749 goto jput;
751 self->hf_flags = f;
753 /* Check wether there are HREFs to dump; there is so much messy tagsoup out
754 * there that it seems best not to simply dump HREFs in each _dump(), but
755 * only with some gap, let's say half the real screen height */
756 if (--self->hf_href_dist < 0 && (f & _HF_NL_2) && self->hf_hrefs != NULL)
757 self = _hf_dump_hrefs(self);
758 jleave:
759 NYD2_LEAVE;
760 return self;
763 static struct htmlflt *
764 _hf_store(struct htmlflt *self, char c)
766 ui32_t f, l, i;
767 NYD2_ENTER;
769 assert(c != '\n');
771 f = self->hf_flags;
772 l = self->hf_len;
773 self->hf_line[l] = (c == '\t' ? ' ' : c);
774 self->hf_len = ++l;
775 if (blankspacechar(c)) {
776 if (c == '\t') {
777 i = 8 - ((l - 1) & 7); /* xxx magic tab width of 8 */
778 if (i > 0) {
780 self = _hf_store(self, ' ');
781 while (--i > 0);
782 goto jleave;
785 self->hf_last_ws = l;
786 } else if (/*c == '.' ||*/ c == ',' || c == ';' || c == '-')
787 self->hf_last_ws = l;
789 i = l;
790 # ifdef HAVE_NATCH_CHAR /* XXX This code is really ridiculous! */
791 if (mb_cur_max > 1) { /* XXX should mbrtowc() and THEN store, at least.. */
792 wchar_t wc;
793 int x = mbtowc(&wc, self->hf_line + self->hf_mboff, l - self->hf_mboff);
795 if (x > 0) {
796 self->hf_mboff += x;
797 if ((x = wcwidth(wc)) == -1)
798 x = 1;
799 else if (iswspace(wc))
800 self->hf_last_ws = l;
801 i = (self->hf_mbwidth += x);
802 } else {
803 if (x < 0) {
804 mbtowc(&wc, NULL, mb_cur_max);
805 if (UICMP(32, l - self->hf_mboff, >=, mb_cur_max)) { /* XXX */
806 ++self->hf_mboff;
807 ++self->hf_mbwidth;
810 i = self->hf_mbwidth;
813 # endif
815 /* Do we need to break the line? */
816 if (i >= self->hf_lmax - _HF_BRKSUB) {
817 ui32_t lim = self->hf_lmax >> 1;
819 /* Let's hope we saw a sane place to break this line! */
820 if (self->hf_last_ws >= lim) {
821 jput:
822 i = self->hf_len = self->hf_last_ws;
823 self = _hf_dump(self);
824 if ((self->hf_len = (l -= i)) > 0) {
825 self->hf_flags &= ~_HF_NL_MASK;
826 memmove(self->hf_line, self->hf_line + i, l);
827 # ifdef HAVE_NATCH_CHAR
828 __hf_sync_mbstuff(self);
829 # endif
831 goto jleave;
834 /* Any 7-bit characters? */
835 for (i = l; i-- >= lim;)
836 if (asciichar((c = self->hf_line[i]))) {
837 self->hf_last_ws = ++i;
838 goto jput;
839 } else if ((f & _HF_UTF8) && ((ui8_t)c & 0xC0) != 0x80) {
840 self->hf_last_ws = i;
841 goto jput;
844 /* Hard break necessary! xxx really badly done */
845 if (l >= self->hf_lmax - 1)
846 self = _hf_dump(self);
848 jleave:
849 NYD2_LEAVE;
850 return self;
# ifdef HAVE_NATCH_CHAR
/* Recalculate the multibyte bookkeeping of the line buffer from scratch:
 * .hf_mboff becomes the number of bytes that could be walked over,
 * .hf_mbwidth their visual width.  Undecodable bytes and characters without
 * a width count as one column */
static struct htmlflt *
__hf_sync_mbstuff(struct htmlflt *self)
{
   wchar_t wc;
   char const *b;
   ui32_t o, w, l;
   NYD2_ENTER;

   b = self->hf_line;
   o = w = 0;
   l = self->hf_len;
   /* Start off with a reset of the conversion state */
   goto jumpin;

   while (l > 0) {
      int x = mbtowc(&wc, b, l);

      if (x == 0)
         break;

      if (x > 0) {
         b += x;
         l -= x;
         o += x;
         if ((x = wcwidth(wc)) == -1)
            x = 1;
         w += x;
         continue;
      }

      /* Bad, skip over a single character.. XXX very bad indeed */
      ++b;
      ++o;
      ++w;
      --l;
jumpin:
      mbtowc(&wc, NULL, mb_cur_max);
   }

   self->hf_mboff = o;
   self->hf_mbwidth = w;

   NYD2_LEAVE;
   return self;
}
# endif /* HAVE_NATCH_CHAR */
900 static struct htmlflt *
901 _hf_nl(struct htmlflt *self)
903 ui32_t f;
904 NYD2_ENTER;
906 if (!((f = self->hf_flags) & _HF_ERROR)) {
907 if (f & _HF_ANY) {
908 if ((f & _HF_NL_MASK) != _HF_NL_MASK)
909 self = _hf_dump(self);
910 } else
911 self->hf_flags = (f |= _HF_NL_MASK);
913 NYD2_LEAVE;
914 return self;
917 static struct htmlflt *
918 _hf_nl_force(struct htmlflt *self)
920 NYD2_ENTER;
921 if (!(self->hf_flags & _HF_ERROR))
922 self = _hf_dump(self);
923 NYD2_LEAVE;
924 return self;
927 static struct htmlflt *
928 _hf_putc(struct htmlflt *self, char c)
930 ui32_t f;
931 NYD2_ENTER;
933 if ((f = self->hf_flags) & _HF_ERROR)
934 goto jleave;
936 if (c == '\n') {
937 self = _hf_nl(self);
938 goto jleave;
939 } else if (c == ' ' || c == '\t') {
940 if ((f & _HF_BLANK) || self->hf_len == 0)
941 goto jleave;
942 f |= _HF_BLANK;
943 } else
944 f &= ~_HF_BLANK;
945 f &= ~_HF_NL_MASK;
946 self->hf_flags = (f |= _HF_ANY);
947 self = _hf_store(self, c);
948 jleave:
949 NYD2_LEAVE;
950 return self;
953 static struct htmlflt *
954 _hf_putc_premode(struct htmlflt *self, char c)
956 ui32_t f;
957 NYD2_ENTER;
959 if ((f = self->hf_flags) & _HF_ERROR) {
961 } else if (c == '\n')
962 self = _hf_nl_force(self);
963 else {
964 f &= ~_HF_NL_MASK;
965 self->hf_flags = (f |= _HF_ANY);
966 self = _hf_store(self, c);
968 NYD2_LEAVE;
969 return self;
972 static struct htmlflt *
973 _hf_puts(struct htmlflt *self, char const *cp)
975 char c;
976 NYD2_ENTER;
978 while ((c = *cp++) != '\0')
979 self = _hf_putc(self, c);
980 NYD2_LEAVE;
981 return self;
984 static struct htmlflt *
985 _hf_putbuf(struct htmlflt *self, char const *cp, size_t len)
987 NYD2_ENTER;
989 while (len-- > 0)
990 self = _hf_putc(self, *cp++);
991 NYD2_LEAVE;
992 return self;
995 static struct htmlflt *
996 _hf_param(struct htmlflt *self, struct str *store, char const *param)
998 char *cp, c;
999 size_t i;
1000 NYD2_ENTER;
1002 store->s = NULL;
1003 store->l = 0;
1005 if ((cp = UNCONST(asccasestr(self->hf_bdat, param))) == NULL)
1006 goto jleave;
1007 cp += strlen(param);
1009 for (;;) {
1010 if ((c = *cp++) == '\0')
1011 goto jleave;
1012 if (c == '=')
1013 break;
1015 if ((c = *cp) == '\0')
1016 goto jleave;
1018 if (c == '"' || c == '\'') {
1019 char quote = c;
1021 /* TODO oops i have forgotten wether backslash quoting is allowed in
1022 * TODO quoted HTML parameter values? not supporting that for now.. */
1023 if ((c = *++cp) == '\0' || c == quote)
1024 goto jleave;
1025 store->s = cp;
1027 while ((c = *++cp) != '\0' && c != quote)
1029 /* XXX ... and we simply ignore missing trailing " :> */
1030 } else {
1031 if (!whitechar(c))
1032 while ((c = *++cp) != '\0' && !whitechar(c))
1034 store->s = cp;
1036 i = PTR2SIZE(cp - store->s);
1038 /* Terrible tagsoup out there, e.g., groups.google.com produces href=""
1039 * parameter values prefixed and suffixed by newlines! Therefore trim the
1040 * value content TODO join into the parse step above! */
1041 for (cp = store->s; i > 0 && spacechar(*cp); ++cp, --i)
1043 store->s = cp;
1044 for (cp += i - 1; i > 0 && spacechar(*cp); --cp, --i)
1046 if ((store->l = i) == 0)
1047 store->s = NULL;
1048 jleave:
1049 NYD2_LEAVE;
1050 return self;
1053 static struct htmlflt *
1054 _hf_expand_all_ents(struct htmlflt *self, struct str const *param)
1056 char const *cp, *maxcp, *ep;
1057 char c;
1058 size_t i;
1059 NYD2_ENTER;
1061 for (cp = param->s, maxcp = cp + param->l; cp < maxcp;)
1062 if ((c = *cp++) != '&')
1063 self = _hf_putc(self, c);
1064 else {
1065 for (ep = cp--; ep < maxcp && (c = *ep++) != ';';)
1066 if (c == '\0') {
1067 self = _hf_puts(self, cp);
1068 goto jleave;
1070 if ((i = PTR2SIZE(ep - cp)) > 1)
1071 self = _hf_check_ent(self, cp, i);
1072 cp = ep;
1074 jleave:
1075 NYD2_LEAVE;
1076 return self;
1079 static struct htmlflt *
1080 _hf_check_tag(struct htmlflt *self, char const *s)
1082 char nobuf[32], c;
1083 struct str param;
1084 size_t i;
1085 struct htmlflt_tag const *hftp;
1086 ui32_t f;
1087 NYD2_ENTER;
1089 /* Extra check only */
1090 assert(s != NULL);
1091 if (*s != '<') {
1092 DBG( alert("HTML tagsoup filter _hf_check_tag() called on soup!"); )
1093 jput_as_is:
1094 self = _hf_puts(self, self->hf_bdat);
1095 goto jleave;
1098 for (++s, i = 0; (c = s[i]) != '\0' && c != '>' && !whitechar(c); ++i)
1099 /* Special massage for things like <br/>: after the slash only whitespace
1100 * may separate us from the closing right angle! */
1101 if (c == '/') {
1102 size_t j = i + 1;
1104 while ((c = s[j]) != '\0' && c != '>' && whitechar(c))
1105 ++j;
1106 if (c == '>')
1107 break;
1110 for (hftp = _hf_tags;;) {
1111 if (i == hftp->hft_len && !ascncasecmp(s, hftp->hft_tag, i)) {
1112 c = s[hftp->hft_len];
1113 if (c == '>' || c == '/' || whitechar(c))
1114 break;
1116 if (PTRCMP(++hftp, >=, _hf_tags + NELEM(_hf_tags)))
1117 goto jnotknown;
1120 f = self->hf_flags;
1121 switch (hftp->hft_act) {
1122 case _HFSA_PRE_END:
1123 f &= ~_HF_PRE;
1124 if (0) {
1125 /* FALLTHRU */
1126 case _HFSA_PRE:
1127 f |= _HF_PRE;
1129 self->hf_flags = f;
1130 /* FALLTHRU */
1132 case _HFSA_NEEDSEP:
1133 if (!(self->hf_flags & _HF_NL_2))
1134 self = _hf_nl(self);
1135 /* FALLTHRU */
1136 case _HFSA_NEEDNL:
1137 if (!(f & _HF_NL_1))
1138 self = _hf_nl(self);
1139 break;
1141 case _HFSA_IGN:
1142 self->hf_ign_tag = hftp;
1143 self->hf_flags = (f |= _HF_IGN | _HF_NOPUT);
1144 break;
1146 case _HFSA_IMG:
1147 self = _hf_param(self, &param, "alt");
1148 self = _hf_putc(self, '[');
1149 if (param.s == NULL) {
1150 param.s = UNCONST("IMG");
1151 param.l = 3;
1152 goto jimg_put;
1153 } /* else */ if (memchr(param.s, '&', param.l) != NULL)
1154 self = _hf_expand_all_ents(self, &param);
1155 else
1156 jimg_put:
1157 self = _hf_putbuf(self, param.s, param.l);
1158 self = _hf_putc(self, ']');
1159 break;
1161 case _HFSA_HREF:
1162 self = _hf_param(self, &param, "href");
1163 /* Ignore non-external links */
1164 if (param.s != NULL && *param.s != '#') {
1165 struct htmlflt_href *hhp = smalloc(sizeof(*hhp) -
1166 VFIELD_SIZEOF(struct htmlflt_href, hfh_dat) + param.l +1);
1168 hhp->hfh_next = self->hf_hrefs;
1169 hhp->hfh_no = ++self->hf_href_no;
1170 hhp->hfh_len = (ui32_t)param.l;
1171 memcpy(hhp->hfh_dat, param.s, param.l);
1173 snprintf(nobuf, sizeof nobuf, "[%u]", hhp->hfh_no);
1174 self->hf_flags = (f |= _HF_HREF);
1175 self->hf_hrefs = hhp;
1176 self = _hf_puts(self, nobuf);
1177 } else
1178 self->hf_flags = (f &= ~_HF_HREF);
1179 break;
1180 case _HFSA_HREF_END:
1181 if (f & _HF_HREF) {
1182 snprintf(nobuf, sizeof nobuf, "[/%u]", self->hf_href_no);
1183 self = _hf_puts(self, nobuf);
1185 break;
1187 default:
1188 c = (char)(hftp->hft_act & 0xFF);
1189 self = _hf_putc(self, c);
1190 break;
1191 case '\0':
1192 break;
1194 jleave:
1195 NYD2_LEAVE;
1196 return self;
1198 /* The problem is that even invalid tagsoup is widely used, without real
1199 * searching i have seen e-mail address in <N@H.D> notation, and more.
1200 * To protect us a bit look around and possibly write the content as such */
1201 jnotknown:
1202 switch (*s) {
1203 case '!':
1204 case '?':
1205 /* Ignore <!DOCTYPE, <!-- comments, <? PIs.. */
1206 goto jleave;
1207 case '>':
1208 /* Print out an empty tag as such */
1209 if (s[1] == '\0') {
1210 --s;
1211 goto jput_as_is;
1213 break;
1214 case '/':
1215 ++s;
1216 break;
1217 default:
1218 break;
1221 /* Also skip over : in order to suppress v:roundrect, w:anchorlock.. */
1222 while ((c = *s++) != '\0' && c != '>' && !whitechar(c) && c != ':')
1223 if (!asciichar(c) || punctchar(c)) {
1224 self = _hf_puts(self, self->hf_bdat);
1225 break;
1227 goto jleave;
1230 static struct htmlflt *
1231 _hf_check_ent(struct htmlflt *self, char const *s, size_t l)
1233 char nobuf[32];
1234 char const *s_save;
1235 size_t l_save;
1236 struct hf_ent const *hfep;
1237 long i;
1238 NYD2_ENTER;
1240 s_save = s;
1241 l_save = l;
1242 assert(*s == '&');
1243 assert(l > 0);
1244 assert(s[l - 1] == ';');
1245 ++s;
1246 l -= 2;
1248 /* Numeric entity, or try named search */
1249 if (*s == '#') {
1250 i = (*++s == 'x' ? 16 : 10);
1252 if ((i != 16 || (++s, --l) > 0) && l < sizeof(nobuf)) {
1253 memcpy(nobuf, s, l);
1254 nobuf[l] = '\0';
1255 i = strtol(nobuf, NULL, i);
1256 if (i <= 0x7F)
1257 self = _hf_putc(self, (char)i);
1258 else if (self->hf_flags & _HF_UTF8) {
1259 jputuni:
1260 l = n_utf32_to_utf8((ui32_t)i, nobuf);
1261 self = _hf_putbuf(self, nobuf, l);
1262 } else
1263 goto jeent;
1264 } else
1265 goto jeent;
1266 } else {
1267 ui32_t f = self->hf_flags, hf;
1269 for (hfep = _hf_ents; PTRCMP(hfep, <, _hf_ents + NELEM(_hf_ents)); ++hfep)
1270 if (l == ((hf = hfep->hfe_flags) & _HFE_LENGTH_MASK) &&
1271 !strncmp(s, hfep->hfe_ent, l)) {
1272 if ((hf & _HFE_HAVE_UNI) && (f & _HF_UTF8)) {
1273 i = hfep->hfe_uni;
1274 goto jputuni;
1275 } else if (hf & _HFE_HAVE_CSTR)
1276 self = _hf_puts(self, hfep->hfe_cstr);
1277 else
1278 self = _hf_putc(self, hfep->hfe_c);
1279 goto jleave;
1281 jeent:
1282 self = _hf_putbuf(self, s_save, l_save);
1284 jleave:
1285 NYD2_LEAVE;
1286 return self;
1289 static ssize_t
1290 _hf_add_data(struct htmlflt *self, char const *dat, size_t len)
1292 char c, *cp, *cp_max;
1293 bool_t hot;
1294 ssize_t rv = 0;
1295 NYD_ENTER;
1297 /* Final put request? */
1298 if (dat == NULL) {
1299 if (self->hf_len > 0 || self->hf_hrefs != NULL) {
1300 self = _hf_dump(self);
1301 if (self->hf_hrefs != NULL)
1302 self = _hf_dump_hrefs(self);
1303 rv = 1;
1305 goto jleave;
1308 /* Always ensure some initial buffer */
1309 if ((cp = self->hf_curr) != NULL)
1310 cp_max = self->hf_bmax;
1311 else {
1312 cp = self->hf_curr = self->hf_bdat = smalloc(LINESIZE);
1313 cp_max = self->hf_bmax = cp + LINESIZE -1; /* (Always room for NUL!) */
1315 hot = (cp != self->hf_bdat);
1317 for (rv = (ssize_t)len; len > 0; --len) {
1318 ui32_t f = self->hf_flags;
1320 if (f & _HF_ERROR)
1321 break;
1322 c = *dat++;
1324 /* Soup is really weird, and scripts may contain almost anything (and
1325 * newer CSS standards are also cryptic): therefore prefix the _HF_IGN
1326 * test and walk until we see the required end tag */
1327 /* TODO For real safety _HF_IGN soup condome would also need to know
1328 * TODO about quoted strings so that 'var i = "</script>";' couldn't
1329 * TODO fool it! We really want this mode also for _HF_NOPUT to be
1330 * TODO able to *gracefully* detect the tag-closing '>', but then if
1331 * TODO that is a single mechanism we should have made it! */
1332 if (f & _HF_IGN) {
1333 struct htmlflt_tag const *hftp = self->hf_ign_tag;
1334 size_t i;
1336 if (c == '<') {
1337 hot = TRU1;
1338 jcp_reset:
1339 cp = self->hf_bdat;
1340 } else if (c == '>') {
1341 if (hot) {
1342 if ((i = PTR2SIZE(cp - self->hf_bdat)) > 1 &&
1343 --i == hftp->hft_len &&
1344 !ascncasecmp(self->hf_bdat + 1, hftp->hft_tag, i))
1345 self->hf_flags = (f &= ~(_HF_IGN | _HF_NOPUT));
1346 hot = FAL0;
1347 goto jcp_reset;
1349 } else if (hot) {
1350 *cp++ = c;
1351 i = PTR2SIZE(cp - self->hf_bdat);
1352 if ((i == 1 && c != '/') || --i > hftp->hft_len) {
1353 hot = FAL0;
1354 goto jcp_reset;
1357 } else switch (c) {
1358 case '<':
1359 /* People are using & without &amp;ing it, ditto <; be aware */
1360 if (f & (_HF_NOPUT | _HF_ENT)) {
1361 f &= ~_HF_ENT;
1362 /* Special case "<!--" buffer content to deal with really weird
1363 * things that can be done with "<!--[if gte mso 9]>" syntax */
1364 if (PTR2SIZE(cp - self->hf_bdat) != 4 ||
1365 memcmp(self->hf_bdat, "<!--", 4)) {
1366 self->hf_flags = f;
1367 *cp = '\0';
1368 self = _hf_puts(self, self->hf_bdat);
1369 f = self->hf_flags;
1372 cp = self->hf_bdat;
1373 *cp++ = c;
1374 self->hf_flags = (f |= _HF_NOPUT);
1375 break;
1376 case '>':
1377 /* Weird tagsoup around, do we actually parse a tag? */
1378 if (!(f & _HF_NOPUT))
1379 goto jdo_c;
1380 cp[0] = c;
1381 cp[1] = '\0';
1382 f &= ~(_HF_NOPUT | _HF_ENT);
1383 self->hf_flags = f;
1384 self = _hf_check_tag(self, self->hf_bdat);
1385 *(cp = self->hf_bdat) = '\0'; /* xxx extra safety */
1386 /* Quick hack to get rid of redundant newline after <pre> XXX */
1387 if (!(f & _HF_PRE) && (self->hf_flags & _HF_PRE) &&
1388 len > 1 && *dat == '\n')
1389 ++dat, --len;
1390 break;
1392 case '\r': /* TODO CR should be stripped in lower level!! (Only B64!?!) */
1393 break;
1394 case '\n':
1395 /* End of line is not considered unless we are in PRE section.
1396 * However, in _HF_NOPUT mode we must be aware of tagsoup which uses
1397 * newlines for separating parameters */
1398 if (f & _HF_NOPUT)
1399 goto jdo_c;
1400 self = (f & _HF_PRE) ? _hf_nl_force(self) : _hf_putc(self, ' ');
1401 break;
1403 case '\t':
1404 if (!(f & _HF_PRE))
1405 c = ' ';
1406 /* FALLTHRU */
1407 default:
1408 jdo_c:
1409 /* If not currently parsing a tag and bypassing normal output.. */
1410 if (!(f & _HF_NOPUT)) {
1411 if (cntrlchar(c))
1412 break;
1413 if (c == '&') {
1414 cp = self->hf_bdat;
1415 *cp++ = c;
1416 self->hf_flags = (f |= _HF_NOPUT | _HF_ENT);
1417 } else if (f & _HF_PRE) {
1418 self = _hf_putc_premode(self, c);
1419 self->hf_flags &= ~_HF_BLANK;
1420 } else
1421 self = _hf_putc(self, c);
1422 } else if ((f & _HF_ENT) && c == ';') {
1423 cp[0] = c;
1424 cp[1] = '\0';
1425 f &= ~(_HF_NOPUT | _HF_ENT);
1426 self->hf_flags = f;
1427 self = _hf_check_ent(self, self->hf_bdat,
1428 PTR2SIZE(cp + 1 - self->hf_bdat));
1429 } else {
1430 /* We may need to grow the buffer */
1431 if (PTRCMP(cp + 42/2, >=, cp_max)) {
1432 size_t i = PTR2SIZE(cp - self->hf_bdat),
1433 m = PTR2SIZE(self->hf_bmax - self->hf_bdat) + LINESIZE;
1435 cp = self->hf_bdat = srealloc(self->hf_bdat, m);
1436 self->hf_bmax = cp + m -1;
1437 self->hf_curr = (cp += i);
1439 *cp++ = c;
1443 self->hf_curr = cp;
1444 jleave:
1445 NYD_LEAVE;
1446 return (self->hf_flags & _HF_ERROR) ? -1 : rv;
/*
 * TODO Because we don't support filter chains yet this filter will be run
 * TODO in a dedicated subprocess, driven via a special Popen() mode
 */
1453 static bool_t __hf_hadpipesig;
1454 static void
1455 __hf_onpipe(int signo)
1457 NYD_X; /* Signal handler */
1458 UNUSED(signo);
1459 __hf_hadpipesig = TRU1;
1462 FL int
1463 htmlflt_process_main(void)
1465 char buf[BUFFER_SIZE];
1466 struct htmlflt hf;
1467 size_t i;
1468 int rv;
1469 NYD_ENTER;
1471 __hf_hadpipesig = FAL0;
1472 safe_signal(SIGPIPE, &__hf_onpipe);
1474 htmlflt_init(&hf);
1475 htmlflt_reset(&hf, stdout);
1477 for (;;) {
1478 if ((i = fread(buf, sizeof(buf[0]), NELEM(buf), stdin)) == 0) {
1479 rv = !feof(stdin);
1480 break;
1483 if ((rv = __hf_hadpipesig))
1484 break;
1485 /* Just use this directly.. */
1486 if (htmlflt_push(&hf, buf, i) < 0) {
1487 rv = 1;
1488 break;
1491 if (rv == 0 && htmlflt_flush(&hf) < 0)
1492 rv = 1;
1494 htmlflt_destroy(&hf);
1496 rv |= __hf_hadpipesig;
1497 NYD_LEAVE;
1498 return rv;
1501 FL void
1502 htmlflt_init(struct htmlflt *self)
1504 NYD_ENTER;
1505 /* (Rather redundant though) */
1506 memset(self, 0, sizeof *self);
1507 NYD_LEAVE;
1510 FL void
1511 htmlflt_destroy(struct htmlflt *self)
1513 NYD_ENTER;
1514 htmlflt_reset(self, NULL);
1515 NYD_LEAVE;
1518 FL void
1519 htmlflt_reset(struct htmlflt *self, FILE *f)
1521 struct htmlflt_href *hfhp;
1522 NYD_ENTER;
1524 while ((hfhp = self->hf_hrefs) != NULL) {
1525 self->hf_hrefs = hfhp->hfh_next;
1526 free(hfhp);
1529 if (self->hf_bdat != NULL)
1530 free(self->hf_bdat);
1531 if (self->hf_line != NULL)
1532 free(self->hf_line);
1534 memset(self, 0, sizeof *self);
1536 if (f != NULL) {
1537 ui32_t sw = MAX(_HF_MINLEN, (ui32_t)scrnwidth);
1539 self->hf_line = smalloc((size_t)sw * mb_cur_max +1);
1540 self->hf_lmax = sw;
1542 if (options & OPT_UNICODE) /* TODO not truly generic */
1543 self->hf_flags = _HF_UTF8;
1544 self->hf_os = f;
1546 NYD_LEAVE;
1549 FL ssize_t
1550 htmlflt_push(struct htmlflt *self, char const *dat, size_t len)
1552 ssize_t rv;
1553 NYD_ENTER;
1555 rv = _hf_add_data(self, dat, len);
1556 NYD_LEAVE;
1557 return rv;
1560 FL ssize_t
1561 htmlflt_flush(struct htmlflt *self)
1563 ssize_t rv;
1564 NYD_ENTER;
1566 rv = _hf_add_data(self, NULL, 0);
1567 rv |= !fflush(self->hf_os) ? 0 : -1;
1568 NYD_LEAVE;
1569 return rv;
1571 #endif /* HAVE_FILTER_HTML_TAGSOUP */
1573 /* s-it-mode */