Merge branch 'topic/coverity-444-v14.9.11-1'
[s-mailx.git] / filter.c
blobe1581075535a1efcee15b4d4c9a5087ac193e443
1 /*@ S-nail - a mail user agent derived from Berkeley Mail.
2 *@ Filter objects.
4 * Copyright (c) 2013 - 2018 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
5 * SPDX-License-Identifier: ISC
7 * Permission to use, copy, modify, and/or distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 #undef n_FILE
20 #define n_FILE filter
22 #ifndef HAVE_AMALGAMATION
23 # include "nail.h"
24 #endif
27 * Quotation filter
31 * TODO quotation filter: anticipate in future data: don't break if only WS
32 * TODO or a LF escaping \ follows on the line (simply reuse the latter).
35 #ifdef HAVE_QUOTE_FOLD
36 n_CTAV(n_QUOTE_MAX > 3);
/* Parser states of the quote-fold filter; quoteflt_push() dispatches on
 * them for every chunk of input */
38 enum qf_state {
39 _QF_CLEAN, /* Set by quoteflt_reset(); dispatched like _QF_PREFIX */
40 _QF_PREFIX, /* Collecting the quote prefix at the beginning of a line */
41 _QF_DATA /* Collecting (and folding) the remainder of the line */
/* Cursor over the not yet consumed input of one quoteflt_push() call: the
 * state handlers read .buf/.len on entry and store the advanced position
 * back before they return */
44 struct qf_vc {
45 struct quoteflt *self;
46 char const *buf;
47 size_t len;
50 /* Print out prefix and current quote */
/* (Returns the number of bytes written, or -1 on a short write) */
51 static ssize_t _qf_dump_prefix(struct quoteflt *self);
53 /* Add one data character */
/* (Returns what the flushes it triggered have written, negative on error) */
54 static ssize_t _qf_add_data(struct quoteflt *self, wchar_t wc);
56 /* State machine handlers */
/* (Consume from vc->buf, update vc->buf/vc->len; negative return on error) */
57 static ssize_t _qf_state_prefix(struct qf_vc *vc);
58 static ssize_t _qf_state_data(struct qf_vc *vc);
60 static ssize_t
61 _qf_dump_prefix(struct quoteflt *self)
63 ssize_t rv;
64 size_t i;
65 NYD_ENTER;
67 if ((i = self->qf_pfix_len) > 0 && i != fwrite(self->qf_pfix, 1, i,
68 self->qf_os))
69 goto jerr;
70 rv = i;
72 if ((i = self->qf_currq.l) > 0 && i != fwrite(self->qf_currq.s, 1, i,
73 self->qf_os))
74 goto jerr;
75 rv += i;
76 jleave:
77 NYD_LEAVE;
78 return rv;
79 jerr:
80 rv = -1;
81 goto jleave;
/* Append one wide character of line data to .qf_dat and fold/flush the line
 * when necessary.
 * - A newline terminates the line: the buffer is flushed and the filter is
 *   reset to _QF_PREFIX state.
 * - A carriage return is dropped, a tabulator is unrolled to spaces.
 * - When the visual width reaches .qf_qfold_max the line is flushed with
 *   a reverse solidus / newline pair, preferably at a remembered breakpoint
 *   (.qf_brkl/.qf_brkw); data after the breakpoint is carried over.
 * Returns what quoteflt_flush() (and recursive calls) reported as written,
 * or a negative value on output error.
 * NOTE(review): characters for which wcwidth() returns 0 add bytes to
 * .qf_dat without advancing .qf_datw, so a long run of them does not seem to
 * be bounded by the buffer quoteflt_init() allocates -- confirm */
84 static ssize_t
85 _qf_add_data(struct quoteflt *self, wchar_t wc)
87 int w, l;
88 char *save_b;
89 ui32_t save_l, save_w;
90 ssize_t rv;
91 NYD_ENTER;
93 rv = 0;
94 save_l = save_w = 0; /* silence cc */
95 save_b = NULL;
97 /* <newline> ends state */
98 if (wc == L'\n') {
99 w = 0;
100 goto jflush;
102 if (wc == L'\r') /* TODO CR should be stripped in lower level!! */
103 goto jleave;
105 /* Unroll <tab> to spaces */
/* (Pads to the next multiple of n_QUOTE_TAB_SPACES; the mask arithmetic
 * presumably relies on that being a power of two -- confirm) */
106 if (wc == L'\t') {
107 save_l = self->qf_datw;
108 save_w = (save_l + n_QUOTE_TAB_SPACES) & ~(n_QUOTE_TAB_SPACES - 1);
109 save_w -= save_l;
110 while (save_w-- > 0) {
111 ssize_t j = _qf_add_data(self, L' ');
112 if (j < 0) {
113 rv = j;
114 break;
116 rv += j;
118 goto jleave;
121 /* To avoid that the last visual excesses *qfold-max*, which may happen for
122 * multi-column characters, use w as an indicator for this and move that
123 * thing to the next line */
124 w = wcwidth(wc);
125 if (w == -1) {
126 w = 0;
/* Unprintable or unconvertible: store a question mark, one column wide */
127 jbad:
128 ++self->qf_datw;
129 self->qf_dat.s[self->qf_dat.l++] = '?';
130 } else if (self->qf_datw > self->qf_qfold_max - w) {
/* Character would exceed the maximum: break first, w==-1 marks "retry" */
131 w = -1;
132 goto jneednl;
133 } else {
134 l = wctomb(self->qf_dat.s + self->qf_dat.l, wc);
135 if (l < 0)
136 goto jbad;
137 self->qf_datw += (ui32_t)w;
138 self->qf_dat.l += (size_t)l;
141 if (self->qf_datw >= self->qf_qfold_max) {
142 /* If we have seen a nice breakpoint during traversal, shuffle data
143 * around a bit so as to restore the trailing part after flushing */
144 jneednl:
145 if (self->qf_brkl > 0) {
/* Move the tail two bytes up to make room for the "\\\n" written below */
146 save_w = self->qf_datw - self->qf_brkw;
147 save_l = self->qf_dat.l - self->qf_brkl;
148 save_b = self->qf_dat.s + self->qf_brkl + 2;
149 memmove(save_b, save_b - 2, save_l);
150 self->qf_dat.l = self->qf_brkl;
153 self->qf_dat.s[self->qf_dat.l++] = '\\';
154 jflush:
155 self->qf_dat.s[self->qf_dat.l++] = '\n';
156 rv = quoteflt_flush(self);
158 /* Restore takeovers, if any */
159 if (save_b != NULL) {
160 self->qf_brk_isws = FAL0;
161 self->qf_datw += save_w;
162 self->qf_dat.l = save_l;
163 memmove(self->qf_dat.s, save_b, save_l);
165 } else if (self->qf_datw >= self->qf_qfold_min && !self->qf_brk_isws) {
/* Past the minimum width: remember possible breakpoints, whitespace wins;
 * non-whitespace ones are only taken below .qf_qfold_maxnws bytes */
166 bool_t isws = (iswspace(wc) != 0);
168 if (isws || !self->qf_brk_isws || self->qf_brkl == 0) {
169 if((self->qf_brk_isws = isws) ||
170 self->qf_brkl < self->qf_qfold_maxnws){
171 self->qf_brkl = self->qf_dat.l;
172 self->qf_brkw = self->qf_datw;
177 /* Did we hold this back to avoid qf_fold_max excess? Then do it now */
178 if(rv >= 0 && w == -1){
179 ssize_t j = _qf_add_data(self, wc);
180 if(j < 0)
181 rv = j;
182 else
183 rv += j;
185 /* If state changed to prefix, perform full reset (note this implies that
186 * quoteflt_flush() performs too much work..) */
187 else if (wc == '\n') {
188 self->qf_state = _QF_PREFIX;
189 self->qf_wscnt = self->qf_datw = 0;
190 self->qf_currq.l = 0;
192 jleave:
193 NYD_LEAVE;
194 return rv;
197 static ssize_t
198 _qf_state_prefix(struct qf_vc *vc)
200 struct quoteflt *self;
201 ssize_t rv;
202 char const *buf;
203 size_t len, i;
204 wchar_t wc;
205 NYD_ENTER;
207 self = vc->self;
208 rv = 0;
210 for (buf = vc->buf, len = vc->len; len > 0;) {
211 /* xxx NULL BYTE! */
212 i = mbrtowc(&wc, buf, len, self->qf_mbps);
213 if (i == (size_t)-1) {
214 /* On hard error, don't modify mbstate_t and step one byte */
215 self->qf_mbps[0] = self->qf_mbps[1];
216 ++buf;
217 --len;
218 self->qf_wscnt = 0;
219 continue;
221 self->qf_mbps[1] = self->qf_mbps[0];
222 if (i == (size_t)-2) {
223 /* Redundant shift sequence, out of buffer */
224 len = 0;
225 break;
227 buf += i;
228 len -= i;
230 if (wc == L'\n')
231 goto jfin;
232 if (iswspace(wc)) {
233 ++self->qf_wscnt;
234 continue;
236 if (i == 1 && n_uasciichar(wc) &&
237 strchr(self->qf_quote_chars, (char)wc) != NULL){
238 self->qf_wscnt = 0;
239 if (self->qf_currq.l >= n_QUOTE_MAX - 3) {
240 self->qf_currq.s[n_QUOTE_MAX - 3] = '.';
241 self->qf_currq.s[n_QUOTE_MAX - 2] = '.';
242 self->qf_currq.s[n_QUOTE_MAX - 1] = '.';
243 self->qf_currq.l = n_QUOTE_MAX;
244 } else
245 self->qf_currq.s[self->qf_currq.l++] = buf[-1];
246 continue;
249 /* The quote is parsed and compressed; dump it */
250 jfin:
251 self->qf_state = _QF_DATA;
252 /* Overtake WS to the current quote in order to preserve it for eventual
253 * necessary follow lines, too */
254 /* TODO we de-facto "normalize" to ASCII SP here which MESSES tabs!! */
255 while (self->qf_wscnt-- > 0 && self->qf_currq.l < n_QUOTE_MAX)
256 self->qf_currq.s[self->qf_currq.l++] = ' ';
257 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
258 self->qf_wscnt = 0;
259 rv = _qf_add_data(self, wc);
260 break;
263 vc->buf = buf;
264 vc->len = len;
265 NYD_LEAVE;
266 return rv;
269 static ssize_t
270 _qf_state_data(struct qf_vc *vc)
272 struct quoteflt *self;
273 ssize_t rv;
274 char const *buf;
275 size_t len, i;
276 wchar_t wc;
277 NYD_ENTER;
279 self = vc->self;
280 rv = 0;
282 for (buf = vc->buf, len = vc->len; len > 0;) {
283 /* xxx NULL BYTE! */
284 i = mbrtowc(&wc, buf, len, self->qf_mbps);
285 if (i == (size_t)-1) {
286 /* On hard error, don't modify mbstate_t and step one byte */
287 self->qf_mbps[0] = self->qf_mbps[1];
288 ++buf;
289 --len;
290 continue;
292 self->qf_mbps[1] = self->qf_mbps[0];
293 if (i == (size_t)-2) {
294 /* Redundant shift sequence, out of buffer */
295 len = 0;
296 break;
298 buf += i;
299 len -= i;
301 { ssize_t j = _qf_add_data(self, wc);
302 if (j < 0) {
303 rv = j;
304 break;
306 rv += j;
309 if (self->qf_state != _QF_DATA)
310 break;
313 vc->buf = buf;
314 vc->len = len;
315 NYD_LEAVE;
316 return rv;
318 #endif /* HAVE_QUOTE_FOLD */
320 FL struct quoteflt *
321 quoteflt_dummy(void) /* TODO LEGACY (until filters are plugged when needed) */
323 static struct quoteflt qf_i;
325 qf_i.qf_bypass = TRU1;
326 return &qf_i;
/* Initialize a quote filter object: zero it, remember prefix (may be NULL)
 * and its length, and the bypass flag.
 * With HAVE_QUOTE_FOLD, unless bypassed, and if *quote-fold* is set, its
 * value is parsed as up to three numbers (maximum width, minimum width,
 * maximum for non-whitespace breaks), each separated by a single character;
 * the values are clamped and the working buffers are allocated */
329 FL void
330 quoteflt_init(struct quoteflt *self, char const *prefix, bool_t bypass)
332 #ifdef HAVE_QUOTE_FOLD
333 char const *xcp, *cp;
334 #endif
335 NYD_ENTER;
337 memset(self, 0, sizeof *self);
339 if ((self->qf_pfix = prefix) != NULL)
340 self->qf_pfix_len = (ui32_t)strlen(prefix);
341 self->qf_bypass = bypass;
343 /* Check whether the user wants the more fancy quoting algorithm */
344 /* TODO *quote-fold*: n_QUOTE_MAX may excess it! */
345 #ifdef HAVE_QUOTE_FOLD
346 if (!bypass && (cp = ok_vlook(quote_fold)) != NULL) {
347 ui32_t qmax, qmaxnws, qmin;
349 /* These magic values ensure we don't bail */
/* (The maximum leaves at least six columns beyond the prefix) */
350 n_idec_ui32_cp(&qmax, cp, 10, &xcp);
351 if (qmax < self->qf_pfix_len + 6)
352 qmax = self->qf_pfix_len + 6;
353 qmaxnws = --qmax; /* The newline escape */
/* No second number: default the minimum to 1/2 + 1/4 + 1/32 of maximum */
354 if (cp == xcp || *xcp == '\0')
355 qmin = (qmax >> 1) + (qmax >> 2) + (qmax >> 5);
356 else {
/* &xcp[1] skips the one separator byte; clamp into [qmax/2, qmax-2] */
357 n_idec_ui32_cp(&qmin, &xcp[1], 10, &xcp);
358 if (qmin < qmax >> 1)
359 qmin = qmax >> 1;
360 else if (qmin > qmax - 2)
361 qmin = qmax - 2;
/* Optional third number; out-of-range values fall back to the maximum */
363 if (cp != xcp && *xcp != '\0') {
364 n_idec_ui32_cp(&qmaxnws, &xcp[1], 10, &xcp);
365 if (qmaxnws > qmax || qmaxnws < qmin)
366 qmaxnws = qmax;
369 self->qf_qfold_min = qmin;
370 self->qf_qfold_max = qmax;
371 self->qf_qfold_maxnws = qmaxnws;
372 self->qf_quote_chars = ok_vlook(quote_chars);
374 /* Add pad for takeover copies, reverse solidus and newline */
375 self->qf_dat.s = n_autorec_alloc((qmax + 3) * n_mb_cur_max);
376 self->qf_currq.s = n_autorec_alloc((n_QUOTE_MAX + 1) * n_mb_cur_max);
378 #endif
379 NYD_LEAVE;
382 FL void
383 quoteflt_destroy(struct quoteflt *self) /* xxx inline */
385 NYD_ENTER;
386 n_UNUSED(self);
387 NYD_LEAVE;
390 FL void
391 quoteflt_reset(struct quoteflt *self, FILE *f) /* xxx inline */
393 NYD_ENTER;
394 self->qf_os = f;
395 #ifdef HAVE_QUOTE_FOLD
396 self->qf_state = _QF_CLEAN;
397 self->qf_dat.l =
398 self->qf_currq.l = 0;
399 memset(self->qf_mbps, 0, sizeof self->qf_mbps);
400 #endif
401 NYD_LEAVE;
/* Feed len bytes of dat through the filter to the stream set via
 * quoteflt_reset().  Three modes:
 * - bypass: data is written as-is;
 * - plain prefixing (no *quote-fold* maximum configured): the prefix is
 *   written at each beginning of line;
 * - quote folding (HAVE_QUOTE_FOLD): data runs through the state machine.
 * Returns the number of bytes written, or -1 on error */
404 FL ssize_t
405 quoteflt_push(struct quoteflt *self, char const *dat, size_t len)
407 /* (xxx Ideally the actual push() [and flush()] would be functions on their
408 * xxx own, via indirect vtbl call ..) */
409 ssize_t rv = 0;
410 NYD_ENTER;
/* Remember whether this chunk ended with a newline */
412 self->qf_nl_last = (len > 0 && dat[len - 1] == '\n'); /* TODO HACK */
414 if (len == 0)
415 goto jleave;
417 /* Bypass? TODO Finally, this filter simply should not be used, then
418 * (TODO It supersedes prefix_write() or something) */
419 if (self->qf_bypass) {
420 if (len != fwrite(dat, 1, len, self->qf_os))
421 goto jerr;
422 rv = len;
424 /* Normal: place *indentprefix* at every BOL */
425 else
426 #ifdef HAVE_QUOTE_FOLD
427 if (self->qf_qfold_max == 0)
428 #endif
430 void *vp;
431 size_t ll;
/* In this mode .qf_qfold_min is otherwise unused and serves to carry, across
 * calls, whether the prefix has already been written for the current line */
432 bool_t pxok = (self->qf_qfold_min != 0);
434 for (;;) {
435 if (!pxok && (ll = self->qf_pfix_len) > 0) {
436 if (ll != fwrite(self->qf_pfix, 1, ll, self->qf_os))
437 goto jerr;
438 rv += ll;
439 pxok = TRU1;
442 /* xxx Strictly speaking this is invalid, because only `/' and `.' are
443 * xxx mandated by POSIX.1-2008 as "invariant across all locales
444 * xxx supported"; though there is no charset known which uses this
445 * xxx control char as part of a multibyte character; note that S-nail
446 * XXX (and the Mail codebase as such) do not support EBCDIC */
/* Write up to and including the next newline, or all that is left */
447 if ((vp = memchr(dat, '\n', len)) == NULL)
448 ll = len;
449 else {
450 pxok = FAL0;
451 ll = PTR2SIZE((char*)vp - dat) + 1;
454 if (ll != fwrite(dat, sizeof *dat, ll, self->qf_os))
455 goto jerr;
456 rv += ll;
457 if ((len -= ll) == 0)
458 break;
459 dat += ll;
462 self->qf_qfold_min = pxok;
464 /* Overly complicated, though still only line-per-line: *quote-fold*.
465 * - If .qf_currq.l is 0, then we are in a clean state. Reset .qf_mbps;
466 * TODO note this means we assume that lines start with reset escape seq,
467 * TODO but i don't think this is any worse than what we currently do;
468 * TODO in 15.0, with the value carrier, we should carry conversion states
469 * TODO all along, only resetting on error (or at words for header =???=);
470 * TODO this still is weird for error handling, but we need to act more
471 * TODO stream-alike (though in practice i don't think cross-line states
472 * TODO can be found, because of compatibility reasons; however, being
473 * TODO a problem rather than a solution is not a good thing (tm))
474 * - Lookout for a newline */
475 #ifdef HAVE_QUOTE_FOLD
476 else {
477 struct qf_vc vc;
478 ssize_t i;
480 vc.self = self;
481 vc.buf = dat;
482 vc.len = len;
/* The handlers consume from vc and return once the state changes or the
 * input is exhausted, so alternate between them until nothing is left */
483 while (vc.len > 0) {
484 switch (self->qf_state) {
485 case _QF_CLEAN:
486 case _QF_PREFIX:
487 i = _qf_state_prefix(&vc);
488 break;
489 default: /* silence cc (`i' unused) */
490 case _QF_DATA:
491 i = _qf_state_data(&vc);
492 break;
494 if (i < 0)
495 goto jerr;
496 rv += i;
499 #endif /* HAVE_QUOTE_FOLD */
501 jleave:
502 NYD_LEAVE;
503 return rv;
504 jerr:
505 rv = -1;
506 goto jleave;
509 FL ssize_t
510 quoteflt_flush(struct quoteflt *self)
512 ssize_t rv = 0;
513 NYD_ENTER;
514 n_UNUSED(self);
516 #ifdef HAVE_QUOTE_FOLD
517 if (self->qf_dat.l > 0) {
518 rv = _qf_dump_prefix(self);
519 if (rv >= 0) {
520 size_t i = self->qf_dat.l;
521 if (i == fwrite(self->qf_dat.s, 1, i, self->qf_os))
522 rv += i;
523 else
524 rv = -1;
525 self->qf_dat.l = 0;
526 self->qf_brk_isws = FAL0;
527 self->qf_wscnt = self->qf_brkl = self->qf_brkw = 0;
528 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
531 #endif
532 NYD_LEAVE;
533 return rv;
537 * HTML tagsoup filter TODO rewrite wchar_t based (require HAVE_C90AMEND1)
538 * TODO . Numeric &#NO; entities should also be treated by struct hf_ent
539 * TODO . Yes, we COULD support CSS based quoting when we'd check type="quote"
540 * TODO (nonstandard) and watch out for style="gmail_quote" (or so, VERY
541 * TODO nonstandard) and tracking a stack of such elements (to be popped
542 * TODO once the closing element is seen). Then, after writing a newline,
543 * TODO place sizeof(stack) ">"s first. But aren't these HTML mails rude?
544 * TODO Interlocking and non-well-formed data will break us down
546 #ifdef HAVE_FILTER_HTML_TAGSOUP
/* Line length related limits of the HTML filter, see _hf_store() */
548 enum hf_limits {
549 _HF_MINLEN = 10, /* Minimum line length (can't really be smaller) */
550 _HF_BRKSUB = 8 /* Start considering line break MAX - BRKSUB */
/* Bits of htmlflt.hf_flags */
553 enum hf_flags {
/* The low 16 bits hold a counter; _hf_store() uses it to decide how many
 * quote markers to place at the beginning of a line */
554 _HF_BQUOTE_MASK = 0xFFFFu,
555 _HF_UTF8 = 1u<<16, /* Data is in UTF-8 */
556 _HF_ERROR = 1u<<17, /* A hard error occurred, bail as soon as possible */
557 _HF_NOPUT = 1u<<18, /* (In a tag,) Don't generate output */
558 _HF_IGN = 1u<<19, /* Ignore mode on */
559 _HF_ANY = 1u<<20, /* Yet seen just any output */
560 _HF_PRE = 1u<<21, /* In <pre>formatted mode */
561 _HF_ENT = 1u<<22, /* Currently parsing an entity */
562 _HF_BLANK = 1u<<23, /* Whitespace last */
563 _HF_HREF = 1u<<24, /* External <a href=> was the last href seen */
/* Newline tracking: _hf_dump() sets _NL_1 first, then _NL_2 on top of it */
565 _HF_NL_1 = 1u<<25, /* One \n seen */
566 _HF_NL_2 = 2u<<25, /* We have produced an all empty line */
567 _HF_NL_MASK = _HF_NL_1 | _HF_NL_2
/* Values for htmlflt_tag.hft_act which are not plain characters; they are
 * all negative so that they can share the field with character values */
570 enum hf_special_actions {
571 _HFSA_NEEDSEP = -1, /* Need an empty line (paragraph separator) */
572 _HFSA_NEEDNL = -2, /* Need a new line start (table row) */
573 _HFSA_IGN = -3, /* Things like <style>..</style>, <script>.. */
574 _HFSA_PRE = -4, /* <pre>.. */
575 _HFSA_PRE_END = -5,
576 _HFSA_IMG = -6, /* <img> */
577 _HFSA_HREF = -7, /* <a>.. */
578 _HFSA_HREF_END = -8,
579 _HFSA_BQUOTE = -9, /* <blockquote>, interpreted as citation! */
580 _HFSA_BQUOTE_END = -10
/* Layout of hf_ent.hfe_flags: two flag bits on top of a 6-bit name length */
583 enum hf_entity_flags {
584 _HFE_HAVE_UNI = 1<<6, /* Have a Unicode replacement character */
585 _HFE_HAVE_CSTR = 1<<7, /* Have a string replacement */
586 /* We store the length of the entity name in the flags, too */
587 _HFE_LENGTH_MASK = (1<<6) - 1
/* One collected link target; nodes form a singly linked list hooked into
 * htmlflt.hf_hrefs, which _hf_dump_hrefs() reverses, prints and frees */
590 struct htmlflt_href {
591 struct htmlflt_href *hfh_next;
592 ui32_t hfh_no; /* Running sequence */
593 ui32_t hfh_len; /* of .hfh_dat */
/* Link text, printed with an explicit length (not relied on to be
 * NUL terminated by _hf_dump_hrefs()) */
594 char hfh_dat[n_VFIELD_SIZE(0)];
/* Description of one recognized HTML tag, see the _hf_tags[] table */
597 struct htmlflt_tag {
598 si32_t hft_act; /* char or hf_special_actions */
599 /* Not NUL: character to inject, with high bit set: place a space
600 * afterwards. Note: only recognized with _HFSA_NEEDSEP or _HFSA_NEEDNL */
601 char hft_injc;
602 ui8_t hft_len; /* Useful bytes in (NUL terminated) .hft_tag */
603 char const hft_tag[10]; /* Tag less < and > surroundings (TR, /TR, ..) */
/* Compile-time check that a tag name fits into a LINESIZE sized buffer */
605 n_CTA(n_SIZEOF_FIELD(struct htmlflt_tag, hft_tag) < LINESIZE,
606 "Structure field too large a size"); /* .hf_ign_tag */
/* Description of one recognized named entity, see the _hf_ents[] table */
608 struct hf_ent {
609 ui8_t hfe_flags; /* enum hf_entity_flags plus length of .hfe_ent */
610 char hfe_c; /* Plain replacement character */
611 ui16_t hfe_uni; /* Unicode codepoint if _HFE_HAVE_UNI */
612 char hfe_cstr[5]; /* _HFE_HAVE_CSTR (e.g., &hellip; -> ...) */
613 char const hfe_ent[7]; /* Entity less & and ; surroundings */
616 /* Tag list; not binary searched :(, so try to take care a bit */
/* Each entry maps a tag name (without angle brackets, closing tags with
 * a leading slash) to either a character to output or a hf_special_actions
 * value.  _X(name, action) creates an entry without, _XC(name, char, action)
 * one with an injection character.
 * NOTE(review): within the lines visible here only _X is #undef'ined again
 * after the table, _XC is not */
617 static struct htmlflt_tag const _hf_tags[] = {
618 # undef _X
619 # undef _XC
620 # define _X(S,A) {A, '\0', sizeof(S) -1, S "\0"}
621 # define _XC(S,C,A) {A, C, sizeof(S) -1, S "\0"}
623 # if 0 /* This is treated very special (to avoid wasting space in .hft_tag) */
624 _X("BLOCKQUOTE", _HFSA_BQUOTE), _X("/BLOCKQUOTE", _HFSA_BQUOTE_END),
625 # endif
627 _X("P", _HFSA_NEEDSEP), _X("/P", _HFSA_NEEDNL),
628 _X("DIV", _HFSA_NEEDSEP), _X("/DIV", _HFSA_NEEDNL),
629 _X("TR", _HFSA_NEEDNL),
630 _X("/TH", '\t'),
631 _X("/TD", '\t'),
632 /* Let it stand out; also since we don't support implicit paragraphs after
633 * block elements, plain running text after a list (seen in Unicode
634 * announcement via Firefox) */
635 _X("UL", _HFSA_NEEDSEP), _X("/UL", _HFSA_NEEDSEP),
/* List items get an asterisk; the high bit requests a following space */
636 _XC("LI", (char)0x80 | '*', _HFSA_NEEDSEP),
637 _X("DL", _HFSA_NEEDSEP),
638 _X("DT", _HFSA_NEEDNL),
640 _X("A", _HFSA_HREF), _X("/A", _HFSA_HREF_END),
641 _X("IMG", _HFSA_IMG),
642 _X("BR", '\n'),
643 _X("PRE", _HFSA_PRE), _X("/PRE", _HFSA_PRE_END),
644 _X("TITLE", _HFSA_NEEDSEP), /*_X("/TITLE", '\n'),*/
645 _X("H1", _HFSA_NEEDSEP), /*_X("/H1", '\n'),*/
646 _X("H2", _HFSA_NEEDSEP), /*_X("/H2", '\n'),*/
647 _X("H3", _HFSA_NEEDSEP), /*_X("/H3", '\n'),*/
648 _X("H4", _HFSA_NEEDSEP), /*_X("/H4", '\n'),*/
649 _X("H5", _HFSA_NEEDSEP), /*_X("/H5", '\n'),*/
650 _X("H6", _HFSA_NEEDSEP), /*_X("/H6", '\n'),*/
652 _X("STYLE", _HFSA_IGN),
653 _X("SCRIPT", _HFSA_IGN),
655 # undef _X
658 /* Entity list; not binary searched.. */
/* Each entry maps an entity name (without & and ;) to its replacement(s):
 * _X(name, char): plain character only;
 * _XU(name, char, uni): plain character plus Unicode codepoint;
 * _XS(name, str): replacement string only;
 * _XSU(name, str, uni): replacement string plus Unicode codepoint.
 * The length of the name is stored in the low bits of the flags field.
 * NOTE(review): the leading "# undef _XUS" presumably was meant to read
 * _XSU, the name which is defined and undefined again below */
659 static struct hf_ent const _hf_ents[] = {
660 # undef _X
661 # undef _XU
662 # undef _XS
663 # undef _XUS
664 # define _X(E,C) {(sizeof(E) -1), C, 0x0u, "", E "\0"}
665 # define _XU(E,C,U) {(sizeof(E) -1) | _HFE_HAVE_UNI, C, U, "", E "\0"}
666 # define _XS(E,S) {(sizeof(E) -1) | _HFE_HAVE_CSTR, '\0', 0x0u,S "\0",E "\0"}
667 # define _XSU(E,S,U) \
668 {(sizeof(E) -1) | _HFE_HAVE_UNI | _HFE_HAVE_CSTR, '\0', U, S "\0", E "\0"}
670 _X("quot", '"'),
671 _X("amp", '&'),
672 _X("lt", '<'), _X("gt", '>'),
674 _XU("nbsp", ' ', 0x0020 /* Note: not 0x00A0 seems to be better for us */),
675 _XU("middot", '.', 0x00B7),
676 _XSU("hellip", "...", 0x2026),
677 _XSU("mdash", "---", 0x2014), _XSU("ndash", "--", 0x2013),
678 _XSU("laquo", "<<", 0x00AB), _XSU("raquo", ">>", 0x00BB),
679 _XSU("lsaquo", "<", 0x2039), _XSU("rsaquo", ">", 0x203A),
680 _XSU("lsquo", "'", 0x2018), _XSU("rsquo", "'", 0x2019),
681 _XSU("ldquo", "\"", 0x201C), _XSU("rdquo", "\"", 0x201D),
682 _XSU("uarr", "^|", 0x2191), _XSU("darr", "|v", 0x2193),
684 _XSU("cent", "CENT", 0x00A2),
685 _XSU("copy", "(C)", 0x00A9),
686 _XSU("euro", "EUR", 0x20AC),
687 _XSU("infin", "INFY", 0x221E),
688 _XSU("pound", "GBP", 0x00A3),
689 _XSU("reg", "(R)", 0x00AE),
690 _XSU("sect", "S:", 0x00A7),
691 _XSU("yen", "JPY", 0x00A5),
693 /* German umlauts */
694 _XSU("Auml", "Ae", 0x00C4), _XSU("auml", "ae", 0x00E4),
695 _XSU("Ouml", "Oe", 0x00D6), _XSU("ouml", "oe", 0x00F6),
696 _XSU("Uuml", "Ue", 0x00DC), _XSU("uuml", "ue", 0x00FC),
697 _XSU("szlig", "ss", 0x00DF)
699 # undef _X
700 # undef _XU
701 # undef _XS
702 # undef _XSU
/* (All helpers which return a struct htmlflt* return the object to continue
 * working with; errors are signalled via _HF_ERROR in .hf_flags) */
705 /* Real output */
706 static struct htmlflt * _hf_dump_hrefs(struct htmlflt *self);
707 static struct htmlflt * _hf_dump(struct htmlflt *self);
708 static struct htmlflt * _hf_store(struct htmlflt *self, char c);
709 # ifdef HAVE_NATCH_CHAR
710 static struct htmlflt * __hf_sync_mbstuff(struct htmlflt *self);
711 # endif
713 /* Virtual output */
714 static struct htmlflt * _hf_nl(struct htmlflt *self);
715 static struct htmlflt * _hf_nl_force(struct htmlflt *self);
716 static struct htmlflt * _hf_putc(struct htmlflt *self, char c);
717 static struct htmlflt * _hf_putc_premode(struct htmlflt *self, char c);
718 static struct htmlflt * _hf_puts(struct htmlflt *self, char const *cp);
719 static struct htmlflt * _hf_putbuf(struct htmlflt *self,
720 char const *cp, size_t len);
722 /* Try to locate a param'eter in >hf_bdat, store it (non-terminated!) or NULL */
723 static struct htmlflt * _hf_param(struct htmlflt *self, struct str *store,
724 char const *param);
726 /* Expand all entities in the given parameter */
727 static struct htmlflt * _hf_expand_all_ents(struct htmlflt *self,
728 struct str const *param);
730 /* Completely parsed over a tag / an entity, interpret that */
731 static struct htmlflt * _hf_check_tag(struct htmlflt *self, char const *s);
732 static struct htmlflt * _hf_check_ent(struct htmlflt *self, char const *s,
733 size_t l);
735 /* Input handler */
736 static ssize_t _hf_add_data(struct htmlflt *self,
737 char const *dat, size_t len);
739 static struct htmlflt *
740 _hf_dump_hrefs(struct htmlflt *self)
742 struct htmlflt_href *hhp;
743 NYD2_ENTER;
745 if (!(self->hf_flags & _HF_NL_2) && putc('\n', self->hf_os) == EOF) {
746 self->hf_flags |= _HF_ERROR;
747 goto jleave;
750 /* Reverse the list */
751 for (hhp = self->hf_hrefs, self->hf_hrefs = NULL; hhp != NULL;) {
752 struct htmlflt_href *tmp = hhp->hfh_next;
753 hhp->hfh_next = self->hf_hrefs;
754 self->hf_hrefs = hhp;
755 hhp = tmp;
758 /* Then dump it */
759 while ((hhp = self->hf_hrefs) != NULL) {
760 self->hf_hrefs = hhp->hfh_next;
762 if (!(self->hf_flags & _HF_ERROR)) {
763 int w = fprintf(self->hf_os, " [%u] %.*s\n",
764 hhp->hfh_no, (int)hhp->hfh_len, hhp->hfh_dat);
765 if (w < 0)
766 self->hf_flags |= _HF_ERROR;
768 n_free(hhp);
771 self->hf_flags |= (putc('\n', self->hf_os) == EOF)
772 ? _HF_ERROR : _HF_NL_1 | _HF_NL_2;
773 self->hf_href_dist = (ui32_t)n_realscreenheight >> 1;
774 jleave:
775 NYD2_LEAVE;
776 return self;
779 static struct htmlflt *
780 _hf_dump(struct htmlflt *self)
782 ui32_t f, l;
783 char c, *cp;
784 NYD2_ENTER;
786 f = self->hf_flags & ~_HF_BLANK;
787 l = self->hf_len;
788 cp = self->hf_line;
789 self->hf_mbwidth = self->hf_mboff = self->hf_last_ws = self->hf_len = 0;
791 for (c = '\0'; l > 0; --l) {
792 c = *cp++;
793 jput:
794 if (putc(c, self->hf_os) == EOF) {
795 self->hf_flags = (f |= _HF_ERROR);
796 goto jleave;
800 if (c != '\n') {
801 f |= (f & _HF_NL_1) ? _HF_NL_2 : _HF_NL_1;
802 l = 1;
803 c = '\n';
804 goto jput;
806 self->hf_flags = f;
808 /* Check whether there are HREFs to dump; there is so much messy tagsoup out
809 * there that it seems best not to simply dump HREFs in each _dump(), but
810 * only with some gap, let's say half the real screen height */
811 if (--self->hf_href_dist < 0 && (f & _HF_NL_2) && self->hf_hrefs != NULL)
812 self = _hf_dump_hrefs(self);
813 jleave:
814 NYD2_LEAVE;
815 return self;
/* Append one byte (never a newline) to the line buffer and break the line
 * via _hf_dump() once it becomes too long.
 * - At the start of a line within quoted context (counter in the
 *   _HF_BQUOTE_MASK bits non-zero) an indentation prefix is placed first.
 * - A tabulator is stored as space and padded to a multiple of 8.
 * - Blanks and some punctuation are remembered as possible breakpoints.
 * - With HAVE_NATCH_CHAR and a multibyte locale the visual width is tracked
 *   separately, and some invisible characters are dropped again.
 * Returns the object to continue with */
818 static struct htmlflt *
819 _hf_store(struct htmlflt *self, char c)
821 ui32_t l, i;
822 NYD2_ENTER;
824 assert(c != '\n');
826 l = self->hf_len;
827 if(n_UNLIKELY(l == 0) && (i = (self->hf_flags & _HF_BQUOTE_MASK)) != 0 &&
828 self->hf_lmax > _HF_MINLEN){
829 ui32_t len, j;
830 char const *ip;
/* NOTE(review): strlen() assumes *indentprefix* always has a value, i.e.,
 * that ok_vlook() cannot return NULL here -- confirm */
832 ip = ok_vlook(indentprefix);
833 len = strlen(ip);
834 if(len == 0 || len >= _HF_MINLEN){
835 ip = " |"; /* XXX something from *quote-chars* */
836 len = sizeof(" |") -1;
/* Copy the prefix into the line buffer, tabulators become spaces */
839 self->hf_len = len;
840 for(j = len; j-- != 0;){
841 char x;
843 if((x = ip[j]) == '\t')
844 x = ' ';
845 self->hf_line[j] = x;
/* One more marker for each further level, as long as there is room */
848 while(--i > 0 && self->hf_len < self->hf_lmax - _HF_BRKSUB)
849 self = _hf_store(self, '|'); /* XXX something from *quote-chars* */
851 l = self->hf_len;
854 self->hf_line[l] = (c == '\t' ? ' ' : c);
855 self->hf_len = ++l;
856 if (blankspacechar(c)) {
857 if (c == '\t') {
858 i = 8 - ((l - 1) & 7); /* xxx magic tab width of 8 */
859 if (i > 0) {
861 self = _hf_store(self, ' ');
862 while (--i > 0);
863 goto jleave;
866 self->hf_last_ws = l;
867 } else if (/*c == '.' ||*/ c == ',' || c == ';' || c == '-')
868 self->hf_last_ws = l;
/* i is the measure used for the line break decision below: the byte count,
 * or the visual width if multibyte tracking is active */
870 i = l;
871 # ifdef HAVE_NATCH_CHAR /* XXX This code is really ridiculous! */
872 if (n_mb_cur_max > 1) { /* XXX should mbrtowc() and THEN store, at least */
873 wchar_t wc;
874 int w, x;
/* Try to decode the bytes stored since the last complete character */
876 if((x = mbtowc(&wc, self->hf_line + self->hf_mboff, l - self->hf_mboff)
877 ) > 0){
878 if ((w = wcwidth(wc)) == -1 ||
879 /* Actively filter out L-TO-R and R-TO-R marks TODO ctext */
880 (wc == 0x200E || wc == 0x200F ||
881 (wc >= 0x202A && wc <= 0x202E)) ||
882 /* And some zero-width messes */
883 wc == 0x00AD || (wc >= 0x200B && wc <= 0x200D) ||
884 /* Oh about the ISO C wide character interfaces, baby! */
885 (wc == 0xFEFF)){
/* Unwanted character: take its bytes out of the buffer again */
886 self->hf_len -= x;
887 goto jleave;
888 } else if (iswspace(wc))
889 self->hf_last_ws = l;
890 self->hf_mboff += x;
891 i = (self->hf_mbwidth += w);
892 } else {
893 if (x < 0) {
/* Reset the conversion state; only give up on (and step over) a byte once
 * as many bytes as the longest possible character have been collected */
894 (void)mbtowc(&wc, NULL, n_mb_cur_max);
895 if (UICMP(32, l - self->hf_mboff, >=, n_mb_cur_max)) { /* XXX */
896 ++self->hf_mboff;
897 ++self->hf_mbwidth;
900 i = self->hf_mbwidth;
903 # endif
905 /* Do we need to break the line? */
906 if (i >= self->hf_lmax - _HF_BRKSUB) {
907 ui32_t f, lim;
910 /* Let's hope we saw a sane place to break this line! */
911 if (self->hf_last_ws >= (lim = self->hf_lmax >> 1)) {
/* Dump up to the breakpoint, then move the remainder to the front */
912 jput:
913 i = self->hf_len = self->hf_last_ws;
914 self = _hf_dump(self);
915 if ((self->hf_len = (l -= i)) > 0) {
916 self->hf_flags &= ~_HF_NL_MASK;
917 memmove(self->hf_line, self->hf_line + i, l);
918 # ifdef HAVE_NATCH_CHAR
919 __hf_sync_mbstuff(self);
920 # endif
922 goto jleave;
925 /* Any 7-bit characters? */
/* (Search backwards, not below half of the maximum, for a byte after which,
 * or in UTF-8 before which, the line can be split) */
926 f = self->hf_flags;
927 for (i = l; i-- >= lim;)
928 if (asciichar((c = self->hf_line[i]))) {
929 self->hf_last_ws = ++i;
930 goto jput;
931 } else if ((f & _HF_UTF8) && ((ui8_t)c & 0xC0) != 0x80) {
932 self->hf_last_ws = i;
933 goto jput;
936 /* Hard break necessary! xxx really badly done */
937 if (l >= self->hf_lmax - 1)
938 self = _hf_dump(self);
940 jleave:
941 NYD2_LEAVE;
942 return self;
945 # ifdef HAVE_NATCH_CHAR
/* Recompute .hf_mboff (bytes covered by complete characters) and .hf_mbwidth
 * (their visual width) by rescanning the first .hf_len bytes of the line
 * buffer; used by _hf_store() after it moved the remainder of a broken line
 * to the front.  Undecodable bytes and characters of unknown width count as
 * one column each.  Returns self */
946 static struct htmlflt *
947 __hf_sync_mbstuff(struct htmlflt *self)
949 wchar_t wc;
950 char const *b;
951 ui32_t o, w, l;
952 NYD2_ENTER;
954 b = self->hf_line;
955 o = w = 0;
956 l = self->hf_len;
/* Enter at the bottom of the loop so that the conversion state is reset
 * before the first character is decoded */
957 goto jumpin;
959 while (l > 0) {
960 int x = mbtowc(&wc, b, l);
962 if (x == 0)
963 break;
965 if (x > 0) {
966 b += x;
967 l -= x;
968 o += x;
969 if ((x = wcwidth(wc)) == -1)
970 x = 1;
971 w += x;
972 continue;
975 /* Bad, skip over a single character.. XXX very bad indeed */
976 ++b;
977 ++o;
978 ++w;
979 --l;
980 jumpin:
981 (void)mbtowc(&wc, NULL, n_mb_cur_max);
984 self->hf_mboff = o;
985 self->hf_mbwidth = w;
987 NYD2_LEAVE;
988 return self;
990 # endif /* HAVE_NATCH_CHAR */
992 static struct htmlflt *
993 _hf_nl(struct htmlflt *self)
995 ui32_t f;
996 NYD2_ENTER;
998 if (!((f = self->hf_flags) & _HF_ERROR)) {
999 if (f & _HF_ANY) {
1000 if ((f & _HF_NL_MASK) != _HF_NL_MASK)
1001 self = _hf_dump(self);
1002 } else
1003 self->hf_flags = (f |= _HF_NL_MASK);
1005 NYD2_LEAVE;
1006 return self;
1009 static struct htmlflt *
1010 _hf_nl_force(struct htmlflt *self)
1012 NYD2_ENTER;
1013 if (!(self->hf_flags & _HF_ERROR))
1014 self = _hf_dump(self);
1015 NYD2_LEAVE;
1016 return self;
1019 static struct htmlflt *
1020 _hf_putc(struct htmlflt *self, char c)
1022 ui32_t f;
1023 NYD2_ENTER;
1025 if ((f = self->hf_flags) & _HF_ERROR)
1026 goto jleave;
1028 if (c == '\n') {
1029 self = _hf_nl(self);
1030 goto jleave;
1031 } else if (c == ' ' || c == '\t') {
1032 if ((f & _HF_BLANK) || self->hf_len == 0)
1033 goto jleave;
1034 f |= _HF_BLANK;
1035 } else
1036 f &= ~_HF_BLANK;
1037 f &= ~_HF_NL_MASK;
1038 self->hf_flags = (f |= _HF_ANY);
1039 self = _hf_store(self, c);
1040 jleave:
1041 NYD2_LEAVE;
1042 return self;
1045 static struct htmlflt *
1046 _hf_putc_premode(struct htmlflt *self, char c)
1048 ui32_t f;
1049 NYD2_ENTER;
1051 if ((f = self->hf_flags) & _HF_ERROR) {
1053 } else if (c == '\n')
1054 self = _hf_nl_force(self);
1055 else {
1056 f &= ~_HF_NL_MASK;
1057 self->hf_flags = (f |= _HF_ANY);
1058 self = _hf_store(self, c);
1060 NYD2_LEAVE;
1061 return self;
1064 static struct htmlflt *
1065 _hf_puts(struct htmlflt *self, char const *cp)
1067 char c;
1068 NYD2_ENTER;
1070 while ((c = *cp++) != '\0')
1071 self = _hf_putc(self, c);
1072 NYD2_LEAVE;
1073 return self;
1076 static struct htmlflt *
1077 _hf_putbuf(struct htmlflt *self, char const *cp, size_t len)
1079 NYD2_ENTER;
1081 while (len-- > 0)
1082 self = _hf_putc(self, *cp++);
1083 NYD2_LEAVE;
1084 return self;
/* Search the tag buffered in .hf_bdat for the parameter named param
 * (compared case-insensitively) and point store at its value, with
 * surrounding whitespace and quotes excluded.  The value is not terminated,
 * store->l gives its length.  If the parameter is not found or its value is
 * empty, store->s is NULL and store->l is 0.  Returns self */
1087 static struct htmlflt *
1088 _hf_param(struct htmlflt *self, struct str *store, char const *param)
1090 char const *cp;
1091 char c, x, quote;
1092 size_t i;
1093 bool_t hot;
1094 NYD2_ENTER;
1096 store->s = NULL;
1097 store->l = 0;
1098 cp = self->hf_bdat;
1100 /* Skip over any non-WS first; be aware of soup, if it slipped through */
1101 for(;;){
1102 if((c = *cp++) == '\0' || c == '>')
1103 goto jleave;
1104 if(whitechar(c))
1105 break;
1108 /* Search for the parameter, take care of other quoting along the way */
/* (x is the upper-cased first byte of the name, param/i cover the rest) */
1109 x = *param++;
1110 x = upperconv(x);
1111 i = strlen(param);
/* "hot" is set whenever whitespace was seen, i.e., when the next byte
 * could start a parameter name */
1113 for(hot = TRU1;;){
1114 if((c = *cp++) == '\0' || c == '>')
1115 goto jleave;
1116 if(whitechar(c)){
1117 hot = TRU1;
1118 continue;
1121 /* Could it be a parameter? */
1122 if(hot){
1123 hot = FAL0;
1125 /* Is it the desired one? */
1126 if((c = upperconv(c)) == x && !ascncasecmp(param, cp, i)){
1127 char const *cp2 = cp + i;
/* The name must be followed by "=", optionally after whitespace */
1129 if((quote = *cp2++) != '='){
1130 if(quote == '\0' || quote == '>')
1131 goto jleave;
1132 while(whitechar(quote))
1133 quote = *cp2++;
1135 if(quote == '='){
1136 cp = cp2;
1137 break;
1139 continue; /* XXX Optimize: i bytes or even cp2 can't be it! */
1143 /* Not the desired one; but a parameter? */
1144 if(c != '=')
1145 continue;
1146 /* If so, properly skip over the value */
1147 if((c = *cp++) == '"' || c == '\''){
1148 /* TODO i have forgotten whether reverse solidus quoting is allowed
1149 * TODO quoted HTML parameter values? not supporting that for now.. */
1150 for(quote = c; (c = *cp++) != '\0' && c != quote;)
1152 }else
1153 while(c != '\0' && !whitechar(c) && c != '>')
1154 c = *++cp;
1155 if(c == '\0')
1156 goto jleave;
1159 /* Skip further whitespace */
1160 for(;;){
1161 if((c = *cp++) == '\0' || c == '>')
1162 goto jleave;
1163 if(!whitechar(c))
1164 break;
/* c is the first byte of the value, cp points behind it */
1167 if(c == '"' || c == '\''){
1168 /* TODO i have forgotten whether reverse solidus quoting is allowed in
1169 * TODO quoted HTML parameter values? not supporting that for now.. */
1170 store->s = n_UNCONST(cp);
1171 for(quote = c; (c = *cp) != '\0' && c != quote; ++cp)
1173 /* XXX ... and we simply ignore a missing trailing " :> */
1174 }else{
1175 store->s = n_UNCONST(cp - 1);
1176 if(!whitechar(c))
1177 while((c = *cp) != '\0' && !whitechar(c) && c != '>')
1178 ++cp;
1180 i = PTR2SIZE(cp - store->s);
1182 /* Terrible tagsoup out there, e.g., groups.google.com produces href=""
1183 * parameter values prefixed and suffixed by newlines! Therefore trim the
1184 * value content TODO join into the parse step above! */
1185 for (cp = store->s; i > 0 && spacechar(*cp); ++cp, --i)
1187 store->s = n_UNCONST(cp);
1188 for (cp += i - 1; i > 0 && spacechar(*cp); --cp, --i)
1190 if ((store->l = i) == 0)
1191 store->s = NULL;
1192 jleave:
1193 NYD2_LEAVE;
1194 return self;
/* Output the parameter value param (param->l bytes, need not be terminated)
 * via _hf_putc(), passing each "&...;" sequence, ampersand and semicolon
 * included, to _hf_check_ent().  If an ampersand is not followed by
 * a semicolon before the end of the value (or a NUL byte), the remainder is
 * output as-is.  Returns the object to continue with */
1197 static struct htmlflt *
1198 _hf_expand_all_ents(struct htmlflt *self, struct str const *param)
1200 char const *cp, *maxcp, *ep;
1201 char c;
1202 size_t i;
1203 NYD2_ENTER;
1205 for (cp = param->s, maxcp = cp + param->l; cp < maxcp;)
1206 if ((c = *cp++) != '&')
1207 jputc:
1208 self = _hf_putc(self, c);
1209 else {
/* ep scans ahead for the semicolon while cp is stepped back onto the "&" */
1210 for (ep = cp--;;) {
1211 if (ep == maxcp || (c = *ep++) == '\0') {
1212 for (; cp < ep; ++cp)
1213 self = _hf_putc(self, *cp);
1214 goto jleave;
1215 } else if (c == ';') {
1216 if ((i = PTR2SIZE(ep - cp)) > 1) {
1217 self = _hf_check_ent(self, cp, i);
1218 break;
1219 } else {
1220 c = *cp++;
1221 goto jputc;
/* Continue behind the semicolon */
1225 cp = ep;
1227 jleave:
1228 NYD2_LEAVE;
1229 return self;
/* Evaluate one complete, buffered tag.  s is expected to start with '<' and
 * is scanned up to a '>' or NUL byte.  The tag name is looked up in _hf_tags
 * and handled according to the hft_act of the matching entry; "blockquote" is
 * special-cased once the table is exhausted; anything else ends up at
 * jnotknown, where the buffered text (self->hf_bdat) is either dropped or
 * written as is.
 * Returns the filter object as handed back by the output helpers */
1232 static struct htmlflt *
1233 _hf_check_tag(struct htmlflt *self, char const *s)
1235 char nobuf[32], c;
1236 struct str param;
1237 size_t i;
1238 struct htmlflt_tag const *hftp;
1239 ui32_t f;
1240 NYD2_ENTER;
1242 /* Extra check only */
1243 assert(s != NULL);
1244 if (*s != '<') {
1245 DBG( n_alert("HTML tagsoup filter _hf_check_tag() called on soup!"); )
1246 jput_as_is:
1247 self = _hf_puts(self, self->hf_bdat);
1248 goto jleave;
/* Step over the '<' and count in i the length of the tag name */
1251 for (++s, i = 0; (c = s[i]) != '\0' && c != '>' && !whitechar(c); ++i)
1252 /* Special massage for things like <br/>: after the slash only whitespace
1253 * may separate us from the closing right angle! */
1254 if (c == '/') {
1255 size_t j = i + 1;
1257 while ((c = s[j]) != '\0' && c != '>' && whitechar(c))
1258 ++j;
1259 if (c == '>')
1260 break;
/* Linear search of the tag table: the length must match exactly, the name
 * case-insensitively, and the name must be followed by '>', '/' or whitespace */
1263 for (hftp = _hf_tags;;) {
1264 if (i == hftp->hft_len && !ascncasecmp(s, hftp->hft_tag, i)) {
1265 c = s[hftp->hft_len];
1266 if (c == '>' || c == '/' || whitechar(c))
1267 break;
1269 if (n_UNLIKELY(PTRCMP(++hftp, >=, _hf_tags + n_NELEM(_hf_tags)))){
1270 /* A <blockquote> is very special xxx */
1271 bool_t isct;
/* isct: this is a closing tag; if so step over the solidus for the compare */
1273 if((isct = (i > 1 && *s == '/'))){
1274 ++s;
1275 --i;
1278 if(i != sizeof("blockquote") -1 || ascncasecmp(s, "blockquote", i) ||
1279 ((c = s[sizeof("blockquote") -1]) != '>' && !whitechar(c))){
/* No blockquote: undo the solidus adjustment and treat as unknown tag */
1280 s -= isct;
1281 i += isct;
1282 goto jnotknown;
/* Emit newline(s) according to the _HF_NL_* state, the first one only for an
 * opening tag */
1285 if(!isct && !(self->hf_flags & _HF_NL_2))
1286 self = _hf_nl(self);
1287 if(!(self->hf_flags & _HF_NL_1))
1288 self = _hf_nl(self);
/* Adjust the nesting counter kept in the _HF_BQUOTE_MASK bits: increment for
 * an opening tag unless the mask value is reached, decrement for a closing
 * tag unless already zero; then merge it back with the other flag bits */
1289 f = self->hf_flags;
1290 f &= _HF_BQUOTE_MASK;
1291 if(!isct){
1292 if(f != _HF_BQUOTE_MASK)
1293 ++f;
1294 }else if(f > 0)
1295 --f;
1296 f |= (self->hf_flags & ~_HF_BQUOTE_MASK);
1297 self->hf_flags = f;
1298 goto jleave;
/* Known tag: dispatch on its action.  f is a snapshot of the flags */
1302 f = self->hf_flags;
1303 switch (hftp->hft_act) {
1304 case _HFSA_PRE_END:
1305 f &= ~_HF_PRE;
/* The if(0) body is only entered through the case label it contains, so
 * that both cases share the statements which follow */
1306 if (0) {
1307 /* FALLTHRU */
1308 case _HFSA_PRE:
1309 f |= _HF_PRE;
1311 self->hf_flags = f;
1312 /* FALLTHRU */
1314 case _HFSA_NEEDSEP:
1315 if (!(self->hf_flags & _HF_NL_2))
1316 self = _hf_nl(self);
1317 /* FALLTHRU */
1318 case _HFSA_NEEDNL:
/* (This tests the snapshot f, not the flags after a preceding _hf_nl()) */
1319 if (!(f & _HF_NL_1))
1320 self = _hf_nl(self);
/* Optionally inject a character: the low seven bits are the character, the
 * high bit requests an additional space thereafter */
1321 if (hftp->hft_injc != '\0') {
1322 self = _hf_putc(self, hftp->hft_injc & 0x7F);
1323 if ((uc_i)hftp->hft_injc & 0x80)
1324 self = _hf_putc(self, ' ');
1326 break;
1328 case _HFSA_IGN:
/* Remember the tag, _hf_add_data() then skips input until its end tag */
1329 self->hf_ign_tag = hftp;
1330 self->hf_flags = (f |= _HF_IGN | _HF_NOPUT);
1331 break;
1333 case _HFSA_IMG:
/* Write "[" + value of the alt parameter (entities expanded) + "]", or
 * "[IMG]" if _hf_param() left param.s as NULL */
1334 self = _hf_param(self, &param, "alt");
1335 self = _hf_putc(self, '[');
1336 if (param.s == NULL) {
1337 param.s = n_UNCONST("IMG");
1338 param.l = 3;
1339 goto jimg_put;
1340 } /* else */ if (memchr(param.s, '&', param.l) != NULL)
1341 self = _hf_expand_all_ents(self, &param);
1342 else
1343 jimg_put:
1344 self = _hf_putbuf(self, param.s, param.l);
1345 self = _hf_putc(self, ']');
1346 break;
1348 case _HFSA_HREF:
1349 self = _hf_param(self, &param, "href");
1350 /* Ignore non-external links */
1351 if (param.s != NULL && *param.s != '#') {
/* Store the link target in a new node which becomes the head of hf_hrefs,
 * assign it the next reference number and write that as "[N]" */
1352 struct htmlflt_href *hhp = n_alloc(
1353 n_VSTRUCT_SIZEOF(struct htmlflt_href, hfh_dat) + param.l +1);
1355 hhp->hfh_next = self->hf_hrefs;
1356 hhp->hfh_no = ++self->hf_href_no;
1357 hhp->hfh_len = (ui32_t)param.l;
1358 memcpy(hhp->hfh_dat, param.s, param.l);
1360 snprintf(nobuf, sizeof nobuf, "[%u]", hhp->hfh_no);
1361 self->hf_flags = (f |= _HF_HREF);
1362 self->hf_hrefs = hhp;
1363 self = _hf_puts(self, nobuf);
1364 } else
1365 self->hf_flags = (f &= ~_HF_HREF);
1366 break;
1367 case _HFSA_HREF_END:
/* Only if _HF_HREF is set: write "[/N]" with the current reference number */
1368 if (f & _HF_HREF) {
1369 snprintf(nobuf, sizeof nobuf, "[/%u]", self->hf_href_no);
1370 self = _hf_puts(self, nobuf);
1372 break;
1374 default:
/* Any other action value is written as a character itself */
1375 c = (char)(hftp->hft_act & 0xFF);
1376 self = _hf_putc(self, c);
1377 break;
1378 case '\0':
1379 break;
1381 jleave:
1382 NYD2_LEAVE;
1383 return self;
1385 /* The problem is that even invalid tagsoup is widely used, without real
1386 * searching i have seen e-mail address in <N@H.D> notation, and more.
1387 * To protect us a bit look around and possibly write the content as such */
1388 jnotknown:
1389 switch (*s) {
1390 case '!':
1391 case '?':
1392 /* Ignore <!DOCTYPE, <!-- comments, <? PIs.. */
1393 goto jleave;
1394 case '>':
1395 /* Print out an empty tag as such */
1396 if (s[1] == '\0') {
1397 --s;
1398 goto jput_as_is;
1400 break;
1401 case '/':
1402 ++s;
1403 break;
1404 default:
1405 break;
1408 /* Also skip over : in order to suppress v:roundrect, w:anchorlock.. */
1409 while ((c = *s++) != '\0' && c != '>' && !whitechar(c) && c != ':')
/* A non-ASCII or punctuation byte in the name: write the whole buffer as is */
1410 if (!asciichar(c) || punctchar(c)) {
1411 self = _hf_puts(self, self->hf_bdat);
1412 break;
1414 goto jleave;
1417 static struct htmlflt *
1418 _hf_check_ent(struct htmlflt *self, char const *s, size_t l)
1420 char nobuf[32];
1421 char const *s_save;
1422 size_t l_save;
1423 struct hf_ent const *hfep;
1424 size_t i;
1425 NYD2_ENTER;
1427 s_save = s;
1428 l_save = l;
1429 assert(*s == '&');
1430 assert(l > 0);
1431 /* False entities seen in the wild assert(s[l - 1] == ';'); */
1432 ++s;
1433 l -= 2;
1435 /* Numeric entity, or try named search */
1436 if (*s == '#') {
1437 i = (*++s == 'x' ? 16 : 10);
1439 if ((i != 16 || (++s, --l) > 0) && l < sizeof(nobuf)) {
1440 memcpy(nobuf, s, l);
1441 nobuf[l] = '\0';
1442 n_idec_uiz_cp(&i, nobuf, i, NULL);
1443 if (i <= 0x7F)
1444 self = _hf_putc(self, (char)i);
1445 else if (self->hf_flags & _HF_UTF8) {
1446 jputuni:
1447 l = n_utf32_to_utf8((ui32_t)i, nobuf);
1448 self = _hf_putbuf(self, nobuf, l);
1449 } else
1450 goto jeent;
1451 } else
1452 goto jeent;
1453 } else {
1454 ui32_t f = self->hf_flags, hf;
1456 for (hfep = _hf_ents; PTRCMP(hfep, <, _hf_ents + n_NELEM(_hf_ents));
1457 ++hfep)
1458 if (l == ((hf = hfep->hfe_flags) & _HFE_LENGTH_MASK) &&
1459 !strncmp(s, hfep->hfe_ent, l)) {
1460 if ((hf & _HFE_HAVE_UNI) && (f & _HF_UTF8)) {
1461 i = hfep->hfe_uni;
1462 goto jputuni;
1463 } else if (hf & _HFE_HAVE_CSTR)
1464 self = _hf_puts(self, hfep->hfe_cstr);
1465 else
1466 self = _hf_putc(self, hfep->hfe_c);
1467 goto jleave;
1469 jeent:
1470 self = _hf_putbuf(self, s_save, l_save);
1472 jleave:
1473 NYD2_LEAVE;
1474 return self;
/* Feed len bytes at dat into the filter, or, if dat is NULL, perform the
 * final dump of pending output and of the collected href list.
 * Bytes which belong to a tag or an entity are collected in the hf_bdat
 * buffer (allocated on first use, grown on demand) until they are complete.
 * Returns -1 if _HF_ERROR is set in the flags at the end.  Otherwise len is
 * returned for data requests, and for the final request 1 if there was
 * something to dump and 0 if not */
1477 static ssize_t
1478 _hf_add_data(struct htmlflt *self, char const *dat, size_t len)
1480 char c, *cp, *cp_max;
1481 bool_t hot;
1482 ssize_t rv = 0;
1483 NYD_ENTER;
1485 /* Final put request? */
1486 if (dat == NULL) {
1487 if (self->hf_len > 0 || self->hf_hrefs != NULL) {
1488 self = _hf_dump(self);
1489 if (self->hf_hrefs != NULL)
1490 self = _hf_dump_hrefs(self);
1491 rv = 1;
1493 goto jleave;
1496 /* Always ensure some initial buffer */
1497 if ((cp = self->hf_curr) != NULL)
1498 cp_max = self->hf_bmax;
1499 else {
1500 cp = self->hf_curr = self->hf_bdat = n_alloc(LINESIZE);
1501 cp_max = self->hf_bmax = cp + LINESIZE -1; /* (Always room for NUL!) */
/* hot is only used in _HF_IGN mode; initially set if the buffer is not empty */
1503 hot = (cp != self->hf_bdat);
1505 for (rv = (ssize_t)len; len > 0; --len) {
1506 ui32_t f = self->hf_flags;
1508 if (f & _HF_ERROR)
1509 break;
1510 c = *dat++;
1512 /* Soup is really weird, and scripts may contain almost anything (and
1513 * newer CSS standards are also cryptic): therefore prefix the _HF_IGN
1514 * test and walk until we see the required end tag */
1515 /* TODO For real safety _HF_IGN soup condome would also need to know
1516 * TODO about quoted strings so that 'var i = "</script>";' couldn't
1517 * TODO fool it! We really want this mode also for _HF_NOPUT to be
1518 * TODO able to *gracefully* detect the tag-closing '>', but then if
1519 * TODO that is a single mechanism we should have made it! */
1520 if (f & _HF_IGN) {
1521 struct htmlflt_tag const *hftp = self->hf_ign_tag;
1522 size_t i;
/* '<' (re)starts collecting at the buffer start (the '<' itself is not
 * stored), '>' checks what was collected, any other byte is collected */
1524 if (c == '<') {
1525 hot = TRU1;
1526 jcp_reset:
1527 cp = self->hf_bdat;
1528 } else if (c == '>') {
1529 if (hot) {
/* End of ignore mode if the buffer is one byte (the solidus, see below)
 * followed by exactly the name of the ignored tag, compared caselessly */
1530 if ((i = PTR2SIZE(cp - self->hf_bdat)) > 1 &&
1531 --i == hftp->hft_len &&
1532 !ascncasecmp(self->hf_bdat + 1, hftp->hft_tag, i))
1533 self->hf_flags = (f &= ~(_HF_IGN | _HF_NOPUT));
1534 hot = FAL0;
1535 goto jcp_reset;
1537 } else if (hot) {
1538 *cp++ = c;
1539 i = PTR2SIZE(cp - self->hf_bdat);
/* Stop collecting if the first byte is not a solidus, or if there are more
 * bytes after it than the tag name has */
1540 if ((i == 1 && c != '/') || --i > hftp->hft_len) {
1541 hot = FAL0;
1542 goto jcp_reset;
1545 } else switch (c) {
1546 case '<':
1547 /* People are using & without &amp;ing it, ditto <; be aware */
1548 if (f & (_HF_NOPUT | _HF_ENT)) {
1549 f &= ~_HF_ENT;
1550 /* Special case "<!--" buffer content to deal with really weird
1551 * things that can be done with "<!--[if gte mso 9]>" syntax */
1552 if (PTR2SIZE(cp - self->hf_bdat) != 4 ||
1553 memcmp(self->hf_bdat, "<!--", 4)) {
/* Write what has been buffered so far as normal text */
1554 self->hf_flags = f;
1555 *cp = '\0';
1556 self = _hf_puts(self, self->hf_bdat);
1557 f = self->hf_flags;
/* Start buffering a new tag, beginning with the '<' */
1560 cp = self->hf_bdat;
1561 *cp++ = c;
1562 self->hf_flags = (f |= _HF_NOPUT);
1563 break;
1564 case '>':
1565 /* Weird tagsoup around, do we actually parse a tag? */
1566 if (!(f & _HF_NOPUT))
1567 goto jdo_c;
/* Complete and terminate the buffered tag, then evaluate it */
1568 cp[0] = c;
1569 cp[1] = '\0';
1570 f &= ~(_HF_NOPUT | _HF_ENT);
1571 self->hf_flags = f;
1572 self = _hf_check_tag(self, self->hf_bdat);
1573 *(cp = self->hf_bdat) = '\0'; /* xxx extra safety */
1574 /* Quick hack to get rid of redundant newline after <pre> XXX */
1575 if (!(f & _HF_PRE) && (self->hf_flags & _HF_PRE) &&
1576 len > 1 && *dat == '\n')
1577 ++dat, --len;
1578 break;
1580 case '\r': /* TODO CR should be stripped in lower level!! (Only B64!?!) */
1581 break;
1582 case '\n':
1583 /* End of line is not considered unless we are in PRE section.
1584 * However, in _HF_NOPUT mode we must be aware of tagsoup which uses
1585 * newlines for separating parameters */
1586 if (f & _HF_NOPUT)
1587 goto jdo_c;
1588 self = (f & _HF_PRE) ? _hf_nl_force(self) : _hf_putc(self, ' ');
1589 break;
1591 case '\t':
1592 if (!(f & _HF_PRE))
1593 c = ' ';
1594 /* FALLTHRU */
1595 default:
1596 jdo_c:
1597 /* If not currently parsing a tag and bypassing normal output.. */
1598 if (!(f & _HF_NOPUT)) {
1599 if (cntrlchar(c))
1600 break;
1601 if (c == '&') {
/* Start buffering a possible entity, beginning with the ampersand */
1602 cp = self->hf_bdat;
1603 *cp++ = c;
1604 self->hf_flags = (f |= _HF_NOPUT | _HF_ENT);
1605 } else if (f & _HF_PRE) {
1606 self = _hf_putc_premode(self, c);
1607 self->hf_flags &= ~_HF_BLANK;
1608 } else
1609 self = _hf_putc(self, c);
1610 } else if ((f & _HF_ENT) && c == ';') {
/* The buffered entity is complete: terminate and resolve it */
1611 cp[0] = c;
1612 cp[1] = '\0';
1613 f &= ~(_HF_NOPUT | _HF_ENT);
1614 self->hf_flags = f;
1615 self = _hf_check_ent(self, self->hf_bdat,
1616 PTR2SIZE(cp + 1 - self->hf_bdat));
1617 } else {
1618 /* We may need to grow the buffer */
/* NOTE(review): the 42/2 bytes of slack presumably cover the cp[0]/cp[1]
 * stores which are done without a size check above -- confirm */
1619 if (PTRCMP(cp + 42/2, >=, cp_max)) {
1620 size_t i = PTR2SIZE(cp - self->hf_bdat),
1621 m = PTR2SIZE(self->hf_bmax - self->hf_bdat) + LINESIZE;
1623 cp = self->hf_bdat = n_realloc(self->hf_bdat, m);
1624 self->hf_bmax = cp_max = &cp[m -1];
1625 self->hf_curr = (cp += i);
1627 *cp++ = c;
/* Remember the buffer position for the next call */
1631 self->hf_curr = cp;
1632 jleave:
1633 NYD_LEAVE;
1634 return (self->hf_flags & _HF_ERROR) ? -1 : rv;
/*
 * TODO Because we don't support filter chains yet this filter will be run
 * TODO in a dedicated subprocess, driven via a special Popen() mode
 */
1641 static bool_t __hf_hadpipesig;
1642 static void
1643 __hf_onpipe(int signo)
1645 NYD_X; /* Signal handler */
1646 n_UNUSED(signo);
1647 __hf_hadpipesig = TRU1;
1650 FL int
1651 htmlflt_process_main(void)
1653 char buf[BUFFER_SIZE];
1654 struct htmlflt hf;
1655 size_t i;
1656 int rv;
1657 NYD_ENTER;
1659 __hf_hadpipesig = FAL0;
1660 safe_signal(SIGPIPE, &__hf_onpipe);
1662 htmlflt_init(&hf);
1663 htmlflt_reset(&hf, n_stdout);
1665 for (;;) {
1666 if ((i = fread(buf, sizeof(buf[0]), n_NELEM(buf), n_stdin)) == 0) {
1667 rv = !feof(n_stdin);
1668 break;
1671 if ((rv = __hf_hadpipesig))
1672 break;
1673 /* Just use this directly.. */
1674 if (htmlflt_push(&hf, buf, i) < 0) {
1675 rv = 1;
1676 break;
1679 if (rv == 0 && htmlflt_flush(&hf) < 0)
1680 rv = 1;
1682 htmlflt_destroy(&hf);
1684 rv |= __hf_hadpipesig;
1685 NYD_LEAVE;
1686 return rv;
1689 FL void
1690 htmlflt_init(struct htmlflt *self)
1692 NYD_ENTER;
1693 /* (Rather redundant though) */
1694 memset(self, 0, sizeof *self);
1695 NYD_LEAVE;
1698 FL void
1699 htmlflt_destroy(struct htmlflt *self)
1701 NYD_ENTER;
1702 htmlflt_reset(self, NULL);
1703 NYD_LEAVE;
1706 FL void
1707 htmlflt_reset(struct htmlflt *self, FILE *f)
1709 struct htmlflt_href *hfhp;
1710 NYD_ENTER;
1712 while ((hfhp = self->hf_hrefs) != NULL) {
1713 self->hf_hrefs = hfhp->hfh_next;
1714 n_free(hfhp);
1717 if (self->hf_bdat != NULL)
1718 n_free(self->hf_bdat);
1719 if (self->hf_line != NULL)
1720 n_free(self->hf_line);
1722 memset(self, 0, sizeof *self);
1724 if (f != NULL) {
1725 ui32_t sw = n_MAX(_HF_MINLEN, (ui32_t)n_scrnwidth);
1727 self->hf_line = n_alloc((size_t)sw * n_mb_cur_max +1);
1728 self->hf_lmax = sw;
1730 if (n_psonce & n_PSO_UNICODE) /* TODO not truly generic */
1731 self->hf_flags = _HF_UTF8;
1732 self->hf_os = f;
1734 NYD_LEAVE;
1737 FL ssize_t
1738 htmlflt_push(struct htmlflt *self, char const *dat, size_t len)
1740 ssize_t rv;
1741 NYD_ENTER;
1743 rv = _hf_add_data(self, dat, len);
1744 NYD_LEAVE;
1745 return rv;
1748 FL ssize_t
1749 htmlflt_flush(struct htmlflt *self)
1751 ssize_t rv;
1752 NYD_ENTER;
1754 rv = _hf_add_data(self, NULL, 0);
1755 rv |= !fflush(self->hf_os) ? 0 : -1;
1756 NYD_LEAVE;
1757 return rv;
1759 #endif /* HAVE_FILTER_HTML_TAGSOUP */
1761 /* s-it-mode */