Bump S-nail v14.9.7.ar ("Marsh tit patiently scraping bark"), 2018-02-16
[s-mailx.git] / filter.c
blobe7fc6b1147812a4ac670668360192bc9a25dc2d7
/*@ S-nail - a mail user agent derived from Berkeley Mail.
 *@ Filter objects.
 *
 * Copyright (c) 2013 - 2018 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18 #undef n_FILE
19 #define n_FILE filter
21 #ifndef HAVE_AMALGAMATION
22 # include "nail.h"
23 #endif
/*
 * Quotation filter
 */

/*
 * TODO quotation filter: anticipate in future data: don't break if only WS
 * TODO or a LF escaping \ follows on the line (simply reuse the latter).
 */
34 #ifdef HAVE_QUOTE_FOLD
35 n_CTAV(n_QUOTE_MAX > 3);
/* Parser states of the *quote-fold* filter; quoteflt_push() dispatches on
 * them: _QF_CLEAN and _QF_PREFIX are handled by _qf_state_prefix(), _QF_DATA
 * by _qf_state_data() */
enum qf_state {
   _QF_CLEAN,     /* State after quoteflt_reset() */
   _QF_PREFIX,    /* A newline was seen: collecting the quote of the next line */
   _QF_DATA       /* Quote has been collected: consuming the line's payload */
};
/* Value carrier for the state handlers: they consume from .buf / .len and
 * write the unconsumed remainder back before they return */
struct qf_vc {
   struct quoteflt *self;  /* The filter which is being fed */
   char const *buf;        /* Next unconsumed input byte */
   size_t len;             /* Bytes remaining at .buf */
};
/* Print out prefix and current quote; returns the number of bytes written or
 * -1 on write error */
static ssize_t _qf_dump_prefix(struct quoteflt *self);

/* Add one data character; returns the number of bytes which have been written
 * to the output stream because of it (0 if nothing was flushed), or a negative
 * value on error */
static ssize_t _qf_add_data(struct quoteflt *self, wchar_t wc);

/* State machine handlers; they consume from and update *vc, and return output
 * bytes written or a negative value on error */
static ssize_t _qf_state_prefix(struct qf_vc *vc);
static ssize_t _qf_state_data(struct qf_vc *vc);
59 static ssize_t
60 _qf_dump_prefix(struct quoteflt *self)
62 ssize_t rv;
63 size_t i;
64 NYD_ENTER;
66 if ((i = self->qf_pfix_len) > 0 && i != fwrite(self->qf_pfix, 1, i,
67 self->qf_os))
68 goto jerr;
69 rv = i;
71 if ((i = self->qf_currq.l) > 0 && i != fwrite(self->qf_currq.s, 1, i,
72 self->qf_os))
73 goto jerr;
74 rv += i;
75 jleave:
76 NYD_LEAVE;
77 return rv;
78 jerr:
79 rv = -1;
80 goto jleave;
83 static ssize_t
84 _qf_add_data(struct quoteflt *self, wchar_t wc)
86 char *save_b;
87 ui32_t save_l, save_w;
88 ssize_t rv = 0;
89 int w, l;
90 NYD_ENTER;
92 save_l = save_w = 0; /* silence cc */
93 save_b = NULL;
94 /* <newline> ends state */
95 if (wc == L'\n')
96 goto jflush;
97 if (wc == L'\r') /* TODO CR should be stripped in lower level!! */
98 goto jleave;
100 /* Unroll <tab> to spaces */
101 if (wc == L'\t') {
102 save_l = self->qf_datw;
103 save_w = (save_l + n_QUOTE_TAB_SPACES) & ~(n_QUOTE_TAB_SPACES - 1);
104 save_w -= save_l;
105 while (save_w-- > 0) {
106 ssize_t j = _qf_add_data(self, L' ');
107 if (j < 0) {
108 rv = j;
109 break;
111 rv += j;
113 goto jleave;
116 w = wcwidth(wc);
117 if (w == -1) {
118 jbad:
119 ++self->qf_datw;
120 self->qf_dat.s[self->qf_dat.l++] = '?';
121 } else {
122 l = wctomb(self->qf_dat.s + self->qf_dat.l, wc);
123 if (l < 0)
124 goto jbad;
125 self->qf_datw += (ui32_t)w;
126 self->qf_dat.l += (size_t)l;
129 /* TODO The last visual may excess (adjusted!) *qfold-max* if it's a wide;
130 * TODO place it on the next line, break before */
131 if (self->qf_datw >= self->qf_qfold_max) {
132 /* If we have seen a nice breakpoint during traversal, shuffle data
133 * around a bit so as to restore the trailing part after flushing */
134 if (self->qf_brkl > 0) {
135 save_w = self->qf_datw - self->qf_brkw;
136 save_l = self->qf_dat.l - self->qf_brkl;
137 save_b = self->qf_dat.s + self->qf_brkl + 2;
138 memmove(save_b, save_b - 2, save_l);
139 self->qf_dat.l = self->qf_brkl;
142 self->qf_dat.s[self->qf_dat.l++] = '\\';
143 jflush:
144 self->qf_dat.s[self->qf_dat.l++] = '\n';
145 rv = quoteflt_flush(self);
147 /* Restore takeovers, if any */
148 if (save_b != NULL) {
149 self->qf_brk_isws = FAL0;
150 self->qf_datw += save_w;
151 self->qf_dat.l = save_l;
152 memmove(self->qf_dat.s, save_b, save_l);
154 } else if (self->qf_datw >= self->qf_qfold_min && !self->qf_brk_isws) {
155 bool_t isws = (iswspace(wc) != 0);
157 if (isws || !self->qf_brk_isws || self->qf_brkl == 0) {
158 self->qf_brkl = self->qf_dat.l;
159 self->qf_brkw = self->qf_datw;
160 self->qf_brk_isws = isws;
164 /* If state changed to prefix, perform full reset (note this implies that
165 * quoteflt_flush() performs too much work..) */
166 if (wc == '\n') {
167 self->qf_state = _QF_PREFIX;
168 self->qf_wscnt = self->qf_datw = 0;
169 self->qf_currq.l = 0;
171 jleave:
172 NYD_LEAVE;
173 return rv;
176 static ssize_t
177 _qf_state_prefix(struct qf_vc *vc)
179 struct quoteflt *self;
180 ssize_t rv;
181 char const *buf;
182 size_t len, i;
183 wchar_t wc;
184 NYD_ENTER;
186 self = vc->self;
187 rv = 0;
189 for (buf = vc->buf, len = vc->len; len > 0;) {
190 /* xxx NULL BYTE! */
191 i = mbrtowc(&wc, buf, len, self->qf_mbps);
192 if (i == (size_t)-1) {
193 /* On hard error, don't modify mbstate_t and step one byte */
194 self->qf_mbps[0] = self->qf_mbps[1];
195 ++buf;
196 --len;
197 self->qf_wscnt = 0;
198 continue;
200 self->qf_mbps[1] = self->qf_mbps[0];
201 if (i == (size_t)-2) {
202 /* Redundant shift sequence, out of buffer */
203 len = 0;
204 break;
206 buf += i;
207 len -= i;
209 if (wc == L'\n')
210 goto jfin;
211 if (iswspace(wc)) {
212 ++self->qf_wscnt;
213 continue;
215 if (i == 1 && n_uasciichar(wc) &&
216 strchr(self->qf_quote_chars, (char)wc) != NULL){
217 self->qf_wscnt = 0;
218 if (self->qf_currq.l >= n_QUOTE_MAX - 3) {
219 self->qf_currq.s[n_QUOTE_MAX - 3] = '.';
220 self->qf_currq.s[n_QUOTE_MAX - 2] = '.';
221 self->qf_currq.s[n_QUOTE_MAX - 1] = '.';
222 self->qf_currq.l = n_QUOTE_MAX;
223 } else
224 self->qf_currq.s[self->qf_currq.l++] = buf[-1];
225 continue;
228 /* The quote is parsed and compressed; dump it */
229 jfin:
230 self->qf_state = _QF_DATA;
231 /* Overtake WS to the current quote in order to preserve it for eventual
232 * necessary follow lines, too */
233 /* TODO we de-facto "normalize" to ASCII SP here which MESSES tabs!! */
234 while (self->qf_wscnt-- > 0 && self->qf_currq.l < n_QUOTE_MAX)
235 self->qf_currq.s[self->qf_currq.l++] = ' ';
236 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
237 self->qf_wscnt = 0;
238 rv = _qf_add_data(self, wc);
239 break;
242 vc->buf = buf;
243 vc->len = len;
244 NYD_LEAVE;
245 return rv;
248 static ssize_t
249 _qf_state_data(struct qf_vc *vc)
251 struct quoteflt *self;
252 ssize_t rv;
253 char const *buf;
254 size_t len, i;
255 wchar_t wc;
256 NYD_ENTER;
258 self = vc->self;
259 rv = 0;
261 for (buf = vc->buf, len = vc->len; len > 0;) {
262 /* xxx NULL BYTE! */
263 i = mbrtowc(&wc, buf, len, self->qf_mbps);
264 if (i == (size_t)-1) {
265 /* On hard error, don't modify mbstate_t and step one byte */
266 self->qf_mbps[0] = self->qf_mbps[1];
267 ++buf;
268 --len;
269 continue;
271 self->qf_mbps[1] = self->qf_mbps[0];
272 if (i == (size_t)-2) {
273 /* Redundant shift sequence, out of buffer */
274 len = 0;
275 break;
277 buf += i;
278 len -= i;
280 { ssize_t j = _qf_add_data(self, wc);
281 if (j < 0) {
282 rv = j;
283 break;
285 rv += j;
288 if (self->qf_state != _QF_DATA)
289 break;
292 vc->buf = buf;
293 vc->len = len;
294 NYD_LEAVE;
295 return rv;
297 #endif /* HAVE_QUOTE_FOLD */
299 FL struct quoteflt *
300 quoteflt_dummy(void) /* TODO LEGACY (until filters are plugged when needed) */
302 static struct quoteflt qf_i;
304 return &qf_i;
307 FL void
308 quoteflt_init(struct quoteflt *self, char const *prefix)
310 #ifdef HAVE_QUOTE_FOLD
311 char const *xcp, *cp;
312 #endif
313 NYD_ENTER;
315 memset(self, 0, sizeof *self);
317 if ((self->qf_pfix = prefix) != NULL)
318 self->qf_pfix_len = (ui32_t)strlen(prefix);
320 /* Check whether the user wants the more fancy quoting algorithm */
321 /* TODO *quote-fold*: n_QUOTE_MAX may excess it! */
322 #ifdef HAVE_QUOTE_FOLD
323 if (self->qf_pfix_len > 0 && (cp = ok_vlook(quote_fold)) != NULL) {
324 ui32_t qmin, qmax;
326 /* These magic values ensure we don't bail */
327 n_idec_ui32_cp(&qmax, cp, 10, &xcp);
328 if (qmax < self->qf_pfix_len + 6)
329 qmax = self->qf_pfix_len + 6;
330 --qmax; /* The newline escape */
331 if (cp == xcp || *xcp == '\0')
332 qmin = (qmax >> 1) + (qmax >> 2) + (qmax >> 5);
333 else {
334 n_idec_ui32_cp(&qmin, &xcp[1], 10, NULL);
335 if (qmin < qmax >> 1)
336 qmin = qmax >> 1;
337 else if (qmin > qmax - 2)
338 qmin = qmax - 2;
340 self->qf_qfold_min = qmin;
341 self->qf_qfold_max = qmax;
342 self->qf_quote_chars = ok_vlook(quote_chars);
344 /* Add pad for takeover copies, reverse solidus and newline */
345 self->qf_dat.s = salloc((qmax + 3) * n_mb_cur_max);
346 self->qf_currq.s = salloc((n_QUOTE_MAX + 1) * n_mb_cur_max);
348 #endif
349 NYD_LEAVE;
352 FL void
353 quoteflt_destroy(struct quoteflt *self) /* xxx inline */
355 NYD_ENTER;
356 n_UNUSED(self);
357 NYD_LEAVE;
360 FL void
361 quoteflt_reset(struct quoteflt *self, FILE *f) /* xxx inline */
363 NYD_ENTER;
364 self->qf_os = f;
365 #ifdef HAVE_QUOTE_FOLD
366 self->qf_state = _QF_CLEAN;
367 self->qf_dat.l =
368 self->qf_currq.l = 0;
369 memset(self->qf_mbps, 0, sizeof self->qf_mbps);
370 #endif
371 NYD_LEAVE;
374 FL ssize_t
375 quoteflt_push(struct quoteflt *self, char const *dat, size_t len)
377 /* (xxx Ideally the actual push() [and flush()] would be functions on their
378 * xxx own, via indirect vtbl call ..) */
379 ssize_t rv = 0;
380 NYD_ENTER;
382 self->qf_nl_last = (len > 0 && dat[len - 1] == '\n'); /* TODO HACK */
384 if (len == 0)
385 goto jleave;
387 /* Bypass? TODO Finally, this filter simply should not be used, then
388 * (TODO It supercedes prefix_write() or something) */
389 if (self->qf_pfix_len == 0) {
390 if (len != fwrite(dat, 1, len, self->qf_os))
391 goto jerr;
392 rv = len;
394 /* Normal: place *indentprefix* at every BOL */
395 else
396 #ifdef HAVE_QUOTE_FOLD
397 if (self->qf_qfold_max == 0)
398 #endif
400 void *vp;
401 size_t ll;
402 bool_t pxok = (self->qf_qfold_min != 0);
404 for (;;) {
405 if (!pxok) {
406 ll = self->qf_pfix_len;
407 if (ll != fwrite(self->qf_pfix, 1, ll, self->qf_os))
408 goto jerr;
409 rv += ll;
410 pxok = TRU1;
413 /* xxx Strictly speaking this is invalid, because only `/' and `.' are
414 * xxx mandated by POSIX.1-2008 as "invariant across all locales
415 * xxx supported"; though there is no charset known which uses this
416 * xxx control char as part of a multibyte character; note that S-nail
417 * XXX (and the Mail codebase as such) do not support EBCDIC */
418 if ((vp = memchr(dat, '\n', len)) == NULL)
419 ll = len;
420 else {
421 pxok = FAL0;
422 ll = PTR2SIZE((char*)vp - dat) + 1;
425 if (ll != fwrite(dat, sizeof *dat, ll, self->qf_os))
426 goto jerr;
427 rv += ll;
428 if ((len -= ll) == 0)
429 break;
430 dat += ll;
433 self->qf_qfold_min = pxok;
435 /* Overly complicated, though still only line-per-line: *quote-fold*.
436 * - If .qf_currq.l is 0, then we are in a clean state. Reset .qf_mbps;
437 * TODO note this means we assume that lines start with reset escape seq,
438 * TODO but i don't think this is any worse than what we currently do;
439 * TODO in 15.0, with the value carrier, we should carry conversion states
440 * TODO all along, only resetting on error (or at words for header =???=);
441 * TODO this still is weird for error handling, but we need to act more
442 * TODO stream-alike (though in practice i don't think cross-line states
443 * TODO can be found, because of compatibility reasons; however, being
444 * TODO a problem rather than a solution is not a good thing (tm))
445 * - Lookout for a newline */
446 #ifdef HAVE_QUOTE_FOLD
447 else {
448 struct qf_vc vc;
449 ssize_t i;
451 vc.self = self;
452 vc.buf = dat;
453 vc.len = len;
454 while (vc.len > 0) {
455 switch (self->qf_state) {
456 case _QF_CLEAN:
457 case _QF_PREFIX:
458 i = _qf_state_prefix(&vc);
459 break;
460 default: /* silence cc (`i' unused) */
461 case _QF_DATA:
462 i = _qf_state_data(&vc);
463 break;
465 if (i < 0)
466 goto jerr;
467 rv += i;
470 #endif /* HAVE_QUOTE_FOLD */
472 jleave:
473 NYD_LEAVE;
474 return rv;
475 jerr:
476 rv = -1;
477 goto jleave;
480 FL ssize_t
481 quoteflt_flush(struct quoteflt *self)
483 ssize_t rv = 0;
484 NYD_ENTER;
485 n_UNUSED(self);
487 #ifdef HAVE_QUOTE_FOLD
488 if (self->qf_dat.l > 0) {
489 rv = _qf_dump_prefix(self);
490 if (rv >= 0) {
491 size_t i = self->qf_dat.l;
492 if (i == fwrite(self->qf_dat.s, 1, i, self->qf_os))
493 rv += i;
494 else
495 rv = -1;
496 self->qf_dat.l = 0;
497 self->qf_brk_isws = FAL0;
498 self->qf_wscnt = self->qf_brkl = self->qf_brkw = 0;
499 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
502 #endif
503 NYD_LEAVE;
504 return rv;
/*
 * HTML tagsoup filter TODO rewrite wchar_t based (require HAVE_C90AMEND1)
 * TODO . Numeric &#NO; entities should also be treated by struct hf_ent
 * TODO . Yes, we COULD support CSS based quoting when we'd check type="quote"
 * TODO   (nonstandard) and watch out for style="gmail_quote" (or so, VERY
 * TODO   nonstandard) and tracking a stack of such elements (to be popped
 * TODO   once the closing element is seen).  Then, after writing a newline,
 * TODO   place sizeof(stack) ">"s first.  But aren't these HTML mails rude?
 * TODO Interlocking and non-well-formed data will break us down
 */
517 #ifdef HAVE_FILTER_HTML_TAGSOUP
/* Line length related limits of the HTML filter */
enum hf_limits {
   _HF_MINLEN = 10,  /* Minimum line length (can't really be smaller) */
   _HF_BRKSUB = 8    /* Start considering line break MAX - BRKSUB */
};
/* State bits of the HTML filter (.hf_flags).  The low 16 bits are no flag but
 * a counter: the current <blockquote> nesting depth */
enum hf_flags {
   _HF_BQUOTE_MASK = 0xFFFFu,
   _HF_UTF8 = 1u<<16,   /* Data is in UTF-8 */
   _HF_ERROR = 1u<<17,  /* A hard error occurred, bail as soon as possible */
   _HF_NOPUT = 1u<<18,  /* (In a tag,) Don't generate output */
   _HF_IGN = 1u<<19,    /* Ignore mode on */
   _HF_ANY = 1u<<20,    /* Yet seen just any output */
   _HF_PRE = 1u<<21,    /* In <pre>formatted mode */
   _HF_ENT = 1u<<22,    /* Currently parsing an entity */
   _HF_BLANK = 1u<<23,  /* Whitespace last */
   _HF_HREF = 1u<<24,   /* External <a href=> was the last href seen */

   _HF_NL_1 = 1u<<25,   /* One \n seen */
   _HF_NL_2 = 2u<<25,   /* We have produced an all empty line */
   _HF_NL_MASK = _HF_NL_1 | _HF_NL_2
};
/* Special actions for .hft_act of struct htmlflt_tag; they are negative to be
 * distinguishable from plain characters, which may be stored there, too */
enum hf_special_actions {
   _HFSA_NEEDSEP = -1,     /* Need an empty line (paragraph separator) */
   _HFSA_NEEDNL = -2,      /* Need a new line start (table row) */
   _HFSA_IGN = -3,         /* Things like <style>..</style>, <script>.. */
   _HFSA_PRE = -4,         /* <pre>.. */
   _HFSA_PRE_END = -5,
   _HFSA_IMG = -6,         /* <img> */
   _HFSA_HREF = -7,        /* <a>.. */
   _HFSA_HREF_END = -8,
   _HFSA_BQUOTE = -9,      /* <blockquote>, interpreted as citation! */
   _HFSA_BQUOTE_END = -10
};
/* Bits of .hfe_flags of struct hf_ent */
enum hf_entity_flags {
   _HFE_HAVE_UNI = 1<<6,   /* Have a Unicode replacement character */
   _HFE_HAVE_CSTR = 1<<7,  /* Have a string replacement */
   /* We store the length of the entity name in the flags, too */
   _HFE_LENGTH_MASK = (1<<6) - 1
};
561 struct htmlflt_href {
562 struct htmlflt_href *hfh_next;
563 ui32_t hfh_no; /* Running sequence */
564 ui32_t hfh_len; /* of .hfh_dat */
565 char hfh_dat[n_VFIELD_SIZE(0)];
568 struct htmlflt_tag {
569 si32_t hft_act; /* char or hf_special_actions */
570 /* Not NUL: character to inject, with high bit set: place a space
571 * afterwards. Note: only recognized with _HFSA_NEEDSEP or _HFSA_NEEDNL */
572 char hft_injc;
573 ui8_t hft_len; /* Useful bytes in (NUL terminated) .hft_tag */
574 char const hft_tag[10]; /* Tag less < and > surroundings (TR, /TR, ..) */
576 n_CTA(n_SIZEOF_FIELD(struct htmlflt_tag, hft_tag) < LINESIZE,
577 "Structure field too large a size"); /* .hf_ign_tag */
579 struct hf_ent {
580 ui8_t hfe_flags; /* enum hf_entity_flags plus length of .hfe_ent */
581 char hfe_c; /* Plain replacement character */
582 ui16_t hfe_uni; /* Unicode codepoint if _HFE_HAVE_UNI */
583 char hfe_cstr[5]; /* _HFE_HAVE_CSTR (e.g., &hellip; -> ...) */
584 char const hfe_ent[7]; /* Entity less & and ; surroundings */
587 /* Tag list; not binary searched :(, so try to take care a bit */
588 static struct htmlflt_tag const _hf_tags[] = {
589 # undef _X
590 # undef _XC
591 # define _X(S,A) {A, '\0', sizeof(S) -1, S "\0"}
592 # define _XC(S,C,A) {A, C, sizeof(S) -1, S "\0"}
594 # if 0 /* This is treated very special (to avoid wasting space in .hft_tag) */
595 _X("BLOCKQUOTE", _HFSA_BQUOTE), _X("/BLOCKQUOTE", _HFSA_BQUOTE_END),
596 # endif
598 _X("P", _HFSA_NEEDSEP), _X("/P", _HFSA_NEEDNL),
599 _X("DIV", _HFSA_NEEDSEP), _X("/DIV", _HFSA_NEEDNL),
600 _X("TR", _HFSA_NEEDNL),
601 _X("/TH", '\t'),
602 _X("/TD", '\t'),
603 /* Let it stand out; also since we don't support implicit paragraphs after
604 * block elements, plain running text after a list (seen in Unicode
605 * announcement via Firefox) */
606 _X("UL", _HFSA_NEEDSEP), _X("/UL", _HFSA_NEEDSEP),
607 _XC("LI", (char)0x80 | '*', _HFSA_NEEDSEP),
608 _X("DL", _HFSA_NEEDSEP),
609 _X("DT", _HFSA_NEEDNL),
611 _X("A", _HFSA_HREF), _X("/A", _HFSA_HREF_END),
612 _X("IMG", _HFSA_IMG),
613 _X("BR", '\n'),
614 _X("PRE", _HFSA_PRE), _X("/PRE", _HFSA_PRE_END),
615 _X("TITLE", _HFSA_NEEDSEP), /*_X("/TITLE", '\n'),*/
616 _X("H1", _HFSA_NEEDSEP), /*_X("/H1", '\n'),*/
617 _X("H2", _HFSA_NEEDSEP), /*_X("/H2", '\n'),*/
618 _X("H3", _HFSA_NEEDSEP), /*_X("/H3", '\n'),*/
619 _X("H4", _HFSA_NEEDSEP), /*_X("/H4", '\n'),*/
620 _X("H5", _HFSA_NEEDSEP), /*_X("/H5", '\n'),*/
621 _X("H6", _HFSA_NEEDSEP), /*_X("/H6", '\n'),*/
623 _X("STYLE", _HFSA_IGN),
624 _X("SCRIPT", _HFSA_IGN),
626 # undef _X
629 /* Entity list; not binary searched.. */
630 static struct hf_ent const _hf_ents[] = {
631 # undef _X
632 # undef _XU
633 # undef _XS
634 # undef _XUS
635 # define _X(E,C) {(sizeof(E) -1), C, 0x0u, "", E "\0"}
636 # define _XU(E,C,U) {(sizeof(E) -1) | _HFE_HAVE_UNI, C, U, "", E "\0"}
637 # define _XS(E,S) {(sizeof(E) -1) | _HFE_HAVE_CSTR, '\0', 0x0u,S "\0",E "\0"}
638 # define _XSU(E,S,U) \
639 {(sizeof(E) -1) | _HFE_HAVE_UNI | _HFE_HAVE_CSTR, '\0', U, S "\0", E "\0"}
641 _X("quot", '"'),
642 _X("amp", '&'),
643 _X("lt", '<'), _X("gt", '>'),
645 _XU("nbsp", ' ', 0x0020 /* Note: not 0x00A0 seems to be better for us */),
646 _XU("middot", '.', 0x00B7),
647 _XSU("hellip", "...", 0x2026),
648 _XSU("mdash", "---", 0x2014), _XSU("ndash", "--", 0x2013),
649 _XSU("laquo", "<<", 0x00AB), _XSU("raquo", ">>", 0x00BB),
650 _XSU("lsaquo", "<", 0x2039), _XSU("rsaquo", ">", 0x203A),
651 _XSU("lsquo", "'", 0x2018), _XSU("rsquo", "'", 0x2019),
652 _XSU("ldquo", "\"", 0x201C), _XSU("rdquo", "\"", 0x201D),
653 _XSU("uarr", "^|", 0x2191), _XSU("darr", "|v", 0x2193),
655 _XSU("cent", "CENT", 0x00A2),
656 _XSU("copy", "(C)", 0x00A9),
657 _XSU("euro", "EUR", 0x20AC),
658 _XSU("infin", "INFY", 0x221E),
659 _XSU("pound", "GBP", 0x00A3),
660 _XSU("reg", "(R)", 0x00AE),
661 _XSU("sect", "S:", 0x00A7),
662 _XSU("yen", "JPY", 0x00A5),
664 /* German umlauts */
665 _XSU("Auml", "Ae", 0x00C4), _XSU("auml", "ae", 0x00E4),
666 _XSU("Ouml", "Oe", 0x00D6), _XSU("ouml", "oe", 0x00F6),
667 _XSU("Uuml", "Ue", 0x00DC), _XSU("uuml", "ue", 0x00FC),
668 _XSU("szlig", "ss", 0x00DF)
670 # undef _X
671 # undef _XU
672 # undef _XS
673 # undef _XSU
/* Real output */
static struct htmlflt * _hf_dump_hrefs(struct htmlflt *self);
static struct htmlflt * _hf_dump(struct htmlflt *self);
static struct htmlflt * _hf_store(struct htmlflt *self, char c);
# ifdef HAVE_NATCH_CHAR
static struct htmlflt * __hf_sync_mbstuff(struct htmlflt *self);
# endif

/* Virtual output */
static struct htmlflt * _hf_nl(struct htmlflt *self);
static struct htmlflt * _hf_nl_force(struct htmlflt *self);
static struct htmlflt * _hf_putc(struct htmlflt *self, char c);
static struct htmlflt * _hf_putc_premode(struct htmlflt *self, char c);
static struct htmlflt * _hf_puts(struct htmlflt *self, char const *cp);
static struct htmlflt * _hf_putbuf(struct htmlflt *self,
                           char const *cp, size_t len);

/* Try to locate a param'eter in >hf_bdat, store it (non-terminated!) or NULL */
static struct htmlflt * _hf_param(struct htmlflt *self, struct str *store,
                           char const *param);

/* Expand all entities in the given parameter */
static struct htmlflt * _hf_expand_all_ents(struct htmlflt *self,
                           struct str const *param);

/* Completely parsed over a tag / an entity, interpret that */
static struct htmlflt * _hf_check_tag(struct htmlflt *self, char const *s);
static struct htmlflt * _hf_check_ent(struct htmlflt *self, char const *s,
                           size_t l);

/* Input handler */
static ssize_t _hf_add_data(struct htmlflt *self,
                  char const *dat, size_t len);
710 static struct htmlflt *
711 _hf_dump_hrefs(struct htmlflt *self)
713 struct htmlflt_href *hhp;
714 NYD2_ENTER;
716 if (!(self->hf_flags & _HF_NL_2) && putc('\n', self->hf_os) == EOF) {
717 self->hf_flags |= _HF_ERROR;
718 goto jleave;
721 /* Reverse the list */
722 for (hhp = self->hf_hrefs, self->hf_hrefs = NULL; hhp != NULL;) {
723 struct htmlflt_href *tmp = hhp->hfh_next;
724 hhp->hfh_next = self->hf_hrefs;
725 self->hf_hrefs = hhp;
726 hhp = tmp;
729 /* Then dump it */
730 while ((hhp = self->hf_hrefs) != NULL) {
731 self->hf_hrefs = hhp->hfh_next;
733 if (!(self->hf_flags & _HF_ERROR)) {
734 int w = fprintf(self->hf_os, " [%u] %.*s\n",
735 hhp->hfh_no, (int)hhp->hfh_len, hhp->hfh_dat);
736 if (w < 0)
737 self->hf_flags |= _HF_ERROR;
739 free(hhp);
742 self->hf_flags |= (putc('\n', self->hf_os) == EOF)
743 ? _HF_ERROR : _HF_NL_1 | _HF_NL_2;
744 self->hf_href_dist = (ui32_t)n_realscreenheight >> 1;
745 jleave:
746 NYD2_LEAVE;
747 return self;
750 static struct htmlflt *
751 _hf_dump(struct htmlflt *self)
753 ui32_t f, l;
754 char c, *cp;
755 NYD2_ENTER;
757 f = self->hf_flags & ~_HF_BLANK;
758 l = self->hf_len;
759 cp = self->hf_line;
760 self->hf_mbwidth = self->hf_mboff = self->hf_last_ws = self->hf_len = 0;
762 for (c = '\0'; l > 0; --l) {
763 c = *cp++;
764 jput:
765 if (putc(c, self->hf_os) == EOF) {
766 self->hf_flags = (f |= _HF_ERROR);
767 goto jleave;
771 if (c != '\n') {
772 f |= (f & _HF_NL_1) ? _HF_NL_2 : _HF_NL_1;
773 l = 1;
774 c = '\n';
775 goto jput;
777 self->hf_flags = f;
779 /* Check whether there are HREFs to dump; there is so much messy tagsoup out
780 * there that it seems best not to simply dump HREFs in each _dump(), but
781 * only with some gap, let's say half the real screen height */
782 if (--self->hf_href_dist < 0 && (f & _HF_NL_2) && self->hf_hrefs != NULL)
783 self = _hf_dump_hrefs(self);
784 jleave:
785 NYD2_LEAVE;
786 return self;
789 static struct htmlflt *
790 _hf_store(struct htmlflt *self, char c)
792 ui32_t l, i;
793 NYD2_ENTER;
795 assert(c != '\n');
797 l = self->hf_len;
798 if(n_UNLIKELY(l == 0) && (i = (self->hf_flags & _HF_BQUOTE_MASK)) != 0 &&
799 self->hf_lmax > _HF_MINLEN){
800 ui32_t len, j;
801 char const *ip;
803 ip = ok_vlook(indentprefix);
804 len = strlen(ip);
805 if(len == 0 || len >= _HF_MINLEN){
806 ip = " |"; /* XXX something from *quote-chars* */
807 len = sizeof(" |") -1;
810 self->hf_len = len;
811 for(j = len; j-- != 0;){
812 char x;
814 if((x = ip[j]) == '\t')
815 x = ' ';
816 self->hf_line[j] = x;
819 while(--i > 0 && self->hf_len < self->hf_lmax - _HF_BRKSUB)
820 self = _hf_store(self, '|'); /* XXX something from *quote-chars* */
822 l = self->hf_len;
825 self->hf_line[l] = (c == '\t' ? ' ' : c);
826 self->hf_len = ++l;
827 if (blankspacechar(c)) {
828 if (c == '\t') {
829 i = 8 - ((l - 1) & 7); /* xxx magic tab width of 8 */
830 if (i > 0) {
832 self = _hf_store(self, ' ');
833 while (--i > 0);
834 goto jleave;
837 self->hf_last_ws = l;
838 } else if (/*c == '.' ||*/ c == ',' || c == ';' || c == '-')
839 self->hf_last_ws = l;
841 i = l;
842 # ifdef HAVE_NATCH_CHAR /* XXX This code is really ridiculous! */
843 if (n_mb_cur_max > 1) { /* XXX should mbrtowc() and THEN store, at least */
844 wchar_t wc;
845 int w, x;
847 if((x = mbtowc(&wc, self->hf_line + self->hf_mboff, l - self->hf_mboff)
848 ) > 0){
849 if ((w = wcwidth(wc)) == -1 ||
850 /* Actively filter out L-TO-R and R-TO-R marks TODO ctext */
851 (wc == 0x200E || wc == 0x200F ||
852 (wc >= 0x202A && wc <= 0x202E)) ||
853 /* And some zero-width messes */
854 wc == 0x00AD || (wc >= 0x200B && wc <= 0x200D) ||
855 /* Oh about the ISO C wide character interfaces, baby! */
856 (wc == 0xFEFF)){
857 self->hf_len -= x;
858 goto jleave;
859 } else if (iswspace(wc))
860 self->hf_last_ws = l;
861 self->hf_mboff += x;
862 i = (self->hf_mbwidth += w);
863 } else {
864 if (x < 0) {
865 (void)mbtowc(&wc, NULL, n_mb_cur_max);
866 if (UICMP(32, l - self->hf_mboff, >=, n_mb_cur_max)) { /* XXX */
867 ++self->hf_mboff;
868 ++self->hf_mbwidth;
871 i = self->hf_mbwidth;
874 # endif
876 /* Do we need to break the line? */
877 if (i >= self->hf_lmax - _HF_BRKSUB) {
878 ui32_t f, lim;
881 /* Let's hope we saw a sane place to break this line! */
882 if (self->hf_last_ws >= (lim = self->hf_lmax >> 1)) {
883 jput:
884 i = self->hf_len = self->hf_last_ws;
885 self = _hf_dump(self);
886 if ((self->hf_len = (l -= i)) > 0) {
887 self->hf_flags &= ~_HF_NL_MASK;
888 memmove(self->hf_line, self->hf_line + i, l);
889 # ifdef HAVE_NATCH_CHAR
890 __hf_sync_mbstuff(self);
891 # endif
893 goto jleave;
896 /* Any 7-bit characters? */
897 f = self->hf_flags;
898 for (i = l; i-- >= lim;)
899 if (asciichar((c = self->hf_line[i]))) {
900 self->hf_last_ws = ++i;
901 goto jput;
902 } else if ((f & _HF_UTF8) && ((ui8_t)c & 0xC0) != 0x80) {
903 self->hf_last_ws = i;
904 goto jput;
907 /* Hard break necessary! xxx really badly done */
908 if (l >= self->hf_lmax - 1)
909 self = _hf_dump(self);
911 jleave:
912 NYD2_LEAVE;
913 return self;
# ifdef HAVE_NATCH_CHAR
/* Recalculate .hf_mboff (bytes converted) and .hf_mbwidth (visual width) for
 * the current content of the line buffer, e.g., after data has been moved.
 * Characters without a width and bytes which cannot be converted count as one
 * column each; the conversion stops early at a NUL byte */
static struct htmlflt *
__hf_sync_mbstuff(struct htmlflt *self)
{
   wchar_t wc;
   char const *b;
   ui32_t o, w, l;
   NYD2_ENTER;

   b = self->hf_line;
   o = w = 0;
   l = self->hf_len;

   /* Start with a clean mbtowc(3) conversion state */
   (void)mbtowc(&wc, NULL, n_mb_cur_max);

   while (l > 0) {
      int x = mbtowc(&wc, b, l);

      if (x == 0)
         break;

      if (x > 0) {
         b += x;
         l -= x;
         o += x;
         if ((x = wcwidth(wc)) == -1)
            x = 1;
         w += x;
         continue;
      }

      /* Bad, skip over a single character.. XXX very bad indeed */
      ++b;
      ++o;
      ++w;
      --l;
      (void)mbtowc(&wc, NULL, n_mb_cur_max);
   }

   self->hf_mboff = o;
   self->hf_mbwidth = w;

   NYD2_LEAVE;
   return self;
}
# endif /* HAVE_NATCH_CHAR */
963 static struct htmlflt *
964 _hf_nl(struct htmlflt *self)
966 ui32_t f;
967 NYD2_ENTER;
969 if (!((f = self->hf_flags) & _HF_ERROR)) {
970 if (f & _HF_ANY) {
971 if ((f & _HF_NL_MASK) != _HF_NL_MASK)
972 self = _hf_dump(self);
973 } else
974 self->hf_flags = (f |= _HF_NL_MASK);
976 NYD2_LEAVE;
977 return self;
980 static struct htmlflt *
981 _hf_nl_force(struct htmlflt *self)
983 NYD2_ENTER;
984 if (!(self->hf_flags & _HF_ERROR))
985 self = _hf_dump(self);
986 NYD2_LEAVE;
987 return self;
990 static struct htmlflt *
991 _hf_putc(struct htmlflt *self, char c)
993 ui32_t f;
994 NYD2_ENTER;
996 if ((f = self->hf_flags) & _HF_ERROR)
997 goto jleave;
999 if (c == '\n') {
1000 self = _hf_nl(self);
1001 goto jleave;
1002 } else if (c == ' ' || c == '\t') {
1003 if ((f & _HF_BLANK) || self->hf_len == 0)
1004 goto jleave;
1005 f |= _HF_BLANK;
1006 } else
1007 f &= ~_HF_BLANK;
1008 f &= ~_HF_NL_MASK;
1009 self->hf_flags = (f |= _HF_ANY);
1010 self = _hf_store(self, c);
1011 jleave:
1012 NYD2_LEAVE;
1013 return self;
1016 static struct htmlflt *
1017 _hf_putc_premode(struct htmlflt *self, char c)
1019 ui32_t f;
1020 NYD2_ENTER;
1022 if ((f = self->hf_flags) & _HF_ERROR) {
1024 } else if (c == '\n')
1025 self = _hf_nl_force(self);
1026 else {
1027 f &= ~_HF_NL_MASK;
1028 self->hf_flags = (f |= _HF_ANY);
1029 self = _hf_store(self, c);
1031 NYD2_LEAVE;
1032 return self;
1035 static struct htmlflt *
1036 _hf_puts(struct htmlflt *self, char const *cp)
1038 char c;
1039 NYD2_ENTER;
1041 while ((c = *cp++) != '\0')
1042 self = _hf_putc(self, c);
1043 NYD2_LEAVE;
1044 return self;
1047 static struct htmlflt *
1048 _hf_putbuf(struct htmlflt *self, char const *cp, size_t len)
1050 NYD2_ENTER;
1052 while (len-- > 0)
1053 self = _hf_putc(self, *cp++);
1054 NYD2_LEAVE;
1055 return self;
1058 static struct htmlflt *
1059 _hf_param(struct htmlflt *self, struct str *store, char const *param)
1061 char const *cp;
1062 char c, x, quote;
1063 size_t i;
1064 bool_t hot;
1065 NYD2_ENTER;
1067 store->s = NULL;
1068 store->l = 0;
1069 cp = self->hf_bdat;
1071 /* Skip over any non-WS first; be aware of soup, if it slipped through */
1072 for(;;){
1073 if((c = *cp++) == '\0' || c == '>')
1074 goto jleave;
1075 if(whitechar(c))
1076 break;
1079 /* Search for the parameter, take care of other quoting along the way */
1080 x = *param++;
1081 x = upperconv(x);
1082 i = strlen(param);
1084 for(hot = TRU1;;){
1085 if((c = *cp++) == '\0' || c == '>')
1086 goto jleave;
1087 if(whitechar(c)){
1088 hot = TRU1;
1089 continue;
1092 /* Could it be a parameter? */
1093 if(hot){
1094 hot = FAL0;
1096 /* Is it the desired one? */
1097 if((c = upperconv(c)) == x && !ascncasecmp(param, cp, i)){
1098 char const *cp2 = cp + i;
1100 if((quote = *cp2++) != '='){
1101 if(quote == '\0' || quote == '>')
1102 goto jleave;
1103 while(whitechar(quote))
1104 quote = *cp2++;
1106 if(quote == '='){
1107 cp = cp2;
1108 break;
1110 continue; /* XXX Optimize: i bytes or even cp2 can't be it! */
1114 /* Not the desired one; but a parameter? */
1115 if(c != '=')
1116 continue;
1117 /* If so, properly skip over the value */
1118 if((c = *cp++) == '"' || c == '\''){
1119 /* TODO i have forgotten whether reverse solidus quoting is allowed
1120 * TODO quoted HTML parameter values? not supporting that for now.. */
1121 for(quote = c; (c = *cp++) != '\0' && c != quote;)
1123 }else
1124 while(c != '\0' && !whitechar(c) && c != '>')
1125 c = *++cp;
1126 if(c == '\0')
1127 goto jleave;
1130 /* Skip further whitespace */
1131 for(;;){
1132 if((c = *cp++) == '\0' || c == '>')
1133 goto jleave;
1134 if(!whitechar(c))
1135 break;
1138 if(c == '"' || c == '\''){
1139 /* TODO i have forgotten whether reverse solisud quoting is allowed in
1140 * TODO quoted HTML parameter values? not supporting that for now.. */
1141 store->s = n_UNCONST(cp);
1142 for(quote = c; (c = *cp) != '\0' && c != quote; ++cp)
1144 /* XXX ... and we simply ignore a missing trailing " :> */
1145 }else{
1146 store->s = n_UNCONST(cp - 1);
1147 if(!whitechar(c))
1148 while((c = *cp) != '\0' && !whitechar(c) && c != '>')
1149 ++cp;
1151 i = PTR2SIZE(cp - store->s);
1153 /* Terrible tagsoup out there, e.g., groups.google.com produces href=""
1154 * parameter values prefixed and suffixed by newlines! Therefore trim the
1155 * value content TODO join into the parse step above! */
1156 for (cp = store->s; i > 0 && spacechar(*cp); ++cp, --i)
1158 store->s = n_UNCONST(cp);
1159 for (cp += i - 1; i > 0 && spacechar(*cp); --cp, --i)
1161 if ((store->l = i) == 0)
1162 store->s = NULL;
1163 jleave:
1164 NYD2_LEAVE;
1165 return self;
1168 static struct htmlflt *
1169 _hf_expand_all_ents(struct htmlflt *self, struct str const *param)
1171 char const *cp, *maxcp, *ep;
1172 char c;
1173 size_t i;
1174 NYD2_ENTER;
1176 for (cp = param->s, maxcp = cp + param->l; cp < maxcp;)
1177 if ((c = *cp++) != '&')
1178 jputc:
1179 self = _hf_putc(self, c);
1180 else {
1181 for (ep = cp--;;) {
1182 if (ep == maxcp || (c = *ep++) == '\0') {
1183 for (; cp < ep; ++cp)
1184 self = _hf_putc(self, *cp);
1185 goto jleave;
1186 } else if (c == ';') {
1187 if ((i = PTR2SIZE(ep - cp)) > 1) {
1188 self = _hf_check_ent(self, cp, i);
1189 break;
1190 } else {
1191 c = *cp++;
1192 goto jputc;
1196 cp = ep;
1198 jleave:
1199 NYD2_LEAVE;
1200 return self;
/* Act upon one tag that has been collected in a buffer.
 * s is expected to point at the opening '<' (the caller in this file passes
 * self->hf_bdat); if it does not, the content of self->hf_bdat is written
 * out unmodified.  The tag name is looked up case-insensitively in _hf_tags
 * and the action stored for it (hft_act) is performed.  <blockquote> and
 * </blockquote> are not looked up there but handled specially, by adjusting
 * the counter kept in the _HF_BQUOTE_MASK bits of hf_flags.  Anything else
 * ends up at jnotknown, which either drops the tag or writes the buffer as-is.
 * Returns the (possibly updated) filter object */
1203 static struct htmlflt *
1204 _hf_check_tag(struct htmlflt *self, char const *s)
1206 char nobuf[32], c;
1207 struct str param;
1208 size_t i;
1209 struct htmlflt_tag const *hftp;
1210 ui32_t f;
1211 NYD2_ENTER;
1213 /* Extra check only */
1214 assert(s != NULL);
1215 if (*s != '<') {
1216 DBG( n_alert("HTML tagsoup filter _hf_check_tag() called on soup!"); )
/* Shared exit used whenever the buffered bytes shall be shown verbatim */
1217 jput_as_is:
1218 self = _hf_puts(self, self->hf_bdat);
1219 goto jleave;
/* Skip the '<' and measure the tag name: i counts bytes up to the first NUL,
 * '>' or whitespace character */
1222 for (++s, i = 0; (c = s[i]) != '\0' && c != '>' && !whitechar(c); ++i)
1223 /* Special massage for things like <br/>: after the slash only whitespace
1224 * may separate us from the closing right angle! */
1225 if (c == '/') {
1226 size_t j = i + 1;
1228 while ((c = s[j]) != '\0' && c != '>' && whitechar(c))
1229 ++j;
1230 if (c == '>')
1231 break;
/* Linear search of the tag table: a hit needs the same length, a
 * case-insensitive name match and a '>', '/' or whitespace behind the name */
1234 for (hftp = _hf_tags;;) {
1235 if (i == hftp->hft_len && !ascncasecmp(s, hftp->hft_tag, i)) {
1236 c = s[hftp->hft_len];
1237 if (c == '>' || c == '/' || whitechar(c))
1238 break;
/* Table exhausted without a hit */
1240 if (n_UNLIKELY(PTRCMP(++hftp, >=, _hf_tags + n_NELEM(_hf_tags)))){
1241 /* A <blockquote> is very special xxx */
1242 bool_t isct;
/* isct: this is a closing tag; temporarily step over the leading '/' */
1244 if((isct = (i > 1 && *s == '/'))){
1245 ++s;
1246 --i;
1249 if(i != sizeof("blockquote") -1 || ascncasecmp(s, "blockquote", i) ||
1250 ((c = s[sizeof("blockquote") -1]) != '>' && !whitechar(c))){
/* Not a blockquote: undo the '/' adjustment before handing over */
1251 s -= isct;
1252 i += isct;
1253 goto jnotknown;
1256 if(!isct && !(self->hf_flags & _HF_NL_2))
1257 self = _hf_nl(self);
1258 if(!(self->hf_flags & _HF_NL_1))
1259 self = _hf_nl(self);
/* Adjust the counter in the _HF_BQUOTE_MASK bits: an opening tag increments
 * it unless it already equals the mask, a closing tag decrements it unless it
 * is zero; all other flag bits are carried over unchanged */
1260 f = self->hf_flags;
1261 f &= _HF_BQUOTE_MASK;
1262 if(!isct){
1263 if(f != _HF_BQUOTE_MASK)
1264 ++f;
1265 }else if(f > 0)
1266 --f;
1267 f |= (self->hf_flags & ~_HF_BQUOTE_MASK);
1268 self->hf_flags = f;
1269 goto jleave;
/* Known tag: perform the action of its table entry.  f is a snapshot of the
 * flags taken here; it is not refreshed after the _hf_nl() calls below */
1273 f = self->hf_flags;
1274 switch (hftp->hft_act) {
1275 case _HFSA_PRE_END:
1276 f &= ~_HF_PRE;
/* The if (0) body is only ever entered via the case label placed inside it,
 * so _HFSA_PRE_END clears _HF_PRE and _HFSA_PRE sets it, and both continue
 * with the store into hf_flags that follows */
1277 if (0) {
1278 /* FALLTHRU */
1279 case _HFSA_PRE:
1280 f |= _HF_PRE;
1282 self->hf_flags = f;
1283 /* FALLTHRU */
1285 case _HFSA_NEEDSEP:
1286 if (!(self->hf_flags & _HF_NL_2))
1287 self = _hf_nl(self);
1288 /* FALLTHRU */
1289 case _HFSA_NEEDNL:
1290 if (!(f & _HF_NL_1))
1291 self = _hf_nl(self);
/* Optional injection character: the low 7 bits are written, and if bit 0x80
 * is set a space is written after it */
1292 if (hftp->hft_injc != '\0') {
1293 self = _hf_putc(self, hftp->hft_injc & 0x7F);
1294 if ((uc_i)hftp->hft_injc & 0x80)
1295 self = _hf_putc(self, ' ');
1297 break;
/* Remember the tag and set _HF_IGN | _HF_NOPUT; _hf_add_data() clears both
 * again once it has seen the matching end tag */
1299 case _HFSA_IGN:
1300 self->hf_ign_tag = hftp;
1301 self->hf_flags = (f |= _HF_IGN | _HF_NOPUT);
1302 break;
/* Images are shown as "[" alt-text "]", or as "[IMG]" if _hf_param() left
 * param.s NULL for "alt"; entities within the text are expanded */
1304 case _HFSA_IMG:
1305 self = _hf_param(self, &param, "alt");
1306 self = _hf_putc(self, '[');
1307 if (param.s == NULL) {
1308 param.s = n_UNCONST("IMG");
1309 param.l = 3;
1310 goto jimg_put;
1311 } /* else */ if (memchr(param.s, '&', param.l) != NULL)
1312 self = _hf_expand_all_ents(self, &param);
1313 else
1314 jimg_put:
1315 self = _hf_putbuf(self, param.s, param.l);
1316 self = _hf_putc(self, ']');
1317 break;
/* Links: the href value is copied into a new node that is prepended to
 * self->hf_hrefs, and a "[N]" marker carrying the new link number is written */
1319 case _HFSA_HREF:
1320 self = _hf_param(self, &param, "href");
1321 /* Ignore non-external links */
1322 if (param.s != NULL && *param.s != '#') {
1323 struct htmlflt_href *hhp = smalloc(
1324 n_VSTRUCT_SIZEOF(struct htmlflt_href, hfh_dat) + param.l +1);
1326 hhp->hfh_next = self->hf_hrefs;
1327 hhp->hfh_no = ++self->hf_href_no;
1328 hhp->hfh_len = (ui32_t)param.l;
/* Only param.l bytes are copied; no terminating NUL is stored here */
1329 memcpy(hhp->hfh_dat, param.s, param.l);
1331 snprintf(nobuf, sizeof nobuf, "[%u]", hhp->hfh_no);
1332 self->hf_flags = (f |= _HF_HREF);
1333 self->hf_hrefs = hhp;
1334 self = _hf_puts(self, nobuf);
1335 } else
1336 self->hf_flags = (f &= ~_HF_HREF);
1337 break;
/* The end marker "[/N]" is only written if _HF_HREF is set, i.e. if the last
 * link start tag seen was one that got recorded */
1338 case _HFSA_HREF_END:
1339 if (f & _HF_HREF) {
1340 snprintf(nobuf, sizeof nobuf, "[/%u]", self->hf_href_no);
1341 self = _hf_puts(self, nobuf);
1343 break;
/* Any other action value: its low byte is written as a character; an action
 * of '\0' writes nothing */
1345 default:
1346 c = (char)(hftp->hft_act & 0xFF);
1347 self = _hf_putc(self, c);
1348 break;
1349 case '\0':
1350 break;
1352 jleave:
1353 NYD2_LEAVE;
1354 return self;
1356 /* The problem is that even invalid tagsoup is widely used, without real
1357 * searching i have seen e-mail address in <N@H.D> notation, and more.
1358 * To protect us a bit look around and possibly write the content as such */
1359 jnotknown:
1360 switch (*s) {
1361 case '!':
1362 case '?':
1363 /* Ignore <!DOCTYPE, <!-- comments, <? PIs.. */
1364 goto jleave;
1365 case '>':
1366 /* Print out an empty tag as such */
1367 if (s[1] == '\0') {
1368 --s;
1369 goto jput_as_is;
1371 break;
1372 case '/':
1373 ++s;
1374 break;
1375 default:
1376 break;
/* Inspect the name up to NUL, '>', whitespace or ':': as soon as it contains
 * a non-ASCII or punctuation character the buffer is written verbatim,
 * otherwise the unknown tag is dropped without any output */
1379 /* Also skip over : in order to suppress v:roundrect, w:anchorlock.. */
1380 while ((c = *s++) != '\0' && c != '>' && !whitechar(c) && c != ':')
1381 if (!asciichar(c) || punctchar(c)) {
1382 self = _hf_puts(self, self->hf_bdat);
1383 break;
1385 goto jleave;
1388 static struct htmlflt *
1389 _hf_check_ent(struct htmlflt *self, char const *s, size_t l)
1391 char nobuf[32];
1392 char const *s_save;
1393 size_t l_save;
1394 struct hf_ent const *hfep;
1395 size_t i;
1396 NYD2_ENTER;
1398 s_save = s;
1399 l_save = l;
1400 assert(*s == '&');
1401 assert(l > 0);
1402 /* False entities seen in the wild assert(s[l - 1] == ';'); */
1403 ++s;
1404 l -= 2;
1406 /* Numeric entity, or try named search */
1407 if (*s == '#') {
1408 i = (*++s == 'x' ? 16 : 10);
1410 if ((i != 16 || (++s, --l) > 0) && l < sizeof(nobuf)) {
1411 memcpy(nobuf, s, l);
1412 nobuf[l] = '\0';
1413 n_idec_uiz_cp(&i, nobuf, i, NULL);
1414 if (i <= 0x7F)
1415 self = _hf_putc(self, (char)i);
1416 else if (self->hf_flags & _HF_UTF8) {
1417 jputuni:
1418 l = n_utf32_to_utf8((ui32_t)i, nobuf);
1419 self = _hf_putbuf(self, nobuf, l);
1420 } else
1421 goto jeent;
1422 } else
1423 goto jeent;
1424 } else {
1425 ui32_t f = self->hf_flags, hf;
1427 for (hfep = _hf_ents; PTRCMP(hfep, <, _hf_ents + n_NELEM(_hf_ents));
1428 ++hfep)
1429 if (l == ((hf = hfep->hfe_flags) & _HFE_LENGTH_MASK) &&
1430 !strncmp(s, hfep->hfe_ent, l)) {
1431 if ((hf & _HFE_HAVE_UNI) && (f & _HF_UTF8)) {
1432 i = hfep->hfe_uni;
1433 goto jputuni;
1434 } else if (hf & _HFE_HAVE_CSTR)
1435 self = _hf_puts(self, hfep->hfe_cstr);
1436 else
1437 self = _hf_putc(self, hfep->hfe_c);
1438 goto jleave;
1440 jeent:
1441 self = _hf_putbuf(self, s_save, l_save);
1443 jleave:
1444 NYD2_LEAVE;
1445 return self;
1448 static ssize_t
1449 _hf_add_data(struct htmlflt *self, char const *dat, size_t len)
1451 char c, *cp, *cp_max;
1452 bool_t hot;
1453 ssize_t rv = 0;
1454 NYD_ENTER;
1456 /* Final put request? */
1457 if (dat == NULL) {
1458 if (self->hf_len > 0 || self->hf_hrefs != NULL) {
1459 self = _hf_dump(self);
1460 if (self->hf_hrefs != NULL)
1461 self = _hf_dump_hrefs(self);
1462 rv = 1;
1464 goto jleave;
1467 /* Always ensure some initial buffer */
1468 if ((cp = self->hf_curr) != NULL)
1469 cp_max = self->hf_bmax;
1470 else {
1471 cp = self->hf_curr = self->hf_bdat = smalloc(LINESIZE);
1472 cp_max = self->hf_bmax = cp + LINESIZE -1; /* (Always room for NUL!) */
1474 hot = (cp != self->hf_bdat);
1476 for (rv = (ssize_t)len; len > 0; --len) {
1477 ui32_t f = self->hf_flags;
1479 if (f & _HF_ERROR)
1480 break;
1481 c = *dat++;
1483 /* Soup is really weird, and scripts may contain almost anything (and
1484 * newer CSS standards are also cryptic): therefore prefix the _HF_IGN
1485 * test and walk until we see the required end tag */
1486 /* TODO For real safety _HF_IGN soup condome would also need to know
1487 * TODO about quoted strings so that 'var i = "</script>";' couldn't
1488 * TODO fool it! We really want this mode also for _HF_NOPUT to be
1489 * TODO able to *gracefully* detect the tag-closing '>', but then if
1490 * TODO that is a single mechanism we should have made it! */
1491 if (f & _HF_IGN) {
1492 struct htmlflt_tag const *hftp = self->hf_ign_tag;
1493 size_t i;
1495 if (c == '<') {
1496 hot = TRU1;
1497 jcp_reset:
1498 cp = self->hf_bdat;
1499 } else if (c == '>') {
1500 if (hot) {
1501 if ((i = PTR2SIZE(cp - self->hf_bdat)) > 1 &&
1502 --i == hftp->hft_len &&
1503 !ascncasecmp(self->hf_bdat + 1, hftp->hft_tag, i))
1504 self->hf_flags = (f &= ~(_HF_IGN | _HF_NOPUT));
1505 hot = FAL0;
1506 goto jcp_reset;
1508 } else if (hot) {
1509 *cp++ = c;
1510 i = PTR2SIZE(cp - self->hf_bdat);
1511 if ((i == 1 && c != '/') || --i > hftp->hft_len) {
1512 hot = FAL0;
1513 goto jcp_reset;
1516 } else switch (c) {
1517 case '<':
1518 /* People are using & without &amp;ing it, ditto <; be aware */
1519 if (f & (_HF_NOPUT | _HF_ENT)) {
1520 f &= ~_HF_ENT;
1521 /* Special case "<!--" buffer content to deal with really weird
1522 * things that can be done with "<!--[if gte mso 9]>" syntax */
1523 if (PTR2SIZE(cp - self->hf_bdat) != 4 ||
1524 memcmp(self->hf_bdat, "<!--", 4)) {
1525 self->hf_flags = f;
1526 *cp = '\0';
1527 self = _hf_puts(self, self->hf_bdat);
1528 f = self->hf_flags;
1531 cp = self->hf_bdat;
1532 *cp++ = c;
1533 self->hf_flags = (f |= _HF_NOPUT);
1534 break;
1535 case '>':
1536 /* Weird tagsoup around, do we actually parse a tag? */
1537 if (!(f & _HF_NOPUT))
1538 goto jdo_c;
1539 cp[0] = c;
1540 cp[1] = '\0';
1541 f &= ~(_HF_NOPUT | _HF_ENT);
1542 self->hf_flags = f;
1543 self = _hf_check_tag(self, self->hf_bdat);
1544 *(cp = self->hf_bdat) = '\0'; /* xxx extra safety */
1545 /* Quick hack to get rid of redundant newline after <pre> XXX */
1546 if (!(f & _HF_PRE) && (self->hf_flags & _HF_PRE) &&
1547 len > 1 && *dat == '\n')
1548 ++dat, --len;
1549 break;
1551 case '\r': /* TODO CR should be stripped in lower level!! (Only B64!?!) */
1552 break;
1553 case '\n':
1554 /* End of line is not considered unless we are in PRE section.
1555 * However, in _HF_NOPUT mode we must be aware of tagsoup which uses
1556 * newlines for separating parameters */
1557 if (f & _HF_NOPUT)
1558 goto jdo_c;
1559 self = (f & _HF_PRE) ? _hf_nl_force(self) : _hf_putc(self, ' ');
1560 break;
1562 case '\t':
1563 if (!(f & _HF_PRE))
1564 c = ' ';
1565 /* FALLTHRU */
1566 default:
1567 jdo_c:
1568 /* If not currently parsing a tag and bypassing normal output.. */
1569 if (!(f & _HF_NOPUT)) {
1570 if (cntrlchar(c))
1571 break;
1572 if (c == '&') {
1573 cp = self->hf_bdat;
1574 *cp++ = c;
1575 self->hf_flags = (f |= _HF_NOPUT | _HF_ENT);
1576 } else if (f & _HF_PRE) {
1577 self = _hf_putc_premode(self, c);
1578 self->hf_flags &= ~_HF_BLANK;
1579 } else
1580 self = _hf_putc(self, c);
1581 } else if ((f & _HF_ENT) && c == ';') {
1582 cp[0] = c;
1583 cp[1] = '\0';
1584 f &= ~(_HF_NOPUT | _HF_ENT);
1585 self->hf_flags = f;
1586 self = _hf_check_ent(self, self->hf_bdat,
1587 PTR2SIZE(cp + 1 - self->hf_bdat));
1588 } else {
1589 /* We may need to grow the buffer */
1590 if (PTRCMP(cp + 42/2, >=, cp_max)) {
1591 size_t i = PTR2SIZE(cp - self->hf_bdat),
1592 m = PTR2SIZE(self->hf_bmax - self->hf_bdat) + LINESIZE;
1594 cp = self->hf_bdat = srealloc(self->hf_bdat, m);
1595 self->hf_bmax = cp + m -1;
1596 self->hf_curr = (cp += i);
1598 *cp++ = c;
1602 self->hf_curr = cp;
1603 jleave:
1604 NYD_LEAVE;
1605 return (self->hf_flags & _HF_ERROR) ? -1 : rv;
/*
 * TODO Because we don't support filter chains yet this filter will be run
 * TODO in a dedicated subprocess, driven via a special Popen() mode
 */
1612 static bool_t __hf_hadpipesig;
1613 static void
1614 __hf_onpipe(int signo)
1616 NYD_X; /* Signal handler */
1617 n_UNUSED(signo);
1618 __hf_hadpipesig = TRU1;
1621 FL int
1622 htmlflt_process_main(void)
1624 char buf[BUFFER_SIZE];
1625 struct htmlflt hf;
1626 size_t i;
1627 int rv;
1628 NYD_ENTER;
1630 __hf_hadpipesig = FAL0;
1631 safe_signal(SIGPIPE, &__hf_onpipe);
1633 htmlflt_init(&hf);
1634 htmlflt_reset(&hf, n_stdout);
1636 for (;;) {
1637 if ((i = fread(buf, sizeof(buf[0]), n_NELEM(buf), n_stdin)) == 0) {
1638 rv = !feof(n_stdin);
1639 break;
1642 if ((rv = __hf_hadpipesig))
1643 break;
1644 /* Just use this directly.. */
1645 if (htmlflt_push(&hf, buf, i) < 0) {
1646 rv = 1;
1647 break;
1650 if (rv == 0 && htmlflt_flush(&hf) < 0)
1651 rv = 1;
1653 htmlflt_destroy(&hf);
1655 rv |= __hf_hadpipesig;
1656 NYD_LEAVE;
1657 return rv;
1660 FL void
1661 htmlflt_init(struct htmlflt *self)
1663 NYD_ENTER;
1664 /* (Rather redundant though) */
1665 memset(self, 0, sizeof *self);
1666 NYD_LEAVE;
1669 FL void
1670 htmlflt_destroy(struct htmlflt *self)
1672 NYD_ENTER;
1673 htmlflt_reset(self, NULL);
1674 NYD_LEAVE;
1677 FL void
1678 htmlflt_reset(struct htmlflt *self, FILE *f)
1680 struct htmlflt_href *hfhp;
1681 NYD_ENTER;
1683 while ((hfhp = self->hf_hrefs) != NULL) {
1684 self->hf_hrefs = hfhp->hfh_next;
1685 free(hfhp);
1688 if (self->hf_bdat != NULL)
1689 free(self->hf_bdat);
1690 if (self->hf_line != NULL)
1691 free(self->hf_line);
1693 memset(self, 0, sizeof *self);
1695 if (f != NULL) {
1696 ui32_t sw = n_MAX(_HF_MINLEN, (ui32_t)n_scrnwidth);
1698 self->hf_line = smalloc((size_t)sw * n_mb_cur_max +1);
1699 self->hf_lmax = sw;
1701 if (n_psonce & n_PSO_UNICODE) /* TODO not truly generic */
1702 self->hf_flags = _HF_UTF8;
1703 self->hf_os = f;
1705 NYD_LEAVE;
1708 FL ssize_t
1709 htmlflt_push(struct htmlflt *self, char const *dat, size_t len)
1711 ssize_t rv;
1712 NYD_ENTER;
1714 rv = _hf_add_data(self, dat, len);
1715 NYD_LEAVE;
1716 return rv;
1719 FL ssize_t
1720 htmlflt_flush(struct htmlflt *self)
1722 ssize_t rv;
1723 NYD_ENTER;
1725 rv = _hf_add_data(self, NULL, 0);
1726 rv |= !fflush(self->hf_os) ? 0 : -1;
1727 NYD_LEAVE;
1728 return rv;
1730 #endif /* HAVE_FILTER_HTML_TAGSOUP */
1732 /* s-it-mode */