is_addr_invalid(): do not say doesn't, say does not
[s-mailx.git] / filter.c
blobec9c38f59a838ef4a7f6236cf4b9c8d5a0f4325a
1 /*@ S-nail - a mail user agent derived from Berkeley Mail.
2 *@ Filter objects.
4 * Copyright (c) 2013 - 2018 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
6 * Permission to use, copy, modify, and/or distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 #undef n_FILE
19 #define n_FILE filter
21 #ifndef HAVE_AMALGAMATION
22 # include "nail.h"
23 #endif
/*
 * Quotation filter
 */

/*
 * TODO quotation filter: anticipate in future data: don't break if only WS
 * TODO or a LF escaping \ follows on the line (simply reuse the latter).
 */
34 #ifdef HAVE_QUOTE_FOLD
35 n_CTAV(n_QUOTE_MAX > 3);
/* States of the quote-fold state machine driven by quoteflt_push() */
enum qf_state {
   _QF_CLEAN,  /* Set by quoteflt_reset(); dispatched like _QF_PREFIX */
   _QF_PREFIX, /* Parsing the quote prefix at the beginning of a line */
   _QF_DATA    /* Collecting the data that follows the prefix */
};
/* Cursor passed to the state handlers: the filter plus the input that has
 * not yet been consumed; handlers write back .buf and .len on return */
struct qf_vc {
   struct quoteflt *self;  /* The filter being driven */
   char const *buf;        /* Next unconsumed input byte */
   size_t len;             /* Number of bytes left at .buf */
};
/* Write the indentation prefix followed by the current quote; returns the
 * number of bytes written or -1 on write error */
static ssize_t _qf_dump_prefix(struct quoteflt *self);

/* Append a single data character to the line buffer, folding as necessary */
static ssize_t _qf_add_data(struct quoteflt *self, wchar_t wc);

/* Handlers of the state machine, one per input state */
static ssize_t _qf_state_prefix(struct qf_vc *vc);
static ssize_t _qf_state_data(struct qf_vc *vc);
59 static ssize_t
60 _qf_dump_prefix(struct quoteflt *self)
62 ssize_t rv;
63 size_t i;
64 NYD_ENTER;
66 if ((i = self->qf_pfix_len) > 0 && i != fwrite(self->qf_pfix, 1, i,
67 self->qf_os))
68 goto jerr;
69 rv = i;
71 if ((i = self->qf_currq.l) > 0 && i != fwrite(self->qf_currq.s, 1, i,
72 self->qf_os))
73 goto jerr;
74 rv += i;
75 jleave:
76 NYD_LEAVE;
77 return rv;
78 jerr:
79 rv = -1;
80 goto jleave;
83 static ssize_t
84 _qf_add_data(struct quoteflt *self, wchar_t wc)
86 int w, l;
87 char *save_b;
88 ui32_t save_l, save_w;
89 ssize_t rv;
90 NYD_ENTER;
92 rv = 0;
93 save_l = save_w = 0; /* silence cc */
94 save_b = NULL;
96 /* <newline> ends state */
97 if (wc == L'\n') {
98 w = 0;
99 goto jflush;
101 if (wc == L'\r') /* TODO CR should be stripped in lower level!! */
102 goto jleave;
104 /* Unroll <tab> to spaces */
105 if (wc == L'\t') {
106 save_l = self->qf_datw;
107 save_w = (save_l + n_QUOTE_TAB_SPACES) & ~(n_QUOTE_TAB_SPACES - 1);
108 save_w -= save_l;
109 while (save_w-- > 0) {
110 ssize_t j = _qf_add_data(self, L' ');
111 if (j < 0) {
112 rv = j;
113 break;
115 rv += j;
117 goto jleave;
120 /* To avoid that the last visual excesses *qfold-max*, which may happen for
121 * multi-column characters, use w as an indicator for this and move that
122 * thing to the next line */
123 w = wcwidth(wc);
124 if (w == -1) {
125 w = 0;
126 jbad:
127 ++self->qf_datw;
128 self->qf_dat.s[self->qf_dat.l++] = '?';
129 } else if (self->qf_datw > self->qf_qfold_max - w) {
130 w = -1;
131 goto jneednl;
132 } else {
133 l = wctomb(self->qf_dat.s + self->qf_dat.l, wc);
134 if (l < 0)
135 goto jbad;
136 self->qf_datw += (ui32_t)w;
137 self->qf_dat.l += (size_t)l;
140 if (self->qf_datw >= self->qf_qfold_max) {
141 /* If we have seen a nice breakpoint during traversal, shuffle data
142 * around a bit so as to restore the trailing part after flushing */
143 jneednl:
144 if (self->qf_brkl > 0) {
145 save_w = self->qf_datw - self->qf_brkw;
146 save_l = self->qf_dat.l - self->qf_brkl;
147 save_b = self->qf_dat.s + self->qf_brkl + 2;
148 memmove(save_b, save_b - 2, save_l);
149 self->qf_dat.l = self->qf_brkl;
152 self->qf_dat.s[self->qf_dat.l++] = '\\';
153 jflush:
154 self->qf_dat.s[self->qf_dat.l++] = '\n';
155 rv = quoteflt_flush(self);
157 /* Restore takeovers, if any */
158 if (save_b != NULL) {
159 self->qf_brk_isws = FAL0;
160 self->qf_datw += save_w;
161 self->qf_dat.l = save_l;
162 memmove(self->qf_dat.s, save_b, save_l);
164 } else if (self->qf_datw >= self->qf_qfold_min && !self->qf_brk_isws) {
165 bool_t isws = (iswspace(wc) != 0);
167 if (isws || !self->qf_brk_isws || self->qf_brkl == 0) {
168 if((self->qf_brk_isws = isws) ||
169 self->qf_brkl < self->qf_qfold_maxnws){
170 self->qf_brkl = self->qf_dat.l;
171 self->qf_brkw = self->qf_datw;
176 /* Did we hold this back to avoid qf_fold_max excess? Then do it now */
177 if(rv >= 0 && w == -1){
178 ssize_t j = _qf_add_data(self, wc);
179 if(j < 0)
180 rv = j;
181 else
182 rv += j;
184 /* If state changed to prefix, perform full reset (note this implies that
185 * quoteflt_flush() performs too much work..) */
186 else if (wc == '\n') {
187 self->qf_state = _QF_PREFIX;
188 self->qf_wscnt = self->qf_datw = 0;
189 self->qf_currq.l = 0;
191 jleave:
192 NYD_LEAVE;
193 return rv;
196 static ssize_t
197 _qf_state_prefix(struct qf_vc *vc)
199 struct quoteflt *self;
200 ssize_t rv;
201 char const *buf;
202 size_t len, i;
203 wchar_t wc;
204 NYD_ENTER;
206 self = vc->self;
207 rv = 0;
209 for (buf = vc->buf, len = vc->len; len > 0;) {
210 /* xxx NULL BYTE! */
211 i = mbrtowc(&wc, buf, len, self->qf_mbps);
212 if (i == (size_t)-1) {
213 /* On hard error, don't modify mbstate_t and step one byte */
214 self->qf_mbps[0] = self->qf_mbps[1];
215 ++buf;
216 --len;
217 self->qf_wscnt = 0;
218 continue;
220 self->qf_mbps[1] = self->qf_mbps[0];
221 if (i == (size_t)-2) {
222 /* Redundant shift sequence, out of buffer */
223 len = 0;
224 break;
226 buf += i;
227 len -= i;
229 if (wc == L'\n')
230 goto jfin;
231 if (iswspace(wc)) {
232 ++self->qf_wscnt;
233 continue;
235 if (i == 1 && n_uasciichar(wc) &&
236 strchr(self->qf_quote_chars, (char)wc) != NULL){
237 self->qf_wscnt = 0;
238 if (self->qf_currq.l >= n_QUOTE_MAX - 3) {
239 self->qf_currq.s[n_QUOTE_MAX - 3] = '.';
240 self->qf_currq.s[n_QUOTE_MAX - 2] = '.';
241 self->qf_currq.s[n_QUOTE_MAX - 1] = '.';
242 self->qf_currq.l = n_QUOTE_MAX;
243 } else
244 self->qf_currq.s[self->qf_currq.l++] = buf[-1];
245 continue;
248 /* The quote is parsed and compressed; dump it */
249 jfin:
250 self->qf_state = _QF_DATA;
251 /* Overtake WS to the current quote in order to preserve it for eventual
252 * necessary follow lines, too */
253 /* TODO we de-facto "normalize" to ASCII SP here which MESSES tabs!! */
254 while (self->qf_wscnt-- > 0 && self->qf_currq.l < n_QUOTE_MAX)
255 self->qf_currq.s[self->qf_currq.l++] = ' ';
256 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
257 self->qf_wscnt = 0;
258 rv = _qf_add_data(self, wc);
259 break;
262 vc->buf = buf;
263 vc->len = len;
264 NYD_LEAVE;
265 return rv;
268 static ssize_t
269 _qf_state_data(struct qf_vc *vc)
271 struct quoteflt *self;
272 ssize_t rv;
273 char const *buf;
274 size_t len, i;
275 wchar_t wc;
276 NYD_ENTER;
278 self = vc->self;
279 rv = 0;
281 for (buf = vc->buf, len = vc->len; len > 0;) {
282 /* xxx NULL BYTE! */
283 i = mbrtowc(&wc, buf, len, self->qf_mbps);
284 if (i == (size_t)-1) {
285 /* On hard error, don't modify mbstate_t and step one byte */
286 self->qf_mbps[0] = self->qf_mbps[1];
287 ++buf;
288 --len;
289 continue;
291 self->qf_mbps[1] = self->qf_mbps[0];
292 if (i == (size_t)-2) {
293 /* Redundant shift sequence, out of buffer */
294 len = 0;
295 break;
297 buf += i;
298 len -= i;
300 { ssize_t j = _qf_add_data(self, wc);
301 if (j < 0) {
302 rv = j;
303 break;
305 rv += j;
308 if (self->qf_state != _QF_DATA)
309 break;
312 vc->buf = buf;
313 vc->len = len;
314 NYD_LEAVE;
315 return rv;
317 #endif /* HAVE_QUOTE_FOLD */
319 FL struct quoteflt *
320 quoteflt_dummy(void) /* TODO LEGACY (until filters are plugged when needed) */
322 static struct quoteflt qf_i;
324 qf_i.qf_bypass = TRU1;
325 return &qf_i;
328 FL void
329 quoteflt_init(struct quoteflt *self, char const *prefix, bool_t bypass)
331 #ifdef HAVE_QUOTE_FOLD
332 char const *xcp, *cp;
333 #endif
334 NYD_ENTER;
336 memset(self, 0, sizeof *self);
338 if ((self->qf_pfix = prefix) != NULL)
339 self->qf_pfix_len = (ui32_t)strlen(prefix);
340 self->qf_bypass = bypass;
342 /* Check whether the user wants the more fancy quoting algorithm */
343 /* TODO *quote-fold*: n_QUOTE_MAX may excess it! */
344 #ifdef HAVE_QUOTE_FOLD
345 if (!bypass && (cp = ok_vlook(quote_fold)) != NULL) {
346 ui32_t qmax, qmaxnws, qmin;
348 /* These magic values ensure we don't bail */
349 n_idec_ui32_cp(&qmax, cp, 10, &xcp);
350 if (qmax < self->qf_pfix_len + 6)
351 qmax = self->qf_pfix_len + 6;
352 qmaxnws = --qmax; /* The newline escape */
353 if (cp == xcp || *xcp == '\0')
354 qmin = (qmax >> 1) + (qmax >> 2) + (qmax >> 5);
355 else {
356 n_idec_ui32_cp(&qmin, &xcp[1], 10, &xcp);
357 if (qmin < qmax >> 1)
358 qmin = qmax >> 1;
359 else if (qmin > qmax - 2)
360 qmin = qmax - 2;
362 if (cp != xcp && *xcp != '\0') {
363 n_idec_ui32_cp(&qmaxnws, &xcp[1], 10, &xcp);
364 if (qmaxnws > qmax || qmaxnws < qmin)
365 qmaxnws = qmax;
368 self->qf_qfold_min = qmin;
369 self->qf_qfold_max = qmax;
370 self->qf_qfold_maxnws = qmaxnws;
371 self->qf_quote_chars = ok_vlook(quote_chars);
373 /* Add pad for takeover copies, reverse solidus and newline */
374 self->qf_dat.s = n_autorec_alloc((qmax + 3) * n_mb_cur_max);
375 self->qf_currq.s = n_autorec_alloc((n_QUOTE_MAX + 1) * n_mb_cur_max);
377 #endif
378 NYD_LEAVE;
381 FL void
382 quoteflt_destroy(struct quoteflt *self) /* xxx inline */
384 NYD_ENTER;
385 n_UNUSED(self);
386 NYD_LEAVE;
389 FL void
390 quoteflt_reset(struct quoteflt *self, FILE *f) /* xxx inline */
392 NYD_ENTER;
393 self->qf_os = f;
394 #ifdef HAVE_QUOTE_FOLD
395 self->qf_state = _QF_CLEAN;
396 self->qf_dat.l =
397 self->qf_currq.l = 0;
398 memset(self->qf_mbps, 0, sizeof self->qf_mbps);
399 #endif
400 NYD_LEAVE;
403 FL ssize_t
404 quoteflt_push(struct quoteflt *self, char const *dat, size_t len)
406 /* (xxx Ideally the actual push() [and flush()] would be functions on their
407 * xxx own, via indirect vtbl call ..) */
408 ssize_t rv = 0;
409 NYD_ENTER;
411 self->qf_nl_last = (len > 0 && dat[len - 1] == '\n'); /* TODO HACK */
413 if (len == 0)
414 goto jleave;
416 /* Bypass? TODO Finally, this filter simply should not be used, then
417 * (TODO It supercedes prefix_write() or something) */
418 if (self->qf_bypass) {
419 if (len != fwrite(dat, 1, len, self->qf_os))
420 goto jerr;
421 rv = len;
423 /* Normal: place *indentprefix* at every BOL */
424 else
425 #ifdef HAVE_QUOTE_FOLD
426 if (self->qf_qfold_max == 0)
427 #endif
429 void *vp;
430 size_t ll;
431 bool_t pxok = (self->qf_qfold_min != 0);
433 for (;;) {
434 if (!pxok && (ll = self->qf_pfix_len) > 0) {
435 if (ll != fwrite(self->qf_pfix, 1, ll, self->qf_os))
436 goto jerr;
437 rv += ll;
438 pxok = TRU1;
441 /* xxx Strictly speaking this is invalid, because only `/' and `.' are
442 * xxx mandated by POSIX.1-2008 as "invariant across all locales
443 * xxx supported"; though there is no charset known which uses this
444 * xxx control char as part of a multibyte character; note that S-nail
445 * XXX (and the Mail codebase as such) do not support EBCDIC */
446 if ((vp = memchr(dat, '\n', len)) == NULL)
447 ll = len;
448 else {
449 pxok = FAL0;
450 ll = PTR2SIZE((char*)vp - dat) + 1;
453 if (ll != fwrite(dat, sizeof *dat, ll, self->qf_os))
454 goto jerr;
455 rv += ll;
456 if ((len -= ll) == 0)
457 break;
458 dat += ll;
461 self->qf_qfold_min = pxok;
463 /* Overly complicated, though still only line-per-line: *quote-fold*.
464 * - If .qf_currq.l is 0, then we are in a clean state. Reset .qf_mbps;
465 * TODO note this means we assume that lines start with reset escape seq,
466 * TODO but i don't think this is any worse than what we currently do;
467 * TODO in 15.0, with the value carrier, we should carry conversion states
468 * TODO all along, only resetting on error (or at words for header =???=);
469 * TODO this still is weird for error handling, but we need to act more
470 * TODO stream-alike (though in practice i don't think cross-line states
471 * TODO can be found, because of compatibility reasons; however, being
472 * TODO a problem rather than a solution is not a good thing (tm))
473 * - Lookout for a newline */
474 #ifdef HAVE_QUOTE_FOLD
475 else {
476 struct qf_vc vc;
477 ssize_t i;
479 vc.self = self;
480 vc.buf = dat;
481 vc.len = len;
482 while (vc.len > 0) {
483 switch (self->qf_state) {
484 case _QF_CLEAN:
485 case _QF_PREFIX:
486 i = _qf_state_prefix(&vc);
487 break;
488 default: /* silence cc (`i' unused) */
489 case _QF_DATA:
490 i = _qf_state_data(&vc);
491 break;
493 if (i < 0)
494 goto jerr;
495 rv += i;
498 #endif /* HAVE_QUOTE_FOLD */
500 jleave:
501 NYD_LEAVE;
502 return rv;
503 jerr:
504 rv = -1;
505 goto jleave;
508 FL ssize_t
509 quoteflt_flush(struct quoteflt *self)
511 ssize_t rv = 0;
512 NYD_ENTER;
513 n_UNUSED(self);
515 #ifdef HAVE_QUOTE_FOLD
516 if (self->qf_dat.l > 0) {
517 rv = _qf_dump_prefix(self);
518 if (rv >= 0) {
519 size_t i = self->qf_dat.l;
520 if (i == fwrite(self->qf_dat.s, 1, i, self->qf_os))
521 rv += i;
522 else
523 rv = -1;
524 self->qf_dat.l = 0;
525 self->qf_brk_isws = FAL0;
526 self->qf_wscnt = self->qf_brkl = self->qf_brkw = 0;
527 self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
530 #endif
531 NYD_LEAVE;
532 return rv;
/*
 * HTML tagsoup filter TODO rewrite wchar_t based (require HAVE_C90AMEND1)
 * TODO . Numeric &#NO; entities should also be treated by struct hf_ent
 * TODO . Yes, we COULD support CSS based quoting when we'd check type="quote"
 * TODO   (nonstandard) and watch out for style="gmail_quote" (or so, VERY
 * TODO   nonstandard) and tracking a stack of such elements (to be popped
 * TODO   once the closing element is seen).  Then, after writing a newline,
 * TODO   place sizeof(stack) ">"s first.  But aren't these HTML mails rude?
 * TODO Interlocking and non-well-formed data will break us down
 */
545 #ifdef HAVE_FILTER_HTML_TAGSOUP
/* Line length limits of the HTML filter */
enum hf_limits {
   _HF_MINLEN = 10,  /* Minimum line length (can't really be smaller) */
   _HF_BRKSUB = 8    /* Start considering line break MAX - BRKSUB */
};
/* State flags of the HTML filter; the low 16 bits hold the blockquote
 * nesting value */
enum hf_flags {
   _HF_BQUOTE_MASK = 0xFFFFu,
   _HF_UTF8 = 1u<<16,   /* Data is in UTF-8 */
   _HF_ERROR = 1u<<17,  /* A hard error occurred, bail as soon as possible */
   _HF_NOPUT = 1u<<18,  /* (In a tag,) Don't generate output */
   _HF_IGN = 1u<<19,    /* Ignore mode on */
   _HF_ANY = 1u<<20,    /* Yet seen just any output */
   _HF_PRE = 1u<<21,    /* In <pre>formatted mode */
   _HF_ENT = 1u<<22,    /* Currently parsing an entity */
   _HF_BLANK = 1u<<23,  /* Whitespace last */
   _HF_HREF = 1u<<24,   /* External <a href=> was the last href seen */

   _HF_NL_1 = 1u<<25,   /* One \n seen */
   _HF_NL_2 = 2u<<25,   /* We have produced an all empty line */
   _HF_NL_MASK = _HF_NL_1 | _HF_NL_2
};
/* Negative action codes for struct htmlflt_tag.hft_act; any other value is
 * a plain character */
enum hf_special_actions {
   _HFSA_NEEDSEP = -1,     /* Need an empty line (paragraph separator) */
   _HFSA_NEEDNL = -2,      /* Need a new line start (table row) */
   _HFSA_IGN = -3,         /* Things like <style>..</style>, <script>.. */
   _HFSA_PRE = -4,         /* <pre>.. */
   _HFSA_PRE_END = -5,
   _HFSA_IMG = -6,         /* <img> */
   _HFSA_HREF = -7,        /* <a>.. */
   _HFSA_HREF_END = -8,
   _HFSA_BQUOTE = -9,      /* <blockquote>, interpreted as citation! */
   _HFSA_BQUOTE_END = -10
};
/* Bits of struct hf_ent.hfe_flags: the low six bits store the length of the
 * entity name, the upper two tell which replacements are available */
enum hf_entity_flags {
   _HFE_HAVE_UNI = 1<<6,   /* Have a Unicode replacement character */
   _HFE_HAVE_CSTR = 1<<7,  /* Have a string replacement */
   /* We store the length of the entity name in the flags, too */
   _HFE_LENGTH_MASK = (1<<6) - 1
};
589 struct htmlflt_href {
590 struct htmlflt_href *hfh_next;
591 ui32_t hfh_no; /* Running sequence */
592 ui32_t hfh_len; /* of .hfh_dat */
593 char hfh_dat[n_VFIELD_SIZE(0)];
596 struct htmlflt_tag {
597 si32_t hft_act; /* char or hf_special_actions */
598 /* Not NUL: character to inject, with high bit set: place a space
599 * afterwards. Note: only recognized with _HFSA_NEEDSEP or _HFSA_NEEDNL */
600 char hft_injc;
601 ui8_t hft_len; /* Useful bytes in (NUL terminated) .hft_tag */
602 char const hft_tag[10]; /* Tag less < and > surroundings (TR, /TR, ..) */
604 n_CTA(n_SIZEOF_FIELD(struct htmlflt_tag, hft_tag) < LINESIZE,
605 "Structure field too large a size"); /* .hf_ign_tag */
607 struct hf_ent {
608 ui8_t hfe_flags; /* enum hf_entity_flags plus length of .hfe_ent */
609 char hfe_c; /* Plain replacement character */
610 ui16_t hfe_uni; /* Unicode codepoint if _HFE_HAVE_UNI */
611 char hfe_cstr[5]; /* _HFE_HAVE_CSTR (e.g., &hellip; -> ...) */
612 char const hfe_ent[7]; /* Entity less & and ; surroundings */
615 /* Tag list; not binary searched :(, so try to take care a bit */
616 static struct htmlflt_tag const _hf_tags[] = {
617 # undef _X
618 # undef _XC
619 # define _X(S,A) {A, '\0', sizeof(S) -1, S "\0"}
620 # define _XC(S,C,A) {A, C, sizeof(S) -1, S "\0"}
622 # if 0 /* This is treated very special (to avoid wasting space in .hft_tag) */
623 _X("BLOCKQUOTE", _HFSA_BQUOTE), _X("/BLOCKQUOTE", _HFSA_BQUOTE_END),
624 # endif
626 _X("P", _HFSA_NEEDSEP), _X("/P", _HFSA_NEEDNL),
627 _X("DIV", _HFSA_NEEDSEP), _X("/DIV", _HFSA_NEEDNL),
628 _X("TR", _HFSA_NEEDNL),
629 _X("/TH", '\t'),
630 _X("/TD", '\t'),
631 /* Let it stand out; also since we don't support implicit paragraphs after
632 * block elements, plain running text after a list (seen in Unicode
633 * announcement via Firefox) */
634 _X("UL", _HFSA_NEEDSEP), _X("/UL", _HFSA_NEEDSEP),
635 _XC("LI", (char)0x80 | '*', _HFSA_NEEDSEP),
636 _X("DL", _HFSA_NEEDSEP),
637 _X("DT", _HFSA_NEEDNL),
639 _X("A", _HFSA_HREF), _X("/A", _HFSA_HREF_END),
640 _X("IMG", _HFSA_IMG),
641 _X("BR", '\n'),
642 _X("PRE", _HFSA_PRE), _X("/PRE", _HFSA_PRE_END),
643 _X("TITLE", _HFSA_NEEDSEP), /*_X("/TITLE", '\n'),*/
644 _X("H1", _HFSA_NEEDSEP), /*_X("/H1", '\n'),*/
645 _X("H2", _HFSA_NEEDSEP), /*_X("/H2", '\n'),*/
646 _X("H3", _HFSA_NEEDSEP), /*_X("/H3", '\n'),*/
647 _X("H4", _HFSA_NEEDSEP), /*_X("/H4", '\n'),*/
648 _X("H5", _HFSA_NEEDSEP), /*_X("/H5", '\n'),*/
649 _X("H6", _HFSA_NEEDSEP), /*_X("/H6", '\n'),*/
651 _X("STYLE", _HFSA_IGN),
652 _X("SCRIPT", _HFSA_IGN),
654 # undef _X
657 /* Entity list; not binary searched.. */
658 static struct hf_ent const _hf_ents[] = {
659 # undef _X
660 # undef _XU
661 # undef _XS
662 # undef _XUS
663 # define _X(E,C) {(sizeof(E) -1), C, 0x0u, "", E "\0"}
664 # define _XU(E,C,U) {(sizeof(E) -1) | _HFE_HAVE_UNI, C, U, "", E "\0"}
665 # define _XS(E,S) {(sizeof(E) -1) | _HFE_HAVE_CSTR, '\0', 0x0u,S "\0",E "\0"}
666 # define _XSU(E,S,U) \
667 {(sizeof(E) -1) | _HFE_HAVE_UNI | _HFE_HAVE_CSTR, '\0', U, S "\0", E "\0"}
669 _X("quot", '"'),
670 _X("amp", '&'),
671 _X("lt", '<'), _X("gt", '>'),
673 _XU("nbsp", ' ', 0x0020 /* Note: not 0x00A0 seems to be better for us */),
674 _XU("middot", '.', 0x00B7),
675 _XSU("hellip", "...", 0x2026),
676 _XSU("mdash", "---", 0x2014), _XSU("ndash", "--", 0x2013),
677 _XSU("laquo", "<<", 0x00AB), _XSU("raquo", ">>", 0x00BB),
678 _XSU("lsaquo", "<", 0x2039), _XSU("rsaquo", ">", 0x203A),
679 _XSU("lsquo", "'", 0x2018), _XSU("rsquo", "'", 0x2019),
680 _XSU("ldquo", "\"", 0x201C), _XSU("rdquo", "\"", 0x201D),
681 _XSU("uarr", "^|", 0x2191), _XSU("darr", "|v", 0x2193),
683 _XSU("cent", "CENT", 0x00A2),
684 _XSU("copy", "(C)", 0x00A9),
685 _XSU("euro", "EUR", 0x20AC),
686 _XSU("infin", "INFY", 0x221E),
687 _XSU("pound", "GBP", 0x00A3),
688 _XSU("reg", "(R)", 0x00AE),
689 _XSU("sect", "S:", 0x00A7),
690 _XSU("yen", "JPY", 0x00A5),
692 /* German umlauts */
693 _XSU("Auml", "Ae", 0x00C4), _XSU("auml", "ae", 0x00E4),
694 _XSU("Ouml", "Oe", 0x00D6), _XSU("ouml", "oe", 0x00F6),
695 _XSU("Uuml", "Ue", 0x00DC), _XSU("uuml", "ue", 0x00FC),
696 _XSU("szlig", "ss", 0x00DF)
698 # undef _X
699 # undef _XU
700 # undef _XS
701 # undef _XSU
/* Real output: write to the output stream / store into the line buffer */
static struct htmlflt * _hf_dump_hrefs(struct htmlflt *self);
static struct htmlflt * _hf_dump(struct htmlflt *self);
static struct htmlflt * _hf_store(struct htmlflt *self, char c);
# ifdef HAVE_NATCH_CHAR
static struct htmlflt * __hf_sync_mbstuff(struct htmlflt *self);
# endif

/* Virtual output: these track the newline and whitespace state */
static struct htmlflt * _hf_nl(struct htmlflt *self);
static struct htmlflt * _hf_nl_force(struct htmlflt *self);
static struct htmlflt * _hf_putc(struct htmlflt *self, char c);
static struct htmlflt * _hf_putc_premode(struct htmlflt *self, char c);
static struct htmlflt * _hf_puts(struct htmlflt *self, char const *cp);
static struct htmlflt * _hf_putbuf(struct htmlflt *self,
                           char const *cp, size_t len);

/* Try to locate a parameter in .hf_bdat, store it (non-terminated!) or NULL */
static struct htmlflt * _hf_param(struct htmlflt *self, struct str *store,
                           char const *param);

/* Expand all entities in the given parameter */
static struct htmlflt * _hf_expand_all_ents(struct htmlflt *self,
                           struct str const *param);

/* Completely parsed over a tag / an entity, interpret that */
static struct htmlflt * _hf_check_tag(struct htmlflt *self, char const *s);
static struct htmlflt * _hf_check_ent(struct htmlflt *self, char const *s,
                           size_t l);

/* Input handler */
static ssize_t _hf_add_data(struct htmlflt *self,
                  char const *dat, size_t len);
738 static struct htmlflt *
739 _hf_dump_hrefs(struct htmlflt *self)
741 struct htmlflt_href *hhp;
742 NYD2_ENTER;
744 if (!(self->hf_flags & _HF_NL_2) && putc('\n', self->hf_os) == EOF) {
745 self->hf_flags |= _HF_ERROR;
746 goto jleave;
749 /* Reverse the list */
750 for (hhp = self->hf_hrefs, self->hf_hrefs = NULL; hhp != NULL;) {
751 struct htmlflt_href *tmp = hhp->hfh_next;
752 hhp->hfh_next = self->hf_hrefs;
753 self->hf_hrefs = hhp;
754 hhp = tmp;
757 /* Then dump it */
758 while ((hhp = self->hf_hrefs) != NULL) {
759 self->hf_hrefs = hhp->hfh_next;
761 if (!(self->hf_flags & _HF_ERROR)) {
762 int w = fprintf(self->hf_os, " [%u] %.*s\n",
763 hhp->hfh_no, (int)hhp->hfh_len, hhp->hfh_dat);
764 if (w < 0)
765 self->hf_flags |= _HF_ERROR;
767 n_free(hhp);
770 self->hf_flags |= (putc('\n', self->hf_os) == EOF)
771 ? _HF_ERROR : _HF_NL_1 | _HF_NL_2;
772 self->hf_href_dist = (ui32_t)n_realscreenheight >> 1;
773 jleave:
774 NYD2_LEAVE;
775 return self;
778 static struct htmlflt *
779 _hf_dump(struct htmlflt *self)
781 ui32_t f, l;
782 char c, *cp;
783 NYD2_ENTER;
785 f = self->hf_flags & ~_HF_BLANK;
786 l = self->hf_len;
787 cp = self->hf_line;
788 self->hf_mbwidth = self->hf_mboff = self->hf_last_ws = self->hf_len = 0;
790 for (c = '\0'; l > 0; --l) {
791 c = *cp++;
792 jput:
793 if (putc(c, self->hf_os) == EOF) {
794 self->hf_flags = (f |= _HF_ERROR);
795 goto jleave;
799 if (c != '\n') {
800 f |= (f & _HF_NL_1) ? _HF_NL_2 : _HF_NL_1;
801 l = 1;
802 c = '\n';
803 goto jput;
805 self->hf_flags = f;
807 /* Check whether there are HREFs to dump; there is so much messy tagsoup out
808 * there that it seems best not to simply dump HREFs in each _dump(), but
809 * only with some gap, let's say half the real screen height */
810 if (--self->hf_href_dist < 0 && (f & _HF_NL_2) && self->hf_hrefs != NULL)
811 self = _hf_dump_hrefs(self);
812 jleave:
813 NYD2_LEAVE;
814 return self;
817 static struct htmlflt *
818 _hf_store(struct htmlflt *self, char c)
820 ui32_t l, i;
821 NYD2_ENTER;
823 assert(c != '\n');
825 l = self->hf_len;
826 if(n_UNLIKELY(l == 0) && (i = (self->hf_flags & _HF_BQUOTE_MASK)) != 0 &&
827 self->hf_lmax > _HF_MINLEN){
828 ui32_t len, j;
829 char const *ip;
831 ip = ok_vlook(indentprefix);
832 len = strlen(ip);
833 if(len == 0 || len >= _HF_MINLEN){
834 ip = " |"; /* XXX something from *quote-chars* */
835 len = sizeof(" |") -1;
838 self->hf_len = len;
839 for(j = len; j-- != 0;){
840 char x;
842 if((x = ip[j]) == '\t')
843 x = ' ';
844 self->hf_line[j] = x;
847 while(--i > 0 && self->hf_len < self->hf_lmax - _HF_BRKSUB)
848 self = _hf_store(self, '|'); /* XXX something from *quote-chars* */
850 l = self->hf_len;
853 self->hf_line[l] = (c == '\t' ? ' ' : c);
854 self->hf_len = ++l;
855 if (blankspacechar(c)) {
856 if (c == '\t') {
857 i = 8 - ((l - 1) & 7); /* xxx magic tab width of 8 */
858 if (i > 0) {
860 self = _hf_store(self, ' ');
861 while (--i > 0);
862 goto jleave;
865 self->hf_last_ws = l;
866 } else if (/*c == '.' ||*/ c == ',' || c == ';' || c == '-')
867 self->hf_last_ws = l;
869 i = l;
870 # ifdef HAVE_NATCH_CHAR /* XXX This code is really ridiculous! */
871 if (n_mb_cur_max > 1) { /* XXX should mbrtowc() and THEN store, at least */
872 wchar_t wc;
873 int w, x;
875 if((x = mbtowc(&wc, self->hf_line + self->hf_mboff, l - self->hf_mboff)
876 ) > 0){
877 if ((w = wcwidth(wc)) == -1 ||
878 /* Actively filter out L-TO-R and R-TO-R marks TODO ctext */
879 (wc == 0x200E || wc == 0x200F ||
880 (wc >= 0x202A && wc <= 0x202E)) ||
881 /* And some zero-width messes */
882 wc == 0x00AD || (wc >= 0x200B && wc <= 0x200D) ||
883 /* Oh about the ISO C wide character interfaces, baby! */
884 (wc == 0xFEFF)){
885 self->hf_len -= x;
886 goto jleave;
887 } else if (iswspace(wc))
888 self->hf_last_ws = l;
889 self->hf_mboff += x;
890 i = (self->hf_mbwidth += w);
891 } else {
892 if (x < 0) {
893 (void)mbtowc(&wc, NULL, n_mb_cur_max);
894 if (UICMP(32, l - self->hf_mboff, >=, n_mb_cur_max)) { /* XXX */
895 ++self->hf_mboff;
896 ++self->hf_mbwidth;
899 i = self->hf_mbwidth;
902 # endif
904 /* Do we need to break the line? */
905 if (i >= self->hf_lmax - _HF_BRKSUB) {
906 ui32_t f, lim;
909 /* Let's hope we saw a sane place to break this line! */
910 if (self->hf_last_ws >= (lim = self->hf_lmax >> 1)) {
911 jput:
912 i = self->hf_len = self->hf_last_ws;
913 self = _hf_dump(self);
914 if ((self->hf_len = (l -= i)) > 0) {
915 self->hf_flags &= ~_HF_NL_MASK;
916 memmove(self->hf_line, self->hf_line + i, l);
917 # ifdef HAVE_NATCH_CHAR
918 __hf_sync_mbstuff(self);
919 # endif
921 goto jleave;
924 /* Any 7-bit characters? */
925 f = self->hf_flags;
926 for (i = l; i-- >= lim;)
927 if (asciichar((c = self->hf_line[i]))) {
928 self->hf_last_ws = ++i;
929 goto jput;
930 } else if ((f & _HF_UTF8) && ((ui8_t)c & 0xC0) != 0x80) {
931 self->hf_last_ws = i;
932 goto jput;
935 /* Hard break necessary! xxx really badly done */
936 if (l >= self->hf_lmax - 1)
937 self = _hf_dump(self);
939 jleave:
940 NYD2_LEAVE;
941 return self;
# ifdef HAVE_NATCH_CHAR
/* Recompute .hf_mboff (bytes converted) and .hf_mbwidth (visual width) for
 * the .hf_len bytes in .hf_line.  Bytes that do not convert are skipped one
 * at a time and counted as one column */
static struct htmlflt *
__hf_sync_mbstuff(struct htmlflt *self)
{
   wchar_t wc;
   char const *cur;
   ui32_t off, width, left;
   NYD2_ENTER;

   cur = self->hf_line;
   off = width = 0;
   left = self->hf_len;

   /* Start with a reset conversion state */
   (void)mbtowc(&wc, NULL, n_mb_cur_max);

   while (left > 0) {
      int x = mbtowc(&wc, cur, left);

      if (x == 0)
         break;

      if (x > 0) {
         cur += x;
         left -= x;
         off += x;
         if ((x = wcwidth(wc)) == -1)
            x = 1;
         width += x;
         continue;
      }

      /* Bad, skip over a single character.. XXX very bad indeed */
      ++cur;
      ++off;
      ++width;
      --left;
      (void)mbtowc(&wc, NULL, n_mb_cur_max);
   }

   self->hf_mboff = off;
   self->hf_mbwidth = width;

   NYD2_LEAVE;
   return self;
}
# endif /* HAVE_NATCH_CHAR */
991 static struct htmlflt *
992 _hf_nl(struct htmlflt *self)
994 ui32_t f;
995 NYD2_ENTER;
997 if (!((f = self->hf_flags) & _HF_ERROR)) {
998 if (f & _HF_ANY) {
999 if ((f & _HF_NL_MASK) != _HF_NL_MASK)
1000 self = _hf_dump(self);
1001 } else
1002 self->hf_flags = (f |= _HF_NL_MASK);
1004 NYD2_LEAVE;
1005 return self;
1008 static struct htmlflt *
1009 _hf_nl_force(struct htmlflt *self)
1011 NYD2_ENTER;
1012 if (!(self->hf_flags & _HF_ERROR))
1013 self = _hf_dump(self);
1014 NYD2_LEAVE;
1015 return self;
1018 static struct htmlflt *
1019 _hf_putc(struct htmlflt *self, char c)
1021 ui32_t f;
1022 NYD2_ENTER;
1024 if ((f = self->hf_flags) & _HF_ERROR)
1025 goto jleave;
1027 if (c == '\n') {
1028 self = _hf_nl(self);
1029 goto jleave;
1030 } else if (c == ' ' || c == '\t') {
1031 if ((f & _HF_BLANK) || self->hf_len == 0)
1032 goto jleave;
1033 f |= _HF_BLANK;
1034 } else
1035 f &= ~_HF_BLANK;
1036 f &= ~_HF_NL_MASK;
1037 self->hf_flags = (f |= _HF_ANY);
1038 self = _hf_store(self, c);
1039 jleave:
1040 NYD2_LEAVE;
1041 return self;
1044 static struct htmlflt *
1045 _hf_putc_premode(struct htmlflt *self, char c)
1047 ui32_t f;
1048 NYD2_ENTER;
1050 if ((f = self->hf_flags) & _HF_ERROR) {
1052 } else if (c == '\n')
1053 self = _hf_nl_force(self);
1054 else {
1055 f &= ~_HF_NL_MASK;
1056 self->hf_flags = (f |= _HF_ANY);
1057 self = _hf_store(self, c);
1059 NYD2_LEAVE;
1060 return self;
1063 static struct htmlflt *
1064 _hf_puts(struct htmlflt *self, char const *cp)
1066 char c;
1067 NYD2_ENTER;
1069 while ((c = *cp++) != '\0')
1070 self = _hf_putc(self, c);
1071 NYD2_LEAVE;
1072 return self;
1075 static struct htmlflt *
1076 _hf_putbuf(struct htmlflt *self, char const *cp, size_t len)
1078 NYD2_ENTER;
1080 while (len-- > 0)
1081 self = _hf_putc(self, *cp++);
1082 NYD2_LEAVE;
1083 return self;
1086 static struct htmlflt *
1087 _hf_param(struct htmlflt *self, struct str *store, char const *param)
1089 char const *cp;
1090 char c, x, quote;
1091 size_t i;
1092 bool_t hot;
1093 NYD2_ENTER;
1095 store->s = NULL;
1096 store->l = 0;
1097 cp = self->hf_bdat;
1099 /* Skip over any non-WS first; be aware of soup, if it slipped through */
1100 for(;;){
1101 if((c = *cp++) == '\0' || c == '>')
1102 goto jleave;
1103 if(whitechar(c))
1104 break;
1107 /* Search for the parameter, take care of other quoting along the way */
1108 x = *param++;
1109 x = upperconv(x);
1110 i = strlen(param);
1112 for(hot = TRU1;;){
1113 if((c = *cp++) == '\0' || c == '>')
1114 goto jleave;
1115 if(whitechar(c)){
1116 hot = TRU1;
1117 continue;
1120 /* Could it be a parameter? */
1121 if(hot){
1122 hot = FAL0;
1124 /* Is it the desired one? */
1125 if((c = upperconv(c)) == x && !ascncasecmp(param, cp, i)){
1126 char const *cp2 = cp + i;
1128 if((quote = *cp2++) != '='){
1129 if(quote == '\0' || quote == '>')
1130 goto jleave;
1131 while(whitechar(quote))
1132 quote = *cp2++;
1134 if(quote == '='){
1135 cp = cp2;
1136 break;
1138 continue; /* XXX Optimize: i bytes or even cp2 can't be it! */
1142 /* Not the desired one; but a parameter? */
1143 if(c != '=')
1144 continue;
1145 /* If so, properly skip over the value */
1146 if((c = *cp++) == '"' || c == '\''){
1147 /* TODO i have forgotten whether reverse solidus quoting is allowed
1148 * TODO quoted HTML parameter values? not supporting that for now.. */
1149 for(quote = c; (c = *cp++) != '\0' && c != quote;)
1151 }else
1152 while(c != '\0' && !whitechar(c) && c != '>')
1153 c = *++cp;
1154 if(c == '\0')
1155 goto jleave;
1158 /* Skip further whitespace */
1159 for(;;){
1160 if((c = *cp++) == '\0' || c == '>')
1161 goto jleave;
1162 if(!whitechar(c))
1163 break;
1166 if(c == '"' || c == '\''){
1167 /* TODO i have forgotten whether reverse solisud quoting is allowed in
1168 * TODO quoted HTML parameter values? not supporting that for now.. */
1169 store->s = n_UNCONST(cp);
1170 for(quote = c; (c = *cp) != '\0' && c != quote; ++cp)
1172 /* XXX ... and we simply ignore a missing trailing " :> */
1173 }else{
1174 store->s = n_UNCONST(cp - 1);
1175 if(!whitechar(c))
1176 while((c = *cp) != '\0' && !whitechar(c) && c != '>')
1177 ++cp;
1179 i = PTR2SIZE(cp - store->s);
1181 /* Terrible tagsoup out there, e.g., groups.google.com produces href=""
1182 * parameter values prefixed and suffixed by newlines! Therefore trim the
1183 * value content TODO join into the parse step above! */
1184 for (cp = store->s; i > 0 && spacechar(*cp); ++cp, --i)
1186 store->s = n_UNCONST(cp);
1187 for (cp += i - 1; i > 0 && spacechar(*cp); --cp, --i)
1189 if ((store->l = i) == 0)
1190 store->s = NULL;
1191 jleave:
1192 NYD2_LEAVE;
1193 return self;
1196 static struct htmlflt *
1197 _hf_expand_all_ents(struct htmlflt *self, struct str const *param)
1199 char const *cp, *maxcp, *ep;
1200 char c;
1201 size_t i;
1202 NYD2_ENTER;
1204 for (cp = param->s, maxcp = cp + param->l; cp < maxcp;)
1205 if ((c = *cp++) != '&')
1206 jputc:
1207 self = _hf_putc(self, c);
1208 else {
1209 for (ep = cp--;;) {
1210 if (ep == maxcp || (c = *ep++) == '\0') {
1211 for (; cp < ep; ++cp)
1212 self = _hf_putc(self, *cp);
1213 goto jleave;
1214 } else if (c == ';') {
1215 if ((i = PTR2SIZE(ep - cp)) > 1) {
1216 self = _hf_check_ent(self, cp, i);
1217 break;
1218 } else {
1219 c = *cp++;
1220 goto jputc;
1224 cp = ep;
1226 jleave:
1227 NYD2_LEAVE;
1228 return self;
1231 static struct htmlflt *
1232 _hf_check_tag(struct htmlflt *self, char const *s)
1234 char nobuf[32], c;
1235 struct str param;
1236 size_t i;
1237 struct htmlflt_tag const *hftp;
1238 ui32_t f;
1239 NYD2_ENTER;
1241 /* Extra check only */
1242 assert(s != NULL);
1243 if (*s != '<') {
1244 DBG( n_alert("HTML tagsoup filter _hf_check_tag() called on soup!"); )
1245 jput_as_is:
1246 self = _hf_puts(self, self->hf_bdat);
1247 goto jleave;
1250 for (++s, i = 0; (c = s[i]) != '\0' && c != '>' && !whitechar(c); ++i)
1251 /* Special massage for things like <br/>: after the slash only whitespace
1252 * may separate us from the closing right angle! */
1253 if (c == '/') {
1254 size_t j = i + 1;
1256 while ((c = s[j]) != '\0' && c != '>' && whitechar(c))
1257 ++j;
1258 if (c == '>')
1259 break;
1262 for (hftp = _hf_tags;;) {
1263 if (i == hftp->hft_len && !ascncasecmp(s, hftp->hft_tag, i)) {
1264 c = s[hftp->hft_len];
1265 if (c == '>' || c == '/' || whitechar(c))
1266 break;
1268 if (n_UNLIKELY(PTRCMP(++hftp, >=, _hf_tags + n_NELEM(_hf_tags)))){
1269 /* A <blockquote> is very special xxx */
1270 bool_t isct;
1272 if((isct = (i > 1 && *s == '/'))){
1273 ++s;
1274 --i;
1277 if(i != sizeof("blockquote") -1 || ascncasecmp(s, "blockquote", i) ||
1278 ((c = s[sizeof("blockquote") -1]) != '>' && !whitechar(c))){
1279 s -= isct;
1280 i += isct;
1281 goto jnotknown;
1284 if(!isct && !(self->hf_flags & _HF_NL_2))
1285 self = _hf_nl(self);
1286 if(!(self->hf_flags & _HF_NL_1))
1287 self = _hf_nl(self);
1288 f = self->hf_flags;
1289 f &= _HF_BQUOTE_MASK;
1290 if(!isct){
1291 if(f != _HF_BQUOTE_MASK)
1292 ++f;
1293 }else if(f > 0)
1294 --f;
1295 f |= (self->hf_flags & ~_HF_BQUOTE_MASK);
1296 self->hf_flags = f;
1297 goto jleave;
1301 f = self->hf_flags;
1302 switch (hftp->hft_act) {
1303 case _HFSA_PRE_END:
1304 f &= ~_HF_PRE;
1305 if (0) {
1306 /* FALLTHRU */
1307 case _HFSA_PRE:
1308 f |= _HF_PRE;
1310 self->hf_flags = f;
1311 /* FALLTHRU */
1313 case _HFSA_NEEDSEP:
1314 if (!(self->hf_flags & _HF_NL_2))
1315 self = _hf_nl(self);
1316 /* FALLTHRU */
1317 case _HFSA_NEEDNL:
1318 if (!(f & _HF_NL_1))
1319 self = _hf_nl(self);
1320 if (hftp->hft_injc != '\0') {
1321 self = _hf_putc(self, hftp->hft_injc & 0x7F);
1322 if ((uc_i)hftp->hft_injc & 0x80)
1323 self = _hf_putc(self, ' ');
1325 break;
1327 case _HFSA_IGN:
1328 self->hf_ign_tag = hftp;
1329 self->hf_flags = (f |= _HF_IGN | _HF_NOPUT);
1330 break;
1332 case _HFSA_IMG:
1333 self = _hf_param(self, &param, "alt");
1334 self = _hf_putc(self, '[');
1335 if (param.s == NULL) {
1336 param.s = n_UNCONST("IMG");
1337 param.l = 3;
1338 goto jimg_put;
1339 } /* else */ if (memchr(param.s, '&', param.l) != NULL)
1340 self = _hf_expand_all_ents(self, &param);
1341 else
1342 jimg_put:
1343 self = _hf_putbuf(self, param.s, param.l);
1344 self = _hf_putc(self, ']');
1345 break;
1347 case _HFSA_HREF:
1348 self = _hf_param(self, &param, "href");
1349 /* Ignore non-external links */
1350 if (param.s != NULL && *param.s != '#') {
1351 struct htmlflt_href *hhp = n_alloc(
1352 n_VSTRUCT_SIZEOF(struct htmlflt_href, hfh_dat) + param.l +1);
1354 hhp->hfh_next = self->hf_hrefs;
1355 hhp->hfh_no = ++self->hf_href_no;
1356 hhp->hfh_len = (ui32_t)param.l;
1357 memcpy(hhp->hfh_dat, param.s, param.l);
1359 snprintf(nobuf, sizeof nobuf, "[%u]", hhp->hfh_no);
1360 self->hf_flags = (f |= _HF_HREF);
1361 self->hf_hrefs = hhp;
1362 self = _hf_puts(self, nobuf);
1363 } else
1364 self->hf_flags = (f &= ~_HF_HREF);
1365 break;
1366 case _HFSA_HREF_END:
1367 if (f & _HF_HREF) {
1368 snprintf(nobuf, sizeof nobuf, "[/%u]", self->hf_href_no);
1369 self = _hf_puts(self, nobuf);
1371 break;
1373 default:
1374 c = (char)(hftp->hft_act & 0xFF);
1375 self = _hf_putc(self, c);
1376 break;
1377 case '\0':
1378 break;
1380 jleave:
1381 NYD2_LEAVE;
1382 return self;
1384 /* The problem is that even invalid tagsoup is widely used, without real
1385 * searching i have seen e-mail address in <N@H.D> notation, and more.
1386 * To protect us a bit look around and possibly write the content as such */
1387 jnotknown:
1388 switch (*s) {
1389 case '!':
1390 case '?':
1391 /* Ignore <!DOCTYPE, <!-- comments, <? PIs.. */
1392 goto jleave;
1393 case '>':
1394 /* Print out an empty tag as such */
1395 if (s[1] == '\0') {
1396 --s;
1397 goto jput_as_is;
1399 break;
1400 case '/':
1401 ++s;
1402 break;
1403 default:
1404 break;
1407 /* Also skip over : in order to suppress v:roundrect, w:anchorlock.. */
1408 while ((c = *s++) != '\0' && c != '>' && !whitechar(c) && c != ':')
1409 if (!asciichar(c) || punctchar(c)) {
1410 self = _hf_puts(self, self->hf_bdat);
1411 break;
1413 goto jleave;
1416 static struct htmlflt *
1417 _hf_check_ent(struct htmlflt *self, char const *s, size_t l)
1419 char nobuf[32];
1420 char const *s_save;
1421 size_t l_save;
1422 struct hf_ent const *hfep;
1423 size_t i;
1424 NYD2_ENTER;
1426 s_save = s;
1427 l_save = l;
1428 assert(*s == '&');
1429 assert(l > 0);
1430 /* False entities seen in the wild assert(s[l - 1] == ';'); */
1431 ++s;
1432 l -= 2;
1434 /* Numeric entity, or try named search */
1435 if (*s == '#') {
1436 i = (*++s == 'x' ? 16 : 10);
1438 if ((i != 16 || (++s, --l) > 0) && l < sizeof(nobuf)) {
1439 memcpy(nobuf, s, l);
1440 nobuf[l] = '\0';
1441 n_idec_uiz_cp(&i, nobuf, i, NULL);
1442 if (i <= 0x7F)
1443 self = _hf_putc(self, (char)i);
1444 else if (self->hf_flags & _HF_UTF8) {
1445 jputuni:
1446 l = n_utf32_to_utf8((ui32_t)i, nobuf);
1447 self = _hf_putbuf(self, nobuf, l);
1448 } else
1449 goto jeent;
1450 } else
1451 goto jeent;
1452 } else {
1453 ui32_t f = self->hf_flags, hf;
1455 for (hfep = _hf_ents; PTRCMP(hfep, <, _hf_ents + n_NELEM(_hf_ents));
1456 ++hfep)
1457 if (l == ((hf = hfep->hfe_flags) & _HFE_LENGTH_MASK) &&
1458 !strncmp(s, hfep->hfe_ent, l)) {
1459 if ((hf & _HFE_HAVE_UNI) && (f & _HF_UTF8)) {
1460 i = hfep->hfe_uni;
1461 goto jputuni;
1462 } else if (hf & _HFE_HAVE_CSTR)
1463 self = _hf_puts(self, hfep->hfe_cstr);
1464 else
1465 self = _hf_putc(self, hfep->hfe_c);
1466 goto jleave;
1468 jeent:
1469 self = _hf_putbuf(self, s_save, l_save);
1471 jleave:
1472 NYD2_LEAVE;
1473 return self;
1476 static ssize_t
1477 _hf_add_data(struct htmlflt *self, char const *dat, size_t len)
1479 char c, *cp, *cp_max;
1480 bool_t hot;
1481 ssize_t rv = 0;
1482 NYD_ENTER;
1484 /* Final put request? */
1485 if (dat == NULL) {
1486 if (self->hf_len > 0 || self->hf_hrefs != NULL) {
1487 self = _hf_dump(self);
1488 if (self->hf_hrefs != NULL)
1489 self = _hf_dump_hrefs(self);
1490 rv = 1;
1492 goto jleave;
1495 /* Always ensure some initial buffer */
1496 if ((cp = self->hf_curr) != NULL)
1497 cp_max = self->hf_bmax;
1498 else {
1499 cp = self->hf_curr = self->hf_bdat = n_alloc(LINESIZE);
1500 cp_max = self->hf_bmax = cp + LINESIZE -1; /* (Always room for NUL!) */
1502 hot = (cp != self->hf_bdat);
1504 for (rv = (ssize_t)len; len > 0; --len) {
1505 ui32_t f = self->hf_flags;
1507 if (f & _HF_ERROR)
1508 break;
1509 c = *dat++;
1511 /* Soup is really weird, and scripts may contain almost anything (and
1512 * newer CSS standards are also cryptic): therefore prefix the _HF_IGN
1513 * test and walk until we see the required end tag */
1514 /* TODO For real safety _HF_IGN soup condome would also need to know
1515 * TODO about quoted strings so that 'var i = "</script>";' couldn't
1516 * TODO fool it! We really want this mode also for _HF_NOPUT to be
1517 * TODO able to *gracefully* detect the tag-closing '>', but then if
1518 * TODO that is a single mechanism we should have made it! */
1519 if (f & _HF_IGN) {
1520 struct htmlflt_tag const *hftp = self->hf_ign_tag;
1521 size_t i;
1523 if (c == '<') {
1524 hot = TRU1;
1525 jcp_reset:
1526 cp = self->hf_bdat;
1527 } else if (c == '>') {
1528 if (hot) {
1529 if ((i = PTR2SIZE(cp - self->hf_bdat)) > 1 &&
1530 --i == hftp->hft_len &&
1531 !ascncasecmp(self->hf_bdat + 1, hftp->hft_tag, i))
1532 self->hf_flags = (f &= ~(_HF_IGN | _HF_NOPUT));
1533 hot = FAL0;
1534 goto jcp_reset;
1536 } else if (hot) {
1537 *cp++ = c;
1538 i = PTR2SIZE(cp - self->hf_bdat);
1539 if ((i == 1 && c != '/') || --i > hftp->hft_len) {
1540 hot = FAL0;
1541 goto jcp_reset;
1544 } else switch (c) {
1545 case '<':
1546 /* People are using & without &amp;ing it, ditto <; be aware */
1547 if (f & (_HF_NOPUT | _HF_ENT)) {
1548 f &= ~_HF_ENT;
1549 /* Special case "<!--" buffer content to deal with really weird
1550 * things that can be done with "<!--[if gte mso 9]>" syntax */
1551 if (PTR2SIZE(cp - self->hf_bdat) != 4 ||
1552 memcmp(self->hf_bdat, "<!--", 4)) {
1553 self->hf_flags = f;
1554 *cp = '\0';
1555 self = _hf_puts(self, self->hf_bdat);
1556 f = self->hf_flags;
1559 cp = self->hf_bdat;
1560 *cp++ = c;
1561 self->hf_flags = (f |= _HF_NOPUT);
1562 break;
1563 case '>':
1564 /* Weird tagsoup around, do we actually parse a tag? */
1565 if (!(f & _HF_NOPUT))
1566 goto jdo_c;
1567 cp[0] = c;
1568 cp[1] = '\0';
1569 f &= ~(_HF_NOPUT | _HF_ENT);
1570 self->hf_flags = f;
1571 self = _hf_check_tag(self, self->hf_bdat);
1572 *(cp = self->hf_bdat) = '\0'; /* xxx extra safety */
1573 /* Quick hack to get rid of redundant newline after <pre> XXX */
1574 if (!(f & _HF_PRE) && (self->hf_flags & _HF_PRE) &&
1575 len > 1 && *dat == '\n')
1576 ++dat, --len;
1577 break;
1579 case '\r': /* TODO CR should be stripped in lower level!! (Only B64!?!) */
1580 break;
1581 case '\n':
1582 /* End of line is not considered unless we are in PRE section.
1583 * However, in _HF_NOPUT mode we must be aware of tagsoup which uses
1584 * newlines for separating parameters */
1585 if (f & _HF_NOPUT)
1586 goto jdo_c;
1587 self = (f & _HF_PRE) ? _hf_nl_force(self) : _hf_putc(self, ' ');
1588 break;
1590 case '\t':
1591 if (!(f & _HF_PRE))
1592 c = ' ';
1593 /* FALLTHRU */
1594 default:
1595 jdo_c:
1596 /* If not currently parsing a tag and bypassing normal output.. */
1597 if (!(f & _HF_NOPUT)) {
1598 if (cntrlchar(c))
1599 break;
1600 if (c == '&') {
1601 cp = self->hf_bdat;
1602 *cp++ = c;
1603 self->hf_flags = (f |= _HF_NOPUT | _HF_ENT);
1604 } else if (f & _HF_PRE) {
1605 self = _hf_putc_premode(self, c);
1606 self->hf_flags &= ~_HF_BLANK;
1607 } else
1608 self = _hf_putc(self, c);
1609 } else if ((f & _HF_ENT) && c == ';') {
1610 cp[0] = c;
1611 cp[1] = '\0';
1612 f &= ~(_HF_NOPUT | _HF_ENT);
1613 self->hf_flags = f;
1614 self = _hf_check_ent(self, self->hf_bdat,
1615 PTR2SIZE(cp + 1 - self->hf_bdat));
1616 } else {
1617 /* We may need to grow the buffer */
1618 if (PTRCMP(cp + 42/2, >=, cp_max)) {
1619 size_t i = PTR2SIZE(cp - self->hf_bdat),
1620 m = PTR2SIZE(self->hf_bmax - self->hf_bdat) + LINESIZE;
1622 cp = self->hf_bdat = n_realloc(self->hf_bdat, m);
1623 self->hf_bmax = cp_max = &cp[m -1];
1624 self->hf_curr = (cp += i);
1626 *cp++ = c;
1630 self->hf_curr = cp;
1631 jleave:
1632 NYD_LEAVE;
1633 return (self->hf_flags & _HF_ERROR) ? -1 : rv;
1637 * TODO Because we don't support filter chains yet this filter will be run
1638 * TODO in a dedicated subprocess, driven via a special Popen() mode
1640 static bool_t __hf_hadpipesig;
1641 static void
1642 __hf_onpipe(int signo)
1644 NYD_X; /* Signal handler */
1645 n_UNUSED(signo);
1646 __hf_hadpipesig = TRU1;
1649 FL int
1650 htmlflt_process_main(void)
1652 char buf[BUFFER_SIZE];
1653 struct htmlflt hf;
1654 size_t i;
1655 int rv;
1656 NYD_ENTER;
1658 __hf_hadpipesig = FAL0;
1659 safe_signal(SIGPIPE, &__hf_onpipe);
1661 htmlflt_init(&hf);
1662 htmlflt_reset(&hf, n_stdout);
1664 for (;;) {
1665 if ((i = fread(buf, sizeof(buf[0]), n_NELEM(buf), n_stdin)) == 0) {
1666 rv = !feof(n_stdin);
1667 break;
1670 if ((rv = __hf_hadpipesig))
1671 break;
1672 /* Just use this directly.. */
1673 if (htmlflt_push(&hf, buf, i) < 0) {
1674 rv = 1;
1675 break;
1678 if (rv == 0 && htmlflt_flush(&hf) < 0)
1679 rv = 1;
1681 htmlflt_destroy(&hf);
1683 rv |= __hf_hadpipesig;
1684 NYD_LEAVE;
1685 return rv;
1688 FL void
1689 htmlflt_init(struct htmlflt *self)
1691 NYD_ENTER;
1692 /* (Rather redundant though) */
1693 memset(self, 0, sizeof *self);
1694 NYD_LEAVE;
1697 FL void
1698 htmlflt_destroy(struct htmlflt *self)
1700 NYD_ENTER;
1701 htmlflt_reset(self, NULL);
1702 NYD_LEAVE;
1705 FL void
1706 htmlflt_reset(struct htmlflt *self, FILE *f)
1708 struct htmlflt_href *hfhp;
1709 NYD_ENTER;
1711 while ((hfhp = self->hf_hrefs) != NULL) {
1712 self->hf_hrefs = hfhp->hfh_next;
1713 n_free(hfhp);
1716 if (self->hf_bdat != NULL)
1717 n_free(self->hf_bdat);
1718 if (self->hf_line != NULL)
1719 n_free(self->hf_line);
1721 memset(self, 0, sizeof *self);
1723 if (f != NULL) {
1724 ui32_t sw = n_MAX(_HF_MINLEN, (ui32_t)n_scrnwidth);
1726 self->hf_line = n_alloc((size_t)sw * n_mb_cur_max +1);
1727 self->hf_lmax = sw;
1729 if (n_psonce & n_PSO_UNICODE) /* TODO not truly generic */
1730 self->hf_flags = _HF_UTF8;
1731 self->hf_os = f;
1733 NYD_LEAVE;
1736 FL ssize_t
1737 htmlflt_push(struct htmlflt *self, char const *dat, size_t len)
1739 ssize_t rv;
1740 NYD_ENTER;
1742 rv = _hf_add_data(self, dat, len);
1743 NYD_LEAVE;
1744 return rv;
1747 FL ssize_t
1748 htmlflt_flush(struct htmlflt *self)
1750 ssize_t rv;
1751 NYD_ENTER;
1753 rv = _hf_add_data(self, NULL, 0);
1754 rv |= !fflush(self->hf_os) ? 0 : -1;
1755 NYD_LEAVE;
1756 return rv;
1758 #endif /* HAVE_FILTER_HTML_TAGSOUP */
1760 /* s-it-mode */