1 /*@ S-nail - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2013 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
6 * Permission to use, copy, modify, and/or distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
21 #ifndef HAVE_AMALGAMATION
30 * TODO quotation filter: anticipate in future data: don't break if only WS
31 * TODO or a LF escaping \ follows on the line (simply reuse the latter).
34 #ifdef HAVE_QUOTE_FOLD
44 struct quoteflt
*self
;
49 /* Print out prefix and current quote */
50 static ssize_t
_qf_dump_prefix(struct quoteflt
*self
);
52 /* Add one data character */
53 static ssize_t
_qf_add_data(struct quoteflt
*self
, wchar_t wc
);
55 /* State machine handlers */
56 static ssize_t
_qf_state_prefix(struct qf_vc
*vc
);
57 static ssize_t
_qf_state_data(struct qf_vc
*vc
);
60 _qf_dump_prefix(struct quoteflt
*self
)
66 if ((i
= self
->qf_pfix_len
) > 0 && i
!= fwrite(self
->qf_pfix
, 1, i
,
71 if ((i
= self
->qf_currq
.l
) > 0 && i
!= fwrite(self
->qf_currq
.s
, 1, i
,
84 _qf_add_data(struct quoteflt
*self
, wchar_t wc
)
87 ui32_t save_l
, save_w
;
92 save_l
= save_w
= 0; /* silence cc */
94 /* <newline> ends state */
97 if (wc
== L
'\r') /* TODO CR should be stripped in lower level!! */
100 /* Unroll <tab> to spaces */
102 save_l
= self
->qf_datw
;
103 save_w
= (save_l
+ QUOTE_TAB_SPACES
) & ~(QUOTE_TAB_SPACES
- 1);
105 while (save_w
-- > 0) {
106 ssize_t j
= _qf_add_data(self
, L
' ');
120 self
->qf_dat
.s
[self
->qf_dat
.l
++] = '?';
122 l
= wctomb(self
->qf_dat
.s
+ self
->qf_dat
.l
, wc
);
125 self
->qf_datw
+= (ui32_t
)w
;
126 self
->qf_dat
.l
+= (size_t)l
;
129 /* TODO The last visual may exceed (adjusted!) *qfold-max* if it's a wide;
130 * TODO place it on the next line, break before */
131 if (self
->qf_datw
>= self
->qf_qfold_max
) {
132 /* If we have seen a nice breakpoint during traversal, shuffle data
133 * around a bit so as to restore the trailing part after flushing */
134 if (self
->qf_brkl
> 0) {
135 save_w
= self
->qf_datw
- self
->qf_brkw
;
136 save_l
= self
->qf_dat
.l
- self
->qf_brkl
;
137 save_b
= self
->qf_dat
.s
+ self
->qf_brkl
+ 2;
138 memmove(save_b
, save_b
- 2, save_l
);
139 self
->qf_dat
.l
= self
->qf_brkl
;
142 self
->qf_dat
.s
[self
->qf_dat
.l
++] = '\\';
144 self
->qf_dat
.s
[self
->qf_dat
.l
++] = '\n';
145 rv
= quoteflt_flush(self
);
147 /* Restore takeovers, if any */
148 if (save_b
!= NULL
) {
149 self
->qf_brk_isws
= FAL0
;
150 self
->qf_datw
+= save_w
;
151 self
->qf_dat
.l
= save_l
;
152 memmove(self
->qf_dat
.s
, save_b
, save_l
);
154 } else if (self
->qf_datw
>= self
->qf_qfold_min
&& !self
->qf_brk_isws
) {
155 bool_t isws
= iswspace(wc
);
157 if ((isws
&& !self
->qf_brk_isws
) || self
->qf_brkl
== 0) {
158 self
->qf_brkl
= self
->qf_dat
.l
;
159 self
->qf_brkw
= self
->qf_datw
;
160 self
->qf_brk_isws
= isws
;
164 /* If state changed to prefix, perform full reset (note this implies that
165 * quoteflt_flush() performs too much work..) */
167 self
->qf_state
= _QF_PREFIX
;
168 self
->qf_wscnt
= self
->qf_datw
= 0;
169 self
->qf_currq
.l
= 0;
177 _qf_state_prefix(struct qf_vc
*vc
)
179 struct quoteflt
*self
;
189 for (buf
= vc
->buf
, len
= vc
->len
; len
> 0;) {
191 i
= mbrtowc(&wc
, buf
, len
, self
->qf_mbps
);
192 if (i
== (size_t)-1) {
193 /* On hard error, don't modify mbstate_t and step one byte */
194 self
->qf_mbps
[0] = self
->qf_mbps
[1];
200 self
->qf_mbps
[1] = self
->qf_mbps
[0];
201 if (i
== (size_t)-2) {
202 /* Redundant shift sequence, out of buffer */
215 if (i
== 1 && ISQUOTE(wc
)) {
217 if (self
->qf_currq
.l
>= QUOTE_MAX
- 3) {
218 self
->qf_currq
.s
[QUOTE_MAX
- 3] = '.';
219 self
->qf_currq
.s
[QUOTE_MAX
- 2] = '.';
220 self
->qf_currq
.s
[QUOTE_MAX
- 1] = '.';
221 self
->qf_currq
.l
= QUOTE_MAX
;
223 self
->qf_currq
.s
[self
->qf_currq
.l
++] = buf
[-1];
227 /* The quote is parsed and compressed; dump it */
229 self
->qf_state
= _QF_DATA
;
230 /* Overtake WS to the current quote in order to preserve it for eventual
231 * necessary follow lines, too */
232 /* TODO we de-facto "normalize" to ASCII SP here which MESSES tabs!! */
233 while (self
->qf_wscnt
-- > 0 && self
->qf_currq
.l
< QUOTE_MAX
)
234 self
->qf_currq
.s
[self
->qf_currq
.l
++] = ' ';
235 self
->qf_datw
= self
->qf_pfix_len
+ self
->qf_currq
.l
;
237 rv
= _qf_add_data(self
, wc
);
248 _qf_state_data(struct qf_vc
*vc
)
250 struct quoteflt
*self
;
260 for (buf
= vc
->buf
, len
= vc
->len
; len
> 0;) {
262 i
= mbrtowc(&wc
, buf
, len
, self
->qf_mbps
);
263 if (i
== (size_t)-1) {
264 /* On hard error, don't modify mbstate_t and step one byte */
265 self
->qf_mbps
[0] = self
->qf_mbps
[1];
270 self
->qf_mbps
[1] = self
->qf_mbps
[0];
271 if (i
== (size_t)-2) {
272 /* Redundant shift sequence, out of buffer */
279 { ssize_t j
= _qf_add_data(self
, wc
);
287 if (self
->qf_state
!= _QF_DATA
)
296 #endif /* HAVE_QUOTE_FOLD */
299 quoteflt_dummy(void) /* TODO LEGACY (until filters are plugged when needed) */
301 static struct quoteflt qf_i
;
307 quoteflt_init(struct quoteflt
*self
, char const *prefix
)
309 #ifdef HAVE_QUOTE_FOLD
314 memset(self
, 0, sizeof *self
);
316 if ((self
->qf_pfix
= prefix
) != NULL
)
317 self
->qf_pfix_len
= (ui32_t
)strlen(prefix
);
319 /* Check whether the user wants the more fancy quoting algorithm */
320 /* TODO *quote-fold*: QUOTE_MAX may exceed it! */
321 #ifdef HAVE_QUOTE_FOLD
322 if (self
->qf_pfix_len
> 0 && (cp
= ok_vlook(quote_fold
)) != NULL
) {
323 ui32_t qmin
, qmax
= (ui32_t
)strtol(cp
, &xcp
, 10);
324 /* These magic values ensure we don't bail :) */
325 if (qmax
< self
->qf_pfix_len
+ 6)
326 qmax
= self
->qf_pfix_len
+ 6;
327 --qmax
; /* The newline escape */
328 if (cp
== xcp
|| *xcp
== '\0')
329 qmin
= (qmax
>> 1) + (qmax
>> 2) + (qmax
>> 5);
331 qmin
= (ui32_t
)strtol(xcp
+ 1, NULL
, 10);
332 if (qmin
< qmax
>> 1)
334 else if (qmin
> qmax
- 2)
337 self
->qf_qfold_min
= qmin
;
338 self
->qf_qfold_max
= qmax
;
340 /* Add pad for takeover copies, backslash and newline */
341 self
->qf_dat
.s
= salloc((qmax
+ 3) * mb_cur_max
);
342 self
->qf_currq
.s
= salloc((QUOTE_MAX
+ 1) * mb_cur_max
);
349 quoteflt_destroy(struct quoteflt
*self
) /* xxx inline */
357 quoteflt_reset(struct quoteflt
*self
, FILE *f
) /* xxx inline */
361 #ifdef HAVE_QUOTE_FOLD
362 self
->qf_state
= _QF_CLEAN
;
364 self
->qf_currq
.l
= 0;
365 memset(self
->qf_mbps
, 0, sizeof self
->qf_mbps
);
371 quoteflt_push(struct quoteflt
*self
, char const *dat
, size_t len
)
373 /* (xxx Ideally the actual push() [and flush()] would be functions on their
374 * xxx own, via indirect vtbl call ..) */
381 /* Bypass? XXX Finally, this filter simply should not be used, then */
382 if (self
->qf_pfix_len
== 0) {
383 if (len
!= fwrite(dat
, 1, len
, self
->qf_os
))
387 /* Normal: place *indentprefix* at every BOL */
389 #ifdef HAVE_QUOTE_FOLD
390 if (self
->qf_qfold_max
== 0)
395 bool_t pxok
= (self
->qf_qfold_min
!= 0);
399 ll
= self
->qf_pfix_len
;
400 if (ll
!= fwrite(self
->qf_pfix
, 1, ll
, self
->qf_os
))
406 /* xxx Strictly speaking this is invalid, because only `/' and `.' are
407 * xxx mandated by POSIX.1-2008 as "invariant across all locales
408 * xxx supported"; though there is no charset known which uses this
409 * xxx control char as part of a multibyte character; note that S-nail
410 * XXX (and the Mail codebase as such) do not support EBCDIC */
411 if ((vp
= memchr(dat
, '\n', len
)) == NULL
)
415 ll
= PTR2SIZE((char*)vp
- dat
) + 1;
418 if (ll
!= fwrite(dat
, sizeof *dat
, ll
, self
->qf_os
))
421 if ((len
-= ll
) == 0)
426 self
->qf_qfold_min
= pxok
;
428 /* Overly complicated, though still only line-per-line: *quote-fold*.
429 * - If .qf_currq.l is 0, then we are in a clean state. Reset .qf_mbps;
430 * TODO note this means we assume that lines start with reset escape seq,
431 * TODO but i don't think this is any worse than what we currently do;
432 * TODO in 15.0, with the value carrier, we should carry conversion states
433 * TODO all along, only resetting on error (or at words for header =???=);
434 * TODO this still is weird for error handling, but we need to act more
435 * TODO stream-alike (though in practice i don't think cross-line states
436 * TODO can be found, because of compatibility reasons; however, being
437 * TODO a problem rather than a solution is not a good thing (tm))
438 * - Look out for a newline */
439 #ifdef HAVE_QUOTE_FOLD
448 switch (self
->qf_state
) {
451 i
= _qf_state_prefix(&vc
);
453 default: /* silence cc (`i' unused) */
455 i
= _qf_state_data(&vc
);
463 #endif /* HAVE_QUOTE_FOLD */
474 quoteflt_flush(struct quoteflt
*self
)
480 #ifdef HAVE_QUOTE_FOLD
481 if (self
->qf_dat
.l
> 0) {
482 rv
= _qf_dump_prefix(self
);
484 size_t i
= self
->qf_dat
.l
;
485 if (i
== fwrite(self
->qf_dat
.s
, 1, i
, self
->qf_os
))
490 self
->qf_brk_isws
= FAL0
;
491 self
->qf_wscnt
= self
->qf_brkl
= self
->qf_brkw
= 0;
492 self
->qf_datw
= self
->qf_pfix_len
+ self
->qf_currq
.l
;
501 * HTML tagsoup filter
502 * TODO . Numeric &#NO; entities should also be treated by struct hf_ent
503 * TODO . Yes, we COULD support CSS based quoting when we'd check type="quote"
504 * TODO (nonstandard) and watch out for style="gmail_quote" (or so, VERY
505 * TODO nonstandard) and tracking a stack of such elements (to be popped
506 * TODO once the closing element is seen). Then, after writing a newline,
507 * TODO place sizeof(stack) ">"s first. But aren't these HTML mails rude?
508 * TODO Interlocking and non-well-formed data will break us down
510 #ifdef HAVE_FILTER_HTML_TAGSOUP
513 _HF_MINLEN
= 10, /* Minimum line length (can't really be smaller) */
514 _HF_BRKSUB
= 8 /* Start considering line break MAX - BRKSUB */
518 _HF_UTF8
= 1<<0, /* Data is in UTF-8 */
519 _HF_ERROR
= 1<<1, /* A hard error occurred, bail as soon as possible */
520 _HF_NOPUT
= 1<<2, /* (In a tag,) Don't generate output */
521 _HF_IGN
= 1<<3, /* Ignore mode on */
522 _HF_ANY
= 1<<4, /* Yet seen just any output */
523 _HF_PRE
= 1<<5, /* In <pre>formatted mode */
524 _HF_ENT
= 1<<6, /* Currently parsing an entity */
525 _HF_BLANK
= 1<<7, /* Whitespace last */
526 _HF_HREF
= 1<<8, /* External <a href=> was the last href seen */
528 _HF_NL_1
= 1<<9, /* One \n seen */
529 _HF_NL_2
= 2<<9, /* We have produced an all empty line */
530 _HF_NL_MASK
= _HF_NL_1
| _HF_NL_2
533 enum hf_special_actions
{
534 _HFSA_NEEDSEP
= -1, /* Need an empty line (paragraph separator) */
535 _HFSA_NEEDNL
= -2, /* Need a new line start (table row) */
536 _HFSA_IGN
= -3, /* Things like <style>..</style>, <script>.. */
537 _HFSA_PRE
= -4, /* <pre>.. */
539 _HFSA_IMG
= -6, /* <img> */
540 _HFSA_HREF
= -7, /* <a>.. */
544 enum hf_entity_flags
{
545 _HFE_HAVE_UNI
= 1<<6, /* Have a Unicode replacement character */
546 _HFE_HAVE_CSTR
= 1<<7, /* Have a string replacement */
547 /* We store the length of the entity name in the flags, too */
548 _HFE_LENGTH_MASK
= (1<<6) - 1
551 struct htmlflt_href
{
552 struct htmlflt_href
*hfh_next
;
553 ui32_t hfh_no
; /* Running sequence */
554 ui32_t hfh_len
; /* of .hfh_dat */
555 char hfh_dat
[VFIELD_SIZE(0)];
559 si32_t hft_act
; /* char or hf_special_actions */
560 ui8_t hft_len
; /* Useful bytes in (NUL terminated) .hft_tag */
561 char const hft_tag
[11]; /* Tag less < and > surroundings (TR, /TR, ..) */
563 CTA(SIZEOF_FIELD(struct htmlflt_tag
, hft_tag
) < LINESIZE
); /* .hf_ign_tag */
566 ui8_t hfe_flags
; /* enum hf_entity_flags plus length of .hfe_ent */
567 char hfe_c
; /* Plain replacement character */
568 ui16_t hfe_uni
; /* Unicode codepoint if _HFE_HAVE_UNI */
569 char hfe_cstr
[5]; /* _HFE_HAVE_CSTR (e.g., … -> ...) */
570 char const hfe_ent
[7]; /* Entity less & and ; surroundings */
573 /* Tag list; not binary searched :(, so try to take care a bit */
574 static struct htmlflt_tag
const _hf_tags
[] = {
576 # define _X(S,A) { A, sizeof(S) -1, S }
578 _X("P", _HFSA_NEEDSEP
), _X("/P", _HFSA_NEEDNL
),
579 _X("DIV", _HFSA_NEEDSEP
), _X("/DIV", _HFSA_NEEDNL
),
580 _X("TR", _HFSA_NEEDNL
),
583 _X("A", _HFSA_HREF
), _X("/A", _HFSA_HREF_END
),
584 _X("IMG", _HFSA_IMG
),
585 _X("IT", _HFSA_NEEDNL
),
587 _X("PRE", _HFSA_PRE
), _X("/PRE", _HFSA_PRE_END
),
588 _X("DL", _HFSA_NEEDSEP
),
589 _X("DT", _HFSA_NEEDNL
),
590 _X("TITLE", _HFSA_NEEDSEP
), /*_X("/TITLE", '\n'),*/
591 _X("H1", _HFSA_NEEDSEP
), /*_X("/H1", '\n'),*/
592 _X("H2", _HFSA_NEEDSEP
), /*_X("/H2", '\n'),*/
593 _X("H3", _HFSA_NEEDSEP
), /*_X("/H3", '\n'),*/
594 _X("H4", _HFSA_NEEDSEP
), /*_X("/H4", '\n'),*/
595 _X("H5", _HFSA_NEEDSEP
), /*_X("/H5", '\n'),*/
596 _X("H6", _HFSA_NEEDSEP
), /*_X("/H6", '\n'),*/
598 _X("STYLE", _HFSA_IGN
),
599 _X("SCRIPT", _HFSA_IGN
),
604 /* Entity list; not binary searched.. */
605 static struct hf_ent
const _hf_ents
[] = {
610 # define _X(E,C) {(sizeof(E) -1), C, 0x0u, "", E}
611 # define _XU(E,C,U) {(sizeof(E) -1) | _HFE_HAVE_UNI, C, U, "", E}
612 # define _XS(E,S) {(sizeof(E) -1) | _HFE_HAVE_CSTR, '\0', 0x0u, S "\0", E}
613 # define _XSU(E,S,U) \
614 {(sizeof(E) -1) | _HFE_HAVE_UNI | _HFE_HAVE_CSTR, '\0', U, S "\0", E}
618 _X("lt", '<'), _X("gt", '>'),
620 _XU("nbsp", ' ', 0x0020 /* Note: not 0x00A0 seems to be better for us */),
621 _XSU("hellip", "...", 0x2026),
622 _XSU("mdash", "---", 0x2014), _XSU("ndash", "--", 0x2013),
623 _XSU("laquo", "<<", 0x00AB), _XSU("raquo", ">>", 0x00BB),
624 _XSU("lsaquo", "<", 0x2039), _XSU("rsaquo", ">", 0x203A),
625 _XSU("lsquo", "'", 0x2018), _XSU("rsquo", "'", 0x2019),
626 _XSU("ldquo", "\"", 0x201C), _XSU("rdquo", "\"", 0x201D),
627 _XSU("uarr", "^|", 0x2191), _XSU("darr", "|v", 0x2193),
629 _XSU("cent", "CENT", 0x00A2),
630 _XSU("copy", "(C)", 0x00A9),
631 _XSU("euro", "EUR", 0x20AC),
632 _XSU("infin", "INFY", 0x221E),
633 _XSU("pound", "GBP", 0x00A3),
634 _XSU("reg", "(R)", 0x00AE),
635 _XSU("sect", "S:", 0x00A7),
636 _XSU("yen", "JPY", 0x00A5),
639 _XSU("Auml", "Ae", 0x00C4), _XSU("auml", "ae", 0x00E4),
640 _XSU("Ouml", "Oe", 0x00D6), _XSU("ouml", "oe", 0x00F6),
641 _XSU("Uuml", "Ue", 0x00DC), _XSU("uuml", "ue", 0x00FC),
642 _XSU("szlig", "ss", 0x00DF)
651 static struct htmlflt
* _hf_dump_hrefs(struct htmlflt
*self
);
652 static struct htmlflt
* _hf_dump(struct htmlflt
*self
);
653 static struct htmlflt
* _hf_store(struct htmlflt
*self
, char c
);
654 # ifdef HAVE_NATCH_CHAR
655 static struct htmlflt
* __hf_sync_mbstuff(struct htmlflt
*self
);
659 static struct htmlflt
* _hf_nl(struct htmlflt
*self
);
660 static struct htmlflt
* _hf_nl_force(struct htmlflt
*self
);
661 static struct htmlflt
* _hf_putc(struct htmlflt
*self
, char c
);
662 static struct htmlflt
* _hf_putc_premode(struct htmlflt
*self
, char c
);
663 static struct htmlflt
* _hf_puts(struct htmlflt
*self
, char const *cp
);
664 static struct htmlflt
* _hf_putbuf(struct htmlflt
*self
,
665 char const *cp
, size_t len
);
667 /* Try to locate a param'eter in .hf_bdat, store it (non-terminated!) or NULL */
668 static struct htmlflt
* _hf_param(struct htmlflt
*self
, struct str
*store
,
671 /* Expand all entities in the given parameter */
672 static struct htmlflt
* _hf_expand_all_ents(struct htmlflt
*self
,
673 struct str
const *param
);
675 /* Completely parsed over a tag / an entity, interpret that */
676 static struct htmlflt
* _hf_check_tag(struct htmlflt
*self
, char const *s
);
677 static struct htmlflt
* _hf_check_ent(struct htmlflt
*self
, char const *s
,
681 static ssize_t
_hf_add_data(struct htmlflt
*self
,
682 char const *dat
, size_t len
);
684 static struct htmlflt
*
685 _hf_dump_hrefs(struct htmlflt
*self
)
687 struct htmlflt_href
*hhp
;
690 if (!(self
->hf_flags
& _HF_NL_2
) && putc('\n', self
->hf_os
) == EOF
) {
691 self
->hf_flags
|= _HF_ERROR
;
695 /* Reverse the list */
696 for (hhp
= self
->hf_hrefs
, self
->hf_hrefs
= NULL
; hhp
!= NULL
;) {
697 struct htmlflt_href
*tmp
= hhp
->hfh_next
;
698 hhp
->hfh_next
= self
->hf_hrefs
;
699 self
->hf_hrefs
= hhp
;
704 while ((hhp
= self
->hf_hrefs
) != NULL
) {
705 self
->hf_hrefs
= hhp
->hfh_next
;
707 if (!(self
->hf_flags
& _HF_ERROR
)) {
708 int w
= fprintf(self
->hf_os
, " [%u] %.*s\n",
709 hhp
->hfh_no
, (int)hhp
->hfh_len
, hhp
->hfh_dat
);
711 self
->hf_flags
|= _HF_ERROR
;
716 self
->hf_flags
|= (putc('\n', self
->hf_os
) == EOF
)
717 ? _HF_ERROR
: _HF_NL_1
| _HF_NL_2
;
718 self
->hf_href_dist
= (ui32_t
)realscreenheight
>> 1;
724 static struct htmlflt
*
725 _hf_dump(struct htmlflt
*self
)
731 f
= self
->hf_flags
& ~_HF_BLANK
;
734 self
->hf_mbwidth
= self
->hf_mboff
= self
->hf_last_ws
= self
->hf_len
= 0;
736 for (c
= '\0'; l
> 0; --l
) {
739 if (putc(c
, self
->hf_os
) == EOF
) {
740 self
->hf_flags
= (f
|= _HF_ERROR
);
746 f
|= (f
& _HF_NL_1
) ? _HF_NL_2
: _HF_NL_1
;
753 /* Check whether there are HREFs to dump; there is so much messy tagsoup out
754 * there that it seems best not to simply dump HREFs in each _dump(), but
755 * only with some gap, let's say half the real screen height */
756 if (--self
->hf_href_dist
< 0 && (f
& _HF_NL_2
) && self
->hf_hrefs
!= NULL
)
757 self
= _hf_dump_hrefs(self
);
763 static struct htmlflt
*
764 _hf_store(struct htmlflt
*self
, char c
)
773 self
->hf_line
[l
] = (c
== '\t' ? ' ' : c
);
775 if (blankspacechar(c
)) {
777 i
= 8 - ((l
- 1) & 7); /* xxx magic tab width of 8 */
780 self
= _hf_store(self
, ' ');
785 self
->hf_last_ws
= l
;
786 } else if (/*c == '.' ||*/ c
== ',' || c
== ';' || c
== '-')
787 self
->hf_last_ws
= l
;
790 # ifdef HAVE_NATCH_CHAR /* XXX This code is really ridiculous! */
791 if (mb_cur_max
> 1) { /* XXX should mbrtowc() and THEN store, at least.. */
793 int x
= mbtowc(&wc
, self
->hf_line
+ self
->hf_mboff
, l
- self
->hf_mboff
);
797 if ((x
= wcwidth(wc
)) == -1)
799 else if (iswspace(wc
))
800 self
->hf_last_ws
= l
;
801 i
= (self
->hf_mbwidth
+= x
);
804 mbtowc(&wc
, NULL
, mb_cur_max
);
805 if (UICMP(32, l
- self
->hf_mboff
, >=, mb_cur_max
)) { /* XXX */
810 i
= self
->hf_mbwidth
;
815 /* Do we need to break the line? */
816 if (i
>= self
->hf_lmax
- _HF_BRKSUB
) {
817 ui32_t lim
= self
->hf_lmax
>> 1;
819 /* Let's hope we saw a sane place to break this line! */
820 if (self
->hf_last_ws
>= lim
) {
822 i
= self
->hf_len
= self
->hf_last_ws
;
823 self
= _hf_dump(self
);
824 if ((self
->hf_len
= (l
-= i
)) > 0) {
825 self
->hf_flags
&= ~_HF_NL_MASK
;
826 memmove(self
->hf_line
, self
->hf_line
+ i
, l
);
827 # ifdef HAVE_NATCH_CHAR
828 __hf_sync_mbstuff(self
);
834 /* Any 7-bit characters? */
835 for (i
= l
; i
-- >= lim
;)
836 if (asciichar((c
= self
->hf_line
[i
]))) {
837 self
->hf_last_ws
= ++i
;
839 } else if ((f
& _HF_UTF8
) && ((ui8_t
)c
& 0xC0) != 0x80) {
840 self
->hf_last_ws
= i
;
844 /* Hard break necessary! xxx really badly done */
845 if (l
>= self
->hf_lmax
- 1)
846 self
= _hf_dump(self
);
853 # ifdef HAVE_NATCH_CHAR
854 static struct htmlflt
*
855 __hf_sync_mbstuff(struct htmlflt
*self
)
868 int x
= mbtowc(&wc
, b
, l
);
877 if ((x
= wcwidth(wc
)) == -1)
883 /* Bad, skip over a single character.. XXX very bad indeed */
889 mbtowc(&wc
, NULL
, mb_cur_max
);
893 self
->hf_mbwidth
= w
;
898 # endif /* HAVE_NATCH_CHAR */
900 static struct htmlflt
*
901 _hf_nl(struct htmlflt
*self
)
906 if (!((f
= self
->hf_flags
) & _HF_ERROR
)) {
908 if ((f
& _HF_NL_MASK
) != _HF_NL_MASK
)
909 self
= _hf_dump(self
);
911 self
->hf_flags
= (f
|= _HF_NL_MASK
);
917 static struct htmlflt
*
918 _hf_nl_force(struct htmlflt
*self
)
921 if (!(self
->hf_flags
& _HF_ERROR
))
922 self
= _hf_dump(self
);
927 static struct htmlflt
*
928 _hf_putc(struct htmlflt
*self
, char c
)
933 if ((f
= self
->hf_flags
) & _HF_ERROR
)
939 } else if (c
== ' ' || c
== '\t') {
940 if ((f
& _HF_BLANK
) || self
->hf_len
== 0)
946 self
->hf_flags
= (f
|= _HF_ANY
);
947 self
= _hf_store(self
, c
);
953 static struct htmlflt
*
954 _hf_putc_premode(struct htmlflt
*self
, char c
)
959 if ((f
= self
->hf_flags
) & _HF_ERROR
) {
961 } else if (c
== '\n')
962 self
= _hf_nl_force(self
);
965 self
->hf_flags
= (f
|= _HF_ANY
);
966 self
= _hf_store(self
, c
);
972 static struct htmlflt
*
973 _hf_puts(struct htmlflt
*self
, char const *cp
)
978 while ((c
= *cp
++) != '\0')
979 self
= _hf_putc(self
, c
);
984 static struct htmlflt
*
985 _hf_putbuf(struct htmlflt
*self
, char const *cp
, size_t len
)
990 self
= _hf_putc(self
, *cp
++);
995 static struct htmlflt
*
996 _hf_param(struct htmlflt
*self
, struct str
*store
, char const *param
)
1005 if ((cp
= UNCONST(asccasestr(self
->hf_bdat
, param
))) == NULL
)
1007 cp
+= strlen(param
);
1010 if ((c
= *cp
++) == '\0')
1015 if ((c
= *cp
) == '\0')
1018 if (c
== '"' || c
== '\'') {
1021 /* TODO oops i have forgotten whether backslash quoting is allowed in
1022 * TODO quoted HTML parameter values? not supporting that for now.. */
1023 if ((c
= *++cp
) == '\0' || c
== quote
)
1027 while ((c
= *++cp
) != '\0' && c
!= quote
)
1029 /* XXX ... and we simply ignore missing trailing " :> */
1032 while ((c
= *++cp
) != '\0' && !whitechar(c
))
1036 i
= PTR2SIZE(cp
- store
->s
);
1038 /* Terrible tagsoup out there, e.g., groups.google.com produces href=""
1039 * parameter values prefixed and suffixed by newlines! Therefore trim the
1040 * value content TODO join into the parse step above! */
1041 for (cp
= store
->s
; i
> 0 && spacechar(*cp
); ++cp
, --i
)
1044 for (cp
+= i
- 1; i
> 0 && spacechar(*cp
); --cp
, --i
)
1046 if ((store
->l
= i
) == 0)
1053 static struct htmlflt
*
1054 _hf_expand_all_ents(struct htmlflt
*self
, struct str
const *param
)
1056 char const *cp
, *maxcp
, *ep
;
1061 for (cp
= param
->s
, maxcp
= cp
+ param
->l
; cp
< maxcp
;)
1062 if ((c
= *cp
++) != '&')
1063 self
= _hf_putc(self
, c
);
1065 for (ep
= cp
--; ep
< maxcp
&& (c
= *ep
++) != ';';)
1067 self
= _hf_puts(self
, cp
);
1070 if ((i
= PTR2SIZE(ep
- cp
)) > 1)
1071 self
= _hf_check_ent(self
, cp
, i
);
1079 static struct htmlflt
*
1080 _hf_check_tag(struct htmlflt
*self
, char const *s
)
1085 struct htmlflt_tag
const *hftp
;
1089 /* Extra check only */
1092 DBG( n_alert("HTML tagsoup filter _hf_check_tag() called on soup!"); )
1094 self
= _hf_puts(self
, self
->hf_bdat
);
1098 for (++s
, i
= 0; (c
= s
[i
]) != '\0' && c
!= '>' && !whitechar(c
); ++i
)
1099 /* Special massage for things like <br/>: after the slash only whitespace
1100 * may separate us from the closing right angle! */
1104 while ((c
= s
[j
]) != '\0' && c
!= '>' && whitechar(c
))
1110 for (hftp
= _hf_tags
;;) {
1111 if (i
== hftp
->hft_len
&& !ascncasecmp(s
, hftp
->hft_tag
, i
)) {
1112 c
= s
[hftp
->hft_len
];
1113 if (c
== '>' || c
== '/' || whitechar(c
))
1116 if (PTRCMP(++hftp
, >=, _hf_tags
+ NELEM(_hf_tags
)))
1121 switch (hftp
->hft_act
) {
1133 if (!(self
->hf_flags
& _HF_NL_2
))
1134 self
= _hf_nl(self
);
1137 if (!(f
& _HF_NL_1
))
1138 self
= _hf_nl(self
);
1142 self
->hf_ign_tag
= hftp
;
1143 self
->hf_flags
= (f
|= _HF_IGN
| _HF_NOPUT
);
1147 self
= _hf_param(self
, ¶m
, "alt");
1148 self
= _hf_putc(self
, '[');
1149 if (param
.s
== NULL
) {
1150 param
.s
= UNCONST("IMG");
1153 } /* else */ if (memchr(param
.s
, '&', param
.l
) != NULL
)
1154 self
= _hf_expand_all_ents(self
, ¶m
);
1157 self
= _hf_putbuf(self
, param
.s
, param
.l
);
1158 self
= _hf_putc(self
, ']');
1162 self
= _hf_param(self
, ¶m
, "href");
1163 /* Ignore non-external links */
1164 if (param
.s
!= NULL
&& *param
.s
!= '#') {
1165 struct htmlflt_href
*hhp
= smalloc(sizeof(*hhp
) -
1166 VFIELD_SIZEOF(struct htmlflt_href
, hfh_dat
) + param
.l
+1);
1168 hhp
->hfh_next
= self
->hf_hrefs
;
1169 hhp
->hfh_no
= ++self
->hf_href_no
;
1170 hhp
->hfh_len
= (ui32_t
)param
.l
;
1171 memcpy(hhp
->hfh_dat
, param
.s
, param
.l
);
1173 snprintf(nobuf
, sizeof nobuf
, "[%u]", hhp
->hfh_no
);
1174 self
->hf_flags
= (f
|= _HF_HREF
);
1175 self
->hf_hrefs
= hhp
;
1176 self
= _hf_puts(self
, nobuf
);
1178 self
->hf_flags
= (f
&= ~_HF_HREF
);
1180 case _HFSA_HREF_END
:
1182 snprintf(nobuf
, sizeof nobuf
, "[/%u]", self
->hf_href_no
);
1183 self
= _hf_puts(self
, nobuf
);
1188 c
= (char)(hftp
->hft_act
& 0xFF);
1189 self
= _hf_putc(self
, c
);
1198 /* The problem is that even invalid tagsoup is widely used, without real
1199 * searching i have seen e-mail address in <N@H.D> notation, and more.
1200 * To protect us a bit look around and possibly write the content as such */
1205 /* Ignore <!DOCTYPE, <!-- comments, <? PIs.. */
1208 /* Print out an empty tag as such */
1221 /* Also skip over : in order to suppress v:roundrect, w:anchorlock.. */
1222 while ((c
= *s
++) != '\0' && c
!= '>' && !whitechar(c
) && c
!= ':')
1223 if (!asciichar(c
) || punctchar(c
)) {
1224 self
= _hf_puts(self
, self
->hf_bdat
);
1230 static struct htmlflt
*
1231 _hf_check_ent(struct htmlflt
*self
, char const *s
, size_t l
)
1236 struct hf_ent
const *hfep
;
1244 assert(s
[l
- 1] == ';');
1248 /* Numeric entity, or try named search */
1250 i
= (*++s
== 'x' ? 16 : 10);
1252 if ((i
!= 16 || (++s
, --l
) > 0) && l
< sizeof(nobuf
)) {
1253 memcpy(nobuf
, s
, l
);
1255 i
= strtol(nobuf
, NULL
, i
);
1257 self
= _hf_putc(self
, (char)i
);
1258 else if (self
->hf_flags
& _HF_UTF8
) {
1260 l
= n_utf32_to_utf8((ui32_t
)i
, nobuf
);
1261 self
= _hf_putbuf(self
, nobuf
, l
);
1267 ui32_t f
= self
->hf_flags
, hf
;
1269 for (hfep
= _hf_ents
; PTRCMP(hfep
, <, _hf_ents
+ NELEM(_hf_ents
)); ++hfep
)
1270 if (l
== ((hf
= hfep
->hfe_flags
) & _HFE_LENGTH_MASK
) &&
1271 !strncmp(s
, hfep
->hfe_ent
, l
)) {
1272 if ((hf
& _HFE_HAVE_UNI
) && (f
& _HF_UTF8
)) {
1275 } else if (hf
& _HFE_HAVE_CSTR
)
1276 self
= _hf_puts(self
, hfep
->hfe_cstr
);
1278 self
= _hf_putc(self
, hfep
->hfe_c
);
1282 self
= _hf_putbuf(self
, s_save
, l_save
);
1290 _hf_add_data(struct htmlflt
*self
, char const *dat
, size_t len
)
1292 char c
, *cp
, *cp_max
;
1297 /* Final put request? */
1299 if (self
->hf_len
> 0 || self
->hf_hrefs
!= NULL
) {
1300 self
= _hf_dump(self
);
1301 if (self
->hf_hrefs
!= NULL
)
1302 self
= _hf_dump_hrefs(self
);
1308 /* Always ensure some initial buffer */
1309 if ((cp
= self
->hf_curr
) != NULL
)
1310 cp_max
= self
->hf_bmax
;
1312 cp
= self
->hf_curr
= self
->hf_bdat
= smalloc(LINESIZE
);
1313 cp_max
= self
->hf_bmax
= cp
+ LINESIZE
-1; /* (Always room for NUL!) */
1315 hot
= (cp
!= self
->hf_bdat
);
1317 for (rv
= (ssize_t
)len
; len
> 0; --len
) {
1318 ui32_t f
= self
->hf_flags
;
1324 /* Soup is really weird, and scripts may contain almost anything (and
1325 * newer CSS standards are also cryptic): therefore prefix the _HF_IGN
1326 * test and walk until we see the required end tag */
1327 /* TODO For real safety _HF_IGN soup condome would also need to know
1328 * TODO about quoted strings so that 'var i = "</script>";' couldn't
1329 * TODO fool it! We really want this mode also for _HF_NOPUT to be
1330 * TODO able to *gracefully* detect the tag-closing '>', but then if
1331 * TODO that is a single mechanism we should have made it! */
1333 struct htmlflt_tag
const *hftp
= self
->hf_ign_tag
;
1340 } else if (c
== '>') {
1342 if ((i
= PTR2SIZE(cp
- self
->hf_bdat
)) > 1 &&
1343 --i
== hftp
->hft_len
&&
1344 !ascncasecmp(self
->hf_bdat
+ 1, hftp
->hft_tag
, i
))
1345 self
->hf_flags
= (f
&= ~(_HF_IGN
| _HF_NOPUT
));
1351 i
= PTR2SIZE(cp
- self
->hf_bdat
);
1352 if ((i
== 1 && c
!= '/') || --i
> hftp
->hft_len
) {
1359 /* People are using & without &amp;ing it, ditto <; be aware */
1360 if (f
& (_HF_NOPUT
| _HF_ENT
)) {
1362 /* Special case "<!--" buffer content to deal with really weird
1363 * things that can be done with "<!--[if gte mso 9]>" syntax */
1364 if (PTR2SIZE(cp
- self
->hf_bdat
) != 4 ||
1365 memcmp(self
->hf_bdat
, "<!--", 4)) {
1368 self
= _hf_puts(self
, self
->hf_bdat
);
1374 self
->hf_flags
= (f
|= _HF_NOPUT
);
1377 /* Weird tagsoup around, do we actually parse a tag? */
1378 if (!(f
& _HF_NOPUT
))
1382 f
&= ~(_HF_NOPUT
| _HF_ENT
);
1384 self
= _hf_check_tag(self
, self
->hf_bdat
);
1385 *(cp
= self
->hf_bdat
) = '\0'; /* xxx extra safety */
1386 /* Quick hack to get rid of redundant newline after <pre> XXX */
1387 if (!(f
& _HF_PRE
) && (self
->hf_flags
& _HF_PRE
) &&
1388 len
> 1 && *dat
== '\n')
1392 case '\r': /* TODO CR should be stripped in lower level!! (Only B64!?!) */
1395 /* End of line is not considered unless we are in PRE section.
1396 * However, in _HF_NOPUT mode we must be aware of tagsoup which uses
1397 * newlines for separating parameters */
1400 self
= (f
& _HF_PRE
) ? _hf_nl_force(self
) : _hf_putc(self
, ' ');
1409 /* If not currently parsing a tag and bypassing normal output.. */
1410 if (!(f
& _HF_NOPUT
)) {
1416 self
->hf_flags
= (f
|= _HF_NOPUT
| _HF_ENT
);
1417 } else if (f
& _HF_PRE
) {
1418 self
= _hf_putc_premode(self
, c
);
1419 self
->hf_flags
&= ~_HF_BLANK
;
1421 self
= _hf_putc(self
, c
);
1422 } else if ((f
& _HF_ENT
) && c
== ';') {
1425 f
&= ~(_HF_NOPUT
| _HF_ENT
);
1427 self
= _hf_check_ent(self
, self
->hf_bdat
,
1428 PTR2SIZE(cp
+ 1 - self
->hf_bdat
));
1430 /* We may need to grow the buffer */
1431 if (PTRCMP(cp
+ 42/2, >=, cp_max
)) {
1432 size_t i
= PTR2SIZE(cp
- self
->hf_bdat
),
1433 m
= PTR2SIZE(self
->hf_bmax
- self
->hf_bdat
) + LINESIZE
;
1435 cp
= self
->hf_bdat
= srealloc(self
->hf_bdat
, m
);
1436 self
->hf_bmax
= cp
+ m
-1;
1437 self
->hf_curr
= (cp
+= i
);
1446 return (self
->hf_flags
& _HF_ERROR
) ? -1 : rv
;
1450 * TODO Because we don't support filter chains yet this filter will be run
1451 * TODO in a dedicated subprocess, driven via a special Popen() mode
1453 static bool_t __hf_hadpipesig
;
1455 __hf_onpipe(int signo
)
1457 NYD_X
; /* Signal handler */
1459 __hf_hadpipesig
= TRU1
;
1463 htmlflt_process_main(void)
1465 char buf
[BUFFER_SIZE
];
1471 __hf_hadpipesig
= FAL0
;
1472 safe_signal(SIGPIPE
, &__hf_onpipe
);
1475 htmlflt_reset(&hf
, stdout
);
1478 if ((i
= fread(buf
, sizeof(buf
[0]), NELEM(buf
), stdin
)) == 0) {
1483 if ((rv
= __hf_hadpipesig
))
1485 /* Just use this directly.. */
1486 if (htmlflt_push(&hf
, buf
, i
) < 0) {
1491 if (rv
== 0 && htmlflt_flush(&hf
) < 0)
1494 htmlflt_destroy(&hf
);
1496 rv
|= __hf_hadpipesig
;
1502 htmlflt_init(struct htmlflt
*self
)
1505 /* (Rather redundant though) */
1506 memset(self
, 0, sizeof *self
);
1511 htmlflt_destroy(struct htmlflt
*self
)
1514 htmlflt_reset(self
, NULL
);
1519 htmlflt_reset(struct htmlflt
*self
, FILE *f
)
1521 struct htmlflt_href
*hfhp
;
1524 while ((hfhp
= self
->hf_hrefs
) != NULL
) {
1525 self
->hf_hrefs
= hfhp
->hfh_next
;
1529 if (self
->hf_bdat
!= NULL
)
1530 free(self
->hf_bdat
);
1531 if (self
->hf_line
!= NULL
)
1532 free(self
->hf_line
);
1534 memset(self
, 0, sizeof *self
);
1537 ui32_t sw
= MAX(_HF_MINLEN
, (ui32_t
)scrnwidth
);
1539 self
->hf_line
= smalloc((size_t)sw
* mb_cur_max
+1);
1542 if (options
& OPT_UNICODE
) /* TODO not truly generic */
1543 self
->hf_flags
= _HF_UTF8
;
1550 htmlflt_push(struct htmlflt
*self
, char const *dat
, size_t len
)
1555 rv
= _hf_add_data(self
, dat
, len
);
1561 htmlflt_flush(struct htmlflt
*self
)
1566 rv
= _hf_add_data(self
, NULL
, 0);
1567 rv
|= !fflush(self
->hf_os
) ? 0 : -1;
1571 #endif /* HAVE_FILTER_HTML_TAGSOUP */