From 429a294dce2ab2cc555d95a8f9f997db8bff445d Mon Sep 17 00:00:00 2001 From: Steffen Nurpmeso Date: Sun, 26 Nov 2017 00:41:53 +0100 Subject: [PATCH] Improve *mime-counter-evidence* deep inspection (Doug McIlroy).. The deep application/octet-stream content inspection that is controllable by bit 4 (0b1111) so far still used the normal MIME file classification mechanism, which does not allow any content with control characters other than \t, \n and such to be treated as text. It happened that Doug McIlroy sent a mail with a single embedded DC4 (device control 4) in a 2099-byte message body. In order to be able to easily look into such a message in the future this changeset clears the way to pass through the actual sendaction to the MIME classifier, and henceforth SEND_TODISP(_ALL)? as well as SEND_QUOTE(_ALL)?, which mangle any text to mask control characters etc., will wave through parts with a certain (TODO fixed) number of control characters. --- mime-parse.c | 3 +- mime-types.c | 106 ++++++++++++++++++++++++++++++++++++++--------------------- nail.1 | 25 +++++++------- nail.h | 11 ++++--- nail.rc | 2 +- nailfuns.h | 6 ++-- send.c | 3 ++ 7 files changed, 98 insertions(+), 58 deletions(-) diff --git a/mime-parse.c b/mime-parse.c index 3733c0e7..fd41af5f 100644 --- a/mime-parse.c +++ b/mime-parse.c @@ -116,7 +116,8 @@ _mime_parse_part(struct message *zmp, struct mimepart *ip, if ((cp = hfield1("content-description", (struct message*)ip)) != NULL) ip->m_content_description = cp; - if ((ip->m_mimecontent = n_mimetype_classify_part(ip)) == MIME_822) { + if ((ip->m_mimecontent = n_mimetype_classify_part(ip, + ((mpf & MIME_PARSE_FOR_USER_CONTEXT) != 0))) == MIME_822) { /* TODO (v15) HACK: message/rfc822 is treated special, that this one is * TODO too stupid to apply content-decoding when (falsely) applied */ if (ip->m_mime_enc != MIMEE_8B && ip->m_mime_enc != MIMEE_7B) { diff --git a/mime-types.c b/mime-types.c index fbddfc67..845ed821 100644 --- a/mime-types.c +++ 
b/mime-types.c @@ -49,19 +49,21 @@ enum mime_type { }; enum mime_type_class { - _MT_C_CLEAN = 0, /* Plain RFC 5322 message */ - _MT_C_NCTT = 1<<0, /* *contenttype == NULL */ - _MT_C_ISTXT = 1<<1, /* *contenttype =~ text\/ */ - _MT_C_ISTXTCOK = 1<<2, /* _ISTXT + *mime-allow-text-controls* */ - _MT_C_HIGHBIT = 1<<3, /* Not 7bit clean */ - _MT_C_LONGLINES = 1<<4, /* MIME_LINELEN_LIMIT exceed. */ - _MT_C_CTRLCHAR = 1<<5, /* Control characters seen */ - _MT_C_HASNUL = 1<<6, /* Contains \0 characters */ - _MT_C_NOTERMNL = 1<<7, /* Lacks a final newline */ - _MT_C_FROM_ = 1<<8, /* ^From_ seen */ - _MT_C_FROM_1STLINE = 1<<9, /* From_ line seen */ - _MT_C_SUGGEST_DONE = 1<<16, /* Inspector suggests to stop further parse */ - _MT_C__1STLINE = 1<<17 /* .. */ + _MT_C_NONE, + _MT_C_CLEAN = _MT_C_NONE, /* Plain RFC 5322 message */ + _MT_C_DEEP_INSPECT = 1u<<0, /* Always test all the file */ + _MT_C_NCTT = 1u<<1, /* *contenttype == NULL */ + _MT_C_ISTXT = 1u<<2, /* *contenttype =~ text\/ */ + _MT_C_ISTXTCOK = 1u<<3, /* _ISTXT + *mime-allow-text-controls* */ + _MT_C_HIGHBIT = 1u<<4, /* Not 7bit clean */ + _MT_C_LONGLINES = 1u<<5, /* MIME_LINELEN_LIMIT exceed. */ + _MT_C_CTRLCHAR = 1u<<6, /* Control characters seen */ + _MT_C_HASNUL = 1u<<7, /* Contains \0 characters */ + _MT_C_NOTERMNL = 1u<<8, /* Lacks a final newline */ + _MT_C_FROM_ = 1u<<9, /* ^From_ seen */ + _MT_C_FROM_1STLINE = 1u<<10, /* From_ line seen */ + _MT_C_SUGGEST_DONE = 1u<<16, /* Inspector suggests to stop further parse */ + _MT_C__1STLINE = 1u<<17 /* .. 
*/ }; struct mtbltin { @@ -87,12 +89,16 @@ struct mtlookup { }; struct mt_class_arg { - char const *mtca_buf; - size_t mtca_len; - ssize_t mtca_curlen; - /*char mtca_lastc;*/ - char mtca_c; + char const *mtca_buf; + size_t mtca_len; + ssize_t mtca_curlnlen; + /*char mtca_lastc;*/ + char mtca_c; + ui8_t mtca__dummy[3]; enum mime_type_class mtca_mtc; + ui64_t mtca_all_len; + ui64_t mtca_all_highbit; /* TODO not yet interpreted */ + ui64_t mtca_all_bogus; }; static struct mtbltin const _mt_bltin[] = { @@ -138,7 +144,8 @@ n_INLINE struct mt_class_arg * _mt_classify_init(struct mt_class_arg *mtcap, static enum mime_type_class _mt_classify_round(struct mt_class_arg *mtcap); /* We need an in-depth inspection of an application/octet-stream part */ -static enum mimecontent _mt_classify_os_part(ui32_t mce, struct mimepart *mpp); +static enum mimecontent _mt_classify_os_part(ui32_t mce, struct mimepart *mpp, + bool_t deep_inspect); /* Check whether a *pipe-XY* handler is applicable, and adjust flags according * to the defined trigger characters; upon entry MIME_HDL_NULL is set, and that @@ -548,19 +555,21 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! */ char f_buf[F_SIZEOF], *f_p = f_buf; char const *buf; size_t blen; - ssize_t curlen; + ssize_t curlnlen; + si64_t alllen; int c, lastc; enum mime_type_class mtc; NYD2_ENTER; buf = mtcap->mtca_buf; blen = mtcap->mtca_len; - curlen = mtcap->mtca_curlen; + curlnlen = mtcap->mtca_curlnlen; + alllen = mtcap->mtca_all_len; c = mtcap->mtca_c; /*lastc = mtcap->mtca_lastc;*/ mtc = mtcap->mtca_mtc; - for (;; ++curlen) { + for (;; ++curlnlen) { if(blen == 0){ /* Real EOF, or only current buffer end? */ if(mtcap->mtca_len == 0){ @@ -571,6 +580,7 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! */ break; } }else{ + ++alllen; lastc = c; c = (uc_i)*buf++; } @@ -586,13 +596,12 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! 
*/ } if (c == '\n' || c == EOF) { mtc &= ~_MT_C__1STLINE; - if (curlen >= MIME_LINELEN_LIMIT) + if (curlnlen >= MIME_LINELEN_LIMIT) mtc |= _MT_C_LONGLINES; - if (c == EOF) { + if (c == EOF) break; - } f_p = f_buf; - curlen = -1; + curlnlen = -1; continue; } /* A bit hairy is handling of \r=\x0D=CR. @@ -609,6 +618,7 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! */ /* RFC 2045, 6.7, as above ... */ if (c != '\t' && c != '\r') mtc |= _MT_C_CTRLCHAR; + /* If there is a escape sequence in reverse solidus notation defined * for this in ANSI X3.159-1989 (ANSI C89), don't treat it as a control * for real. I.e., \a=\x07=BEL, \b=\x08=BS, \t=\x09=HT. Don't follow @@ -616,6 +626,15 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! */ * \e=\x1B=ESC */ if ((c >= '\x07' && c <= '\x0D') || c == '\x1B') continue; + + /* As a special case, if we are going for displaying data to the user + * or quoting a message then simply continue this, in the end, in case + * we get there, we will decide upon the all_len/all_bogus ratio + * whether this is usable plain text or not */ + ++mtcap->mtca_all_bogus; + if(mtc & _MT_C_DEEP_INSPECT) + continue; + mtc |= _MT_C_HASNUL; /* Force base64 */ if (!(mtc & _MT_C_ISTXTCOK)) { mtc |= _MT_C_SUGGEST_DONE; @@ -623,15 +642,14 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! */ } } else if ((ui8_t)c & 0x80) { mtc |= _MT_C_HIGHBIT; - /* TODO count chars with HIGHBIT? libmagic? - * TODO try encode part - base64 if bails? */ + ++mtcap->mtca_all_highbit; if (!(mtc & (_MT_C_NCTT | _MT_C_ISTXT))) { /* TODO _NCTT?? 
*/ mtc |= _MT_C_HASNUL /* Force base64 */ | _MT_C_SUGGEST_DONE; break; } - } else if (!(mtc & _MT_C_FROM_) && UICMP(z, curlen, <, F_SIZEOF)) { + } else if (!(mtc & _MT_C_FROM_) && UICMP(z, curlnlen, <, F_SIZEOF)) { *f_p++ = (char)c; - if (UICMP(z, curlen, ==, F_SIZEOF - 1) && + if (UICMP(z, curlnlen, ==, F_SIZEOF - 1) && PTR2SIZE(f_p - f_buf) == F_SIZEOF && !memcmp(f_buf, F_, F_SIZEOF)){ mtc |= _MT_C_FROM_; @@ -643,10 +661,11 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! */ if (c == EOF && lastc != '\n') mtc |= _MT_C_NOTERMNL; - mtcap->mtca_curlen = curlen; + mtcap->mtca_curlnlen = curlnlen; /*mtcap->mtca_lastc = lastc*/; mtcap->mtca_c = c; mtcap->mtca_mtc = mtc; + mtcap->mtca_all_len = alllen; NYD2_LEAVE; return mtc; #undef F_ @@ -654,7 +673,7 @@ _mt_classify_round(struct mt_class_arg *mtcap) /* TODO dig UTF-8 for !text/!! */ } static enum mimecontent -_mt_classify_os_part(ui32_t mce, struct mimepart *mpp) +_mt_classify_os_part(ui32_t mce, struct mimepart *mpp, bool_t deep_inspect) { struct str in = {NULL, 0}, outrest, inrest, dec; struct mt_class_arg mtca; @@ -694,8 +713,12 @@ jos_leave: goto jos_leave; /* So now let's inspect the part content, decoding content-transfer-encoding - * along the way TODO this should simply be "mime_factory_create(MPP)"! */ - _mt_classify_init(&mtca, _MT_C_ISTXT); + * along the way TODO this should simply be "mime_factory_create(MPP)"! + * TODO In fact m_mime_classifier_(setup|call|call_part|finalize)() and the + * TODO state(s) (the _MT_C states) should become reported to the outer + * TODO world like that (see MIME boundary TODO around here) */ + _mt_classify_init(&mtca, (_MT_C_ISTXT | + (deep_inspect ? _MT_C_DEEP_INSPECT : _MT_C_NONE))); for (lsz = 0;;) { bool_t dobuf; @@ -782,6 +805,13 @@ jstopit: fseek(mb.mb_itf, start_off, SEEK_SET); if (!(mtc & (_MT_C_HASNUL /*| _MT_C_CTRLCHAR XXX really? 
*/))) { + /* In that special relaxed case we may very well wave through + * octet-streams full of control characters, as they do no harm + * TODO This should be part of m_mime_classifier_finalize() then! */ + if(deep_inspect && + mtca.mtca_all_len - mtca.mtca_all_bogus < mtca.mtca_all_len >> 2) + goto jleave; + mc = MIME_TEXT_PLAIN; if (mce & MIMECE_ALL_OVWR) mpp->m_ct_type_plain = "text/plain"; @@ -1185,8 +1215,8 @@ jnorfc822: } FL enum mimecontent -n_mimetype_classify_part(struct mimepart *mpp) /* FIXME charset=binary ??? */ -{ +n_mimetype_classify_part(struct mimepart *mpp, bool_t for_user_context){ + /* TODO n_mimetype_classify_part() <-> m_mime_classifier_ with life cycle */ struct mtlookup mtl; enum mimecontent mc; char const *ct; @@ -1272,8 +1302,8 @@ jleave: jos_content_check: if((mce.f & MIMECE_BIN_PARSE) && mpp->m_mime_enc != MIMEE_BIN && - mpp->m_charset != NULL && asccasecmp(mpp->m_charset, "binary")) - mc = _mt_classify_os_part(mce.f, mpp); + mpp->m_charset != NULL) + mc = _mt_classify_os_part(mce.f, mpp, for_user_context); goto jleave; } diff --git a/nail.1 b/nail.1 index 31becbbb..bc04b2f7 100644 --- a/nail.1 +++ b/nail.1 @@ -805,6 +805,8 @@ to include the message that is being responded to when ing, which is indented by an .Va indentprefix that also deviates from standard imposed settings. +.Va mime-counter-evidence +is fully enabled, too. . .Pp Some random remarks. @@ -9586,16 +9588,12 @@ If this variable is set then \*(UA will try to re-classify such MIME message parts, if possible, for example via a possibly existing attachment filename. A non-empty value may also be given, in which case a number is expected, -actually a carrier of bits. -Creating the bit-carrying number is a simple addition: -.Bd -literal -offset indent -? !echo Value should be set to $((2 + 4 + 8)) -Value should be set to 14 -.Ed +actually a carrier of bits, best specified as a binary value, e.g., +.Ql 0b1111 . 
.Pp .Bl -bullet -compact .It -If bit two is set (2) then the detected +If bit two is set (counting from 1, decimal 2) then the detected .Ic mimetype will be carried along with the message and be used for deciding which MIME handler is to be used, for example; @@ -9603,14 +9601,17 @@ when displaying such a MIME part the part-info will indicate the overridden content-type by showing a plus sign .Ql + . .It -If bit three is set (4) then the counter-evidence is always produced -and a positive result will be used as the MIME type, even forcefully -overriding the parts given MIME type. +If bit three is set (decimal 4) then the counter-evidence is always +produced and a positive result will be used as the MIME type, even +forcefully overriding the parts given MIME type. .It -If bit four is set (8) then as a last resort the actual content of +If bit four is set (decimal 8) as a last resort the actual content of .Ql application/octet-stream parts will be inspected, so that data which looks like plain text can be treated as such. +This mode is even more relaxed when data is to be displayed to the user +or used as a message quote (data consumers which mangle data for display +purposes, which includes masking of control characters, for example). .El . .Mx @@ -13160,7 +13161,7 @@ set emptystart \e colour-pager crt= \e followup-to followup-to-honour=ask-yes fullnames \e history-file=+.\*(uAhist history-size=-1 history-gabby \e - mime-counter-evidence=0xE \e + mime-counter-evidence=0b1111 \e prompt='?\e$?!\e$!/\e$^ERRNAME[\e$account#\e$mailbox-display]? 
' \e reply-to-honour=ask-yes \e umask= diff --git a/nail.h b/nail.h index 874ced8c..6221bee2 100644 --- a/nail.h +++ b/nail.h @@ -965,10 +965,13 @@ enum b64flags { }; enum mime_parse_flags { - MIME_PARSE_NONE = 0, - MIME_PARSE_DECRYPT = 1<<0, - MIME_PARSE_PARTS = 1<<1, - MIME_PARSE_SHALLOW = 1<<2 + MIME_PARSE_NONE, + MIME_PARSE_DECRYPT = 1u<<0, + MIME_PARSE_PARTS = 1u<<1, + MIME_PARSE_SHALLOW = 1u<<2, + /* In effect we parse this message for user display or quoting purposes, so + * relaxed rules regarding content inspection may be applicable */ + MIME_PARSE_FOR_USER_CONTEXT = 1u<<3 }; enum mime_handler_flags { diff --git a/nail.rc b/nail.rc index fbbae00d..e19133ea 100644 --- a/nail.rc +++ b/nail.rc @@ -95,7 +95,7 @@ set markanswered # Try to circumvent false or missing MIME Content-Type descriptions. # Do set a value for extended behaviour (see the manual). #set mime-counter-evidence -set mime-counter-evidence=0xE +set mime-counter-evidence=0b1111 # Control loading of mime.types(5) file, "s"ystem and/or "u"ser, etc. # Builtin types exist and may be sufficient. The default equals "us". diff --git a/nailfuns.h b/nailfuns.h index 86c82cb6..f47a6108 100644 --- a/nailfuns.h +++ b/nailfuns.h @@ -1550,8 +1550,10 @@ FL enum conversion n_mimetype_classify_file(FILE *fp, char const **contenttype, char const **charset, int *do_iconv); /* Dependend on *mime-counter-evidence* mpp->m_ct_type_usr_ovwr will be set, - * but otherwise mpp is const */ -FL enum mimecontent n_mimetype_classify_part(struct mimepart *mpp); + * but otherwise mpp is const. for_user_context rather maps 1:1 to + * MIME_PARSE_FOR_USER_CONTEXT */ +FL enum mimecontent n_mimetype_classify_part(struct mimepart *mpp, + bool_t for_user_context); /* Query handler for a part, return the plain type (& MIME_HDL_TYPE_MASK). 
* mhp is anyway initialized (mh_flags, mh_msg) */ diff --git a/send.c b/send.c index 3e052e08..470d754b 100644 --- a/send.c +++ b/send.c @@ -1586,6 +1586,9 @@ sendmp(struct message *mp, FILE *obuf, struct n_ignore const *doitp, mpf = MIME_PARSE_NONE; if (action != SEND_MBOX && action != SEND_RFC822 && action != SEND_SHOW) mpf |= MIME_PARSE_PARTS | MIME_PARSE_DECRYPT; + if(action == SEND_TODISP || action == SEND_TODISP_ALL || + action == SEND_QUOTE || action == SEND_QUOTE_ALL) + mpf |= MIME_PARSE_FOR_USER_CONTEXT; if ((ip = mime_parse_msg(mp, mpf)) == NULL) goto jleave; -- 2.11.4.GIT