src/Imap/Encoders.cpp

   1 /* Copyright (C) 2006 - 2012 Jan Kundrát <jkt@flaska.net>
   2
   3    This file is part of the Trojita Qt IMAP e-mail client,
   4    http://trojita.flaska.net/
   5
   6    This program is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU General Public License as
   8    published by the Free Software Foundation; either version 2 of
   9    the License or (at your option) version 3 or any later version
  10    accepted by the membership of KDE e.V. (or its successor approved
  11    by the membership of KDE e.V.), which shall act as a proxy
  12    defined in Section 14 of version 3 of the license.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21 */
  22 #include "Encoders.h"
  23 #include "Parser/3rdparty/rfccodecs.h"
  24 #include "Parser/3rdparty/kcodecs.h"
  25
  26 namespace {
  27
  28     static void enumerateCodecs()
  29     {
  30         static bool enumerated = false;
  31
  32         if (!enumerated) {
  33             qWarning() << "Available codecs:";
  34             Q_FOREACH (const QByteArray& codec, QTextCodec::availableCodecs())
  35                 qWarning() << "  " << codec;
  36
  37             enumerated = true;
  38         }
  39     }
  40
  41     static QTextCodec* codecForName(const QByteArray& charset, bool translateAscii = true)
  42     {
  43         QByteArray encoding(charset.toLower());
  44
  45         if (!encoding.isEmpty()) {
  46             int index;
  47
  48             if (translateAscii && encoding.contains("ascii")) {
  49                 // We'll assume the text is plain ASCII, to be extracted to Latin-1
  50                 encoding = "ISO-8859-1";
  51             }
  52             else if ((index = encoding.indexOf('*')) != -1) {
  53                 // This charset specification includes a trailing language specifier
  54                 encoding = encoding.left(index);
  55             }
  56
  57             QTextCodec* codec = QTextCodec::codecForName(encoding);
  58             if (!codec) {
  59                 qWarning() << "QMailCodec::codecForName - Unable to find codec for charset" << encoding;
  60                 enumerateCodecs();
  61             }
  62
  63             return codec;
  64         }
  65
  66         return 0;
  67     }
  68
  69     /** @short Interpret the raw byte array as a sequence of bytes in the given encoding */
  70     static QString decodeByteArray(const QByteArray &encoded, const QString &charset)
  71     {
  72         if (QTextCodec *codec = codecForName(charset.toLatin1())) {
  73             return codec->toUnicode(encoded);
  74         }
  75         return QString();
  76     }
  77
  78     // ASCII character values used throughout
  79     const unsigned char MaxPrintableRange = 0x7e;
  80     const unsigned char Space = 0x20;
  81     const unsigned char Equals = 0x3d;
  82     const unsigned char QuestionMark = 0x3f;
  83     const unsigned char Underscore = 0x5f;
  84
  85     /** @short Check the given unicode code point if it has to be escaped in the quoted-printable encoding according to RFC2047 */
  86     static inline bool rfc2047QPNeedsEscpaing(const int unicode)
  87     {
  88         if (unicode <= Space)
  89             return true;
  90         if (unicode == Equals || unicode == QuestionMark || unicode == Underscore)
  91             return true;
  92         if (unicode > MaxPrintableRange)
  93             return true;
  94         return false;
  95     }
  96
  97     /** @short Find the most efficient encoding for the given unicode string
  98
  99     It can be either just plain ASCII, or ISO-Latin1 using the Quoted-Printable encoding, or
 100     a full-blown UTF-8 scheme with Base64 encoding.
 101     */
 102     static Imap::Rfc2047StringCharacterSetType charsetForInput(const QString& input)
 103     {
 104         // shamelessly stolen from QMF's qmailmessage.cpp
 105
 106         // See if this input needs encoding
 107         Imap::Rfc2047StringCharacterSetType latin1 = Imap::RFC2047_STRING_ASCII;
 108
 109         const QChar* it = input.constData();
 110         const QChar* const end = it + input.length();
 111         for ( ; it != end; ++it) {
 112             if ((*it).unicode() > 0xff) {
 113                 // Multi-byte characters included - we need to use UTF-8
 114                 return Imap::RFC2047_STRING_UTF8;
 115             }
 116             else if (!latin1 && rfc2047QPNeedsEscpaing(it->unicode()))
 117             {
 118                 // We need encoding from latin-1
 119                 latin1 = Imap::RFC2047_STRING_LATIN;
 120             }
 121         }
 122
 123         return latin1;
 124     }
 125
 126     /** @short Convert a hex digit into a number */
 127     static inline int hexValueOfChar(const char input)
 128     {
 129         if (input >= '0' && input <= '9') {
 130             return input - '0';
 131         } else if (input >= 'A' && input <= 'F') {
 132             return 0x0a + input - 'A';
 133         } else if (input >= 'a' && input <= 'f') {
 134             return 0x0a + input - 'a';
 135         } else {
 136             return -1;
 137         }
 138     }
 139
 140     /** @short Translate a quoted-printable-encoded array of bytes into binary characters
 141
 142     The transformations performed are according to RFC 2047; underscores are transferred into spaces
 143     and the three-character =12 escapes are turned into a single byte value.
 144     */
 145     static inline QByteArray translateQuotedPrintableToBin(const QByteArray &input)
 146     {
 147         QByteArray res;
 148         for (int i = 0; i < input.size(); ++i) {
 149             if (input[i] == '_') {
 150                 res += ' ';
 151             } else if (input[i] == '=' && i < input.size() - 2) {
 152                 int hi = hexValueOfChar(input[++i]);
 153                 int lo = hexValueOfChar(input[++i]);
 154                 if (hi != -1 && lo != -1) {
 155                     res += static_cast<char>((hi << 4) + lo);
 156                 } else {
 157                     res += input.mid(i - 2, 3);
 158                 }
 159             } else {
 160                 res += input[i];
 161             }
 162         }
 163         return res;
 164     }
 165
 166     /** @short Decode an encoded-word as per RFC2047 into a unicode string */
 167     static QString decodeWord(const QByteArray &fullWord, const QByteArray &charset, const QByteArray &encoding, const QByteArray &encoded)
 168     {
 169         if (encoding == "Q") {
 170             return decodeByteArray(translateQuotedPrintableToBin(encoded), charset);
 171         } else if (encoding == "B") {
 172             return decodeByteArray(QByteArray::fromBase64(encoded), charset);
 173         } else {
 174             return fullWord;
 175         }
 176     }
 177
 178     /** @short Decode a header in the RFC 2047 format into a unicode string */
 179     static QString decodeWordSequence(const QByteArray& str)
 180     {
 181         QRegExp whitespace("^\\s+$");
 182
 183         QString out;
 184
 185         // Any idea why this isn't matching?
 186         //QRegExp encodedWord("\\b=\\?\\S+\\?\\S+\\?\\S*\\?=\\b");
 187         QRegExp encodedWord("\"?=(\\?\\S+)\\?(\\S+)\\?(.*)\\?=\"?");
 188
 189         // set minimal=true, to match sequences which do not have whit space in between 2 encoded words; otherwise by default greedy matching is performed
 190         // eg. "Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord" will match "=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=" as a single encoded word without minimal=true
 191         // with minimal=true, "=?ISO-8859-1?B?9g==?=" will be the first encoded word and "=?ISO-8859-1?B?5Q==?=" the second.
 192         // -- assuming there are no nested encodings, will there be?
 193         encodedWord.setMinimal(true);
 194
 195         int pos = 0;
 196         int lastPos = 0;
 197
 198         while (pos != -1) {
 199             pos = encodedWord.indexIn(str, pos);
 200             if (pos != -1) {
 201                 int endPos = pos + encodedWord.matchedLength();
 202
 203                 QString preceding(str.mid(lastPos, (pos - lastPos)));
 204                 QString decoded = decodeWord(str.mid(pos, (endPos - pos)), encodedWord.cap(1).toLatin1(),
 205                                              encodedWord.cap(2).toUpper().toLatin1(), encodedWord.cap(3).toLatin1());
 206
 207                 // If there is only whitespace between two encoded words, it should not be included
 208                 if (!whitespace.exactMatch(preceding))
 209                     out.append(preceding);
 210
 211                 out.append(decoded);
 212
 213                 pos = endPos;
 214                 lastPos = pos;
 215             }
 216         }
 217
 218         // Copy anything left
 219         out.append(str.mid(lastPos));
 220
 221         return out;
 222     }
 223
 224 }
 225
 226 namespace Imap {
 227
 228 QByteArray encodeRFC2047String(const QString &text, const Rfc2047StringCharacterSetType charset)
 229 {
 230     // We can't allow more than 75 chars per encoded-word, including the boiler plate (7 chars and the size of the encoding spec)
 231     // -- this is defined by RFC2047.
 232     int maximumEncoded = 75 - 7;
 233     QByteArray encoding;
 234     if (charset == RFC2047_STRING_UTF8)
 235         encoding = "utf-8";
 236     else
 237         encoding = "iso-8859-1";
 238     maximumEncoded -= encoding.size();
 239
 240     // If this is an encodedWord, we need to include any whitespace that we don't want to lose
 241     if (charset == RFC2047_STRING_UTF8) {
 242         QByteArray res;
 243         int start = 0;
 244
 245         while (start < text.size()) {
 246             // as long as we have something to work on...
 247             int size = maximumEncoded;
 248             QByteArray candidate;
 249
 250             // Find the character boundary at which we have to split the input.
 251             // Remember that we're iterating on Unicode codepoints now, not on raw bytes.
 252             while (true) {
 253                 candidate = text.mid(start, size).toUtf8();
 254                 int utf8Size = candidate.size();
 255                 int base64Size = utf8Size * 4 / 3 + utf8Size % 3;
 256                 if (base64Size <= maximumEncoded) {
 257                     // if this chunk's size is small enough, great
 258                     QByteArray encoded = candidate.toBase64();
 259                     if (!res.isEmpty())
 260                         res.append("\r\n ");
 261                     res.append("=?utf-8?B?" + encoded + "?=");
 262                     start += size;
 263                     break;
 264                 } else {
 265                     // otherwise, try with something smaller
 266                     --size;
 267                     Q_ASSERT(size >= 1);
 268                 }
 269             }
 270         }
 271         return res;
 272     } else {
 273         QByteArray buf = "=?" + encoding + "?Q?";
 274         int i = 0;
 275         int currentLineLength = 0;
 276         while (i < text.size()) {
 277             QByteArray symbol;
 278             const ushort unicode = text[i].unicode();
 279             if (unicode == 0x20) {
 280                 symbol = "_";
 281             } else if (!rfc2047QPNeedsEscpaing(unicode)) {
 282                 symbol += text[i].toLatin1();
 283             } else {
 284                 const char hexChars[] = "0123456789ABCDEF";
 285                 symbol = QByteArray("=") + hexChars[(unicode >> 4) & 0xf] + hexChars[unicode & 0xf];
 286             }
 287             currentLineLength += symbol.size();
 288             if (currentLineLength > maximumEncoded) {
 289                 buf += "?=\r\n =?" + encoding + "?Q?";
 290                 currentLineLength = 0;
 291             }
 292             buf += symbol;
 293             ++i;
 294         }
 295         buf += "?=";
 296         return buf;
 297     }
 298 }
 299
 300
 301 /** @short Encode the given string into RFC2047 form, preserving the ASCII leading part if possible */
 302 QByteArray encodeRFC2047StringWithAsciiPrefix(const QString &text)
 303 {
 304     // The maximal recommended line length, as defined by RFC 5322
 305     const int maxLineLength = 78;
 306
 307     // Find first character which needs escaping
 308     int pos = 0;
 309     while (pos < text.size() && pos < maxLineLength &&
 310            (text[pos].unicode() == 0x20 || !rfc2047QPNeedsEscpaing(text[pos].unicode())))
 311         ++pos;
 312
 313     // Find last character of a word which doesn't need escaping
 314     if (pos != text.size()) {
 315         while (pos > 0 && text[pos-1].unicode() != 0x20)
 316             --pos;
 317         if (pos > 0 && text[pos].unicode() == 0x20)
 318             --pos;
 319     }
 320
 321     QByteArray prefix = text.left(pos).toUtf8();
 322     if (pos == text.size())
 323         return prefix;
 324
 325     QString rest = text.mid(pos);
 326     Rfc2047StringCharacterSetType charset = charsetForInput(rest);
 327
 328     return prefix + encodeRFC2047String(rest, charset);
 329 }
 330
 331 QString decodeRFC2047String( const QByteArray& raw )
 332 {
 333     return ::decodeWordSequence( raw );
 334 }
 335
 336 QByteArray encodeImapFolderName( const QString& text )
 337 {
 338     return KIMAP::encodeImapFolderName( text ).toLatin1();
 339 }
 340
 341 QString decodeImapFolderName( const QByteArray& raw )
 342 {
 343     return KIMAP::decodeImapFolderName( raw );
 344 }
 345
 346 QByteArray quotedPrintableDecode( const QByteArray& raw )
 347 {
 348     return KCodecs::quotedPrintableDecode( raw );
 349 }
 350
 351 QByteArray quotedPrintableEncode(const QByteArray &raw)
 352 {
 353     return KCodecs::quotedPrintableEncode(raw);
 354 }
 355
 356
 357 QByteArray quotedString( const QByteArray& unquoted, QuotedStringStyle style )
 358 {
 359     QByteArray quoted;
 360     char lhq, rhq;
 361
 362     /* Compose a double-quoted string according to RFC2822 3.2.5 "quoted-string" */
 363     switch (style) {
 364     default:
 365     case DoubleQuoted:
 366         lhq = rhq = '"';
 367         break;
 368     case SquareBrackets:
 369         lhq = '[';
 370         rhq = ']';
 371         break;
 372     case Parentheses:
 373         lhq = '(';
 374         rhq = ')';
 375         break;
 376     }
 377
 378     quoted.append(lhq);
 379     for(int i = 0; i < unquoted.size(); i++) {
 380         char ch = unquoted[i];
 381         if (ch == 9 || ch == 10 || ch == 13) {
 382             /* Newlines and tabs: these are only allowed in
 383                quoted-strings as folding-whitespace, where
 384                they are "semantically invisible".  If we
 385                really want to include them, we probably need
 386                to do so as RFC2047 strings. But it's unlikely
 387                that that's a desirable behavior in the final
 388                application. Instead, translate embedded
 389                tabs/newlines into normal whitespace. */
 390             quoted.append(' ');
 391         } else {
 392             if (ch == lhq || ch == rhq || ch == '\\')
 393                 quoted.append('\\');  /* Quoted-pair */
 394             quoted.append(ch);
 395         }
 396     }
 397     quoted.append(rhq);
 398
 399     return quoted;
 400 }
 401
 402 /* encodeRFC2047Phrase encodes an arbitrary string into a
 403    byte-sequence for use in a "structured" mail header (such as To:,
 404    From:, or Received:). The result will match the "phrase"
 405    production. */
 406 static QRegExp atomPhraseRx("[ \\tA-Za-z0-9!#$&'*+/=?^_`{}|~-]*");
 407 QByteArray encodeRFC2047Phrase( const QString &text )
 408 {
 409     /* We want to know if we can encode as ASCII. But bizarrely, Qt
 410        (on my system at least) doesn't have an ASCII codec. So we use
 411        the ISO-8859-1 superset, and check for any non-ASCII characters
 412        in the result. */
 413     QTextCodec *latin1 = QTextCodec::codecForMib(4);
 414
 415     if (latin1->canEncode(text)) {
 416         /* Attempt to represent it as an RFC2822 'phrase' --- either a
 417            sequence of atoms or as a quoted-string. */
 418
 419         if (atomPhraseRx.exactMatch(text)) {
 420             /* Simplest case: a sequence of atoms (not dot-atoms) */
 421             return latin1->fromUnicode(text);
 422         } else {
 423             /* Next-simplest representation: a quoted-string */
 424             QByteArray unquoted = latin1->fromUnicode(text);
 425
 426             /* Check for non-ASCII characters. */
 427             for(int i = 0; i < unquoted.size(); i++) {
 428                 char ch = unquoted[i];
 429                 if (ch < 1 || ch >= 127) {
 430                     /* This string contains non-ASCII characters, so the
 431                        only way to represent it in a mail header is as an
 432                        RFC2047 encoded-word. */
 433                     return encodeRFC2047String(text, RFC2047_STRING_LATIN);
 434                 }
 435             }
 436
 437             return quotedString(unquoted);
 438         }
 439     }
 440
 441     /* If the text has characters outside of the basic ASCII set, then
 442        it has to be encoded using the RFC2047 encoded-word syntax. */
 443     return encodeRFC2047String(text, RFC2047_STRING_UTF8);
 444 }
 445
 446 }