Use fully-qualified type names to prevent confusion of the metatype system
[trojita.git] / src / Imap / Encoders.cpp
blobfc7f3644881e743447068ca9aa401208595f9875
1 /* Copyright (C) 2006 - 2012 Jan Kundrát <jkt@flaska.net>
3 This file is part of the Trojita Qt IMAP e-mail client,
4 http://trojita.flaska.net/
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of
9 the License or (at your option) version 3 or any later version
10 accepted by the membership of KDE e.V. (or its successor approved
11 by the membership of KDE e.V.), which shall act as a proxy
12 defined in Section 14 of version 3 of the license.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "Encoders.h"
23 #include "Parser/3rdparty/rfccodecs.h"
24 #include "Parser/3rdparty/kcodecs.h"
26 namespace {
28 static void enumerateCodecs()
30 static bool enumerated = false;
32 if (!enumerated) {
33 qWarning() << "Available codecs:";
34 Q_FOREACH (const QByteArray& codec, QTextCodec::availableCodecs())
35 qWarning() << " " << codec;
37 enumerated = true;
41 static QTextCodec* codecForName(const QByteArray& charset, bool translateAscii = true)
43 QByteArray encoding(charset.toLower());
45 if (!encoding.isEmpty()) {
46 int index;
48 if (translateAscii && encoding.contains("ascii")) {
49 // We'll assume the text is plain ASCII, to be extracted to Latin-1
50 encoding = "ISO-8859-1";
52 else if ((index = encoding.indexOf('*')) != -1) {
53 // This charset specification includes a trailing language specifier
54 encoding = encoding.left(index);
57 QTextCodec* codec = QTextCodec::codecForName(encoding);
58 if (!codec) {
59 qWarning() << "QMailCodec::codecForName - Unable to find codec for charset" << encoding;
60 enumerateCodecs();
63 return codec;
66 return 0;
69 /** @short Interpret the raw byte array as a sequence of bytes in the given encoding */
70 static QString decodeByteArray(const QByteArray &encoded, const QString &charset)
72 if (QTextCodec *codec = codecForName(charset.toLatin1())) {
73 return codec->toUnicode(encoded);
75 return QString();
// ASCII character values used throughout, spelled as character literals
const unsigned char MaxPrintableRange = '~'; // 0x7e, the last printable ASCII character
const unsigned char Space = ' ';             // 0x20
const unsigned char Equals = '=';            // 0x3d
const unsigned char QuestionMark = '?';      // 0x3f
const unsigned char Underscore = '_';        // 0x5f
85 /** @short Check the given unicode code point if it has to be escaped in the quoted-printable encoding according to RFC2047 */
86 static inline bool rfc2047QPNeedsEscpaing(const int unicode)
88 if (unicode <= Space)
89 return true;
90 if (unicode == Equals || unicode == QuestionMark || unicode == Underscore)
91 return true;
92 if (unicode > MaxPrintableRange)
93 return true;
94 return false;
97 /** @short Find the most efficient encoding for the given unicode string
99 It can be either just plain ASCII, or ISO-Latin1 using the Quoted-Printable encoding, or
100 a full-blown UTF-8 scheme with Base64 encoding.
102 static Imap::Rfc2047StringCharacterSetType charsetForInput(const QString& input)
104 // shamelessly stolen from QMF's qmailmessage.cpp
106 // See if this input needs encoding
107 Imap::Rfc2047StringCharacterSetType latin1 = Imap::RFC2047_STRING_ASCII;
109 const QChar* it = input.constData();
110 const QChar* const end = it + input.length();
111 for ( ; it != end; ++it) {
112 if ((*it).unicode() > 0xff) {
113 // Multi-byte characters included - we need to use UTF-8
114 return Imap::RFC2047_STRING_UTF8;
116 else if (!latin1 && rfc2047QPNeedsEscpaing(it->unicode()))
118 // We need encoding from latin-1
119 latin1 = Imap::RFC2047_STRING_LATIN;
123 return latin1;
/** @short Convert a hex digit into a number

Both upper-case and lower-case digits are accepted; -1 is returned for anything else.
*/
static inline int hexValueOfChar(const char input)
{
    if ('0' <= input && input <= '9')
        return input - '0';
    if ('A' <= input && input <= 'F')
        return input - 'A' + 0x0a;
    if ('a' <= input && input <= 'f')
        return input - 'a' + 0x0a;
    return -1;
}
140 /** @short Translate a quoted-printable-encoded array of bytes into binary characters
142 The transformations performed are according to RFC 2047; underscores are transferred into spaces
143 and the three-character =12 escapes are turned into a single byte value.
145 static inline QByteArray translateQuotedPrintableToBin(const QByteArray &input)
147 QByteArray res;
148 for (int i = 0; i < input.size(); ++i) {
149 if (input[i] == '_') {
150 res += ' ';
151 } else if (input[i] == '=' && i < input.size() - 2) {
152 int hi = hexValueOfChar(input[++i]);
153 int lo = hexValueOfChar(input[++i]);
154 if (hi != -1 && lo != -1) {
155 res += static_cast<char>((hi << 4) + lo);
156 } else {
157 res += input.mid(i - 2, 3);
159 } else {
160 res += input[i];
163 return res;
166 /** @short Decode an encoded-word as per RFC2047 into a unicode string */
167 static QString decodeWord(const QByteArray &fullWord, const QByteArray &charset, const QByteArray &encoding, const QByteArray &encoded)
169 if (encoding == "Q") {
170 return decodeByteArray(translateQuotedPrintableToBin(encoded), charset);
171 } else if (encoding == "B") {
172 return decodeByteArray(QByteArray::fromBase64(encoded), charset);
173 } else {
174 return fullWord;
178 /** @short Decode a header in the RFC 2047 format into a unicode string */
179 static QString decodeWordSequence(const QByteArray& str)
181 QRegExp whitespace("^\\s+$");
183 QString out;
185 // Any idea why this isn't matching?
186 //QRegExp encodedWord("\\b=\\?\\S+\\?\\S+\\?\\S*\\?=\\b");
187 QRegExp encodedWord("\"?=(\\?\\S+)\\?(\\S+)\\?(.*)\\?=\"?");
189 // set minimal=true, to match sequences which do not have whit space in between 2 encoded words; otherwise by default greedy matching is performed
190 // eg. "Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord" will match "=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=" as a single encoded word without minimal=true
191 // with minimal=true, "=?ISO-8859-1?B?9g==?=" will be the first encoded word and "=?ISO-8859-1?B?5Q==?=" the second.
192 // -- assuming there are no nested encodings, will there be?
193 encodedWord.setMinimal(true);
195 int pos = 0;
196 int lastPos = 0;
198 while (pos != -1) {
199 pos = encodedWord.indexIn(str, pos);
200 if (pos != -1) {
201 int endPos = pos + encodedWord.matchedLength();
203 QString preceding(str.mid(lastPos, (pos - lastPos)));
204 QString decoded = decodeWord(str.mid(pos, (endPos - pos)), encodedWord.cap(1).toLatin1(),
205 encodedWord.cap(2).toUpper().toLatin1(), encodedWord.cap(3).toLatin1());
207 // If there is only whitespace between two encoded words, it should not be included
208 if (!whitespace.exactMatch(preceding))
209 out.append(preceding);
211 out.append(decoded);
213 pos = endPos;
214 lastPos = pos;
218 // Copy anything left
219 out.append(str.mid(lastPos));
221 return out;
226 namespace Imap {
228 QByteArray encodeRFC2047String(const QString &text, const Rfc2047StringCharacterSetType charset)
230 // We can't allow more than 75 chars per encoded-word, including the boiler plate (7 chars and the size of the encoding spec)
231 // -- this is defined by RFC2047.
232 int maximumEncoded = 75 - 7;
233 QByteArray encoding;
234 if (charset == RFC2047_STRING_UTF8)
235 encoding = "utf-8";
236 else
237 encoding = "iso-8859-1";
238 maximumEncoded -= encoding.size();
240 // If this is an encodedWord, we need to include any whitespace that we don't want to lose
241 if (charset == RFC2047_STRING_UTF8) {
242 QByteArray res;
243 int start = 0;
245 while (start < text.size()) {
246 // as long as we have something to work on...
247 int size = maximumEncoded;
248 QByteArray candidate;
250 // Find the character boundary at which we have to split the input.
251 // Remember that we're iterating on Unicode codepoints now, not on raw bytes.
252 while (true) {
253 candidate = text.mid(start, size).toUtf8();
254 int utf8Size = candidate.size();
255 int base64Size = utf8Size * 4 / 3 + utf8Size % 3;
256 if (base64Size <= maximumEncoded) {
257 // if this chunk's size is small enough, great
258 QByteArray encoded = candidate.toBase64();
259 if (!res.isEmpty())
260 res.append("\r\n ");
261 res.append("=?utf-8?B?" + encoded + "?=");
262 start += size;
263 break;
264 } else {
265 // otherwise, try with something smaller
266 --size;
267 Q_ASSERT(size >= 1);
271 return res;
272 } else {
273 QByteArray buf = "=?" + encoding + "?Q?";
274 int i = 0;
275 int currentLineLength = 0;
276 while (i < text.size()) {
277 QByteArray symbol;
278 const ushort unicode = text[i].unicode();
279 if (unicode == 0x20) {
280 symbol = "_";
281 } else if (!rfc2047QPNeedsEscpaing(unicode)) {
282 symbol += text[i].toLatin1();
283 } else {
284 const char hexChars[] = "0123456789ABCDEF";
285 symbol = QByteArray("=") + hexChars[(unicode >> 4) & 0xf] + hexChars[unicode & 0xf];
287 currentLineLength += symbol.size();
288 if (currentLineLength > maximumEncoded) {
289 buf += "?=\r\n =?" + encoding + "?Q?";
290 currentLineLength = 0;
292 buf += symbol;
293 ++i;
295 buf += "?=";
296 return buf;
301 /** @short Encode the given string into RFC2047 form, preserving the ASCII leading part if possible */
302 QByteArray encodeRFC2047StringWithAsciiPrefix(const QString &text)
304 // The maximal recommended line length, as defined by RFC 5322
305 const int maxLineLength = 78;
307 // Find first character which needs escaping
308 int pos = 0;
309 while (pos < text.size() && pos < maxLineLength &&
310 (text[pos].unicode() == 0x20 || !rfc2047QPNeedsEscpaing(text[pos].unicode())))
311 ++pos;
313 // Find last character of a word which doesn't need escaping
314 if (pos != text.size()) {
315 while (pos > 0 && text[pos-1].unicode() != 0x20)
316 --pos;
317 if (pos > 0 && text[pos].unicode() == 0x20)
318 --pos;
321 QByteArray prefix = text.left(pos).toUtf8();
322 if (pos == text.size())
323 return prefix;
325 QString rest = text.mid(pos);
326 Rfc2047StringCharacterSetType charset = charsetForInput(rest);
328 return prefix + encodeRFC2047String(rest, charset);
331 QString decodeRFC2047String( const QByteArray& raw )
333 return ::decodeWordSequence( raw );
336 QByteArray encodeImapFolderName( const QString& text )
338 return KIMAP::encodeImapFolderName( text ).toLatin1();
341 QString decodeImapFolderName( const QByteArray& raw )
343 return KIMAP::decodeImapFolderName( raw );
346 QByteArray quotedPrintableDecode( const QByteArray& raw )
348 return KCodecs::quotedPrintableDecode( raw );
351 QByteArray quotedPrintableEncode(const QByteArray &raw)
353 return KCodecs::quotedPrintableEncode(raw);
357 QByteArray quotedString( const QByteArray& unquoted, QuotedStringStyle style )
359 QByteArray quoted;
360 char lhq, rhq;
362 /* Compose a double-quoted string according to RFC2822 3.2.5 "quoted-string" */
363 switch (style) {
364 default:
365 case DoubleQuoted:
366 lhq = rhq = '"';
367 break;
368 case SquareBrackets:
369 lhq = '[';
370 rhq = ']';
371 break;
372 case Parentheses:
373 lhq = '(';
374 rhq = ')';
375 break;
378 quoted.append(lhq);
379 for(int i = 0; i < unquoted.size(); i++) {
380 char ch = unquoted[i];
381 if (ch == 9 || ch == 10 || ch == 13) {
382 /* Newlines and tabs: these are only allowed in
383 quoted-strings as folding-whitespace, where
384 they are "semantically invisible". If we
385 really want to include them, we probably need
386 to do so as RFC2047 strings. But it's unlikely
387 that that's a desirable behavior in the final
388 application. Instead, translate embedded
389 tabs/newlines into normal whitespace. */
390 quoted.append(' ');
391 } else {
392 if (ch == lhq || ch == rhq || ch == '\\')
393 quoted.append('\\'); /* Quoted-pair */
394 quoted.append(ch);
397 quoted.append(rhq);
399 return quoted;
402 /* encodeRFC2047Phrase encodes an arbitrary string into a
403 byte-sequence for use in a "structured" mail header (such as To:,
404 From:, or Received:). The result will match the "phrase"
405 production. */
406 static QRegExp atomPhraseRx("[ \\tA-Za-z0-9!#$&'*+/=?^_`{}|~-]*");
407 QByteArray encodeRFC2047Phrase( const QString &text )
409 /* We want to know if we can encode as ASCII. But bizarrely, Qt
410 (on my system at least) doesn't have an ASCII codec. So we use
411 the ISO-8859-1 superset, and check for any non-ASCII characters
412 in the result. */
413 QTextCodec *latin1 = QTextCodec::codecForMib(4);
415 if (latin1->canEncode(text)) {
416 /* Attempt to represent it as an RFC2822 'phrase' --- either a
417 sequence of atoms or as a quoted-string. */
419 if (atomPhraseRx.exactMatch(text)) {
420 /* Simplest case: a sequence of atoms (not dot-atoms) */
421 return latin1->fromUnicode(text);
422 } else {
423 /* Next-simplest representation: a quoted-string */
424 QByteArray unquoted = latin1->fromUnicode(text);
426 /* Check for non-ASCII characters. */
427 for(int i = 0; i < unquoted.size(); i++) {
428 char ch = unquoted[i];
429 if (ch < 1 || ch >= 127) {
430 /* This string contains non-ASCII characters, so the
431 only way to represent it in a mail header is as an
432 RFC2047 encoded-word. */
433 return encodeRFC2047String(text, RFC2047_STRING_LATIN);
437 return quotedString(unquoted);
441 /* If the text has characters outside of the basic ASCII set, then
442 it has to be encoded using the RFC2047 encoded-word syntax. */
443 return encodeRFC2047String(text, RFC2047_STRING_UTF8);