/* Copyright (C) 2006 - 2012 Jan Kundrát <jkt@flaska.net>

   This file is part of the Trojita Qt IMAP e-mail client,
   http://trojita.flaska.net/

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of
   the License or (at your option) version 3 or any later version
   accepted by the membership of KDE e.V. (or its successor approved
   by the membership of KDE e.V.), which shall act as a proxy
   defined in Section 14 of version 3 of the license.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
23 #include "Parser/3rdparty/rfccodecs.h"
24 #include "Parser/3rdparty/kcodecs.h"
28 static void enumerateCodecs()
30 static bool enumerated
= false;
33 qWarning() << "Available codecs:";
34 Q_FOREACH (const QByteArray
& codec
, QTextCodec::availableCodecs())
35 qWarning() << " " << codec
;
41 static QTextCodec
* codecForName(const QByteArray
& charset
, bool translateAscii
= true)
43 QByteArray
encoding(charset
.toLower());
45 if (!encoding
.isEmpty()) {
48 if (translateAscii
&& encoding
.contains("ascii")) {
49 // We'll assume the text is plain ASCII, to be extracted to Latin-1
50 encoding
= "ISO-8859-1";
52 else if ((index
= encoding
.indexOf('*')) != -1) {
53 // This charset specification includes a trailing language specifier
54 encoding
= encoding
.left(index
);
57 QTextCodec
* codec
= QTextCodec::codecForName(encoding
);
59 qWarning() << "QMailCodec::codecForName - Unable to find codec for charset" << encoding
;
69 /** @short Interpret the raw byte array as a sequence of bytes in the given encoding */
70 static QString
decodeByteArray(const QByteArray
&encoded
, const QString
&charset
)
72 if (QTextCodec
*codec
= codecForName(charset
.toLatin1())) {
73 return codec
->toUnicode(encoded
);
// ASCII character values used throughout
const unsigned char MaxPrintableRange = 0x7e;   // '~'
const unsigned char Space = 0x20;               // ' '
const unsigned char Equals = 0x3d;              // '='
const unsigned char QuestionMark = 0x3f;        // '?'
const unsigned char Underscore = 0x5f;          // '_'
85 /** @short Check the given unicode code point if it has to be escaped in the quoted-printable encoding according to RFC2047 */
86 static inline bool rfc2047QPNeedsEscpaing(const int unicode
)
90 if (unicode
== Equals
|| unicode
== QuestionMark
|| unicode
== Underscore
)
92 if (unicode
> MaxPrintableRange
)
97 /** @short Find the most efficient encoding for the given unicode string
99 It can be either just plain ASCII, or ISO-Latin1 using the Quoted-Printable encoding, or
100 a full-blown UTF-8 scheme with Base64 encoding.
102 static Imap::Rfc2047StringCharacterSetType
charsetForInput(const QString
& input
)
104 // shamelessly stolen from QMF's qmailmessage.cpp
106 // See if this input needs encoding
107 Imap::Rfc2047StringCharacterSetType latin1
= Imap::RFC2047_STRING_ASCII
;
109 const QChar
* it
= input
.constData();
110 const QChar
* const end
= it
+ input
.length();
111 for ( ; it
!= end
; ++it
) {
112 if ((*it
).unicode() > 0xff) {
113 // Multi-byte characters included - we need to use UTF-8
114 return Imap::RFC2047_STRING_UTF8
;
116 else if (!latin1
&& rfc2047QPNeedsEscpaing(it
->unicode()))
118 // We need encoding from latin-1
119 latin1
= Imap::RFC2047_STRING_LATIN
;
/** @short Convert a hex digit into a number

Both upper-case and lower-case letters are accepted.

@return the value 0..15 of the digit, or -1 when @arg input is not a hexadecimal digit.
*/
static inline int hexValueOfChar(const char input)
{
    if (input >= '0' && input <= '9') {
        return input - '0';
    } else if (input >= 'A' && input <= 'F') {
        return 0x0a + input - 'A';
    } else if (input >= 'a' && input <= 'f') {
        return 0x0a + input - 'a';
    } else {
        return -1;
    }
}
140 /** @short Translate a quoted-printable-encoded array of bytes into binary characters
142 The transformations performed are according to RFC 2047; underscores are transferred into spaces
143 and the three-character =12 escapes are turned into a single byte value.
145 static inline QByteArray
translateQuotedPrintableToBin(const QByteArray
&input
)
148 for (int i
= 0; i
< input
.size(); ++i
) {
149 if (input
[i
] == '_') {
151 } else if (input
[i
] == '=' && i
< input
.size() - 2) {
152 int hi
= hexValueOfChar(input
[++i
]);
153 int lo
= hexValueOfChar(input
[++i
]);
154 if (hi
!= -1 && lo
!= -1) {
155 res
+= static_cast<char>((hi
<< 4) + lo
);
157 res
+= input
.mid(i
- 2, 3);
166 /** @short Decode an encoded-word as per RFC2047 into a unicode string */
167 static QString
decodeWord(const QByteArray
&fullWord
, const QByteArray
&charset
, const QByteArray
&encoding
, const QByteArray
&encoded
)
169 if (encoding
== "Q") {
170 return decodeByteArray(translateQuotedPrintableToBin(encoded
), charset
);
171 } else if (encoding
== "B") {
172 return decodeByteArray(QByteArray::fromBase64(encoded
), charset
);
178 /** @short Decode a header in the RFC 2047 format into a unicode string */
179 static QString
decodeWordSequence(const QByteArray
& str
)
181 QRegExp
whitespace("^\\s+$");
185 // Any idea why this isn't matching?
186 //QRegExp encodedWord("\\b=\\?\\S+\\?\\S+\\?\\S*\\?=\\b");
187 QRegExp
encodedWord("\"?=(\\?\\S+)\\?(\\S+)\\?(.*)\\?=\"?");
189 // set minimal=true, to match sequences which do not have whit space in between 2 encoded words; otherwise by default greedy matching is performed
190 // eg. "Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord" will match "=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=" as a single encoded word without minimal=true
191 // with minimal=true, "=?ISO-8859-1?B?9g==?=" will be the first encoded word and "=?ISO-8859-1?B?5Q==?=" the second.
192 // -- assuming there are no nested encodings, will there be?
193 encodedWord
.setMinimal(true);
199 pos
= encodedWord
.indexIn(str
, pos
);
201 int endPos
= pos
+ encodedWord
.matchedLength();
203 QString
preceding(str
.mid(lastPos
, (pos
- lastPos
)));
204 QString decoded
= decodeWord(str
.mid(pos
, (endPos
- pos
)), encodedWord
.cap(1).toLatin1(),
205 encodedWord
.cap(2).toUpper().toLatin1(), encodedWord
.cap(3).toLatin1());
207 // If there is only whitespace between two encoded words, it should not be included
208 if (!whitespace
.exactMatch(preceding
))
209 out
.append(preceding
);
218 // Copy anything left
219 out
.append(str
.mid(lastPos
));
228 QByteArray
encodeRFC2047String(const QString
&text
, const Rfc2047StringCharacterSetType charset
)
230 // We can't allow more than 75 chars per encoded-word, including the boiler plate (7 chars and the size of the encoding spec)
231 // -- this is defined by RFC2047.
232 int maximumEncoded
= 75 - 7;
234 if (charset
== RFC2047_STRING_UTF8
)
237 encoding
= "iso-8859-1";
238 maximumEncoded
-= encoding
.size();
240 // If this is an encodedWord, we need to include any whitespace that we don't want to lose
241 if (charset
== RFC2047_STRING_UTF8
) {
245 while (start
< text
.size()) {
246 // as long as we have something to work on...
247 int size
= maximumEncoded
;
248 QByteArray candidate
;
250 // Find the character boundary at which we have to split the input.
251 // Remember that we're iterating on Unicode codepoints now, not on raw bytes.
253 candidate
= text
.mid(start
, size
).toUtf8();
254 int utf8Size
= candidate
.size();
255 int base64Size
= utf8Size
* 4 / 3 + utf8Size
% 3;
256 if (base64Size
<= maximumEncoded
) {
257 // if this chunk's size is small enough, great
258 QByteArray encoded
= candidate
.toBase64();
261 res
.append("=?utf-8?B?" + encoded
+ "?=");
265 // otherwise, try with something smaller
273 QByteArray buf
= "=?" + encoding
+ "?Q?";
275 int currentLineLength
= 0;
276 while (i
< text
.size()) {
278 const ushort unicode
= text
[i
].unicode();
279 if (unicode
== 0x20) {
281 } else if (!rfc2047QPNeedsEscpaing(unicode
)) {
282 symbol
+= text
[i
].toLatin1();
284 const char hexChars
[] = "0123456789ABCDEF";
285 symbol
= QByteArray("=") + hexChars
[(unicode
>> 4) & 0xf] + hexChars
[unicode
& 0xf];
287 currentLineLength
+= symbol
.size();
288 if (currentLineLength
> maximumEncoded
) {
289 buf
+= "?=\r\n =?" + encoding
+ "?Q?";
290 currentLineLength
= 0;
301 /** @short Encode the given string into RFC2047 form, preserving the ASCII leading part if possible */
302 QByteArray
encodeRFC2047StringWithAsciiPrefix(const QString
&text
)
304 // The maximal recommended line length, as defined by RFC 5322
305 const int maxLineLength
= 78;
307 // Find first character which needs escaping
309 while (pos
< text
.size() && pos
< maxLineLength
&&
310 (text
[pos
].unicode() == 0x20 || !rfc2047QPNeedsEscpaing(text
[pos
].unicode())))
313 // Find last character of a word which doesn't need escaping
314 if (pos
!= text
.size()) {
315 while (pos
> 0 && text
[pos
-1].unicode() != 0x20)
317 if (pos
> 0 && text
[pos
].unicode() == 0x20)
321 QByteArray prefix
= text
.left(pos
).toUtf8();
322 if (pos
== text
.size())
325 QString rest
= text
.mid(pos
);
326 Rfc2047StringCharacterSetType charset
= charsetForInput(rest
);
328 return prefix
+ encodeRFC2047String(rest
, charset
);
331 QString
decodeRFC2047String( const QByteArray
& raw
)
333 return ::decodeWordSequence( raw
);
336 QByteArray
encodeImapFolderName( const QString
& text
)
338 return KIMAP::encodeImapFolderName( text
).toLatin1();
341 QString
decodeImapFolderName( const QByteArray
& raw
)
343 return KIMAP::decodeImapFolderName( raw
);
346 QByteArray
quotedPrintableDecode( const QByteArray
& raw
)
348 return KCodecs::quotedPrintableDecode( raw
);
351 QByteArray
quotedPrintableEncode(const QByteArray
&raw
)
353 return KCodecs::quotedPrintableEncode(raw
);
/** @short Wrap a byte string in a pair of delimiters, backslash-escaping what needs it

What the visible code does: every byte of @arg unquoted is examined in order.
Tab (9), LF (10) and CR (13) get special treatment -- according to the inline
comment they are meant to be turned into plain whitespace.  A backslash is
appended to the output before any byte which equals the left delimiter, the
right delimiter or a backslash.

NOTE(review): the declarations of `quoted`, `lhq` and `rhq`, the code which
presumably selects the delimiters based on @arg style, the statement replacing
tabs/newlines and the final return are not part of this listing -- confirm
against the complete file before relying on this description.
*/
357 QByteArray
quotedString( const QByteArray
& unquoted
, QuotedStringStyle style
)
362 /* Compose a double-quoted string according to RFC2822 3.2.5 "quoted-string" */
// Process the input one byte at a time
379 for(int i
= 0; i
< unquoted
.size(); i
++) {
380 char ch
= unquoted
[i
];
// 9, 10 and 13 are the ASCII codes of tab, line feed and carriage return
381 if (ch
== 9 || ch
== 10 || ch
== 13) {
382 /* Newlines and tabs: these are only allowed in
383 quoted-strings as folding-whitespace, where
384 they are "semantically invisible". If we
385 really want to include them, we probably need
386 to do so as RFC2047 strings. But it's unlikely
387 that that's a desirable behavior in the final
388 application. Instead, translate embedded
389 tabs/newlines into normal whitespace. */
// Either delimiter and the backslash itself have to be preceded by a backslash
392 if (ch
== lhq
|| ch
== rhq
|| ch
== '\\')
393 quoted
.append('\\'); /* Quoted-pair */
402 /* encodeRFC2047Phrase encodes an arbitrary string into a
403 byte-sequence for use in a "structured" mail header (such as To:,
404 From:, or Received:). The result will match the "phrase"
406 static QRegExp
atomPhraseRx("[ \\tA-Za-z0-9!#$&'*+/=?^_`{}|~-]*");
407 QByteArray
encodeRFC2047Phrase( const QString
&text
)
409 /* We want to know if we can encode as ASCII. But bizarrely, Qt
410 (on my system at least) doesn't have an ASCII codec. So we use
411 the ISO-8859-1 superset, and check for any non-ASCII characters
413 QTextCodec
*latin1
= QTextCodec::codecForMib(4);
415 if (latin1
->canEncode(text
)) {
416 /* Attempt to represent it as an RFC2822 'phrase' --- either a
417 sequence of atoms or as a quoted-string. */
419 if (atomPhraseRx
.exactMatch(text
)) {
420 /* Simplest case: a sequence of atoms (not dot-atoms) */
421 return latin1
->fromUnicode(text
);
423 /* Next-simplest representation: a quoted-string */
424 QByteArray unquoted
= latin1
->fromUnicode(text
);
426 /* Check for non-ASCII characters. */
427 for(int i
= 0; i
< unquoted
.size(); i
++) {
428 char ch
= unquoted
[i
];
429 if (ch
< 1 || ch
>= 127) {
430 /* This string contains non-ASCII characters, so the
431 only way to represent it in a mail header is as an
432 RFC2047 encoded-word. */
433 return encodeRFC2047String(text
, RFC2047_STRING_LATIN
);
437 return quotedString(unquoted
);
441 /* If the text has characters outside of the basic ASCII set, then
442 it has to be encoded using the RFC2047 encoded-word syntax. */
443 return encodeRFC2047String(text
, RFC2047_STRING_UTF8
);