4 This file is part of KSieve,
5 the KDE internet mail/usenet news message filtering library.
6 Copyright (c) 2002-2003 Marc Mutz <mutz@kde.org>
8 KSieve is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License, version 2, as
10 published by the Free Software Foundation.
12 KSieve is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 In addition, as a special exception, the copyright holders give
22 permission to link the code of this program with any edition of
23 the Qt library by Trolltech AS, Norway (or with modified versions
24 of Qt that use the same license as Qt), and distribute linked
25 combinations including the two. You must obey the GNU General
26 Public License in all respects for all of the code used other than
27 Qt. If you modify this file, you may extend this exception to
28 your version of the file, but you are not obligated to do so. If
29 you do not wish to do so, delete this exception statement from
33 #include <ksieve/lexer.h>
34 #include <impl/lexer.h>
36 #include <impl/utf8validator.h>
37 #include <ksieve/error.h>
40 #include <QStringList>
43 #include <memory> // std::unique_ptr
46 #include <ctype.h> // isdigit
51 #define STR_DIM(x) (sizeof(x) - 1)
58 // Lexer Bridge implementation
62 Lexer::Lexer(const char *scursor
, const char *send
, int options
)
65 i
= new Impl(scursor
, send
, options
);
70 delete i
; i
= Q_NULLPTR
;
73 bool Lexer::ignoreComments() const
76 return i
->ignoreComments();
79 const Error
&Lexer::error() const
85 bool Lexer::atEnd() const
91 int Lexer::column() const
97 int Lexer::line() const
109 void Lexer::restore()
115 Lexer::Token
Lexer::nextToken(QString
&result
)
118 return i
->nextToken(result
);
121 } // namespace KSieve
// Character classification tables for the lexer.
//
// Each table is a 128-bit set over US-ASCII: character c is a member when
// bit (0x80 >> (c % 8)) of byte (c / 8) is set, i.e. the most significant
// bit of byte 0 stands for NUL.

// iText: none except a-zA-Z0-9_
static const unsigned char iTextMap[16] = {
    0x00, 0x00, 0x00, 0x00, // CTLs:        none
    0x00, 0x00, 0xFF, 0xC0, // SP ... '?':  0-9
    0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_
    0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL: a-z
};

// Delimiters: SP, HT, CR, LF, {}[]();,#/
// ### exclude '['? Why would one want to write identifier["foo"]?
static const unsigned char delimMap[16] = {
    0x00, 0x64, 0x00, 0x00, // CTLs:        CR, HT, LF
    0x90, 0xC9, 0x00, 0x10, // SP ... '?':  SP, #(),/;
    0x00, 0x00, 0x00, 0x14, // '@' ... '_': []  (0x16 would also flag '^')
    0x00, 0x00, 0x00, 0x14  // '`' ... DEL: {}  (0x16 would also flag '~')
};

// Illegal: all except iText, delim, "*:
static const unsigned char illegalMap[16] = {
    0xFF, 0x9B, 0xFF, 0xFF,
    0x4F, 0x16, 0x00, 0x0F,
    0x80, 0x00, 0x00, 0x0A,
    0x80, 0x00, 0x00, 0x0A
};

// Returns true if ch is a member of the 128-bit set `map`.
// Characters outside US-ASCII (>= 128) are never members; the guard also
// keeps the lookup inside the 16-byte table.
static inline bool isOfSet(const unsigned char map[16], unsigned char ch)
{
    if (ch >= 128) {
        return false;
    }
    return (map[ch / 8] & (0x80 >> (ch % 8))) != 0;
}

// True for characters that may appear in an identifier: a-z, A-Z, 0-9, '_'.
static inline bool isIText(unsigned char ch)
{
    return ch <= 'z' && isOfSet(iTextMap, ch);
}

// True for characters that terminate a token: SP, HT, CR, LF, {}[]();,#/
static inline bool isDelim(unsigned char ch)
{
    return ch <= '}' && isOfSet(delimMap, ch);
}

// True for characters that can never start or continue a token: everything
// from '~' upwards (including all 8-bit values) plus the members of
// illegalMap.
static inline bool isIllegal(unsigned char ch)
{
    return ch >= '~' || isOfSet(illegalMap, ch);
}
169 static inline bool is8Bit(signed char ch
)
173 static QString
removeCRLF(const QString
&s
)
175 const bool CRLF
= s
.endsWith(QStringLiteral("\r\n"));
176 const bool LF
= !CRLF
&& s
.endsWith('\n');
178 const int e
= CRLF
? 2 : LF
? 1 : 0; // what to chop off at the end
180 return s
.left(s
.length() - e
);
183 static QString
removeDotStuff(const QString
&s
)
185 return s
.startsWith(QStringLiteral("..")) ? s
.mid(1) : s
;
193 // Lexer Implementation
197 Lexer::Impl::Impl(const char *scursor
, const char *send
, int options
)
198 : mState(scursor
? scursor
: send
),
199 mEnd(send
? send
: scursor
),
200 mIgnoreComments(options
& IgnoreComments
),
201 mIgnoreLF(options
& IgnoreLineFeeds
)
203 if (!scursor
|| !send
) {
208 Lexer::Token
Lexer::Impl::nextToken(QString
&result
)
214 const int oldLine
= line();
216 const bool eatingWSSucceeded
= ignoreComments() ? eatCWS() : eatWS();
218 if (!ignoreLineFeeds() && oldLine
!= line()) {
219 result
.setNum(line() - oldLine
); // return number of linefeeds encountered
223 if (!eatingWSSucceeded
) {
231 switch (*mState
.cursor
) {
232 case '#': // HashComment
233 assert(!ignoreComments());
236 parseHashComment(result
, true);
239 case '/': // BracketComment
240 assert(!ignoreComments());
241 ++mState
.cursor
; // eat slash
242 if (atEnd() || *mState
.cursor
!= '*') {
243 makeError(Error::SlashWithoutAsterisk
);
244 return BracketComment
;
246 ++mState
.cursor
; // eat asterisk
248 makeError(Error::UnfinishedBracketComment
);
249 return BracketComment
;
251 parseBracketComment(result
, true);
252 return BracketComment
;
256 makeError(Error::UnexpectedCharacter
, line(), column() - 1);
259 if (!isIText(*mState
.cursor
)) {
260 makeIllegalCharError(*mState
.cursor
);
265 case '"': // QuotedString
267 parseQuotedString(result
);
277 result
= *mState
.cursor
++;
291 case 't': // maybe MultiLineString, else Identifier
292 if (_strnicmp(mState
.cursor
, "text:", STR_DIM("text:")) == 0) {
294 mState
.cursor
+= STR_DIM("text:");
295 parseMultiLine(result
);
296 // ### FIXME: There can be a hash-comment between "text:"
297 // and CRLF! That should be preserved somehow...
298 return MultiLineString
;
300 // else fall through:
301 default: // Identifier (first must not be 0-9, and can't (caught by Number above))
302 if (!isIText(*mState
.cursor
)) {
303 makeError(Error::IllegalCharacter
);
306 parseIdentifier(result
);
311 bool Lexer::Impl::eatWS()
314 switch (*mState
.cursor
) {
333 bool Lexer::Impl::eatCRLF()
336 assert(*mState
.cursor
== '\n' || *mState
.cursor
== '\r');
338 if (*mState
.cursor
== '\r') {
340 if (atEnd() || *mState
.cursor
!= '\n') {
341 // CR w/o LF -> error
342 makeError(Error::CRWithoutLF
);
349 } else { /* *mState.cursor == '\n' */
356 bool Lexer::Impl::parseHashComment(QString
&result
, bool reallySave
)
358 // hash-comment := "#" *CHAR-NOT-CRLF CRLF
360 // check that the caller plays by the rules:
361 assert(*(mState
.cursor
- 1) == '#');
363 const char *const commentStart
= mState
.cursor
;
367 if (*mState
.cursor
== '\n' || *mState
.cursor
== '\r') {
373 const char *const commentEnd
= mState
.cursor
- 1;
375 if (commentEnd
== commentStart
) {
376 return true; // # was last char in script...
379 if (atEnd() || eatCRLF()) {
380 const int commentLength
= commentEnd
- commentStart
+ 1;
381 if (commentLength
> 0) {
382 if (!isValidUtf8(commentStart
, commentLength
)) {
383 makeError(Error::InvalidUTF8
);
387 result
+= QString::fromUtf8(commentStart
, commentLength
);
396 bool Lexer::Impl::parseBracketComment(QString
&result
, bool reallySave
)
398 // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
400 // check that caller plays by the rules:
401 assert(*(mState
.cursor
- 2) == '/');
402 assert(*(mState
.cursor
- 1) == '*');
404 const char *const commentStart
= mState
.cursor
;
405 const int commentCol
= column() - 2;
406 const int commentLine
= line();
408 // find next asterisk:
412 makeError(Error::UnfinishedBracketComment
, commentLine
, commentCol
);
416 } while (!atEnd() && *++mState
.cursor
!= '/');
419 makeError(Error::UnfinishedBracketComment
, commentLine
, commentCol
);
423 assert(*mState
.cursor
== '/');
425 const int commentLength
= mState
.cursor
- commentStart
- 1;
426 if (commentLength
> 0) {
427 if (!isValidUtf8(commentStart
, commentLength
)) {
428 makeError(Error::InvalidUTF8
);
432 QString tmp
= QString::fromUtf8(commentStart
, commentLength
);
433 result
+= tmp
.remove('\r'); // get rid of CR in CRLF pairs
437 ++mState
.cursor
; // eat '/'
441 bool Lexer::Impl::parseComment(QString
&result
, bool reallySave
)
443 // comment := hash-comment / bracket-comment
445 switch (*mState
.cursor
) {
448 return parseHashComment(result
, reallySave
);
450 if (charsLeft() < 2 || mState
.cursor
[1] != '*') {
451 makeError(Error::IllegalCharacter
);
454 mState
.cursor
+= 2; // eat "/*"
455 return parseBracketComment(result
, reallySave
);
458 return false; // don't set an error here - there was no comment
462 bool Lexer::Impl::eatCWS()
464 // white-space := 1*(SP / CRLF / HTAB / comment )
467 switch (*mState
.cursor
) {
469 case '\t': // SP / HTAB
479 case '/': { // comments
481 if (!parseComment(dummy
)) {
493 bool Lexer::Impl::parseIdentifier(QString
&result
)
495 // identifier := (ALPHA / "_") *(ALPHA / DIGIT / "_")
497 assert(isIText(*mState
.cursor
));
499 const char *const identifierStart
= mState
.cursor
;
502 if (isdigit(*mState
.cursor
)) { // no digits for the first
503 makeError(Error::NoLeadingDigits
);
507 // rest of identifier chars ( now digits are allowed ):
508 for (++mState
.cursor
; !atEnd() && isIText(*mState
.cursor
); ++mState
.cursor
);
510 const int identifierLength
= mState
.cursor
- identifierStart
;
512 // Can use the fast fromLatin1 here, since identifiers are always
513 // in the us-ascii subset:
514 result
+= QString::fromLatin1(identifierStart
, identifierLength
);
516 if (atEnd() || isDelim(*mState
.cursor
)) {
520 makeIllegalCharError(*mState
.cursor
);
524 bool Lexer::Impl::parseTag(QString
&result
)
526 // tag := ":" identifier
528 // check that the caller plays by the rules:
529 assert(*(mState
.cursor
- 1) == ':');
531 assert(isIText(*mState
.cursor
));
533 return parseIdentifier(result
);
536 bool Lexer::Impl::parseNumber(QString
&result
)
538 // number := 1*DIGIT [QUANTIFIER]
539 // QUANTIFIER := "K" / "M" / "G"
541 assert(isdigit(*mState
.cursor
));
543 while (!atEnd() && isdigit(*mState
.cursor
)) {
544 result
+= *mState
.cursor
++;
547 if (atEnd() || isDelim(*mState
.cursor
)) {
551 switch (*mState
.cursor
) {
558 result
+= *mState
.cursor
++;
561 makeIllegalCharError();
565 // quantifier found. Check for delimiter:
566 if (atEnd() || isDelim(*mState
.cursor
)) {
569 makeIllegalCharError();
573 bool Lexer::Impl::parseMultiLine(QString
&result
)
575 // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF)
576 // *(multi-line-literal / multi-line-dotstuff)
578 // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
579 // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
580 // ;; A line containing only "." ends the multi-line.
581 // ;; Remove a leading '.' if followed by another '.'.
583 assert(_strnicmp(mState
.cursor
- 5, "text:", STR_DIM("text:")) == 0);
585 const int mlBeginLine
= line();
586 const int mlBeginCol
= column() - 5;
589 switch (*mState
.cursor
) {
597 if (!parseHashComment(dummy
)) {
600 goto MultiLineStart
; // break from switch _and_ while
607 goto MultiLineStart
; // break from switch _and_ while
609 makeError(Error::NonCWSAfterTextColon
);
616 makeError(Error::PrematureEndOfMultiLine
, mlBeginLine
, mlBeginCol
);
620 // Now, collect the single lines until one with only a single dot is found:
623 const char *const oldBeginOfLine
= beginOfLine();
627 const int lineLength
= mState
.cursor
- oldBeginOfLine
;
628 if (lineLength
> 0) {
629 if (!isValidUtf8(oldBeginOfLine
, lineLength
)) {
630 makeError(Error::InvalidUTF8
);
633 const QString line
= removeCRLF(QString::fromUtf8(oldBeginOfLine
, lineLength
));
634 lines
.push_back(removeDotStuff(line
));
635 if (line
== QLatin1String(".")) {
639 lines
.push_back(QString());
643 if (lines
.back() != QLatin1String(".")) {
644 makeError(Error::PrematureEndOfMultiLine
, mlBeginLine
, mlBeginCol
);
648 assert(!lines
.empty());
649 lines
.erase(--lines
.end()); // don't include the lone dot.
650 result
= lines
.join(QStringLiteral("\n"));
654 bool Lexer::Impl::parseQuotedString(QString
&result
)
656 // quoted-string := DQUOTE *CHAR DQUOTE
658 // check that caller plays by the rules:
659 assert(*(mState
.cursor
- 1) == '"');
661 const int qsBeginCol
= column() - 1;
662 const int qsBeginLine
= line();
664 const QTextCodec
*const codec
= QTextCodec::codecForMib(106); // UTF-8
666 const std::unique_ptr
<QTextDecoder
> dec(codec
->makeDecoder());
670 switch (*mState
.cursor
) {
686 // else fall through:
688 if (!is8Bit(*mState
.cursor
)) {
689 result
+= *mState
.cursor
++;
690 } else { // probably UTF-8
691 const char *const eightBitBegin
= mState
.cursor
;
693 const int eightBitLen
= mState
.cursor
- eightBitBegin
;
694 assert(eightBitLen
> 0);
695 if (isValidUtf8(eightBitBegin
, eightBitLen
)) {
696 result
+= dec
->toUnicode(eightBitBegin
, eightBitLen
);
698 assert(column() >= eightBitLen
);
699 makeError(Error::InvalidUTF8
, line(), column() - eightBitLen
);
705 makeError(Error::PrematureEndOfQuotedString
, qsBeginLine
, qsBeginCol
);
709 void Lexer::Impl::makeIllegalCharError(char ch
)
711 makeError(isIllegal(ch
) ? Error::IllegalCharacter
: Error::UnexpectedCharacter
);
714 } // namespace KSieve