Use QStringLiteral here too
[kdepim.git] / libksieve / src / parser / lexer.cpp
blob339da1345ec9889e32420fa66b9505cf095bc5c7
1 /* -*- c++ -*-
2 parser/lexer.cpp
4 This file is part of KSieve,
5 the KDE internet mail/usenet news message filtering library.
6 Copyright (c) 2002-2003 Marc Mutz <mutz@kde.org>
8 KSieve is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License, version 2, as
10 published by the Free Software Foundation.
12 KSieve is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 In addition, as a special exception, the copyright holders give
22 permission to link the code of this program with any edition of
23 the Qt library by Trolltech AS, Norway (or with modified versions
24 of Qt that use the same license as Qt), and distribute linked
25 combinations including the two. You must obey the GNU General
26 Public License in all respects for all of the code used other than
27 Qt. If you modify this file, you may extend this exception to
28 your version of the file, but you are not obligated to do so. If
29 you do not wish to do so, delete this exception statement from
30 your version.
33 #include <ksieve/lexer.h>
34 #include <impl/lexer.h>
36 #include <impl/utf8validator.h>
37 #include <ksieve/error.h>
39 #include <QString>
40 #include <QStringList>
41 #include <QTextCodec>
43 #include <memory> // std::unique_ptr
45 #include <assert.h>
46 #include <ctype.h> // isdigit
// STR_DIM(x): number of characters in the string literal (char array) x,
// not counting the terminating NUL. Any earlier definition is discarded
// first; #undef on a name that is not defined is a no-op.
#undef STR_DIM
#define STR_DIM(x) (sizeof(x) - 1)
53 namespace KSieve
58 // Lexer Bridge implementation
62 Lexer::Lexer(const char *scursor, const char *send, int options)
63 : i(Q_NULLPTR)
65 i = new Impl(scursor, send, options);
68 Lexer::~Lexer()
70 delete i; i = Q_NULLPTR;
73 bool Lexer::ignoreComments() const
75 assert(i);
76 return i->ignoreComments();
79 const Error &Lexer::error() const
81 assert(i);
82 return i->error();
85 bool Lexer::atEnd() const
87 assert(i);
88 return i->atEnd();
91 int Lexer::column() const
93 assert(i);
94 return i->column();
97 int Lexer::line() const
99 assert(i);
100 return i->line();
103 void Lexer::save()
105 assert(i);
106 i->save();
109 void Lexer::restore()
111 assert(i);
112 i->restore();
115 Lexer::Token Lexer::nextToken(QString &result)
117 assert(i);
118 return i->nextToken(result);
121 } // namespace KSieve
// 128-bit membership table for identifier characters: exactly a-z, A-Z,
// 0-9 and '_'. Byte k covers the codes 8*k .. 8*k+7, most significant
// bit first.
static const unsigned char iTextMap[16] = {
    0x00, 0x00, 0x00, 0x00, // 0x00 .. 0x1F (CTLs): none
    0x00, 0x00, 0xFF, 0xC0, // 0x20 .. 0x3F (SP ... '?'): '0'-'9'
    0x7F, 0xFF, 0xFF, 0xE1, // 0x40 .. 0x5F ('@' ... '_'): 'A'-'Z' and '_'
    0x7F, 0xFF, 0xFF, 0xE0  // 0x60 .. 0x7F ('`' ... DEL): 'a'-'z'
};
// 128-bit membership table for delimiter characters:
// SP, HT, CR, LF, {}[]();,#/
// Byte k covers the codes 8*k .. 8*k+7, most significant bit first.
// The last byte of the third and fourth row is 0x14 (offsets 3 and 5 of its
// group: '[' ']' resp. '{' '}'). The former value 0x16 additionally set
// offset 6, which wrongly classified '^' (and '~') as delimiters.
// ### exclude '['? Why would one want to write identifier["foo"]?
static const unsigned char delimMap[16] = {
    0x00, 0x64, 0x00, 0x00, // CTLs:        CR, HT, LF
    0x90, 0xC9, 0x00, 0x10, // SP ... '?':  SP, #(),/;
    0x00, 0x00, 0x00, 0x14, // '@' ... '_': []
    0x00, 0x00, 0x00, 0x14  // '`' ... DEL: {}
};
// 128-bit membership table for characters that may not appear outside of
// strings and comments: everything that is neither identifier text, nor a
// delimiter, nor one of '"', '*', ':'. Byte k covers the codes
// 8*k .. 8*k+7, most significant bit first. DEL (0x7F) is not flagged in
// the table itself.
static const unsigned char illegalMap[16] = {
    0xFF, 0x9B, 0xFF, 0xFF, // 0x00 .. 0x1F: all CTLs except HT, LF, CR
    0x4F, 0x16, 0x00, 0x0F, // 0x20 .. 0x3F: ! $ % & ' + - . < = > ?
    0x80, 0x00, 0x00, 0x0A, // 0x40 .. 0x5F: @ \ ^
    0x80, 0x00, 0x00, 0x0A  // 0x60 .. 0x7F: ` | ~
};
// Tests whether the 7-bit character ch is a member of the 128-bit set
// `map`. Byte ch/8 holds the bit for ch; within a byte the most significant
// bit belongs to the lowest character code.
static inline bool isOfSet(const unsigned char map[16], unsigned char ch)
{
    assert(ch < 128);
    const unsigned char group = map[ch >> 3];
    const int mask = 0x80 >> (ch & 7);
    return (group & mask) != 0;
}
154 static inline bool isIText(unsigned char ch)
156 return ch <= 'z' && isOfSet(iTextMap, ch);
159 static inline bool isDelim(unsigned char ch)
161 return ch <= '}' && isOfSet(delimMap, ch);
164 static inline bool isIllegal(unsigned char ch)
166 return ch >= '~' || isOfSet(illegalMap, ch);
// True if the high bit of ch is set, i.e. ch is not a 7-bit character.
static inline bool is8Bit(signed char ch)
{
    return (ch & 0x80) != 0;
}
173 static QString removeCRLF(const QString &s)
175 const bool CRLF = s.endsWith(QStringLiteral("\r\n"));
176 const bool LF = !CRLF && s.endsWith('\n');
178 const int e = CRLF ? 2 : LF ? 1 : 0; // what to chop off at the end
180 return s.left(s.length() - e);
183 static QString removeDotStuff(const QString &s)
185 return s.startsWith(QStringLiteral("..")) ? s.mid(1) : s;
188 namespace KSieve
193 // Lexer Implementation
197 Lexer::Impl::Impl(const char *scursor, const char *send, int options)
198 : mState(scursor ? scursor : send),
199 mEnd(send ? send : scursor),
200 mIgnoreComments(options & IgnoreComments),
201 mIgnoreLF(options & IgnoreLineFeeds)
203 if (!scursor || !send) {
204 assert(atEnd());
208 Lexer::Token Lexer::Impl::nextToken(QString &result)
210 assert(!atEnd());
211 result.clear();
212 //clearErrors();
214 const int oldLine = line();
216 const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS();
218 if (!ignoreLineFeeds() && oldLine != line()) {
219 result.setNum(line() - oldLine); // return number of linefeeds encountered
220 return LineFeeds;
223 if (!eatingWSSucceeded) {
224 return None;
227 if (atEnd()) {
228 return None;
231 switch (*mState.cursor) {
232 case '#': // HashComment
233 assert(!ignoreComments());
234 ++mState.cursor;
235 if (!atEnd()) {
236 parseHashComment(result, true);
238 return HashComment;
239 case '/': // BracketComment
240 assert(!ignoreComments());
241 ++mState.cursor; // eat slash
242 if (atEnd() || *mState.cursor != '*') {
243 makeError(Error::SlashWithoutAsterisk);
244 return BracketComment;
246 ++mState.cursor; // eat asterisk
247 if (atEnd()) {
248 makeError(Error::UnfinishedBracketComment);
249 return BracketComment;
251 parseBracketComment(result, true);
252 return BracketComment;
253 case ':': // Tag
254 ++mState.cursor;
255 if (atEnd()) {
256 makeError(Error::UnexpectedCharacter, line(), column() - 1);
257 return Tag;
259 if (!isIText(*mState.cursor)) {
260 makeIllegalCharError(*mState.cursor);
261 return Tag;
263 parseTag(result);
264 return Tag;
265 case '"': // QuotedString
266 ++mState.cursor;
267 parseQuotedString(result);
268 return QuotedString;
269 case '{':
270 case '}':
271 case '[':
272 case ']':
273 case '(':
274 case ')':
275 case ';':
276 case ',': // Special
277 result = *mState.cursor++;
278 return Special;
279 case '0':
280 case '1':
281 case '2':
282 case '3':
283 case '4':
284 case '5':
285 case '6':
286 case '7':
287 case '8':
288 case '9': // Number
289 parseNumber(result);
290 return Number;
291 case 't': // maybe MultiLineString, else Identifier
292 if (_strnicmp(mState.cursor, "text:", STR_DIM("text:")) == 0) {
293 // MultiLineString
294 mState.cursor += STR_DIM("text:");
295 parseMultiLine(result);
296 // ### FIXME: There can be a hash-comment between "text:"
297 // and CRLF! That should be preserved somehow...
298 return MultiLineString;
300 // else fall through:
301 default: // Identifier (first must not be 0-9, and can't (caught by Number above))
302 if (!isIText(*mState.cursor)) {
303 makeError(Error::IllegalCharacter);
304 return None;
306 parseIdentifier(result);
307 return Identifier;
311 bool Lexer::Impl::eatWS()
313 while (!atEnd())
314 switch (*mState.cursor) {
315 case '\r':
316 case '\n':
317 if (!eatCRLF()) {
318 return false;
320 break;
321 case ' ':
322 case '\t':
323 ++mState.cursor;
324 break;
325 default:
326 return true;
329 // at end:
330 return true;
333 bool Lexer::Impl::eatCRLF()
335 assert(!atEnd());
336 assert(*mState.cursor == '\n' || *mState.cursor == '\r');
338 if (*mState.cursor == '\r') {
339 ++mState.cursor;
340 if (atEnd() || *mState.cursor != '\n') {
341 // CR w/o LF -> error
342 makeError(Error::CRWithoutLF);
343 return false;
344 } else {
345 // good CRLF
346 newLine();
347 return true;
349 } else { /* *mState.cursor == '\n' */
350 // good, LF only
351 newLine();
352 return true;
356 bool Lexer::Impl::parseHashComment(QString &result, bool reallySave)
358 // hash-comment := "#" *CHAR-NOT-CRLF CRLF
360 // check that the caller plays by the rules:
361 assert(*(mState.cursor - 1) == '#');
363 const char *const commentStart = mState.cursor;
365 // find next CRLF:
366 while (!atEnd()) {
367 if (*mState.cursor == '\n' || *mState.cursor == '\r') {
368 break;
370 ++mState.cursor;
373 const char *const commentEnd = mState.cursor - 1;
375 if (commentEnd == commentStart) {
376 return true; // # was last char in script...
379 if (atEnd() || eatCRLF()) {
380 const int commentLength = commentEnd - commentStart + 1;
381 if (commentLength > 0) {
382 if (!isValidUtf8(commentStart, commentLength)) {
383 makeError(Error::InvalidUTF8);
384 return false;
386 if (reallySave) {
387 result += QString::fromUtf8(commentStart, commentLength);
390 return true;
393 return false;
396 bool Lexer::Impl::parseBracketComment(QString &result, bool reallySave)
398 // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
400 // check that caller plays by the rules:
401 assert(*(mState.cursor - 2) == '/');
402 assert(*(mState.cursor - 1) == '*');
404 const char *const commentStart = mState.cursor;
405 const int commentCol = column() - 2;
406 const int commentLine = line();
408 // find next asterisk:
409 do {
410 if (!skipTo('*')) {
411 if (!error()) {
412 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
414 return false;
416 } while (!atEnd() && *++mState.cursor != '/');
418 if (atEnd()) {
419 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
420 return false;
423 assert(*mState.cursor == '/');
425 const int commentLength = mState.cursor - commentStart - 1;
426 if (commentLength > 0) {
427 if (!isValidUtf8(commentStart, commentLength)) {
428 makeError(Error::InvalidUTF8);
429 return false;
431 if (reallySave) {
432 QString tmp = QString::fromUtf8(commentStart, commentLength);
433 result += tmp.remove('\r'); // get rid of CR in CRLF pairs
437 ++mState.cursor; // eat '/'
438 return true;
441 bool Lexer::Impl::parseComment(QString &result, bool reallySave)
443 // comment := hash-comment / bracket-comment
445 switch (*mState.cursor) {
446 case '#':
447 ++mState.cursor;
448 return parseHashComment(result, reallySave);
449 case '/':
450 if (charsLeft() < 2 || mState.cursor[1] != '*') {
451 makeError(Error::IllegalCharacter);
452 return false;
453 } else {
454 mState.cursor += 2; // eat "/*"
455 return parseBracketComment(result, reallySave);
457 default:
458 return false; // don't set an error here - there was no comment
462 bool Lexer::Impl::eatCWS()
464 // white-space := 1*(SP / CRLF / HTAB / comment )
466 while (!atEnd()) {
467 switch (*mState.cursor) {
468 case ' ':
469 case '\t': // SP / HTAB
470 ++mState.cursor;
471 break;;
472 case '\n':
473 case '\r': // CRLF
474 if (!eatCRLF()) {
475 return false;
477 break;
478 case '#':
479 case '/': { // comments
480 QString dummy;
481 if (!parseComment(dummy)) {
482 return false;
485 break;
486 default:
487 return true;
490 return true;
493 bool Lexer::Impl::parseIdentifier(QString &result)
495 // identifier := (ALPHA / "_") *(ALPHA DIGIT "_")
497 assert(isIText(*mState.cursor));
499 const char *const identifierStart = mState.cursor;
501 // first char:
502 if (isdigit(*mState.cursor)) { // no digits for the first
503 makeError(Error::NoLeadingDigits);
504 return false;
507 // rest of identifier chars ( now digits are allowed ):
508 for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor);
510 const int identifierLength = mState.cursor - identifierStart;
512 // Can use the fast fromLatin1 here, since identifiers are always
513 // in the us-ascii subset:
514 result += QString::fromLatin1(identifierStart, identifierLength);
516 if (atEnd() || isDelim(*mState.cursor)) {
517 return true;
520 makeIllegalCharError(*mState.cursor);
521 return false;
524 bool Lexer::Impl::parseTag(QString &result)
526 // tag := ":" identifier
528 // check that the caller plays by the rules:
529 assert(*(mState.cursor - 1) == ':');
530 assert(!atEnd());
531 assert(isIText(*mState.cursor));
533 return parseIdentifier(result);
536 bool Lexer::Impl::parseNumber(QString &result)
538 // number := 1*DIGIT [QUANTIFIER]
539 // QUANTIFIER := "K" / "M" / "G"
541 assert(isdigit(*mState.cursor));
543 while (!atEnd() && isdigit(*mState.cursor)) {
544 result += *mState.cursor++;
547 if (atEnd() || isDelim(*mState.cursor)) {
548 return true;
551 switch (*mState.cursor) {
552 case 'G':
553 case 'g':
554 case 'M':
555 case 'm':
556 case 'K':
557 case 'k':
558 result += *mState.cursor++;
559 break;
560 default:
561 makeIllegalCharError();
562 return false;
565 // quantifier found. Check for delimiter:
566 if (atEnd() || isDelim(*mState.cursor)) {
567 return true;
569 makeIllegalCharError();
570 return false;
573 bool Lexer::Impl::parseMultiLine(QString &result)
575 // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF)
576 // *(multi-line-literal / multi-line-dotstuff)
577 // "." CRLF
578 // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
579 // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
580 // ;; A line containing only "." ends the multi-line.
581 // ;; Remove a leading '.' if followed by another '.'.
583 assert(_strnicmp(mState.cursor - 5, "text:", STR_DIM("text:")) == 0);
585 const int mlBeginLine = line();
586 const int mlBeginCol = column() - 5;
588 while (!atEnd()) {
589 switch (*mState.cursor) {
590 case ' ':
591 case '\t':
592 ++mState.cursor;
593 break;
594 case '#': {
595 ++mState.cursor;
596 QString dummy;
597 if (!parseHashComment(dummy)) {
598 return false;
600 goto MultiLineStart; // break from switch _and_ while
602 case '\n':
603 case '\r':
604 if (!eatCRLF()) {
605 return false;
607 goto MultiLineStart; // break from switch _and_ while
608 default:
609 makeError(Error::NonCWSAfterTextColon);
610 return false;
614 MultiLineStart:
615 if (atEnd()) {
616 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
617 return false;
620 // Now, collect the single lines until one with only a single dot is found:
621 QStringList lines;
622 while (!atEnd()) {
623 const char *const oldBeginOfLine = beginOfLine();
624 if (!skipToCRLF()) {
625 return false;
627 const int lineLength = mState.cursor - oldBeginOfLine;
628 if (lineLength > 0) {
629 if (!isValidUtf8(oldBeginOfLine, lineLength)) {
630 makeError(Error::InvalidUTF8);
631 return false;
633 const QString line = removeCRLF(QString::fromUtf8(oldBeginOfLine, lineLength));
634 lines.push_back(removeDotStuff(line));
635 if (line == QLatin1String(".")) {
636 break;
638 } else {
639 lines.push_back(QString());
643 if (lines.back() != QLatin1String(".")) {
644 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
645 return false;
648 assert(!lines.empty());
649 lines.erase(--lines.end()); // don't include the lone dot.
650 result = lines.join(QStringLiteral("\n"));
651 return true;
654 bool Lexer::Impl::parseQuotedString(QString &result)
656 // quoted-string := DQUOTE *CHAR DQUOTE
658 // check that caller plays by the rules:
659 assert(*(mState.cursor - 1) == '"');
661 const int qsBeginCol = column() - 1;
662 const int qsBeginLine = line();
664 const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
665 assert(codec);
666 const std::unique_ptr<QTextDecoder> dec(codec->makeDecoder());
667 assert(dec.get());
669 while (!atEnd())
670 switch (*mState.cursor) {
671 case '"':
672 ++mState.cursor;
673 return true;
674 case '\r':
675 case '\n':
676 if (!eatCRLF()) {
677 return false;
679 result += '\n';
680 break;
681 case '\\':
682 ++mState.cursor;
683 if (atEnd()) {
684 break;
686 // else fall through:
687 default:
688 if (!is8Bit(*mState.cursor)) {
689 result += *mState.cursor++;
690 } else { // probably UTF-8
691 const char *const eightBitBegin = mState.cursor;
692 skipTo8BitEnd();
693 const int eightBitLen = mState.cursor - eightBitBegin;
694 assert(eightBitLen > 0);
695 if (isValidUtf8(eightBitBegin, eightBitLen)) {
696 result += dec->toUnicode(eightBitBegin, eightBitLen);
697 } else {
698 assert(column() >= eightBitLen);
699 makeError(Error::InvalidUTF8, line(), column() - eightBitLen);
700 return false;
705 makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol);
706 return false;
709 void Lexer::Impl::makeIllegalCharError(char ch)
711 makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter);
714 } // namespace KSieve