libksieve/src/parser/lexer.cpp

   1 /*  -*- c++ -*-
   2     parser/lexer.cpp
   3
   4     This file is part of KSieve,
   5     the KDE internet mail/usenet news message filtering library.
   6     Copyright (c) 2002-2003 Marc Mutz <mutz@kde.org>
   7
   8     KSieve is free software; you can redistribute it and/or modify it
   9     under the terms of the GNU General Public License, version 2, as
  10     published by the Free Software Foundation.
  11
  12     KSieve is distributed in the hope that it will be useful, but
  13     WITHOUT ANY WARRANTY; without even the implied warranty of
  14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15     General Public License for more details.
  16
  17     You should have received a copy of the GNU General Public License
  18     along with this program; if not, write to the Free Software
  19     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  20
  21     In addition, as a special exception, the copyright holders give
  22     permission to link the code of this program with any edition of
  23     the Qt library by Trolltech AS, Norway (or with modified versions
  24     of Qt that use the same license as Qt), and distribute linked
  25     combinations including the two.  You must obey the GNU General
  26     Public License in all respects for all of the code used other than
  27     Qt.  If you modify this file, you may extend this exception to
  28     your version of the file, but you are not obligated to do so.  If
  29     you do not wish to do so, delete this exception statement from
  30     your version.
  31 */
  32
  33 #include <ksieve/lexer.h>
  34 #include <impl/lexer.h>
  35
  36 #include <impl/utf8validator.h>
  37 #include <ksieve/error.h>
  38
  39 #include <QString>
  40 #include <QStringList>
  41 #include <QTextCodec>
  42
  43 #include <memory> // std::unique_ptr
  44
  45 #include <assert.h>
  46 #include <ctype.h> // isdigit
  47
// STR_DIM(x): size of the array x minus one. Used below on string literals
// such as "text:" so that the terminating NUL is not counted.
// Any earlier definition is dropped first so this one always wins.
#ifdef STR_DIM
# undef STR_DIM
#endif
#define STR_DIM(x) (sizeof(x) - 1)
  52
  53 namespace KSieve
  54 {
  55
  56 //
  57 //
  58 // Lexer Bridge implementation
  59 //
  60 //
  61
  62 Lexer::Lexer(const char *scursor, const char *send, int options)
  63     : i(Q_NULLPTR)
  64 {
  65     i = new Impl(scursor, send, options);
  66 }
  67
  68 Lexer::~Lexer()
  69 {
  70     delete i; i = Q_NULLPTR;
  71 }
  72
  73 bool Lexer::ignoreComments() const
  74 {
  75     assert(i);
  76     return i->ignoreComments();
  77 }
  78
  79 const Error &Lexer::error() const
  80 {
  81     assert(i);
  82     return i->error();
  83 }
  84
  85 bool Lexer::atEnd() const
  86 {
  87     assert(i);
  88     return i->atEnd();
  89 }
  90
  91 int Lexer::column() const
  92 {
  93     assert(i);
  94     return i->column();
  95 }
  96
  97 int Lexer::line() const
  98 {
  99     assert(i);
 100     return i->line();
 101 }
 102
 103 void Lexer::save()
 104 {
 105     assert(i);
 106     i->save();
 107 }
 108
 109 void Lexer::restore()
 110 {
 111     assert(i);
 112     i->restore();
 113 }
 114
 115 Lexer::Token Lexer::nextToken(QString &result)
 116 {
 117     assert(i);
 118     return i->nextToken(result);
 119 }
 120
 121 } // namespace KSieve
 122
 123 // none except a-zA-Z0-9_
 124 static const unsigned char iTextMap[16] = {
 125     0x00, 0x00, 0x00, 0x00, // CTLs:        none
 126     0x00, 0x00, 0xFF, 0xC0, // SP ... '?':  0-9
 127     0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_
 128     0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL: a-z
 129 };
 130
 131 // SP, HT, CR, LF, {}[]();,#/
 132 // ### exclude '['? Why would one want to write identifier["foo"]?
 133 static const unsigned char delimMap[16] = {
 134     0x00, 0x64, 0x00, 0x00, // CTLs:        CR, HT, LF
 135     0x90, 0xC9, 0x00, 0x10, // SP ... '?':  SP, #(),;
 136     0x00, 0x00, 0x00, 0x16, // '@' ... '_': []
 137     0x00, 0x00, 0x00, 0x16  // '`' ... DEL: {}
 138 };
 139
 140 // All except iText, delim, "*:
 141 static const unsigned char illegalMap[16] = {
 142     0xFF, 0x9B, 0xFF, 0xFF,
 143     0x4F, 0x16, 0x00, 0x0F,
 144     0x80, 0x00, 0x00, 0x0A,
 145     0x80, 0x00, 0x00, 0x0A
 146 };
 147
 148 static inline bool isOfSet(const unsigned char map[16], unsigned char ch)
 149 {
 150     assert(ch < 128);
 151     return (map[ ch / 8 ] & 0x80 >> ch % 8);
 152 }
 153
 154 static inline bool isIText(unsigned char ch)
 155 {
 156     return ch <= 'z' && isOfSet(iTextMap, ch);
 157 }
 158
 159 static inline bool isDelim(unsigned char ch)
 160 {
 161     return ch <= '}' && isOfSet(delimMap, ch);
 162 }
 163
 164 static inline bool isIllegal(unsigned char ch)
 165 {
 166     return ch >= '~' || isOfSet(illegalMap, ch);
 167 }
 168
 169 static inline bool is8Bit(signed char ch)
 170 {
 171     return ch < 0;
 172 }
 173 static QString removeCRLF(const QString &s)
 174 {
 175     const bool CRLF = s.endsWith(QStringLiteral("\r\n"));
 176     const bool LF = !CRLF && s.endsWith('\n');
 177
 178     const int e = CRLF ? 2 : LF ? 1 : 0;  // what to chop off at the end
 179
 180     return s.left(s.length() - e);
 181 }
 182
 183 static QString removeDotStuff(const QString &s)
 184 {
 185     return s.startsWith(QStringLiteral("..")) ? s.mid(1) : s;
 186 }
 187
 188 namespace KSieve
 189 {
 190
 191 //
 192 //
 193 // Lexer Implementation
 194 //
 195 //
 196
 197 Lexer::Impl::Impl(const char *scursor, const char *send, int options)
 198     : mState(scursor ? scursor : send),
 199       mEnd(send ? send : scursor),
 200       mIgnoreComments(options & IgnoreComments),
 201       mIgnoreLF(options & IgnoreLineFeeds)
 202 {
 203     if (!scursor || !send) {
 204         assert(atEnd());
 205     }
 206 }
 207
 208 Lexer::Token Lexer::Impl::nextToken(QString &result)
 209 {
 210     assert(!atEnd());
 211     result.clear();
 212     //clearErrors();
 213
 214     const int oldLine = line();
 215
 216     const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS();
 217
 218     if (!ignoreLineFeeds() && oldLine != line()) {
 219         result.setNum(line() - oldLine);   // return number of linefeeds encountered
 220         return LineFeeds;
 221     }
 222
 223     if (!eatingWSSucceeded) {
 224         return None;
 225     }
 226
 227     if (atEnd()) {
 228         return None;
 229     }
 230
 231     switch (*mState.cursor) {
 232     case '#': // HashComment
 233         assert(!ignoreComments());
 234         ++mState.cursor;
 235         if (!atEnd()) {
 236             parseHashComment(result, true);
 237         }
 238         return HashComment;
 239     case '/': // BracketComment
 240         assert(!ignoreComments());
 241         ++mState.cursor; // eat slash
 242         if (atEnd() || *mState.cursor != '*') {
 243             makeError(Error::SlashWithoutAsterisk);
 244             return BracketComment;
 245         }
 246         ++mState.cursor; // eat asterisk
 247         if (atEnd()) {
 248             makeError(Error::UnfinishedBracketComment);
 249             return BracketComment;
 250         }
 251         parseBracketComment(result, true);
 252         return BracketComment;
 253     case ':': // Tag
 254         ++mState.cursor;
 255         if (atEnd()) {
 256             makeError(Error::UnexpectedCharacter, line(), column() - 1);
 257             return Tag;
 258         }
 259         if (!isIText(*mState.cursor)) {
 260             makeIllegalCharError(*mState.cursor);
 261             return Tag;
 262         }
 263         parseTag(result);
 264         return Tag;
 265     case '"': // QuotedString
 266         ++mState.cursor;
 267         parseQuotedString(result);
 268         return QuotedString;
 269     case '{':
 270     case '}':
 271     case '[':
 272     case ']':
 273     case '(':
 274     case ')':
 275     case ';':
 276     case ',': // Special
 277         result = *mState.cursor++;
 278         return Special;
 279     case '0':
 280     case '1':
 281     case '2':
 282     case '3':
 283     case '4':
 284     case '5':
 285     case '6':
 286     case '7':
 287     case '8':
 288     case '9': // Number
 289         parseNumber(result);
 290         return Number;
 291     case 't': // maybe MultiLineString, else Identifier
 292         if (_strnicmp(mState.cursor, "text:", STR_DIM("text:")) == 0) {
 293             // MultiLineString
 294             mState.cursor += STR_DIM("text:");
 295             parseMultiLine(result);
 296             // ### FIXME: There can be a hash-comment between "text:"
 297             // and CRLF! That should be preserved somehow...
 298             return MultiLineString;
 299         }
 300     // else fall through:
 301     default: // Identifier (first must not be 0-9, and can't (caught by Number above))
 302         if (!isIText(*mState.cursor)) {
 303             makeError(Error::IllegalCharacter);
 304             return None;
 305         }
 306         parseIdentifier(result);
 307         return Identifier;
 308     }
 309 }
 310
 311 bool Lexer::Impl::eatWS()
 312 {
 313     while (!atEnd())
 314         switch (*mState.cursor) {
 315         case '\r':
 316         case '\n':
 317             if (!eatCRLF()) {
 318                 return false;
 319             }
 320             break;
 321         case ' ':
 322         case '\t':
 323             ++mState.cursor;
 324             break;
 325         default:
 326             return true;
 327         }
 328
 329     // at end:
 330     return true;
 331 }
 332
 333 bool Lexer::Impl::eatCRLF()
 334 {
 335     assert(!atEnd());
 336     assert(*mState.cursor == '\n' || *mState.cursor == '\r');
 337
 338     if (*mState.cursor == '\r') {
 339         ++mState.cursor;
 340         if (atEnd() || *mState.cursor != '\n') {
 341             // CR w/o LF -> error
 342             makeError(Error::CRWithoutLF);
 343             return false;
 344         } else {
 345             // good CRLF
 346             newLine();
 347             return true;
 348         }
 349     } else { /* *mState.cursor == '\n' */
 350         // good, LF only
 351         newLine();
 352         return true;
 353     }
 354 }
 355
 356 bool Lexer::Impl::parseHashComment(QString &result, bool reallySave)
 357 {
 358     // hash-comment := "#" *CHAR-NOT-CRLF CRLF
 359
 360     // check that the caller plays by the rules:
 361     assert(*(mState.cursor - 1) == '#');
 362
 363     const char *const commentStart = mState.cursor;
 364
 365     // find next CRLF:
 366     while (!atEnd()) {
 367         if (*mState.cursor == '\n' || *mState.cursor == '\r') {
 368             break;
 369         }
 370         ++mState.cursor;
 371     }
 372
 373     const char *const commentEnd = mState.cursor - 1;
 374
 375     if (commentEnd == commentStart) {
 376         return true;    // # was last char in script...
 377     }
 378
 379     if (atEnd() || eatCRLF()) {
 380         const int commentLength = commentEnd - commentStart + 1;
 381         if (commentLength > 0) {
 382             if (!isValidUtf8(commentStart, commentLength)) {
 383                 makeError(Error::InvalidUTF8);
 384                 return false;
 385             }
 386             if (reallySave) {
 387                 result += QString::fromUtf8(commentStart, commentLength);
 388             }
 389         }
 390         return true;
 391     }
 392
 393     return false;
 394 }
 395
 396 bool Lexer::Impl::parseBracketComment(QString &result, bool reallySave)
 397 {
 398     // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
 399
 400     // check that caller plays by the rules:
 401     assert(*(mState.cursor - 2) == '/');
 402     assert(*(mState.cursor - 1) == '*');
 403
 404     const char *const commentStart = mState.cursor;
 405     const int commentCol = column() - 2;
 406     const int commentLine = line();
 407
 408     // find next asterisk:
 409     do {
 410         if (!skipTo('*')) {
 411             if (!error()) {
 412                 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
 413             }
 414             return false;
 415         }
 416     } while (!atEnd() && *++mState.cursor != '/');
 417
 418     if (atEnd()) {
 419         makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
 420         return false;
 421     }
 422
 423     assert(*mState.cursor == '/');
 424
 425     const int commentLength = mState.cursor - commentStart - 1;
 426     if (commentLength > 0) {
 427         if (!isValidUtf8(commentStart, commentLength)) {
 428             makeError(Error::InvalidUTF8);
 429             return false;
 430         }
 431         if (reallySave) {
 432             QString tmp = QString::fromUtf8(commentStart, commentLength);
 433             result += tmp.remove('\r');   // get rid of CR in CRLF pairs
 434         }
 435     }
 436
 437     ++mState.cursor; // eat '/'
 438     return true;
 439 }
 440
 441 bool Lexer::Impl::parseComment(QString &result, bool reallySave)
 442 {
 443     // comment := hash-comment / bracket-comment
 444
 445     switch (*mState.cursor) {
 446     case '#':
 447         ++mState.cursor;
 448         return parseHashComment(result, reallySave);
 449     case '/':
 450         if (charsLeft() < 2 || mState.cursor[1] != '*') {
 451             makeError(Error::IllegalCharacter);
 452             return false;
 453         } else {
 454             mState.cursor += 2; // eat "/*"
 455             return parseBracketComment(result, reallySave);
 456         }
 457     default:
 458         return false; // don't set an error here - there was no comment
 459     }
 460 }
 461
 462 bool Lexer::Impl::eatCWS()
 463 {
 464     // white-space := 1*(SP / CRLF / HTAB / comment )
 465
 466     while (!atEnd()) {
 467         switch (*mState.cursor) {
 468         case ' ':
 469         case '\t': // SP / HTAB
 470             ++mState.cursor;
 471             break;;
 472         case '\n':
 473         case '\r': // CRLF
 474             if (!eatCRLF()) {
 475                 return false;
 476             }
 477             break;
 478         case '#':
 479         case '/': { // comments
 480             QString dummy;
 481             if (!parseComment(dummy)) {
 482                 return false;
 483             }
 484         }
 485         break;
 486         default:
 487             return true;
 488         }
 489     }
 490     return true;
 491 }
 492
 493 bool Lexer::Impl::parseIdentifier(QString &result)
 494 {
 495     // identifier := (ALPHA / "_") *(ALPHA DIGIT "_")
 496
 497     assert(isIText(*mState.cursor));
 498
 499     const char *const identifierStart = mState.cursor;
 500
 501     // first char:
 502     if (isdigit(*mState.cursor)) {     // no digits for the first
 503         makeError(Error::NoLeadingDigits);
 504         return false;
 505     }
 506
 507     // rest of identifier chars ( now digits are allowed ):
 508     for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor);
 509
 510     const int identifierLength = mState.cursor - identifierStart;
 511
 512     // Can use the fast fromLatin1 here, since identifiers are always
 513     // in the us-ascii subset:
 514     result += QString::fromLatin1(identifierStart, identifierLength);
 515
 516     if (atEnd() || isDelim(*mState.cursor)) {
 517         return true;
 518     }
 519
 520     makeIllegalCharError(*mState.cursor);
 521     return false;
 522 }
 523
 524 bool Lexer::Impl::parseTag(QString &result)
 525 {
 526     // tag := ":" identifier
 527
 528     // check that the caller plays by the rules:
 529     assert(*(mState.cursor - 1) == ':');
 530     assert(!atEnd());
 531     assert(isIText(*mState.cursor));
 532
 533     return parseIdentifier(result);
 534 }
 535
 536 bool Lexer::Impl::parseNumber(QString &result)
 537 {
 538     // number     := 1*DIGIT [QUANTIFIER]
 539     // QUANTIFIER := "K" / "M" / "G"
 540
 541     assert(isdigit(*mState.cursor));
 542
 543     while (!atEnd() && isdigit(*mState.cursor)) {
 544         result += *mState.cursor++;
 545     }
 546
 547     if (atEnd() || isDelim(*mState.cursor)) {
 548         return true;
 549     }
 550
 551     switch (*mState.cursor) {
 552     case 'G':
 553     case 'g':
 554     case 'M':
 555     case 'm':
 556     case 'K':
 557     case 'k':
 558         result += *mState.cursor++;
 559         break;
 560     default:
 561         makeIllegalCharError();
 562         return false;
 563     }
 564
 565     // quantifier found. Check for delimiter:
 566     if (atEnd() || isDelim(*mState.cursor)) {
 567         return true;
 568     }
 569     makeIllegalCharError();
 570     return false;
 571 }
 572
 573 bool Lexer::Impl::parseMultiLine(QString &result)
 574 {
 575     // multi-line          := "text:" *(SP / HTAB) (hash-comment / CRLF)
 576     //                        *(multi-line-literal / multi-line-dotstuff)
 577     //                        "." CRLF
 578     // multi-line-literal  := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
 579     // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
 580     //         ;; A line containing only "." ends the multi-line.
 581     //         ;; Remove a leading '.' if followed by another '.'.
 582
 583     assert(_strnicmp(mState.cursor - 5, "text:", STR_DIM("text:")) == 0);
 584
 585     const int mlBeginLine = line();
 586     const int mlBeginCol = column() - 5;
 587
 588     while (!atEnd()) {
 589         switch (*mState.cursor) {
 590         case ' ':
 591         case '\t':
 592             ++mState.cursor;
 593             break;
 594         case '#': {
 595             ++mState.cursor;
 596             QString dummy;
 597             if (!parseHashComment(dummy)) {
 598                 return false;
 599             }
 600             goto MultiLineStart; // break from switch _and_ while
 601         }
 602         case '\n':
 603         case '\r':
 604             if (!eatCRLF()) {
 605                 return false;
 606             }
 607             goto MultiLineStart; // break from switch _and_ while
 608         default:
 609             makeError(Error::NonCWSAfterTextColon);
 610             return false;
 611         }
 612     }
 613
 614 MultiLineStart:
 615     if (atEnd()) {
 616         makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
 617         return false;
 618     }
 619
 620     // Now, collect the single lines until one with only a single dot is found:
 621     QStringList lines;
 622     while (!atEnd()) {
 623         const char *const oldBeginOfLine = beginOfLine();
 624         if (!skipToCRLF()) {
 625             return false;
 626         }
 627         const int lineLength = mState.cursor - oldBeginOfLine;
 628         if (lineLength > 0) {
 629             if (!isValidUtf8(oldBeginOfLine, lineLength)) {
 630                 makeError(Error::InvalidUTF8);
 631                 return false;
 632             }
 633             const QString line = removeCRLF(QString::fromUtf8(oldBeginOfLine, lineLength));
 634             lines.push_back(removeDotStuff(line));
 635             if (line == QLatin1String(".")) {
 636                 break;
 637             }
 638         } else {
 639             lines.push_back(QString());
 640         }
 641     }
 642
 643     if (lines.back() != QLatin1String(".")) {
 644         makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
 645         return false;
 646     }
 647
 648     assert(!lines.empty());
 649     lines.erase(--lines.end());   // don't include the lone dot.
 650     result = lines.join(QStringLiteral("\n"));
 651     return true;
 652 }
 653
 654 bool Lexer::Impl::parseQuotedString(QString &result)
 655 {
 656     // quoted-string := DQUOTE *CHAR DQUOTE
 657
 658     // check that caller plays by the rules:
 659     assert(*(mState.cursor - 1) == '"');
 660
 661     const int qsBeginCol = column() - 1;
 662     const int qsBeginLine = line();
 663
 664     const QTextCodec *const codec = QTextCodec::codecForMib(106);    // UTF-8
 665     assert(codec);
 666     const std::unique_ptr<QTextDecoder> dec(codec->makeDecoder());
 667     assert(dec.get());
 668
 669     while (!atEnd())
 670         switch (*mState.cursor) {
 671         case '"':
 672             ++mState.cursor;
 673             return true;
 674         case '\r':
 675         case '\n':
 676             if (!eatCRLF()) {
 677                 return false;
 678             }
 679             result += '\n';
 680             break;
 681         case '\\':
 682             ++mState.cursor;
 683             if (atEnd()) {
 684                 break;
 685             }
 686         // else fall through:
 687         default:
 688             if (!is8Bit(*mState.cursor)) {
 689                 result += *mState.cursor++;
 690             } else { // probably UTF-8
 691                 const char *const eightBitBegin = mState.cursor;
 692                 skipTo8BitEnd();
 693                 const int eightBitLen = mState.cursor - eightBitBegin;
 694                 assert(eightBitLen > 0);
 695                 if (isValidUtf8(eightBitBegin, eightBitLen)) {
 696                     result += dec->toUnicode(eightBitBegin, eightBitLen);
 697                 } else {
 698                     assert(column() >= eightBitLen);
 699                     makeError(Error::InvalidUTF8, line(), column() - eightBitLen);
 700                     return false;
 701                 }
 702             }
 703         }
 704
 705     makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol);
 706     return false;
 707 }
 708
 709 void Lexer::Impl::makeIllegalCharError(char ch)
 710 {
 711     makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter);
 712 }
 713
 714 } // namespace KSieve