source/libs/poppler/poppler-src/poppler/Lexer.cc

   1 //========================================================================
   2 //
   3 // Lexer.cc
   4 //
   5 // Copyright 1996-2003 Glyph & Cog, LLC
   6 //
   7 //========================================================================
   8
   9 //========================================================================
  10 //
  11 // Modified under the Poppler project - http://poppler.freedesktop.org
  12 //
  13 // All changes made under the Poppler project to this file are licensed
  14 // under GPL version 2 or later
  15 //
  16 // Copyright (C) 2006-2010, 2012-2014 Albert Astals Cid <aacid@kde.org>
  17 // Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
  18 // Copyright (C) 2010 Carlos Garcia Campos <carlosgc@gnome.org>
  19 // Copyright (C) 2012, 2013 Adrian Johnson <ajohnson@redneon.com>
  20 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
  21 //
  22 // To see a description of the changes please see the Changelog file that
  23 // came with your tarball or type make ChangeLog if you are building from git
  24 //
  25 //========================================================================
  26
  27 #include <config.h>
  28
  29 #ifdef USE_GCC_PRAGMAS
  30 #pragma implementation
  31 #endif
  32
  33 #include <stdlib.h>
  34 #include <stddef.h>
  35 #include <string.h>
  36 #include <limits.h>
  37 #include <ctype.h>
  38 #include "Lexer.h"
  39 #include "Error.h"
  40 #include "XRef.h"
  41
  42 //------------------------------------------------------------------------
  43
  44 // A '1' in this array means the character is white space.  A '1' or
  45 // '2' means the character ends a name or command.
  46 static const char specialChars[256] = {
  47   1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  48   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  49   1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  50   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  51   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  52   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  53   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  54   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  55   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  56   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  57   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  58   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  59   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  60   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  61   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  62   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
  63 };
  64
  65 static const int IntegerSafeLimit = (INT_MAX - 9) / 10;
  66 static const long long LongLongSafeLimit = (LLONG_MAX - 9) / 10;
  67
  68 //------------------------------------------------------------------------
  69 // Lexer
  70 //------------------------------------------------------------------------
  71
  72 Lexer::Lexer(XRef *xrefA, Stream *str) {
  73   Object obj;
  74
  75   lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
  76   xref = xrefA;
  77
  78   curStr.initStream(str);
  79   streams = new Array(xref);
  80   streams->add(curStr.copy(&obj));
  81   strPtr = 0;
  82   freeArray = gTrue;
  83   curStr.streamReset();
  84 }
  85
  86 Lexer::Lexer(XRef *xrefA, Object *obj) {
  87   Object obj2;
  88
  89   lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
  90   xref = xrefA;
  91
  92   if (obj->isStream()) {
  93     streams = new Array(xref);
  94     freeArray = gTrue;
  95     streams->add(obj->copy(&obj2));
  96   } else {
  97     streams = obj->getArray();
  98     freeArray = gFalse;
  99   }
 100   strPtr = 0;
 101   if (streams->getLength() > 0) {
 102     streams->get(strPtr, &curStr);
 103     curStr.streamReset();
 104   }
 105 }
 106
 107 Lexer::~Lexer() {
 108   if (!curStr.isNone()) {
 109     curStr.streamClose();
 110     curStr.free();
 111   }
 112   if (freeArray) {
 113     delete streams;
 114   }
 115 }
 116
 117 int Lexer::getChar(GBool comesFromLook) {
 118   int c;
 119
 120   if (LOOK_VALUE_NOT_CACHED != lookCharLastValueCached) {
 121     c = lookCharLastValueCached;
 122     lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
 123     return c;
 124   }
 125
 126   c = EOF;
 127   while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
 128     if (comesFromLook == gTrue) {
 129       return EOF;
 130     } else {
 131       curStr.streamClose();
 132       curStr.free();
 133       ++strPtr;
 134       if (strPtr < streams->getLength()) {
 135         streams->get(strPtr, &curStr);
 136         curStr.streamReset();
 137       }
 138     }
 139   }
 140   return c;
 141 }
 142
 143 int Lexer::lookChar() {
 144
 145   if (LOOK_VALUE_NOT_CACHED != lookCharLastValueCached) {
 146     return lookCharLastValueCached;
 147   }
 148   lookCharLastValueCached = getChar(gTrue);
 149   if (lookCharLastValueCached == EOF) {
 150     lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
 151     return EOF;
 152   } else {
 153     return lookCharLastValueCached;
 154   }
 155 }
 156
 157 Object *Lexer::getObj(Object *obj, int objNum) {
 158   char *p;
 159   int c, c2;
 160   GBool comment, neg, done, overflownInteger, overflownLongLong;
 161   int numParen;
 162   int xi;
 163   long long xll = 0;
 164   double xf = 0, scale;
 165   GooString *s;
 166   int n, m;
 167
 168   // skip whitespace and comments
 169   comment = gFalse;
 170   while (1) {
 171     if ((c = getChar()) == EOF) {
 172       return obj->initEOF();
 173     }
 174     if (comment) {
 175       if (c == '\r' || c == '\n')
 176         comment = gFalse;
 177     } else if (c == '%') {
 178       comment = gTrue;
 179     } else if (specialChars[c] != 1) {
 180       break;
 181     }
 182   }
 183
 184   // start reading token
 185   switch (c) {
 186
 187   // number
 188   case '0': case '1': case '2': case '3': case '4':
 189   case '5': case '6': case '7': case '8': case '9':
 190   case '+': case '-': case '.':
 191     overflownInteger = gFalse;
 192     overflownLongLong = gFalse;
 193     neg = gFalse;
 194     xi = 0;
 195     if (c == '-') {
 196       neg = gTrue;
 197     } else if (c == '.') {
 198       goto doReal;
 199     } else if (c != '+') {
 200       xi = c - '0';
 201     }
 202     while (1) {
 203       c = lookChar();
 204       if (isdigit(c)) {
 205         getChar();
 206         if (unlikely(overflownLongLong)) {
 207           xf = xf * 10.0 + (c - '0');
 208         } else if (unlikely (overflownInteger)) {
 209           if (unlikely(xll > LongLongSafeLimit) &&
 210               (xll > (LLONG_MAX - (c - '0')) / 10.0)) {
 211             overflownLongLong = gTrue;
 212             xf = xll * 10.0 + (c - '0');
 213           } else {
 214             xll = xll * 10 + (c - '0');
 215           }
 216         } else {
 217           if (unlikely(xi > IntegerSafeLimit) &&
 218               (xi > (INT_MAX - (c - '0')) / 10.0)) {
 219             overflownInteger = gTrue;
 220             xll = xi * 10LL + (c - '0');
 221           } else {
 222             xi = xi * 10 + (c - '0');
 223           }
 224         }
 225       } else if (c == '.') {
 226         getChar();
 227         goto doReal;
 228       } else {
 229         break;
 230       }
 231     }
 232     if (neg) {
 233       xi = -xi;
 234       xll = -xll;
 235       xf = -xf;
 236     }
 237     if (unlikely(overflownInteger)) {
 238       if (overflownLongLong) {
 239         obj->initReal(xf);
 240       } else {
 241         if (unlikely(xll == INT_MIN)) {
 242           obj->initInt(INT_MIN);
 243         } else {
 244           obj->initInt64(xll);
 245         }
 246       }
 247     } else {
 248       obj->initInt(xi);
 249     }
 250     break;
 251   doReal:
 252     if (likely(!overflownInteger)) {
 253       xf = xi;
 254     } else if (!overflownLongLong) {
 255       xf = xll;
 256     }
 257     scale = 0.1;
 258     while (1) {
 259       c = lookChar();
 260       if (c == '-') {
 261         // ignore minus signs in the middle of numbers to match
 262         // Adobe's behavior
 263         error(errSyntaxWarning, getPos(), "Badly formatted number");
 264         getChar();
 265         continue;
 266       }
 267       if (!isdigit(c)) {
 268         break;
 269       }
 270       getChar();
 271       xf = xf + scale * (c - '0');
 272       scale *= 0.1;
 273     }
 274     if (neg) {
 275       xf = -xf;
 276     }
 277     obj->initReal(xf);
 278     break;
 279
 280   // string
 281   case '(':
 282     p = tokBuf;
 283     n = 0;
 284     numParen = 1;
 285     done = gFalse;
 286     s = NULL;
 287     do {
 288       c2 = EOF;
 289       switch (c = getChar()) {
 290
 291       case EOF:
 292 #if 0
 293       // This breaks some PDF files, e.g., ones from Photoshop.
 294       case '\r':
 295       case '\n':
 296 #endif
 297         error(errSyntaxError, getPos(), "Unterminated string");
 298         done = gTrue;
 299         break;
 300
 301       case '(':
 302         ++numParen;
 303         c2 = c;
 304         break;
 305
 306       case ')':
 307         if (--numParen == 0) {
 308           done = gTrue;
 309         } else {
 310           c2 = c;
 311         }
 312         break;
 313
 314       case '\\':
 315         switch (c = getChar()) {
 316         case 'n':
 317           c2 = '\n';
 318           break;
 319         case 'r':
 320           c2 = '\r';
 321           break;
 322         case 't':
 323           c2 = '\t';
 324           break;
 325         case 'b':
 326           c2 = '\b';
 327           break;
 328         case 'f':
 329           c2 = '\f';
 330           break;
 331         case '\\':
 332         case '(':
 333         case ')':
 334           c2 = c;
 335           break;
 336         case '0': case '1': case '2': case '3':
 337         case '4': case '5': case '6': case '7':
 338           c2 = c - '0';
 339           c = lookChar();
 340           if (c >= '0' && c <= '7') {
 341             getChar();
 342             c2 = (c2 << 3) + (c - '0');
 343             c = lookChar();
 344             if (c >= '0' && c <= '7') {
 345               getChar();
 346               c2 = (c2 << 3) + (c - '0');
 347             }
 348           }
 349           break;
 350         case '\r':
 351           c = lookChar();
 352           if (c == '\n') {
 353             getChar();
 354           }
 355           break;
 356         case '\n':
 357           break;
 358         case EOF:
 359           error(errSyntaxError, getPos(), "Unterminated string");
 360           done = gTrue;
 361           break;
 362         default:
 363           c2 = c;
 364           break;
 365         }
 366         break;
 367
 368       default:
 369         c2 = c;
 370         break;
 371       }
 372
 373       if (c2 != EOF) {
 374         if (n == tokBufSize) {
 375           if (!s)
 376             s = new GooString(tokBuf, tokBufSize);
 377           else
 378             s->append(tokBuf, tokBufSize);
 379           p = tokBuf;
 380           n = 0;
 381
 382           // we are growing see if the document is not malformed and we are growing too much
 383           if (objNum > 0 && xref != NULL)
 384           {
 385             int newObjNum = xref->getNumEntry(curStr.streamGetPos());
 386             if (newObjNum != objNum)
 387             {
 388               error(errSyntaxError, getPos(), "Unterminated string");
 389               done = gTrue;
 390               delete s;
 391               n = -2;
 392             }
 393           }
 394         }
 395         *p++ = (char)c2;
 396         ++n;
 397       }
 398     } while (!done);
 399     if (n >= 0) {
 400       if (!s)
 401         s = new GooString(tokBuf, n);
 402       else
 403         s->append(tokBuf, n);
 404       obj->initString(s);
 405     } else {
 406       obj->initEOF();
 407     }
 408     break;
 409
 410   // name
 411   case '/':
 412     p = tokBuf;
 413     n = 0;
 414     s = NULL;
 415     while ((c = lookChar()) != EOF && !specialChars[c]) {
 416       getChar();
 417       if (c == '#') {
 418         c2 = lookChar();
 419         if (c2 >= '0' && c2 <= '9') {
 420           c = c2 - '0';
 421         } else if (c2 >= 'A' && c2 <= 'F') {
 422           c = c2 - 'A' + 10;
 423         } else if (c2 >= 'a' && c2 <= 'f') {
 424           c = c2 - 'a' + 10;
 425         } else {
 426           goto notEscChar;
 427         }
 428         getChar();
 429         c <<= 4;
 430         c2 = getChar();
 431         if (c2 >= '0' && c2 <= '9') {
 432           c += c2 - '0';
 433         } else if (c2 >= 'A' && c2 <= 'F') {
 434           c += c2 - 'A' + 10;
 435         } else if (c2 >= 'a' && c2 <= 'f') {
 436           c += c2 - 'a' + 10;
 437         } else {
 438           error(errSyntaxError, getPos(), "Illegal digit in hex char in name");
 439         }
 440       }
 441      notEscChar:
 442       // the PDF spec claims that names are limited to 127 chars, but
 443       // Distiller 8 will produce longer names, and Acrobat 8 will
 444       // accept longer names
 445       ++n;
 446       if (n < tokBufSize) {
 447         *p++ = c;
 448       } else if (n == tokBufSize) {
 449         error(errSyntaxError, getPos(), "Warning: name token is longer than what the specification says it can be");
 450         *p = c;
 451         s = new GooString(tokBuf, n);
 452       } else {
 453         s->append((char)c);
 454       }
 455     }
 456     if (n < tokBufSize) {
 457       *p = '\0';
 458       obj->initName(tokBuf);
 459     } else {
 460       obj->initName(s->getCString());
 461       delete s;
 462     }
 463     break;
 464
 465   // array punctuation
 466   case '[':
 467   case ']':
 468     tokBuf[0] = c;
 469     tokBuf[1] = '\0';
 470     obj->initCmd(tokBuf);
 471     break;
 472
 473   // hex string or dict punctuation
 474   case '<':
 475     c = lookChar();
 476
 477     // dict punctuation
 478     if (c == '<') {
 479       getChar();
 480       tokBuf[0] = tokBuf[1] = '<';
 481       tokBuf[2] = '\0';
 482       obj->initCmd(tokBuf);
 483
 484     // hex string
 485     } else {
 486       p = tokBuf;
 487       m = n = 0;
 488       c2 = 0;
 489       s = NULL;
 490       while (1) {
 491         c = getChar();
 492         if (c == '>') {
 493           break;
 494         } else if (c == EOF) {
 495           error(errSyntaxError, getPos(), "Unterminated hex string");
 496           break;
 497         } else if (specialChars[c] != 1) {
 498           c2 = c2 << 4;
 499           if (c >= '0' && c <= '9')
 500             c2 += c - '0';
 501           else if (c >= 'A' && c <= 'F')
 502             c2 += c - 'A' + 10;
 503           else if (c >= 'a' && c <= 'f')
 504             c2 += c - 'a' + 10;
 505           else
 506             error(errSyntaxError, getPos(), "Illegal character <{0:02x}> in hex string", c);
 507           if (++m == 2) {
 508             if (n == tokBufSize) {
 509               if (!s)
 510                 s = new GooString(tokBuf, tokBufSize);
 511               else
 512                 s->append(tokBuf, tokBufSize);
 513               p = tokBuf;
 514               n = 0;
 515             }
 516             *p++ = (char)c2;
 517             ++n;
 518             c2 = 0;
 519             m = 0;
 520           }
 521         }
 522       }
 523       if (!s)
 524         s = new GooString(tokBuf, n);
 525       else
 526         s->append(tokBuf, n);
 527       if (m == 1)
 528         s->append((char)(c2 << 4));
 529       obj->initString(s);
 530     }
 531     break;
 532
 533   // dict punctuation
 534   case '>':
 535     c = lookChar();
 536     if (c == '>') {
 537       getChar();
 538       tokBuf[0] = tokBuf[1] = '>';
 539       tokBuf[2] = '\0';
 540       obj->initCmd(tokBuf);
 541     } else {
 542       error(errSyntaxError, getPos(), "Illegal character '>'");
 543       obj->initError();
 544     }
 545     break;
 546
 547   // error
 548   case ')':
 549   case '{':
 550   case '}':
 551     error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
 552     obj->initError();
 553     break;
 554
 555   // command
 556   default:
 557     p = tokBuf;
 558     *p++ = c;
 559     n = 1;
 560     while ((c = lookChar()) != EOF && !specialChars[c]) {
 561       getChar();
 562       if (++n == tokBufSize) {
 563         error(errSyntaxError, getPos(), "Command token too long");
 564         break;
 565       }
 566       *p++ = c;
 567     }
 568     *p = '\0';
 569     if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
 570       obj->initBool(gTrue);
 571     } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
 572       obj->initBool(gFalse);
 573     } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
 574       obj->initNull();
 575     } else {
 576       obj->initCmd(tokBuf);
 577     }
 578     break;
 579   }
 580
 581   return obj;
 582 }
 583
 584 Object *Lexer::getObj(Object *obj, const char *cmdA, int objNum) {
 585   char *p;
 586   int c;
 587   GBool comment;
 588   int n;
 589
 590   // skip whitespace and comments
 591   comment = gFalse;
 592   const char *cmd1 = tokBuf;
 593   *tokBuf = 0;
 594   while (strcmp(cmdA, cmd1) && (objNum < 0 || (xref && xref->getNumEntry(getPos()) == objNum))) {
 595     while (1) {
 596       if ((c = getChar()) == EOF) {
 597         return obj->initEOF();
 598       }
 599       if (comment) {
 600         if (c == '\r' || c == '\n') {
 601           comment = gFalse;
 602         }
 603       } else if (c == '%') {
 604         comment = gTrue;
 605       } else if (specialChars[c] != 1) {
 606         break;
 607       }
 608     }
 609     p = tokBuf;
 610     *p++ = c;
 611     n = 1;
 612     while ((c = lookChar()) != EOF && specialChars[c] == 0) {
 613       getChar();
 614       if (++n == tokBufSize) {
 615         break;
 616       }
 617       *p++ = c;
 618     }
 619     *p = '\0';
 620   }
 621   obj->initCmd(tokBuf);
 622
 623   return obj;
 624 }
 625
 626 void Lexer::skipToNextLine() {
 627   int c;
 628
 629   while (1) {
 630     c = getChar();
 631     if (c == EOF || c == '\n') {
 632       return;
 633     }
 634     if (c == '\r') {
 635       if ((c = lookChar()) == '\n') {
 636         getChar();
 637       }
 638       return;
 639     }
 640   }
 641 }
 642
 643 GBool Lexer::isSpace(int c) {
 644   return c >= 0 && c <= 0xff && specialChars[c] == 1;
 645 }