beta-0.89.2
[luatex.git] / source / libs / poppler / poppler-src / poppler / Lexer.cc
blob952967a998a2c87203c16cbdfc672b1c1be0524b
1 //========================================================================
2 //
3 // Lexer.cc
4 //
5 // Copyright 1996-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
9 //========================================================================
11 // Modified under the Poppler project - http://poppler.freedesktop.org
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
16 // Copyright (C) 2006-2010, 2012-2014 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
18 // Copyright (C) 2010 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2012, 2013 Adrian Johnson <ajohnson@redneon.com>
20 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
22 // To see a description of the changes please see the Changelog file that
23 // came with your tarball or type make ChangeLog if you are building from git
25 //========================================================================
27 #include <config.h>
29 #ifdef USE_GCC_PRAGMAS
30 #pragma implementation
31 #endif
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <limits.h>
37 #include <ctype.h>
38 #include "Lexer.h"
39 #include "Error.h"
40 #include "XRef.h"
42 //------------------------------------------------------------------------
// Character classification table, indexed by byte value (0..255):
//   1 : white space            -- NUL, HT, LF, FF, CR, SP
//   2 : delimiter              -- % ( ) / < > [ ] { }
//   0 : ordinary character
// Any non-zero entry (1 or 2) terminates a name or command token.
// Entries 0x80..0xff are all ordinary; they are omitted from the
// initializer and therefore zero-initialized.
static const char specialChars[256] = {
  /* 0x00 */ 1, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 0, 1, 1, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 1, 0, 0, 0, 0, 2, 0, 0,   2, 2, 0, 0, 0, 0, 0, 2,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 2, 0, 2, 0,
  /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 2, 0, 2, 0, 0,
  /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 2, 0, 2, 0, 0
};
// Largest accumulator values for which "value * 10 + digit" cannot
// overflow for any decimal digit 0..9.  The number scanner only needs
// to run its exact overflow test once the accumulator exceeds these.
static const int IntegerSafeLimit = (INT_MAX - 9) / 10;
static const long long LongLongSafeLimit = (LLONG_MAX - 9) / 10;
68 //------------------------------------------------------------------------
69 // Lexer
70 //------------------------------------------------------------------------
72 Lexer::Lexer(XRef *xrefA, Stream *str) {
73 Object obj;
75 lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
76 xref = xrefA;
78 curStr.initStream(str);
79 streams = new Array(xref);
80 streams->add(curStr.copy(&obj));
81 strPtr = 0;
82 freeArray = gTrue;
83 curStr.streamReset();
86 Lexer::Lexer(XRef *xrefA, Object *obj) {
87 Object obj2;
89 lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
90 xref = xrefA;
92 if (obj->isStream()) {
93 streams = new Array(xref);
94 freeArray = gTrue;
95 streams->add(obj->copy(&obj2));
96 } else {
97 streams = obj->getArray();
98 freeArray = gFalse;
100 strPtr = 0;
101 if (streams->getLength() > 0) {
102 streams->get(strPtr, &curStr);
103 curStr.streamReset();
107 Lexer::~Lexer() {
108 if (!curStr.isNone()) {
109 curStr.streamClose();
110 curStr.free();
112 if (freeArray) {
113 delete streams;
117 int Lexer::getChar(GBool comesFromLook) {
118 int c;
120 if (LOOK_VALUE_NOT_CACHED != lookCharLastValueCached) {
121 c = lookCharLastValueCached;
122 lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
123 return c;
126 c = EOF;
127 while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
128 if (comesFromLook == gTrue) {
129 return EOF;
130 } else {
131 curStr.streamClose();
132 curStr.free();
133 ++strPtr;
134 if (strPtr < streams->getLength()) {
135 streams->get(strPtr, &curStr);
136 curStr.streamReset();
140 return c;
143 int Lexer::lookChar() {
145 if (LOOK_VALUE_NOT_CACHED != lookCharLastValueCached) {
146 return lookCharLastValueCached;
148 lookCharLastValueCached = getChar(gTrue);
149 if (lookCharLastValueCached == EOF) {
150 lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
151 return EOF;
152 } else {
153 return lookCharLastValueCached;
157 Object *Lexer::getObj(Object *obj, int objNum) {
158 char *p;
159 int c, c2;
160 GBool comment, neg, done, overflownInteger, overflownLongLong;
161 int numParen;
162 int xi;
163 long long xll = 0;
164 double xf = 0, scale;
165 GooString *s;
166 int n, m;
168 // skip whitespace and comments
169 comment = gFalse;
170 while (1) {
171 if ((c = getChar()) == EOF) {
172 return obj->initEOF();
174 if (comment) {
175 if (c == '\r' || c == '\n')
176 comment = gFalse;
177 } else if (c == '%') {
178 comment = gTrue;
179 } else if (specialChars[c] != 1) {
180 break;
184 // start reading token
185 switch (c) {
187 // number
188 case '0': case '1': case '2': case '3': case '4':
189 case '5': case '6': case '7': case '8': case '9':
190 case '+': case '-': case '.':
191 overflownInteger = gFalse;
192 overflownLongLong = gFalse;
193 neg = gFalse;
194 xi = 0;
195 if (c == '-') {
196 neg = gTrue;
197 } else if (c == '.') {
198 goto doReal;
199 } else if (c != '+') {
200 xi = c - '0';
202 while (1) {
203 c = lookChar();
204 if (isdigit(c)) {
205 getChar();
206 if (unlikely(overflownLongLong)) {
207 xf = xf * 10.0 + (c - '0');
208 } else if (unlikely (overflownInteger)) {
209 if (unlikely(xll > LongLongSafeLimit) &&
210 (xll > (LLONG_MAX - (c - '0')) / 10.0)) {
211 overflownLongLong = gTrue;
212 xf = xll * 10.0 + (c - '0');
213 } else {
214 xll = xll * 10 + (c - '0');
216 } else {
217 if (unlikely(xi > IntegerSafeLimit) &&
218 (xi > (INT_MAX - (c - '0')) / 10.0)) {
219 overflownInteger = gTrue;
220 xll = xi * 10LL + (c - '0');
221 } else {
222 xi = xi * 10 + (c - '0');
225 } else if (c == '.') {
226 getChar();
227 goto doReal;
228 } else {
229 break;
232 if (neg) {
233 xi = -xi;
234 xll = -xll;
235 xf = -xf;
237 if (unlikely(overflownInteger)) {
238 if (overflownLongLong) {
239 obj->initReal(xf);
240 } else {
241 if (unlikely(xll == INT_MIN)) {
242 obj->initInt(INT_MIN);
243 } else {
244 obj->initInt64(xll);
247 } else {
248 obj->initInt(xi);
250 break;
251 doReal:
252 if (likely(!overflownInteger)) {
253 xf = xi;
254 } else if (!overflownLongLong) {
255 xf = xll;
257 scale = 0.1;
258 while (1) {
259 c = lookChar();
260 if (c == '-') {
261 // ignore minus signs in the middle of numbers to match
262 // Adobe's behavior
263 error(errSyntaxWarning, getPos(), "Badly formatted number");
264 getChar();
265 continue;
267 if (!isdigit(c)) {
268 break;
270 getChar();
271 xf = xf + scale * (c - '0');
272 scale *= 0.1;
274 if (neg) {
275 xf = -xf;
277 obj->initReal(xf);
278 break;
280 // string
281 case '(':
282 p = tokBuf;
283 n = 0;
284 numParen = 1;
285 done = gFalse;
286 s = NULL;
287 do {
288 c2 = EOF;
289 switch (c = getChar()) {
291 case EOF:
292 #if 0
293 // This breaks some PDF files, e.g., ones from Photoshop.
294 case '\r':
295 case '\n':
296 #endif
297 error(errSyntaxError, getPos(), "Unterminated string");
298 done = gTrue;
299 break;
301 case '(':
302 ++numParen;
303 c2 = c;
304 break;
306 case ')':
307 if (--numParen == 0) {
308 done = gTrue;
309 } else {
310 c2 = c;
312 break;
314 case '\\':
315 switch (c = getChar()) {
316 case 'n':
317 c2 = '\n';
318 break;
319 case 'r':
320 c2 = '\r';
321 break;
322 case 't':
323 c2 = '\t';
324 break;
325 case 'b':
326 c2 = '\b';
327 break;
328 case 'f':
329 c2 = '\f';
330 break;
331 case '\\':
332 case '(':
333 case ')':
334 c2 = c;
335 break;
336 case '0': case '1': case '2': case '3':
337 case '4': case '5': case '6': case '7':
338 c2 = c - '0';
339 c = lookChar();
340 if (c >= '0' && c <= '7') {
341 getChar();
342 c2 = (c2 << 3) + (c - '0');
343 c = lookChar();
344 if (c >= '0' && c <= '7') {
345 getChar();
346 c2 = (c2 << 3) + (c - '0');
349 break;
350 case '\r':
351 c = lookChar();
352 if (c == '\n') {
353 getChar();
355 break;
356 case '\n':
357 break;
358 case EOF:
359 error(errSyntaxError, getPos(), "Unterminated string");
360 done = gTrue;
361 break;
362 default:
363 c2 = c;
364 break;
366 break;
368 default:
369 c2 = c;
370 break;
373 if (c2 != EOF) {
374 if (n == tokBufSize) {
375 if (!s)
376 s = new GooString(tokBuf, tokBufSize);
377 else
378 s->append(tokBuf, tokBufSize);
379 p = tokBuf;
380 n = 0;
382 // we are growing see if the document is not malformed and we are growing too much
383 if (objNum > 0 && xref != NULL)
385 int newObjNum = xref->getNumEntry(curStr.streamGetPos());
386 if (newObjNum != objNum)
388 error(errSyntaxError, getPos(), "Unterminated string");
389 done = gTrue;
390 delete s;
391 n = -2;
395 *p++ = (char)c2;
396 ++n;
398 } while (!done);
399 if (n >= 0) {
400 if (!s)
401 s = new GooString(tokBuf, n);
402 else
403 s->append(tokBuf, n);
404 obj->initString(s);
405 } else {
406 obj->initEOF();
408 break;
410 // name
411 case '/':
412 p = tokBuf;
413 n = 0;
414 s = NULL;
415 while ((c = lookChar()) != EOF && !specialChars[c]) {
416 getChar();
417 if (c == '#') {
418 c2 = lookChar();
419 if (c2 >= '0' && c2 <= '9') {
420 c = c2 - '0';
421 } else if (c2 >= 'A' && c2 <= 'F') {
422 c = c2 - 'A' + 10;
423 } else if (c2 >= 'a' && c2 <= 'f') {
424 c = c2 - 'a' + 10;
425 } else {
426 goto notEscChar;
428 getChar();
429 c <<= 4;
430 c2 = getChar();
431 if (c2 >= '0' && c2 <= '9') {
432 c += c2 - '0';
433 } else if (c2 >= 'A' && c2 <= 'F') {
434 c += c2 - 'A' + 10;
435 } else if (c2 >= 'a' && c2 <= 'f') {
436 c += c2 - 'a' + 10;
437 } else {
438 error(errSyntaxError, getPos(), "Illegal digit in hex char in name");
441 notEscChar:
442 // the PDF spec claims that names are limited to 127 chars, but
443 // Distiller 8 will produce longer names, and Acrobat 8 will
444 // accept longer names
445 ++n;
446 if (n < tokBufSize) {
447 *p++ = c;
448 } else if (n == tokBufSize) {
449 error(errSyntaxError, getPos(), "Warning: name token is longer than what the specification says it can be");
450 *p = c;
451 s = new GooString(tokBuf, n);
452 } else {
453 s->append((char)c);
456 if (n < tokBufSize) {
457 *p = '\0';
458 obj->initName(tokBuf);
459 } else {
460 obj->initName(s->getCString());
461 delete s;
463 break;
465 // array punctuation
466 case '[':
467 case ']':
468 tokBuf[0] = c;
469 tokBuf[1] = '\0';
470 obj->initCmd(tokBuf);
471 break;
473 // hex string or dict punctuation
474 case '<':
475 c = lookChar();
477 // dict punctuation
478 if (c == '<') {
479 getChar();
480 tokBuf[0] = tokBuf[1] = '<';
481 tokBuf[2] = '\0';
482 obj->initCmd(tokBuf);
484 // hex string
485 } else {
486 p = tokBuf;
487 m = n = 0;
488 c2 = 0;
489 s = NULL;
490 while (1) {
491 c = getChar();
492 if (c == '>') {
493 break;
494 } else if (c == EOF) {
495 error(errSyntaxError, getPos(), "Unterminated hex string");
496 break;
497 } else if (specialChars[c] != 1) {
498 c2 = c2 << 4;
499 if (c >= '0' && c <= '9')
500 c2 += c - '0';
501 else if (c >= 'A' && c <= 'F')
502 c2 += c - 'A' + 10;
503 else if (c >= 'a' && c <= 'f')
504 c2 += c - 'a' + 10;
505 else
506 error(errSyntaxError, getPos(), "Illegal character <{0:02x}> in hex string", c);
507 if (++m == 2) {
508 if (n == tokBufSize) {
509 if (!s)
510 s = new GooString(tokBuf, tokBufSize);
511 else
512 s->append(tokBuf, tokBufSize);
513 p = tokBuf;
514 n = 0;
516 *p++ = (char)c2;
517 ++n;
518 c2 = 0;
519 m = 0;
523 if (!s)
524 s = new GooString(tokBuf, n);
525 else
526 s->append(tokBuf, n);
527 if (m == 1)
528 s->append((char)(c2 << 4));
529 obj->initString(s);
531 break;
533 // dict punctuation
534 case '>':
535 c = lookChar();
536 if (c == '>') {
537 getChar();
538 tokBuf[0] = tokBuf[1] = '>';
539 tokBuf[2] = '\0';
540 obj->initCmd(tokBuf);
541 } else {
542 error(errSyntaxError, getPos(), "Illegal character '>'");
543 obj->initError();
545 break;
547 // error
548 case ')':
549 case '{':
550 case '}':
551 error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
552 obj->initError();
553 break;
555 // command
556 default:
557 p = tokBuf;
558 *p++ = c;
559 n = 1;
560 while ((c = lookChar()) != EOF && !specialChars[c]) {
561 getChar();
562 if (++n == tokBufSize) {
563 error(errSyntaxError, getPos(), "Command token too long");
564 break;
566 *p++ = c;
568 *p = '\0';
569 if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
570 obj->initBool(gTrue);
571 } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
572 obj->initBool(gFalse);
573 } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
574 obj->initNull();
575 } else {
576 obj->initCmd(tokBuf);
578 break;
581 return obj;
584 Object *Lexer::getObj(Object *obj, const char *cmdA, int objNum) {
585 char *p;
586 int c;
587 GBool comment;
588 int n;
590 // skip whitespace and comments
591 comment = gFalse;
592 const char *cmd1 = tokBuf;
593 *tokBuf = 0;
594 while (strcmp(cmdA, cmd1) && (objNum < 0 || (xref && xref->getNumEntry(getPos()) == objNum))) {
595 while (1) {
596 if ((c = getChar()) == EOF) {
597 return obj->initEOF();
599 if (comment) {
600 if (c == '\r' || c == '\n') {
601 comment = gFalse;
603 } else if (c == '%') {
604 comment = gTrue;
605 } else if (specialChars[c] != 1) {
606 break;
609 p = tokBuf;
610 *p++ = c;
611 n = 1;
612 while ((c = lookChar()) != EOF && specialChars[c] == 0) {
613 getChar();
614 if (++n == tokBufSize) {
615 break;
617 *p++ = c;
619 *p = '\0';
621 obj->initCmd(tokBuf);
623 return obj;
626 void Lexer::skipToNextLine() {
627 int c;
629 while (1) {
630 c = getChar();
631 if (c == EOF || c == '\n') {
632 return;
634 if (c == '\r') {
635 if ((c = lookChar()) == '\n') {
636 getChar();
638 return;
643 GBool Lexer::isSpace(int c) {
644 return c >= 0 && c <= 0xff && specialChars[c] == 1;