3 Copyright 2013 Taco Hoekwater <taco@luatex.org>
5 This file is part of LuaTeX.
7 LuaTeX is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2 of the License, or (at your
10 option) any later version.
12 LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License along
18 with LuaTeX; if not, see <http://www.gnu.org/licenses/>. */
33 # include <poppler-config.h>
34 # include <goo/GooString.h>
35 # include <goo/gmem.h>
36 # include <goo/gfile.h>
49 # include <GlobalParams.h>
52 # include <lua/luatex-api.h>
/* Name of the Lua metatable used for scanner userdata; this file passes it
   to luaL_newmetatable, luaL_getmetatable and luaL_checkudata. */
55 #define SCANNER "pdfscanner"
/* Upper bound on operand-stack entries: push_operand compares
   _nextoperand+1 against it, and scanner_scan uses it to size the
   _operandstack allocation. */
57 #define MAXOPERANDS 1000
73 typedef struct Token
{
79 typedef struct ObjectList
{
80 struct ObjectList
*next
;
84 typedef struct scannerdata
{
87 Token
** _operandstack
;
89 ObjectList
* _streams
;
/* Two-valued tag type used by the `atype` field declared further down,
   whose own comment asks whether the object was allocated by poppler or by
   lepdflib.cc. */
92 typedef enum { ALLOC_POPPLER
, ALLOC_LEPDF
} alloctype
;
94 #define M_Object "Object"
95 #define M_Stream "Stream"
99 alloctype atype
; // was it allocated by poppler or the lepdflib.cc?
100 void *pd
; // reference to PdfDocument, or NULL
101 unsigned long pc
; // counter to detect PDFDoc change
104 static void clear_operand_stack (scannerdata
*self
, int from
);
105 static Token
*_parseToken (scannerdata
*self
, int c
);
106 static void push_token (lua_State
*L
, scannerdata
*self
);
108 void *xmalloc (size_t size
)
110 void *new_mem
= (void *)malloc(size
);
111 if (new_mem
== NULL
) {
112 fprintf(stderr
, "fatal: memory exhausted (xmalloc of %lu bytes).\n", (unsigned long)size
);
118 void *xrealloc (void *old_ptr
, size_t size
)
120 void *new_mem
= (void *)realloc(old_ptr
, size
);
121 if (new_mem
== NULL
) {
122 fprintf(stderr
,"fatal: memory exhausted (realloc of %lu bytes).\n", (unsigned long)size
);
/*
 * Resize `ptr` through xrealloc() so that it holds `size` + 1 elements of
 * `type`, and cast the result to `type *`.  The extra element is part of the
 * existing contract and is kept as-is.
 *
 * `ptr` and `size` are parenthesized in the expansion so that a compound
 * argument such as `n << 1` or `a ? b : c` is evaluated as a unit before the
 * + 1 is applied; previously `size+1` could bind to part of the argument.
 */
#define xreallocarray(ptr,type,size) ((type*)xrealloc((ptr),((size)+1)*sizeof(type)))
130 #define INITBUFSIZE 64
132 #define define_buffer(a) \
133 char *a = (char *)xmalloc (INITBUFSIZE); \
134 int a##_size = INITBUFSIZE; \
136 memset (a,0,INITBUFSIZE)
138 #define check_overflow(a, wsize) do { \
139 if (wsize >= a##_size) { \
140 int nsize = a##_size + a##_size / 4; \
141 a = (char *) xreallocarray(a, char, (unsigned) nsize); \
142 memset (a+a##_size, 0, a##_size / 4); \
148 static scannerdata
* scanner_push(lua_State
* L
)
150 scannerdata
*a
= (scannerdata
*)lua_newuserdata(L
, sizeof(scannerdata
));
151 luaL_getmetatable(L
, SCANNER
);
152 lua_setmetatable(L
, -2);
156 static scannerdata
*scanner_check (lua_State
*L
, int index
)
159 luaL_checktype(L
, index
, LUA_TUSERDATA
);
160 bar
= (scannerdata
*)luaL_checkudata(L
, index
, SCANNER
);
161 if (bar
== NULL
) luaL_argerror(L
, index
, SCANNER
" expected");
165 static void free_token (Token
*token
)
168 free((void *)token
->string
);
173 static void clear_operand_stack (scannerdata
*self
, int from
)
175 int i
= self
->_nextoperand
-1;
177 if (self
->_operandstack
[i
]) {
178 free_token(self
->_operandstack
[i
]);
179 self
->_operandstack
[i
] = NULL
;
183 self
->_nextoperand
= from
;
186 static void push_operand (scannerdata
*self
, Token
*token
)
188 if (self
->_nextoperand
+1> MAXOPERANDS
) {
189 fprintf(stderr
, "out of operand stack space");
192 self
->_operandstack
[self
->_nextoperand
++] = token
;
195 static Token
* new_operand (pdf_token_type c
)
197 Token
*token
= (Token
*)xmalloc(sizeof(Token
));
198 memset (token
, 0, sizeof(Token
));
203 static void _nextStream (scannerdata
*self
) {
204 self
->_stream
->streamClose();
205 ObjectList
*rover
= self
->_streams
;
206 self
->_stream
= rover
->stream
;
207 self
->_stream
->streamReset();
208 self
->_streams
= rover
->next
;
212 static int streamGetChar (scannerdata
*self
) {
213 int i
= self
->_stream
->streamGetChar();
214 if (i
<0 && self
->_streams
) {
216 i
= streamGetChar(self
);
221 static int streamLookChar (scannerdata
*self
) {
222 int i
= self
->_stream
->streamLookChar();
223 if (i
<0 && self
->_streams
) {
225 i
= streamLookChar(self
);
230 static Token
* _parseSpace (scannerdata
*self
)
232 return _parseToken (self
,streamGetChar(self
));
235 static Token
* _parseString (scannerdata
*self
, int c
)
237 // local token = {type = pdf_string,value = ''}
238 define_buffer(found
);
241 c
= streamGetChar(self
);
247 if (level
< 1) break;
250 int next
= streamGetChar(self
);
251 if (next
== '(' || next
== ')' || next
== '\\') {
253 } else if (next
== '\n' || next
== '\r') {
255 } else if (next
== 'n') {
257 } else if (next
== 'r') {
259 } else if (next
== 't') {
261 } else if (next
== 'b') {
263 } else if (next
== 'f') {
265 } else if (next
>= '0' && next
<= '7') {
267 int next2
= streamLookChar(self
);
268 if (next2
>= '0' && next2
<= '7') {
269 next2
= streamGetChar(self
);
271 int next3
= streamLookChar(self
);
272 if (next3
>= '0' && next3
<= '7') {
273 next3
= streamGetChar(self
);
275 c
= (next
*64+next2
*8+next3
);
286 check_overflow(found
,foundindex
);
288 found
[foundindex
++] = c
;
291 Token
*token
= new_operand(pdf_string
);
292 token
->value
= foundindex
;
293 token
->string
= found
;
298 static Token
* _parseNumber (scannerdata
*self
, int c
)
301 pdf_token_type type
= pdf_integer
;
307 c
= streamGetChar(self
);
315 c
= streamLookChar(self
);
316 if ((c
>= '0'&& c
<= '9') || c
== '.') {
317 c
= streamGetChar(self
);
325 value
= value
+ (i
/(pow(10.0,isfraction
)));
326 isfraction
= isfraction
+ 1;
328 value
= (value
* 10) + i
;
331 c
= streamLookChar(self
);
332 if (! ((c
>= '0' && c
<= '9') || c
== '.')) break ;
333 c
= streamGetChar(self
);
339 Token
*token
= new_operand(type
);
340 token
->value
= value
;
345 static Token
*_parseName (scannerdata
*self
, int c
)
347 define_buffer(found
);
348 c
= streamGetChar(self
);
350 check_overflow(found
,foundindex
);
351 found
[foundindex
++] = c
;
352 c
= streamLookChar(self
);
353 if (c
== ' ' || c
== '\n' || c
== '\r' || c
== '\t' ||
354 c
== '/' || c
== '[' || c
== '(' || c
== '<') break ;
355 c
= streamGetChar(self
);
357 Token
*token
= new_operand(pdf_name
);
358 token
->string
= found
;
359 token
->value
= strlen(found
);
/*
 * Numeric value of the hexadecimal digit character `c`.
 * '0'..'9' give 0..9 and 'A'..'F' give 10..15; any other value is treated
 * as a lowercase digit (c - 'a' + 10), so callers must only pass characters
 * already known to be hex digits.
 *
 * The whole expansion and every use of `c` are parenthesized so the macro
 * can be embedded in a larger expression (e.g. `16 * hexdigit(c)`) without
 * the conditional operator capturing the surrounding operands.
 * `c` is evaluated more than once: do not pass an expression with side
 * effects.
 */
#define hexdigit(c) \
    (((c) >= '0' && (c) <= '9') ? ((c) - '0') : \
     (((c) >= 'A' && (c) <= 'F') ? ((c) - 'A' + 10) : ((c) - 'a' + 10)))
366 static Token
*_parseHexstring (scannerdata
*self
, int c
)
370 define_buffer(found
);
372 if ((c
>= '0' && c
<= '9') ||
373 (c
>= 'A' && c
<= 'F') ||
374 (c
>= 'a' && c
<= 'f')) {
379 hexval
+= hexdigit(c
);
380 check_overflow(found
,foundindex
);
381 found
[foundindex
++] = hexval
;
383 isodd
= (isodd
==1 ? 0 : 1);
385 c
= streamGetChar(self
);
387 Token
*token
= new_operand(pdf_string
);
388 token
->value
= foundindex
;
389 token
->string
= found
;
/*
 * Non-zero when `a` is one of the characters this scanner treats as white
 * space: NUL, space, line feed, carriage return, horizontal tab or vertical
 * tab.
 *
 * Each use of `a` is parenthesized so that an argument containing operators
 * of lower precedence than `==` (for example `c & 0x7f`) is compared as a
 * whole.  `a` is evaluated several times: do not pass an expression with
 * side effects.
 *
 * NOTE(review): the PDF specification's white-space set lists form feed
 * rather than vertical tab; the set is left unchanged here to preserve
 * behaviour -- confirm whether '\f' should be accepted.
 */
#define pdf_isspace(a) ((a) == '\0' || (a) == ' ' || (a) == '\n' || (a) == '\r' || (a) == '\t' || (a) == '\v')
395 // -- this is rather horrible
396 static Token
*_parseInlineImage (scannerdata
*self
, int c
)
398 define_buffer(found
);
399 if (c
== ' ') { // first space can be ignored
400 c
= streamGetChar(self
);
402 check_overflow(found
, foundindex
);
403 found
[foundindex
++] = c
;
405 c
= streamLookChar(self
);
406 if (c
== 'E' && (found
[foundindex
-1] == '\n' || found
[foundindex
-1] == '\r')) {
407 c
= streamGetChar(self
);
408 check_overflow(found
, foundindex
);
409 found
[foundindex
++] = c
;
410 c
= streamLookChar(self
);
412 c
= streamGetChar(self
);
413 check_overflow(found
, foundindex
);
414 found
[foundindex
++] = c
;
415 c
= streamLookChar(self
);
416 if (pdf_isspace(c
)) {
417 found
[--foundindex
] = '\0'; /* I */
418 found
[--foundindex
] = '\0'; /* E */
419 /* remove end-of-line before EI */
420 if (found
[foundindex
-1] == '\n') {
421 found
[--foundindex
] = '\0';
423 if (found
[foundindex
-1] == '\r') {
424 found
[--foundindex
] = '\0';
428 c
= streamGetChar(self
);
429 check_overflow(found
, foundindex
);
430 found
[foundindex
++] = c
;
433 c
= streamGetChar(self
);
434 check_overflow(found
, foundindex
);
435 found
[foundindex
++] = c
;
438 c
= streamGetChar(self
);
439 check_overflow(found
, foundindex
);
440 found
[foundindex
++] = c
;
443 Token
*token
= new_operand(pdf_string
);
444 token
->value
= foundindex
;
445 token
->string
= found
;
449 static Token
*_parseOperator (scannerdata
*self
, int c
)
451 define_buffer(found
);
453 check_overflow(found
, foundindex
);
454 found
[foundindex
++] = c
;
455 c
= streamLookChar(self
);
456 if ((c
<0) || (c
== ' ' || c
== '\n' || c
== '\r' || c
== '\t' ||
457 c
== '/' || c
== '[' || c
== '(' || c
== '<'))
459 c
= streamGetChar(self
);
462 if (strcmp(found
, "ID") == 0) {
463 self
->_ininlineimage
= 1;
465 if (strcmp(found
,"false") == 0) {
466 Token
*token
= new_operand(pdf_boolean
);
470 } else if (strcmp(found
,"true") == 0) {
471 Token
*token
= new_operand(pdf_boolean
);
476 Token
*token
= new_operand(pdf_operator
);
477 token
->string
= found
;
483 static Token
* _parseComment (scannerdata
*self
, int c
)
486 c
= streamGetChar(self
);
487 } while (c
!= '\n' && c
!= '\r' && c
!= -1);
488 return _parseToken(self
,streamGetChar(self
));
491 static Token
*_parseLt (scannerdata
*self
, int c
)
493 c
= streamGetChar(self
);
495 return new_operand(pdf_startdict
);
497 return _parseHexstring(self
,c
);
501 static Token
* _parseGt (scannerdata
*self
, int c
)
503 c
= streamGetChar(self
);
505 return new_operand(pdf_stopdict
);
507 fprintf(stderr
,"stray > in stream");
513 static Token
*_parseError (int c
)
515 fprintf(stderr
, "stray %c [%d] in stream", c
, c
);
519 static Token
*_parseStartarray ()
521 return new_operand (pdf_startarray
);
524 static Token
*_parseStoparray ()
526 return new_operand (pdf_stoparray
);
530 static Token
*_parseToken (scannerdata
*self
, int c
)
532 if (self
->_ininlineimage
==1) {
533 self
->_ininlineimage
= 2;
534 return _parseInlineImage(self
,c
);
535 } else if (self
->_ininlineimage
==2) {
536 self
->_ininlineimage
= 0;
537 Token
*token
= new_operand(pdf_operator
);
538 token
->string
= strdup("EI");
541 if (c
<0) return NULL
;
543 case '(': return _parseString(self
,c
); break;
544 case ')': return _parseError(c
); break;
545 case '[': return _parseStartarray(); break;
546 case ']': return _parseStoparray(); break;
547 case '/': return _parseName(self
,c
); break;
548 case '<': return _parseLt(self
,c
); break;
549 case '>': return _parseGt(self
,c
); break;
550 case '%': return _parseComment(self
,c
); break;
555 return _parseSpace(self
); break;
568 return _parseNumber(self
,c
); break;
571 return _parseOperator(self
,c
);
573 return _parseError(c
);
578 static int scanner_scan(lua_State
* L
)
582 if (lua_gettop(L
) != 3) {
585 luaL_checktype(L
, 2, LUA_TTABLE
);
586 luaL_checktype(L
, 3, LUA_TTABLE
);
587 self
= scanner_push(L
);
588 memset(self
,0,sizeof(scannerdata
));
589 self
->_operandstack
= (Token
**)xmalloc (MAXOPERANDS
* sizeof (Token
));
590 memset (self
->_operandstack
,0,(MAXOPERANDS
* sizeof (Token
)));
592 if (lua_type(L
,1)== LUA_TTABLE
) {
597 if (lua_type(L
,-1)== LUA_TUSERDATA
) {
598 uin
= (udstruct
*) luaL_checkudata(L
, -1, M_Object
);
599 if (((Object
*) uin
->d
)->isStream()) {
600 ObjectList
*rover
= self
->_streams
;
601 ObjectList
*item
= (ObjectList
*)xmalloc (sizeof(ObjectList
));
602 item
->stream
= ((Object
*) uin
->d
);
606 self
->_streams
= rover
;
614 ObjectList
*rover
= self
->_streams
;
615 self
->_stream
= rover
->stream
;
616 self
->_streams
= rover
->next
;
627 luaL_checktype(L
, 1, LUA_TUSERDATA
);
628 uin
= (udstruct
*) luaL_checkudata(L
, 1, M_Object
);
629 if (((Object
*) uin
->d
)->isStream()) {
630 self
->_stream
= ((Object
*) uin
->d
);
631 } else if (((Object
*) uin
->d
)->isArray()) {
632 Array
*arrayref
= ((Object
*) uin
->d
)->getArray();
633 int count
= arrayref
->getLength();
635 for (i
=0;i
<count
;i
++) {
636 Object
*val
= new Object();
637 arrayref
->get(i
, val
);
638 if (val
->isStream()) {
639 ObjectList
*rover
= self
->_streams
;
640 ObjectList
*item
= (ObjectList
*)xmalloc (sizeof(ObjectList
));
645 self
->_streams
= rover
;
653 ObjectList
*rover
= self
->_streams
;
654 self
->_stream
= rover
->stream
;
655 self
->_streams
= rover
->next
;
659 assert (lua_gettop(L
) == 4);
660 self
->_stream
->streamReset();
661 token
= _parseToken(self
,streamGetChar(self
));
663 if (token
->type
== pdf_operator
) {
664 lua_pushstring(L
, token
->string
);
666 lua_rawget(L
,2); // operator table
667 if (lua_isfunction(L
,-1)) {
670 (void)lua_call(L
,2,0);
674 clear_operand_stack(self
,0);
676 push_operand(self
, token
);
678 if (!self
->_stream
) {
681 token
= _parseToken(self
,streamGetChar(self
));
685 self
->_stream
->streamClose();
686 self
->_stream
= NULL
;
688 clear_operand_stack(self
,0);
689 free(self
->_operandstack
);
693 static int scanner_done(lua_State
* L
)
696 scannerdata
*self
= scanner_check(L
,1);
697 while ((c
=streamGetChar(self
))>=0)
702 // here are the stack popping functions, and their helpers
704 static void operandstack_backup (scannerdata
*self
) {
705 int i
= self
->_nextoperand
-1;
708 int backupstop
= self
->_operandstack
[i
]->type
;
709 if (backupstop
== pdf_stopdict
) {
710 backupstart
= pdf_startdict
;
711 } else if (backupstop
== pdf_stoparray
) {
712 backupstart
= pdf_startarray
;
717 if (self
->_operandstack
[i
]->type
== backupstop
) {
719 } else if (self
->_operandstack
[i
]->type
== backupstart
) {
726 self
->_nextoperand
= i
+1;
729 static void push_array (lua_State
*L
, scannerdata
*self
)
731 int balance
= 1; // nesting tracking
732 int index
= 1; // lua array index
733 Token
*token
= self
->_operandstack
[self
->_nextoperand
++];
736 if (token
->type
== pdf_stoparray
)
738 if (token
->type
== pdf_startarray
)
744 lua_rawseti(L
,-2, index
++);
746 token
= self
->_operandstack
[self
->_nextoperand
++];
751 static void push_dict (lua_State
*L
, scannerdata
*self
)
753 int balance
= 1; // nesting tracking
754 int needskey
= 1; // toggle between lua value and lua key
755 Token
*token
= self
->_operandstack
[self
->_nextoperand
++];
758 if (token
->type
== pdf_stopdict
)
760 if (token
->type
== pdf_startdict
)
766 lua_pushlstring(L
, token
->string
, token
->value
);
774 token
= self
->_operandstack
[self
->_nextoperand
++];
/* Human-readable name for each token type.  push_token indexes this table
   with token->type and pushes the resulting string to Lua.
   The array is sized pdf_stopdict+1 and has eleven initializers; this
   assumes pdf_stopdict is the largest token type with value 10 -- the enum
   is not visible here, TODO confirm.  "array" and "dict" each appear twice,
   presumably once for the start token and once for the stop token. */
778 const char *typenames
[pdf_stopdict
+1] =
779 { "unknown", "integer", "real", "boolean", "name", "operator",
780 "string", "array", "array", "dict", "dict" };
782 static void push_token (lua_State
*L
, scannerdata
*self
)
784 Token
*token
= self
->_operandstack
[self
->_nextoperand
-1];
785 lua_createtable(L
,2,0);
786 lua_pushstring (L
, typenames
[token
->type
]);
788 if (token
->type
== pdf_string
|| token
->type
== pdf_name
) {
789 lua_pushlstring(L
, token
->string
, token
->value
);
790 } else if (token
->type
== pdf_real
|| token
->type
== pdf_integer
) {
791 lua_pushnumber(L
, token
->value
);
792 } else if (token
->type
== pdf_boolean
) {
793 lua_pushboolean(L
, (int)token
->value
);
794 } else if (token
->type
== pdf_startarray
) {
796 } else if (token
->type
== pdf_startdict
) {
801 lua_rawseti(L
,-2, 2);
804 static int scanner_popsingular (lua_State
* L
, int token_type
) {
805 int clear
= 0; // how much of the operand stack needs deleting
806 scannerdata
*self
= scanner_check(L
,1);
807 if (self
->_nextoperand
==0) {
810 clear
= self
->_nextoperand
-1;
811 Token
*token
= self
->_operandstack
[self
->_nextoperand
-1];
812 if (token
==NULL
|| (token
->type
!= token_type
)) {
815 // the simple cases can be written out directly, but dicts and
816 // arrays are better done via the recursive function
817 if (token_type
== pdf_stoparray
|| token_type
== pdf_stopdict
) {
818 operandstack_backup(self
);
819 clear
= self
->_nextoperand
-1;
822 } else if (token_type
== pdf_real
|| token_type
== pdf_integer
) {
823 lua_pushnumber(L
, token
->value
);
824 } else if (token_type
== pdf_boolean
) {
825 lua_pushboolean(L
,(int)token
->value
);
826 } else if (token_type
== pdf_name
|| token_type
== pdf_string
) {
827 lua_pushlstring(L
, token
->string
, token
->value
);
831 clear_operand_stack(self
,clear
);
835 static int scanner_popanything (lua_State
* L
) {
836 int clear
= 0; // how much of the operand stack needs deleting
837 scannerdata
*self
= scanner_check(L
,1);
838 if (self
->_nextoperand
==0) {
841 clear
= self
->_nextoperand
-1;
842 Token
*token
= self
->_operandstack
[self
->_nextoperand
-1];
846 int token_type
= token
->type
;
847 // the simple cases can be written out directly, but dicts and
848 // arrays are better done via the recursive function
849 if (token_type
== pdf_stoparray
|| token_type
== pdf_stopdict
) {
850 operandstack_backup(self
);
851 clear
= self
->_nextoperand
-1;
856 clear_operand_stack(self
,clear
);
861 static int scanner_popnumber(lua_State
* L
)
863 if(scanner_popsingular(L
,pdf_real
))
865 if (scanner_popsingular(L
,pdf_integer
))
871 static int scanner_popboolean(lua_State
* L
)
873 if(scanner_popsingular(L
,pdf_boolean
))
879 static int scanner_popstring(lua_State
* L
)
881 if (scanner_popsingular(L
,pdf_string
))
887 static int scanner_popname(lua_State
* L
)
889 if (scanner_popsingular(L
,pdf_name
))
895 static int scanner_poparray(lua_State
* L
)
897 if (scanner_popsingular(L
,pdf_stoparray
))
903 static int scanner_popdictionary(lua_State
* L
)
905 if (scanner_popsingular(L
,pdf_stopdict
))
911 static int scanner_popany(lua_State
* L
)
913 if (scanner_popanything(L
))
919 static const luaL_Reg scannerlib_meta
[] = {
923 static const struct luaL_Reg scannerlib_m
[] = {
924 {"done", scanner_done
},
925 {"popNumber", scanner_popnumber
},
926 {"popName", scanner_popname
},
927 {"popString", scanner_popstring
},
928 {"popArray", scanner_poparray
},
929 {"popDict", scanner_popdictionary
},
930 {"popBool", scanner_popboolean
},
931 {"pop", scanner_popany
},
932 {NULL
, NULL
} /* sentinel */
936 static const luaL_Reg scannerlib
[] = {
937 {"scan", scanner_scan
},
941 LUALIB_API
int luaopen_pdfscanner(lua_State
* L
)
943 luaL_newmetatable(L
, SCANNER
);
944 luaL_openlib(L
, 0, scannerlib_meta
, 0);
945 lua_pushvalue(L
, -1);
946 lua_setfield(L
, -2, "__index");
947 luaL_register(L
, NULL
, scannerlib_m
);
948 luaL_register(L
, "pdfscanner", scannerlib
);