1 /* Parse and extract PDF objects */
8 /* the number of white space characters */
9 int pdf_ws(char *pdf
, int len
, int pos
)
12 while (i
< len
&& isspace((unsigned char) pdf
[i
]))
17 /* s: string, d: dictionary, l: list, n: number, /: name, r: reference */
18 int pdf_type(char *pdf
, int len
, int pos
)
20 pos
+= pdf_ws(pdf
, len
, pos
);
25 if (pdf
[pos
] == '<' && pdf
[pos
+ 1] != '<')
27 if (pdf
[pos
] == '<' && pdf
[pos
+ 1] == '<')
31 if (strchr("0123456789+-.", (unsigned char) pdf
[pos
])) {
32 if (!isdigit((unsigned char) pdf
[pos
]))
34 while (pos
< len
&& isdigit((unsigned char) pdf
[pos
]))
36 pos
+= pdf_ws(pdf
, len
, pos
);
37 if (!isdigit((unsigned char) pdf
[pos
]))
39 while (pos
< len
&& isdigit((unsigned char) pdf
[pos
]))
41 pos
+= pdf_ws(pdf
, len
, pos
);
42 return pos
< len
&& pdf
[pos
] == 'R' ? 'r' : 'n';
47 /* the length of a pdf object */
48 int pdf_len(char *pdf
, int len
, int pos
)
54 pos
+= pdf_ws(pdf
, len
, pos
);
55 c
= (unsigned char) pdf
[pos
];
56 if (strchr("0123456789+-.", c
)) {
57 if (pdf_type(pdf
, len
, pos
) == 'r') {
58 char *r
= memchr(pdf
+ pos
, 'R', len
- pos
);
59 return r
- (pdf
+ old
) + 1;
62 while (pos
< len
&& strchr("0123456789.", (unsigned char) pdf
[pos
]))
68 while (pos
< len
&& depth
> 0) {
78 if (c
== '<' && pos
+ 1 < len
&& pdf
[pos
+ 1] == '<') {
80 while (pos
+ 2 < len
&& (pdf
[pos
] != '>' || pdf
[pos
+ 1] != '>')) {
81 pos
+= pdf_len(pdf
, len
, pos
);
82 pos
+= pdf_len(pdf
, len
, pos
);
83 pos
+= pdf_ws(pdf
, len
, pos
);
87 } else if (c
== '<') {
88 while (pos
< len
&& pdf
[pos
] != '>')
95 while (pos
< len
&& !strchr(" \t\r\n\f()<>[]{}/%",
96 (unsigned char) pdf
[pos
]))
101 while (pos
< len
&& pdf
[pos
] != ']') {
102 pos
+= pdf_len(pdf
, len
, pos
);
103 pos
+= pdf_ws(pdf
, len
, pos
);
110 static int startswith(char *s
, char *t
)
118 /* read an indirect reference */
119 int pdf_obj(char *pdf
, int len
, int pos
, int *obj
, int *rev
)
121 if (pdf_type(pdf
, len
, pos
) != 'r')
123 *obj
= atoi(pdf
+ pos
);
124 pos
+= pdf_len(pdf
, len
, pos
);
125 *rev
= atoi(pdf
+ pos
);
129 /* the value of a pdf dictionary key */
130 int pdf_dval(char *pdf
, int len
, int pos
, char *key
)
133 while (pos
+ 2 < len
&& (pdf
[pos
] != '>' || pdf
[pos
+ 1] != '>')) {
134 pos
+= pdf_ws(pdf
, len
, pos
);
135 if (pdf_len(pdf
, len
, pos
) == strlen(key
) && startswith(key
, pdf
+ pos
)) {
136 pos
+= pdf_len(pdf
, len
, pos
);
137 pos
+= pdf_ws(pdf
, len
, pos
);
140 pos
+= pdf_len(pdf
, len
, pos
);
141 pos
+= pdf_len(pdf
, len
, pos
);
142 pos
+= pdf_ws(pdf
, len
, pos
);
147 /* return a dictionary key */
148 int pdf_dkey(char *pdf
, int len
, int pos
, int key
)
152 while (pos
+ 2 < len
&& (pdf
[pos
] != '>' || pdf
[pos
+ 1] != '>')) {
153 pos
+= pdf_ws(pdf
, len
, pos
);
156 pos
+= pdf_len(pdf
, len
, pos
);
157 pos
+= pdf_len(pdf
, len
, pos
);
158 pos
+= pdf_ws(pdf
, len
, pos
);
163 /* return a list entry */
164 int pdf_lval(char *pdf
, int len
, int pos
, int idx
)
168 while (pos
< len
&& pdf
[pos
] != ']') {
171 pos
+= pdf_len(pdf
, len
, pos
);
172 pos
+= pdf_ws(pdf
, len
, pos
);
177 static void *my_memrchr(void *m
, int c
, long n
)
180 for (i
= 0; i
< n
; i
++)
181 if (*(unsigned char *) (m
+ n
- 1 - i
) == c
)
182 return m
+ n
- 1 - i
;
186 static int prevline(char *pdf
, int len
, int off
)
188 char *nl
= my_memrchr(pdf
, '\n', off
);
189 if (nl
&& nl
> pdf
) {
190 char *nl2
= my_memrchr(pdf
, '\n', nl
- pdf
- 1);
192 return nl2
- pdf
+ 1;
197 static int nextline(char *pdf
, int len
, int off
)
199 char *nl
= memchr(pdf
+ off
, '\n', len
- off
);
205 /* the position of the trailer */
206 int pdf_trailer(char *pdf
, int len
)
208 int pos
= prevline(pdf
, len
, len
); /* %%EOF */
209 while (!startswith(pdf
+ pos
, "trailer"))
210 if ((pos
= prevline(pdf
, len
, pos
)) < 0)
212 return nextline(pdf
, len
, pos
); /* skip trailer\n */
215 /* the position of the last xref table */
216 static int pdf_xref(char *pdf
, int len
)
218 int pos
= prevline(pdf
, len
, len
); /* %%EOF */
219 if ((pos
= prevline(pdf
, len
, pos
)) < 0)
221 /* read startxref offset */
222 if (sscanf(pdf
+ pos
, "%d", &pos
) != 1 || pos
>= len
|| pos
< 0)
224 return nextline(pdf
, len
, pos
); /* skip xref\n */
227 /* find a pdf object */
228 int pdf_find(char *pdf
, int len
, int obj
, int rev
)
230 int obj_beg
, obj_cnt
;
231 int cur_rev
, cur_pos
;
234 int pos
= pdf_xref(pdf
, len
);
237 /* the numbers after xref */
238 while (pos
< len
&& sscanf(pdf
+ pos
, "%d %d", &obj_beg
, &obj_cnt
) == 2) {
239 for (i
= 0; i
< obj_cnt
; i
++) {
240 if ((pos
= nextline(pdf
, len
, pos
)) < 0)
242 if (sscanf(pdf
+ pos
, "%d %d", &cur_pos
, &cur_rev
) != 2)
244 if (obj_beg
+ i
== obj
&& cur_rev
== rev
) {
245 if (cur_pos
< 0 || cur_pos
>= len
)
247 if (!(beg
= strstr(pdf
+ cur_pos
, "obj")))
250 pos
+= pdf_ws(pdf
, len
, pos
);
258 /* read and dereference an indirect reference */
259 int pdf_ref(char *pdf
, int len
, int pos
)
262 if (pdf_obj(pdf
, len
, pos
, &obj
, &rev
))
264 return pdf_find(pdf
, len
, obj
, rev
);
267 /* retrieve and dereference a dictionary entry */
268 int pdf_dval_val(char *pdf
, int len
, int pos
, char *key
)
270 int val
= pdf_dval(pdf
, len
, pos
, key
);
271 int val_obj
, val_rev
;
274 if (pdf_type(pdf
, len
, val
) == 'r') {
275 pdf_obj(pdf
, len
, val
, &val_obj
, &val_rev
);
276 return pdf_find(pdf
, len
, val_obj
, val_rev
);
281 /* retrieve a dictionary entry, which is an indirect reference */
282 int pdf_dval_obj(char *pdf
, int len
, int pos
, char *key
)
284 int val
= pdf_dval(pdf
, len
, pos
, key
);
287 return pdf_ref(pdf
, len
, val
);