MDL-38311 question manual grading: comment fixes.
[moodle.git] / lib / tcpdf / tcpdf_parser.php
blobf17359f2134c6e46c4901267e2531f42a7754000
1 <?php
2 //============================================================+
3 // File name : tcpdf_parser.php
4 // Version : 1.0.001
5 // Begin : 2011-05-23
6 // Last Update : 2012-05-03
7 // Author : Nicola Asuni - Tecnick.com LTD - Manor Coach House, Church Hill, Aldershot, Hants, GU12 4RQ, UK - www.tecnick.com - info@tecnick.com
8 // License : http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT GNU-LGPLv3
9 // -------------------------------------------------------------------
10 // Copyright (C) 2011-2012 Nicola Asuni - Tecnick.com LTD
12 // This file is part of TCPDF software library.
14 // TCPDF is free software: you can redistribute it and/or modify it
15 // under the terms of the GNU Lesser General Public License as
16 // published by the Free Software Foundation, either version 3 of the
17 // License, or (at your option) any later version.
19 // TCPDF is distributed in the hope that it will be useful, but
20 // WITHOUT ANY WARRANTY; without even the implied warranty of
21 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 // See the GNU Lesser General Public License for more details.
24 // You should have received a copy of the License
25 // along with TCPDF. If not, see
26 // <http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT>.
28 // See LICENSE.TXT file for more information.
29 // -------------------------------------------------------------------
31 // Description : This is a PHP class for parsing PDF documents.
33 //============================================================+
/**
 * @file
 * This is a PHP class for parsing PDF documents.<br>
 * @package com.tecnick.tcpdf
 * @author Nicola Asuni
 * @version 1.0.001
 */
43 // include class for decoding filters
44 require_once(dirname(__FILE__).'/tcpdf_filters.php');
/**
 * @class TCPDF_PARSER
 * This is a PHP class for parsing PDF documents.<br>
 * @package com.tecnick.tcpdf
 * @brief This is a PHP class for parsing PDF documents.
 * @version 1.0.001
 * @author Nicola Asuni - info@tecnick.com
 */
class TCPDF_PARSER {

    /**
     * Raw content of the PDF document.
     * It is only available while the constructor runs and is emptied afterwards.
     * @private
     */
    private $pdfdata = '';

    /**
     * XREF data: object offsets under the 'xref' key and trailer entries under the 'trailer' key.
     * @protected
     */
    protected $xref = array();

    /**
     * Array of PDF objects, indexed by "[object number]_[generation number]".
     * @protected
     */
    protected $objects = array();

    /**
     * Class object for decoding filters.
     * @private
     */
    private $FilterDecoders;

    // -----------------------------------------------------------------------------

    /**
     * Parse a PDF document and store the resulting xref data and objects.
     * @param $data (string) PDF data to parse.
     * @public
     * @since 1.0.000 (2011-05-24)
     */
    public function __construct($data) {
        if (empty($data)) {
            $this->Error('Empty PDF data.');
        }
        $this->pdfdata = $data;
        // initialize class for decoding filters
        $this->FilterDecoders = new TCPDF_FILTERS();
        // get xref and trailer data
        $this->xref = $this->getXrefData();
        // parse all document objects
        $this->objects = array();
        // the 'xref' key only exists when at least one in-use entry was found
        if (!empty($this->xref['xref'])) {
            foreach ($this->xref['xref'] as $obj => $offset) {
                // objects may already have been parsed while resolving a reference
                if (!isset($this->objects[$obj])) {
                    $this->objects[$obj] = $this->getIndirectObject($obj, $offset, true);
                }
            }
        }
        // release some memory
        $this->pdfdata = '';
    }

    /**
     * Return an array of parsed PDF document objects.
     * @return (array) Two-element array: xref/trailer data and the array of parsed objects.
     * @public
     * @since 1.0.000 (2011-06-26)
     */
    public function getParsedData() {
        return array($this->xref, $this->objects);
    }

    /**
     * Get xref (cross-reference table) and trailer data from PDF document data.
     * @param $offset (int) xref offset (if known).
     * @param $xref (array) previous xref array (if any).
     * @return Array containing xref and trailer data.
     * @protected
     * @since 1.0.000 (2011-05-24)
     */
    protected function getXrefData($offset=0, $xref=array()) {
        if ($offset == 0) {
            // find last startxref
            if (preg_match_all('/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $this->pdfdata, $matches, PREG_SET_ORDER, $offset) == 0) {
                $this->Error('Unable to find startxref');
            }
            $matches = array_pop($matches);
            $startxref = $matches[1];
        } else {
            // get the first xref at the specified offset
            if (preg_match('/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) == 0) {
                $this->Error('Unable to find startxref');
            }
            $startxref = $matches[1][0];
        }
        $startxref = intval($startxref);
        // check xref position: the keyword must be exactly at the declared offset
        // (a strict substring test also copes with offsets beyond the end of the data)
        if (substr($this->pdfdata, $startxref, 4) !== 'xref') {
            $this->Error('Unable to find xref');
        }
        // extract xref data (object indexes and offsets)
        $xoffset = $startxref + 5;
        // initialize object number
        $obj_num = 0;
        $offset = $xoffset;
        while (preg_match('/^([0-9]+)[\s]([0-9]+)[\s]?([nf]?)/im', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) {
            $offset = (strlen($matches[0][0]) + $matches[0][1]);
            if ($matches[3][0] == 'n') {
                // create unique object index: [object number]_[generation number]
                $index = $obj_num.'_'.intval($matches[2][0]);
                // check if object already exist (the most recent xref section wins)
                if (!isset($xref['xref'][$index])) {
                    // store object offset position
                    $xref['xref'][$index] = intval($matches[1][0]);
                }
                ++$obj_num;
                $offset += 2;
            } elseif ($matches[3][0] == 'f') {
                // free entry: only advance the object counter
                ++$obj_num;
                $offset += 2;
            } else {
                // subsection header: first number is the starting object number (index)
                $obj_num = intval($matches[1][0]);
            }
        }
        // get trailer data
        if (preg_match('/trailer[\s]*<<(.*)>>[\s]*[\r\n]+startxref[\s]*[\r\n]+/isU', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $xoffset) > 0) {
            $trailer_data = $matches[1][0];
            if (!isset($xref['trailer'])) {
                // get only the last updated version
                $xref['trailer'] = array();
                // parse trailer_data
                if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                    $xref['trailer']['size'] = intval($matches[1]);
                }
                if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                    $xref['trailer']['root'] = intval($matches[1]).'_'.intval($matches[2]);
                }
                if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                    $xref['trailer']['encrypt'] = intval($matches[1]).'_'.intval($matches[2]);
                }
                if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                    $xref['trailer']['info'] = intval($matches[1]).'_'.intval($matches[2]);
                }
                if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) {
                    $xref['trailer']['id'] = array();
                    $xref['trailer']['id'][0] = $matches[1];
                    $xref['trailer']['id'][1] = $matches[2];
                }
            }
            if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                // get previous xref
                $xref = $this->getXrefData(intval($matches[1]), $xref);
            }
        } else {
            $this->Error('Unable to find trailer');
        }
        return $xref;
    }

    /**
     * Get object type, raw value and offset to next object.
     * @param $offset (int) Object offset.
     * @return array containing object type, raw value and offset to next object.
     *         The type is an empty string when no token could be recognised at the given offset.
     * @protected
     * @since 1.0.000 (2011-06-20)
     */
    protected function getRawObject($offset=0) {
        $objtype = ''; // object type to be returned
        $objval = ''; // object value to be returned
        // skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP)
        $offset += strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset);
        if (!isset($this->pdfdata[$offset])) {
            // end of data reached: nothing to parse
            return array($objtype, $objval, $offset);
        }
        // get first char
        $char = $this->pdfdata[$offset];
        // get object type
        switch ($char) {
            case '%': { // \x25 PERCENT SIGN
                // skip comment and search for next token
                $next = strcspn($this->pdfdata, "\r\n", $offset);
                if ($next > 0) {
                    $offset += $next;
                    return $this->getRawObject($offset);
                }
                break;
            }
            case '/': { // \x2F SOLIDUS
                // name object
                $objtype = $char;
                ++$offset;
                if (preg_match('/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/', substr($this->pdfdata, $offset, 256), $matches) == 1) {
                    $objval = $matches[1]; // unescaped value
                    $offset += strlen($objval);
                }
                break;
            }
            case '(': // \x28 LEFT PARENTHESIS
            case ')': { // \x29 RIGHT PARENTHESIS
                // literal string object
                $objtype = $char;
                ++$offset;
                $strpos = $offset;
                if ($char == '(') {
                    // scan up to the matching closing parenthesis, honouring nesting and escapes
                    $open_bracket = 1;
                    while ($open_bracket > 0) {
                        if (!isset($this->pdfdata[$strpos])) {
                            break;
                        }
                        $ch = $this->pdfdata[$strpos];
                        switch ($ch) {
                            case '\\': { // REVERSE SOLIDUS (5Ch) (Backslash)
                                // skip next character
                                ++$strpos;
                                break;
                            }
                            case '(': { // LEFT PARENTHESIS (28h)
                                ++$open_bracket;
                                break;
                            }
                            case ')': { // RIGHT PARENTHESIS (29h)
                                --$open_bracket;
                                break;
                            }
                        }
                        ++$strpos;
                    }
                    $objval = substr($this->pdfdata, $offset, ($strpos - $offset - 1));
                    $offset = $strpos;
                }
                break;
            }
            case '[': // \x5B LEFT SQUARE BRACKET
            case ']': { // \x5D RIGHT SQUARE BRACKET
                // array object
                $objtype = $char;
                ++$offset;
                if ($char == '[') {
                    // get array content
                    $objval = array();
                    do {
                        $oldoffset = $offset;
                        // get element
                        $element = $this->getRawObject($offset);
                        $offset = $element[2];
                        $objval[] = $element;
                        // stop as well when no progress is made (truncated or malformed data)
                    } while (($element[0] != ']') AND ($offset != $oldoffset));
                    // remove closing delimiter
                    array_pop($objval);
                }
                break;
            }
            case '<': // \x3C LESS-THAN SIGN
            case '>': { // \x3E GREATER-THAN SIGN
                if (isset($this->pdfdata[($offset + 1)]) AND ($this->pdfdata[($offset + 1)] == $char)) {
                    // dictionary object
                    $objtype = $char.$char;
                    $offset += 2;
                    if ($char == '<') {
                        // get dictionary content (flat list of alternating keys and values)
                        $objval = array();
                        do {
                            $oldoffset = $offset;
                            // get element
                            $element = $this->getRawObject($offset);
                            $offset = $element[2];
                            $objval[] = $element;
                            // stop as well when no progress is made (truncated or malformed data)
                        } while (($element[0] != '>>') AND ($offset != $oldoffset));
                        // remove closing delimiter
                        array_pop($objval);
                    }
                } else {
                    // hexadecimal string object
                    $objtype = $char;
                    ++$offset;
                    if (($char == '<') AND (preg_match('/^([0-9A-Fa-f]+)[>]/iU', substr($this->pdfdata, $offset), $matches) == 1)) {
                        $objval = $matches[1];
                        $offset += strlen($matches[0]);
                    }
                }
                break;
            }
            default: {
                if (substr($this->pdfdata, $offset, 6) == 'endobj') {
                    // indirect object
                    $objtype = 'endobj';
                    $offset += 6;
                } elseif (substr($this->pdfdata, $offset, 4) == 'null') {
                    // null object
                    $objtype = 'null';
                    $offset += 4;
                    $objval = 'null';
                } elseif (substr($this->pdfdata, $offset, 4) == 'true') {
                    // boolean true object
                    $objtype = 'boolean';
                    $offset += 4;
                    $objval = 'true';
                } elseif (substr($this->pdfdata, $offset, 5) == 'false') {
                    // boolean false object
                    $objtype = 'boolean';
                    $offset += 5;
                    $objval = 'false';
                } elseif (substr($this->pdfdata, $offset, 6) == 'stream') {
                    // start stream object
                    $objtype = 'stream';
                    $offset += 6;
                    // the keyword is followed by an end-of-line marker that is not part of the data
                    $eollen = 0;
                    if (substr($this->pdfdata, $offset, 2) == "\r\n") {
                        $eollen = 2;
                    } elseif (isset($this->pdfdata[$offset]) AND (($this->pdfdata[$offset] == "\n") OR ($this->pdfdata[$offset] == "\r"))) {
                        $eollen = 1;
                    }
                    if ($eollen > 0) {
                        $start = $offset + $eollen;
                        // the data ends at the first "endstream" keyword
                        $end = stripos($this->pdfdata, 'endstream', $start);
                        if ($end !== false) {
                            // end-of-line characters preceding the keyword are not part of the data
                            $objval = rtrim(substr($this->pdfdata, $start, ($end - $start)), "\r\n");
                            $offset = $end + 9;
                        }
                    }
                } elseif (substr($this->pdfdata, $offset, 9) == 'endstream') {
                    // end stream object
                    $objtype = 'endstream';
                    $offset += 9;
                } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) {
                    // indirect object reference
                    // NOTE: the historical tag spelling is kept because it is part of the data returned by getParsedData()
                    $objtype = 'ojbref';
                    $offset += strlen($matches[0]);
                    $objval = intval($matches[1]).'_'.intval($matches[2]);
                } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) {
                    // object start
                    // NOTE: the historical tag spelling is kept because it is part of the data returned by getParsedData()
                    $objtype = 'ojb';
                    $objval = intval($matches[1]).'_'.intval($matches[2]);
                    $offset += strlen($matches[0]);
                } elseif (($numlen = strspn($this->pdfdata, '+-.0123456789', $offset)) > 0) {
                    // numeric object
                    $objtype = 'numeric';
                    $objval = substr($this->pdfdata, $offset, $numlen);
                    $offset += $numlen;
                }
                break;
            }
        }
        return array($objtype, $objval, $offset);
    }

    /**
     * Get content of indirect object.
     * @param $obj_ref (string) Object number and generation number separated by underscore character.
     * @param $offset (int) Object offset.
     * @param $decoding (boolean) If true decode streams.
     * @return array containing object data.
     * @protected
     * @since 1.0.000 (2011-05-24)
     */
    protected function getIndirectObject($obj_ref, $offset=0, $decoding=true) {
        $obj = explode('_', $obj_ref);
        if (($obj === false) OR (count($obj) != 2)) {
            $this->Error('Invalid object reference: '.$obj_ref);
            return;
        }
        $objref = $obj[0].' '.$obj[1].' obj';
        $offset = intval($offset);
        // the object header must be located exactly at the given offset
        if (substr($this->pdfdata, $offset, strlen($objref)) !== $objref) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return array('null', 'null', $offset);
        }
        // starting position of object content
        $offset += strlen($objref);
        // get array of object content
        $objdata = array();
        $i = 0; // object main index
        do {
            $oldoffset = $offset;
            // get element
            $element = $this->getRawObject($offset);
            $offset = $element[2];
            // decode stream using stream's dictionary information
            if ($decoding AND ($element[0] == 'stream') AND (isset($objdata[($i - 1)][0])) AND ($objdata[($i - 1)][0] == '<<')) {
                $element[3] = $this->decodeStream($objdata[($i - 1)][1], $element[1]);
            }
            $objdata[$i] = $element;
            ++$i;
            // stop as well when no progress is made (truncated or malformed data)
        } while (($element[0] != 'endobj') AND ($offset != $oldoffset));
        // remove closing delimiter
        array_pop($objdata);
        // return raw object content
        return $objdata;
    }

    /**
     * Get the content of object, resolving indirect object reference if necessary.
     * @param $obj (array) Object element as returned by getRawObject().
     * @return array containing object data: the list of elements of the referenced object when
     *         the reference can be resolved, the unchanged input element otherwise.
     * @protected
     * @since 1.0.000 (2011-06-26)
     */
    protected function getObjectVal($obj) {
        // getRawObject() tags references as 'ojbref'; 'objref' is accepted too
        if (($obj[0] == 'objref') OR ($obj[0] == 'ojbref')) {
            $ref = $obj[1];
            // reference to indirect object
            if (isset($this->objects[$ref])) {
                // this object has been already parsed
                return $this->objects[$ref];
            } elseif (isset($this->xref['xref'][$ref])) {
                // parse new object (object offsets are stored under the 'xref' key)
                $this->objects[$ref] = $this->getIndirectObject($ref, $this->xref['xref'][$ref], false);
                return $this->objects[$ref];
            }
        }
        return $obj;
    }

    /**
     * Decode the specified stream.
     * @param $sdic (array) Stream's dictionary array.
     * @param $stream (string) Stream to decode.
     * @return array containing decoded stream data and remaining filters.
     * @protected
     * @since 1.0.000 (2011-06-22)
     */
    protected function decodeStream($sdic, $stream) {
        // get stream length and filters
        $slength = strlen($stream);
        $filters = array();
        foreach ($sdic as $k => $v) {
            if ($v[0] == '/') {
                if (($v[1] == 'Length') AND (isset($sdic[($k + 1)])) AND ($sdic[($k + 1)][0] == 'numeric')) {
                    // get declared stream length
                    $declength = intval($sdic[($k + 1)][1]);
                    if ($declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                } elseif (($v[1] == 'Filter') AND (isset($sdic[($k + 1)]))) {
                    // resolve indirect object
                    $objval = $this->getObjectVal($sdic[($k + 1)]);
                    if (isset($objval[0]) AND is_array($objval[0])) {
                        // a resolved indirect object is a list of elements: the value is the first one
                        $objval = $objval[0];
                    }
                    if ($objval[0] == '/') {
                        // single filter
                        $filters[] = $objval[1];
                    } elseif ($objval[0] == '[') {
                        // array of filters
                        foreach ($objval[1] as $flt) {
                            if ($flt[0] == '/') {
                                $filters[] = $flt[1];
                            }
                        }
                    }
                }
            }
        }
        // decode the stream, applying the filters in the declared order
        $remaining_filters = array();
        foreach ($filters as $filter) {
            if (in_array($filter, $this->FilterDecoders->getAvailableFilters())) {
                $stream = $this->FilterDecoders->decodeFilter($filter, $stream);
            } else {
                // add missing filter to array
                $remaining_filters[] = $filter;
            }
        }
        return array($stream, $remaining_filters);
    }

    /**
     * This method is automatically called in case of fatal error; it simply outputs the message and halts the execution.
     * @param $msg (string) The error message
     * @public
     * @since 1.0.000 (2011-05-23)
     */
    public function Error($msg) {
        // exit program and print error
        die('<strong>TCPDF_PARSER ERROR: </strong>'.$msg);
    }

} // END OF TCPDF_PARSER CLASS
508 //============================================================+
509 // END OF FILE
510 //============================================================+