3 * HTML2PDF Library - parsingHtml class
5 * HTML => PDF convertor
6 * distributed under the LGPL License
9 * @author Laurent MINGUET <webmaster@html2pdf.fr>
10 * @copyright 2016 Laurent MINGUET
12 class HTML2PDF_parsingHtml
// Raw HTML string the parser works on; read by _searchCode() and getHtmlErrorCode().
14 protected $_html = ''; // HTML code to parse
// Value recorded on the $_level stack when a ul, ol or table tag is opened (see _analyzeCode()).
15 protected $_num = 0; // table number
// Declared as an integer, but the constructor replaces it with array($this->_num)
// and _analyzeCode() then uses it as a stack (append on open, unset of the last entry on close).
16 protected $_level = 0; // table level
// Character set passed to html_entity_decode() by _prepareTxt().
17 protected $_encoding = ''; // encoding
// List of actions stored by parse(); getLevel() reads it by index.
18 public $code = array(); // parsed HTML code
/**
 * Build a parser with an empty action list, a one-entry level stack
 * and the requested encoding.
 *
 * @param string $encoding character set forwarded to setEncoding()
 */
public function __construct($encoding = 'UTF-8')
{
    // start with no parsed actions
    $this->code = array();

    // the level stack starts with the current table number as its only entry
    $this->_level = array($this->_num);

    $this->setEncoding($encoding);
}
/**
 * Select the character set used when HTML entities are decoded.
 *
 * @param string $encoding character set name
 */
public function setEncoding($encoding)
{
    $this->_encoding = $encoding;
}
/**
 * Define the HTML code to parse.
 *
 * HTML comments are removed first; the result is then stored in $this->_html,
 * which is what _searchCode() and getHtmlErrorCode() read.
 *
 * @param string $html code
 */
public function setHTML($html)
{
    // remove the HTML in comment
    $cleaned = preg_replace('/<!--(.*)-->/isU', '', $html);

    // preg_replace() returns null when PCRE fails (for instance when the
    // backtrack limit is hit); keep the untouched input rather than
    // silently replacing the whole document with null
    if ($cleaned === null) {
        $cleaned = $html;
    }

    // save the HTML code
    $this->_html = $cleaned;
}
// Turn the parts returned by _searchCode() into the list of actions saved in $this->code.
// Tag parts are analysed with _analyzeCode() and checked for correct nesting
// (HTML2PDF_exception codes 3, 4 and 5 below); text parts are passed through _prepareTxt().
68 public function parse()
72 // flag : are we in a <pre> Tag ?
75 // action to use for each line of the content of a <pre> Tag
85 // tags that may stay unclosed: the nesting check below is skipped for them
86 $tagsNotClosed = array(
87 'br', 'hr', 'img', 'col',
88 'input', 'link', 'option',
89 'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
92 // search the HTML tags
93 $parts = $this->_searchCode();
95 // all the actions to do
98 // foreach part of the HTML code
99 foreach ($parts as $part) {
100 // if it is a tag code
101 if ($part[0] == 'code') {
102 // analyze the HTML code
103 $res = $this->_analyzeCode($part[1]);
105 // if it is a real HTML tag
107 // save the current position in the HTML code
// $part[2] is the offset recorded by _searchCode(); it is used for the error excerpts below
108 $res['html_pos'] = $part[2];
110 // if the tag must be closed
111 if (!in_array($res['name'], $tagsNotClosed)) {
112 // if it is a closure tag
// nothing is open: report error code 3 with the tag name and an excerpt of the HTML
115 if (count($parents) < 1) {
116 throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
// the innermost open tag has a different name: report error code 4 with the open-tag stack
117 } else if (end($parents) != $res['name']) {
118 throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
123 // if it is an auto-closed tag
124 if ($res['autoclose']) {
125 // save the opened tag
128 // prepare the closed tag
// NOTE(review): this writes the key 'params', whereas _analyzeCode() returns its
// parameters under the key 'param' - confirm which key is intended
129 $res['params'] = array();
130 $res['close'] = true;
132 // else: add a child for validation
133 array_push($parents, $res['name']);
137 // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
138 if (($res['name'] == 'pre' ||
$res['name'] == 'code') && !$res['autoclose']) {
// an opening tag switches the flag on, a closing tag switches it off
139 $tagPreIn = !$res['close'];
143 // save the actions to convert
145 } else { // else (it is not a real HTML tag => we transform it in Text
150 if ($part[0] == 'txt') {
151 // if we are not in a <pre> tag
157 'param' => array('txt' => $this->_prepareTxt($part[1])),
159 } else { // else (if we are in a <pre> tag)
// drop carriage returns, then split on line feeds so each line is handled separately
161 $part[1] = str_replace("\r", '', $part[1]);
162 $part[1] = explode("\n", $part[1]);
164 // foreach line of the text
165 foreach ($part[1] as $k => $txt) {
166 // transform the line
167 $txt = str_replace("\t", self
::HTML_TAB
, $txt);
// NOTE(review): search and replacement both read as a single space here, which would
// make this call a no-op - confirm against the original file
168 $txt = str_replace(' ', ' ', $txt);
172 $actions[] = $tagPreBr;
// second argument false: whitespace inside the line is not collapsed by _prepareTxt()
179 'param' => array('txt' => $this->_prepareTxt($txt, false)),
186 // for each identified action, we have to clean up the beginning and the end of the text
187 // based on tags that surround it
189 // list of the tags to clean
190 $tagsToClean = array(
191 'page', 'page_header', 'page_footer', 'form',
192 'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
193 'div', 'hr', 'p', 'ul', 'ol', 'li',
194 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
195 'bookmark', 'fieldset', 'legend',
196 'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
// the count is taken once, before the loop
201 $nb = count($actions);
202 for ($k = 0; $k < $nb; $k++
) {
// only text actions (name 'write') are trimmed
204 if ($actions[$k]['name']=='write') {
205 // if the tag before the text is a tag to clean => ltrim on the text
206 if ($k>0 && in_array($actions[$k - 1]['name'], $tagsToClean))
207 $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
209 // if the tag after the text is a tag to clean => rtrim on the text
210 if ($k < $nb - 1 && in_array($actions[$k +
1]['name'], $tagsToClean))
211 $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
213 // if the text is empty => remove the action
214 if (!strlen($actions[$k]['param']['txt'])) {
220 // if we are not on the level 0 => HTML validator ERROR
// tags are still open at the end of the document: report error code 5 with the open-tag stack
221 if (count($parents)) {
222 throw new HTML2PDF_exception(5, $parents);
225 // save the actions to do
// array_values() renumbers the list from 0 before it is stored
226 $this->code
= array_values($actions);
/**
 * Prepare a text fragment: optionally collapse whitespace, then decode
 * HTML entities using the parser's encoding.
 *
 * @param string  $txt    text to prepare
 * @param boolean $spaces true => replace multiple space+\t+\r+\n by a single space
 * @return string prepared text
 */
protected function _prepareTxt($txt, $spaces = true)
{
    if ($spaces) {
        $collapsed = preg_replace('/\s+/isu', ' ', $txt);

        // with the "u" modifier preg_replace() returns null when the subject is
        // not valid UTF-8 (possible when another encoding is in use); retry
        // byte-wise instead of silently erasing the text
        if ($collapsed === null) {
            $collapsed = preg_replace('/\s+/is', ' ', $txt);
        }

        $txt = $collapsed;
    }

    // a search string identical to its replacement is a no-op, so the entity form
    // is matched here; the html_entity_decode() call below suggests that was the intent
    $txt = str_replace('&euro;', '€', $txt);

    // callers use the returned value as the text of their action
    return html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
}
246 * parse the HTML code
// Cut $this->_html into an ordered list of parts:
//   array('code', tag text, offset) for each tag,
//   array('txt', text) for each run of text between tags.
250 protected function _searchCode()
252 // initialise the array
255 // regexp to separate the tags from the texts
// group 1 captures a tag ("<" up to the next ">"), group 2 captures text that contains no "<"
256 $reg = '/(<[^>]+>)|([^<]+)+/isU';
262 // As it finds a match
// PREG_OFFSET_CAPTURE makes every entry of $parse an array(matched text, position);
// the search restarts at $offset on each iteration
263 while (preg_match($reg, $this->_html
, $parse, PREG_OFFSET_CAPTURE
, $offset)) {
266 // save the previous text if it exists
268 $parts[] = array('txt', $str);
271 // save the tag, with the offset
// $offset still holds the position where this search started (it is updated further down);
// parse() copies it into 'html_pos'
272 $parts[] = array('code', trim($parse[1][0]), $offset);
274 // init the current text
276 } else { // else (if it is a text)
277 // add the new text to the current text
278 $str .= $parse[2][0];
281 // Update offset to the end of the match
282 $offset = $parse[0][1] +
strlen($parse[0][0]);
285 // if a text is present in the end, we save it
287 $parts[] = array('txt', $str);
296 * @param string $code HTML code to analyze
297 * @return array corresponding action
// Analyse one tag and return the matching action:
// array('name' => ..., 'close' => 1 or 0, 'autoclose' => ..., 'param' => ...).
// Attributes are read into $param and $param['style'] ends up as a name => value array.
299 protected function _analyzeCode($code)
301 // name of the tag, opening, closure, autoclosure
// group 1: optional "/" of a closing tag, group 2: tag name, group 3: what follows the name
302 $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
303 if (!preg_match('/'.$tag.'/isU', $code, $match)) {
306 $close = ($match[1] == '/' ?
true : false);
// preg_match() result: 1 when the tag text ends with "/>", otherwise 0
307 $autoclose = preg_match('/\/>$/isU', $code);
308 $name = strtolower($match[2]);
310 // required parameters (depends on the tag name)
312 $param['style'] = '';
313 if ($name == 'img') {
321 // read the parameters : name=value
// three passes follow (unquoted, double-quoted, single-quoted); each stores under the
// lower-cased attribute name, so a later pass overwrites an earlier one for the same name
322 $prop = '([a-zA-Z0-9_]+)=([^"\'\s>]+)';
323 preg_match_all('/'.$prop.'/is', $code, $match);
324 for ($k = 0; $k < count($match[0]); $k++
) {
325 $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
328 // read the parameters : name="value"
329 $prop = '([a-zA-Z0-9_]+)=["]([^"]*)["]';
330 preg_match_all('/'.$prop.'/is', $code, $match);
331 for ($k = 0; $k < count($match[0]); $k++
) {
332 $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
335 // read the parameters : name='value'
336 $prop = "([a-zA-Z0-9_]+)=[']([^']*)[']";
337 preg_match_all('/'.$prop.'/is', $code, $match);
338 for ($k = 0; $k < count($match[0]); $k++
) {
339 $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
342 // compliance of each parameter
// the visible branches below append CSS declarations built from attribute values to $param['style']
345 foreach ($param as $key => $val) {
346 $key = strtolower($key);
351 $param['style'] .= 'width: '.$val.'px; ';
// the same attribute value becomes "float" on an img and "text-align" on any tag other than table
355 if ($name === 'img') {
357 $param['style'] .= 'float: '.$val.'; ';
358 } elseif ($name !== 'table') {
360 $param['style'] .= 'text-align: '.$val.'; ';
366 $param['style'] .= 'vertical-align: '.$val.'; ';
371 $param['style'] .= 'height: '.$val.'px; ';
376 $param['style'] .= 'background: '.$val.'; ';
386 if (preg_match('/^[0-9]+$/isU', $val)) {
// a value made only of digits gets the unit "px" appended
394 if (preg_match('/^([0-9]+)$/isU', $val)) {
395 $param[$key] = $val.'px';
// keep the digits only
401 $val = preg_replace('/[^0-9]/isU', '', $val);
410 // compliance of the border
411 if ($border !== null) {
412 if ($border) $border = 'border: solid '.$border.' '.$color;
413 else $border = 'border: none';
// the border declaration is appended to the style string and also stored under 'border'
415 $param['style'] .= $border.'; ';
416 $param['border'] = $border;
419 // reading styles: decomposition and standardization
// split the style string on ";", then each declaration on ":"
420 $styles = explode(';', $param['style']);
421 $param['style'] = array();
422 foreach ($styles as $style) {
423 $tmp = explode(':', $style);
424 if (count($tmp) > 1) {
// joined back with ":" - presumably so that a value containing ":" stays in one piece
427 $tmp = implode(':', $tmp);
// the key is lower-cased and trimmed; runs of whitespace in the value become one space
428 $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
432 // determining the level of table opening, with an added level
433 if (in_array($name, array('ul', 'ol', 'table')) && !$close) {
// append the current $_num at the end of the $_level stack
435 $this->_level
[count($this->_level
)] = $this->_num
;
438 // get the level of the table containing the element
// an explicit "num" attribute is kept; otherwise the last entry of the stack is used
439 if (!isset($param['num'])) {
440 $param['num'] = $this->_level
[count($this->_level
) - 1];
443 // for closures table: remove a level
444 if (in_array($name, array('ul', 'ol', 'table')) && $close) {
445 unset($this->_level
[count($this->_level
) - 1]);
448 // prepare the parameters
// these four attributes go through _prepareTxt() with its default whitespace collapsing
449 if (isset($param['value'])) $param['value'] = $this->_prepareTxt($param['value']);
450 if (isset($param['alt'])) $param['alt'] = $this->_prepareTxt($param['alt']);
451 if (isset($param['title'])) $param['title'] = $this->_prepareTxt($param['title']);
452 if (isset($param['class'])) $param['class'] = $this->_prepareTxt($param['class']);
454 // return the new action to do
// 'close' is returned as the integer 1 or 0
455 return array('name' => $name, 'close' => $close ?
1 : 0, 'autoclose' => $autoclose, 'param' => $param);
459 * get a full level of HTML, between an opening tag and its corresponding closing tag
462 * @return array actions
// Extract actions from $this->code starting at index $k.
// A text action ('write') is returned alone; for a tag, the walk tracks the nesting depth
// of tags that have the same name as the one found at $k.
464 public function getLevel($k)
466 // if the code does not exist => return empty
467 if (!isset($this->code
[$k])) {
// name of the action found at $k; this is the tag name followed during the walk
472 $detect = $this->code
[$k]['name'];
474 // if it is a text => return
475 if ($detect == 'write') {
476 return array($this->code
[$k]);
480 $level = 0; // depth level
481 $end = false; // end of the search
482 $code = array(); // extract code
484 // while it's not ended
487 $row = $this->code
[$k];
489 // if 'write' => we add the text
490 if ($row['name']=='write') {
492 } else { // else, it is a html tag
493 $not = false; // flag for not taking into account the current tag
495 // if it is the searched tag
496 if ($row['name'] == $detect) {
497 // if we are just at the root level => dont take it
// a closing tag lowers the depth by one, an opening tag raises it by one
503 $level+
= ($row['close'] ?
-1 : 1);
505 // if we are now at the root level => it is the end, and dont take it
512 // if we can take into account the current tag => save it
// NOTE(review): actions returned by _analyzeCode() keep their styles under
// ['param']['style'], so this top-level ['style'] lookup may never match - confirm
514 if (isset($row['style']['text-align'])) {
515 unset($row['style']['text-align']);
521 // it continues as long as there is code to analyze
522 if (isset($this->code
[$k +
1])) {
529 // return the extract
/**
 * return a part of the HTML code, for error message
 *
 * The excerpt covers $before bytes before $pos and $after bytes after it,
 * and is cut short at the beginning of the HTML code when $pos is closer
 * to the start than $before.
 *
 * @param integer $pos    position in the HTML code
 * @param integer $before take before
 * @param integer $after  take after
 * @return string part of the html code
 */
public function getHtmlErrorCode($pos, $before=30, $after=40)
{
    // clamp the start: a negative start would make substr() count from the END
    // of the string and return an unrelated part of the document
    $start = max(0, $pos - $before);

    // shorten the length by whatever was clamped, so the excerpt still stops
    // $after bytes past $pos; never pass a negative length to substr()
    $length = max(0, ($pos - $start) + $after);

    // the cast keeps the documented string return type on PHP versions where
    // substr() returns false for a start beyond the end of the string
    return (string) substr($this->_html, $start, $length);
}