library/html2pdf/_class/parsingHtml.class.php

   1 <?php
   2 /**
   3  * HTML2PDF Library - parsingHtml class
   4  *
   5  * HTML => PDF convertor
   6  * distributed under the LGPL License
   7  *
   8  * @package   Html2pdf
   9  * @author    Laurent MINGUET <webmaster@html2pdf.fr>
  10  * @copyright 2016 Laurent MINGUET
  11  */
  12 class HTML2PDF_parsingHtml
  13 {
  14     protected    $_html     = '';        // HTML code to parse
  15     protected    $_num      = 0;         // table number
  16     protected    $_level    = 0;         // table level
  17     protected    $_encoding = '';        // encoding
  18     public       $code      = array();   // parsed HTML code
  19
  20     const HTML_TAB = '        ';
  21
  22     /**
  23      * main constructor
  24      *
  25      * @param   string $encoding
  26      * @access  public
  27      */
  28     public function __construct($encoding = 'UTF-8')
  29     {
  30         $this->_num   = 0;
  31         $this->_level = array($this->_num);
  32         $this->_html  = '';
  33         $this->code  = array();
  34         $this->setEncoding($encoding);
  35     }
  36
  37     /**
  38      * change the encoding
  39      *
  40      * @param   string $encoding
  41      * @access  public
  42      */
  43     public function setEncoding($encoding)
  44     {
  45         $this->_encoding = $encoding;
  46     }
  47
  48     /**
  49      * Define the HTML code to parse
  50      *
  51      * @param   string $html code
  52      * @access  public
  53      */
  54     public function setHTML($html)
  55     {
  56         // remove the HTML in comment
  57         $html = preg_replace('/<!--(.*)-->/isU', '', $html);
  58
  59         // save the HTML code
  60         $this->_html = $html;
  61     }
  62
  63     /**
  64      * parse the HTML code
  65      *
  66      * @access public
  67      */
  68     public function parse()
  69     {
  70         $parents = array();
  71
  72         // flag : are we in a <pre> Tag ?
  73         $tagPreIn = false;
  74
  75         // action to use for each line of the content of a <pre> Tag
  76         $tagPreBr = array(
  77                     'name' => 'br',
  78                     'close' => false,
  79                     'param' => array(
  80                         'style' => array(),
  81                         'num'    => 0
  82                     )
  83                 );
  84
  85         // tag that can be not closed
  86         $tagsNotClosed = array(
  87             'br', 'hr', 'img', 'col',
  88             'input', 'link', 'option',
  89             'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
  90         );
  91
  92         // search the HTML tags
  93         $parts = $this->_searchCode();
  94
  95         // all the actions to do
  96         $actions = array();
  97
  98         // foreach part of the HTML code
  99         foreach ($parts as $part) {
 100             // if it is a tag code
 101             if ($part[0] == 'code') {
 102                 // analyze the HTML code
 103                 $res = $this->_analyzeCode($part[1]);
 104
 105                 // if it is a real HTML tag
 106                 if ($res) {
 107                     // save the current position in the HTML code
 108                     $res['html_pos'] = $part[2];
 109
 110                     // if the tag must be closed
 111                     if (!in_array($res['name'], $tagsNotClosed)) {
 112                         // if it is a closure tag
 113                         if ($res['close']) {
 114                             // HTML validation
 115                             if (count($parents) < 1) {
 116                                 throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
 117                             } else if (end($parents) != $res['name']) {
 118                                 throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
 119                             } else {
 120                                 array_pop($parents);
 121                             }
 122                         } else {
 123                             // if it is an auto-closed tag
 124                             if ($res['autoclose']) {
 125                                 // save the opened tag
 126                                 $actions[] = $res;
 127
 128                                 // prepare the closed tag
 129                                 $res['params'] = array();
 130                                 $res['close'] = true;
 131                             } else {
 132                                 // else: add a child for validation
 133                                 array_push($parents, $res['name']);
 134                             }
 135                         }
 136
 137                         // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
 138                         if (($res['name'] == 'pre' || $res['name'] == 'code') && !$res['autoclose']) {
 139                             $tagPreIn = !$res['close'];
 140                         }
 141                     }
 142
 143                     // save the actions to convert
 144                     $actions[] = $res;
 145                 } else { // else (it is not a real HTML tag => we transform it in Text
 146                     $part[0] = 'txt';
 147                 }
 148             }
 149             // if it is text
 150             if ($part[0] == 'txt') {
 151                 // if we are not in a <pre> tag
 152                 if (!$tagPreIn) {
 153                     // save the action
 154                     $actions[] = array(
 155                         'name'  => 'write',
 156                         'close' => false,
 157                         'param' => array('txt' => $this->_prepareTxt($part[1])),
 158                     );
 159                 } else { // else (if we are in a <pre> tag)
 160                     // prepare the text
 161                     $part[1] = str_replace("\r", '', $part[1]);
 162                     $part[1] = explode("\n", $part[1]);
 163
 164                     // foreach line of the text
 165                     foreach ($part[1] as $k => $txt) {
 166                         // transform the line
 167                         $txt = str_replace("\t", self::HTML_TAB, $txt);
 168                         $txt = str_replace(' ', '&nbsp;', $txt);
 169
 170                         // add a break line
 171                         if ($k > 0) {
 172                             $actions[] = $tagPreBr;
 173                         }
 174
 175                         // save the action
 176                         $actions[] = array(
 177                             'name'  => 'write',
 178                             'close' => false,
 179                             'param' => array('txt' => $this->_prepareTxt($txt, false)),
 180                         );
 181                     }
 182                 }
 183             }
 184         }
 185
 186         // for each identified action, we have to clean up the begin and the end of the texte
 187         // based on tags that surround it
 188
 189         // list of the tags to clean
 190         $tagsToClean = array(
 191             'page', 'page_header', 'page_footer', 'form',
 192             'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
 193             'div', 'hr', 'p', 'ul', 'ol', 'li',
 194             'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
 195             'bookmark', 'fieldset', 'legend',
 196             'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
 197             'option'
 198         );
 199
 200         // foreach action
 201         $nb = count($actions);
 202         for ($k = 0; $k < $nb; $k++) {
 203             // if it is a Text
 204             if ($actions[$k]['name']=='write') {
 205                 // if the tag before the text is a tag to clean => ltrim on the text
 206                 if ($k>0 && in_array($actions[$k - 1]['name'], $tagsToClean))
 207                     $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
 208
 209                 // if the tag after the text is a tag to clean => rtrim on the text
 210                 if ($k < $nb - 1 && in_array($actions[$k + 1]['name'], $tagsToClean))
 211                     $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
 212
 213                 // if the text is empty => remove the action
 214                 if (!strlen($actions[$k]['param']['txt'])) {
 215                     unset($actions[$k]);
 216                 }
 217             }
 218         }
 219
 220         // if we are not on the level 0 => HTML validator ERROR
 221         if (count($parents)) {
 222             throw new HTML2PDF_exception(5, $parents);
 223         }
 224
 225         // save the actions to do
 226         $this->code = array_values($actions);
 227     }
 228
 229     /**
 230      * prepare the text
 231      *
 232      * @param   string $txt
 233      * @param   boolean $spaces true => replace multiple space+\t+\r+\n by a single space
 234      * @return  string txt
 235      * @access  protected
 236      */
 237     protected function _prepareTxt($txt, $spaces = true)
 238     {
 239         if ($spaces) $txt = preg_replace('/\s+/isu', ' ', $txt);
 240         $txt = str_replace('&euro;', '€', $txt);
 241         $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
 242         return $txt;
 243     }
 244
 245     /**
 246      * parse the HTML code
 247      *
 248      * @return array
 249      */
 250     protected function _searchCode()
 251     {
 252         // initialise the array
 253         $parts = array();
 254
 255         // regexp to separate the tags from the texts
 256         $reg = '/(<[^>]+>)|([^<]+)+/isU';
 257
 258         // last match found
 259         $str = '';
 260         $offset = 0;
 261
 262         // As it finds a match
 263         while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
 264             // if it is a tag
 265             if ($parse[1][0]) {
 266                 // save the previous text if it exists
 267                 if ($str !== '') {
 268                     $parts[] = array('txt', $str);
 269                 }
 270
 271                 // save the tag, with the offset
 272                 $parts[] = array('code', trim($parse[1][0]), $offset);
 273
 274                 // init the current text
 275                 $str = '';
 276             } else { // else (if it is a text)
 277                 // add the new text to the current text
 278                 $str .= $parse[2][0];
 279             }
 280
 281             // Update offset to the end of the match
 282             $offset = $parse[0][1] + strlen($parse[0][0]);
 283             unset($parse);
 284         }
 285         // if a text is present in the end, we save it
 286         if ($str != '') {
 287             $parts[] = array('txt', $str);
 288         }
 289
 290         return $parts;
 291     }
 292
 293     /**
 294      * analise a HTML tag
 295      *
 296      * @param   string   $code HTML code to analise
 297      * @return  array    corresponding action
 298      */
 299     protected function _analyzeCode($code)
 300     {
 301         // name of the tag, opening, closure, autoclosure
 302         $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
 303         if (!preg_match('/'.$tag.'/isU', $code, $match)) {
 304             return null;
 305         }
 306         $close     = ($match[1] == '/' ? true : false);
 307         $autoclose = preg_match('/\/>$/isU', $code);
 308         $name      = strtolower($match[2]);
 309
 310         // required parameters (depends on the tag name)
 311         $param    = array();
 312         $param['style'] = '';
 313         if ($name == 'img') {
 314             $param['alt'] = '';
 315             $param['src'] = '';
 316         }
 317         if ($name == 'a') {
 318             $param['href'] = '';
 319         }
 320
 321         // read the parameters : name=value
 322         $prop = '([a-zA-Z0-9_]+)=([^"\'\s>]+)';
 323         preg_match_all('/'.$prop.'/is', $code, $match);
 324         for ($k = 0; $k < count($match[0]); $k++) {
 325             $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
 326         }
 327
 328         // read the parameters : name="value"
 329         $prop = '([a-zA-Z0-9_]+)=["]([^"]*)["]';
 330         preg_match_all('/'.$prop.'/is', $code, $match);
 331         for ($k = 0; $k < count($match[0]); $k++) {
 332             $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
 333         }
 334
 335         // read the parameters : name='value'
 336         $prop = "([a-zA-Z0-9_]+)=[']([^']*)[']";
 337         preg_match_all('/'.$prop.'/is', $code, $match);
 338         for ($k = 0; $k < count($match[0]); $k++) {
 339             $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
 340         }
 341
 342         // compliance of each parameter
 343         $color  = "#000000";
 344         $border = null;
 345         foreach ($param as $key => $val) {
 346             $key = strtolower($key);
 347             switch($key)
 348             {
 349                 case 'width':
 350                     unset($param[$key]);
 351                     $param['style'] .= 'width: '.$val.'px; ';
 352                     break;
 353
 354                 case 'align':
 355                     if ($name === 'img') {
 356                         unset($param[$key]);
 357                         $param['style'] .= 'float: '.$val.'; ';
 358                     } elseif ($name !== 'table') {
 359                         unset($param[$key]);
 360                         $param['style'] .= 'text-align: '.$val.'; ';
 361                     }
 362                     break;
 363
 364                 case 'valign':
 365                     unset($param[$key]);
 366                     $param['style'] .= 'vertical-align: '.$val.'; ';
 367                     break;
 368
 369                 case 'height':
 370                     unset($param[$key]);
 371                     $param['style'] .= 'height: '.$val.'px; ';
 372                     break;
 373
 374                 case 'bgcolor':
 375                     unset($param[$key]);
 376                     $param['style'] .= 'background: '.$val.'; ';
 377                     break;
 378
 379                 case 'bordercolor':
 380                     unset($param[$key]);
 381                     $color = $val;
 382                     break;
 383
 384                 case 'border':
 385                     unset($param[$key]);
 386                     if (preg_match('/^[0-9]+$/isU', $val)) {
 387                         $val = $val.'px';
 388                     }
 389                     $border = $val;
 390                     break;
 391
 392                 case 'cellpadding':
 393                 case 'cellspacing':
 394                     if (preg_match('/^([0-9]+)$/isU', $val)) {
 395                         $param[$key] = $val.'px';
 396                     }
 397                     break;
 398
 399                 case 'colspan':
 400                 case 'rowspan':
 401                     $val = preg_replace('/[^0-9]/isU', '', $val);
 402                     if (!$val) {
 403                         $val = 1;
 404                     }
 405                     $param[$key] = $val;
 406                     break;
 407             }
 408         }
 409
 410         // compliance of the border
 411         if ($border !== null) {
 412             if ($border)    $border = 'border: solid '.$border.' '.$color;
 413             else            $border = 'border: none';
 414
 415             $param['style'] .= $border.'; ';
 416             $param['border'] = $border;
 417         }
 418
 419         // reading styles: decomposition and standardization
 420         $styles = explode(';', $param['style']);
 421         $param['style'] = array();
 422         foreach ($styles as $style) {
 423             $tmp = explode(':', $style);
 424             if (count($tmp) > 1) {
 425                 $cod = $tmp[0];
 426                 unset($tmp[0]);
 427                 $tmp = implode(':', $tmp);
 428                 $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
 429             }
 430         }
 431
 432         // determining the level of table opening, with an added level
 433         if (in_array($name, array('ul', 'ol', 'table')) && !$close) {
 434             $this->_num++;
 435             $this->_level[count($this->_level)] = $this->_num;
 436         }
 437
 438         // get the level of the table containing the element
 439         if (!isset($param['num'])) {
 440             $param['num'] = $this->_level[count($this->_level) - 1];
 441         }
 442
 443         // for closures table: remove a level
 444         if (in_array($name, array('ul', 'ol', 'table')) && $close) {
 445             unset($this->_level[count($this->_level) - 1]);
 446         }
 447
 448         // prepare the parameters
 449         if (isset($param['value']))  $param['value']  = $this->_prepareTxt($param['value']);
 450         if (isset($param['alt']))    $param['alt']    = $this->_prepareTxt($param['alt']);
 451         if (isset($param['title']))  $param['title']  = $this->_prepareTxt($param['title']);
 452         if (isset($param['class']))  $param['class']  = $this->_prepareTxt($param['class']);
 453
 454         // return the new action to do
 455         return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
 456     }
 457
 458     /**
 459      * get a full level of HTML, between an opening and closing corresponding
 460      *
 461      * @param   integer $k
 462      * @return  array   actions
 463      */
 464     public function getLevel($k)
 465     {
 466         // if the code does not exist => return empty
 467         if (!isset($this->code[$k])) {
 468             return array();
 469         }
 470
 471         // the tag to detect
 472         $detect = $this->code[$k]['name'];
 473
 474         // if it is a text => return
 475         if ($detect == 'write') {
 476             return array($this->code[$k]);
 477         }
 478
 479         //
 480         $level = 0;      // depth level
 481         $end = false;    // end of the search
 482         $code = array(); // extract code
 483
 484         // while it's not ended
 485         while (!$end) {
 486             // current action
 487             $row = $this->code[$k];
 488
 489             // if 'write' => we add the text
 490             if ($row['name']=='write') {
 491                 $code[] = $row;
 492             } else { // else, it is a html tag
 493                 $not = false; // flag for not taking into account the current tag
 494
 495                 // if it is the searched tag
 496                 if ($row['name'] == $detect) {
 497                     // if we are just at the root level => dont take it
 498                     if ($level == 0) {
 499                         $not = true;
 500                     }
 501
 502                     // update the level
 503                     $level+= ($row['close'] ? -1 : 1);
 504
 505                     // if we are now at the root level => it is the end, and dont take it
 506                     if ($level == 0) {
 507                         $not = true;
 508                         $end = true;
 509                     }
 510                 }
 511
 512                 // if we can take into account the current tag => save it
 513                 if (!$not) {
 514                     if (isset($row['style']['text-align'])) {
 515                         unset($row['style']['text-align']);
 516                     }
 517                     $code[] = $row;
 518                 }
 519             }
 520
 521             // it continues as long as there has code to analyze
 522             if (isset($this->code[$k + 1])) {
 523                 $k++;
 524             } else {
 525                 $end = true;
 526             }
 527         }
 528
 529         // return the extract
 530         return $code;
 531     }
 532
 533     /**
 534      * return a part of the HTML code, for error message
 535      *
 536      * @param   integer $pos
 537      * @param   integer $before take before
 538      * @param   integer $after  take after
 539      * @return  string  part of the html code
 540      */
 541     public function getHtmlErrorCode($pos, $before=30, $after=40)
 542     {
 543         return substr($this->_html, $pos-$before, $before+$after);
 544     }
 545 }