preparing for 5.0.1 release in several weeks (#1509)
[openemr.git] / library / html2pdf / _class / parsingHtml.class.php
blob81ac3a67784b979a4aeca28b5ada43e5015fb892
1 <?php
/**
 * HTML2PDF Library - parsingHtml class
 *
 * HTML => PDF converter
 * distributed under the LGPL License
 *
 * @package   Html2pdf
 * @author    Laurent MINGUET <webmaster@html2pdf.fr>
 * @copyright 2016 Laurent MINGUET
 */
/**
 * Splits an HTML string into a flat list of "actions" (opening tags, closing
 * tags and text chunks) that is stored in the public $code property.
 */
class HTML2PDF_parsingHtml
{
    protected $_html     = '';       // HTML code to parse
    protected $_num      = 0;        // table number
    protected $_level    = array(0); // stack of table/list numbers, root level first
    protected $_encoding = '';       // encoding
    public    $code      = array();  // parsed HTML code

    // replacement used for a tab character inside <pre>/<code>
    // NOTE(review): this copy shows a single space; verify against upstream in
    // case whitespace inside the literal was collapsed when the file was copied.
    const HTML_TAB = ' ';

    /**
     * main constructor
     *
     * @param string $encoding
     * @access public
     */
    public function __construct($encoding = 'UTF-8')
    {
        $this->_num   = 0;
        $this->_level = array($this->_num);
        $this->_html  = '';
        $this->code   = array();
        $this->setEncoding($encoding);
    }

    /**
     * change the encoding
     *
     * @param string $encoding
     * @access public
     */
    public function setEncoding($encoding)
    {
        $this->_encoding = $encoding;
    }

    /**
     * Define the HTML code to parse
     *
     * @param string $html code
     * @access public
     */
    public function setHTML($html)
    {
        // remove the HTML in comment
        $withoutComments = preg_replace('/<!--(.*)-->/isU', '', $html);

        // preg_replace returns null on a PCRE error; keep the
        // original HTML in that case instead of silently storing nothing
        if ($withoutComments !== null) {
            $html = $withoutComments;
        }

        // save the HTML code
        $this->_html = $html;
    }

    /**
     * parse the HTML code
     *
     * Fills $this->code with the list of actions. Throws HTML2PDF_exception
     * with code 3 when a closing tag has no opened parent, code 4 when a
     * closing tag does not match the last opened tag, and code 5 when some
     * tags are still opened at the end of the HTML.
     *
     * @access public
     */
    public function parse()
    {
        $parents = array();

        // flag : are we in a <pre> Tag ?
        $tagPreIn = false;

        // action to use for each line of the content of a <pre> Tag
        $tagPreBr = array(
            'name'  => 'br',
            'close' => false,
            'param' => array(
                'style' => array(),
                'num'   => 0
            )
        );

        // tag that can be not closed
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // search the HTML tags
        $parts = $this->_searchCode();

        // all the actions to do
        $actions = array();

        // foreach part of the HTML code
        foreach ($parts as $part) {
            // if it is a tag code
            if ($part[0] == 'code') {
                // analyze the HTML code
                $res = $this->_analyzeCode($part[1]);

                // if it is a real HTML tag
                if ($res) {
                    // save the current position in the HTML code
                    $res['html_pos'] = $part[2];

                    // if the tag must be closed
                    if (!in_array($res['name'], $tagsNotClosed)) {
                        // if it is a closure tag
                        if ($res['close']) {
                            // HTML validation
                            if (count($parents) < 1) {
                                throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
                            } else if (end($parents) != $res['name']) {
                                throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
                            } else {
                                array_pop($parents);
                            }
                        } else {
                            // if it is an auto-closed tag
                            if ($res['autoclose']) {
                                // save the opened tag
                                $actions[] = $res;

                                // prepare the closed tag
                                // NOTE(review): this writes the key 'params' while the rest of
                                // the class uses 'param'; kept as-is because the consumers of
                                // the closing action are outside this file.
                                $res['params'] = array();
                                $res['close'] = true;
                            } else {
                                // else: add a child for validation
                                array_push($parents, $res['name']);
                            }
                        }

                        // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
                        if (($res['name'] == 'pre' || $res['name'] == 'code') && !$res['autoclose']) {
                            $tagPreIn = !$res['close'];
                        }
                    }

                    // save the actions to convert
                    $actions[] = $res;
                } else { // else (it is not a real HTML tag => we transform it in Text
                    $part[0] = 'txt';
                }
            }

            // if it is text
            if ($part[0] == 'txt') {
                // if we are not in a <pre> tag
                if (!$tagPreIn) {
                    // save the action
                    $actions[] = array(
                        'name'  => 'write',
                        'close' => false,
                        'param' => array('txt' => $this->_prepareTxt($part[1])),
                    );
                } else { // else (if we are in a <pre> tag)
                    // prepare the text
                    $part[1] = str_replace("\r", '', $part[1]);
                    $part[1] = explode("\n", $part[1]);

                    // foreach line of the text
                    foreach ($part[1] as $k => $txt) {
                        // transform the line
                        $txt = str_replace("\t", self::HTML_TAB, $txt);
                        $txt = str_replace(' ', '&nbsp;', $txt);

                        // add a break line
                        if ($k > 0) {
                            $actions[] = $tagPreBr;
                        }

                        // save the action
                        $actions[] = array(
                            'name'  => 'write',
                            'close' => false,
                            'param' => array('txt' => $this->_prepareTxt($txt, false)),
                        );
                    }
                }
            }
        }

        // for each identified action, we have to clean up the begin and the end of the text
        // based on tags that surround it

        // list of the tags to clean
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        // foreach action
        $nb = count($actions);
        for ($k = 0; $k < $nb; $k++) {
            // only the texts are cleaned
            if ($actions[$k]['name'] != 'write') {
                continue;
            }

            // if the tag before the text is a tag to clean => ltrim on the text
            // (the previous slot may have been removed by an earlier iteration,
            // hence the isset guard)
            if ($k > 0 && isset($actions[$k - 1]) && in_array($actions[$k - 1]['name'], $tagsToClean)) {
                $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
            }

            // if the tag after the text is a tag to clean => rtrim on the text
            if ($k < $nb - 1 && in_array($actions[$k + 1]['name'], $tagsToClean)) {
                $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
            }

            // if the text is empty => remove the action
            if (!strlen($actions[$k]['param']['txt'])) {
                unset($actions[$k]);
            }
        }

        // if we are not on the level 0 => HTML validator ERROR
        if (count($parents)) {
            throw new HTML2PDF_exception(5, $parents);
        }

        // save the actions to do (re-indexed, because empty texts were removed)
        $this->code = array_values($actions);
    }

    /**
     * prepare the text
     *
     * @param string  $txt
     * @param boolean $spaces true => replace multiple space+\t+\r+\n by a single space
     * @return string txt
     * @access protected
     */
    protected function _prepareTxt($txt, $spaces = true)
    {
        if ($spaces) {
            $collapsed = preg_replace('/\s+/isu', ' ', $txt);

            // with the "u" modifier preg_replace returns null when $txt is not
            // valid UTF-8; retry byte-wise rather than losing the whole text
            if ($collapsed === null) {
                $collapsed = preg_replace('/\s+/is', ' ', $txt);
            }
            if ($collapsed !== null) {
                $txt = $collapsed;
            }
        }

        $txt = str_replace('&euro;', '€', $txt);
        $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
        return $txt;
    }

    /**
     * split the HTML code into tags and texts
     *
     * Each returned entry is array('txt', text) or
     * array('code', tag, position of the tag in the HTML).
     *
     * @return array
     */
    protected function _searchCode()
    {
        // initialise the array
        $parts = array();

        // regexp to separate the tags from the texts
        $reg = '/(<[^>]+>)|([^<]+)+/isU';

        // text accumulated since the last tag
        $str = '';
        $offset = 0;

        // As it finds a match
        while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
            // if it is a tag
            if (isset($parse[1][0]) && $parse[1][0] !== '') {
                // save the previous text if it exists
                if ($str !== '') {
                    $parts[] = array('txt', $str);
                }

                // save the tag, with the position where it was found
                $parts[] = array('code', trim($parse[1][0]), $parse[1][1]);

                // init the current text
                $str = '';
            } else { // else (if it is a text)
                // add the new text to the current text
                $str .= $parse[2][0];
            }

            // Update offset to the end of the match
            $offset = $parse[0][1] + strlen($parse[0][0]);
            unset($parse);
        }

        // if a text is present in the end, we save it
        if ($str != '') {
            $parts[] = array('txt', $str);
        }

        return $parts;
    }

    /**
     * analyse a HTML tag
     *
     * @param string $code HTML code to analyse
     * @return array|null corresponding action, or null if $code is not a tag
     */
    protected function _analyzeCode($code)
    {
        // name of the tag, opening, closure, autoclosure
        $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
        if (!preg_match('/'.$tag.'/isU', $code, $match)) {
            return null;
        }
        $close     = ($match[1] == '/' ? true : false);
        $autoclose = preg_match('/\/>$/isU', $code);
        $name      = strtolower($match[2]);

        // required parameters (depends on the tag name)
        $param = array();
        $param['style'] = '';
        if ($name == 'img') {
            $param['alt'] = '';
            $param['src'] = '';
        }
        if ($name == 'a') {
            $param['href'] = '';
        }

        // read the parameters, in this order (a later form overrides an earlier one):
        // name=value, name="value", name='value'
        $props = array(
            '([a-zA-Z0-9_]+)=([^"\'\s>]+)',
            '([a-zA-Z0-9_]+)=["]([^"]*)["]',
            "([a-zA-Z0-9_]+)=[']([^']*)[']",
        );
        foreach ($props as $prop) {
            preg_match_all('/'.$prop.'/is', $code, $match);
            $nbMatch = count($match[0]);
            for ($k = 0; $k < $nbMatch; $k++) {
                $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
            }
        }

        // the attributes below are appended to the inline style: make sure the
        // inline style is terminated, otherwise style="color:red" align="center"
        // would be merged into the single declaration "color:redtext-align: center"
        $param['style'] = trim($param['style']);
        if ($param['style'] !== '' && substr($param['style'], -1) !== ';') {
            $param['style'] .= ';';
        }

        // compliance of each parameter
        $color  = "#000000";
        $border = null;
        foreach ($param as $key => $val) {
            $key = strtolower($key);
            switch ($key) {
                case 'width':
                    unset($param[$key]);
                    $param['style'] .= 'width: '.$val.'px; ';
                    break;

                case 'align':
                    if ($name === 'img') {
                        unset($param[$key]);
                        $param['style'] .= 'float: '.$val.'; ';
                    } elseif ($name !== 'table') {
                        unset($param[$key]);
                        $param['style'] .= 'text-align: '.$val.'; ';
                    }
                    break;

                case 'valign':
                    unset($param[$key]);
                    $param['style'] .= 'vertical-align: '.$val.'; ';
                    break;

                case 'height':
                    unset($param[$key]);
                    $param['style'] .= 'height: '.$val.'px; ';
                    break;

                case 'bgcolor':
                    unset($param[$key]);
                    $param['style'] .= 'background: '.$val.'; ';
                    break;

                case 'bordercolor':
                    unset($param[$key]);
                    $color = $val;
                    break;

                case 'border':
                    unset($param[$key]);
                    if (preg_match('/^[0-9]+$/isU', $val)) {
                        $val = $val.'px';
                    }
                    $border = $val;
                    break;

                case 'cellpadding':
                case 'cellspacing':
                    if (preg_match('/^([0-9]+)$/isU', $val)) {
                        $param[$key] = $val.'px';
                    }
                    break;

                case 'colspan':
                case 'rowspan':
                    $val = preg_replace('/[^0-9]/isU', '', $val);
                    if (!$val) {
                        $val = 1;
                    }
                    $param[$key] = $val;
                    break;
            }
        }

        // compliance of the border
        if ($border !== null) {
            if ($border) {
                $border = 'border: solid '.$border.' '.$color;
            } else {
                $border = 'border: none';
            }

            $param['style'] .= $border.'; ';
            $param['border'] = $border;
        }

        // reading styles: decomposition and standardization
        $styles = explode(';', $param['style']);
        $param['style'] = array();
        foreach ($styles as $style) {
            $tmp = explode(':', $style);
            if (count($tmp) > 1) {
                // everything after the first ":" is the value (it may contain ":" itself)
                $cod = $tmp[0];
                unset($tmp[0]);
                $tmp = implode(':', $tmp);
                $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
            }
        }

        $isLevelTag = in_array($name, array('ul', 'ol', 'table'));

        // determining the level of table opening, with an added level
        if ($isLevelTag && !$close) {
            $this->_num++;
            $this->_level[count($this->_level)] = $this->_num;
        }

        // get the level of the table containing the element
        if (!isset($param['num'])) {
            $param['num'] = $this->_level[count($this->_level) - 1];
        }

        // for closures table: remove a level, but never the root level, so that
        // a closing tag without opening tag cannot leave the stack empty
        if ($isLevelTag && $close && count($this->_level) > 1) {
            array_pop($this->_level);
        }

        // prepare the parameters
        foreach (array('value', 'alt', 'title', 'class') as $textKey) {
            if (isset($param[$textKey])) {
                $param[$textKey] = $this->_prepareTxt($param[$textKey]);
            }
        }

        // return the new action to do
        return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
    }

    /**
     * get a full level of HTML, between an opening and closing corresponding
     *
     * The opening tag at position $k and its matching closing tag are not
     * part of the result. If $k points to a text, only this text is returned.
     *
     * @param integer $k
     * @return array actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) {
            return array();
        }

        // the tag to detect
        $detect = $this->code[$k]['name'];

        // if it is a text => return
        if ($detect == 'write') {
            return array($this->code[$k]);
        }

        $level = 0;       // depth level
        $end   = false;   // end of the search
        $code  = array(); // extract code

        // while it's not ended
        while (!$end) {
            // current action
            $row = $this->code[$k];

            // if 'write' => we add the text
            if ($row['name'] == 'write') {
                $code[] = $row;
            } else { // else, it is a html tag
                $not = false; // flag for not taking into account the current tag

                // if it is the searched tag
                if ($row['name'] == $detect) {
                    // if we are just at the root level => dont take it
                    if ($level == 0) {
                        $not = true;
                    }

                    // update the level
                    $level += ($row['close'] ? -1 : 1);

                    // if we are now at the root level => it is the end, and dont take it
                    if ($level == 0) {
                        $not = true;
                        $end = true;
                    }
                }

                // if we can take into account the current tag => save it
                if (!$not) {
                    // NOTE(review): the actions built by this class keep their styles
                    // under ['param']['style'], so this test looks like it never
                    // matches; kept unchanged to preserve the current output.
                    if (isset($row['style']['text-align'])) {
                        unset($row['style']['text-align']);
                    }
                    $code[] = $row;
                }
            }

            // it continues as long as there has code to analyze
            if (isset($this->code[$k + 1])) {
                $k++;
            } else {
                $end = true;
            }
        }

        // return the extract
        return $code;
    }

    /**
     * return a part of the HTML code, for error message
     *
     * @param integer $pos
     * @param integer $before take before
     * @param integer $after  take after
     * @return string part of the html code
     */
    public function getHtmlErrorCode($pos, $before = 30, $after = 40)
    {
        // a negative start would make substr() count from the end of the
        // string, so the excerpt is clamped to the beginning of the HTML
        $start  = max(0, $pos - $before);
        $length = ($pos - $start) + $after;

        return (string) substr($this->_html, $start, $length);
    }
}