lib/html2text/Html2Text.php

   1 <?php
   2
   3 /*
   4  * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
   5  *
   6  * This script is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * The GNU General Public License can be found at
  12  * http://www.gnu.org/copyleft/gpl.html.
  13  *
  14  * This script is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  */
  19
  20 namespace Html2Text;
  21
  22 class Html2Text
  23 {
  24     const ENCODING = 'UTF-8';
  25
  26     protected $htmlFuncFlags;
  27
  28     /**
  29      * Contains the HTML content to convert.
  30      *
  31      * @type string
  32      */
  33     protected $html;
  34
  35     /**
  36      * Contains the converted, formatted text.
  37      *
  38      * @type string
  39      */
  40     protected $text;
  41
  42     /**
  43      * List of preg* regular expression patterns to search for,
  44      * used in conjunction with $replace.
  45      *
  46      * @type array
  47      * @see $replace
  48      */
  49     protected $search = array(
  50         "/\r/",                                           // Non-legal carriage return
  51         "/[\n\t]+/",                                      // Newlines and tabs
  52         '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
  53         '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
  54         '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
  55         '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
  56         '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
  57         '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
  58         '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
  59         '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
  60         '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
  61         '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
  62         '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
  63         '/<li\b[^>]*>/i',                                 // <li>
  64         '/<hr\b[^>]*>/i',                                 // <hr>
  65         '/<div\b[^>]*>/i',                                // <div>
  66         '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
  67         '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
  68         '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
  69         '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
  70         '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
  71     );
  72
  73     /**
  74      * List of pattern replacements corresponding to patterns searched.
  75      *
  76      * @type array
  77      * @see $search
  78      */
  79     protected $replace = array(
  80         '',                              // Non-legal carriage return
  81         ' ',                             // Newlines and tabs
  82         '',                              // <head>
  83         '',                              // <script>s -- which strip_tags supposedly has problems with
  84         '',                              // <style>s -- which strip_tags supposedly has problems with
  85         '_\\1_',                         // <i>
  86         '_\\1_',                         // <em>
  87         "\n\n",                          // <ul> and </ul>
  88         "\n\n",                          // <ol> and </ol>
  89         "\n\n",                          // <dl> and </dl>
  90         "\t* \\1\n",                     // <li> and </li>
  91         " \\1\n",                        // <dd> and </dd>
  92         "\t* \\1",                       // <dt> and </dt>
  93         "\n\t* ",                        // <li>
  94         "\n-------------------------\n", // <hr>
  95         "<div>\n",                       // <div>
  96         "\n\n",                          // <table> and </table>
  97         "\n",                            // <tr> and </tr>
  98         "\t\t\\1\n",                     // <td> and </td>
  99         "",                              // <span class="_html2text_ignore">...</span>
 100         '[\\2]',                         // <img> with alt tag
 101     );
 102
 103     /**
 104      * List of preg* regular expression patterns to search for,
 105      * used in conjunction with $entReplace.
 106      *
 107      * @type array
 108      * @see $entReplace
 109      */
 110     protected $entSearch = array(
 111         '/&#153;/i',                                     // TM symbol in win-1252
 112         '/&#151;/i',                                     // m-dash in win-1252
 113         '/&(amp|#38);/i',                                // Ampersand: see converter()
 114         '/[ ]{2,}/',                                     // Runs of spaces, post-handling
 115     );
 116
 117     /**
 118      * List of pattern replacements corresponding to patterns searched.
 119      *
 120      * @type array
 121      * @see $entSearch
 122      */
 123     protected $entReplace = array(
 124         '™',         // TM symbol
 125         '—',         // m-dash
 126         '|+|amp|+|', // Ampersand: see converter()
 127         ' ',         // Runs of spaces, post-handling
 128     );
 129
 130     /**
 131      * List of preg* regular expression patterns to search for
 132      * and replace using callback function.
 133      *
 134      * @type array
 135      */
 136     protected $callbackSearch = array(
 137         '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
 138         '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
 139         '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
 140         '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
 141         '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
 142         '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
 143         '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
 144     );
 145
 146     /**
 147      * List of preg* regular expression patterns to search for in PRE body,
 148      * used in conjunction with $preReplace.
 149      *
 150      * @type array
 151      * @see $preReplace
 152      */
 153     protected $preSearch = array(
 154         "/\n/",
 155         "/\t/",
 156         '/ /',
 157         '/<pre[^>]*>/',
 158         '/<\/pre>/'
 159     );
 160
 161     /**
 162      * List of pattern replacements corresponding to patterns searched for PRE body.
 163      *
 164      * @type array
 165      * @see $preSearch
 166      */
 167     protected $preReplace = array(
 168         '<br>',
 169         '&nbsp;&nbsp;&nbsp;&nbsp;',
 170         '&nbsp;',
 171         '',
 172         '',
 173     );
 174
 175     /**
 176      * Temporary workspace used during PRE processing.
 177      *
 178      * @type string
 179      */
 180     protected $preContent = '';
 181
 182     /**
 183      * Contains the base URL that relative links should resolve to.
 184      *
 185      * @type string
 186      */
 187     protected $baseurl = '';
 188
 189     /**
 190      * Indicates whether content in the $html variable has been converted yet.
 191      *
 192      * @type boolean
 193      * @see $html, $text
 194      */
 195     protected $converted = false;
 196
 197     /**
 198      * Contains URL addresses from links to be rendered in plain text.
 199      *
 200      * @type array
 201      * @see buildlinkList()
 202      */
 203     protected $linkList = array();
 204
 205     /**
 206      * Various configuration options (able to be set in the constructor)
 207      *
 208      * @type array
 209      */
 210     protected $options = array(
 211         'do_links' => 'inline', // 'none'
 212                                 // 'inline' (show links inline)
 213                                 // 'nextline' (show links on the next line)
 214                                 // 'table' (if a table of link URLs should be listed after the text.
 215                                 // 'bbcode' (show links as bbcode)
 216
 217         'width' => 70,          //  Maximum width of the formatted text, in columns.
 218                                 //  Set this value to 0 (or less) to ignore word wrapping
 219                                 //  and not constrain text to a fixed-width column.
 220     );
 221
 222     private function legacyConstruct($html = '', $fromFile = false, array $options = array())
 223     {
 224         $this->set_html($html, $fromFile);
 225         $this->options = array_merge($this->options, $options);
 226     }
 227
 228     /**
 229      * @param string $html    Source HTML
 230      * @param array  $options Set configuration options
 231      */
 232     public function __construct($html = '', $options = array())
 233     {
 234         // for backwards compatibility
 235         if (!is_array($options)) {
 236             return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
 237         }
 238
 239         $this->html = $html;
 240         $this->options = array_merge($this->options, $options);
 241         $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
 242             ? ENT_COMPAT
 243             : ENT_COMPAT | ENT_HTML5;
 244     }
 245
 246     /**
 247     * Get the source HTML
 248     *
 249     * @return string
 250     */
 251     public function getHtml()
 252     {
 253         return $this->html;
 254     }
 255
 256     /**
 257      * Set the source HTML
 258      *
 259      * @param string $html HTML source content
 260      */
 261     public function setHtml($html)
 262     {
 263         $this->html = $html;
 264         $this->converted = false;
 265     }
 266
 267     /**
 268      * @deprecated
 269      */
 270     public function set_html($html, $from_file = false)
 271     {
 272         if ($from_file) {
 273             throw new \InvalidArgumentException("Argument from_file no longer supported");
 274         }
 275
 276         return $this->setHtml($html);
 277     }
 278
 279     /**
 280      * Returns the text, converted from HTML.
 281      *
 282      * @return string
 283      */
 284     public function getText()
 285     {
 286         if (!$this->converted) {
 287             $this->convert();
 288         }
 289
 290         return $this->text;
 291     }
 292
 293     /**
 294      * @deprecated
 295      */
 296     public function get_text()
 297     {
 298         return $this->getText();
 299     }
 300
 301     /**
 302      * @deprecated
 303      */
 304     public function print_text()
 305     {
 306         print $this->getText();
 307     }
 308
 309     /**
 310      * @deprecated
 311      */
 312     public function p()
 313     {
 314         return $this->print_text();
 315     }
 316
 317     /**
 318      * Sets a base URL to handle relative links.
 319      *
 320      * @param string $baseurl
 321      */
 322     public function setBaseUrl($baseurl)
 323     {
 324         $this->baseurl = $baseurl;
 325     }
 326
 327     /**
 328      * @deprecated
 329      */
 330     public function set_base_url($baseurl)
 331     {
 332         return $this->setBaseUrl($baseurl);
 333     }
 334
 335     protected function convert()
 336     {
 337        $origEncoding = mb_internal_encoding();
 338        mb_internal_encoding(self::ENCODING);
 339
 340        $this->doConvert();
 341
 342        mb_internal_encoding($origEncoding);
 343     }
 344
 345     protected function doConvert()
 346     {
 347         $this->linkList = array();
 348
 349         $text = trim($this->html);
 350
 351         $this->converter($text);
 352
 353         if ($this->linkList) {
 354             $text .= "\n\nLinks:\n------\n";
 355             foreach ($this->linkList as $i => $url) {
 356                 $text .= '[' . ($i + 1) . '] ' . $url . "\n";
 357             }
 358         }
 359
 360         $this->text = $text;
 361
 362         $this->converted = true;
 363     }
 364
 365     protected function converter(&$text)
 366     {
 367         $this->convertBlockquotes($text);
 368         $this->convertPre($text);
 369         $text = preg_replace($this->search, $this->replace, $text);
 370         $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
 371         $text = strip_tags($text);
 372         $text = preg_replace($this->entSearch, $this->entReplace, $text);
 373         $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);
 374
 375         // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
 376         $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
 377
 378         // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
 379         // This properly handles situation of "&amp;quot;" in input string
 380         $text = str_replace('|+|amp|+|', '&', $text);
 381
 382         // Normalise empty lines
 383         $text = preg_replace("/\n\s+\n/", "\n\n", $text);
 384         $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
 385
 386         // remove leading empty lines (can be produced by eg. P tag on the beginning)
 387         $text = ltrim($text, "\n");
 388
 389         if ($this->options['width'] > 0) {
 390             $text = wordwrap($text, $this->options['width']);
 391         }
 392     }
 393
 394     /**
 395      * Helper function called by preg_replace() on link replacement.
 396      *
 397      * Maintains an internal list of links to be displayed at the end of the
 398      * text, with numeric indices to the original point in the text they
 399      * appeared. Also makes an effort at identifying and handling absolute
 400      * and relative links.
 401      *
 402      * @param  string $link          URL of the link
 403      * @param  string $display       Part of the text to associate number with
 404      * @param  null   $linkOverride
 405      * @return string
 406      */
 407     protected function buildlinkList($link, $display, $linkOverride = null)
 408     {
 409         $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
 410         if ($linkMethod == 'none') {
 411             return $display;
 412         }
 413
 414         // Ignored link types
 415         if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
 416             return $display;
 417         }
 418
 419         if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
 420             $url = $link;
 421         } else {
 422             $url = $this->baseurl;
 423             if (mb_substr($link, 0, 1) != '/') {
 424                 $url .= '/';
 425             }
 426             $url .= $link;
 427         }
 428
 429         if ($linkMethod == 'table') {
 430             if (($index = array_search($url, $this->linkList)) === false) {
 431                 $index = count($this->linkList);
 432                 $this->linkList[] = $url;
 433             }
 434
 435             return $display . ' [' . ($index + 1) . ']';
 436         } elseif ($linkMethod == 'nextline') {
 437             if ($url === $display) {
 438                 return $display;
 439             }
 440             return $display . "\n[" . $url . ']';
 441         } elseif ($linkMethod == 'bbcode') {
 442             return sprintf('[url=%s]%s[/url]', $url, $display);
 443         } else { // link_method defaults to inline
 444             if ($url === $display) {
 445                 return $display;
 446             }
 447             return $display . ' [' . $url . ']';
 448         }
 449     }
 450
 451     protected function convertPre(&$text)
 452     {
 453         // get the content of PRE element
 454         while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
 455             // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
 456             $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);
 457
 458             // Run our defined tags search-and-replace with callback
 459             $this->preContent = preg_replace_callback(
 460                 $this->callbackSearch,
 461                 array($this, 'pregCallback'),
 462                 $this->preContent
 463             );
 464
 465             // convert the content
 466             $this->preContent = sprintf(
 467                 '<div><br>%s<br></div>',
 468                 preg_replace($this->preSearch, $this->preReplace, $this->preContent)
 469             );
 470
 471             // replace the content (use callback because content can contain $0 variable)
 472             $text = preg_replace_callback(
 473                 '/<pre[^>]*>.*<\/pre>/ismU',
 474                 array($this, 'pregPreCallback'),
 475                 $text,
 476                 1
 477             );
 478
 479             // free memory
 480             $this->preContent = '';
 481         }
 482     }
 483
 484     /**
 485      * Helper function for BLOCKQUOTE body conversion.
 486      *
 487      * @param string $text HTML content
 488      */
 489     protected function convertBlockquotes(&$text)
 490     {
 491         if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
 492             $originalText = $text;
 493             $start = 0;
 494             $taglen = 0;
 495             $level = 0;
 496             $diff = 0;
 497             foreach ($matches[0] as $m) {
 498                 $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
 499                 if ($m[0][0] == '<' && $m[0][1] == '/') {
 500                     $level--;
 501                     if ($level < 0) {
 502                         $level = 0; // malformed HTML: go to next blockquote
 503                     } elseif ($level > 0) {
 504                         // skip inner blockquote
 505                     } else {
 506                         $end = $m[1];
 507                         $len = $end - $taglen - $start;
 508                         // Get blockquote content
 509                         $body = mb_substr($text, $start + $taglen - $diff, $len);
 510
 511                         // Set text width
 512                         $pWidth = $this->options['width'];
 513                         if ($this->options['width'] > 0) $this->options['width'] -= 2;
 514                         // Convert blockquote content
 515                         $body = trim($body);
 516                         $this->converter($body);
 517                         // Add citation markers and create PRE block
 518                         $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
 519                         $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
 520                         // Re-set text width
 521                         $this->options['width'] = $pWidth;
 522                         // Replace content
 523                         $text = mb_substr($text, 0, $start - $diff)
 524                             . $body
 525                             . mb_substr($text, $end + mb_strlen($m[0]) - $diff);
 526
 527                         $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
 528                         unset($body);
 529                     }
 530                 } else {
 531                     if ($level == 0) {
 532                         $start = $m[1];
 533                         $taglen = mb_strlen($m[0]);
 534                     }
 535                     $level++;
 536                 }
 537             }
 538         }
 539     }
 540
 541     /**
 542      * Callback function for preg_replace_callback use.
 543      *
 544      * @param  array  $matches PREG matches
 545      * @return string
 546      */
 547     protected function pregCallback($matches)
 548     {
 549         switch (mb_strtolower($matches[1])) {
 550             case 'p':
 551                 // Replace newlines with spaces.
 552                 $para = str_replace("\n", " ", $matches[3]);
 553
 554                 // Trim trailing and leading whitespace within the tag.
 555                 $para = trim($para);
 556
 557                 // Add trailing newlines for this para.
 558                 return "\n" . $para . "\n";
 559             case 'br':
 560                 return "\n";
 561             case 'b':
 562             case 'strong':
 563                 return $this->toupper($matches[3]);
 564             case 'th':
 565                 return $this->toupper("\t\t" . $matches[3] . "\n");
 566             case 'h':
 567                 return $this->toupper("\n\n" . $matches[3] . "\n\n");
 568             case 'a':
 569                 // override the link method
 570                 $linkOverride = null;
 571                 if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
 572                     $linkOverride = $linkOverrideMatch[1];
 573                 }
 574                 // Remove spaces in URL (#1487805)
 575                 $url = str_replace(' ', '', $matches[3]);
 576
 577                 return $this->buildlinkList($url, $matches[5], $linkOverride);
 578         }
 579
 580         return '';
 581     }
 582
 583     /**
 584      * Callback function for preg_replace_callback use in PRE content handler.
 585      *
 586      * @param  array  $matches PREG matches
 587      * @return string
 588      */
 589     protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
 590     {
 591         return $this->preContent;
 592     }
 593
 594     /**
 595      * Strtoupper function with HTML tags and entities handling.
 596      *
 597      * @param  string $str Text to convert
 598      * @return string Converted text
 599      */
 600     protected function toupper($str)
 601     {
 602         // string can contain HTML tags
 603         $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
 604
 605         // convert toupper only the text between HTML tags
 606         foreach ($chunks as $i => $chunk) {
 607             if ($chunk[0] != '<') {
 608                 $chunks[$i] = $this->strtoupper($chunk);
 609             }
 610         }
 611
 612         return implode($chunks);
 613     }
 614
 615     /**
 616      * Strtoupper multibyte wrapper function with HTML entities handling.
 617      *
 618      * @param  string $str Text to convert
 619      * @return string Converted text
 620      */
 621     protected function strtoupper($str)
 622     {
 623         $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
 624         $str = mb_strtoupper($str);
 625         $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);
 626
 627         return $str;
 628     }
 629 }