Applied patch #411
[elgg.git] / lib / html2text.php
blob c4c3c0d878654647b9a2c3131a5015e367378974
1 <?php
3 /***************************************************************
4 * Library to convert HTML into an approximate text equivalent *
5 ***************************************************************
7 Version: 1.0.3 (with modifications)
8 Copyright 2003 Mark Wilton-Jones
9 License: HowToCreate script license with written permission
10 URL: http://www.howtocreate.co.uk/php/
12 For full details about the script and to get the latest version,
13 please see the HowToCreate web site above.
15 This version contains modifications for Moodle. In each case the
16 lines are marked with "Moodle", so you can see what has changed.
18 ********************************************************************/
/**
 * Convert an HTML fragment or page into an approximate plain-text equivalent.
 *
 * Processing steps, in order:
 *  1. HTML comments are removed.
 *  2. The markup is scanned character by character so that stray '<', '>'
 *     and '"' characters that do not delimit a tag are turned into entities,
 *     and <style>/<script> elements are dropped together with their content.
 *  3. All tags except a fixed list of structural ones are stripped.
 *  4. Runs of whitespace are collapsed, except inside <pre> and <textarea>.
 *  5. The remaining structural tags are replaced by text markers
 *     (line breaks, tabs, "* " bullets, "text [url]" links, "[alt]" images,
 *     "[FORM: action]" and "[INPUT]" placeholders).
 *  6. Entities (named and numeric) are decoded in a single pass.
 *  7. The text is wrapped at 78 columns, line endings are normalised to "\n",
 *     long runs of blank lines are shortened and trailing whitespace removed.
 *
 * @param string $badStr HTML to convert; other scalars are cast to string.
 * @return string Plain-text approximation of the input.
 */
function html2text( $badStr ) {

    $badStr = (string) $badStr;

    $is_open_tb = false; // currently inside a tag ("tag bracket" open)
    $is_open_dq = false; // currently inside a double-quoted attribute value
    $is_open_sq = false; // currently inside a single-quoted attribute value

    //remove comments
    while ( ( $open = strpos( $badStr, '<!--' ) ) !== false &&
            ( $close = strpos( $badStr, '-->', $open ) ) !== false ) {
        $badStr = substr( $badStr, 0, $open ) . substr( $badStr, $close + 3 );
    }

    //now make sure all HTML tags are correctly written (> not in between quotes)

    $len = strlen( $badStr );
    // Lower-cased copy used only for case-insensitive searches; it has the
    // same byte length as $badStr, so offsets are interchangeable.
    $lowerStr = strtolower( $badStr );
    $goodStr = '';

    for ( $x = 0; $x < $len; $x++ ) {
        $chr = $badStr[$x]; //take each letter in turn and check if that character is permitted there
        switch ( $chr ) {
            case '<':
                if ( !$is_open_tb && substr( $lowerStr, $x + 1, 5 ) === 'style' ) {
                    // Skip the whole element. Landing on the final '>' of the
                    // closing tag lets the loop's $x++ step past it. With no
                    // closing tag the rest of the input is discarded.
                    $end = strpos( $lowerStr, '</style>', $x );
                    $x = ( $end === false ) ? $len : $end + 7;
                    $chr = '';
                } else if ( !$is_open_tb && substr( $lowerStr, $x + 1, 6 ) === 'script' ) {
                    $end = strpos( $lowerStr, '</script>', $x );
                    $x = ( $end === false ) ? $len : $end + 8;
                    $chr = '';
                } else if ( !$is_open_tb ) {
                    $is_open_tb = true;
                } else {
                    // '<' inside an already open tag cannot start a new tag
                    $chr = '&lt;';
                }
                break;

            case '>':
                if ( !$is_open_tb || $is_open_dq || $is_open_sq ) {
                    // not closing a tag: either no tag is open or we are inside quotes
                    $chr = '&gt;';
                } else {
                    $is_open_tb = false;
                }
                break;

            case '"':
                if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                    $is_open_dq = true;
                } else if ( $is_open_tb && $is_open_dq && !$is_open_sq ) {
                    $is_open_dq = false;
                } else {
                    $chr = '&quot;';
                }
                break;

            case "'":
                // Single quotes only toggle state; they are never escaped.
                if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                    $is_open_sq = true;
                } else if ( $is_open_tb && !$is_open_dq && $is_open_sq ) {
                    $is_open_sq = false;
                }
                break;
        }
        $goodStr .= $chr;
    }

    //now that the page is valid (I hope) for strip_tags, strip all unwanted tags

    $goodStr = strip_tags( $goodStr, '<title><hr><h1><h2><h3><h4><h5><h6><div><p><pre><sup><ul><ol><br><dl><dt><table><caption><tr><li><dd><th><td><a><area><img><form><input><textarea><button><select><option>' );

    //strip extra whitespace except between <pre> and <textarea> tags

    // Splitting on opening/closing tags leaves the element contents at the
    // odd indexes (assuming the tags are balanced).
    $preParts = preg_split( "/<\/?pre[^>]*>/i", $goodStr );
    foreach ( $preParts as $x => $prePart ) {
        if ( $x % 2 ) {
            $preParts[$x] = '<pre>' . $prePart . '</pre>';
            continue;
        }
        $areaParts = preg_split( "/<\/?textarea[^>]*>/i", $prePart );
        foreach ( $areaParts as $z => $areaPart ) {
            if ( $z % 2 ) {
                $areaParts[$z] = '<textarea>' . $areaPart . '</textarea>';
            } else {
                // collapse every run of whitespace into a single space
                $areaParts[$z] = preg_replace( '/\s+/', ' ', $areaPart );
            }
        }
        $preParts[$x] = implode( '', $areaParts );
    }
    $goodStr = implode( '', $preParts );

    //remove all options from select inputs

    $goodStr = preg_replace( "/<option[^>]*>[^<]*/i", '', $goodStr );

    //replace all tags with their text equivalents

    $goodStr = preg_replace( "/<(\/title|hr)[^>]*>/i", "\n --------------------\n", $goodStr );

    $goodStr = preg_replace( "/<(h|div|p)[^>]*>/i", "\n\n", $goodStr );

    $goodStr = preg_replace( "/<sup[^>]*>/i", '^', $goodStr );

    $goodStr = preg_replace( "/<(ul|ol|br|dl|dt|table|caption|\/textarea|tr[^>]*>\s*<(td|th))[^>]*>/i", "\n", $goodStr );

    // ASCII bullet so the output does not depend on the file's encoding
    $goodStr = preg_replace( "/<li[^>]*>/i", "\n* ", $goodStr );

    $goodStr = preg_replace( "/<dd[^>]*>/i", "\n\t", $goodStr );

    $goodStr = preg_replace( "/<(th|td)[^>]*>/i", "\t", $goodStr );

    // links become "link text [url]"; fragment-only and javascript: hrefs are not matched
    $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>([^<]*)<\/a>/i", "$7 [$2$4$6]", $goodStr );

    // images become "[alt text] "
    $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[$2$3$4] ", $goodStr );

    $goodStr = preg_replace( "/<form[^>]* action=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "\n[FORM: $2$3$4] ", $goodStr );

    $goodStr = preg_replace( "/<(input|textarea|button|select)[^>]*>/i", "[INPUT] ", $goodStr );

    //strip all remaining tags (mostly closing tags)

    $goodStr = strip_tags( $goodStr );

    //convert HTML entities

    // One pass over named and numeric entities, so text such as "&amp;lt;"
    // is decoded exactly once (to "&lt;").
    $goodStr = html_entity_decode( $goodStr, ENT_QUOTES );

    //wordwrap

    $goodStr = wordwrap( $goodStr, 78 );

    //make sure there are no more than 3 linebreaks in a row and trim whitespace
    $goodStr = preg_replace( "/\r\n?|\f/", "\n", $goodStr );
    $goodStr = preg_replace( "/\n(\s*\n){2}/", "\n\n\n", $goodStr );
    $goodStr = preg_replace( "/[ \t]+(\n|$)/", "$1", $goodStr );
    $goodStr = preg_replace( "/^\n*|\n*$/", '', $goodStr );

    return $goodStr;
}