3 /***************************************************************
4 * Library to convert HTML into an approximate text equivalent *
5 ***************************************************************
7 Version: 1.0.3 (with modifications)
8 Copyright 2003 Mark Wilton-Jones
9 License: HowToCreate script license with written permission
10 URL: http://www.howtocreate.co.uk/php/
12 For full details about the script and to get the latest version,
13 please see the HowToCreate web site above.
15 This version contains modifications for Moodle. In each case the
16 lines are marked with "Moodle", so you can see what has changed.
18 ********************************************************************/
20 function html2text( $badStr ) {
28 while (substr_count($badStr, '<!--') &&
29 substr_count($badStr, '-->') &&
30 strpos($badStr, '-->', strpos($badStr, '<!--' ) ) > strpos( $badStr, '<!--' ) ) {
31 $badStr = substr( $badStr, 0, strpos( $badStr, '<!--' ) ) .
32 substr( $badStr, strpos( $badStr, '-->',
33 strpos( $badStr, '<!--' ) ) +
3 );
36 //now make sure all HTML tags are correctly written (> not in between quotes)
38 $len = strlen($badStr); // Moodle
39 $chr = $badStr{0}; // Moodle
40 $goodStr = ''; // Moodle
42 if ($len > 0) { // Moodle
43 for ($x=0; $x < $len; $x++
) { // Moodle
44 $chr = $badStr{$x}; //take each letter in turn and check if that character is permitted there
47 if ( !$is_open_tb && strtolower( substr( $badStr, $x +
1, 5 ) ) == 'style' ) {
48 $badStr = substr( $badStr, 0, $x ) .
49 substr( $badStr, strpos( strtolower( $badStr ), '</style>', $x ) +
7 );
51 } else if ( !$is_open_tb && strtolower( substr( $badStr, $x +
1, 6 ) ) == 'script' ) {
52 $badStr = substr( $badStr, 0, $x ) .
53 substr( $badStr, strpos( strtolower( $badStr ), '</script>', $x ) +
8 );
55 } else if (!$is_open_tb) {
63 if ( !$is_open_tb ||
$is_open_dq ||
$is_open_sq ) {
71 if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
73 } else if ( $is_open_tb && $is_open_dq && !$is_open_sq ) {
81 if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
83 } else if ( $is_open_tb && !$is_open_dq && $is_open_sq ) {
92 //now that the page is valid (I hope) for strip_tags, strip all unwanted tags
94 $goodStr = strip_tags( $goodStr, '<title><hr><h1><h2><h3><h4><h5><h6><div><p><pre><sup><ul><ol><br><dl><dt><table><caption><tr><li><dd><th><td><a><area><img><form><input><textarea><button><select><option>' );
96 //strip extra whitespace except between <pre> and <textarea> tags
98 $badStr = preg_split( "/<\/?pre[^>]*>/i", $goodStr );
100 for ( $x = 0; isset($badStr[$x]) && is_string( $badStr[$x] ); $x++
) { // Moodle: added isset() test
101 if ( $x %
2 ) { $badStr[$x] = '<pre>'.$badStr[$x].'</pre>'; } else {
102 $goodStr = preg_split( "/<\/?textarea[^>]*>/i", $badStr[$x] );
103 for ( $z = 0; isset($goodStr[$z]) && is_string( $goodStr[$z] ); $z++
) { // Moodle: added isset() test
104 if ( $z %
2 ) { $goodStr[$z] = '<textarea>'.$goodStr[$z].'</textarea>'; } else {
105 $goodStr[$z] = str_replace(' ', ' ', $goodStr[$z] );
108 $badStr[$x] = implode('',$goodStr);
112 $goodStr = implode('',$badStr);
114 //remove all options from select inputs
116 $goodStr = preg_replace( "/<option[^>]*>[^<]*/i", '', $goodStr );
118 //replace all tags with their text equivalents
120 $goodStr = preg_replace( "/<(\/title|hr)[^>]*>/i", "\n --------------------\n", $goodStr );
122 $goodStr = preg_replace( "/<(h|div|p)[^>]*>/i", "\n\n", $goodStr );
124 $goodStr = preg_replace( "/<sup[^>]*>/i", '^', $goodStr );
126 $goodStr = preg_replace( "/<(ul|ol|br|dl|dt|table|caption|\/textarea|tr[^>]*>\s*<(td|th))[^>]*>/i", "\n", $goodStr );
128 $goodStr = preg_replace( "/<li[^>]*>/i", "\nยท ", $goodStr );
130 $goodStr = preg_replace( "/<dd[^>]*>/i", "\n\t", $goodStr );
132 $goodStr = preg_replace( "/<(th|td)[^>]*>/i", "\t", $goodStr );
134 // $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>/i", "[LINK: $2$4$6] ", $goodStr ); // Moodle
135 $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>([^<]*)<\/a>/i", "$7 [$2$4$6]", $goodStr );
137 // $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[IMAGE: $2$3$4] ", $goodStr ); // Moodle
138 $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[$2$3$4] ", $goodStr );
140 $goodStr = preg_replace( "/<form[^>]* action=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "\n[FORM: $2$3$4] ", $goodStr );
142 $goodStr = preg_replace( "/<(input|textarea|button|select)[^>]*>/i", "[INPUT] ", $goodStr );
144 //strip all remaining tags (mostly closing tags)
146 $goodStr = strip_tags( $goodStr );
148 //convert HTML entities
150 $goodStr = strtr( $goodStr, array_flip( get_html_translation_table( HTML_ENTITIES
) ) );
152 preg_replace( "/&#(\d+);/me", "chr('$1')", $goodStr );
156 // $goodStr = wordwrap( $goodStr ); // Moodle
157 $goodStr = wordwrap( $goodStr, 78 );
159 //make sure there are no more than 3 linebreaks in a row and trim whitespace
160 $goodStr = preg_replace("/\r\n?|\f/", "\n", $goodStr);
161 $goodStr = preg_replace("/\n(\s*\n){2}/", "\n\n\n", $goodStr);
162 $goodStr = preg_replace("/[ \t]+(\n|$)/", "$1", $goodStr);
163 $goodStr = preg_replace("/^\n*|\n*$/", '', $goodStr);