From 84a9f58b673035691736991d264b15cbfb55e282 Mon Sep 17 00:00:00 2001 From: Marina Glancy Date: Tue, 10 Oct 2017 14:59:31 +0800 Subject: [PATCH] MDL-46269 tool_httpsreplace: minor fixes - don't call curl() from unittests - make sure we don't replace links to http:// urls when the embedded contents from the same domain is present - convert only text or long varchar columns - improve searching performance, use case-insensitive regex/like instead of LOWER - add unittests for the above and also for international domains --- admin/tool/httpsreplace/classes/url_finder.php | 109 ++++++++++------- .../tool/httpsreplace/tests/httpsreplace_test.php | 132 +++++++++++++++++++-- 2 files changed, 188 insertions(+), 53 deletions(-) diff --git a/admin/tool/httpsreplace/classes/url_finder.php b/admin/tool/httpsreplace/classes/url_finder.php index 2ccfeb28b43..af9fc9bc01d 100644 --- a/admin/tool/httpsreplace/classes/url_finder.php +++ b/admin/tool/httpsreplace/classes/url_finder.php @@ -62,44 +62,61 @@ class url_finder { * @param string $table * @param string $column * @param string $domain + * @param string $search search string that has prefix, protocol, domain name and one extra character, + * example1: src="http://host.com/ + * example2: DATA="HTTP://MYDOMAIN.EDU" + * example3: src="HTTP://hello.world? * @return void */ - private function domain_swap($table, $column, $domain) { + protected function domain_swap($table, $column, $domain, $search) { global $DB; $renames = json_decode(get_config('tool_httpsreplace', 'renames'), true); - $search = "http://$domain"; - $replace = "https://$domain"; if (isset($renames[$domain])) { - $replace = 'https://' . $renames[$domain]; + $replace = preg_replace('|http://'.preg_quote($domain).'|i', 'https://' . $renames[$domain], $search); + } else { + $replace = preg_replace('|http://|i', 'https://', $search); } $DB->set_debug(true); - // Note, this search is case sensitive. $DB->replace_all_text($table, $column, $search, $replace); $DB->set_debug(false); } /** + * Returns SQL to be used to match embedded http links in the given column + * + * @param string $columnname name of the column (ready to be used in the SQL query) + * @return array + */ + protected function get_select_search_in_column($columnname) { + global $DB; + + if ($DB->sql_regex_supported()) { + // Database supports regex, use it for better match. + $select = $columnname . ' ' . $DB->sql_regex() . ' ?'; + $params = ["(src|data)\ *=\ *[\\\"\']http://"]; + } else { + // Databases without regex support should use case-insensitive LIKE. + // This will have false positive matches and more results than we need, we'll have to filter them in php. + $select = $DB->sql_like($columnname, '?', false); + $params = ['%=%http://%']; + } + + return [$select, $params]; + } + + /** * Originally forked from core function db_search(). * @param bool $replacing Whether or not to replace the found urls. * @param progress_bar $progress Progress bar keeping track of this process. * @return bool|array If $replacing, return true on success. If not, return hash of http urls to number of times used. */ - private function process($replacing = false, $progress = null) { + protected function process($replacing = false, $progress = null) { global $DB, $CFG; require_once($CFG->libdir.'/filelib.php'); - if ($DB->sql_regex_supported()) { - $regexp = $DB->sql_regex(); - $httpurls = "(src|data)\ *=\ *[\\\"\']http://"; - } else { - // Simpler query for DBs without regex support. - $regexp = "like"; - $httpurls = "%=%http://%"; - } - // TODO: block_instances have HTML content as base64, need to decode then // search, currently just skipped. See MDL-60024. $skiptables = array( @@ -128,14 +145,6 @@ class url_finder { } $urls = array(); - $texttypes = array ( - 'text', - 'mediumtext', - 'longtext', - 'varchar', - 'nvarchar', - 'CLOB', - ); $numberoftables = count($tables); $tablenumber = 0; @@ -150,31 +159,39 @@ class url_finder { if ($columns = $DB->get_columns($table)) { foreach ($columns as $column) { - if (in_array($column->type, $texttypes)) { + // Only convert columns that are either text or long varchar. + if ($column->meta_type == 'X' || ($column->meta_type == 'C' && $column->max_length > 255)) { $columnname = $column->name; - $select = "LOWER($columnname) $regexp ?"; - $rs = $DB->get_recordset_select($table, $select, [$httpurls]); + $columnnamequoted = $DB->get_manager()->generator->getEncQuoted($columnname); + list($select, $params) = $this->get_select_search_in_column($columnnamequoted); + $rs = $DB->get_recordset_select($table, $select, $params, '', $columnnamequoted); $found = array(); foreach ($rs as $record) { // Regex to match src=http://etc. and data=http://etc.urls. // Standard warning on expecting regex to perfectly parse HTML // read http://stackoverflow.com/a/1732454 for more info. - $regex = '#(src|data)\ *=\ *[\'\"]http://[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))[\'\"]#i'; + $regex = '#((src|data)\ *=\ *[\'\"])(http://)([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))[\'\"]#i'; preg_match_all($regex, $record->$columnname, $match); - foreach ($match[0] as $url) { - if (strpos($url, $CFG->wwwroot) !== false) { + foreach ($match[0] as $i => $fullmatch) { + if (strpos($fullmatch, $CFG->wwwroot) !== false) { + continue; + } + $prefix = $match[1][$i]; + $protocol = $match[3][$i]; + $url = $protocol . $match[4][$i]; + $host = \core_text::strtolower(parse_url($url, PHP_URL_HOST)); + if (empty($host)) { continue; } if ($replacing) { - $url = substr($url, strpos($url, 'http'), -1); - $host = parse_url($url, PHP_URL_HOST); - $found[] = $host; + // For replace string use: prefix, protocol, host and one extra character. + $found[$prefix . substr($url, 0, strlen($host) + 8)] = $host; } else { $entry["table"] = $table; $entry["columnname"] = $columnname; - $entry["url"] = str_replace(array("'", '"'), "", substr($url, ((int) strpos($url, "=") + 1) )); - $entry["host"] = parse_url($entry["url"], PHP_URL_HOST); + $entry["url"] = $url; + $entry["host"] = $host; $entry["raw"] = $record->$columnname; $entry["ssl"] = ''; $urls[] = $entry; @@ -184,9 +201,8 @@ class url_finder { $rs->close(); if ($replacing) { - $found = array_unique($found); - foreach ($found as $domain) { - $this->domain_swap($table, $column, $domain); + foreach ($found as $search => $domain) { + $this->domain_swap($table, $column, $domain, $search); } } } @@ -209,11 +225,7 @@ class url_finder { $sslfailures = array(); foreach ($uniquedomains as $domain) { - $url = "https://$domain/"; - $curl = new \curl(); - $curl->head($url); - $info = $curl->get_info(); - if (empty($info['http_code']) or ($info['http_code'] >= 400)) { + if (!$this->check_domain_availability("https://$domain/")) { $sslfailures[] = $domain; } } @@ -233,4 +245,17 @@ class url_finder { } return $results; } + + /** + * Check if url is available (GET request returns 200) + * + * @param string $url + * @return bool + */ + protected function check_domain_availability($url) { + $curl = new \curl(); + $curl->head($url); + $info = $curl->get_info(); + return !empty($info['http_code']) && $info['http_code'] == 200; + } } diff --git a/admin/tool/httpsreplace/tests/httpsreplace_test.php b/admin/tool/httpsreplace/tests/httpsreplace_test.php index 07bb5ae216f..c10ea2f658d 100644 --- a/admin/tool/httpsreplace/tests/httpsreplace_test.php +++ b/admin/tool/httpsreplace/tests/httpsreplace_test.php @@ -54,6 +54,11 @@ class httpsreplace_test extends \advanced_testcase { "outputregex" => '/UPDATE/', "expectedcontent" => '', ], + "Test image from a site with international name should be replaced" => [ + "content" => '', + "outputregex" => '/UPDATE/', + "expectedcontent" => '', + ], "Link that is from this site should be replaced" => [ "content" => '', "outputregex" => '/UPDATE/', @@ -79,10 +84,17 @@ class httpsreplace_test extends \advanced_testcase { "outputregex" => '/UPDATE/', "expectedcontent" => '', ], + "URL should be case insensitive" => [ + "content" => '', + "outputregex" => '/UPDATE/', + "expectedcontent" => '', + ], "More params should not interfere" => [ - "content" => 'A picture

', + "content" => 'A picture

', "outputregex" => '/UPDATE/', - "expectedcontent" => 'A picture

', + "expectedcontent" => 'A picture

', ], "Broken URL should not be changed" => [ "content" => '', @@ -90,9 +102,18 @@ class httpsreplace_test extends \advanced_testcase { "expectedcontent" => '', ], "Link URL should not be changed" => [ - "content" => '' . $this->getExternalTestFileUrl('/test.png', false) . '', + "content" => '' . + $this->getExternalTestFileUrl('/test.png', false) . '', "outputregex" => '/^$/', - "expectedcontent" => '' . $this->getExternalTestFileUrl('/test.png', false) . '', + "expectedcontent" => '' . + $this->getExternalTestFileUrl('/test.png', false) . '', + ], + "Test image from another site should be replaced but link should not" => [ + "content" => '', + "outputregex" => '/UPDATE/', + "expectedcontent" => '', ], ]; } @@ -110,7 +131,7 @@ class httpsreplace_test extends \advanced_testcase { $this->resetAfterTest(); $this->expectOutputRegex($ouputregex); - $finder = new \tool_httpsreplace\url_finder(); + $finder = new tool_httpreplace_url_finder_test(); $generator = $this->getDataGenerator(); $course = $generator->create_course((object) [ @@ -181,7 +202,7 @@ class httpsreplace_test extends \advanced_testcase { public function test_http_link_stats($content, $domain, $expectedcount) { $this->resetAfterTest(); - $finder = new \tool_httpsreplace\url_finder(); + $finder = new tool_httpreplace_url_finder_test(); $generator = $this->getDataGenerator(); $course = $generator->create_course((object) [ @@ -202,7 +223,7 @@ class httpsreplace_test extends \advanced_testcase { $this->resetAfterTest(); $this->expectOutputRegex('/^$/'); - $finder = new \tool_httpsreplace\url_finder(); + $finder = new tool_httpreplace_url_finder_test(); $generator = $this->getDataGenerator(); $course = $generator->create_course((object) [ @@ -234,7 +255,7 @@ class httpsreplace_test extends \advanced_testcase { $CFG->wwwroot = preg_replace('/^https:/', 'http:', $CFG->wwwroot); $this->expectOutputRegex('/^$/'); - $finder = new \tool_httpsreplace\url_finder(); + $finder = new tool_httpreplace_url_finder_test(); $generator = $this->getDataGenerator(); $course = $generator->create_course((object) [ @@ -257,7 +278,7 @@ class httpsreplace_test extends \advanced_testcase { set_config('test_upgrade_http_links', ''); - $finder = new \tool_httpsreplace\url_finder(); + $finder = new tool_httpreplace_url_finder_test(); ob_start(); $results = $finder->upgrade_http_links(); $output = ob_get_contents(); @@ -283,11 +304,11 @@ class httpsreplace_test extends \advanced_testcase { set_config('renames', json_encode($renames), 'tool_httpsreplace'); - $finder = new \tool_httpsreplace\url_finder(); + $finder = new tool_httpreplace_url_finder_test(); $generator = $this->getDataGenerator(); $course = $generator->create_course((object) [ - 'summary' => '