Merge branch 'MDL-40816_m25' of git://github.com/markn86/moodle into MOODLE_25_STABLE
[moodle.git] / repository / url / locallib.php
blob042350b4951f74bc90a0566666ad01939c370101
1 <?php
3 /**
4 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
5 * All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
19 * * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
20 * the names of its contributors may be used to endorse or promote
21 * products derived from this software without specific prior
22 * written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
28 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
34 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
35 * OF SUCH DAMAGE.
39 * This is a BSD License approved by the Open Source Initiative (OSI).
40 * See: http://www.opensource.org/licenses/bsd-license.php
/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" (reference resolution)
 * algorithm from the RFC3986 specification for URLs.
 *
 * Paths are handled byte-wise.  That is safe for UTF-8 text because the
 * "/" byte never occurs inside a multi-byte UTF-8 sequence.
 *
 * Parameters:
 *  baseUrl      the absolute base URL.
 *
 *  relativeUrl  the relative URL to convert.
 *
 * Return values:
 *  An absolute URL that combines parts of the base and relative
 *  URLs, or FALSE if the base URL is not absolute or if either
 *  URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    // Tolerate a parser result that carries no components at all.
    if ( !is_array( $r ) )
        $r = array( );

    // "Has a path" means set and not the empty string.  empty() must not
    // be used here because it also rejects the valid path "0".
    $hasPath = isset( $r['path'] ) && $r['path'] !== '';

    // If relative URL has a scheme, clean path and return.
    if ( !empty( $r['scheme'] ) )
    {
        if ( $hasPath && $r['path'][0] === '/' )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( !is_array( $b ) || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return FALSE;
    $r['scheme'] = $b['scheme'];
    $basePath    = isset( $b['path'] ) ? $b['path'] : '';

    // If relative URL has an authority, clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( $hasPath )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Otherwise the authority comes entirely from the base URL.
    unset( $r['port'], $r['user'], $r['pass'] );
    $r['host'] = $b['host'];
    foreach ( array( 'port', 'user', 'pass' ) as $key )
    {
        if ( isset( $b[$key] ) )
            $r[$key] = $b[$key];
    }

    // If relative URL has no path, use base path (and the base query,
    // unless the relative URL brings its own).
    if ( !$hasPath )
    {
        if ( $basePath !== '' )
            $r['path'] = $basePath;
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return join_url( $r );
    }

    // If relative URL path doesn't start with /, merge with base path:
    // everything in the base path up to (not including) its last "/".
    if ( $r['path'][0] !== '/' )
    {
        $slash = strrpos( $basePath, '/' );
        $base  = ( $slash === FALSE ) ? '' : substr( $basePath, 0, $slash );
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments( $r['path'] );
    return join_url( $r );
}
/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function follows the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs, with one deliberate difference
 * kept from the original implementation: empty segments ("//") are
 * collapsed as well.
 *
 * The path is split byte-wise on "/".  That is safe for UTF-8 text
 * because the "/" byte never occurs inside a multi-byte sequence.
 *
 * Parameters:
 *  path    the path to filter
 *
 * Return values:
 *  The filtered path with "." and ".." removed.  A path that ends in
 *  "/", "/." or "/.." keeps a trailing "/", as RFC3986 requires.
 *  An empty path is returned unchanged.
 */
function url_remove_dot_segments( $path )
{
    $path = (string) $path;
    if ( $path === '' )
        return '';

    $inSegs  = explode( '/', $path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg === '' || $seg === '.' )
            continue;
        if ( $seg === '..' )
            array_pop( $outSegs );      // no-op when already at the root
        else
            $outSegs[] = $seg;
    }

    $outPath = implode( '/', $outSegs );
    if ( $path[0] === '/' )
        $outPath = '/' . $outPath;

    // The result names a directory when the input ended in "/", "/." or
    // "/..".  A lone "." or ".." (no slash at all) does not qualify.
    $last            = $inSegs[count( $inSegs ) - 1];
    $endsAsDirectory = count( $inSegs ) > 1 &&
        ( $last === '' || $last === '.' || $last === '..' );
    if ( $endsAsDirectory && $outPath !== '/' )
        $outPath .= '/';
    return $outPath;
}
/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *  URI-reference   = URI
 *                  / relative-ref
 *
 *  URI             = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  hier-part       = "//" authority path-abempty
 *                  / path-absolute
 *                  / path-rootless
 *                  / path-empty
 *
 *  relative-part   = "//" authority path-abempty
 *                  / path-absolute
 *                  / path-noscheme
 *                  / path-empty
 *
 *  authority       = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *  scheme
 *      The name of a method used to interpret the rest of
 *      the URL.  Examples:  "http", "https", "mailto", "file".
 *
 *  authority
 *      The name of the authority governing the URL's name
 *      space.  Examples:  "example.com", "user@example.com",
 *      "example.com:80", "user:password@example.com:80".
 *
 *      The authority may include a host name, port number,
 *      user name, and password.
 *
 *      The host may be a name, an IPv4 numeric address, or
 *      an IPv6 numeric address.
 *
 *  path
 *      The hierarchical path to the URL's resource.
 *      Examples:  "/index.htm", "/scripts/page.php".
 *
 *  query
 *      The data for a query.  Examples:  "?search=google.com".
 *
 *  fragment
 *      The name of a secondary resource relative to that named
 *      by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *  "scheme"    The scheme, such as "http" (always lower-cased).
 *  "host"      The host name, IPv4, or IPv6 address (IPv6 without
 *              the surrounding brackets).
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query (without the leading "?").
 *  "fragment"  The fragment (without the leading "#").
 *
 * One or more of these may not be present, depending upon the URL.
 * A URL in which no component is recognised (such as the empty string)
 * yields an empty array.
 *
 * Optionally, the "user", "pass", "host" (if matched as a host name),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *  url     the URL to parse.
 *
 *  decode  an optional boolean flag selecting whether
 *          to decode percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *  the associative array of URL parts, or FALSE if the URL is
 *  too malformed to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
    // Character sets from RFC3986.
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar        = $xunressub . ':@% ';

    // Scheme from RFC3986.  The hyphen is escaped so that "+-." is not
    // read as a character range (which would also admit ",").
    $xscheme       = '([a-zA-Z][a-zA-Z\d+\-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo     = '(([' . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    // The hyphen must be escaped: PCRE2 (PHP 7.3 and later) refuses to
    // compile an unescaped "-" that follows "\d" inside a class.
    $xhost_name    = '([a-zA-Z\d\-.%]+)';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                     '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                     '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl          = '^(' . $xscheme . ':)?' . $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
        return FALSE;

    // preg_match() drops trailing groups that did not take part in the
    // match.  Pad the result so that all 30 groups can be read directly;
    // an unmatched group is then simply the empty string.
    $m += array_fill( 0, 31, '' );

    // Values are compared against '' rather than tested with empty(),
    // because empty() would also discard the legitimate value "0".
    $parts      = array( );
    $hostIsName = FALSE;

    if ( $m[2] !== '' )         $parts['scheme']   = strtolower( $m[2] );

    if ( $m[7] !== '' )         $parts['user']     = $m[9];   // "userinfo@" present
    if ( $m[10] !== '' )        $parts['pass']     = $m[11];  // ":password" present

    if ( $m[13] !== '' )
    {
        $parts['host'] = $m[13];                              // host name
        $hostIsName    = TRUE;
    }
    else if ( $m[14] !== '' )   $parts['host']     = $m[14];  // IPv4
    else if ( $m[16] !== '' )   $parts['host']     = $m[16];  // IPv6, brackets stripped
    else if ( $m[5] !== '' )    $parts['host']     = '';      // "//" with an empty host
    if ( $m[17] !== '' )        $parts['port']     = $m[18];

    if ( $m[19] !== '' )        $parts['path']     = $m[19];  // path after an authority
    else if ( $m[21] !== '' )   $parts['path']     = $m[21];  // absolute path
    else if ( $m[25] !== '' )   $parts['path']     = $m[25];  // relative path

    if ( $m[27] !== '' )        $parts['query']    = $m[28];
    if ( $m[29] !== '' )        $parts['fragment'] = $m[30];

    if ( !$decode )
        return $parts;

    foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
    {
        if ( isset( $parts[$key] ) )
            $parts[$key] = rawurldecode( $parts[$key] );
    }
    // Only a host matched as a name is decoded; IP literals are left alone.
    if ( $hostIsName )
        $parts['host'] = rawurldecode( $parts['host'] );
    return $parts;
}
/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.  A "pass" is only used when a "user" is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  Slashes in the "path" are left alone.
 * Note that the "query" and "fragment" are encoded as a whole, so
 * separators such as "=" and "&" inside them are encoded too.
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 *  parts   an associative array of strings containing the
 *          individual parts of a URL.
 *
 *  encode  an optional boolean flag selecting whether
 *          to do percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *  Returns the assembled URL string.  The string is an absolute
 *  URL if a scheme is supplied, and a relative URL if not.  An
 *  empty string is returned if the $parts array does not contain
 *  any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
    // A path counts as present when set and non-empty.  empty() is not
    // used because it would also discard the valid path "0".
    $hasPath = isset( $parts['path'] ) && $parts['path'] !== '';

    if ( $encode )
    {
        foreach ( array( 'user', 'pass', 'query', 'fragment' ) as $key )
        {
            if ( isset( $parts[$key] ) )
                $parts[$key] = rawurlencode( $parts[$key] );
        }
        // Leave IP literals (bracketed or bare) untouched.  The whole host
        // must look like one; both alternatives are anchored at both ends.
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(?:\[[\da-f.:]+\]|[\da-f.:]+)$!i', $parts['host'] ) )
            $parts['host'] = rawurlencode( $parts['host'] );
        // Encode the path, then restore the segment separators.
        if ( $hasPath )
            $parts['path'] = str_replace( '%2F', '/',
                rawurlencode( $parts['path'] ) );
    }

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']'; // IPv6
        else
            $url .= $parts['host'];             // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        // After an authority the path must start with a slash.
        if ( $hasPath && $parts['path'][0] !== '/' )
            $url .= '/';
    }
    if ( $hasPath )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}
/**
 * This function encodes a URL to form a URL which is properly
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the
 * disallowed characters in the URL with their respective percent
 * encodings. Already encoded characters are not encoded again,
 * such as '%20' is not encoded to '%2520'.
 *
 * It works by percent-encoding the whole string and then turning the
 * escapes of the reserved characters, and of "%" itself, back into
 * the literal characters.
 *
 * Parameters:
 *  url     the url to encode.
 *
 * Return values:
 *  Returns the encoded URL string.
 */
function encode_url($url) {
    // Escapes to undo, and the literal each one stands for, as two
    // parallel lists.  "%25" comes last, so the "%" it restores is never
    // looked at again by another pattern.
    $escapes = array(
        '!%3A!ui', '!%2F!ui', '!%3F!ui', '!%23!ui', '!%5B!ui',
        '!%5D!ui', '!%40!ui', '!%21!ui', '!%24!ui', '!%26!ui',
        '!%27!ui', '!%28!ui', '!%29!ui', '!%2A!ui', '!%2B!ui',
        '!%2C!ui', '!%3B!ui', '!%3D!ui', '!%25!ui',
    );
    $literals = array(
        ":", "/", "?", "#", "[",
        "]", "@", "!", "$", "&",
        "'", "(", ")", "*", "+",
        ",", ";", "=", "%",
    );

    $encoded = rawurlencode($url);
    return preg_replace($escapes, $literals, $encoded);
}
/**
 * Extract URLs from a web page.
 *
 * URLs are extracted from a long list of tags and attributes as defined
 * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 * URLs are also extracted from tags and attributes that are common
 * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 * and from WML 1.3 and 2.0.
 *
 * The function returns an associative array of associative arrays of
 * arrays of URLs.  The outermost array's keys are the tag (element) name,
 * such as "a" for <a> or "img" for <img>.  The values for these entries
 * are associative arrays where the keys are attribute names for those
 * tags, such as "href" for <a href="...">.  Finally, the values for
 * those arrays are URLs found in those tags and attributes throughout
 * the text.
 *
 * Two further keys may appear:  "meta" => "http-equiv" holds URLs taken
 * from <meta http-equiv="..." content="..."> elements, and "style" holds
 * the result of extract_css_urls() for style attributes and <style> bodies.
 *
 * This is a pattern-based scan, not an HTML parser.  Attribute values
 * may be double-quoted, single-quoted, or unquoted, and a tag may span
 * several lines.  The "archive" attribute is a URL list and is split:
 * on whitespace for <object>, on commas for <applet>.
 *
 * Parameters:
 *  text        the UTF-8 text to scan
 *
 * Return values:
 *  an associative array where keys are tags and values are an
 *  associative array where keys are attributes and values are
 *  an array of URLs.  An empty array is returned when the text
 *  contains no tags.
 *
 * See:
 *  http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 */
function extract_html_urls( $text )
{
    // (element, attribute) pairs to look for.  The order is significant:
    // it determines the key order of the returned array.
    $match_elements = array(
        // HTML 2.0
        array( 'a',            'href' ),
        array( 'a',            'urn' ),
        array( 'base',         'href' ),
        array( 'form',         'action' ),
        array( 'img',          'src' ),
        array( 'link',         'href' ),

        // HTML 3.2
        array( 'applet',       'code' ),
        array( 'applet',       'codebase' ),
        array( 'area',         'href' ),
        array( 'body',         'background' ),
        array( 'img',          'usemap' ),
        array( 'input',        'src' ),

        // HTML 4.01
        array( 'applet',       'archive' ),
        array( 'applet',       'object' ),
        array( 'blockquote',   'cite' ),
        array( 'del',          'cite' ),
        array( 'frame',        'longdesc' ),
        array( 'frame',        'src' ),
        array( 'head',         'profile' ),
        array( 'iframe',       'longdesc' ),
        array( 'iframe',       'src' ),
        array( 'img',          'longdesc' ),
        array( 'input',        'usemap' ),
        array( 'ins',          'cite' ),
        array( 'object',       'archive' ),
        array( 'object',       'classid' ),
        array( 'object',       'codebase' ),
        array( 'object',       'data' ),
        array( 'object',       'usemap' ),
        array( 'q',            'cite' ),
        array( 'script',       'src' ),

        // HTML 5.0
        array( 'audio',        'src' ),
        array( 'command',      'icon' ),
        array( 'embed',        'src' ),
        array( 'event-source', 'src' ),
        array( 'html',         'manifest' ),
        array( 'source',       'src' ),
        array( 'video',        'src' ),
        array( 'video',        'poster' ),

        // Extensions
        array( 'bgsound',      'src' ),
        array( 'body',         'credits' ),
        array( 'body',         'instructions' ),
        array( 'body',         'logo' ),
        array( 'div',          'href' ),
        array( 'div',          'src' ),
        array( 'embed',        'code' ),
        array( 'embed',        'pluginspage' ),
        array( 'html',         'background' ),
        array( 'ilayer',       'src' ),
        array( 'img',          'dynsrc' ),
        array( 'img',          'lowsrc' ),
        array( 'input',        'dynsrc' ),
        array( 'input',        'lowsrc' ),
        array( 'table',        'background' ),
        array( 'td',           'background' ),
        array( 'th',           'background' ),
        array( 'layer',        'src' ),
        array( 'xml',          'src' ),

        // Forms 2.0
        array( 'button',       'action' ),
        array( 'datalist',     'data' ),
        array( 'form',         'data' ),
        array( 'input',        'action' ),
        array( 'select',       'data' ),

        // XHTML
        array( 'html',         'xmlns' ),

        // WML 1.3
        array( 'access',       'path' ),
        array( 'card',         'onenterforward' ),
        array( 'card',         'onenterbackward' ),
        array( 'card',         'ontimer' ),
        array( 'go',           'href' ),
        array( 'option',       'onpick' ),
        array( 'template',     'onenterforward' ),
        array( 'template',     'onenterbackward' ),
        array( 'template',     'ontimer' ),

        // WML 2.0
        array( 'wml',          'xmlns' ),
    );

    // http-equiv values whose content attribute carries a URL.
    $match_metas = array(
        'content-base',
        'content-location',
        'referer',
        'location',
        'refresh',
    );

    $urls = array( );

    // Extract all elements (the text between "<" and ">" of opening tags).
    if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
        return $urls;
    $elements = $matches[1];

    // "= value", where the value is double-quoted, single-quoted or bare.
    // The branch-reset group (?|...) makes every alternative capture into
    // group 1, so the value is always $match[1].
    $value_pattern = '\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s]*))';

    // Match elements and attributes
    foreach ( $match_elements as $pair )
    {
        list( $name, $attr ) = $pair;

        // The look-behind keeps "src" from matching inside "dynsrc" or
        // "data-src"; the "s" modifier lets a tag span several lines.
        $pattern = '/^' . preg_quote( $name, '/' ) . '\s.*(?<![\w:-])' .
            preg_quote( $attr, '/' ) . $value_pattern . '/isu';

        // Only "archive" holds a list of URLs.
        if ( $attr !== 'archive' )
            $split_pattern = NULL;          // Single URL
        else if ( $name === 'object' )
            $split_pattern = '/\s+/u';      // Space-separated URL list
        else
            $split_pattern = '/\s*,\s*/u';  // Comma-separated URL list

        foreach ( $elements as $element )
        {
            if ( !preg_match( $pattern, $element, $match ) )
                continue;
            $value = isset( $match[1] ) ? $match[1] : '';
            if ( $split_pattern === NULL )
            {
                $urls[$name][$attr][] = $value;
                continue;
            }
            $list = preg_split( $split_pattern, $value, -1, PREG_SPLIT_NO_EMPTY );
            foreach ( $list as $item )
                $urls[$name][$attr][] = $item;
        }
    }

    // Match meta http-equiv elements.  First collect every <meta> element
    // that has a content attribute, then test each against the names above.
    $content_pattern = '/content' . $value_pattern . '/isu';
    $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
    $metas = array( );
    foreach ( $elements as $element )
    {
        if ( preg_match( '/^meta/iu', $element ) &&
            preg_match( $content_pattern, $element, $match ) )
            $metas[] = array( $element, isset( $match[1] ) ? $match[1] : '' );
    }
    foreach ( $match_metas as $match_meta )
    {
        $attr_pattern = '/http-equiv\s*=\s*["\']?' .
            preg_quote( $match_meta, '/' ) . '["\']?/iu';
        foreach ( $metas as $meta )
        {
            list( $element, $content ) = $meta;
            if ( !preg_match( $attr_pattern, $element ) )
                continue;
            if ( $match_meta !== 'refresh' )
                $urls['meta']['http-equiv'][] = $content;
            else if ( preg_match( $refresh_pattern, $content, $match ) )
                // "5; url=http://..." - keep what follows the delay.
                $urls['meta']['http-equiv'][] = isset( $match[2] ) ? $match[2] : '';
        }
    }

    // Match style attributes
    $style_urls    = array( );
    $style_pattern = '/style' . $value_pattern . '/isu';
    foreach ( $elements as $element )
    {
        if ( !preg_match( $style_pattern, $element, $match ) )
            continue;
        $found = extract_css_urls( isset( $match[1] ) ? $match[1] : '' );
        if ( !empty( $found ) )
            $style_urls = array_merge_recursive( $style_urls, $found );
    }

    // Match style bodies
    if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
    {
        foreach ( $style_bodies[1] as $style_body )
        {
            $found = extract_css_urls( $style_body );
            if ( !empty( $found ) )
                $style_urls = array_merge_recursive( $style_urls, $found );
        }
    }

    // The "style" key is only present when CSS URLs were found.
    if ( !empty( $style_urls ) )
        $urls['style'] = $style_urls;

    return $urls;
}
/**
 * Extract URLs from UTF-8 CSS text.
 *
 * URLs within @import statements and url() property functions are extracted
 * and returned in an associative array of arrays.  Array keys indicate
 * the use context for the URL, including:
 *
 *  "import"
 *  "property"
 *
 * Each value in the associative array is an array of URLs.  Backslash
 * escapes inside a URL are resolved (for example "a\ b" becomes "a b").
 * All quoted @import URLs come first in "import", followed by all
 * @import url(...) URLs.
 *
 * Parameters:
 *  text        the UTF-8 text to scan
 *
 * Return values:
 *  an associative array of arrays of URLs.  A key is only present
 *  when at least one URL of that kind was found.
 *
 * See:
 *  http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 */
function extract_css_urls( $text )
{
    $urls = array( );

    // A URL is a run of ordinary characters and backslash escapes.  The
    // possessive quantifiers (++ and *+) stop the regex engine from
    // re-trying every way of splitting a long run when no closing quote
    // or parenthesis follows.  Without them one unterminated "url(" can
    // exhaust the backtracking limit and make the whole scan fail.
    // Possessive matching is safe because nothing this group consumes
    // can be needed by the delimiter that follows it.
    $url_pattern     = '((?:[^\\\\\'", \(\)]++|\\\\.)*+)';
    $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';

    // Group 1: @import '...' or @import "..."
    // Group 2: @import url(...)
    // Group 3: url(...) used as a property value
    $pattern = '/@import\s*[\'"]' . $url_pattern . '[\'"]' .
               '|@import\s*'      . $urlfunc_pattern .
               '|'                . $urlfunc_pattern . '/iu';
    if ( !preg_match_all( $pattern, $text, $matches ) )
        return $urls;

    $contexts = array( 1 => 'import', 2 => 'import', 3 => 'property' );
    foreach ( $contexts as $group => $context )
    {
        foreach ( $matches[$group] as $match )
        {
            // Groups of the alternatives that did not match are empty.
            // Compare with '' so that a URL of "0" is not discarded.
            if ( $match === '' )
                continue;
            $urls[$context][] = preg_replace( '/\\\\(.)/u', '\\1', $match );
        }
    }

    return $urls;
}