repository/url/locallib.php

   1 <?php
   2
   3 /**
   4  * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  *      * Redistributions of source code must retain the above copyright
  12  *        notice, this list of conditions and the following disclaimer.
  13  *
  14  *      * Redistributions in binary form must reproduce the above
  15  *        copyright notice, this list of conditions and the following
  16  *        disclaimer in the documentation and/or other materials provided
  17  *        with the distribution.
  18  *
  19  *      * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
  20  *        the names of its contributors may be used to endorse or promote
  21  *        products derived from this software without specific prior
  22  *        written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  28  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
  35  * OF SUCH DAMAGE.
  36  */
  37
  38 /*
  39  * This is a BSD License approved by the Open Source Initiative (OSI).
  40  * See:  http://www.opensource.org/licenses/bsd-license.php
  41  */
  42
  43 /**
  44  * Combine a base URL and a relative URL to produce a new
  45  * absolute URL.  The base URL is often the URL of a page,
  46  * and the relative URL is a URL embedded on that page.
  47  *
  48  * This function implements the "absolutize" algorithm from
  49  * the RFC3986 specification for URLs.
  50  *
  51  * This function supports multi-byte characters with the UTF-8 encoding,
  52  * per the URL specification.
  53  *
  54  * Parameters:
  55  *      baseUrl         the absolute base URL.
  56  *
  57  *      url             the relative URL to convert.
  58  *
  59  * Return values:
  60  *      An absolute URL that combines parts of the base and relative
  61  *      URLs, or FALSE if the base URL is not absolute or if either
  62  *      URL cannot be parsed.
  63  */
  64 function url_to_absolute( $baseUrl, $relativeUrl )
  65 {
  66         // If relative URL has a scheme, clean path and return.
  67         $r = split_url( $relativeUrl );
  68         if ( $r === FALSE )
  69                 return FALSE;
  70         if ( !empty( $r['scheme'] ) )
  71         {
  72                 if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
  73                         $r['path'] = url_remove_dot_segments( $r['path'] );
  74                 return join_url( $r );
  75         }
  76
  77         // Make sure the base URL is absolute.
  78         $b = split_url( $baseUrl );
  79         if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
  80                 return FALSE;
  81         $r['scheme'] = $b['scheme'];
  82     if (empty($b['path'])) {
  83         $b['path'] = '';
  84     }
  85
  86         // If relative URL has an authority, clean path and return.
  87         if ( isset( $r['host'] ) )
  88         {
  89                 if ( !empty( $r['path'] ) )
  90                         $r['path'] = url_remove_dot_segments( $r['path'] );
  91                 return join_url( $r );
  92         }
  93         unset( $r['port'] );
  94         unset( $r['user'] );
  95         unset( $r['pass'] );
  96
  97         // Copy base authority.
  98         $r['host'] = $b['host'];
  99         if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
 100         if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
 101         if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];
 102
 103         // If relative URL has no path, use base path
 104         if ( empty( $r['path'] ) )
 105         {
 106                 if ( !empty( $b['path'] ) )
 107                         $r['path'] = $b['path'];
 108                 if ( !isset( $r['query'] ) && isset( $b['query'] ) )
 109                         $r['query'] = $b['query'];
 110                 return join_url( $r );
 111         }
 112
 113         // If relative URL path doesn't start with /, merge with base path
 114         if ( $r['path'][0] != '/' )
 115         {
 116                 $base = mb_strrchr( $b['path'], '/', TRUE, 'UTF-8' );
 117                 if ( $base === FALSE ) $base = '';
 118                 $r['path'] = $base . '/' . $r['path'];
 119         }
 120         $r['path'] = url_remove_dot_segments( $r['path'] );
 121         return join_url( $r );
 122 }
 123
 124 /**
 125  * Filter out "." and ".." segments from a URL's path and return
 126  * the result.
 127  *
 128  * This function implements the "remove_dot_segments" algorithm from
 129  * the RFC3986 specification for URLs.
 130  *
 131  * This function supports multi-byte characters with the UTF-8 encoding,
 132  * per the URL specification.
 133  *
 134  * Parameters:
 135  *      path    the path to filter
 136  *
 137  * Return values:
 138  *      The filtered path with "." and ".." removed.
 139  */
 140 function url_remove_dot_segments( $path )
 141 {
 142         // multi-byte character explode
 143         $inSegs  = preg_split( '!/!u', $path );
 144         $outSegs = array( );
 145         foreach ( $inSegs as $seg )
 146         {
 147                 if ( $seg == '' || $seg == '.')
 148                         continue;
 149                 if ( $seg == '..' )
 150                         array_pop( $outSegs );
 151                 else
 152                         array_push( $outSegs, $seg );
 153         }
 154         $outPath = implode( '/', $outSegs );
 155         if ( $path[0] == '/' )
 156                 $outPath = '/' . $outPath;
 157         // compare last multi-byte character against '/'
 158         if ( $outPath != '/' &&
 159                 (mb_strlen($path)-1) == mb_strrpos( $path, '/', 'UTF-8' ) )
 160                 $outPath .= '/';
 161         return $outPath;
 162 }
 163
 164 /**
 165  * This function parses an absolute or relative URL and splits it
 166  * into individual components.
 167  *
 168  * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 169  * A portion of the ABNFs are repeated here:
 170  *
 171  *      URI-reference   = URI
 172  *                      / relative-ref
 173  *
 174  *      URI             = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 175  *
 176  *      relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 177  *
 178  *      hier-part       = "//" authority path-abempty
 179  *                      / path-absolute
 180  *                      / path-rootless
 181  *                      / path-empty
 182  *
 183  *      relative-part   = "//" authority path-abempty
 184  *                      / path-absolute
 185  *                      / path-noscheme
 186  *                      / path-empty
 187  *
 188  *      authority       = [ userinfo "@" ] host [ ":" port ]
 189  *
 190  * So, a URL has the following major components:
 191  *
 192  *      scheme
 193  *              The name of a method used to interpret the rest of
 194  *              the URL.  Examples:  "http", "https", "mailto", "file'.
 195  *
 196  *      authority
 197  *              The name of the authority governing the URL's name
 198  *              space.  Examples:  "example.com", "user@example.com",
 199  *              "example.com:80", "user:password@example.com:80".
 200  *
 201  *              The authority may include a host name, port number,
 202  *              user name, and password.
 203  *
 204  *              The host may be a name, an IPv4 numeric address, or
 205  *              an IPv6 numeric address.
 206  *
 207  *      path
 208  *              The hierarchical path to the URL's resource.
 209  *              Examples:  "/index.htm", "/scripts/page.php".
 210  *
 211  *      query
 212  *              The data for a query.  Examples:  "?search=google.com".
 213  *
 214  *      fragment
 215  *              The name of a secondary resource relative to that named
 216  *              by the path.  Examples:  "#section1", "#header".
 217  *
 218  * An "absolute" URL must include a scheme and path.  The authority, query,
 219  * and fragment components are optional.
 220  *
 221  * A "relative" URL does not include a scheme and must include a path.  The
 222  * authority, query, and fragment components are optional.
 223  *
 224  * This function splits the $url argument into the following components
 225  * and returns them in an associative array.  Keys to that array include:
 226  *
 227  *      "scheme"        The scheme, such as "http".
 228  *      "host"          The host name, IPv4, or IPv6 address.
 229  *      "port"          The port number.
 230  *      "user"          The user name.
 231  *      "pass"          The user password.
 232  *      "path"          The path, such as a file path for "http".
 233  *      "query"         The query.
 234  *      "fragment"      The fragment.
 235  *
 236  * One or more of these may not be present, depending upon the URL.
 237  *
 238  * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 239  * "path", "query", and "fragment" may have percent-encoded characters
 240  * decoded.  The "scheme" and "port" cannot include percent-encoded
 241  * characters and are never decoded.  Decoding occurs after the URL has
 242  * been parsed.
 243  *
 244  * Parameters:
 245  *      url             the URL to parse.
 246  *
 247  *      decode          an optional boolean flag selecting whether
 248  *                      to decode percent encoding or not.  Default = TRUE.
 249  *
 250  * Return values:
 251  *      the associative array of URL parts, or FALSE if the URL is
 252  *      too malformed to recognize any parts.
 253  */
 254 function split_url( $url, $decode=FALSE)
 255 {
 256         // Character sets from RFC3986.
 257         $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
 258         $xpchar        = $xunressub . ':@% ';
 259
 260         // Scheme from RFC3986.
 261         $xscheme        = '([a-zA-Z][a-zA-Z\d+-.]*)';
 262
 263         // User info (user + password) from RFC3986.
 264         $xuserinfo     = '((['  . $xunressub . '%]*)' .
 265                          '(:([' . $xunressub . ':%]*))?)';
 266
 267         // IPv4 from RFC3986 (without digit constraints).
 268         $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';
 269
 270         // IPv6 from RFC2732 (without digit and grouping constraints).
 271         $xipv6         = '(\[([a-fA-F\d.:]+)\])';
 272
 273         // Host name from RFC1035.  Technically, must start with a letter.
 274         // Relax that restriction to better parse URL structure, then
 275         // leave host name validation to application.
 276         $xhost_name    = '([a-zA-Z\d-.%]+)';
 277
 278         // Authority from RFC3986.  Skip IP future.
 279         $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
 280         $xport         = '(\d*)';
 281         $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
 282                          '?(:' . $xport . ')?)';
 283
 284         // Path from RFC3986.  Blend absolute & relative for efficiency.
 285         $xslash_seg    = '(/[' . $xpchar . ']*)';
 286         $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
 287         $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
 288         $xpath_abs     = '(/(' . $xpath_rel . ')?)';
 289         $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
 290                          '|' . $xpath_rel . ')';
 291
 292         // Query and fragment from RFC3986.
 293         $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';
 294
 295         // URL.
 296         $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
 297                          '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';
 298
 299
 300         // Split the URL into components.
 301         if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
 302                 return FALSE;
 303
 304         if ( !empty($m[2]) )            $parts['scheme']  = strtolower($m[2]);
 305
 306         if ( !empty($m[7]) ) {
 307                 if ( isset( $m[9] ) )   $parts['user']    = $m[9];
 308                 else                    $parts['user']    = '';
 309         }
 310         if ( !empty($m[10]) )           $parts['pass']    = $m[11];
 311
 312         if ( !empty($m[13]) )           $h=$parts['host'] = $m[13];
 313         else if ( !empty($m[14]) )      $parts['host']    = $m[14];
 314         else if ( !empty($m[16]) )      $parts['host']    = $m[16];
 315         else if ( !empty( $m[5] ) )     $parts['host']    = '';
 316         if ( !empty($m[17]) )           $parts['port']    = $m[18];
 317
 318         if ( !empty($m[19]) )           $parts['path']    = $m[19];
 319         else if ( !empty($m[21]) )      $parts['path']    = $m[21];
 320         else if ( !empty($m[25]) )      $parts['path']    = $m[25];
 321
 322         if ( !empty($m[27]) )           $parts['query']   = $m[28];
 323         if ( !empty($m[29]) )           $parts['fragment']= $m[30];
 324
 325         if ( !$decode )
 326                 return $parts;
 327         if ( !empty($parts['user']) )
 328                 $parts['user']     = rawurldecode( $parts['user'] );
 329         if ( !empty($parts['pass']) )
 330                 $parts['pass']     = rawurldecode( $parts['pass'] );
 331         if ( !empty($parts['path']) )
 332                 $parts['path']     = rawurldecode( $parts['path'] );
 333         if ( isset($h) )
 334                 $parts['host']     = rawurldecode( $parts['host'] );
 335         if ( !empty($parts['query']) )
 336                 $parts['query']    = rawurldecode( $parts['query'] );
 337         if ( !empty($parts['fragment']) )
 338                 $parts['fragment'] = rawurldecode( $parts['fragment'] );
 339         return $parts;
 340 }
 341
 342 /**
 343  * This function joins together URL components to form a complete URL.
 344  *
 345  * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 346  * This function implements the specification's "component recomposition"
 347  * algorithm for combining URI components into a full URI string.
 348  *
 349  * The $parts argument is an associative array containing zero or
 350  * more of the following:
 351  *
 352  *      "scheme"        The scheme, such as "http".
 353  *      "host"          The host name, IPv4, or IPv6 address.
 354  *      "port"          The port number.
 355  *      "user"          The user name.
 356  *      "pass"          The user password.
 357  *      "path"          The path, such as a file path for "http".
 358  *      "query"         The query.
 359  *      "fragment"      The fragment.
 360  *
 361  * The "port", "user", and "pass" values are only used when a "host"
 362  * is present.
 363  *
 364  * The optional $encode argument indicates if appropriate URL components
 365  * should be percent-encoded as they are assembled into the URL.  Encoding
 366  * is only applied to the "user", "pass", "host" (if a host name, not an
 367  * IP address), "path", "query", and "fragment" components.  The "scheme"
 368  * and "port" are never encoded.  When a "scheme" and "host" are both
 369  * present, the "path" is presumed to be hierarchical and encoding
 370  * processes each segment of the hierarchy separately (i.e., the slashes
 371  * are left alone).
 372  *
 373  * The assembled URL string is returned.
 374  *
 375  * Parameters:
 376  *      parts           an associative array of strings containing the
 377  *                      individual parts of a URL.
 378  *
 379  *      encode          an optional boolean flag selecting whether
 380  *                      to do percent encoding or not.  Default = true.
 381  *
 382  * Return values:
 383  *      Returns the assembled URL string.  The string is an absolute
 384  *      URL if a scheme is supplied, and a relative URL if not.  An
 385  *      empty string is returned if the $parts array does not contain
 386  *      any of the needed values.
 387  */
 388 function join_url( $parts, $encode=FALSE)
 389 {
 390         if ( $encode )
 391         {
 392                 if ( isset( $parts['user'] ) )
 393                         $parts['user']     = rawurlencode( $parts['user'] );
 394                 if ( isset( $parts['pass'] ) )
 395                         $parts['pass']     = rawurlencode( $parts['pass'] );
 396                 if ( isset( $parts['host'] ) &&
 397                         !preg_match( '!^(\[[\da-f.:]+\]])|([\da-f.:]+)$!ui', $parts['host'] ) )
 398                         $parts['host']     = rawurlencode( $parts['host'] );
 399                 if ( !empty( $parts['path'] ) )
 400                         $parts['path']     = preg_replace( '!%2F!ui', '/',
 401                                 rawurlencode( $parts['path'] ) );
 402                 if ( isset( $parts['query'] ) )
 403                         $parts['query']    = rawurlencode( $parts['query'] );
 404                 if ( isset( $parts['fragment'] ) )
 405                         $parts['fragment'] = rawurlencode( $parts['fragment'] );
 406         }
 407
 408         $url = '';
 409         if ( !empty( $parts['scheme'] ) )
 410                 $url .= $parts['scheme'] . ':';
 411         if ( isset( $parts['host'] ) )
 412         {
 413                 $url .= '//';
 414                 if ( isset( $parts['user'] ) )
 415                 {
 416                         $url .= $parts['user'];
 417                         if ( isset( $parts['pass'] ) )
 418                                 $url .= ':' . $parts['pass'];
 419                         $url .= '@';
 420                 }
 421                 if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
 422                         $url .= '[' . $parts['host'] . ']';     // IPv6
 423                 else
 424                         $url .= $parts['host'];                 // IPv4 or name
 425                 if ( isset( $parts['port'] ) )
 426                         $url .= ':' . $parts['port'];
 427                 if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
 428                         $url .= '/';
 429         }
 430         if ( !empty( $parts['path'] ) )
 431                 $url .= $parts['path'];
 432         if ( isset( $parts['query'] ) )
 433                 $url .= '?' . $parts['query'];
 434         if ( isset( $parts['fragment'] ) )
 435                 $url .= '#' . $parts['fragment'];
 436         return $url;
 437 }
 438
 439 /**
 440  * This function encodes URL to form a URL which is properly
 441  * percent encoded to replace disallowed characters.
 442  *
 443  * RFC3986 specifies the allowed characters in the URL as well as
 444  * reserved characters in the URL. This function replaces all the
 445  * disallowed characters in the URL with their repective percent
 446  * encodings. Already encoded characters are not encoded again,
 447  * such as '%20' is not encoded to '%2520'.
 448  *
 449  * Parameters:
 450  *      url             the url to encode.
 451  *
 452  * Return values:
 453  *      Returns the encoded URL string.
 454  */
 455 function encode_url($url) {
 456   $reserved = array(
 457     ":" => '!%3A!ui',
 458     "/" => '!%2F!ui',
 459     "?" => '!%3F!ui',
 460     "#" => '!%23!ui',
 461     "[" => '!%5B!ui',
 462     "]" => '!%5D!ui',
 463     "@" => '!%40!ui',
 464     "!" => '!%21!ui',
 465     "$" => '!%24!ui',
 466     "&" => '!%26!ui',
 467     "'" => '!%27!ui',
 468     "(" => '!%28!ui',
 469     ")" => '!%29!ui',
 470     "*" => '!%2A!ui',
 471     "+" => '!%2B!ui',
 472     "," => '!%2C!ui',
 473     ";" => '!%3B!ui',
 474     "=" => '!%3D!ui',
 475     "%" => '!%25!ui',
 476   );
 477
 478   $url = rawurlencode($url);
 479   $url = preg_replace(array_values($reserved), array_keys($reserved), $url);
 480   return $url;
 481 }
 482
 483 /**
 484  * Extract URLs from a web page.
 485  *
 486  * URLs are extracted from a long list of tags and attributes as defined
 487  * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 488  * URLs are also extracted from tags and attributes that are common
 489  * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 490  * and from WML 1.3 and 2.0.
 491  *
 492  * The function returns an associative array of associative arrays of
 493  * arrays of URLs.  The outermost array's keys are the tag (element) name,
 494  * such as "a" for <a> or "img" for <img>.  The values for these entries
 495  * are associative arrays where the keys are attribute names for those
 496  * tags, such as "href" for <a href="...">.  Finally, the values for
 497  * those arrays are URLs found in those tags and attributes throughout
 498  * the text.
 499  *
 500  * Parameters:
 501  *      text            the UTF-8 text to scan
 502  *
 503  * Return values:
 504  *      an associative array where keys are tags and values are an
 505  *      associative array where keys are attributes and values are
 506  *      an array of URLs.
 507  *
 508  * See:
 509  *      http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 510  */
 511 function extract_html_urls( $text )
 512 {
 513         $match_elements = array(
 514                 // HTML
 515                 array('element'=>'a',           'attribute'=>'href'),           // 2.0
 516                 array('element'=>'a',           'attribute'=>'urn'),            // 2.0
 517                 array('element'=>'base',        'attribute'=>'href'),           // 2.0
 518                 array('element'=>'form',        'attribute'=>'action'),         // 2.0
 519                 array('element'=>'img',         'attribute'=>'src'),            // 2.0
 520                 array('element'=>'link',        'attribute'=>'href'),           // 2.0
 521
 522                 array('element'=>'applet',      'attribute'=>'code'),           // 3.2
 523                 array('element'=>'applet',      'attribute'=>'codebase'),       // 3.2
 524                 array('element'=>'area',        'attribute'=>'href'),           // 3.2
 525                 array('element'=>'body',        'attribute'=>'background'),     // 3.2
 526                 array('element'=>'img',         'attribute'=>'usemap'),         // 3.2
 527                 array('element'=>'input',       'attribute'=>'src'),            // 3.2
 528
 529                 array('element'=>'applet',      'attribute'=>'archive'),        // 4.01
 530                 array('element'=>'applet',      'attribute'=>'object'),         // 4.01
 531                 array('element'=>'blockquote',  'attribute'=>'cite'),           // 4.01
 532                 array('element'=>'del',         'attribute'=>'cite'),           // 4.01
 533                 array('element'=>'frame',       'attribute'=>'longdesc'),       // 4.01
 534                 array('element'=>'frame',       'attribute'=>'src'),            // 4.01
 535                 array('element'=>'head',        'attribute'=>'profile'),        // 4.01
 536                 array('element'=>'iframe',      'attribute'=>'longdesc'),       // 4.01
 537                 array('element'=>'iframe',      'attribute'=>'src'),            // 4.01
 538                 array('element'=>'img',         'attribute'=>'longdesc'),       // 4.01
 539                 array('element'=>'input',       'attribute'=>'usemap'),         // 4.01
 540                 array('element'=>'ins',         'attribute'=>'cite'),           // 4.01
 541                 array('element'=>'object',      'attribute'=>'archive'),        // 4.01
 542                 array('element'=>'object',      'attribute'=>'classid'),        // 4.01
 543                 array('element'=>'object',      'attribute'=>'codebase'),       // 4.01
 544                 array('element'=>'object',      'attribute'=>'data'),           // 4.01
 545                 array('element'=>'object',      'attribute'=>'usemap'),         // 4.01
 546                 array('element'=>'q',           'attribute'=>'cite'),           // 4.01
 547                 array('element'=>'script',      'attribute'=>'src'),            // 4.01
 548
 549                 array('element'=>'audio',       'attribute'=>'src'),            // 5.0
 550                 array('element'=>'command',     'attribute'=>'icon'),           // 5.0
 551                 array('element'=>'embed',       'attribute'=>'src'),            // 5.0
 552                 array('element'=>'event-source','attribute'=>'src'),            // 5.0
 553                 array('element'=>'html',        'attribute'=>'manifest'),       // 5.0
 554                 array('element'=>'source',      'attribute'=>'src'),            // 5.0
 555                 array('element'=>'video',       'attribute'=>'src'),            // 5.0
 556                 array('element'=>'video',       'attribute'=>'poster'),         // 5.0
 557
 558                 array('element'=>'bgsound',     'attribute'=>'src'),            // Extension
 559                 array('element'=>'body',        'attribute'=>'credits'),        // Extension
 560                 array('element'=>'body',        'attribute'=>'instructions'),   // Extension
 561                 array('element'=>'body',        'attribute'=>'logo'),           // Extension
 562                 array('element'=>'div',         'attribute'=>'href'),           // Extension
 563                 array('element'=>'div',         'attribute'=>'src'),            // Extension
 564                 array('element'=>'embed',       'attribute'=>'code'),           // Extension
 565                 array('element'=>'embed',       'attribute'=>'pluginspage'),    // Extension
 566                 array('element'=>'html',        'attribute'=>'background'),     // Extension
 567                 array('element'=>'ilayer',      'attribute'=>'src'),            // Extension
 568                 array('element'=>'img',         'attribute'=>'dynsrc'),         // Extension
 569                 array('element'=>'img',         'attribute'=>'lowsrc'),         // Extension
 570                 array('element'=>'input',       'attribute'=>'dynsrc'),         // Extension
 571                 array('element'=>'input',       'attribute'=>'lowsrc'),         // Extension
 572                 array('element'=>'table',       'attribute'=>'background'),     // Extension
 573                 array('element'=>'td',          'attribute'=>'background'),     // Extension
 574                 array('element'=>'th',          'attribute'=>'background'),     // Extension
 575                 array('element'=>'layer',       'attribute'=>'src'),            // Extension
 576                 array('element'=>'xml',         'attribute'=>'src'),            // Extension
 577
 578                 array('element'=>'button',      'attribute'=>'action'),         // Forms 2.0
 579                 array('element'=>'datalist',    'attribute'=>'data'),           // Forms 2.0
 580                 array('element'=>'form',        'attribute'=>'data'),           // Forms 2.0
 581                 array('element'=>'input',       'attribute'=>'action'),         // Forms 2.0
 582                 array('element'=>'select',      'attribute'=>'data'),           // Forms 2.0
 583
 584                 // XHTML
 585                 array('element'=>'html',        'attribute'=>'xmlns'),
 586
 587                 // WML
 588                 array('element'=>'access',      'attribute'=>'path'),           // 1.3
 589                 array('element'=>'card',        'attribute'=>'onenterforward'), // 1.3
 590                 array('element'=>'card',        'attribute'=>'onenterbackward'),// 1.3
 591                 array('element'=>'card',        'attribute'=>'ontimer'),        // 1.3
 592                 array('element'=>'go',          'attribute'=>'href'),           // 1.3
 593                 array('element'=>'option',      'attribute'=>'onpick'),         // 1.3
 594                 array('element'=>'template',    'attribute'=>'onenterforward'), // 1.3
 595                 array('element'=>'template',    'attribute'=>'onenterbackward'),// 1.3
 596                 array('element'=>'template',    'attribute'=>'ontimer'),        // 1.3
 597                 array('element'=>'wml',         'attribute'=>'xmlns'),          // 2.0
 598         );
 599
 600         $match_metas = array(
 601                 'content-base',
 602                 'content-location',
 603                 'referer',
 604                 'location',
 605                 'refresh',
 606         );
 607
 608         // Extract all elements
 609         if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
 610                 return array( );
 611         $elements = $matches[1];
 612         $value_pattern = '=(("([^"]*)")|([^\s]*))';
 613
 614         // Match elements and attributes
 615         foreach ( $match_elements as $match_element )
 616         {
 617                 $name = $match_element['element'];
 618                 $attr = $match_element['attribute'];
 619                 $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu';
 620                 if ( $name == 'object' )
 621                         $split_pattern = '/\s*/u';      // Space-separated URL list
 622                 else if ( $name == 'archive' )
 623                         $split_pattern = '/,\s*/u';     // Comma-separated URL list
 624                 else
 625                         unset( $split_pattern );        // Single URL
 626                 foreach ( $elements as $element )
 627                 {
 628                         if ( !preg_match( $pattern, $element, $match ) )
 629                                 continue;
 630                         $m = empty($match[3]) ? (!empty($match[4])?$match[4]:'') : $match[3];
 631                         if ( !isset( $split_pattern ) )
 632                                 $urls[$name][$attr][] = $m;
 633                         else
 634                         {
 635                                 $msplit = preg_split( $split_pattern, $m );
 636                                 foreach ( $msplit as $ms )
 637                                         $urls[$name][$attr][] = $ms;
 638                         }
 639                 }
 640         }
 641
 642         // Match meta http-equiv elements
 643         foreach ( $match_metas as $match_meta )
 644         {
 645                 $attr_pattern    = '/http-equiv="?' . $match_meta . '"?/iu';
 646                 $content_pattern = '/content'  . $value_pattern . '/iu';
 647                 $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
 648                 foreach ( $elements as $element )
 649                 {
 650                         if ( !preg_match( '/^meta/iu', $element ) ||
 651                                 !preg_match( $attr_pattern, $element ) ||
 652                                 !preg_match( $content_pattern, $element, $match ) )
 653                                 continue;
 654                         $m = empty($match[3]) ? $match[4] : $match[3];
 655                         if ( $match_meta != 'refresh' )
 656                                 $urls['meta']['http-equiv'][] = $m;
 657                         else if ( preg_match( $refresh_pattern, $m, $match ) )
 658                                 $urls['meta']['http-equiv'][] = $match[2];
 659                 }
 660         }
 661
 662         // Match style attributes
 663         $urls['style'] = array( );
 664         $style_pattern = '/style' . $value_pattern . '/iu';
 665         foreach ( $elements as $element )
 666         {
 667                 if ( !preg_match( $style_pattern, $element, $match ) )
 668                         continue;
 669                 $m = empty($match[3]) ? $match[4] : $match[3];
 670                 $style_urls = extract_css_urls( $m );
 671                 if ( !empty( $style_urls ) )
 672                         $urls['style'] = array_merge_recursive(
 673                                 $urls['style'], $style_urls );
 674         }
 675
 676         // Match style bodies
 677         if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
 678         {
 679                 foreach ( $style_bodies[1] as $style_body )
 680                 {
 681                         $style_urls = extract_css_urls( $style_body );
 682                         if ( !empty( $style_urls ) )
 683                                 $urls['style'] = array_merge_recursive(
 684                                         $urls['style'], $style_urls );
 685                 }
 686         }
 687         if ( empty($urls['style']) )
 688                 unset( $urls['style'] );
 689
 690         return $urls;
 691 }
 692 /**
 693  * Extract URLs from UTF-8 CSS text.
 694  *
 695  * URLs within @import statements and url() property functions are extracted
 696  * and returned in an associative array of arrays.  Array keys indicate
 697  * the use context for the URL, including:
 698  *
 699  *      "import"
 700  *      "property"
 701  *
 702  * Each value in the associative array is an array of URLs.
 703  *
 704  * Parameters:
 705  *      text            the UTF-8 text to scan
 706  *
 707  * Return values:
 708  *      an associative array of arrays of URLs.
 709  *
 710  * See:
 711  *      http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 712  */
 713 function extract_css_urls( $text )
 714 {
 715         $urls = array( );
 716
 717         $url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
 718         $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
 719         $pattern         = '/(' .
 720                  '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
 721                 '|(@import\s*'      . $urlfunc_pattern . ')'      .
 722                 '|('                . $urlfunc_pattern . ')'      .  ')/iu';
 723         if ( !preg_match_all( $pattern, $text, $matches ) )
 724                 return $urls;
 725
 726         // @import '...'
 727         // @import "..."
 728         foreach ( $matches[3] as $match )
 729                 if ( !empty($match) )
 730                         $urls['import'][] =
 731                                 preg_replace( '/\\\\(.)/u', '\\1', $match );
 732
 733         // @import url(...)
 734         // @import url('...')
 735         // @import url("...")
 736         foreach ( $matches[7] as $match )
 737                 if ( !empty($match) )
 738                         $urls['import'][] =
 739                                 preg_replace( '/\\\\(.)/u', '\\1', $match );
 740
 741         // url(...)
 742         // url('...')
 743         // url("...")
 744         foreach ( $matches[11] as $match )
 745                 if ( !empty($match) )
 746                         $urls['property'][] =
 747                                 preg_replace( '/\\\\(.)/u', '\\1', $match );
 748
 749         return $urls;
 750 }