From 38ceb5859917ee64625cc6a97aaca0a3a882377e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Sat, 17 Mar 2007 02:24:36 +0000
Subject: [PATCH] Add Acronymizer DOMFilter. index.xhtml edited accordingly.
 <?xml declarations added to all of my source files. Add convenience functions
 for DOM. Filter manager now sets XML documents to UTF-8 and scrubs out <?xml
 tags due to poor Internet Explorer support.

git-svn-id: http://htmlpurifier.org/svnroot@823 48356398-32a2-884e-a903-53898d9a118a
---
 comparison.xhtml                                   |   1 +
 index.xhtml                                        | 111 ++++++++++-----------
 xhtml-compiler/XHTMLCompiler/DOMFilter.php         |  30 ++++++
 .../XHTMLCompiler/DOMFilter/Acronymizer.php        |  46 +++++++++
 .../DOMFilter/GenerateTableOfContents.php          |   8 +-
 xhtml-compiler/XHTMLCompiler/FilterManager.php     |   2 +
 6 files changed, 134 insertions(+), 64 deletions(-)
 create mode 100644 xhtml-compiler/XHTMLCompiler/DOMFilter/Acronymizer.php
diff --git a/comparison.xhtml b/comparison.xhtml
index 285d5f7..506d9da 100644
--- a/comparison.xhtml
+++ b/comparison.xhtml
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
diff --git a/index.xhtml b/index.xhtml
index 8050ade..a4f79e2 100644
--- a/index.xhtml
+++ b/index.xhtml
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
@@ -26,7 +27,7 @@
         <li><a href="#News">News</a></li>
         <li><a href="#Plugins">Plugins</a></li>
         <li><a href="#Demo">Demo</a></li>
-        <li><a href="#Download"><strong>Download</strong></a></li>
+        <li><strong><a href="#Download">Download</a></strong></li>
         <li><a href="#Resources">Resources</a></li>
         <li><a href="phorum/">Forum</a></li>
         <li><a href="#Contact">Contact</a></li>
@@ -37,24 +38,20 @@
 
 <a href="#Download"><img src="download.png" class="download-button" alt="Download HTML Purifier" /></a>
 
-<p class="lead"><strong>HTML Purifier</strong> is a standards-compliant
-<acronym title="HyperText Markup Language">HTML</acronym> filter library
-written in <acronym title="PHP: Hypertext Preprocessor">PHP</acronym>.
-HTML Purifier will not only remove all malicious code (better known as
-<acronym title="Cross Site Scripting">XSS</acronym>) with a
-thoroughly audited, secure <em>yet</em> permissive
-<strong><a href="http://hp.jpsband.org/live/smoketests/printDefinition.php">whitelist</a></strong>, it will also
-make sure your documents are <strong>standards compliant</strong>, something
-only achievable with a comprehensive knowledge of
-<acronym title="World Wide Web Consortium">W3C</acronym>'s specifications.
-Tired of using BBCode due to the
-current landscape of deficient or insecure <acronym
-title="HyperText Markup Language">HTML</acronym> filters?  Have a
-<strong><acronym title="What You See Is What You Get">WYSIWYG</acronym></strong>
-editor but never been able to use it?  Looking for high-quality,
-standards-compliant, open-source components for that application you're
-building?
-HTML Purifier is for you!</p>
+<p class="lead"><strong>HTML Purifier</strong> is a standards-compliant 
+<acronym>HTML</acronym> filter library written in 
+<acronym>PHP</acronym>. HTML Purifier will not only remove all malicious 
+code (better known as <acronym>XSS</acronym>) with a thoroughly audited, 
+secure <em>yet</em> permissive <strong><a 
+href="http://hp.jpsband.org/live/smoketests/printDefinition.php">whitelist</a></strong>,
+it will also make sure your documents are 
+<strong>standards compliant</strong>, something only achievable with a 
+comprehensive knowledge of <acronym>W3C</acronym>'s specifications. 
+Tired of using BBCode due to the current landscape of deficient or 
+insecure <acronym>HTML</acronym> filters? Have a 
+<strong>WYSIWYG</strong> editor but never been able to use it? Looking 
+for high-quality, standards-compliant, open-source components for that 
+application you're building? HTML Purifier is for you!</p> 
 
 <blockquote class="fancy">
 <div class="quote">
@@ -66,11 +63,9 @@ HTML Purifier is for you!</p>
 
 <h2 id="Background">Background</h2>
 
-<p class="lead">There are a number of open-source <acronym
-title="HyperText Markup Language">HTML</acronym> filtering solutions out
+<p class="lead">There are a number of open-source <acronym>HTML</acronym> filtering solutions out
 there on the web already
-(i.e. <acronym
-title="PHP Extension and Application Repository">PEAR</acronym>'s
+(i.e. <acronym>PEAR</acronym>'s
 <a href="http://pear.php.net/package/HTML_Safe">HTML_Safe</a>,
 <a href="http://sourceforge.net/projects/kses">kses</a>
 and
@@ -78,47 +73,45 @@ and
 SafeHtmlChecker.class.php</a>).  What sets HTML Purifier apart from them?
 Aren't all of these choices &quot;secure&quot;?</p>
 
-<p>When it comes to <acronym title="HyperText Markup Language">HTML</acronym>,
-<strong>attention to detail</strong> is key.  Does the library demonstrate
-an in-depth knowledge of the <acronym
-title="Document Type Definition">DTD</acronym> that defines <acronym
-title="HyperText Markup Language">HTML</acronym>? Does it perform its 
-filtering off a robust whitelist rather than a usually out-dated blacklist?
-Does it go through the care to check every single attribute in the document
-for validity? Does it actually understand tag markup, or pay lip-service
-with a series of deficient regexes and str_replace's?</p>
-
-<p>Somewhere along the way, all of HTML Purifier's predecessors fall
-flat. HTML_Safe dooms itself to attacks of the future by using a blacklist.
-Configurable filters like kses and PHP Input Filter still cannot validate the
-contents inside attributes.  With all these gaps in coverage, none of the
-usual libraries come close to achieving <strong>standards-compliance</strong>.
-There is a user-unfriendly, draconic
-<acronym title="eXtensible Markup Language">XML</acronym>-based filter
-called Safe HTML Checker, but even it forgets that <code>&lt;a&gt;</code> tags
-cannot be nested within each other!</p>
+<p>When it comes to <acronym>HTML</acronym>, <strong>attention to 
+detail</strong> is key. Does the library demonstrate an in-depth 
+knowledge of the <acronym>DTD</acronym> that defines 
+<acronym>HTML</acronym>? Does it perform its filtering off a robust 
+whitelist rather than a usually out-dated blacklist? Does it go through 
+the care to check every single attribute in the document for validity? 
+Does it actually understand tag markup, or pay lip-service with a series 
+of deficient regexes and str_replace's?</p> 
+
+<p>Somewhere along the way, all of HTML Purifier's predecessors fall 
+flat. HTML_Safe dooms itself to attacks of the future by using a 
+blacklist. Configurable filters like kses and PHP Input Filter still 
+cannot validate the contents inside attributes. With all these gaps in 
+coverage, none of the usual libraries come close to achieving 
+<strong>standards-compliance</strong>. There is a user-unfriendly, 
+draconic <acronym>XML</acronym>-based filter called Safe HTML Checker, 
+but even it forgets that <code>&lt;a&gt;</code> tags cannot be nested 
+within each other!</p> 
 
 <p><strong>Know thy enemy.</strong> Wily hackers have a huge arsenal of 
-<acronym title="Cross Site Scripting">XSS</acronym> hidden within the depths
-of the <acronym title="HyperText Markup Language">HTML</acronym>
-specification.  HTML Purifier takes its effectiveness from the fact that it will
-decompose the whole document into tokens, and rigorously process the tokens by
-removing non-whitelisted elements, transforming bad practice tags like font
-into span, properly checking the nesting of tags and their children and
-validating all attributes according to their <acronym
-title="Request for Comment">RFC</acronym>s.  HTML Purifier's comprehensive
-algorithms are complemented by a <strong>breadth of knowledge</strong>,
-ensuring that richly formatted documents pass through unstripped.</p>
+<acronym>XSS</acronym> hidden within the depths of the 
+<acronym>HTML</acronym> specification. HTML Purifier takes its 
+effectiveness from the fact that it will decompose the whole document 
+into tokens, and rigorously process the tokens by removing 
+non-whitelisted elements, transforming bad practice tags like font into 
+span, properly checking the nesting of tags and their children and 
+validating all attributes according to their <acronym>RFC</acronym>s. 
+HTML Purifier's comprehensive algorithms are complemented by a 
+<strong>breadth of knowledge</strong>, ensuring that richly formatted 
+documents pass through unstripped.</p> 
 
 <a href="comparison.html"><img src="compare.png" class="compare-button" alt="Compare HTML Purifier with other filters" /></a>
 
-<p>To my knowledge, there is nothing else in the wild that offers
-protection from XSS, standards-compliance, and the corrective
-processing of poorly formed HTML simultaneously. Don't take my word
-for it though:
-do your research. Investigate the
-other libraries, and decide for yourself who you would prefer to be the
-<strong>gatekeeper</strong> to your system.</p>
+<p>To my knowledge, there is nothing else in the wild that offers 
+protection from XSS, standards-compliance, and the corrective processing 
+of poorly formed HTML simultaneously. Don't take my word for it though: 
+do your research. Investigate the other libraries, and decide for 
+yourself who you would prefer to be the <strong>gatekeeper</strong> to 
+your system.</p> 
 
 <p>To find out more, you can read the
 <a href="comparison.html"><strong>Comparison</strong></a>
diff --git a/xhtml-compiler/XHTMLCompiler/DOMFilter.php b/xhtml-compiler/XHTMLCompiler/DOMFilter.php
index 3950d7e..fa4ee0c 100644
--- a/xhtml-compiler/XHTMLCompiler/DOMFilter.php
+++ b/xhtml-compiler/XHTMLCompiler/DOMFilter.php
@@ -15,6 +15,36 @@ abstract class XHTMLCompiler_DOMFilter extends XHTMLCompiler_Filter
      */
     abstract public function process(DOMDocument $dom, $page);
     
+    /**
+     * Performs common initialization of DOM and XPath
+     */
+    protected function setup($dom) {
+        $this->dom = $dom;
+        $this->xpath = new DOMXPath($dom);
+        $this->xpath->registerNamespace('html', "http://www.w3.org/1999/xhtml");
+    }
+    
+    /**
+     * XPath object for the current DOM
+     */
+    protected $xpath;
+    
+    /**
+     * Current DOMDocument
+     */
+    protected $dom;
+    
+    /**
+     * Queries the current DOM with an XPath expression
+     * @param $expr XPath expression to evaluate
+     * @param $context Optional context node; omitted from the query when false
+     */
+    protected function query($expr, $context = false) {
+        if (!$this->dom) throw new Exception('Filter must be setup before using convenience functions');
+        if (!$context) return $this->xpath->query($expr);
+        return $this->xpath->query($expr, $context);
+    }
+    
 }
 
 ?>
\ No newline at end of file
diff --git a/xhtml-compiler/XHTMLCompiler/DOMFilter/Acronymizer.php b/xhtml-compiler/XHTMLCompiler/DOMFilter/Acronymizer.php
new file mode 100644
index 0000000..5a7ab79
--- /dev/null
+++ b/xhtml-compiler/XHTMLCompiler/DOMFilter/Acronymizer.php
@@ -0,0 +1,46 @@
+<?php
+
+/**
+ * Based on a list of known acronyms, populates the title attribute
+ * of acronym elements in documents.
+ */
+class XHTMLCompiler_DOMFilter_Acronymizer extends XHTMLCompiler_DOMFilter
+{
+    
+    protected $name = 'Acronymizer';
+    
+    /**
+     * Array of recognized acronyms.
+     * @todo Make a public API for this, allow multiple acronym sets
+     *       and different precedences for them.
+     */
+    protected $acronyms = array(
+        'PHP' => 'PHP: HyperText Preprocessor',
+        'HTML' => 'HyperText Markup Language',
+        'XHTML' => 'eXtensible HyperText Markup Language',
+        'XSS' => 'Cross-Site Scripting',
+        'W3C' => 'World Wide Web Consortium',
+        'WYSIWYG' => 'What You See Is What You Get',
+        'WYSIWYM' => 'What You See Is What You Mean',
+        'PEAR' => 'PHP Extension and Application Repository',
+        'DTD' => 'Document Type Definition',
+        'XML' => 'eXtensible Markup Language',
+        'RFC' => 'Request for Comment',
+    );
+    
+    public function process(DOMDocument $dom, $page) {
+        $this->setup($dom);
+        $nodes = $this->query("//html:acronym[not(@title)]");
+        foreach ($nodes as $node) {
+            $acronym = $node->textContent;
+            if (!isset($this->acronyms[$acronym])) {
+                trigger_error(htmlspecialchars($acronym) . ' is not a recognized acronym (missing title attribute)');
+                continue;
+            }
+            $node->setAttribute('title', $this->acronyms[$acronym]);
+        }
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/xhtml-compiler/XHTMLCompiler/DOMFilter/GenerateTableOfContents.php b/xhtml-compiler/XHTMLCompiler/DOMFilter/GenerateTableOfContents.php
index 8d7d4cb..9682355 100644
--- a/xhtml-compiler/XHTMLCompiler/DOMFilter/GenerateTableOfContents.php
+++ b/xhtml-compiler/XHTMLCompiler/DOMFilter/GenerateTableOfContents.php
@@ -11,12 +11,10 @@ class XHTMLCompiler_DOMFilter_GenerateTableOfContents extends XHTMLCompiler_DOMF
     
     public function process(DOMDocument $dom, $page) {
         
-        // setup xpath, this can be factored out
-        $xpath = new DOMXPath($dom);
-        $xpath->registerNamespace('html', "http://www.w3.org/1999/xhtml");
+        $this->setup($dom);
         
         // test for ToC container, if not present don't bother
-        $container = $xpath->query("//html:div[@id='toc']")->item(0);
+        $container = $this->query("//html:div[@id='toc']")->item(0);
         if (!$container) return;
         
         // grab all headings h2 and down from the document
@@ -24,7 +22,7 @@ class XHTMLCompiler_DOMFilter_GenerateTableOfContents extends XHTMLCompiler_DOMF
         foreach ($headings as $k => $v) $headings[$k] = "self::html:$v";
         $query_headings = implode(' or ', $headings);
         $query = "//*[$query_headings]"; // looks like "//*[self::html:h2 or ...]"
-        $headings = $xpath->query($query);
+        $headings = $this->query($query);
         
         // setup the table of contents element
         $toc = $dom->createElement('ul');
diff --git a/xhtml-compiler/XHTMLCompiler/FilterManager.php b/xhtml-compiler/XHTMLCompiler/FilterManager.php
index 1b8ba76..cfe687d 100644
--- a/xhtml-compiler/XHTMLCompiler/FilterManager.php
+++ b/xhtml-compiler/XHTMLCompiler/FilterManager.php
@@ -94,6 +94,7 @@ class XHTMLCompiler_FilterManager
         $dom->preserveWhiteSpace = false;
         $dom->formatOutput = true;
         $dom->loadXML($text);
+        $dom->encoding = 'UTF-8';
         foreach ($this->DOMFilters as $filter) {
             $filter->process($dom, $page);
         }
@@ -101,6 +102,7 @@ class XHTMLCompiler_FilterManager
         foreach ($this->postTextFilters as $filter) {
             $text = $filter->process($text, $page);
         }
+        $text = str_replace('<?xml version="1.0" encoding="UTF-8"?>'."\n", '', $text);
         return $text;
     }
     
-- 
2.11.4.GIT