library/HTMLPurifier/Injector/AutoParagraph.php

   1 <?php
   2
   3 require_once 'HTMLPurifier/Injector.php';
   4
   // Register the AutoFormat / AutoParagraph directive with the config
   // schema. The third and fourth arguments (false, 'bool') are presumably
   // the default value and the value type -- confirm against
   // HTMLPurifier_ConfigSchema::define(). The last argument is the HTML
   // description of the directive and must be kept verbatim.
   HTMLPurifier_ConfigSchema::define(
       'AutoFormat',
       'AutoParagraph',
       false,
       'bool',
       '
<p>
  This directive turns on auto-paragraphing, where double newlines are
  converted in to paragraphs whenever possible. Auto-paragraphing:
</p>
<ul>
  <li>Always applies to inline elements or text in the root node,</li>
  <li>Applies to inline elements or text with double newlines in nodes
      that allow paragraph tags,</li>
  <li>Applies to double newlines in paragraph tags</li>
</ul>
<p>
  <code>p</code> tags must be allowed for this directive to take effect.
  We do not use <code>br</code> tags for paragraphing, as that is
  semantically incorrect.
</p>
<p>
  To prevent auto-paragraphing as a content-producer, refrain from using
  double-newlines except to specify a new paragraph or in contexts where
  it has special meaning (whitespace usually has no meaning except in
  tags like <code>pre</code>, so this should not be difficult.) To prevent
  the paragraphing of inline text adjacent to block elements, wrap them
  in <code>div</code> tags (the behavior is slightly different outside of
  the root node.)
</p>
<p>
  This directive has been available since 2.0.1.
</p>
'
   );
  35
  /**
   * Injector that auto-paragraphs text and inline elements based on
   * double newlines.
   *
   * In the root node, text and inline elements are always wrapped in a
   * paragraph (provided p is allowed there). Inside a paragraph, text is
   * split on double newlines. Inside any other element that allows p,
   * a paragraph is only opened when a double newline is found in the
   * current or in upcoming text.
   */
  class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  {

      /** Identifier of this injector. */
      var $name = 'AutoParagraph';

      /**
       * Element names this injector depends on; it only ever generates
       * p tokens. (How the list is enforced is up to the parent class.)
       */
      var $needed = array('p');

      /**
       * Builds a fresh start token for a paragraph.
       *
       * The token carries the MakeWellFormed_TagClosedError armor flag;
       * presumably this stops MakeWellFormed from reporting an error when
       * it has to close the generated paragraph -- confirm against that
       * strategy.
       * @return HTMLPurifier_Token_Start for the p element
       * @private
       */
      function _pStart() {
          $par = new HTMLPurifier_Token_Start('p');
          $par->armor['MakeWellFormed_TagClosedError'] = true;
          return $par;
      }

      /**
       * Processes a text token, replacing it with an array of tokens when
       * paragraphs have to be opened, closed or split.
       * @param $token Reference to the text token being processed; it is
       *    left untouched when no paragraphing applies
       */
      function handleText(&$token) {
          $text = $token->data;
          if (empty($this->currentNesting)) {
              if (!$this->allowsElement('p')) return;
              // case 1: we're in root node (and it allows paragraphs)
              $token = array($this->_pStart());
              $this->_splitText($text, $token);
          } elseif ($this->currentNesting[count($this->currentNesting)-1]->name == 'p') {
              // case 2: we're in a paragraph; start from an empty result
              // so _splitText() can tell no start tag was injected
              $token = array();
              $this->_splitText($text, $token);
          } elseif ($this->allowsElement('p')) {
              // case 3: we're in an element that allows paragraphs
              if (strpos($text, "\n\n") !== false) {
                  // case 3.1: this text node has a double-newline
                  $token = array($this->_pStart());
                  $this->_splitText($text, $token);
              } else {
                  // test if up-coming tokens are either block or have
                  // a double newline in them
                  $ok = false;
                  // depth of inline elements opened during the look-ahead
                  $nesting = 0;
                  for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
                      if ($this->inputTokens[$i]->type == 'start'){
                          if (!$this->_isInline($this->inputTokens[$i])) {
                              // we haven't found a double-newline, and
                              // we've hit a block element, so don't paragraph
                              // ($ok is still false at this point)
                              break;
                          }
                          $nesting++;
                      }
                      if ($this->inputTokens[$i]->type == 'end') {
                          // an end tag at depth zero closes our parent:
                          // nothing further belongs to this element
                          if ($nesting <= 0) break;
                          $nesting--;
                      }
                      if ($this->inputTokens[$i]->type == 'text') {
                          // found it!
                          if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) {
                              $ok = true;
                              break;
                          }
                      }
                  }
                  if ($ok) {
                      // case 3.2: this text node is next to another node
                      // that will start a paragraph
                      $token = array($this->_pStart(), $token);
                  }
              }
          }

      }

      /**
       * Processes an element token, prepending a paragraph start token
       * to it when the element is inline and needs to live in a paragraph.
       * @param $token Reference to the element token being processed; it
       *    is replaced by array(p start token, original token) when a
       *    paragraph is opened, and left untouched otherwise
       */
      function handleElement(&$token) {
          // check if we're inside a tag already
          if (!empty($this->currentNesting)) {
              if ($this->allowsElement('p')) {
                  // special case: we're in an element that allows paragraphs

                  // this token is already paragraph, abort
                  if ($token->name == 'p') return;

                  // this token is a block level, abort
                  if (!$this->_isInline($token)) return;

                  // check if this token is adjacent to the parent token
                  $prev = $this->inputTokens[$this->inputIndex - 1];
                  if ($prev->type != 'start') {
                      // not adjacent, we can abort early
                      // add lead paragraph tag if the previous tag was an
                      // end paragraph (our token is known to be inline
                      // here, see the check above). The type is tested
                      // first so that the name is only read from a tag
                      // token; other token kinds may not define it.
                      if ($prev->type == 'end' && $prev->name == 'p') {
                          $token = array($this->_pStart(), $token);
                      }
                      return;
                  }

                  // this token is the first child of the element that allows
                  // paragraph. We have to peek ahead and see whether or not
                  // there is anything inside that suggests that a paragraph
                  // will be needed
                  $ok = false;
                  // maintain a mini-nesting counter, this lets us bail out
                  // early if possible
                  $j = 1; // current nesting, one is due to parent (we recalculate current token)
                  for ($i = $this->inputIndex; isset($this->inputTokens[$i]); $i++) {
                      if ($this->inputTokens[$i]->type == 'start') $j++;
                      if ($this->inputTokens[$i]->type == 'end') $j--;
                      if ($this->inputTokens[$i]->type == 'text') {
                          if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) {
                              $ok = true;
                              break;
                          }
                      }
                      // the parent element has been closed, stop looking
                      if ($j <= 0) break;
                  }
                  if ($ok) {
                      $token = array($this->_pStart(), $token);
                  }
              }
              return;
          }

          // we are in the root node from here on

          // check if the start tag counts as a "block" element
          if (!$this->_isInline($token)) return;

          // append a paragraph tag before the token
          $token = array($this->_pStart(), $token);
      }

      /**
       * Splits up a text in paragraph tokens and appends them
       * to the result stream that will replace the original
       * @param $data String text data that will be processed
       *    into paragraphs
       * @param $result Reference to array of tokens that the
       *    tags will be appended onto. An empty array on entry is taken
       *    to mean that the caller did not inject a paragraph start,
       *    i.e. that we are already inside a paragraph.
       * @private
       */
      function _splitText($data, &$result) {
          $raw_paragraphs = explode("\n\n", $data);

          // remove empty paragraphs
          $paragraphs = array();
          $needs_start = false;
          $needs_end   = false;

          $c = count($raw_paragraphs);
          if ($c == 1) {
              // there were no double-newlines, abort quickly
              $result[] = new HTMLPurifier_Token_Text($data);
              return;
          }

          for ($i = 0; $i < $c; $i++) {
              $par = $raw_paragraphs[$i];
              if (trim($par) !== '') {
                  $paragraphs[] = $par;
                  continue;
              }
              if ($i == 0 && empty($result)) {
                  // The empty result indicates that the AutoParagraph
                  // injector did not add any start paragraph tokens.
                  // The fact that the first paragraph is empty indicates
                  // that there was a double-newline at the start of the
                  // data.
                  // Combined together, this means that we are in a paragraph,
                  // and the newline means we should start a new one.
                  $result[] = new HTMLPurifier_Token_End('p');
                  // However, the start token should only be added if
                  // there is more processing to be done (i.e. there are
                  // real paragraphs in here). If there are none, the
                  // next start paragraph tag will be handled by the
                  // next run-around the injector
                  $needs_start = true;
              } elseif ($i + 1 == $c) {
                  // a double-paragraph at the end indicates that
                  // there is an overriding need to start a new paragraph
                  // for the next section. This has no effect until
                  // we've processed all of the other paragraphs though
                  $needs_end = true;
              }
          }

          // check if there are no "real" paragraphs to be processed
          if (empty($paragraphs)) {
              return;
          }

          // add a start tag if an end tag was added while processing
          // the raw paragraphs (that happens if there's a leading double
          // newline)
          if ($needs_start) $result[] = $this->_pStart();

          // append the paragraphs onto the result
          foreach ($paragraphs as $par) {
              $result[] = new HTMLPurifier_Token_Text($par);
              $result[] = new HTMLPurifier_Token_End('p');
              $result[] = $this->_pStart();
          }

          // remove trailing start token, if one is needed, it will
          // be handled the next time this injector is called
          array_pop($result);

          // check the outside to determine whether or not the
          // end paragraph tag should be removed. It should be removed
          // unless the next non-whitespace token is a paragraph
          // or a block element.
          $remove_paragraph_end = true;

          if (!$needs_end) {
              // Start of the checks one after the current token's index
              // NOTE(review): the scan does not stop at the first
              // start/empty token, so a later start/empty token reached
              // through whitespace can override the decision -- confirm
              // whether that is intended.
              for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
                  if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') {
                      $remove_paragraph_end = $this->_isInline($this->inputTokens[$i]);
                  }
                  // check if we can abort early (whitespace means we carry-on!)
                  if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break;
                  // end tags will automatically be handled by MakeWellFormed,
                  // so we don't have to worry about them
                  if ($this->inputTokens[$i]->type == 'end') break;
              }
          } else {
              // trailing double newline: the paragraph must be closed
              $remove_paragraph_end = false;
          }

          // drop the trailing end paragraph token if the look-ahead
          // decided the paragraph should stay open
          if ($remove_paragraph_end) {
              array_pop($result);
          }

      }

      /**
       * Returns true if passed token is inline (and, ergo, allowed in
       * paragraph tags)
       * @param $token Tag token whose name is looked up among the child
       *    elements allowed by the p element's definition
       * @return Boolean, true if the element may appear inside p
       * @private
       */
      function _isInline($token) {
          return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
      }

  }
 283