Release 2.1.3, merged in 1404 to HEAD.
[htmlpurifier/bfroehle.git] / library / HTMLPurifier / Injector / AutoParagraph.php
blob56a6a2687884caa168e57dbf962d4baa21c20395
1 <?php
3 require_once 'HTMLPurifier/Injector.php';
// Registers the AutoFormat.AutoParagraph entry with the configuration schema.
// Arguments as passed here: namespace 'AutoFormat', name 'AutoParagraph',
// the value false (presumably the default -- confirm against
// HTMLPurifier_ConfigSchema::define), the type string 'bool', and an HTML
// description of the directive. The description is a runtime string; do not
// reformat it.
5 HTMLPurifier_ConfigSchema::define(
6 'AutoFormat', 'AutoParagraph', false, 'bool', '
7 <p>
8 This directive turns on auto-paragraphing, where double newlines are
9 converted in to paragraphs whenever possible. Auto-paragraphing:
10 </p>
11 <ul>
12 <li>Always applies to inline elements or text in the root node,</li>
13 <li>Applies to inline elements or text with double newlines in nodes
14 that allow paragraph tags,</li>
15 <li>Applies to double newlines in paragraph tags</li>
16 </ul>
17 <p>
18 <code>p</code> tags must be allowed for this directive to take effect.
19 We do not use <code>br</code> tags for paragraphing, as that is
20 semantically incorrect.
21 </p>
22 <p>
23 To prevent auto-paragraphing as a content-producer, refrain from using
24 double-newlines except to specify a new paragraph or in contexts where
25 it has special meaning (whitespace usually has no meaning except in
26 tags like <code>pre</code>, so this should not be difficult.) To prevent
27 the paragraphing of inline text adjacent to block elements, wrap them
28 in <code>div</code> tags (the behavior is slightly different outside of
29 the root node.)
30 </p>
31 <p>
32 This directive has been available since 2.0.1.
33 </p>
34 ');
36 /**
37 * Injector that auto paragraphs text in the root node based on
38 * double-spacing.
40 class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// Name of this injector; same string as the directive name registered above.
43 var $name = 'AutoParagraph';
// Element names this injector depends on. NOTE(review): presumably the
// parent class refuses to run the injector unless these are allowed --
// confirm against HTMLPurifier_Injector.
44 var $needed = array('p');
/**
 * Creates the opening paragraph token that this injector inserts.
 *
 * The token's armor array gets the key 'MakeWellFormed_TagClosedError'
 * set to true. NOTE(review): presumably this stops the MakeWellFormed
 * strategy from reporting an error when it auto-closes the generated
 * tag -- confirm against that strategy.
 *
 * @return HTMLPurifier_Token_Start for the 'p' element
 * @private
 */
function _pStart() {
    $paragraph_start = new HTMLPurifier_Token_Start('p');
    $paragraph_start->armor['MakeWellFormed_TagClosedError'] = true;
    return $paragraph_start;
}
/**
 * Processes a text token, replacing it with an array of tokens when
 * paragraph tags have to be introduced around or inside the text.
 *
 * Four situations are distinguished:
 *  - root node (no current nesting) that allows 'p': open a paragraph
 *    and split the text on double newlines;
 *  - directly inside a 'p': split the text on double newlines;
 *  - inside another element that allows 'p' and the text itself holds a
 *    double newline: open a paragraph and split;
 *  - inside another element that allows 'p' and a later text token (not
 *    separated from us by a block element or by the parent's end tag)
 *    holds a double newline: open a paragraph in front of the text.
 * In every other situation the token is left untouched.
 *
 * @param $token Text token, passed by reference; may be replaced by an
 *        array of tokens
 */
function handleText(&$token) {
    $text = $token->data;

    // case 1: we're in root node (only acts if it allows paragraphs)
    if (empty($this->currentNesting)) {
        if ($this->allowsElement('p')) {
            $token = array($this->_pStart());
            $this->_splitText($text, $token);
        }
        return;
    }

    // case 2: we're in a paragraph
    $innermost = $this->currentNesting[count($this->currentNesting) - 1];
    if ($innermost->name == 'p') {
        $token = array();
        $this->_splitText($text, $token);
        return;
    }

    // everything below is case 3: an element that allows paragraphs
    if (!$this->allowsElement('p')) return;

    // case 3.1: this text node has a double-newline
    if (strpos($text, "\n\n") !== false) {
        $token = array($this->_pStart());
        $this->_splitText($text, $token);
        return;
    }

    // case 3.2: scan the up-coming tokens for a text node with a
    // double-newline; give up at the first block element or once the
    // end tag of the surrounding element is reached
    $paragraph_ahead = false;
    $depth = 0;
    $pos = $this->inputIndex + 1;
    while (isset($this->inputTokens[$pos])) {
        $upcoming = $this->inputTokens[$pos];
        if ($upcoming->type == 'start') {
            // a block element before any double-newline: don't paragraph
            if (!$this->_isInline($upcoming)) break;
            $depth++;
        } elseif ($upcoming->type == 'end') {
            // an end tag we did not open ourselves closes the parent
            if ($depth <= 0) break;
            $depth--;
        } elseif ($upcoming->type == 'text') {
            if (strpos($upcoming->data, "\n\n") !== false) {
                $paragraph_ahead = true;
                break;
            }
        }
        $pos++;
    }

    if ($paragraph_ahead) {
        // this text node is next to another node that will start a
        // paragraph, so it needs its own opening tag
        $token = array($this->_pStart(), $token);
    }
}
/**
 * Processes an element token, replacing it with
 * array(paragraph start token, original token) when the element has to
 * be wrapped in a paragraph.
 *
 * In the root node (no current nesting) every inline element gets a
 * leading paragraph tag. Inside an element that allows 'p', an inline
 * element gets one only when it directly follows a closing 'p', or when
 * it is the first child and a text token containing a double newline is
 * found before the parent is closed. Block elements, 'p' itself and
 * elements nested in parents that do not allow 'p' are left untouched.
 *
 * @param $token Element token, passed by reference; may be replaced by
 *        an array of tokens
 */
function handleElement(&$token) {
    if (empty($this->currentNesting)) {
        // root node: block elements stay as they are, inline ones get
        // a paragraph tag in front of them
        if (!$this->_isInline($token)) return;
        $token = array($this->_pStart(), $token);
        return;
    }

    // we're inside a tag already; only act if it allows paragraphs
    if (!$this->allowsElement('p')) return;

    // this token is already a paragraph, abort
    if ($token->name == 'p') return;

    // this token is block level, abort
    if (!$this->_isInline($token)) return;

    // check if this token is adjacent to the parent token; without a
    // previous token there is nothing to decide on
    $prev_index = $this->inputIndex - 1;
    if (!isset($this->inputTokens[$prev_index])) return;
    $prev = $this->inputTokens[$prev_index];

    if ($prev->type != 'start') {
        // not adjacent to the parent's start tag. Add a lead paragraph
        // tag only if the previous token was an end paragraph. The type
        // is tested first so that ->name is never read from a token
        // kind that does not define it; $token is already known to be
        // inline at this point.
        if ($prev->type == 'end' && $prev->name == 'p') {
            $token = array($this->_pStart(), $token);
        }
        return;
    }

    // this token is the first child of the element that allows
    // paragraphs. Peek ahead and see whether anything inside suggests
    // that a paragraph will be needed.
    $ok = false;
    // mini-nesting counter, lets us bail out once the parent is closed;
    // it starts at one for the parent, the current token is counted by
    // the loop itself
    $depth = 1;
    for ($i = $this->inputIndex; isset($this->inputTokens[$i]); $i++) {
        $type = $this->inputTokens[$i]->type;
        if ($type == 'start') $depth++;
        if ($type == 'end') $depth--;
        if ($type == 'text') {
            if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) {
                $ok = true;
                break;
            }
        }
        if ($depth <= 0) break;
    }

    if ($ok) {
        $token = array($this->_pStart(), $token);
    }
}
/**
 * Splits a text on double newlines and appends the resulting text,
 * paragraph-end and paragraph-start tokens to the token array that will
 * replace the original text token.
 *
 * A leading double newline, when nothing has been added to $result yet,
 * closes the paragraph we are currently in. A trailing double newline
 * forces the last paragraph to be closed. Otherwise the final closing
 * tag is kept or dropped depending on the tokens that follow in the
 * input stream.
 *
 * @param $data String text data that will be processed into paragraphs
 * @param $result Reference to array of tokens that the new tokens will
 *        be appended onto
 * @private
 */
function _splitText($data, &$result) {
    $chunks = explode("\n\n", $data);
    $total = count($chunks);

    if ($total == 1) {
        // there were no double-newlines: pass the text through as-is
        $result[] = new HTMLPurifier_Token_Text($data);
        return;
    }

    // sort the chunks into real paragraphs and empty ones; only the
    // empty ones at the very start or very end carry any meaning
    $paragraphs = array();
    $reopen_after_close = false;
    $force_end = false;
    $last = $total - 1;

    foreach ($chunks as $index => $chunk) {
        if (trim($chunk) !== '') {
            $paragraphs[] = $chunk;
        } elseif ($index == 0 && empty($result)) {
            // Nothing was put into the result, so the injector did not
            // open a paragraph itself: we are inside an existing one,
            // and the leading double-newline means it has to be closed.
            $result[] = new HTMLPurifier_Token_End('p');
            // The matching start tag is only added further down if
            // there are real paragraphs left to emit; otherwise the
            // next run of the injector takes care of it.
            $reopen_after_close = true;
        } elseif ($index == $last) {
            // a double-newline at the end means the last paragraph
            // must be closed no matter what follows
            $force_end = true;
        }
    }

    // no "real" paragraphs to be processed
    if (count($paragraphs) == 0) {
        return;
    }

    // a leading double-newline closed the surrounding paragraph above,
    // so a new one has to be opened for the text that follows
    if ($reopen_after_close) {
        $result[] = $this->_pStart();
    }

    // emit every paragraph as text, end tag, start tag ...
    foreach ($paragraphs as $paragraph) {
        array_push(
            $result,
            new HTMLPurifier_Token_Text($paragraph),
            new HTMLPurifier_Token_End('p'),
            $this->_pStart()
        );
    }

    // ... and take the trailing start tag off again; if one is needed
    // it will be added the next time this injector is called
    array_pop($result);

    // Decide whether the final end tag stays. It is dropped by default;
    // a trailing double-newline always keeps it, otherwise the tokens
    // after the current one decide.
    $drop_end = !$force_end;
    if (!$force_end) {
        $pos = $this->inputIndex + 1;
        while (isset($this->inputTokens[$pos])) {
            $peek = $this->inputTokens[$pos];
            if ($peek->type == 'start' || $peek->type == 'empty') {
                // keep the end tag in front of block elements
                $drop_end = $this->_isInline($peek);
            }
            // non-whitespace text ends the scan (whitespace carries on)
            if ($peek->type == 'text' && !$peek->is_whitespace) break;
            // end tags will automatically be handled by MakeWellFormed,
            // so we don't have to worry about them
            if ($peek->type == 'end') break;
            $pos++;
        }
    }

    if ($drop_end) {
        array_pop($result);
    }
}
/**
 * Tells whether a token's element is inline, i.e. whether its name is
 * listed among the child elements of 'p' in the HTML definition.
 *
 * @param $token Token whose name is looked up
 * @return bool true if the element is allowed inside paragraph tags
 * @private
 */
function _isInline($token) {
    $tag = $token->name;
    // a single isset() over the whole chain, so a missing link anywhere
    // in it simply yields false
    return isset($this->htmlDefinition->info['p']->child->elements[$tag]);
}