library/HTMLPurifier/ChildDef/Table.php

   1 <?php
   2
   3 /**
   4  * Definition for tables.  The general idea is to extract out all of the
   5  * essential bits, and then reconstruct it later.
   6  *
   7  * This is a bit confusing, because the DTDs and the W3C
   8  * validators seem to disagree on the appropriate definition. The
   9  * DTD claims:
  10  *
  11  *      (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
  12  *
  13  * But actually, the HTML4 spec then has this to say:
  14  *
  15  *      The TBODY start tag is always required except when the table
  16  *      contains only one table body and no table head or foot sections.
  17  *      The TBODY end tag may always be safely omitted.
  18  *
  19  * So the DTD is kind of wrong.  The validator is, unfortunately, kind
  20  * of on crack.
  21  *
  22  * The definition changed again in XHTML1.1; and in my opinion, this
  23  * formulation makes the most sense.
  24  *
  25  *      caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
  26  *
  27  * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
  28  * If we encounter a thead, tfoot or tbody, we are placed in the former
  29  * mode, and we *must* wrap any stray tr segments with a tbody. But if
  30  * we don't run into any of them, just having tr tags is OK.
  31  */
  32 class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
  33 {
  34     public $allow_empty = false;
  35     public $type = 'table';
  36     public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
  37         'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
  38     public function __construct() {}
  39     public function validateChildren($tokens_of_children, $config, $context) {
  40         if (empty($tokens_of_children)) return false;
  41
  42         // this ensures that the loop gets run one last time before closing
  43         // up. It's a little bit of a hack, but it works! Just make sure you
  44         // get rid of the token later.
  45         $tokens_of_children[] = false;
  46
  47         // only one of these elements is allowed in a table
  48         $caption = false;
  49         $thead   = false;
  50         $tfoot   = false;
  51
  52         // as many of these as you want
  53         $cols    = array();
  54         $content = array();
  55
  56         $nesting = 0; // current depth so we can determine nodes
  57         $is_collecting = false; // are we globbing together tokens to package
  58                                 // into one of the collectors?
  59         $collection = array(); // collected nodes
  60         $tag_index = 0; // the first node might be whitespace,
  61                             // so this tells us where the start tag is
  62         $tbody_mode = false; // if true, then we need to wrap any stray
  63                              // <tr>s with a <tbody>.
  64
  65         foreach ($tokens_of_children as $token) {
  66             $is_child = ($nesting == 0);
  67
  68             if ($token === false) {
  69                 // terminating sequence started
  70             } elseif ($token instanceof HTMLPurifier_Token_Start) {
  71                 $nesting++;
  72             } elseif ($token instanceof HTMLPurifier_Token_End) {
  73                 $nesting--;
  74             }
  75
  76             // handle node collection
  77             if ($is_collecting) {
  78                 if ($is_child) {
  79                     // okay, let's stash the tokens away
  80                     // first token tells us the type of the collection
  81                     switch ($collection[$tag_index]->name) {
  82                         case 'tbody':
  83                             $tbody_mode = true;
  84                             // fall through
  85                         case 'tr':
  86                             $content[] = $collection;
  87                             break;
  88                         case 'caption':
  89                             if ($caption !== false) break;
  90                             $caption = $collection;
  91                             break;
  92                         case 'thead':
  93                         case 'tfoot':
  94                             $tbody_mode = true;
  95                             // XXX This breaks rendering properties with
  96                             // Firefox, which never floats a <thead> to
  97                             // the top. Ever. (Our scheme will float the
  98                             // first <thead> to the top.)  So maybe
  99                             // <thead>s that are not first should be
 100                             // turned into <tbody>? Very tricky, indeed.
 101
 102                             // access the appropriate variable, $thead or $tfoot
 103                             $var = $collection[$tag_index]->name;
 104                             if ($$var === false) {
 105                                 $$var = $collection;
 106                             } else {
 107                                 // Oops, there's a second one! What
 108                                 // should we do?  Current behavior is to
 109                                 // transmutate the first and last entries into
 110                                 // tbody tags, and then put into content.
 111                                 // Maybe a better idea is to *attach
 112                                 // it* to the existing thead or tfoot?
 113                                 // We don't do this, because Firefox
 114                                 // doesn't float an extra tfoot to the
 115                                 // bottom like it does for the first one.
 116                                 $collection[$tag_index]->name = 'tbody';
 117                                 $collection[count($collection)-1]->name = 'tbody';
 118                                 $content[] = $collection;
 119                             }
 120                             break;
 121                          case 'colgroup':
 122                             $cols[] = $collection;
 123                             break;
 124                     }
 125                     $collection = array();
 126                     $is_collecting = false;
 127                     $tag_index = 0;
 128                 } else {
 129                     // add the node to the collection
 130                     $collection[] = $token;
 131                 }
 132             }
 133
 134             // terminate
 135             if ($token === false) break;
 136
 137             if ($is_child) {
 138                 // determine what we're dealing with
 139                 if ($token->name == 'col') {
 140                     // the only empty tag in the possie, we can handle it
 141                     // immediately
 142                     $cols[] = array_merge($collection, array($token));
 143                     $collection = array();
 144                     $tag_index = 0;
 145                     continue;
 146                 }
 147                 switch($token->name) {
 148                     case 'caption':
 149                     case 'colgroup':
 150                     case 'thead':
 151                     case 'tfoot':
 152                     case 'tbody':
 153                     case 'tr':
 154                         $is_collecting = true;
 155                         $collection[] = $token;
 156                         continue;
 157                     default:
 158                         if (!empty($token->is_whitespace)) {
 159                             $collection[] = $token;
 160                             $tag_index++;
 161                         }
 162                         continue;
 163                 }
 164             }
 165         }
 166
 167         if (empty($content)) return false;
 168
 169         $ret = array();
 170         if ($caption !== false) $ret = array_merge($ret, $caption);
 171         if ($cols !== false)    foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
 172         if ($thead !== false)   $ret = array_merge($ret, $thead);
 173         if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
 174
 175         if ($tbody_mode) {
 176             // a little tricky, since the start of the collection may be
 177             // whitespace
 178             $inside_tbody = false;
 179             foreach ($content as $token_array) {
 180                 // find the starting token
 181                 foreach ($token_array as $t) {
 182                     if ($t->name === 'tr' || $t->name === 'tbody') {
 183                         break;
 184                     }
 185                 } // iterator variable carries over
 186                 if ($t->name === 'tr') {
 187                     if ($inside_tbody) {
 188                         $ret = array_merge($ret, $token_array);
 189                     } else {
 190                         $ret[] = new HTMLPurifier_Token_Start('tbody');
 191                         $ret = array_merge($ret, $token_array);
 192                         $inside_tbody = true;
 193                     }
 194                 } elseif ($t->name === 'tbody') {
 195                     if ($inside_tbody) {
 196                         $ret[] = new HTMLPurifier_Token_End('tbody');
 197                         $inside_tbody = false;
 198                         $ret = array_merge($ret, $token_array);
 199                     } else {
 200                         $ret = array_merge($ret, $token_array);
 201                     }
 202                 } else {
 203                     trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR);
 204                 }
 205             }
 206             if ($inside_tbody) {
 207                 $ret[] = new HTMLPurifier_Token_End('tbody');
 208             }
 209         } else {
 210             foreach ($content as $token_array) {
 211                 // invariant: everything in here is <tr>s
 212                 $ret = array_merge($ret, $token_array);
 213             }
 214         }
 215
 216         if (!empty($collection) && $is_collecting == false){
 217             // grab the trailing space
 218             $ret = array_merge($ret, $collection);
 219         }
 220
 221         array_pop($tokens_of_children); // remove phantom token
 222
 223         return ($ret === $tokens_of_children) ? true : $ret;
 224
 225     }
 226 }
 227
 228 // vim: et sw=4 sts=4