phpmyadmin/libraries/plugins/import/ImportMediawiki.class.php

   1 <?php
   2 /* vim: set expandtab sw=4 ts=4 sts=4: */
   3 /**
   4  * MediaWiki import plugin for phpMyAdmin
   5  *
   6  * @package    PhpMyAdmin-Import
   7  * @subpackage MediaWiki
   8  */
   9 if (! defined('PHPMYADMIN')) {
  10     exit;
  11 }
  12
  13 /* Get the import interface */
  14 require_once 'libraries/plugins/ImportPlugin.class.php';
  15
  16 /**
  17  * Handles the import for the MediaWiki format
  18  *
  19  * @package    PhpMyAdmin-Import
  20  * @subpackage MediaWiki
  21  */
  22 class ImportMediawiki extends ImportPlugin
  23 {
  24     /**
  25      * Whether to analyze tables
  26      *
  27      * @var bool
  28      */
  29     private $_analyze;
  30
  31     /**
  32      * Constructor
  33      */
  34     public function __construct()
  35     {
  36         $this->setProperties();
  37     }
  38
  39     /**
  40      * Sets the import plugin properties.
  41      * Called in the constructor.
  42      *
  43      * @return void
  44      */
  45     protected function setProperties()
  46     {
  47         $this->_setAnalyze(false);
  48         if ($GLOBALS['plugin_param'] !== 'table') {
  49             $this->_setAnalyze(true);
  50         }
  51
  52         $props = 'libraries/properties/';
  53         include_once "$props/plugins/ImportPluginProperties.class.php";
  54
  55         $importPluginProperties = new ImportPluginProperties();
  56         $importPluginProperties->setText(__('MediaWiki Table'));
  57         $importPluginProperties->setExtension('txt');
  58         $importPluginProperties->setMimeType('text/plain');
  59         $importPluginProperties->setOptions(array());
  60         $importPluginProperties->setOptionsText(__('Options'));
  61
  62         $this->properties = $importPluginProperties;
  63     }
  64
  65     /**
  66      * This method is called when any PluginManager to which the observer
  67      * is attached calls PluginManager::notify()
  68      *
  69      * @param SplSubject $subject The PluginManager notifying the observer
  70      *                            of an update.
  71      *
  72      * @return void
  73      */
  74     public function update (SplSubject $subject)
  75     {
  76     }
  77
  78     /**
  79      * Handles the whole import logic
  80      *
  81      * @return void
  82      */
  83     public function doImport()
  84     {
  85         global $error, $timeout_passed, $finished;
  86
  87         // Defaults for parser
  88
  89         // The buffer that will be used to store chunks read from the imported file
  90         $buffer = '';
  91
  92         // Used as storage for the last part of the current chunk data
  93         // Will be appended to the first line of the next chunk, if there is one
  94         $last_chunk_line = '';
  95
  96         // Remembers whether the current buffer line is part of a comment
  97         $inside_comment = false;
  98         // Remembers whether the current buffer line is part of a data comment
  99         $inside_data_comment = false;
 100         // Remembers whether the current buffer line is part of a structure comment
 101         $inside_structure_comment = false;
 102
 103         // MediaWiki only accepts "\n" as row terminator
 104         $mediawiki_new_line = "\n";
 105
 106         // Initialize the name of the current table
 107         $cur_table_name = "";
 108
 109         while (! $finished && ! $error && ! $timeout_passed ) {
 110             $data = PMA_importGetNextChunk();
 111
 112             if ($data === false) {
 113                 // Subtract data we didn't handle yet and stop processing
 114                 $offset -= strlen($buffer);
 115                 break;
 116             } elseif ($data === true) {
 117                 // Handle rest of buffer
 118             } else {
 119                 // Append new data to buffer
 120                 $buffer = $data;
 121                 unset($data);
 122                 // Don't parse string if we're not at the end
 123                 // and don't have a new line inside
 124                 if ( strpos($buffer, $mediawiki_new_line) === false ) {
 125                     continue;
 126                 }
 127             }
 128
 129             // Because of reading chunk by chunk, the first line from the buffer
 130             // contains only a portion of an actual line from the imported file.
 131             // Therefore, we have to append it to the last line from the previous
 132             // chunk. If we are at the first chunk, $last_chunk_line should be empty.
 133             $buffer = $last_chunk_line . $buffer;
 134
 135             // Process the buffer line by line
 136             $buffer_lines = explode($mediawiki_new_line, $buffer);
 137
 138             $full_buffer_lines_count = count($buffer_lines);
 139             // If the reading is not finalised, the final line of the current chunk
 140             // will not be complete
 141             if (! $finished) {
 142                 $full_buffer_lines_count -= 1;
 143                 $last_chunk_line = $buffer_lines[$full_buffer_lines_count];
 144             }
 145
 146             for ($line_nr = 0; $line_nr < $full_buffer_lines_count; ++ $line_nr) {
 147                 $cur_buffer_line = trim($buffer_lines[$line_nr]);
 148
 149                 // If the line is empty, go to the next one
 150                 if ( $cur_buffer_line === '' ) {
 151                     continue;
 152                 }
 153
 154                 $first_character = $cur_buffer_line[0];
 155                 $matches = array();
 156
 157                 // Check beginnning of comment
 158                 if (! strcmp(substr($cur_buffer_line, 0, 4), "<!--")) {
 159                     $inside_comment = true;
 160                     continue;
 161                 } elseif ($inside_comment) {
 162                     // Check end of comment
 163                     if (! strcmp(substr($cur_buffer_line, 0, 4), "-->")) {
 164                         // Only data comments are closed. The structure comments
 165                         // will be closed when a data comment begins (in order to
 166                         // skip structure tables)
 167                         if ($inside_data_comment) {
 168                             $inside_data_comment = false;
 169                         }
 170
 171                         // End comments that are not related to table structure
 172                         if (! $inside_structure_comment) {
 173                             $inside_comment = false;
 174                         }
 175                     } else {
 176                         // Check table name
 177                         $match_table_name = array();
 178                         if (preg_match(
 179                             "/^Table data for `(.*)`$/",
 180                             $cur_buffer_line,
 181                             $match_table_name
 182                         )
 183                         ) {
 184                             $cur_table_name = $match_table_name[1];
 185                             $inside_data_comment = true;
 186
 187                             // End ignoring structure rows
 188                             if ($inside_structure_comment) {
 189                                 $inside_structure_comment = false;
 190                             }
 191                         } elseif (preg_match(
 192                             "/^Table structure for `(.*)`$/",
 193                             $cur_buffer_line,
 194                             $match_table_name
 195                         )
 196                         ) {
 197                             // The structure comments will be ignored
 198                             $inside_structure_comment = true;
 199                         }
 200                     }
 201                     continue;
 202                 } elseif (preg_match('/^\{\|(.*)$/', $cur_buffer_line, $matches)) {
 203                     // Check start of table
 204
 205                     // This will store all the column info on all rows from
 206                     // the current table read from the buffer
 207                     $cur_temp_table = array();
 208
 209                     // Will be used as storage for the current row in the buffer
 210                     // Once all its columns are read, it will be added to
 211                     // $cur_temp_table and then it will be emptied
 212                     $cur_temp_line = array();
 213
 214                     // Helps us differentiate the header columns
 215                     // from the normal columns
 216                     $in_table_header = false;
 217                     // End processing because the current line does not
 218                     // contain any column information
 219                 } elseif (substr($cur_buffer_line, 0, 2) === '|-'
 220                       || substr($cur_buffer_line, 0, 2) === '|+'
 221                       || substr($cur_buffer_line, 0, 2) === '|}'
 222                 ) {
 223                     // Check begin row or end table
 224
 225                     // Add current line to the values storage
 226                     if (! empty($cur_temp_line)) {
 227                         // If the current line contains header cells
 228                         // ( marked with '!' ),
 229                         // it will be marked as table header
 230                         if ( $in_table_header ) {
 231                             // Set the header columns
 232                             $cur_temp_table_headers = $cur_temp_line;
 233                         } else {
 234                             // Normal line, add it to the table
 235                             $cur_temp_table [] = $cur_temp_line;
 236                         }
 237                     }
 238
 239                     // Empty the temporary buffer
 240                     $cur_temp_line = array();
 241
 242                     // No more processing required at the end of the table
 243                     if (substr($cur_buffer_line, 0, 2) === '|}') {
 244                         $current_table = array(
 245                             $cur_table_name,
 246                             $cur_temp_table_headers,
 247                             $cur_temp_table
 248                         );
 249
 250                         // Import the current table data into the database
 251                         $this->_importDataOneTable($current_table);
 252
 253                         // Reset table name
 254                         $cur_table_name = "";
 255                     }
 256                     // What's after the row tag is now only attributes
 257
 258                 } elseif (($first_character === '|') || ($first_character === '!')) {
 259                     // Check cell elements
 260
 261                     // Header cells
 262                     if ($first_character === '!') {
 263                         // Mark as table header, but treat as normal row
 264                         $cur_buffer_line = str_replace('!!', '||', $cur_buffer_line);
 265                         // Will be used to set $cur_temp_line as table header
 266                         $in_table_header = true;
 267                     } else {
 268                         $in_table_header = false;
 269                     }
 270
 271                     // Loop through each table cell
 272                     $cells = $this->_explodeMarkup($cur_buffer_line);
 273                     foreach ($cells as $cell) {
 274                         // A cell could contain both parameters and data
 275                         $cell_data = explode('|', $cell, 2);
 276
 277                         // A '|' inside an invalid link should not
 278                         // be mistaken as delimiting cell parameters
 279                         if (strpos($cell_data[0], '[[') === true ) {
 280                             if (count($cell_data) == 1) {
 281                                 $cell = $cell_data[0];
 282                             } else {
 283                                 $cell = $cell_data[1];
 284                             }
 285                         }
 286
 287                         // Delete the beginning of the column, if there is one
 288                         $cell = trim($cell);
 289                         $col_start_chars = array( "|", "!");
 290                         foreach ($col_start_chars as $col_start_char) {
 291                             if (strpos($cell, $col_start_char) === 0) {
 292                                 $cell = trim(substr($cell, 1));
 293                             }
 294                         }
 295
 296                         // Add the cell to the row
 297                         $cur_temp_line [] = $cell;
 298                     } // foreach $cells
 299                 } else {
 300                     // If it's none of the above, then the current line has a bad
 301                     // format
 302                     $message = PMA_Message::error(
 303                         __('Invalid format of mediawiki input on line: <br />%s.')
 304                     );
 305                     $message->addParam($cur_buffer_line);
 306                     $error = true;
 307                 }
 308             } // End treating full buffer lines
 309         } // while - finished parsing buffer
 310     }
 311
 312     /**
 313      * Imports data from a single table
 314      *
 315      * @param array $table containing all table info:
 316      *        <code>
 317      *            $table[0] - string containing table name
 318      *            $table[1] - array[]   of table headers
 319      *            $table[2] - array[][] of table content rows
 320      *        </code>
 321      *
 322      * @global bool  $analyze whether to scan for column types
 323      *
 324      * @return void
 325      */
 326     private function _importDataOneTable ($table)
 327     {
 328         $analyze = $this->_getAnalyze();
 329         if ($analyze) {
 330             // Set the table name
 331             $this->_setTableName($table[0]);
 332
 333             // Set generic names for table headers if they don't exist
 334             $this->_setTableHeaders($table[1], $table[2][0]);
 335
 336             // Create the tables array to be used in PMA_buildSQL()
 337             $tables = array();
 338             $tables [] = array($table[0], $table[1], $table[2]);
 339
 340             // Obtain the best-fit MySQL types for each column
 341             $analyses = array();
 342             $analyses [] = PMA_analyzeTable($tables[0]);
 343
 344             $this->_executeImportTables($tables, $analyses);
 345         }
 346
 347         // Commit any possible data in buffers
 348         PMA_importRunQuery();
 349     }
 350
 351     /**
 352      * Sets the table name
 353      *
 354      * @param string &$table_name reference to the name of the table
 355      *
 356      * @return void
 357      */
 358     private function _setTableName(&$table_name)
 359     {
 360         if (empty($table_name)) {
 361             $result = PMA_DBI_fetch_result('SHOW TABLES');
 362             // todo check if the name below already exists
 363             $table_name = 'TABLE '.(count($result) + 1);
 364         }
 365     }
 366
 367     /**
 368      * Set generic names for table headers, if they don't exist
 369      *
 370      * @param array &$table_headers reference to the array containing the headers
 371      *                              of a table
 372      * @param array $table_row      array containing the first content row
 373      *
 374      * @return void
 375      */
 376     private function _setTableHeaders(&$table_headers, $table_row)
 377     {
 378         if (empty($table_headers)) {
 379             // The first table row should contain the number of columns
 380             // If they are not set, generic names will be given (COL 1, COL 2, etc)
 381             $num_cols = count($table_row);
 382             for ($i = 0; $i < $num_cols; ++ $i) {
 383                 $table_headers [$i] = 'COL '. ($i + 1);
 384             }
 385         }
 386     }
 387
 388     /**
 389      * Sets the database name and additional options and calls PMA_buildSQL()
 390      * Used in PMA_importDataAllTables() and $this->_importDataOneTable()
 391      *
 392      * @param array &$tables   structure:
 393      *              array(
 394      *                  array(table_name, array() column_names, array()() rows)
 395      *              )
 396      * @param array &$analyses structure:
 397      *              $analyses = array(
 398      *                  array(array() column_types, array() column_sizes)
 399      *              )
 400      *
 401      * @global string $db name of the database to import in
 402      *
 403      * @return void
 404      */
 405     private function _executeImportTables(&$tables, &$analyses)
 406     {
 407         global $db;
 408
 409         // $db_name : The currently selected database name, if applicable
 410         //            No backquotes
 411         // $options : An associative array of options
 412         if (strlen($db)) {
 413             $db_name = $db;
 414             $options = array('create_db' => false);
 415         } else {
 416             $db_name = 'mediawiki_DB';
 417             $options = null;
 418         }
 419
 420         // Array of SQL strings
 421         // Non-applicable parameters
 422         $create = null;
 423
 424         // Create and execute necessary SQL statements from data
 425         PMA_buildSQL($db_name, $tables, $analyses, $create, $options);
 426
 427         unset($tables);
 428         unset($analyses);
 429     }
 430
 431
 432     /**
 433      * Replaces all instances of the '||' separator between delimiters
 434      * in a given string
 435      *
 436      * @param string $start_delim start delimiter
 437      * @param string $end_delim   end delimiter
 438      * @param string $replace     the string to be replaced with
 439      * @param string $subject     the text to be replaced
 440      *
 441      * @return string with replacements
 442      */
 443     private function _delimiterReplace($start_delim, $end_delim, $replace, $subject)
 444     {
 445         // String that will be returned
 446         $cleaned = "";
 447         // Possible states of current character
 448         $inside_tag = false;
 449         $inside_attribute = false;
 450         // Attributes can be declared with either " or '
 451         $start_attribute_character = false;
 452
 453         // The full separator is "||";
 454         // This rembembers if the previous character was '|'
 455         $partial_separator = false;
 456
 457         // Parse text char by char
 458         for ($i = 0; $i < strlen($subject); $i ++) {
 459             $cur_char = $subject[$i];
 460             // Check for separators
 461             if ($cur_char == '|') {
 462                 // If we're not inside a tag, then this is part of a real separator,
 463                 // so we append it to the current segment
 464                 if (! $inside_attribute) {
 465                     $cleaned .= $cur_char;
 466                     if ($partial_separator) {
 467                         $inside_tag = false;
 468                         $inside_attribute = false;
 469                     }
 470                 } elseif ($partial_separator) {
 471                     // If we are inside a tag, we replace the current char with
 472                     // the placeholder and append that to the current segment
 473                     $cleaned .= $replace;
 474                 }
 475
 476                 // If the previous character was also '|', then this ends a
 477                 // full separator. If not, this may be the beginning of one
 478                 $partial_separator = ! $partial_separator;
 479             } else {
 480                 // If we're inside a tag attribute and the current character is
 481                 // not '|', but the previous one was, it means that the single '|'
 482                 // was not appended, so we append it now
 483                 if ($partial_separator && $inside_attribute) {
 484                     $cleaned .= "|";
 485                 }
 486                 // If the char is different from "|", no separator can be formed
 487                 $partial_separator = false;
 488
 489                 // any other character should be appended to the current segment
 490                 $cleaned .= $cur_char;
 491
 492                 if ($cur_char == '<' && ! $inside_attribute) {
 493                     // start of a tag
 494                     $inside_tag = true;
 495                 } elseif ($cur_char == '>' && ! $inside_attribute) {
 496                     // end of a tag
 497                     $inside_tag = false;
 498                 } elseif (($cur_char == '"' || $cur_char == "'") && $inside_tag) {
 499                     // start or end of an attribute
 500                     if (! $inside_attribute) {
 501                         $inside_attribute = true;
 502                         // remember the attribute`s declaration character (" or ')
 503                         $start_attribute_character = $cur_char;
 504                     } else {
 505                         if ($cur_char == $start_attribute_character) {
 506                             $inside_attribute = false;
 507                             // unset attribute declaration character
 508                             $start_attribute_character = false;
 509                         }
 510                     }
 511                 }
 512             }
 513         } // end for each character in $subject
 514
 515         return $cleaned;
 516     }
 517
 518     /**
 519      * Separates a string into items, similarly to explode
 520      * Uses the '||' separator (which is standard in the mediawiki format)
 521      * and ignores any instances of it inside markup tags
 522      * Used in parsing buffer lines containing data cells
 523      *
 524      * @param string $text text to be split
 525      *
 526      * @return array
 527      */
 528     private function _explodeMarkup($text)
 529     {
 530         $separator = "||";
 531         $placeholder = "\x00";
 532
 533         // Remove placeholder instances
 534         $text = str_replace($placeholder, '', $text);
 535
 536         // Replace instances of the separator inside HTML-like
 537         // tags with the placeholder
 538         $cleaned = $this->_delimiterReplace("<", ">", $placeholder, $text);
 539         // Explode, then put the replaced separators back in
 540         $items = explode($separator, $cleaned);
 541         foreach ($items as $i => $str) {
 542             $items[$i] = str_replace($placeholder, $separator, $str);
 543         }
 544
 545         return $items;
 546     }
 547
 548
 549     /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */
 550
 551
 552     /**
 553      * Returns true if the table should be analyzed, false otherwise
 554      *
 555      * @return bool
 556      */
 557     private function _getAnalyze()
 558     {
 559         return $this->_analyze;
 560     }
 561
 562     /**
 563      * Sets to true if the table should be analyzed, false otherwise
 564      *
 565      * @param bool $analyze status
 566      *
 567      * @return void
 568      */
 569     private function _setAnalyze($analyze)
 570     {
 571         $this->_analyze = $analyze;
 572     }
 573 }