inc/indexer.php

   1 <?php
   2
   3 /**
   4  * Functions to create the fulltext search index
   5  *
   6  * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   7  * @author     Andreas Gohr <andi@splitbrain.org>
   8  * @author     Tom N Harris <tnharris@whoopdedo.org>
   9  */
  10
  11 use dokuwiki\Utf8\Clean;
  12 use dokuwiki\Extension\Event;
  13 use dokuwiki\Search\Indexer;
  14
  15 // Version tag used to force rebuild on upgrade
  16 define('INDEXER_VERSION', 8);
  17
  18 // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
  19 if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);
  20
  21 /**
  22  * Version of the indexer taking into consideration the external tokenizer.
  23  * The indexer is only compatible with data written by the same version.
  24  *
  25  * @triggers INDEXER_VERSION_GET
  26  * Plugins that modify what gets indexed should hook this event and
  27  * add their version info to the event data like so:
  28  *     $data[$plugin_name] = $plugin_version;
  29  *
  30  * @author Tom N Harris <tnharris@whoopdedo.org>
  31  * @author Michael Hamann <michael@content-space.de>
  32  *
  33  * @return int|string
  34  */
  35 function idx_get_version()
  36 {
  37     static $indexer_version = null;
  38     if ($indexer_version == null) {
  39         $version = INDEXER_VERSION;
  40
  41         // DokuWiki version is included for the convenience of plugins
  42         $data = ['dokuwiki' => $version];
  43         Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
  44         unset($data['dokuwiki']); // this needs to be first
  45         ksort($data);
  46         foreach ($data as $plugin => $vers)
  47             $version .= '+' . $plugin . '=' . $vers;
  48         $indexer_version = $version;
  49     }
  50     return $indexer_version;
  51 }
  52
  53 /**
  54  * Measure the length of a string.
  55  * Differs from strlen in handling of asian characters.
  56  *
  57  * @author Tom N Harris <tnharris@whoopdedo.org>
  58  *
  59  * @param string $w
  60  * @return int
  61  */
  62 function wordlen($w)
  63 {
  64     $l = strlen($w);
  65     // If left alone, all chinese "words" will get put into w3.idx
  66     // So the "length" of a "word" is faked
  67     if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
  68         foreach ($leadbytes[0] as $b)
  69             $l += ord($b) - 0xE1;
  70     }
  71     return $l;
  72 }
  73
  74 /**
  75  * Create an instance of the indexer.
  76  *
  77  * @return Indexer    an Indexer
  78  *
  79  * @author Tom N Harris <tnharris@whoopdedo.org>
  80  */
  81 function idx_get_indexer()
  82 {
  83     static $Indexer;
  84     if (!isset($Indexer)) {
  85         $Indexer = new Indexer();
  86     }
  87     return $Indexer;
  88 }
  89
  90 /**
  91  * Returns words that will be ignored.
  92  *
  93  * @return array                list of stop words
  94  *
  95  * @author Tom N Harris <tnharris@whoopdedo.org>
  96  */
  97 function & idx_get_stopwords()
  98 {
  99     static $stopwords = null;
 100     if (is_null($stopwords)) {
 101         global $conf;
 102         $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
 103         if (file_exists($swfile)) {
 104             $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
 105         } else {
 106             $stopwords = [];
 107         }
 108     }
 109     return $stopwords;
 110 }
 111
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps, in order:
 *  - a page that no longer exists is removed from the index (if it has an
 *    index tag file) and its tag file is deleted
 *  - unless $force is set, a page whose tag file holds the current indexer
 *    version and is newer than the page file is skipped
 *  - a page whose 'internal index' metadata is exactly false is removed from
 *    the index instead of being indexed
 *  - otherwise the page text and the title / relation_references /
 *    relation_media metadata are handed to the Indexer, and on success the
 *    tag file is written with the current indexer version
 *
 * @triggers INDEXER_PAGE_ADD with the keys page, body, metadata and pid
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was indexed (page missing, index up to
 *                         date, PID lookup failed or the index was locked); otherwise
 *                         the result of the last Indexer call. When $verbose is set
 *                         and the end of the function is reached, true is returned
 *                         regardless of that result.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    // marker file in the meta directory; its content is the indexer version used
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
        // only drop the marker once the page is actually out of the index
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        // the marker must have been written by the same indexer version ...
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            // ... and must be newer than the page file itself
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
                return false;
            }
        }
    }

    // only an explicit false disables indexing; any other value (including null) indexes the page
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        // a marker means the page was indexed earlier, so remove it from the index
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) echo "Indexer: locked" . DOKU_LF;
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        return false;
    }
    // collect the data to index: the body starts out empty, metadata holds the
    // title and the keys of the 'relation references' / 'relation media' entries
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
    if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_references'] = array_keys($references);
    else $metadata['relation_references'] = [];

    if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_media'] = array_keys($media);
    else $metadata['relation_media'] = [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    // the raw wiki text is appended to the body only when advise_before() returns true
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    // copies every entry of $data into a local variable of the same name
    // ($page, $body, $metadata, $pid) -- presumably so that changes made by
    // event handlers take effect below; TODO confirm.
    // NOTE(review): any additional key in $data would overwrite a local of that
    // name as well (e.g. $Indexer, $verbose, $result).
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) echo "Indexer: locked" . DOKU_LF;
        return false;
    }

    // metadata keys are only indexed when the page words were added successfully
    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
    }

    // record the indexer version so the up-to-date check above can skip this page next time
    if ($result)
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        // NOTE(review): verbose mode reports true here even when $result is falsy -- confirm callers rely on this
        return true;
    }
    return $result;
}
 216
 217 /**
 218  * Find tokens in the fulltext index
 219  *
 220  * Takes an array of words and will return a list of matching
 221  * pages for each one.
 222  *
 223  * Important: No ACL checking is done here! All results are
 224  *            returned, regardless of permissions
 225  *
 226  * @param array      $words  list of words to search for
 227  * @return array             list of pages found, associated with the search terms
 228  */
 229 function idx_lookup(&$words)
 230 {
 231     $Indexer = idx_get_indexer();
 232     return $Indexer->lookup($words);
 233 }
 234
 235 /**
 236  * Split a string into tokens
 237  *
 238  * @param string $string
 239  * @param bool $wc
 240  *
 241  * @return array
 242  */
 243 function idx_tokenizer($string, $wc = false)
 244 {
 245     $Indexer = idx_get_indexer();
 246     return $Indexer->tokenizer($string, $wc);
 247 }
 248
 249 /* For compatibility */
 250
 251 /**
 252  * Read the list of words in an index (if it exists).
 253  *
 254  * @author Tom N Harris <tnharris@whoopdedo.org>
 255  *
 256  * @param string $idx
 257  * @param string $suffix
 258  * @return array
 259  */
 260 function idx_getIndex($idx, $suffix)
 261 {
 262     global $conf;
 263     $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
 264     if (!file_exists($fn)) return [];
 265     return file($fn);
 266 }
 267
 268 /**
 269  * Get the list of lengths indexed in the wiki.
 270  *
 271  * Read the index directory or a cache file and returns
 272  * a sorted array of lengths of the words used in the wiki.
 273  *
 274  * @author YoBoY <yoboy.leguesh@gmail.com>
 275  *
 276  * @return array
 277  */
 278 function idx_listIndexLengths()
 279 {
 280     global $conf;
 281     // testing what we have to do, create a cache file or not.
 282     if ($conf['readdircache'] == 0) {
 283         $docache = false;
 284     } else {
 285         clearstatcache();
 286         if (
 287             file_exists($conf['indexdir'] . '/lengths.idx')
 288             && (time() < @filemtime($conf['indexdir'] . '/lengths.idx') + $conf['readdircache'])
 289         ) {
 290             if (
 291                 ($lengths = @file($conf['indexdir'] . '/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
 292                 !== false
 293             ) {
 294                 $idx = [];
 295                 foreach ($lengths as $length) {
 296                     $idx[] = (int)$length;
 297                 }
 298                 return $idx;
 299             }
 300         }
 301         $docache = true;
 302     }
 303
 304     if ($conf['readdircache'] == 0 || $docache) {
 305         $dir = @opendir($conf['indexdir']);
 306         if ($dir === false)
 307             return [];
 308         $idx = [];
 309         while (($f = readdir($dir)) !== false) {
 310             if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
 311                 $i = substr($f, 1, -4);
 312                 if (is_numeric($i))
 313                     $idx[] = (int)$i;
 314             }
 315         }
 316         closedir($dir);
 317         sort($idx);
 318         // save this in a file
 319         if ($docache) {
 320             $handle = @fopen($conf['indexdir'] . '/lengths.idx', 'w');
 321             @fwrite($handle, implode("\n", $idx));
 322             @fclose($handle);
 323         }
 324         return $idx;
 325     }
 326
 327     return [];
 328 }
 329
 330 /**
 331  * Get the word lengths that have been indexed.
 332  *
 333  * Reads the index directory and returns an array of lengths
 334  * that there are indices for.
 335  *
 336  * @author YoBoY <yoboy.leguesh@gmail.com>
 337  *
 338  * @param array|int $filter
 339  * @return array
 340  */
 341 function idx_indexLengths($filter)
 342 {
 343     global $conf;
 344     $idx = [];
 345     if (is_array($filter)) {
 346         // testing if index files exist only
 347         $path = $conf['indexdir'] . "/i";
 348         foreach (array_keys($filter) as $key) {
 349             if (file_exists($path . $key . '.idx'))
 350                 $idx[] = $key;
 351         }
 352     } else {
 353         $lengths = idx_listIndexLengths();
 354         foreach ($lengths as $length) {
 355             // keep all the values equal or superior
 356             if ((int)$length >= (int)$filter)
 357                 $idx[] = $length;
 358         }
 359     }
 360     return $idx;
 361 }
 362
 363 /**
 364  * Clean a name of a key for use as a file name.
 365  *
 366  * Romanizes non-latin characters, then strips away anything that's
 367  * not a letter, number, or underscore.
 368  *
 369  * @author Tom N Harris <tnharris@whoopdedo.org>
 370  *
 371  * @param string $name
 372  * @return string
 373  */
 374 function idx_cleanName($name)
 375 {
 376     $name = Clean::romanize(trim((string)$name));
 377     $name = preg_replace('#[ \./\\:-]+#', '_', $name);
 378     $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
 379     return strtolower($name);
 380 }
 381
 382 //Setup VIM: ex: et ts=4 :