inc/indexer.php

   1 <?php
   2 /**
   3  * Functions to create the fulltext search index
   4  *
   5  * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   6  * @author     Andreas Gohr <andi@splitbrain.org>
   7  * @author     Tom N Harris <tnharris@whoopdedo.org>
   8  */
   9 use dokuwiki\Utf8\Clean;
  10 use dokuwiki\Extension\Event;
  11 use dokuwiki\Search\Indexer;
  12
  13 // Version tag used to force rebuild on upgrade
  14 define('INDEXER_VERSION', 8);
  15
  16 // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
  17 if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);
  18
  19 /**
  20  * Version of the indexer taking into consideration the external tokenizer.
  21  * The indexer is only compatible with data written by the same version.
  22  *
  23  * @triggers INDEXER_VERSION_GET
  24  * Plugins that modify what gets indexed should hook this event and
  25  * add their version info to the event data like so:
  26  *     $data[$plugin_name] = $plugin_version;
  27  *
  28  * @author Tom N Harris <tnharris@whoopdedo.org>
  29  * @author Michael Hamann <michael@content-space.de>
  30  *
  31  * @return int|string
  32  */
  33 function idx_get_version()
  34 {
  35     static $indexer_version = null;
  36     if ($indexer_version == null) {
  37         $version = INDEXER_VERSION;
  38
  39         // DokuWiki version is included for the convenience of plugins
  40         $data = ['dokuwiki'=>$version];
  41         Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
  42         unset($data['dokuwiki']); // this needs to be first
  43         ksort($data);
  44         foreach ($data as $plugin=>$vers)
  45             $version .= '+'.$plugin.'='.$vers;
  46         $indexer_version = $version;
  47     }
  48     return $indexer_version;
  49 }
  50
  51 /**
  52  * Measure the length of a string.
  53  * Differs from strlen in handling of asian characters.
  54  *
  55  * @author Tom N Harris <tnharris@whoopdedo.org>
  56  *
  57  * @param string $w
  58  * @return int
  59  */
  60 function wordlen($w)
  61 {
  62     $l = strlen($w);
  63     // If left alone, all chinese "words" will get put into w3.idx
  64     // So the "length" of a "word" is faked
  65     if(preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
  66         foreach($leadbytes[0] as $b)
  67             $l += ord($b) - 0xE1;
  68     }
  69     return $l;
  70 }
  71
  72 /**
  73  * Create an instance of the indexer.
  74  *
  75  * @return Indexer    an Indexer
  76  *
  77  * @author Tom N Harris <tnharris@whoopdedo.org>
  78  */
  79 function idx_get_indexer()
  80 {
  81     static $Indexer;
  82     if (!isset($Indexer)) {
  83         $Indexer = new Indexer();
  84     }
  85     return $Indexer;
  86 }
  87
  88 /**
  89  * Returns words that will be ignored.
  90  *
  91  * @return array                list of stop words
  92  *
  93  * @author Tom N Harris <tnharris@whoopdedo.org>
  94  */
  95 function & idx_get_stopwords()
  96 {
  97     static $stopwords = null;
  98     if (is_null($stopwords)) {
  99         global $conf;
 100         $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
 101         if(file_exists($swfile)){
 102             $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
 103         }else{
 104             $stopwords = [];
 105         }
 106     }
 107     return $stopwords;
 108 }
 109
 110 /**
 111  * Adds/updates the search index for the given page
 112  *
 113  * Locking is handled internally.
 114  *
 115  * @param string        $page   name of the page to index
 116  * @param boolean       $verbose    print status messages
 117  * @param boolean       $force  force reindexing even when the index is up to date
 118  * @return string|boolean  the function completed successfully
 119  *
 120  * @author Tom N Harris <tnharris@whoopdedo.org>
 121  */
 122 function idx_addPage($page, $verbose = false, $force = false)
 123 {
 124     $idxtag = metaFN($page, '.indexed');
 125     // check if page was deleted but is still in the index
 126     if (!page_exists($page)) {
 127         if (!file_exists($idxtag)) {
 128             if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
 129             return false;
 130         }
 131         $Indexer = idx_get_indexer();
 132         $result = $Indexer->deletePage($page);
 133         if ($result === "locked") {
 134             if ($verbose) print("Indexer: locked".DOKU_LF);
 135             return false;
 136         }
 137         @unlink($idxtag);
 138         return $result;
 139     }
 140
 141     // check if indexing needed
 142     if(!$force && file_exists($idxtag)){
 143         if(trim(io_readFile($idxtag)) == idx_get_version()){
 144             $last = @filemtime($idxtag);
 145             if($last > @filemtime(wikiFN($page))){
 146                 if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
 147                 return false;
 148             }
 149         }
 150     }
 151
 152     $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
 153     if ($indexenabled === false) {
 154         $result = false;
 155         if (file_exists($idxtag)) {
 156             $Indexer = idx_get_indexer();
 157             $result = $Indexer->deletePage($page);
 158             if ($result === "locked") {
 159                 if ($verbose) print("Indexer: locked".DOKU_LF);
 160                 return false;
 161             }
 162             @unlink($idxtag);
 163         }
 164         if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
 165         return $result;
 166     }
 167
 168     $Indexer = idx_get_indexer();
 169     $pid = $Indexer->getPID($page);
 170     if ($pid === false) {
 171         if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
 172         return false;
 173     }
 174     $body = '';
 175     $metadata = [];
 176     $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
 177     if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
 178         $metadata['relation_references'] = array_keys($references);
 179     else
 180         $metadata['relation_references'] = [];
 181
 182     if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
 183         $metadata['relation_media'] = array_keys($media);
 184     else
 185         $metadata['relation_media'] = [];
 186
 187     $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
 188     $evt = new Event('INDEXER_PAGE_ADD', $data);
 189     if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
 190     $evt->advise_after();
 191     unset($evt);
 192     extract($data);
 193
 194     $result = $Indexer->addPageWords($page, $body);
 195     if ($result === "locked") {
 196         if ($verbose) print("Indexer: locked".DOKU_LF);
 197         return false;
 198     }
 199
 200     if ($result) {
 201         $result = $Indexer->addMetaKeys($page, $metadata);
 202         if ($result === "locked") {
 203             if ($verbose) print("Indexer: locked".DOKU_LF);
 204             return false;
 205         }
 206     }
 207
 208     if ($result)
 209         io_saveFile(metaFN($page, '.indexed'), idx_get_version());
 210     if ($verbose) {
 211         print("Indexer: finished".DOKU_LF);
 212         return true;
 213     }
 214     return $result;
 215 }
 216
 217 /**
 218  * Find tokens in the fulltext index
 219  *
 220  * Takes an array of words and will return a list of matching
 221  * pages for each one.
 222  *
 223  * Important: No ACL checking is done here! All results are
 224  *            returned, regardless of permissions
 225  *
 226  * @param array      $words  list of words to search for
 227  * @return array             list of pages found, associated with the search terms
 228  */
 229 function idx_lookup(&$words)
 230 {
 231     $Indexer = idx_get_indexer();
 232     return $Indexer->lookup($words);
 233 }
 234
 235 /**
 236  * Split a string into tokens
 237  *
 238  * @param string $string
 239  * @param bool $wc
 240  *
 241  * @return array
 242  */
 243 function idx_tokenizer($string, $wc = false)
 244 {
 245     $Indexer = idx_get_indexer();
 246     return $Indexer->tokenizer($string, $wc);
 247 }
 248
 249 /* For compatibility */
 250
 251 /**
 252  * Read the list of words in an index (if it exists).
 253  *
 254  * @author Tom N Harris <tnharris@whoopdedo.org>
 255  *
 256  * @param string $idx
 257  * @param string $suffix
 258  * @return array
 259  */
 260 function idx_getIndex($idx, $suffix)
 261 {
 262     global $conf;
 263     $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
 264     if (!file_exists($fn)) return [];
 265     return file($fn);
 266 }
 267
 268 /**
 269  * Get the list of lengths indexed in the wiki.
 270  *
 271  * Read the index directory or a cache file and returns
 272  * a sorted array of lengths of the words used in the wiki.
 273  *
 274  * @author YoBoY <yoboy.leguesh@gmail.com>
 275  *
 276  * @return array
 277  */
 278 function idx_listIndexLengths()
 279 {
 280     global $conf;
 281     // testing what we have to do, create a cache file or not.
 282     if ($conf['readdircache'] == 0) {
 283         $docache = false;
 284     } else {
 285         clearstatcache();
 286         if (file_exists($conf['indexdir'].'/lengths.idx')
 287         && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
 288             if (
 289                 ($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
 290                 !== false
 291             ) {
 292                 $idx = [];
 293                 foreach ($lengths as $length) {
 294                     $idx[] = (int)$length;
 295                 }
 296                 return $idx;
 297             }
 298         }
 299         $docache = true;
 300     }
 301
 302     if ($conf['readdircache'] == 0 || $docache) {
 303         $dir = @opendir($conf['indexdir']);
 304         if ($dir === false)
 305             return [];
 306         $idx = [];
 307         while (($f = readdir($dir)) !== false) {
 308             if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
 309                 $i = substr($f, 1, -4);
 310                 if (is_numeric($i))
 311                     $idx[] = (int)$i;
 312             }
 313         }
 314         closedir($dir);
 315         sort($idx);
 316         // save this in a file
 317         if ($docache) {
 318             $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
 319             @fwrite($handle, implode("\n", $idx));
 320             @fclose($handle);
 321         }
 322         return $idx;
 323     }
 324
 325     return [];
 326 }
 327
 328 /**
 329  * Get the word lengths that have been indexed.
 330  *
 331  * Reads the index directory and returns an array of lengths
 332  * that there are indices for.
 333  *
 334  * @author YoBoY <yoboy.leguesh@gmail.com>
 335  *
 336  * @param array|int $filter
 337  * @return array
 338  */
 339 function idx_indexLengths($filter)
 340 {
 341     global $conf;
 342     $idx = [];
 343     if (is_array($filter)) {
 344         // testing if index files exist only
 345         $path = $conf['indexdir']."/i";
 346         foreach (array_keys($filter) as $key) {
 347             if (file_exists($path.$key.'.idx'))
 348                 $idx[] = $key;
 349         }
 350     } else {
 351         $lengths = idx_listIndexLengths();
 352         foreach ($lengths as $length) {
 353             // keep all the values equal or superior
 354             if ((int)$length >= (int)$filter)
 355                 $idx[] = $length;
 356         }
 357     }
 358     return $idx;
 359 }
 360
 361 /**
 362  * Clean a name of a key for use as a file name.
 363  *
 364  * Romanizes non-latin characters, then strips away anything that's
 365  * not a letter, number, or underscore.
 366  *
 367  * @author Tom N Harris <tnharris@whoopdedo.org>
 368  *
 369  * @param string $name
 370  * @return string
 371  */
 372 function idx_cleanName($name)
 373 {
 374     $name = Clean::romanize(trim((string)$name));
 375     $name = preg_replace('#[ \./\\:-]+#', '_', $name);
 376     $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
 377     return strtolower($name);
 378 }
 379
 380 //Setup VIM: ex: et ts=4 :