inc/indexer.php

   1 <?php
   2 /**
   3  * Functions to create the fulltext search index
   4  *
   5  * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   6  * @author     Andreas Gohr <andi@splitbrain.org>
   7  * @author     Tom N Harris <tnharris@whoopdedo.org>
   8  */
   9
  10 use dokuwiki\Extension\Event;
  11 use dokuwiki\Search\Indexer;
  12
  13 // Version tag used to force rebuild on upgrade
  14 define('INDEXER_VERSION', 8);
  15
  16 // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
  17 if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
  18
  19 /**
  20  * Version of the indexer taking into consideration the external tokenizer.
  21  * The indexer is only compatible with data written by the same version.
  22  *
  23  * @triggers INDEXER_VERSION_GET
  24  * Plugins that modify what gets indexed should hook this event and
  25  * add their version info to the event data like so:
  26  *     $data[$plugin_name] = $plugin_version;
  27  *
  28  * @author Tom N Harris <tnharris@whoopdedo.org>
  29  * @author Michael Hamann <michael@content-space.de>
  30  *
  31  * @return int|string
  32  */
  33 function idx_get_version(){
  34     static $indexer_version = null;
  35     if ($indexer_version == null) {
  36         $version = INDEXER_VERSION;
  37
  38         // DokuWiki version is included for the convenience of plugins
  39         $data = array('dokuwiki'=>$version);
  40         Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
  41         unset($data['dokuwiki']); // this needs to be first
  42         ksort($data);
  43         foreach ($data as $plugin=>$vers)
  44             $version .= '+'.$plugin.'='.$vers;
  45         $indexer_version = $version;
  46     }
  47     return $indexer_version;
  48 }
  49
  50 /**
  51  * Measure the length of a string.
  52  * Differs from strlen in handling of asian characters.
  53  *
  54  * @author Tom N Harris <tnharris@whoopdedo.org>
  55  *
  56  * @param string $w
  57  * @return int
  58  */
  59 function wordlen($w){
  60     $l = strlen($w);
  61     // If left alone, all chinese "words" will get put into w3.idx
  62     // So the "length" of a "word" is faked
  63     if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
  64         foreach($leadbytes[0] as $b)
  65             $l += ord($b) - 0xE1;
  66     }
  67     return $l;
  68 }
  69
  70 /**
  71  * Create an instance of the indexer.
  72  *
  73  * @return Indexer    an Indexer
  74  *
  75  * @author Tom N Harris <tnharris@whoopdedo.org>
  76  */
  77 function idx_get_indexer() {
  78     static $Indexer;
  79     if (!isset($Indexer)) {
  80         $Indexer = new Indexer();
  81     }
  82     return $Indexer;
  83 }
  84
  85 /**
  86  * Returns words that will be ignored.
  87  *
  88  * @return array                list of stop words
  89  *
  90  * @author Tom N Harris <tnharris@whoopdedo.org>
  91  */
  92 function & idx_get_stopwords() {
  93     static $stopwords = null;
  94     if (is_null($stopwords)) {
  95         global $conf;
  96         $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
  97         if(file_exists($swfile)){
  98             $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
  99         }else{
 100             $stopwords = array();
 101         }
 102     }
 103     return $stopwords;
 104 }
 105
 106 /**
 107  * Adds/updates the search index for the given page
 108  *
 109  * Locking is handled internally.
 110  *
 111  * @param string        $page   name of the page to index
 112  * @param boolean       $verbose    print status messages
 113  * @param boolean       $force  force reindexing even when the index is up to date
 114  * @return string|boolean  the function completed successfully
 115  *
 116  * @author Tom N Harris <tnharris@whoopdedo.org>
 117  */
 118 function idx_addPage($page, $verbose=false, $force=false) {
 119     $idxtag = metaFN($page,'.indexed');
 120     // check if page was deleted but is still in the index
 121     if (!page_exists($page)) {
 122         if (!file_exists($idxtag)) {
 123             if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
 124             return false;
 125         }
 126         $Indexer = idx_get_indexer();
 127         $result = $Indexer->deletePage($page);
 128         if ($result === "locked") {
 129             if ($verbose) print("Indexer: locked".DOKU_LF);
 130             return false;
 131         }
 132         @unlink($idxtag);
 133         return $result;
 134     }
 135
 136     // check if indexing needed
 137     if(!$force && file_exists($idxtag)){
 138         if(trim(io_readFile($idxtag)) == idx_get_version()){
 139             $last = @filemtime($idxtag);
 140             if($last > @filemtime(wikiFN($page))){
 141                 if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
 142                 return false;
 143             }
 144         }
 145     }
 146
 147     $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
 148     if ($indexenabled === false) {
 149         $result = false;
 150         if (file_exists($idxtag)) {
 151             $Indexer = idx_get_indexer();
 152             $result = $Indexer->deletePage($page);
 153             if ($result === "locked") {
 154                 if ($verbose) print("Indexer: locked".DOKU_LF);
 155                 return false;
 156             }
 157             @unlink($idxtag);
 158         }
 159         if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
 160         return $result;
 161     }
 162
 163     $Indexer = idx_get_indexer();
 164     $pid = $Indexer->getPID($page);
 165     if ($pid === false) {
 166         if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
 167         return false;
 168     }
 169     $body = '';
 170     $metadata = array();
 171     $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
 172     if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
 173         $metadata['relation_references'] = array_keys($references);
 174     else
 175         $metadata['relation_references'] = array();
 176
 177     if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
 178         $metadata['relation_media'] = array_keys($media);
 179     else
 180         $metadata['relation_media'] = array();
 181
 182     $data = compact('page', 'body', 'metadata', 'pid');
 183     $evt = new Event('INDEXER_PAGE_ADD', $data);
 184     if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
 185     $evt->advise_after();
 186     unset($evt);
 187     extract($data);
 188
 189     $result = $Indexer->addPageWords($page, $body);
 190     if ($result === "locked") {
 191         if ($verbose) print("Indexer: locked".DOKU_LF);
 192         return false;
 193     }
 194
 195     if ($result) {
 196         $result = $Indexer->addMetaKeys($page, $metadata);
 197         if ($result === "locked") {
 198             if ($verbose) print("Indexer: locked".DOKU_LF);
 199             return false;
 200         }
 201     }
 202
 203     if ($result)
 204         io_saveFile(metaFN($page,'.indexed'), idx_get_version());
 205     if ($verbose) {
 206         print("Indexer: finished".DOKU_LF);
 207         return true;
 208     }
 209     return $result;
 210 }
 211
 212 /**
 213  * Find tokens in the fulltext index
 214  *
 215  * Takes an array of words and will return a list of matching
 216  * pages for each one.
 217  *
 218  * Important: No ACL checking is done here! All results are
 219  *            returned, regardless of permissions
 220  *
 221  * @param array      $words  list of words to search for
 222  * @return array             list of pages found, associated with the search terms
 223  */
 224 function idx_lookup(&$words) {
 225     $Indexer = idx_get_indexer();
 226     return $Indexer->lookup($words);
 227 }
 228
 229 /**
 230  * Split a string into tokens
 231  *
 232  * @param string $string
 233  * @param bool $wc
 234  *
 235  * @return array
 236  */
 237 function idx_tokenizer($string, $wc=false) {
 238     $Indexer = idx_get_indexer();
 239     return $Indexer->tokenizer($string, $wc);
 240 }
 241
 242 /* For compatibility */
 243
 244 /**
 245  * Read the list of words in an index (if it exists).
 246  *
 247  * @author Tom N Harris <tnharris@whoopdedo.org>
 248  *
 249  * @param string $idx
 250  * @param string $suffix
 251  * @return array
 252  */
 253 function idx_getIndex($idx, $suffix) {
 254     global $conf;
 255     $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
 256     if (!file_exists($fn)) return array();
 257     return file($fn);
 258 }
 259
 260 /**
 261  * Get the list of lengths indexed in the wiki.
 262  *
 263  * Read the index directory or a cache file and returns
 264  * a sorted array of lengths of the words used in the wiki.
 265  *
 266  * @author YoBoY <yoboy.leguesh@gmail.com>
 267  *
 268  * @return array
 269  */
 270 function idx_listIndexLengths() {
 271     global $conf;
 272     // testing what we have to do, create a cache file or not.
 273     if ($conf['readdircache'] == 0) {
 274         $docache = false;
 275     } else {
 276         clearstatcache();
 277         if (file_exists($conf['indexdir'].'/lengths.idx')
 278         && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
 279             if (
 280                 ($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
 281                 !== false
 282             ) {
 283                 $idx = array();
 284                 foreach ($lengths as $length) {
 285                     $idx[] = (int)$length;
 286                 }
 287                 return $idx;
 288             }
 289         }
 290         $docache = true;
 291     }
 292
 293     if ($conf['readdircache'] == 0 || $docache) {
 294         $dir = @opendir($conf['indexdir']);
 295         if ($dir === false)
 296             return array();
 297         $idx = array();
 298         while (($f = readdir($dir)) !== false) {
 299             if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
 300                 $i = substr($f, 1, -4);
 301                 if (is_numeric($i))
 302                     $idx[] = (int)$i;
 303             }
 304         }
 305         closedir($dir);
 306         sort($idx);
 307         // save this in a file
 308         if ($docache) {
 309             $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
 310             @fwrite($handle, implode("\n", $idx));
 311             @fclose($handle);
 312         }
 313         return $idx;
 314     }
 315
 316     return array();
 317 }
 318
 319 /**
 320  * Get the word lengths that have been indexed.
 321  *
 322  * Reads the index directory and returns an array of lengths
 323  * that there are indices for.
 324  *
 325  * @author YoBoY <yoboy.leguesh@gmail.com>
 326  *
 327  * @param array|int $filter
 328  * @return array
 329  */
 330 function idx_indexLengths($filter) {
 331     global $conf;
 332     $idx = array();
 333     if (is_array($filter)) {
 334         // testing if index files exist only
 335         $path = $conf['indexdir']."/i";
 336         foreach ($filter as $key => $value) {
 337             if (file_exists($path.$key.'.idx'))
 338                 $idx[] = $key;
 339         }
 340     } else {
 341         $lengths = idx_listIndexLengths();
 342         foreach ($lengths as $key => $length) {
 343             // keep all the values equal or superior
 344             if ((int)$length >= (int)$filter)
 345                 $idx[] = $length;
 346         }
 347     }
 348     return $idx;
 349 }
 350
 351 /**
 352  * Clean a name of a key for use as a file name.
 353  *
 354  * Romanizes non-latin characters, then strips away anything that's
 355  * not a letter, number, or underscore.
 356  *
 357  * @author Tom N Harris <tnharris@whoopdedo.org>
 358  *
 359  * @param string $name
 360  * @return string
 361  */
 362 function idx_cleanName($name) {
 363     $name = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
 364     $name = preg_replace('#[ \./\\:-]+#', '_', $name);
 365     $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
 366     return strtolower($name);
 367 }
 368
 369 //Setup VIM: ex: et ts=4 :