3 * Functions to create the fulltext search index
5 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author Andreas Gohr <andi@splitbrain.org>
7 * @author Tom N Harris <tnharris@whoopdedo.org>
9 use dokuwiki\Utf8\Clean
;
10 use dokuwiki\Extension\Event
;
11 use dokuwiki\Search\Indexer
;
// idx_get_version() starts its version string with this value.
13 // Version tag used to force rebuild on upgrade
14 define('INDEXER_VERSION', 8);
16 // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
// Only a default: the defined() guard leaves an already defined value untouched.
17 if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);
20 * Version of the indexer taking into consideration the external tokenizer.
21 * The indexer is only compatible with data written by the same version.
23 * @triggers INDEXER_VERSION_GET
24 * Plugins that modify what gets indexed should hook this event and
25 * add their version info to the event data like so:
26 * $data[$plugin_name] = $plugin_version;
28 * @author Tom N Harris <tnharris@whoopdedo.org>
29 * @author Michael Hamann <michael@content-space.de>
// Builds the indexer version string: INDEXER_VERSION followed by one
// "+<name>=<version>" suffix per entry found in the INDEXER_VERSION_GET event data.
// The finished string is cached in a static variable and returned.
33 function idx_get_version()
35 static $indexer_version = null;
// loose comparison against null: true while the static still holds its initial value
36 if ($indexer_version == null) {
37 $version = INDEXER_VERSION
;
39 // DokuWiki version is included for the convenience of plugins
40 $data = ['dokuwiki'=>$version];
41 Event
::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
// the core entry is removed again, so only the remaining entries are appended below
42 unset($data['dokuwiki']); // this needs to be first
44 foreach ($data as $plugin=>$vers)
45 $version .= '+'.$plugin.'='.$vers;
46 $indexer_version = $version;
48 return $indexer_version;
52 * Measure the length of a string.
53 * Differs from strlen in handling of Asian characters.
55 * @author Tom N Harris <tnharris@whoopdedo.org>
63 // If left alone, all chinese "words" will get put into w3.idx
64 // So the "length" of a "word" is faked
65 if(preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
66 foreach($leadbytes[0] as $b)
73 * Create an instance of the indexer.
75 * @return Indexer an Indexer
77 * @author Tom N Harris <tnharris@whoopdedo.org>
// Lazily creates the Indexer: a new instance is constructed only while $Indexer is unset.
// NOTE(review): presumably $Indexer is a static so the instance is reused between
// calls - confirm against the variable's declaration.
79 function idx_get_indexer()
82 if (!isset($Indexer)) {
83 $Indexer = new Indexer();
89 * Returns words that will be ignored.
91 * @return array list of stop words
93 * @author Tom N Harris <tnharris@whoopdedo.org>
// Loads the stop word list of the configured language and keeps it in a static
// variable; the file is only consulted while that static is still null.
// The function is declared to return by reference (note the & in the signature).
95 function & idx_get_stopwords()
97 static $stopwords = null;
98 if (is_null($stopwords)) {
// language specific list: inc/lang/<lang>/stopwords.txt below DOKU_INC
100 $swfile = DOKU_INC
.'inc/lang/'.$conf['lang'].'/stopwords.txt';
101 if(file_exists($swfile)){
// one array entry per line of the file, line endings stripped
102 $stopwords = file($swfile, FILE_IGNORE_NEW_LINES
);
111 * Adds/updates the search index for the given page
113 * Locking is handled internally.
115 * @param string $page name of the page to index
116 * @param boolean $verbose print status messages
117 * @param boolean $force force reindexing even when the index is up to date
118 * @return string|boolean the function completed successfully
120 * @author Tom N Harris <tnharris@whoopdedo.org>
// Adds or refreshes the index entries of a single page.
// Steps visible below: handle pages that no longer exist, skip pages whose index is
// current (unless $force), honour the per-page 'internal index' switch, collect title
// and relation metadata, trigger INDEXER_PAGE_ADD, then write words and metadata keys
// through the Indexer and stamp the page with the current indexer version.
// With $verbose set, progress and failure messages are printed.
122 function idx_addPage($page, $verbose = false, $force = false)
// marker file of the page; it holds the indexer version written at the end of this function
124 $idxtag = metaFN($page, '.indexed');
125 // check if page was deleted but is still in the index
126 if (!page_exists($page)) {
127 if (!file_exists($idxtag)) {
128 if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF
);
// the page is gone but still known to the index: remove its entries
131 $Indexer = idx_get_indexer();
132 $result = $Indexer->deletePage($page);
133 if ($result === "locked") {
134 if ($verbose) print("Indexer: locked".DOKU_LF
);
141 // check if indexing needed
142 if(!$force && file_exists($idxtag)){
// the index counts as current when the marker stores the current indexer version
// and the marker file is newer than the page file
143 if(trim(io_readFile($idxtag)) == idx_get_version()){
144 $last = @filemtime
($idxtag);
145 if($last > @filemtime
(wikiFN($page))){
146 if ($verbose) print("Indexer: index for $page up to date".DOKU_LF
);
// a metadata value of exactly false under 'internal index' disables indexing for this page
152 $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED
);
153 if ($indexenabled === false) {
// an existing marker means the page was indexed before: delete those entries
155 if (file_exists($idxtag)) {
156 $Indexer = idx_get_indexer();
157 $result = $Indexer->deletePage($page);
158 if ($result === "locked") {
159 if ($verbose) print("Indexer: locked".DOKU_LF
);
164 if ($verbose) print("Indexer: index disabled for $page".DOKU_LF
);
168 $Indexer = idx_get_indexer();
169 $pid = $Indexer->getPID($page);
170 if ($pid === false) {
171 if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF
);
// metadata passed to the indexer: the title plus the array keys of the
// 'relation references' and 'relation media' metadata (or empty lists)
176 $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED
);
177 if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED
)) !== null)
178 $metadata['relation_references'] = array_keys($references);
180 $metadata['relation_references'] = [];
182 if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED
)) !== null)
183 $metadata['relation_media'] = array_keys($media);
185 $metadata['relation_media'] = [];
// event payload for INDEXER_PAGE_ADD: page id, body text, metadata and pid
187 $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
188 $evt = new Event('INDEXER_PAGE_ADD', $data);
// default action, taken when advise_before() returns true: append the raw wiki text
189 if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
190 $evt->advise_after();
// NOTE(review): $body and $metadata are used from here on instead of $data[...];
// verify that they are refreshed from $data after the event has run
194 $result = $Indexer->addPageWords($page, $body);
195 if ($result === "locked") {
196 if ($verbose) print("Indexer: locked".DOKU_LF
);
201 $result = $Indexer->addMetaKeys($page, $metadata);
202 if ($result === "locked") {
203 if ($verbose) print("Indexer: locked".DOKU_LF
);
// store the indexer version in the marker file read by the "indexing needed" check above
209 io_saveFile(metaFN($page, '.indexed'), idx_get_version());
211 print("Indexer: finished".DOKU_LF
);
218 * Find tokens in the fulltext index
220 * Takes an array of words and will return a list of matching
221 * pages for each one.
223 * Important: No ACL checking is done here! All results are
224 * returned, regardless of permissions
226 * @param array $words list of words to search for
227 * @return array list of pages found, associated with the search terms
function idx_lookup(&$words)
{
    // Pure delegation: the indexer obtained from idx_get_indexer() performs the lookup
    // and its result is passed back unchanged. No permission filtering happens here.
    return idx_get_indexer()->lookup($words);
}
236 * Split a string into tokens
238 * @param string $string
function idx_tokenizer($string, $wc = false)
{
    // Pure delegation: both arguments are forwarded as given to the tokenizer of the
    // indexer obtained from idx_get_indexer(), and its result is returned as is.
    return idx_get_indexer()->tokenizer($string, $wc);
}
249 /* For compatibility */
252 * Read the list of words in an index (if it exists).
254 * @author Tom N Harris <tnharris@whoopdedo.org>
257 * @param string $suffix
// Locates the index file "<indexdir>/<idx><suffix>.idx"; when that file does not
// exist an empty array is returned right away.
260 function idx_getIndex($idx, $suffix)
263 $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
264 if (!file_exists($fn)) return [];
269 * Get the list of lengths indexed in the wiki.
271 * Reads the index directory or a cache file and returns
272 * a sorted array of lengths of the words used in the wiki.
274 * @author YoBoY <yoboy.leguesh@gmail.com>
// Collects the lengths for which index files named "i<length>.idx" exist, either from
// the cache file "lengths.idx" or by scanning the index directory.
// $conf['readdircache'] steers the caching; a value of 0 is tested separately below.
278 function idx_listIndexLengths()
281 // testing what we have to do, create a cache file or not.
282 if ($conf['readdircache'] == 0) {
// the cache file is usable while the current time is still below its modification
// time plus $conf['readdircache'] (both sides are Unix timestamps, so seconds)
286 if (file_exists($conf['indexdir'].'/lengths.idx')
287 && (time() < @filemtime
($conf['indexdir'].'/lengths.idx') +
$conf['readdircache'])) {
// read the cached lengths, one per line, ignoring empty lines
289 ($lengths = @file
($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES
))
293 foreach ($lengths as $length) {
294 $idx[] = (int)$length;
// scan the directory when caching is off (0) or when $docache is set
302 if ($conf['readdircache'] == 0 ||
$docache) {
303 $dir = @opendir
($conf['indexdir']);
307 while (($f = readdir($dir)) !== false) {
// only files that start with "i" and end in ".idx" are considered
308 if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
// the part between the leading "i" and the ".idx" extension
309 $i = substr($f, 1, -4);
316 // save this in a file
318 $handle = @fopen
($conf['indexdir'].'/lengths.idx', 'w');
// written as one length per line, the format read back by the file() call above
319 @fwrite
($handle, implode("\n", $idx));
329 * Get the word lengths that have been indexed.
331 * Reads the index directory and returns an array of lengths
332 * that there are indices for.
334 * @author YoBoY <yoboy.leguesh@gmail.com>
336 * @param array|int $filter
// Two modes, selected by the type of $filter:
//  - array: its keys are taken as lengths and "<indexdir>/i<key>.idx" is tested for existence
//  - otherwise: every length listed by idx_listIndexLengths() is compared with (int)$filter
339 function idx_indexLengths($filter)
343 if (is_array($filter)) {
344 // testing if index files exist only
345 $path = $conf['indexdir']."/i";
346 foreach (array_keys($filter) as $key) {
347 if (file_exists($path.$key.'.idx'))
351 $lengths = idx_listIndexLengths();
352 foreach ($lengths as $length) {
353 // keep all the values equal or superior
354 if ((int)$length >= (int)$filter)
362 * Clean a name of a key for use as a file name.
364 * Romanizes non-latin characters, then strips away anything that's
365 * not a letter, number, or underscore.
367 * @author Tom N Harris <tnharris@whoopdedo.org>
369 * @param string $name
function idx_cleanName($name)
{
    // Transliterate the trimmed input first; the filters below only let ASCII through.
    $latin = Clean::romanize(trim((string)$name));

    // Two passes, applied in array order:
    //  1. every run of separators (space . / : -) collapses into a single underscore
    //  2. whatever is not an ASCII letter, digit or underscore is removed
    // NOTE(review): inside the single-quoted literal, '\\' is one backslash, so the
    // class holds an escaped colon and not a literal backslash. Backslashes in $name
    // are therefore dropped by pass 2 instead of becoming '_' - confirm this is intended.
    $patterns = ['#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'];
    $replacements = ['_', ''];

    return strtolower(preg_replace($patterns, $replacements, $latin));
}
380 //Setup VIM: ex: et ts=4 :