code style: line breaks
[dokuwiki.git] / inc / indexer.php
blob31b092e67da5899a014cf53ee3bb2257ce5ada21
1 <?php
3 /**
4 * Functions to create the fulltext search index
6 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
7 * @author Andreas Gohr <andi@splitbrain.org>
8 * @author Tom N Harris <tnharris@whoopdedo.org>
9 */
11 use dokuwiki\Utf8\Clean;
12 use dokuwiki\Extension\Event;
13 use dokuwiki\Search\Indexer;
// Version tag of the index format; changing it forces a rebuild on upgrade
define('INDEXER_VERSION', 8);

// Minimum token length to use in the index (note, this doesn't apply to numeric tokens).
// Only set here when it has not been defined already.
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}
/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The value starts with INDEXER_VERSION. Every entry left in the event data
 * after the INDEXER_VERSION_GET event (apart from the 'dokuwiki' entry) is
 * appended as "+name=version", sorted by name. The result is kept in a
 * static variable, so the event is only triggered while that cache is empty.
 *
 * @triggers INDEXER_VERSION_GET
 *           Plugins that modify what gets indexed should hook this event and
 *           add their version info to the event data like so:
 *               $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string plain INDEXER_VERSION when nothing was added, otherwise a string
 */
function idx_get_version()
{
    static $indexer_version = null;

    // serve the cached value (loose comparison, exactly as before)
    if ($indexer_version != null) {
        return $indexer_version;
    }

    // DokuWiki version is included for the convenience of plugins
    $eventData = ['dokuwiki' => INDEXER_VERSION];
    Event::createAndTrigger('INDEXER_VERSION_GET', $eventData, null, false);

    // the core version always leads the tag, so it must not take part in the sorting
    unset($eventData['dokuwiki']);
    ksort($eventData);

    $tag = INDEXER_VERSION;
    foreach ($eventData as $name => $value) {
        $tag .= '+' . $name . '=' . $value;
    }

    $indexer_version = $tag;
    return $indexer_version;
}
/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length plus a bonus for every byte in the range
 * 0xE2-0xEF (in valid UTF-8 these are lead bytes of three-byte sequences).
 * The bonus is the byte value minus 0xE1, i.e. between 1 and 14.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $length = strlen($w);

    // If left alone, all chinese "words" will get put into w3.idx,
    // so the "length" of such a "word" is faked.
    // The pattern has no /u modifier: it matches single bytes on purpose.
    $hits = preg_match_all('/[\xE2-\xEF]/', $w, $found);
    if (!$hits) {
        return $length;
    }

    foreach ($found[0] as $leadByte) {
        $length += ord($leadByte) - 0xE1;
    }
    return $length;
}
/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and kept in a static variable;
 * later calls return that same object.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer = null;

    if ($Indexer === null) {
        $Indexer = new Indexer();
    }

    return $Indexer;
}
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<lang>/stopwords.txt below DOKU_INC, where
 * <lang> is $conf['lang'], one entry per line. It is loaded once and then
 * kept in a static variable; the function returns a reference to that list.
 * A missing or unreadable file results in an empty list.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

        $stopwords = [];
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false when reading fails; never cache that,
            // callers expect an array
            if (is_array($lines)) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Outline:
 *  - a page that no longer exists is removed from the index (if it was indexed)
 *  - unless $force is set, a page whose '.indexed' tag holds the current
 *    indexer version and is newer than the page file is skipped
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise the INDEXER_PAGE_ADD event is triggered and the resulting body
 *    and metadata are handed to the indexer
 *
 * @triggers INDEXER_PAGE_ADD with the keys 'page', 'body', 'metadata' and 'pid'
 *
 * @param string  $page    name of the page to index
 * @param boolean $verbose print status messages
 * @param boolean $force   force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was done, the index was locked or
 *                         the PID lookup failed; the result of the indexer's
 *                         deletePage() when the page was removed; otherwise the
 *                         result of the indexing calls. In verbose mode a
 *                         finished run always returns true.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');

    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        // loose comparison on purpose: the version may be an int, the tag file holds a string
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
                return false;
            }
        }
    }

    // indexing explicitly disabled for this page: drop it from the index
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) echo "Indexer: locked" . DOKU_LF;
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        return false;
    }

    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only the keys are indexed; anything that is not an array counts as "no entries"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over only the fields that are used below. A blanket extract() would
    // allow the event data to overwrite any local variable of this function
    // (e.g. $Indexer or $verbose).
    if (array_key_exists('page', $data)) $page = $data['page'];
    if (array_key_exists('body', $data)) $body = $data['body'];
    if (array_key_exists('metadata', $data)) $metadata = $data['metadata'];

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) echo "Indexer: locked" . DOKU_LF;
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
    }

    // remember which indexer version wrote the data; $page may have been
    // changed by the event, so the tag file name is computed again
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        return true;
    }
    return $result;
}
/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one. The work is delegated to the lookup() method
 * of the shared indexer instance.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array $words list of words to search for (passed by reference)
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    return idx_get_indexer()->lookup($words);
}
/**
 * Split a string into tokens
 *
 * Both arguments are handed unchanged to the tokenizer() method of the
 * shared indexer instance.
 *
 * @param string $string
 * @param bool   $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    return idx_get_indexer()->tokenizer($string, $wc);
}
249 /* For compatibility */
/**
 * Read the list of words in an index (if it exists).
 *
 * The index is the file "<idx><suffix>.idx" in $conf['indexdir']. Its lines
 * are returned as they are, including their line endings. A missing or
 * unreadable file results in an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];

    // file() returns false when the file cannot be read; keep the array contract
    $lines = file($fn);
    return is_array($lines) ? $lines : [];
}
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * With $conf['readdircache'] set to a non-zero value the list is cached in
 * "lengths.idx" inside $conf['indexdir'] and reused for that many seconds.
 * Otherwise, or when the cache is missing, too old or unreadable, the
 * directory is scanned for files named "i<number>.idx".
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array sorted list of integer lengths, empty when the directory cannot be opened
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }

    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            // the part between the leading "i" and ".idx" has to be a number
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        // Writing the cache is best effort. The handle has to be checked:
        // handing false to fwrite()/fclose() throws a TypeError on PHP 8,
        // which the @ operator does not suppress.
        $handle = @fopen($cachefile, 'w');
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }

    return $idx;
}
/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * Two modes, depending on the type of $filter:
 *  - array: its keys are taken as lengths; those for which a file
 *    "i<length>.idx" exists in $conf['indexdir'] are returned
 *  - anything else: it is cast to int and used as a minimum; all lengths
 *    reported by idx_listIndexLengths() that are equal or greater are returned
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;

    if (!is_array($filter)) {
        // keep all the values equal or superior
        $atLeast = function ($length) use ($filter) {
            return (int)$length >= (int)$filter;
        };
        return array_values(array_filter(idx_listIndexLengths(), $atLeast));
    }

    // testing if index files exist only
    $prefix = $conf['indexdir'] . "/i";
    $hasIndexFile = function ($wordLength) use ($prefix) {
        return file_exists($prefix . $wordLength . '.idx');
    };
    return array_values(array_filter(array_keys($filter), $hasIndexFile));
}
/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore. The result is lower case.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $romanized = Clean::romanize(trim((string)$name));

    // Runs of space, dot, slash, colon and dash collapse into one underscore.
    // Note: in this single quoted string "\\:" becomes the regex "\:", an escaped
    // colon, so a backslash is not part of this class; it is removed by the next step.
    $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);

    // drop whatever is still not a letter, digit or underscore
    $stripped = preg_replace('/[^A-Za-z0-9_]/', '', $underscored);

    return strtolower($stripped);
}
382 //Setup VIM: ex: et ts=4 :