Merge pull request #3024 from splitbrain/cookieupdate
[dokuwiki.git] / inc / indexer.php
blobab02b8ea2f5dfbf8cfbfe0f77d83c0b22efd8536
1 <?php
2 /**
3 * Functions to create the fulltext search index
5 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author Andreas Gohr <andi@splitbrain.org>
7 * @author Tom N Harris <tnharris@whoopdedo.org>
8 */
10 use dokuwiki\Extension\Event;
11 use dokuwiki\Search\Indexer;
// Bumping this tag forces the index to be rebuilt after an upgrade
define('INDEXER_VERSION', 8);

// Minimum token length used in the index (numeric tokens are exempt);
// only set here when nothing else has defined it yet
defined('IDX_MINWORDLENGTH') || define('IDX_MINWORDLENGTH', 2);
/**
 * Build the version tag of the indexer, including plugin contributions.
 *
 * Index data is only compatible with the version that wrote it, so the
 * tag is stored next to indexed pages and compared on later runs.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * The result is computed once per request and kept in a static variable.
 * Without plugin entries the plain INDEXER_VERSION value is returned;
 * otherwise "+name=version" is appended for every entry, sorted by name.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string
 */
function idx_get_version(){
    static $cached = null;
    if ($cached != null) {
        return $cached;
    }

    // DokuWiki version is included for the convenience of plugins
    $data = array('dokuwiki' => INDEXER_VERSION);
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // the core version always leads the tag, so keep it out of the sorted part
    unset($data['dokuwiki']);
    ksort($data);

    $tag = INDEXER_VERSION;
    foreach ($data as $plugin => $vers) {
        $tag .= '+'.$plugin.'='.$vers;
    }

    $cached = $tag;
    return $cached;
}
/**
 * Measure the length of a string.
 *
 * Starts from the byte length and adds a bonus for every byte in the
 * range 0xE2-0xEF (the byte value minus 0xE1). Without this, all chinese
 * "words" would end up in the same length bucket (w3.idx), so their
 * "length" is deliberately faked.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $length = strlen($w);

    // nothing to adjust when no byte falls into the 0xE2-0xEF range
    if (!preg_match_all('/[\xE2-\xEF]/', $w, $matches)) {
        return $length;
    }

    foreach ($matches[0] as $leadByte) {
        $length += ord($leadByte) - 0xE1;
    }
    return $length;
}
/**
 * Get the shared indexer instance.
 *
 * The Indexer object is created on first use and the same instance is
 * handed out on every later call.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $instance = null;
    if ($instance === null) {
        $instance = new Indexer();
    }
    return $instance;
}
/**
 * Returns words that will be ignored.
 *
 * Reads inc/lang/<lang>/stopwords.txt for the configured language, one
 * word per line. The list is loaded once and kept in a static variable;
 * the function returns a reference to that static list.
 *
 * An empty list is returned when the file does not exist or cannot be read.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $stopwords = array();
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() yields false on a read failure; never cache that as the list
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps, in order:
 *  - a page that no longer exists is removed from the index (if it was indexed)
 *  - unless forced, a page whose '.indexed' tag matches the current indexer
 *    version and is newer than the page file is skipped
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise the page text and its title/references/media metadata are
 *    handed to the indexer; the INDEXER_PAGE_ADD event lets plugins change
 *    what gets indexed
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string  $page    name of the page to index
 * @param boolean $verbose print status messages
 * @param boolean $force   force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was indexed or the index was
 *                         locked; the indexer's result when a page was removed
 *                         or added; always true after a verbose indexing run
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');

    // page was deleted: drop it from the index if it is still in there
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) {
                print("Indexer: $page does not exist, ignoring".DOKU_LF);
            }
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) {
                print("Indexer: locked".DOKU_LF);
            }
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // skip pages whose index tag is current and newer than the page itself
    if (
        !$force
        && file_exists($idxtag)
        && trim(io_readFile($idxtag)) == idx_get_version()
        && @filemtime($idxtag) > @filemtime(wikiFN($page))
    ) {
        if ($verbose) {
            print("Indexer: index for $page up to date".DOKU_LF);
        }
        return false;
    }

    // indexing explicitly disabled for this page: make sure it is not indexed
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) {
                    print("Indexer: locked".DOKU_LF);
                }
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) {
            print("Indexer: index disabled for $page".DOKU_LF);
        }
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) {
            print("Indexer: getting the PID failed for $page".DOKU_LF);
        }
        return false;
    }

    // collect what is going to be indexed
    $body = '';
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata = array(
        'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
        'relation_references' => ($references !== null) ? array_keys($references) : array(),
        'relation_media' => ($media !== null) ? array_keys($media) : array(),
    );

    // let plugins adjust page, body, metadata and pid; the raw wiki text is
    // appended to the body unless a handler prevented the default action
    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) {
        $data['body'] = $data['body'] . " " . rawWiki($page);
    }
    $evt->advise_after();
    unset($evt);
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) {
            print("Indexer: locked".DOKU_LF);
        }
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) {
                print("Indexer: locked".DOKU_LF);
            }
            return false;
        }
    }

    // remember which indexer version wrote this page's index data
    if ($result) {
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    }

    if (!$verbose) {
        return $result;
    }
    print("Indexer: finished".DOKU_LF);
    return true;
}
/**
 * Find tokens in the fulltext index
 *
 * Hands the given words to the shared indexer's lookup() and returns the
 * matching pages for each of them.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array $words list of words to search for
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    return idx_get_indexer()->lookup($words);
}
/**
 * Split a string into tokens
 *
 * Delegates to the tokenizer() method of the shared indexer.
 *
 * @param string $string text to split
 * @param bool   $wc     passed through to the indexer's tokenizer unchanged
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}
242 /* For compatibility */
/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx. Lines are returned as
 * read, including their trailing newline characters. An empty array is
 * returned when the file does not exist or cannot be read.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx    name of the index
 * @param string $suffix subpart identifier appended to the index name
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();

    // file() yields false on a read failure; callers are promised an array
    $lines = file($fn);
    return ($lines === false) ? array() : $lines;
}
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is non-zero, the list is cached in
 * <indexdir>/lengths.idx and reused for that many seconds. A cache file
 * that cannot be written is skipped silently; the freshly read list is
 * still returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    $docache = ($conf['readdircache'] != 0);

    // serve from the cache file while it is still fresh
    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // collect the numeric part of every i<number>.idx file in the index directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return array();
    }
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fopen() returns false when the file cannot be opened; passing that
        // to fwrite()/fclose() raises a TypeError on PHP 8
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }

    return $idx;
}
/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * With an array filter, its keys are taken as candidate lengths and only
 * those with an existing i<length>.idx file are returned. With any other
 * filter, all known lengths greater than or equal to it are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $found = array();

    if (!is_array($filter)) {
        // keep every known length that is at least the requested minimum
        $minimum = (int)$filter;
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length < $minimum) {
                continue;
            }
            $found[] = $length;
        }
        return $found;
    }

    // only test which of the requested index files exist
    $prefix = $conf['indexdir']."/i";
    foreach (array_keys($filter) as $wordlength) {
        if (file_exists($prefix.$wordlength.'.idx')) {
            $found[] = $wordlength;
        }
    }
    return $found;
}
/**
 * Clean a name of a key for use as a file name.
 *
 * The name is cast to string, trimmed and romanized. Runs of spaces,
 * dots, slashes, colons and hyphens then collapse into one underscore,
 * everything that is not a letter, number, or underscore is removed,
 * and the result is lowercased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $romanized = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));

    // NOTE(review): in this single-quoted pattern "\\:" reaches PCRE as an
    // escaped colon, so a literal backslash is not part of the class; it is
    // removed by the second replacement instead of becoming an underscore.
    $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);

    return strtolower(preg_replace('/[^A-Za-z0-9_]/', '', $underscored));
}
369 //Setup VIM: ex: et ts=4 :