Merge branch 'MDL-41565-master' of git://github.com/FMCorz/moodle
[moodle.git] / lib / searchlib.php
blobc9512e98fdcd494d4d726a421e3fe9b6c4e38308
1 <?php
3 // This file is part of Moodle - http://moodle.org/
4 //
5 // Moodle is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation, either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // Moodle is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License
16 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
18 /**
19 * @package core
20 * @subpackage search
21 * @copyright 1999 onwards Martin Dougiamas {@link http://moodle.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
25 defined('MOODLE_INTERNAL') || die();
27 /** @see lexer.php */
28 require_once($CFG->libdir.'/lexer.php');
30 /** Constants for the various types of tokens. Each value is a numeric string; search_parser passes one as the $type of every search_token it emits, and the SQL generators switch on it. */
32 define("TOKEN_USER","0");
33 define("TOKEN_META","1");
34 define("TOKEN_EXACT","2");
35 define("TOKEN_NEGATE","3");
36 define("TOKEN_STRING","4");
37 define("TOKEN_USERID","5");
38 define("TOKEN_DATEFROM","6");
39 define("TOKEN_DATETO","7");
40 define("TOKEN_INSTANCE","8");
/**
 * Class to hold token/value pairs after they're parsed.
 *
 * @package   moodlecore
 * @copyright 1999 onwards Martin Dougiamas {@link http://moodle.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class search_token {
    /** @var string Token text after it has been passed through sanitize(). */
    private $value;

    /** @var string Token type; callers in this file pass one of the TOKEN_* constants. */
    private $type;

    /**
     * Store the token type together with the sanitised token text.
     *
     * A real __construct() is required: PHP 8 no longer treats a method named
     * after the class as a constructor, which would leave both properties unset.
     *
     * @param string $type  Token type (one of the TOKEN_* constants).
     * @param string $value Raw token text as typed by the user.
     */
    public function __construct($type, $value) {
        $this->type = $type;
        $this->value = $this->sanitize($value);
    }

    /**
     * Legacy PHP 4 style constructor, kept so that code calling it explicitly
     * keeps working. It simply forwards to __construct().
     *
     * @param string $type  Token type (one of the TOKEN_* constants).
     * @param string $value Raw token text as typed by the user.
     */
    public function search_token($type, $value) {
        self::__construct($type, $value);
    }

    /**
     * Try to clean up user input to avoid potential security issues.
     *
     * Currently this only converts HTML special characters to entities.
     *
     * @param string $userstring Raw user input.
     * @return string The input with HTML special characters escaped.
     */
    public function sanitize($userstring) {
        return htmlspecialchars($userstring);
    }

    /**
     * @return string The sanitised token text.
     */
    public function getValue() {
        return $this->value;
    }

    /**
     * @return string The token type given at construction time.
     */
    public function getType() {
        return $this->type;
    }
}
/**
 * This class does the heavy lifting of lexing the search string into tokens.
 * Using a full-blown lexer is probably overkill for this application, but
 * might be useful for other tasks.
 *
 * @package   moodlecore
 * @copyright 1999 onwards Martin Dougiamas {@link http://moodle.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class search_lexer extends Lexer {

    /**
     * Set up the state machine and the pattern matches for its transitions.
     *
     * Every state is entered from the base "accept" state. The state names
     * registered here match the handler method names of search_parser.
     *
     * A real __construct() is required: PHP 8 no longer treats a method named
     * after the class as a constructor, so without it no pattern would ever be
     * registered.
     *
     * @param object $parser Handler object for the lexer states (presumably a
     *                       search_parser - confirm against callers).
     */
    public function __construct(&$parser) {
        // Initialise the parent through its class-named method, exactly as the
        // original constructor did; lexer.php is not visible from here, so no
        // assumption is made about the parent offering __construct().
        $this->Lexer($parser);

        // Constructs that start with a fixed prefix (or a +/- sign) and run up
        // to the next whitespace. Order matters: "userid:" must be registered
        // before "user:", and all of them before the catch-all plain string.
        $whitespaceterminated = array(
            // datefrom:foo - lower bound for the time field.
            array("datefrom:\S+", "indatefrom"),
            // dateto:foo - upper bound for the time field.
            array("dateto:\S+", "indateto"),
            // instance:foo - instance number.
            array("instance:\S+", "ininstance"),
            // userid:foo - user id.
            array("userid:\S+", "inuserid"),
            // user:foo - user name.
            array("user:\S+", "inusername"),
            // subject:foo - meta field search (handled by the inmeta state).
            array("subject:\S+", "inmeta"),
            // +foo - required exact match.
            array("\+\S+", "inrequired"),
            // -foo - excluded string.
            array("\-\S+", "inexcluded"),
        );

        foreach ($whitespaceterminated as $definition) {
            list($pattern, $state) = $definition;
            // Enter the state when the pattern is seen in the base accept state...
            $this->addEntryPattern($pattern, "accept", $state);
            // ...and drop back to the accept state at the next whitespace.
            $this->addExitPattern("\s", $state);
        }

        // Quoted strings: grab everything after the opening quote and leave
        // the state again when the closing quote is seen.
        $this->addEntryPattern("\"[^\"]+", "accept", "inquotedstring");
        $this->addExitPattern("\"", "inquotedstring");

        // Ordinary, non-quoted words: snarf non-whitespace until whitespace
        // is seen again, then re-enter the base accept state.
        $this->addEntryPattern("\S+", "accept", "plainstring");
        $this->addExitPattern("\s", "plainstring");
    }

    /**
     * Legacy PHP 4 style constructor, kept so that code calling it explicitly
     * keeps working. It simply forwards to __construct().
     *
     * @param object $parser Handler object for the lexer states.
     */
    public function search_lexer(&$parser) {
        self::__construct($parser);
    }
}
/**
 * This class takes care of sticking the proper token type/value pairs into
 * the parsed token array.
 * Most functions in this class should only be called by the lexer, the
 * one exception being get_parsed_array() which returns the result.
 *
 * @package   moodlecore
 * @copyright 1999 onwards Martin Dougiamas {@link http://moodle.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class search_parser {
    /**
     * @var search_token[] Tokens emitted so far. Starts as an empty array so
     *      that get_parsed_array() never hands null to code that count()s it.
     */
    private $tokens = array();

    /**
     * This function is called by the code that's interested in the result of
     * the parse operation.
     *
     * @return search_token[] The tokens emitted so far (empty if none).
     */
    public function get_parsed_array() {
        return $this->tokens;
    }

    /**
     * Emit a token for content of the form "<prefix><parameter>".
     *
     * Nothing is emitted when the content is not longer than the prefix, which
     * is the case on state exit or when the parameter is missing.
     *
     * @param string $content      Text handed over by the lexer.
     * @param int    $prefixlength Number of leading bytes to strip off.
     * @param string $type         TOKEN_* type of the emitted token.
     * @return void
     */
    private function add_prefixed_token($content, $prefixlength, $type) {
        if (strlen($content) <= $prefixlength) {
            return;
        }
        $this->tokens[] = new search_token($type, trim(substr($content, $prefixlength)));
    }

    /*
     * Functions below this are part of the state machine for the parse
     * operation and should not be called directly.
     */

    /**
     * Base state. No output emitted.
     *
     * @return bool Always true.
     */
    public function accept() {
        return true;
    }

    /**
     * State for handling datefrom:foo constructs. Potentially emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function indatefrom($content) {
        // "datefrom:" is 9 characters long.
        $this->add_prefixed_token($content, 9, TOKEN_DATEFROM);
        return true;
    }

    /**
     * State for handling dateto:foo constructs. Potentially emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function indateto($content) {
        // "dateto:" is 7 characters long.
        $this->add_prefixed_token($content, 7, TOKEN_DATETO);
        return true;
    }

    /**
     * State for handling instance:foo constructs. Potentially emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function ininstance($content) {
        // "instance:" is 9 characters long.
        $this->add_prefixed_token($content, 9, TOKEN_INSTANCE);
        return true;
    }

    /**
     * State for handling userid:foo constructs. Potentially emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function inuserid($content) {
        // "userid:" is 7 characters long.
        $this->add_prefixed_token($content, 7, TOKEN_USERID);
        return true;
    }

    /**
     * State for handling user:foo constructs. Potentially emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function inusername($content) {
        // "user:" is 5 characters long.
        $this->add_prefixed_token($content, 5, TOKEN_USER);
        return true;
    }

    /**
     * State for handling subject:foo constructs (the meta field search).
     * Potentially emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function inmeta($content) {
        // "subject:" is 8 characters long.
        $this->add_prefixed_token($content, 8, TOKEN_META);
        return true;
    }

    /**
     * State entered when we've seen a required string (+foo). Potentially
     * emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function inrequired($content) {
        // Shorter than "+x": state exit or missing parameter, don't emit.
        if (strlen($content) >= 2) {
            // Strip off the + sign and add the remainder to the parsed token array.
            $this->tokens[] = new search_token(TOKEN_EXACT, substr($content, 1));
        }
        return true;
    }

    /**
     * State entered when we've seen an excluded string (-foo). Potentially
     * emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function inexcluded($content) {
        // Shorter than "-x": state exit or missing parameter, don't emit.
        if (strlen($content) >= 2) {
            // Strip off the - sign and add the remainder to the parsed token array.
            $this->tokens[] = new search_token(TOKEN_NEGATE, substr($content, 1));
        }
        return true;
    }

    /**
     * State entered when we've seen a quoted string. Potentially emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function inquotedstring($content) {
        // A lone quote character means state exit or an empty string.
        if (strlen($content) >= 2) {
            // Strip off the opening quote and add the remainder to the parsed token array.
            $this->tokens[] = new search_token(TOKEN_STRING, substr($content, 1));
        }
        return true;
    }

    /**
     * State entered when we've seen an ordinary, non-quoted word. Potentially
     * emits a token.
     *
     * @param string $content Text handed over by the lexer.
     * @return bool Always true.
     */
    public function plainstring($content) {
        // Whitespace-only content signals the state exit; nothing to emit.
        if (trim($content) !== '') {
            // Add the string to the parsed token array.
            $this->tokens[] = new search_token(TOKEN_STRING, $content);
        }
        return true;
    }
}
/**
 * Primitive function to generate a SQL string from a parse tree
 * using TEXT indexes. If searches aren't suitable to use TEXT
 * this function calls the default search_generate_SQL() one.
 *
 * $parsetree should be a parse tree generated by a
 * search_lexer/search_parser combination.
 * Other fields are database table names to search.
 *
 * @param search_token[]|null $parsetree Tokens to turn into SQL; null or an empty array yields "".
 * @param string $datafield          Field searched with MATCH ... AGAINST for string/exact/negated tokens.
 * @param string $metafield          Field searched for meta tokens; also added to the data MATCH when not empty.
 * @param string $mainidfield        Passed through to search_generate_SQL().
 * @param string $useridfield        Passed through to search_generate_SQL().
 * @param string $userfirstnamefield Passed through to search_generate_SQL().
 * @param string $userlastnamefield  Passed through to search_generate_SQL().
 * @param string $timefield          Passed through to search_generate_SQL().
 * @param string $instancefield      Passed through to search_generate_SQL().
 * @return array|string array($sql, $params) on success; "" when there are no
 *                      tokens or an unknown token type is met.
 */
function search_generate_text_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
                                  $userfirstnamefield, $userlastnamefield, $timefield, $instancefield) {
    global $DB;
    static $p = 0;

    // Nothing to search for. Checked first (and with empty() rather than
    // count()) so that a null parse tree cannot raise a TypeError on PHP 8;
    // every later path returned "" for an empty tree anyway.
    if (empty($parsetree)) {
        return "";
    }

    // First of all, search for reasons to switch to standard SQL generation.
    // Only MySQL is supported for now.
    if ($DB->get_dbfamily() != 'mysql') {
        return search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
                                   $userfirstnamefield, $userlastnamefield, $timefield, $instancefield);
    }

    // Some languages don't have "word separators" and MySQL FULLTEXT doesn't
    // perform well with them, so switch to standard SQL search generation.
    $nonseparatedlangs = array('ja', 'th', 'zh_cn', 'zh_tw');
    if (in_array(current_language(), $nonseparatedlangs, true)) {
        return search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
                                   $userfirstnamefield, $userlastnamefield, $timefield, $instancefield);
    }

    // Boolean-mode search expressions for the data and the meta field.
    $datasearch_clause = '';
    $metasearch_clause = '';
    // Here we'll accumulate the non-textual tokens.
    $non_text_tokens = array();

    foreach ($parsetree as $token) {
        $value = $token->getValue();

        switch ($token->getType()) {
            case TOKEN_STRING:
                if (strpos($value, ' ') !== false) {
                    // If it's a multiword token, quote it.
                    $datasearch_clause .= '"' . $value . '" ';
                } else {
                    // Simple word token, search for it as prefix.
                    $datasearch_clause .= '+' . $value . '* ';
                }
                break;
            case TOKEN_EXACT:
                // Token must be exactly as requested.
                $datasearch_clause .= '+' . $value . ' ';
                break;
            case TOKEN_NEGATE:
                // Token must not exist as prefix.
                $datasearch_clause .= '-' . $value . '* ';
                break;
            case TOKEN_META:
                // Token in metafield, search for it as prefix.
                $metasearch_clause .= '+' . $value . '* ';
                break;
            case TOKEN_USER:
            case TOKEN_USERID:
            case TOKEN_INSTANCE:
            case TOKEN_DATETO:
            case TOKEN_DATEFROM:
                // Delegate to standard search.
                $non_text_tokens[] = $token;
                break;
            default:
                return '';
        }
    }

    $params = array();
    $SQLString = '';

    // Call to standard search for pending tokens.
    if (!empty($non_text_tokens)) {
        list($SQLString, $sparams) = search_generate_SQL($non_text_tokens, $datafield, $metafield, $mainidfield,
                                                         $useridfield, $userfirstnamefield, $userlastnamefield,
                                                         $timefield, $instancefield);
        $params = array_merge($params, $sparams);
    }

    // Build the final SQL clause.
    $text_sql_string = '';

    // Data search: must have $datafield to search within, optionally $metafield too.
    if (!empty($datasearch_clause) && !empty($datafield)) {
        $text_sql_string .= 'MATCH (' . $datafield;
        if (!empty($metafield)) {
            $text_sql_string .= ', ' . $metafield;
        }
        // The search terms travel as a bound parameter.
        $text_sql_string .= ') AGAINST (' . ':sgt' . $p;
        $params['sgt' . $p++] = trim($datasearch_clause);
        $text_sql_string .= " IN BOOLEAN MODE)";
    }

    // Now add the metasearch clause: must have $metafield to search within.
    if (!empty($metasearch_clause) && !empty($metafield)) {
        // AND operator if needed.
        if (!empty($text_sql_string)) {
            $text_sql_string .= ' AND ';
        }
        $text_sql_string .= 'MATCH (' . $metafield . ') AGAINST (' . ':sgt' . $p;
        $params['sgt' . $p++] = trim($metasearch_clause);
        $text_sql_string .= " IN BOOLEAN MODE)";
    }

    // Finally add the non-text conditions.
    if (!empty($SQLString)) {
        // AND operator if needed.
        if (!empty($text_sql_string)) {
            $text_sql_string .= ' AND ';
        }
        $text_sql_string .= $SQLString;
    }

    return array($text_sql_string, $params);
}
/**
 * Primitive function to generate a SQL string from a parse tree.
 *
 * $parsetree should be a parse tree generated by a
 * search_lexer/search_parser combination.
 * Other fields are database table names to search.
 *
 * One condition is generated per token and the conditions are joined with
 * AND. A token that yields no condition (a meta token when $metafield is
 * empty) is skipped without leaving a dangling AND behind.
 *
 * @param search_token[]|null $parsetree Tokens to turn into SQL; null or an empty array yields "".
 * @param string $datafield          Field compared against string, exact and negated tokens.
 * @param string $metafield          Second field compared against those tokens, and the only one for meta tokens.
 * @param string $mainidfield        Field compared for equality with $useridfield for user tokens.
 * @param string $useridfield        Field compared against userid tokens.
 * @param string $userfirstnamefield Field matched (LIKE) against user tokens.
 * @param string $userlastnamefield  Field matched (LIKE) against user tokens.
 * @param string $timefield          Field compared against datefrom/dateto tokens.
 * @param string $instancefield      Field compared against instance tokens.
 * @return array|string array($sql, $params) on success; "" when there are no
 *                      tokens or an unknown token type is met.
 */
function search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
                             $userfirstnamefield, $userlastnamefield, $timefield, $instancefield) {
    global $DB;
    static $p = 0;

    // Nothing to search for. empty() instead of count() so that a null parse
    // tree cannot raise a TypeError on PHP 8.
    if (empty($parsetree)) {
        return "";
    }

    $regexsupported = $DB->sql_regex_supported();
    // Only needed for TOKEN_EXACT, which is rewritten to TOKEN_STRING below
    // whenever regular expressions are not supported.
    $REGEXP = $regexsupported ? $DB->sql_regex(true) : '';

    $params = array();
    // One SQL condition per token; joined with AND at the very end.
    $clauses = array();

    foreach ($parsetree as $token) {
        $type = $token->getType();
        $value = $token->getValue();

        // Under Oracle and MSSQL, transform TOKEN searches into STRING searches and trim +- chars.
        if (!$regexsupported) {
            $value = trim($value, '+-');
            if ($type == TOKEN_EXACT) {
                $type = TOKEN_STRING;
            }
        }

        // Two placeholder names are reserved per token, whether or not both get used.
        $name1 = 'sq' . $p++;
        $name2 = 'sq' . $p++;

        switch ($type) {
            case TOKEN_STRING:
                $clauses[] = "((" . $DB->sql_like($datafield, ":$name1", false) . ") OR ("
                        . $DB->sql_like($metafield, ":$name2", false) . "))";
                $params[$name1] = "%$value%";
                $params[$name2] = "%$value%";
                break;
            case TOKEN_EXACT:
                // Word-boundary regular expression match.
                $clauses[] = "(($datafield $REGEXP :$name1) OR ($metafield $REGEXP :$name2))";
                $params[$name1] = "[[:<:]]" . $value . "[[:>:]]";
                $params[$name2] = "[[:<:]]" . $value . "[[:>:]]";
                break;
            case TOKEN_META:
                // Without a meta field there is nothing to compare against: skip the token.
                if ($metafield != '') {
                    $clauses[] = "(" . $DB->sql_like($metafield, ":$name1", false) . ")";
                    $params[$name1] = "%$value%";
                }
                break;
            case TOKEN_USER:
                $clauses[] = "(($mainidfield = $useridfield) AND (("
                        . $DB->sql_like($userfirstnamefield, ":$name1", false) . ") OR ("
                        . $DB->sql_like($userlastnamefield, ":$name2", false) . ")))";
                $params[$name1] = "%$value%";
                $params[$name2] = "%$value%";
                break;
            case TOKEN_USERID:
                $clauses[] = "($useridfield = :$name1)";
                $params[$name1] = $value;
                break;
            case TOKEN_INSTANCE:
                $clauses[] = "($instancefield = :$name1)";
                $params[$name1] = $value;
                break;
            case TOKEN_DATETO:
                $clauses[] = "($timefield <= :$name1)";
                $params[$name1] = $value;
                break;
            case TOKEN_DATEFROM:
                $clauses[] = "($timefield >= :$name1)";
                $params[$name1] = $value;
                break;
            case TOKEN_NEGATE:
                $clauses[] = "(NOT ((" . $DB->sql_like($datafield, ":$name1", false) . ") OR ("
                        . $DB->sql_like($metafield, ":$name2", false) . ")))";
                $params[$name1] = "%$value%";
                $params[$name2] = "%$value%";
                break;
            default:
                return '';
        }
    }

    return array(implode(' AND ', $clauses), $params);
}