MDL-53579 search: Limit general query to certain fields
[moodle.git] / search / engine / solr / classes / engine.php
blob4fe1f7db4ebee5df62b6f93e6cd7e9c1586360e4
1 <?php
2 // This file is part of Moodle - http://moodle.org/
3 //
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
17 /**
18 * Solr engine.
20 * @package search_solr
21 * @copyright 2015 Daniel Neis Araujo
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
25 namespace search_solr;
27 defined('MOODLE_INTERNAL') || die();
29 /**
30 * Solr engine.
32 * @package search_solr
33 * @copyright 2015 Daniel Neis Araujo
34 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
36 class engine extends \core_search\engine {
38 /**
39 * @var string The date format used by solr.
41 const DATE_FORMAT = 'Y-m-d\TH:i:s\Z';
43 /**
44 * @var int Commit documents interval (number of milliseconds).
46 const AUTOCOMMIT_WITHIN = 15000;
48 /**
49 * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending.
51 const FRAG_SIZE = 510;
53 /**
54 * Marker for the start of a highlight.
56 const HIGHLIGHT_START = '@@HI_S@@';
58 /**
59 * Marker for the end of a highlight.
61 const HIGHLIGHT_END = '@@HI_E@@';
63 /**
64 * @var \SolrClient
66 protected $client = null;
68 /**
69 * @var bool True if we should reuse SolrClients, false if not.
71 protected $cacheclient = true;
73 /**
74 * @var \curl Direct curl object.
76 protected $curl = null;
78 /**
79 * @var array Fields that can be highlighted.
81 protected $highlightfields = array('title', 'content', 'description1', 'description2');
/**
 * Sets up the engine and decides whether the Solr client may be reused.
 *
 * @return void
 */
public function __construct() {
    parent::__construct();

    // There is a flaw with curl 7.35.x that causes problems with client reuse,
    // so on that version we never keep the client between calls.
    $info = curl_version();
    $affectedcurl = isset($info['version']) && stripos($info['version'], '7.35.') === 0;
    if ($affectedcurl) {
        $this->cacheclient = false;
    }
}
/**
 * Prepares a Solr query, applies filters and executes it returning its results.
 *
 * @throws \core_search\engine_exception If the Solr server is not ready.
 * @param stdClass $filters Containing query and filters.
 * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts.
 * @return \core_search\document[] Results; an empty array if there are none or the query failed.
 */
public function execute_query($filters, $usercontexts) {
    global $USER;

    // Let's keep these changes internal.
    $data = clone $filters;

    // If there is any problem we trigger the exception as soon as possible.
    $client = $this->get_search_client();

    $serverstatus = $this->is_server_ready();
    if ($serverstatus !== true) {
        throw new \core_search\engine_exception('engineserverstatus', 'search');
    }

    $query = new \SolrDisMaxQuery();
    $maxrows = \core_search\manager::MAX_RESULTS;
    if ($this->file_indexing_enabled()) {
        // When using file indexing and grouping, we are going to collapse results, so we want extra results.
        $maxrows *= 2;
    }
    $this->set_query($query, $data->q, $maxrows);
    $this->add_fields($query);

    // Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters
    // we are really interested in caching contexts filters instead.
    if (!empty($data->title)) {
        $query->addFilterQuery('{!field cache=false f=title}' . $data->title);
    }
    if (!empty($data->areaid)) {
        // Even if it is only supposed to contain PARAM_ALPHANUMEXT, better to prevent.
        $query->addFilterQuery('{!field cache=false f=areaid}' . $data->areaid);
    }

    if (!empty($data->timestart) or !empty($data->timeend)) {
        // An open end of the range is expressed with the '*' wildcard.
        if (empty($data->timestart)) {
            $data->timestart = '*';
        } else {
            $data->timestart = \search_solr\document::format_time_for_engine($data->timestart);
        }
        if (empty($data->timeend)) {
            $data->timeend = '*';
        } else {
            $data->timeend = \search_solr\document::format_time_for_engine($data->timeend);
        }

        // No cache.
        $query->addFilterQuery('{!cache=false}modified:[' . $data->timestart . ' TO ' . $data->timeend . ']');
    }

    // Restrict to users who are supposed to be able to see a particular result.
    $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')');

    // And finally restrict it to the context where the user can access, we want this one cached.
    // If the user can access all contexts $usercontexts value is just true, we don't need to filter
    // in that case.
    if ($usercontexts && is_array($usercontexts)) {
        if (!empty($data->areaid)) {
            if (empty($usercontexts[$data->areaid])) {
                // The user has no accessible context in the requested area. Building the filter would
                // produce an invalid 'contextid:()' clause, so there is simply nothing to return.
                return array();
            }
            $query->addFilterQuery('contextid:(' . implode(' OR ', $usercontexts[$data->areaid]) . ')');
        } else {
            // Join all area contexts into a single array and implode.
            $allcontexts = array();
            foreach ($usercontexts as $areacontexts) {
                foreach ($areacontexts as $contextid) {
                    // Ensure they are unique.
                    $allcontexts[$contextid] = $contextid;
                }
            }
            if (empty($allcontexts)) {
                // There are no valid contexts for this user, so they get no results.
                return array();
            }
            $query->addFilterQuery('contextid:(' . implode(' OR ', $allcontexts) . ')');
        }
    }

    try {
        if ($this->file_indexing_enabled()) {
            // Now group records by solr_filegroupingid. Limit to 3 results per group.
            $query->setGroup(true);
            $query->setGroupLimit(3);
            $query->addGroupField('solr_filegroupingid');
            return $this->grouped_files_query_response($client->query($query));
        } else {
            return $this->query_response($client->query($query));
        }
    } catch (\SolrClientException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    }
}
/**
 * Configures highlighting, the query text and the row limit on a query object.
 *
 * @param SolrQuery $query The query object to configure.
 * @param object $q The value handed to the query's setQuery().
 * @param null|int $maxresults The number of results to limit. manager::MAX_RESULTS if not numeric.
 */
protected function set_query($query, $q, $maxresults = null) {
    // Fall back to the manager-wide limit when no usable limit was supplied.
    $rows = is_numeric($maxresults) ? $maxresults : \core_search\manager::MAX_RESULTS;

    // Highlighting: every highlightable field, wrapped in our own start/end markers.
    $query->setHighlight(true);
    foreach ($this->highlightfields as $fieldname) {
        $query->addHighlightField($fieldname);
    }
    $query->setHighlightFragsize(static::FRAG_SIZE);
    $query->setHighlightSimplePre(self::HIGHLIGHT_START);
    $query->setHighlightSimplePost(self::HIGHLIGHT_END);
    $query->setHighlightMergeContiguous(true);

    $query->setQuery($q);

    // A reasonable max.
    $query->setRows($rows);
}
/**
 * Sets fields to be returned in the result.
 *
 * For dismax queries, fields flagged with 'mainquery' in the document field
 * definition are also registered as the fields the main query runs against.
 *
 * @param SolrDisMaxQuery|SolrQuery $query object.
 */
public function add_fields($query) {
    $documentclass = $this->get_document_classname();
    $fields = $documentclass::get_default_fields_definition();

    // The class name must be fully qualified: this file lives in the search_solr namespace, so an
    // unqualified SolrDisMaxQuery would resolve to \search_solr\SolrDisMaxQuery and never match.
    $dismax = ($query instanceof \SolrDisMaxQuery);

    foreach ($fields as $key => $field) {
        $query->addField($key);
        if ($dismax && !empty($field['mainquery'])) {
            // Add fields the main query should be run against.
            $query->addQueryField($key);
        }
    }
}
/**
 * Merges the highlighting section of a response into its documents.
 *
 * Highlighting entries are looked up by document id.
 *
 * @param object $response containing results.
 */
public function add_highlight_content($response) {
    if (!isset($response->highlighting)) {
        // There is no highlighting to add.
        return;
    }

    $highlights = $response->highlighting;
    foreach ($response->response->docs as $doc) {
        $docid = $doc->id;
        $this->merge_highlight_field_values($doc, $highlights->$docid);
    }
}
/**
 * Replaces highlightable field values of a document with their highlighted versions.
 *
 * @throws \core_search\engine_exception If a highlightable field of the document is multivalued.
 * @param object $doc containing the results.
 * @param object $highlighteddoc containing the highlighted results values.
 */
public function merge_highlight_field_values($doc, $highlighteddoc) {
    foreach ($this->highlightfields as $name) {
        if (empty($doc->$name)) {
            // Nothing stored for this field, so nothing to replace.
            continue;
        }

        // Check that the returned value is not an array. No way we can make this work with multivalued solr fields.
        if (is_array($doc->{$name})) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $name);
        }

        if (empty($highlighteddoc->$name)) {
            continue;
        }

        // Replace by the highlighted result (the first entry of the highlighted values).
        $doc->$name = reset($highlighteddoc->$name);
    }
}
/**
 * Filters the response on Moodle side.
 *
 * Documents owned by another user, belonging to an unknown search area, or to which
 * access is deleted/denied are dropped; granted ones are converted to documents.
 *
 * @param object $queryresponse containing the response return from solr server.
 * @return array $results containing final results to be displayed.
 */
public function query_response($queryresponse) {
    global $USER;

    $response = $queryresponse->getResponse();

    $docs = $response->response->docs;
    if (!$docs) {
        return array();
    }

    if (empty($response->response->numFound)) {
        // Nothing reported as found: hand the docs back untouched.
        return $docs;
    }

    $this->add_highlight_content($response);

    $userid = $USER->id;
    $noownerid = \core_search\manager::NO_OWNER_ID;
    $numgranted = 0;

    // Iterate through the results checking its availability and whether they are available for the user or not.
    foreach ($docs as $key => $docdata) {
        if ($docdata['owneruserid'] != $noownerid && $docdata['owneruserid'] != $userid) {
            // If owneruserid is set, no other user should be able to access this record.
            unset($docs[$key]);
            continue;
        }

        $searcharea = $this->get_search_area($docdata->areaid);
        if (!$searcharea) {
            unset($docs[$key]);
            continue;
        }

        $docdata = $this->standarize_solr_obj($docdata);

        $access = $searcharea->check_access($docdata['itemid']);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // Gone from Moodle: remove it from the index as well as from the results.
            $this->delete_by_id($docdata['id']);
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_DENIED) {
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_GRANTED) {
            $numgranted++;

            // Add the doc.
            $docs[$key] = $this->to_document($searcharea, $docdata);
        }

        // This should never happen.
        if ($numgranted >= \core_search\manager::MAX_RESULTS) {
            $docs = array_slice($docs, 0, \core_search\manager::MAX_RESULTS, true);
            break;
        }
    }

    return $docs;
}
/**
 * Processes grouped file results into documents, with attached matching files.
 *
 * @param SolrQueryResponse $queryresponse The response returned from solr server
 * @return array Final results to be displayed.
 */
protected function grouped_files_query_response($queryresponse) {
    $response = $queryresponse->getResponse();

    // If we can't find the grouping, or there are no matches in the grouping, return empty.
    if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) {
        return array();
    }

    $granted = 0;
    $resultorder = array();
    $complete = array();
    $pending = array();

    $highlighting = $response->highlighting;

    // Each group represents a "master document".
    foreach ($response->grouped->solr_filegroupingid->groups as $group) {
        $groupid = $group->groupValue;
        $members = $group->doclist->docs;
        $first = reset($members);

        $searcharea = $this->get_search_area($first->areaid);
        if (!$searcharea) {
            // Well, this is a problem.
            continue;
        }

        // Check for access.
        $access = $searcharea->check_access($first->itemid);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // If deleted from Moodle, delete from index and then continue.
            $this->delete_by_id($first->id);
            continue;
        }
        if ($access == \core_search\manager::ACCESS_DENIED) {
            // This means we should just skip for the current user.
            continue;
        }
        $granted++;

        // Separate the main document and any files returned.
        $maindoc = false;
        $fileids = array();
        foreach ($members as $member) {
            if ($member->id == $groupid) {
                $maindoc = $member;
            } else if (isset($member->solr_fileid)) {
                $fileids[] = $member->solr_fileid;
            }
        }

        // Store the id of this group, in order, for later merging.
        $resultorder[] = $groupid;

        if ($maindoc) {
            if (isset($highlighting->$groupid)) {
                // Merge the highlighting for this doc.
                $this->merge_highlight_field_values($maindoc, $highlighting->$groupid);
            }
            $doc = $this->to_document($searcharea, $this->standarize_solr_obj($maindoc));
            // Now we need to attach the result files to the doc.
            foreach ($fileids as $fileid) {
                $doc->add_stored_file($fileid);
            }
            $complete[$groupid] = $doc;
        } else {
            // We don't have the main doc, store what we know for later building.
            $pending[$groupid] = $fileids;
        }

        if ($granted >= \core_search\manager::MAX_RESULTS) {
            // We have hit the max results, we will just ignore the rest.
            break;
        }
    }

    $pending = $this->get_missing_docs($pending);

    // Now merge the complete and incomplete documents, in results order.
    $out = array();
    foreach ($resultorder as $docid) {
        if (isset($complete[$docid])) {
            $out[] = $complete[$docid];
        } else if (isset($pending[$docid])) {
            $out[] = $pending[$docid];
        }
    }

    return $out;
}
/**
 * Retrieve any missing main documents and attach provided files.
 *
 * The missingdocs array should be an array, indexed by document id, of main documents we need to retrieve. The value
 * associated to the key should be an array of stored_files or stored file ids to attach to the result document.
 *
 * Return array also indexed by document id.
 *
 * @param array() $missingdocs An array, indexed by document id, with arrays of files/ids to attach.
 * @return document[]
 */
protected function get_missing_docs($missingdocs) {
    if (empty($missingdocs)) {
        return array();
    }

    $wantedids = array_keys($missingdocs);

    // Build a custom query that will get all the missing documents.
    $query = new \SolrQuery();
    $this->set_query($query, '*', count($wantedids));
    $this->add_fields($query);
    $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $wantedids) . ')');

    try {
        $found = $this->query_response($this->get_search_client()->query($query));
    } catch (\SolrClientException $ex) {
        return array();
    } catch (\SolrServerException $ex) {
        return array();
    }

    $out = array();
    foreach ($found as $doc) {
        $docid = $doc->get('id');
        // Results we did not ask for are skipped.
        if (isset($missingdocs[$docid])) {
            // Attach the files.
            foreach ($missingdocs[$docid] as $file) {
                $doc->add_stored_file($file);
            }
            $out[$docid] = $doc;
        }
    }

    return $out;
}
/**
 * Returns a standard php array from a \SolrObject instance.
 *
 * @param \SolrObject $obj
 * @return array The returned document as an array, keyed by trimmed property name.
 */
public function standarize_solr_obj(\SolrObject $obj) {
    $docdata = array();
    foreach ($obj->getPropertyNames() as $rawname) {
        // Names are trimmed before use, see http://php.net/manual/en/solrobject.getpropertynames.php#98018.
        $name = trim($rawname);
        $docdata[$name] = $obj->offsetGet($name);
    }
    return $docdata;
}
/**
 * Adds a document to the search engine.
 *
 * This does not commit to the search engine.
 *
 * @param document $document
 * @param bool $fileindexing True if file indexing is to be used
 * @return bool False if the document itself could not be added, true otherwise.
 */
public function add_document($document, $fileindexing = false) {
    if (!$this->add_solr_document($document->export_for_engine())) {
        return false;
    }

    // This will take care of updating all attached files in the index.
    if ($fileindexing) {
        $this->process_document_files($document);
    }

    return true;
}
/**
 * Adds a text document to the search engine.
 *
 * Solr client/server errors are reported through debugging() rather than thrown.
 *
 * @param array $doc Field name => value pairs.
 * @return bool True if Solr accepted the document, false on a Solr client/server error.
 */
protected function add_solr_document($doc) {
    $solrdoc = new \SolrInputDocument();
    foreach ($doc as $name => $fieldvalue) {
        $solrdoc->addField($name, $fieldvalue);
    }

    try {
        $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
    } catch (\SolrClientException $e) {
        debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER);
        return false;
    } catch (\SolrServerException $e) {
        // We only use the first line of the message, as it's a fully java stacktrace behind it.
        $msg = strtok($e->getMessage(), "\n");
        debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg, DEBUG_DEVELOPER);
        return false;
    }

    return true;
}
/**
 * Index files attached to the document, ensuring the index matches the current document files.
 *
 * For documents that aren't known to be new, we check the index for existing files.
 * - New files we will add.
 * - Existing and unchanged files we will skip.
 * - File that are in the index but not on the document will be deleted from the index.
 * - Files that have changed will be re-indexed.
 *
 * @param document $document
 */
protected function process_document_files($document) {
    if (!$this->file_indexing_enabled()) {
        return;
    }

    // Maximum rows to process at a time.
    $batchsize = 500;

    // Get the attached files. Whatever is left in this array at the end gets (re-)indexed.
    $files = $document->get_files();

    // If this isn't a new document, we need to check the existing indexed files.
    if (!$document->get_is_new()) {
        // We do this progressively, so we can handle lots of files cleanly.
        list($numfound, $indexedfiles) = $this->get_indexed_files($document, 0, $batchsize);
        $offset = 0;
        $stale = array();

        do {
            foreach ($indexedfiles as $indexed) {
                $fileid = $indexed->solr_fileid;

                if (!isset($files[$fileid])) {
                    // This means we have found a file that is no longer attached, so we need to delete from the index.
                    // We do it later, since this is progressive, and it could reorder results.
                    $stale[] = $indexed->id;
                    continue;
                }

                $file = $files[$fileid];
                // Filelib does not guarantee time modified is updated, so we will check important values.
                // The last clause covers files that filtering blocked last time but are indexable now.
                $needsreindex = $indexed->modified < $file->get_timemodified()
                    || strcmp($indexed->title, $file->get_filename()) !== 0
                    || $indexed->solr_filecontenthash != $file->get_contenthash()
                    || ($indexed->solr_fileindexedcontent == document::INDEXED_FILE_FALSE
                        && $this->file_is_indexable($file));

                if (!$needsreindex) {
                    // If the file is already indexed, we can just remove it from the files array and skip it.
                    unset($files[$fileid]);
                }
            }
            $offset += $batchsize;

            if ($offset < $numfound) {
                // If we haven't hit the total count yet, fetch the next batch.
                list($numfound, $indexedfiles) = $this->get_indexed_files($document, $offset, $batchsize);
            }
        } while ($offset < $numfound);

        // Delete files that are no longer attached.
        // We directly delete the item using the client, as the engine delete_by_id won't work on file docs.
        foreach ($stale as $staleid) {
            $this->get_search_client()->deleteById($staleid);
        }
    }

    // Now we can actually index all the remaining files.
    foreach ($files as $file) {
        $this->add_stored_file($document, $file);
    }
}
/**
 * Get the currently indexed files for a particular document, returns the total count, and a subset of files.
 *
 * @param document $document
 * @param int $start The row to start the results on. Zero indexed.
 * @param int $rows The number of rows to fetch
 * @return array A two element array, the first is the total number of available results, the second is an array
 *               of documents for the current request. array(0, array()) when nothing is found or the query fails.
 */
protected function get_indexed_files($document, $start = 0, $rows = 500) {
    // Build a custom query that will get any document files that are in our solr_filegroupingid.
    // For efficiency, we are building our own, stripped down, query.
    $query = new \SolrQuery();
    $query->setQuery('*');
    $query->setRows($rows);
    $query->setStart($start);
    // We want a consistent sorting.
    $query->addSortField('id');

    // We only want the bare minimum of fields.
    $wanted = array('id', 'modified', 'title', 'solr_fileid', 'solr_filecontenthash', 'solr_fileindexedcontent');
    foreach ($wanted as $fieldname) {
        $query->addField($fieldname);
    }

    $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
    $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);

    try {
        $responsedoc = $this->get_search_client()->query($query)->getResponse();

        if (empty($responsedoc->response->numFound)) {
            return array(0, array());
        }

        return array($responsedoc->response->numFound, $this->convert_file_results($responsedoc));
    } catch (\SolrClientException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    }
}
/**
 * A very lightweight handler for getting information about already indexed files from a Solr response.
 *
 * @param SolrObject $responsedoc A Solr response document
 * @return stdClass[] An array of objects that contain the basic information for file processing.
 */
protected function convert_file_results($responsedoc) {
    $docs = $responsedoc->response->docs;
    if (!$docs) {
        return array();
    }

    // Fields copied verbatim; 'modified' is the only one converted on the way.
    $copied = array('title', 'solr_fileid', 'solr_filecontenthash', 'solr_fileindexedcontent');

    $out = array();
    foreach ($docs as $doc) {
        // Copy the bare minimum needed info.
        $info = new \stdClass();
        $info->id = $doc->id;
        $info->modified = document::import_time_from_engine($doc->modified);
        foreach ($copied as $fieldname) {
            $info->$fieldname = $doc->$fieldname;
        }
        $out[] = $info;
    }

    return $out;
}
/**
 * Adds a file to the search engine.
 *
 * Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
 * Tika has much better content type detection than Moodle, and we will have many more doc failures
 * if we try to send mime types.
 *
 * If the file is not indexable, or extraction fails, only the base file info is indexed, flagged
 * through the solr_fileindexedcontent field.
 *
 * @param document $document
 * @param \stored_file $storedfile
 * @return void
 */
protected function add_stored_file($document, $storedfile) {
    $filedoc = $document->export_file_for_engine($storedfile);

    if (!$this->file_is_indexable($storedfile)) {
        // For files that we don't consider indexable, we will still place a reference in the search engine.
        $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_FALSE;
        $this->add_solr_document($filedoc);
        return;
    }

    $curl = $this->get_curl_object();

    $url = $this->get_connection_url('/update/extract');

    // This will prevent solr from automatically making fields for every tika output.
    $url->param('uprefix', 'ignored_');

    // These are common fields that matches the standard *_point dynamic field and causes an error.
    $url->param('fmap.media_white_point', 'ignored_mwp');
    $url->param('fmap.media_black_point', 'ignored_mbp');

    // Copy each key to the url with literal.
    // We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names.
    foreach ($filedoc as $name => $fieldvalue) {
        // This will take any fields from tika that match our schema and discard them, so they don't overwrite ours.
        $url->param('fmap.'.$name, 'ignored_'.$name);
        // Place data in a tmp field.
        $url->param('literal.mdltmp_'.$name, $fieldvalue);
        // Then move to the final field.
        $url->param('fmap.mdltmp_'.$name, $name);
    }

    // This sets the true filename for Tika.
    $url->param('resource.name', $storedfile->get_filename());

    // A giant block of code that is really just error checking around the curl request.
    try {
        // Now actually do the request.
        $body = $curl->post($url->out(false), array('myfile' => $storedfile));

        $errno = $curl->get_errno();
        $info = $curl->get_info();

        // Now error handling. It is just informational, since we aren't tracking per file/doc results.
        if ($errno != 0) {
            // This means an internal cURL error occurred error is in result.
            $message = 'Curl error '.$errno.' while indexing file with document id '.$filedoc['id'].': '.$body.'.';
            debugging($message, DEBUG_DEVELOPER);
        } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
            // Unexpected HTTP response code.
            $message = 'Error while indexing file with document id '.$filedoc['id'];
            // Try to get error message out of msg or title if it exists.
            if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $body, $matches)) {
                $message .= ': '.$matches[1];
            } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $body, $matches)) {
                $message .= ': '.$matches[1];
            }
            // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter.
            if (CLI_SCRIPT && !PHPUNIT_TEST) {
                mtrace($message);
            }
        } else if (preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $body, $matches)) {
            // We found the expected status field; 0 is the only value that means success.
            $status = (int)$matches[1];
            if ($status === 0) {
                // The document was successfully indexed.
                return;
            }
            $message = 'Unexpected Solr status code '.$status;
            $message .= ' while indexing file with document id '.$filedoc['id'].'.';
            debugging($message, DEBUG_DEVELOPER);
        } else {
            // We received an unprocessable response.
            $message = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': ';
            $message .= strtok($body, "\n");
            debugging($message, DEBUG_DEVELOPER);
        }
    } catch (\Exception $e) {
        // There was an error, but we are not tracking per-file success, so we just continue on.
        debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER);
    }

    // If we get here, the document was not indexed due to an error. So we will index just the base info without the file.
    $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_ERROR;
    $this->add_solr_document($filedoc);
}
/**
 * Checks to see if a passed file is indexable.
 *
 * A file is rejected when it exceeds the configured maxindexfilekb size (if set),
 * or when it is a Moodle backup file.
 *
 * @param \stored_file $file The file to check
 * @return bool True if the file can be indexed
 */
protected function file_is_indexable($file) {
    if (!empty($this->config->maxindexfilekb)) {
        $maxbytes = $this->config->maxindexfilekb * 1024;
        if ($file->get_filesize() > $maxbytes) {
            // The file is too big to index.
            return false;
        }
    }

    // We don't index Moodle backup files. There is nothing usefully indexable in them.
    return $file->get_mimetype() != 'application/vnd.moodle.backup';
}
/**
 * Commits all pending changes through the Solr client.
 *
 * @return void
 */
protected function commit() {
    $client = $this->get_search_client();
    $client->commit();
}
/**
 * Do any area cleanup needed, and do anything to confirm contents.
 *
 * Return false to prevent the search area completed time and stats from being updated.
 * This implementation commits pending changes and always reports success.
 *
 * @param \core_search\area\base $searcharea The search area that was complete
 * @param int $numdocs The number of documents that were added to the index
 * @param bool $fullindex True if a full index is being performed
 * @return bool True means that data is considered indexed
 */
public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) {
    // Make everything added for this area visible in the index.
    $this->commit();

    return true;
}
/**
 * Return true if file indexing is supported and enabled. False otherwise.
 *
 * @return bool The truthiness of the fileindexing config value.
 */
public function file_indexing_enabled() {
    return $this->config->fileindexing ? true : false;
}
/**
 * Defragments the index.
 *
 * @return void
 */
public function optimize() {
    $client = $this->get_search_client();
    $client->optimize(1, true, false);
}
/**
 * Deletes the specified document and commits the change.
 *
 * @param string $id The document id to delete
 * @return void
 */
public function delete_by_id($id) {
    // We need to make sure we delete the item and all related files, which can be done with solr_filegroupingid.
    $groupquery = 'solr_filegroupingid:' . $id;
    $this->get_search_client()->deleteByQuery($groupquery);
    $this->commit();
}
/**
 * Delete all area's documents, or every document when no area is given.
 *
 * @param string $areaid
 * @return void
 */
public function delete($areaid = null) {
    // Without an area id the match-all query wipes the whole index.
    $target = $areaid ? 'areaid:' . $areaid : '*:*';
    $this->get_search_client()->deleteByQuery($target);
    $this->commit();
}
/**
 * Pings the Solr server using search_solr config
 *
 * @return true|string Returns true if all good or an error string.
 */
public function is_server_ready() {
    $configured = !empty($this->config->server_hostname) && !empty($this->config->indexname);
    if (!$configured) {
        return 'No solr configuration found';
    }

    $client = $this->get_search_client(false);
    if (!$client) {
        return get_string('engineserverstatus', 'search');
    }

    try {
        @$client->ping();
    } catch (\SolrClientException $ex) {
        return 'Solr client error: ' . $ex->getMessage();
    } catch (\SolrServerException $ex) {
        return 'Solr server error: ' . $ex->getMessage();
    }

    // Check that setup schema has already run.
    try {
        $schema = new \search_solr\schema();
        $schema->validate_setup();
    } catch (\moodle_exception $e) {
        return $e->getMessage();
    }

    return true;
}
/**
 * Checks if the PHP Solr extension is available.
 *
 * @return bool True when the solr_get_version() function exists.
 */
public function is_installed() {
    $available = function_exists('solr_get_version');
    return $available;
}
/**
 * Returns the solr client instance.
 *
 * We don't reuse SolrClient if we are on libcurl 7.35.0, due to a bug in that version of curl.
 *
 * @throws \core_search\engine_exception
 * @param bool $triggerexception
 * @return \SolrClient
 */
protected function get_search_client($triggerexception = true) {
    // Reuse the cached client when there is one.
    if ($this->client !== null) {
        return $this->client;
    }

    $config = $this->config;

    $options = array(
        'hostname' => $config->server_hostname,
        'path' => '/solr/' . $config->indexname,
        'login' => !empty($config->server_username) ? $config->server_username : '',
        'password' => !empty($config->server_password) ? $config->server_password : '',
        'port' => !empty($config->server_port) ? $config->server_port : '',
        'secure' => !empty($config->secure),
    );
    // The SSL options share their name with the config setting and default to an empty string.
    foreach (array('ssl_cert', 'ssl_key', 'ssl_keypassword', 'ssl_cainfo', 'ssl_capath') as $sslopt) {
        $options[$sslopt] = !empty($config->$sslopt) ? $config->$sslopt : '';
    }
    $options['timeout'] = !empty($config->server_timeout) ? $config->server_timeout : '30';

    $client = new \SolrClient($options);

    // NOTE(review): this guard is kept from the original, but the result of `new` is
    // never false, so the branch looks unreachable - confirm before relying on it.
    if ($client === false && $triggerexception) {
        throw new \core_search\engine_exception('engineserverstatus', 'search');
    }

    if ($this->cacheclient) {
        $this->client = $client;
    }

    return $client;
}
/**
 * Returns a curl object for connecting to solr.
 *
 * The object is created once, configured with SSL options and basic auth from the
 * engine config, and then reused on later calls.
 *
 * @return \curl
 */
public function get_curl_object() {
    if (!is_null($this->curl)) {
        return $this->curl;
    }

    $this->curl = new \curl();

    $options = array();
    // Build the SSL options. Based on pecl-solr and general testing.
    if (!empty($this->config->secure)) {
        // Certificate and key each come with a fixed PEM type option.
        $pemfiles = array(
            'ssl_cert' => array('CURLOPT_SSLCERT', 'CURLOPT_SSLCERTTYPE'),
            'ssl_key' => array('CURLOPT_SSLKEY', 'CURLOPT_SSLKEYTYPE'),
        );
        foreach ($pemfiles as $setting => $curlopts) {
            if (!empty($this->config->$setting)) {
                $options[$curlopts[0]] = $this->config->$setting;
                $options[$curlopts[1]] = 'PEM';
            }
        }

        // The remaining settings map one-to-one onto a curl option.
        $plain = array(
            'ssl_keypassword' => 'CURLOPT_KEYPASSWD',
            'ssl_cainfo' => 'CURLOPT_CAINFO',
            'ssl_capath' => 'CURLOPT_CAPATH',
        );
        foreach ($plain as $setting => $curlopt) {
            if (!empty($this->config->$setting)) {
                $options[$curlopt] = $this->config->$setting;
            }
        }
    }

    $this->curl->setopt($options);

    // Basic auth is only sent when both a username and a password are configured.
    if (!empty($this->config->server_username) && !empty($this->config->server_password)) {
        $credentials = $this->config->server_username . ':' . $this->config->server_password;
        $this->curl->setHeader('Authorization', 'Basic ' . base64_encode($credentials));
    }

    return $this->curl;
}
/**
 * Return a Moodle url object for the server connection.
 *
 * @param string $path The solr path to append.
 * @return \moodle_url
 */
public function get_connection_url($path) {
    // Must use the proper protocol, or SSL will fail.
    $scheme = empty($this->config->secure) ? 'http' : 'https';
    $host = rtrim($this->config->server_hostname, '/');
    $port = !empty($this->config->server_port) ? ':' . $this->config->server_port : '';
    $location = '/solr/' . $this->config->indexname . '/' . ltrim($path, '/');

    return new \moodle_url($scheme . '://' . $host . $port . $location);
}