Merge branch 'MDL-81457-main' of https://github.com/andrewnicols/moodle
[moodle.git] / search / classes / document.php
blob85e116f6872e6cd44e19a1d36da7b3434dbdb3ec
1 <?php
2 // This file is part of Moodle - http://moodle.org/
3 //
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
17 /**
18 * Document representation.
20 * @package core_search
21 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
25 namespace core_search;
27 use context;
29 defined('MOODLE_INTERNAL') || die();
31 /**
32 * Represents a document to index.
34 * Note that, if you are writing a search engine and you want to change \core_search\document
35 * behaviour, you can override this class; it will be automatically loaded from \search_YOURENGINE\document.
37 * @package core_search
38 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
39 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
41 class document implements \renderable, \templatable {
/**
 * @var array The document data, keyed by field name.
 */
protected $data = [];

/**
 * @var array Extra data needed to render the document.
 */
protected $extradata = [];

/**
 * @var \moodle_url|null Link to the document; null until set_doc_url() is called.
 */
protected $docurl = null;

/**
 * @var \moodle_url|null Link to the document context; null until set_context_url() is called.
 */
protected $contexturl = null;

/**
 * @var \core_search\document_icon|null Document icon instance; null until set_doc_icon() is called.
 */
protected $docicon = null;

/**
 * @var int|null The content field filearea.
 */
protected $contentfilearea = null;

/**
 * @var int|null The content field itemid.
 */
protected $contentitemid = null;

/**
 * @var bool Should be set to true if the document hasn't been indexed before. False if unknown.
 */
protected $isnew = false;

/**
 * @var array Files attached to the document, keyed by file id. Each entry is either a
 *            \stored_file instance or a bare file id that get_files() resolves on demand.
 */
protected $files = [];
/**
 * Schema version number (update it if any field definition changes).
 *
 * Change list (for engine implementers):
 * 2017091700 - add optional field groupid
 *
 * @var int
 */
const SCHEMA_VERSION = 2017091700;
/**
 * All required fields any doc should contain.
 *
 * A single format had to be chosen to describe field types; the Solr format is used because
 * Solr is the default search engine.
 *
 * Search engine plugins are responsible for setting their appropriate field types and mapping
 * these names to whatever format they need.
 *
 * @var array
 */
protected static $requiredfields = [
    'id' => [
        'type' => 'string',
        'stored' => true,
        'indexed' => false,
    ],
    'itemid' => [
        'type' => 'int',
        'stored' => true,
        'indexed' => true,
    ],
    'title' => [
        'type' => 'text',
        'stored' => true,
        'indexed' => true,
        'mainquery' => true,
    ],
    'content' => [
        'type' => 'text',
        'stored' => true,
        'indexed' => true,
        'mainquery' => true,
    ],
    'contextid' => [
        'type' => 'int',
        'stored' => true,
        'indexed' => true,
    ],
    'areaid' => [
        'type' => 'string',
        'stored' => true,
        'indexed' => true,
    ],
    'type' => [
        'type' => 'int',
        'stored' => true,
        'indexed' => true,
    ],
    'courseid' => [
        'type' => 'int',
        'stored' => true,
        'indexed' => true,
    ],
    'owneruserid' => [
        'type' => 'int',
        'stored' => true,
        'indexed' => true,
    ],
    'modified' => [
        'type' => 'tdate',
        'stored' => true,
        'indexed' => true,
    ],
];
/**
 * All optional fields docs can contain.
 *
 * Although it matches the Solr fields format, this only defines the field types. Search
 * engine plugins are responsible for setting their appropriate field types and mapping
 * these names to whatever format they need.
 *
 * @var array
 */
protected static $optionalfields = [
    'userid' => [
        'type' => 'int',
        'stored' => true,
        'indexed' => true,
    ],
    'groupid' => [
        'type' => 'int',
        'stored' => true,
        'indexed' => true,
    ],
    'description1' => [
        'type' => 'text',
        'stored' => true,
        'indexed' => true,
        'mainquery' => true,
    ],
    'description2' => [
        'type' => 'text',
        'stored' => true,
        'indexed' => true,
        'mainquery' => true,
    ],
];
/**
 * Any fields that are engine specific. These are fields that are solely used by a search
 * engine plugin for internal purposes.
 *
 * Field names should be prefixed with the engine name to avoid potential conflicts with core fields.
 *
 * Uses the same format as the required and optional field definitions.
 *
 * @var array
 */
protected static $enginefields = [];
/**
 * Builds a document whose id is unique across search areas.
 *
 * The document id is the generated area id followed by a dash and the item id.
 *
 * @throws \coding_exception If $itemid is not numeric.
 * @param int $itemid An id unique to the search area
 * @param string $componentname The search area component Frankenstyle name
 * @param string $areaname The area name (the search area class name)
 */
public function __construct($itemid, $componentname, $areaname) {
    if (!is_numeric($itemid)) {
        throw new \coding_exception('The itemid should be an integer');
    }

    $areaid = \core_search\manager::generate_areaid($componentname, $areaname);

    $this->data['areaid'] = $areaid;
    $this->data['id'] = $areaid . '-' . $itemid;
    $this->data['itemid'] = (int) $itemid;
}
/**
 * Attaches a stored file to the document.
 *
 * The file is registered under its id. When only an id is given, the id itself is stored
 * as a placeholder.
 *
 * @param \stored_file|int $file The file to add, or file id.
 * @return void
 */
public function add_stored_file($file) {
    $fileid = is_numeric($file) ? $file : $file->get_id();
    $this->files[$fileid] = $file;
}
/**
 * Returns the array of attached files.
 *
 * Entries that were added as bare file ids are resolved to file instances here. Ids that no
 * longer resolve to a file are dropped from the list.
 *
 * @return \stored_file[]
 */
public function get_files() {
    // The file storage is fetched once, and only if at least one entry needs resolving.
    $fs = null;

    // The files array can contain stored file ids, so we need to get instances if asked.
    foreach ($this->files as $id => $listfile) {
        if (!is_numeric($listfile)) {
            continue;
        }

        $fs = $fs ?? get_file_storage();

        $file = $fs->get_file_by_id($id);
        if ($file) {
            $this->files[$id] = $file;
        } else {
            // Index is out of date and referencing a file that does not exist.
            unset($this->files[$id]);
        }
    }

    return $this->files;
}
/**
 * Setter.
 *
 * Basic checks to prevent common issues.
 *
 * The field must be declared in the required, optional or engine-specific field definitions.
 * Values of 'int' and 'tdate' fields must be numeric and are cast to a PHP integer; tdate
 * values are expected to be timestamps. Values of any other field type have disallowed
 * Unicode characters removed and every run of whitespace (including line breaks) replaced
 * by a single space.
 *
 * @throws \coding_exception If the field is not defined, or an int/tdate value is not numeric.
 * @throws \moodle_exception If the whitespace normalisation fails with a preg_replace error.
 * @param string $fieldname The field name
 * @param string|int $value The value to store
 * @return string|int The stored value
 */
public function set($fieldname, $value) {
    // Look the field up in each definition list, in order of precedence.
    $fielddata = null;
    foreach ([static::$requiredfields, static::$optionalfields, static::$enginefields] as $definitions) {
        if (!empty($definitions[$fieldname])) {
            $fielddata = $definitions[$fieldname];
            break;
        }
    }

    if (empty($fielddata)) {
        throw new \coding_exception('"' . $fieldname . '" field does not exist.');
    }

    // Note that tdate fields should be set as timestamps, later they might be converted to
    // a date format, it depends on the search engine.
    $isinteger = ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate');

    if ($isinteger) {
        if (!is_numeric($value)) {
            // Arrays and objects without __toString() cannot be concatenated into the message
            // (warning or fatal error), so report their type instead of their value.
            $unprintable = is_array($value) || (is_object($value) && !method_exists($value, '__toString'));
            $shown = $unprintable ? gettype($value) : $value;
            throw new \coding_exception('"' . $fieldname . '" value should be an integer and its value is "' . $shown . '"');
        }

        // We want to be strict here, there might be engines that expect us to
        // provide them data with the proper type already set.
        $this->data[$fieldname] = intval($value);
        return $this->data[$fieldname];
    }

    // Remove disallowed Unicode characters.
    $value = \core_text::remove_unicode_non_characters($value);

    // Replace all groups of line breaks and spaces by single spaces.
    $this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);
    if ($this->data[$fieldname] === null) {
        // A null result means preg_replace itself failed.
        $docid = isset($this->data['id']) ? $this->data['id'] : '(unknown)';
        throw new \moodle_exception('error_indexing', 'search', '', null, '"' . $fieldname .
            '" value causes preg_replace error (may be caused by unusual characters) ' .
            'in document with id "' . $docid . '"');
    }

    return $this->data[$fieldname];
}
/**
 * Stores a value in the extra data of the document.
 *
 * Extra data is not validated against the field definitions. It can be retrieved using
 * \core_search\document->get($fieldname).
 *
 * @param string $fieldname
 * @param string $value
 * @return void
 */
public function set_extra($fieldname, $value) {
    $this->extradata[$fieldname] = $value;
}
/**
 * Getter.
 *
 * Looks the field up in the document data first and falls back to the extra data.
 *
 * Use self::is_set if you are not sure whether the field is set or not,
 * as otherwise this will throw a \coding_exception.
 *
 * @throws \coding_exception If the field is set in neither the data nor the extra data.
 * @param string $field
 * @return string|int
 */
public function get($field) {
    // Document data takes precedence over extra data.
    foreach ([$this->data, $this->extradata] as $source) {
        if (isset($source[$field])) {
            return $source[$field];
        }
    }

    throw new \coding_exception('Field "' . $field . '" is not set in the document');
}
/**
 * Checks whether a field is set, either in the document data or in the extra data.
 *
 * @param string $field
 * @return bool
 */
public function is_set($field) {
    if (isset($this->data[$field])) {
        return true;
    }

    return isset($this->extradata[$field]);
}
/**
 * Flags whether this is a new document. Use false if unknown.
 *
 * @param bool $new Any value; it is converted to a boolean.
 * @return void
 */
public function set_is_new($new) {
    $this->isnew = boolval($new);
}
/**
 * Tells whether the document is new. False if unknown.
 *
 * @return bool
 */
public function get_is_new() {
    return $this->isnew;
}
/**
 * Returns the definitions of all fields: required, optional and engine-specific.
 *
 * The lists are combined with the array union operator, so when a field name appears in
 * more than one list the first definition (required, then optional, then engine) is kept.
 *
 * @return array
 */
public static function get_default_fields_definition() {
    $definitions = static::$requiredfields;
    $definitions += static::$optionalfields;
    $definitions += static::$enginefields;

    return $definitions;
}
/**
 * Prepares a timestamp to be inserted into the search engine.
 *
 * This default implementation returns the timestamp untouched, so any search engine can
 * store integers and use integer comparison to get documents between x and y timestamps.
 * Search engines interested in using their own field format can do so by extending this
 * class in \search_xxx\document.
 *
 * @param int $timestamp
 * @return string
 */
public static function format_time_for_engine($timestamp) {
    return $timestamp;
}
/**
 * Prepares a string value to be sent to the search engine.
 *
 * This default implementation returns the string untouched. Search engines may override
 * this method to apply restrictions, like limiting the size.
 *
 * @param string $string
 * @return string
 */
public static function format_string_for_engine($string) {
    return $string;
}
/**
 * Prepares a text value to be sent to the search engine.
 *
 * This default implementation returns the text untouched. Search engines may override
 * this method to apply restrictions, like limiting the size.
 *
 * @param string $text
 * @return string
 */
public static function format_text_for_engine($text) {
    return $text;
}
/**
 * Converts a time value stored in the search engine back into a timestamp.
 *
 * This default implementation returns the value untouched, matching the default
 * format_time_for_engine() which stores plain timestamps. Search engines using their own
 * field format should override this by extending this class in \search_xxx\document.
 *
 * @param string $time
 * @return int
 */
public static function import_time_from_engine($time) {
    return $time;
}
/**
 * Returns the format in which text comes back from the search engine.
 *
 * @return int FORMAT_PLAIN in this default implementation.
 */
protected function get_text_format() {
    return FORMAT_PLAIN;
}
/**
 * Fills the document with data coming from the search engine.
 *
 * Only fields declared in the required, optional or engine-specific definitions are read;
 * every value goes through set(), so the usual validation applies.
 *
 * @throws \core_search\engine_exception If a non-date field holds multiple values (an array).
 * @param array $docdata
 * @return void
 */
public function set_data_from_engine($docdata) {
    $fields = static::$requiredfields + static::$optionalfields + static::$enginefields;

    foreach ($fields as $fieldname => $field) {
        // Optional params might not be there.
        if (!isset($docdata[$fieldname])) {
            continue;
        }

        $enginevalue = $docdata[$fieldname];

        if ($field['type'] === 'tdate') {
            // Time fields may need a preprocessing.
            $this->set($fieldname, static::import_time_from_engine($enginevalue));
            continue;
        }

        // No way we can make this work if there is any multivalue field.
        if (is_array($enginevalue)) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
        }

        $this->set($fieldname, $enginevalue);
    }
}
/**
 * Stores the url that links to the document.
 *
 * @param \moodle_url $url
 * @return void
 */
public function set_doc_url(\moodle_url $url) {
    $this->docurl = $url;
}
/**
 * Returns the url that links to the document.
 *
 * @return \moodle_url|null Null if no url has been set.
 */
public function get_doc_url() {
    return $this->docurl;
}
/**
 * Stores the icon instance of the document.
 *
 * @param \core_search\document_icon $docicon
 * @return void
 */
public function set_doc_icon(document_icon $docicon) {
    $this->docicon = $docicon;
}
/**
 * Returns the icon instance of the document.
 *
 * @return \core_search\document_icon|null Null if no icon has been set.
 */
public function get_doc_icon() {
    return $this->docicon;
}
/**
 * Stores the url that links to the document context.
 *
 * @param \moodle_url $url
 * @return void
 */
public function set_context_url(\moodle_url $url) {
    $this->contexturl = $url;
}
/**
 * Returns the url that links to the document context.
 *
 * @return \moodle_url|null Null if no url has been set.
 */
public function get_context_url() {
    return $this->contexturl;
}
/**
 * Returns the document ready to submit to the search engine.
 *
 * Defaults are applied first, then every field that is set is passed through the
 * engine-dependent formatter matching its type. The document instance data itself is
 * not modified by the formatting.
 *
 * @throws \coding_exception If any required field is missing.
 * @return array
 */
public function export_for_engine() {
    // Set any unset defaults.
    $this->apply_defaults();

    // We don't want to affect the document instance.
    $data = $this->data;

    // Required fields: all of them must be present.
    foreach (static::$requiredfields as $fieldname => $field) {
        if (!isset($data[$fieldname])) {
            throw new \coding_exception('Missing "' . $fieldname . '" field in document with id "' . $this->data['id'] . '"');
        }
        $data[$fieldname] = self::format_field_for_engine($field['type'], $data[$fieldname]);
    }

    // Optional and engine-specific fields: only formatted when present.
    $fields = static::$optionalfields + static::$enginefields;
    foreach ($fields as $fieldname => $field) {
        if (isset($data[$fieldname])) {
            $data[$fieldname] = self::format_field_for_engine($field['type'], $data[$fieldname]);
        }
    }

    return $data;
}

/**
 * Applies the engine-dependent format matching a field type to a single value.
 *
 * The formatters are called through static:: so engine subclasses overriding them are honoured.
 *
 * @param string $type The field type, as declared in the field definitions.
 * @param mixed $value The value currently stored for the field.
 * @return mixed The formatted value; unchanged for types other than tdate, string and text.
 */
private static function format_field_for_engine($type, $value) {
    if ($type === 'tdate') {
        // Overwrite the timestamp with the engine dependent format.
        return static::format_time_for_engine($value);
    }
    if ($type === 'string') {
        // Overwrite the string with the engine dependent format.
        return static::format_string_for_engine($value);
    }
    if ($type === 'text') {
        // Overwrite the text with the engine dependent format.
        return static::format_text_for_engine($value);
    }

    return $value;
}
/**
 * Applies defaults to unset fields. Called after document building, but before export.
 *
 * Currently only the 'type' field gets a default: manager::TYPE_TEXT.
 *
 * Sub-classes of this should make sure to call parent::apply_defaults().
 *
 * @return void
 */
protected function apply_defaults() {
    // Keep the current type if there is one, otherwise fall back to TYPE_TEXT.
    $this->data['type'] = $this->data['type'] ?? manager::TYPE_TEXT;
}
/**
 * Exports the document data to be used as a template context.
 *
 * All the processing is delegated to export_doc(), which is also used by external functions.
 * More info than strictly required is included, as people might be interested in extending
 * the template.
 *
 * @param \renderer_base $output The renderer.
 * @return array
 */
public function export_for_template(\renderer_base $output): array {
    return $this->export_doc($output);
}
/**
 * Returns the current document information.
 *
 * More info than strictly required is included, as themers and ws clients might be interested
 * in showing more stuff.
 *
 * Although content is a required field when setting up the document, it accepts '' (empty)
 * values as they may be the result of stripping out HTML.
 *
 * SECURITY NOTE: It is the responsibility of the document to properly escape any text to be displayed.
 * The renderer will output the content without any further cleaning.
 *
 * @param \renderer_base $output The renderer.
 * @return array
 */
public function export_doc(\renderer_base $output): array {
    global $USER, $CFG;
    require_once($CFG->dirroot . '/course/lib.php');

    list($componentname, $areaname) = \core_search\manager::extract_areaid_parts($this->get('areaid'));
    $context = context::instance_by_id($this->get('contextid'));

    $searcharea = \core_search\manager::get_search_area($this->data['areaid']);
    $title = '';
    if ($this->is_set('title')) {
        $title = $this->format_text($searcharea->get_document_display_title($this));
    }

    $data = [
        'itemid' => $this->get('itemid'),
        'componentname' => $componentname,
        'areaname' => $areaname,
        'courseurl' => (course_get_url($this->get('courseid')))->out(false),
        'coursefullname' => format_string($this->get('coursefullname'), true, ['context' => $context->id]),
        'modified' => userdate($this->get('modified')),
        'timemodified' => $this->get('modified'),
        'title' => ($title !== '') ? $title : get_string('notitle', 'search'),
        'docurl' => ($this->get_doc_url())->out(false),
        'content' => $this->is_set('content') ? $this->format_text($this->get('content')) : null,
        'contextid' => $this->get('contextid'),
        'contexturl' => ($this->get_context_url())->out(false),
        'description1' => $this->is_set('description1') ? $this->format_text($this->get('description1')) : null,
        'description2' => $this->is_set('description2') ? $this->format_text($this->get('description2')) : null,
    ];

    // Now add any attached files: a list of names for several files, a single name for one.
    $files = $this->get_files();
    if (count($files) > 1) {
        $filenames = [];
        foreach ($files as $file) {
            $filenames[] = format_string($file->get_filename(), true, ['context' => $context->id]);
        }
        $data['multiplefiles'] = true;
        $data['filenames'] = $filenames;
    } else if (!empty($files)) {
        $file = reset($files);
        $data['filename'] = format_string($file->get_filename(), true, ['context' => $context->id]);
    }

    // User details are only exposed to the user themselves, or to users holding both capabilities.
    if ($this->is_set('userid')) {
        $canviewuser = $this->get('userid') == $USER->id ||
                (has_capability('moodle/user:viewdetails', $context) &&
                has_capability('moodle/course:viewparticipants', $context));
        if ($canviewuser) {
            $userurl = new \moodle_url(
                '/user/view.php',
                ['id' => $this->get('userid'), 'course' => $this->get('courseid')]
            );
            $data['userurl'] = $userurl->out(false);
            $data['userfullname'] = format_string($this->get('userfullname'), true, ['context' => $context->id]);
            $data['userid'] = $this->get('userid');
        }
    }

    $docicon = $this->get_doc_icon();
    if ($docicon) {
        $data['icon'] = $output->image_url($docicon->get_name(), $docicon->get_component());
        $data['iconurl'] = $data['icon']->out(false);
    }

    $data['textformat'] = $this->get_text_format();

    return $data;
}
/**
 * Formats a text string coming from the search engine.
 *
 * The text is passed to the global format_text() using the format reported by
 * get_text_format() and the document context id:
 * - Search areas are responsible for sending just plain data; the search engine may
 *   append HTML or markdown to it (highlighting for example).
 * - The view is responsible for shortening the text if it is too big.
 *
 * @param string $text Text to format
 * @return string HTML text to be rendered
 */
protected function format_text($text) {
    $textformat = $this->get_text_format();
    $options = ['context' => $this->get('contextid')];

    return format_text($text, $textformat, $options);
}