Merge branch 'MDL-42366-master' of https://github.com/lucisgit/moodle
[moodle.git] / search / classes / document.php
blob080806635eb8497d6824f7375dcf237659f85a3d
1 <?php
2 // This file is part of Moodle - http://moodle.org/
3 //
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
17 /**
18 * Document representation.
20 * @package core_search
21 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
25 namespace core_search;
27 defined('MOODLE_INTERNAL') || die();
29 /**
30 * Represents a document to index.
32 * Note that, if you are writing a search engine and you want to change \core_search\document
33 * behaviour, you can override this class; it will be automatically loaded from \search_YOURENGINE\document.
35 * @package core_search
36 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
37 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
39 class document implements \renderable, \templatable {
41 /**
42 * @var array $data The document data.
44 protected $data = array();
46 /**
47 * @var array Extra data needed to render the document.
49 protected $extradata = array();
51 /**
52 * @var \moodle_url Link to the document.
54 protected $docurl = null;
56 /**
57 * @var \moodle_url Link to the document context.
59 protected $contexturl = null;
61 /**
62 * @var int|null The content field filearea.
64 protected $contentfilearea = null;
66 /**
67 * @var int|null The content field itemid.
69 protected $contentitemid = null;
71 /**
72 * @var bool Should be set to true if document hasn't been indexed before. False if unknown.
74 protected $isnew = false;
76 /**
77 * @var \stored_file[] An array of stored files to attach to the document.
79 protected $files = array();
81 /**
82 * Change list (for engine implementers):
83 * 2017091700 - add optional field groupid
85 * @var int Schema version number (update if any change)
87 const SCHEMA_VERSION = 2017091700;
89 /**
90 * All required fields any doc should contain.
92 * We have to choose a format to specify field types, using solr format as we have to choose one and solr is the
93 * default search engine.
95 * Search engine plugins are responsible of setting their appropriate field types and map these naming to whatever format
96 * they need.
98 * @var array
100 protected static $requiredfields = array(
101 'id' => array(
102 'type' => 'string',
103 'stored' => true,
104 'indexed' => false
106 'itemid' => array(
107 'type' => 'int',
108 'stored' => true,
109 'indexed' => true
111 'title' => array(
112 'type' => 'text',
113 'stored' => true,
114 'indexed' => true,
115 'mainquery' => true
117 'content' => array(
118 'type' => 'text',
119 'stored' => true,
120 'indexed' => true,
121 'mainquery' => true
123 'contextid' => array(
124 'type' => 'int',
125 'stored' => true,
126 'indexed' => true
128 'areaid' => array(
129 'type' => 'string',
130 'stored' => true,
131 'indexed' => true
133 'type' => array(
134 'type' => 'int',
135 'stored' => true,
136 'indexed' => true
138 'courseid' => array(
139 'type' => 'int',
140 'stored' => true,
141 'indexed' => true
143 'owneruserid' => array(
144 'type' => 'int',
145 'stored' => true,
146 'indexed' => true
148 'modified' => array(
149 'type' => 'tdate',
150 'stored' => true,
151 'indexed' => true
156 * All optional fields docs can contain.
158 * Although it matches solr fields format, this is just to define the field types. Search
159 * engine plugins are responsible of setting their appropriate field types and map these
160 * naming to whatever format they need.
162 * @var array
164 protected static $optionalfields = array(
165 'userid' => array(
166 'type' => 'int',
167 'stored' => true,
168 'indexed' => true
170 'groupid' => array(
171 'type' => 'int',
172 'stored' => true,
173 'indexed' => true
175 'description1' => array(
176 'type' => 'text',
177 'stored' => true,
178 'indexed' => true,
179 'mainquery' => true
181 'description2' => array(
182 'type' => 'text',
183 'stored' => true,
184 'indexed' => true,
185 'mainquery' => true
190 * Any fields that are engine specific. These are fields that are solely used by a search engine plugin
191 * for internal purposes.
193 * Field names should be prefixed with engine name to avoid potential conflict with core fields.
195 * Uses same format as fields above.
197 * @var array
199 protected static $enginefields = array();
/**
 * Builds a document whose id is unique across all search areas.
 *
 * The document id is the generated area id and the item id joined by a dash.
 *
 * @throws \coding_exception If $itemid is not numeric
 * @param int $itemid An id unique to the search area
 * @param string $componentname The search area component Frankenstyle name
 * @param string $areaname The area name (the search area class name)
 */
public function __construct($itemid, $componentname, $areaname) {

    if (!is_numeric($itemid)) {
        throw new \coding_exception('The itemid should be an integer');
    }

    $areaid = \core_search\manager::generate_areaid($componentname, $areaname);

    // Keys are added in this order: areaid, id, itemid.
    $this->data['areaid'] = $areaid;
    $this->data['id'] = $areaid . '-' . $itemid;
    $this->data['itemid'] = intval($itemid);
}
/**
 * Attaches a stored file (or a stored file id) to the document.
 *
 * Entries are keyed by file id, so adding the same file twice keeps a single entry.
 *
 * @param \stored_file|int $file The file to add, or file id.
 * @return void
 */
public function add_stored_file($file) {
    // A numeric value is a file id and is used as its own key; get_files() turns it into an instance later.
    $fileid = is_numeric($file) ? $file : $file->get_id();
    $this->files[$fileid] = $file;
}
/**
 * Returns the array of attached files.
 *
 * Entries that were added as file ids are replaced by their \stored_file instances the first
 * time this method is called. Ids that no longer match an existing file are dropped.
 *
 * @return \stored_file[]
 */
public function get_files() {
    // The file storage is only needed if there are ids to resolve, so fetch it lazily and only once.
    $fs = null;

    // The files array can contain stored file ids, so we need to get instances if asked.
    foreach ($this->files as $id => $listfile) {
        if (!is_numeric($listfile)) {
            // Already a stored file instance.
            continue;
        }

        if ($fs === null) {
            $fs = get_file_storage();
        }

        $file = $fs->get_file_by_id($id);
        if ($file) {
            $this->files[$id] = $file;
        } else {
            // Index is out of date and referencing a file that does not exist.
            unset($this->files[$id]);
        }
    }

    return $this->files;
}
/**
 * Setter.
 *
 * Performs basic checks to prevent common issues.
 *
 * The field must be declared in the required, optional or engine fields definitions.
 * 'int' and 'tdate' fields must receive a numeric value, which is cast to a PHP integer;
 * tdate values are expected to be timestamps. Any other field has disallowed Unicode
 * characters removed and every run of whitespace replaced by a single space.
 *
 * @throws \coding_exception If the field is unknown or a numeric field gets a non numeric value
 * @throws \moodle_exception If the whitespace replacement fails
 * @param string $fieldname The field name
 * @param string|int $value The value to store
 * @return string|int The stored value
 */
public function set($fieldname, $value) {

    // Look for the field definition: required fields first, then optional, then engine specific ones.
    $fielddata = null;
    foreach ([static::$requiredfields, static::$optionalfields, static::$enginefields] as $definitions) {
        if (!empty($definitions[$fieldname])) {
            $fielddata = $definitions[$fieldname];
            break;
        }
    }

    if (empty($fielddata)) {
        throw new \coding_exception('"' . $fieldname . '" field does not exist.');
    }

    // The tdate fields should be set as timestamps, later they might be converted to
    // a date format, it depends on the search engine.
    $isintegerfield = in_array($fielddata['type'], array('int', 'tdate'), true);

    if ($isintegerfield) {
        if (!is_numeric($value)) {
            throw new \coding_exception('"' . $fieldname . '" value should be an integer and its value is "' . $value . '"');
        }

        // We want to be strict here, there might be engines that expect us to
        // provide them data with the proper type already set.
        $this->data[$fieldname] = intval($value);
        return $this->data[$fieldname];
    }

    // Remove disallowed Unicode characters.
    $cleanvalue = \core_text::remove_unicode_non_characters($value);

    // Replace all groups of line breaks and spaces by single spaces.
    $this->data[$fieldname] = preg_replace("/\s+/u", " ", $cleanvalue);

    if ($this->data[$fieldname] === null) {
        // A null result means that preg_replace failed.
        $docid = isset($this->data['id']) ? $this->data['id'] : '(unknown)';
        throw new \moodle_exception('error_indexing', 'search', '', null, '"' . $fieldname .
                '" value causes preg_replace error (may be caused by unusual characters) ' .
                'in document with id "' . $docid . '"');
    }

    return $this->data[$fieldname];
}
/**
 * Stores a value in the document extra data.
 *
 * Extra data is kept apart from the document fields and is not validated. It can be
 * read back with get(), which falls back to the extra data when no document field is set.
 *
 * @param string $fieldname
 * @param string $value
 * @return void
 */
public function set_extra($fieldname, $value) {
    $this->extradata[$fieldname] = $value;
}
/**
 * Getter.
 *
 * Document fields take precedence over extra data. Use is_set() first if you are not
 * sure whether the field is set, as otherwise a \coding_exception is thrown.
 *
 * @throws \coding_exception If the field is set neither as a document field nor as extra data
 * @param string $field
 * @return string|int
 */
public function get($field) {

    // Document data first, extra data as a fallback.
    foreach ([$this->data, $this->extradata] as $values) {
        if (isset($values[$field])) {
            return $values[$field];
        }
    }

    throw new \coding_exception('Field "' . $field . '" is not set in the document');
}
/**
 * Tells whether a field is set, either as a document field or as extra data.
 *
 * @param string $field
 * @return bool
 */
public function is_set($field) {
    if (isset($this->data[$field])) {
        return true;
    }
    return isset($this->extradata[$field]);
}
/**
 * Flags the document as new (not indexed before). Use false if unknown.
 *
 * @param bool $new Any value, it is converted to a boolean
 * @return void
 */
public function set_is_new($new) {
    $this->isnew = $new ? true : false;
}
/**
 * Tells whether the document is flagged as new. False if unknown.
 *
 * @return bool
 */
public function get_is_new() {
    $isnew = $this->isnew;
    return $isnew;
}
/**
 * Returns the definitions of all the fields: required, optional and engine specific.
 *
 * If a field name is defined more than once, the first definition found wins
 * (required, then optional, then engine specific).
 *
 * @return array
 */
public static function get_default_fields_definition() {
    // Working on a copy, the static definitions are not modified.
    $definitions = static::$requiredfields;
    $definitions += static::$optionalfields;
    $definitions += static::$enginefields;
    return $definitions;
}
/**
 * Prepares a timestamp to be inserted into the search engine.
 *
 * The default implementation returns the timestamp untouched, so any search engine can
 * store integers and compare them to get documents between two timestamps. Search engines
 * that want their own field format can override this method in \search_xxx\document.
 *
 * @param int $timestamp
 * @return int|string The value to send to the engine, the same timestamp by default
 */
public static function format_time_for_engine($timestamp) {
    $formatted = $timestamp;
    return $formatted;
}
/**
 * Prepares a string value to be inserted into the search engine.
 *
 * The default implementation returns the string untouched. Search engines may override
 * this method to apply restrictions, like limiting the size.
 *
 * @param string $string
 * @return string
 */
public static function format_string_for_engine($string) {
    $formatted = $string;
    return $formatted;
}
/**
 * Prepares a text value to be inserted into the search engine.
 *
 * The default implementation returns the text untouched. Search engines may override
 * this method to apply restrictions, like limiting the size.
 *
 * @param string $text
 * @return string
 */
public static function format_text_for_engine($text) {
    $formatted = $text;
    return $formatted;
}
/**
 * Converts a time value stored in the search engine back into a timestamp.
 *
 * The default implementation returns the value untouched, matching the default
 * format_time_for_engine(). Search engines that use their own time format should
 * override this method in \search_xxx\document.
 *
 * @param string $time The value as returned by the search engine
 * @return int
 */
public static function import_time_from_engine($time) {
    $timestamp = $time;
    return $timestamp;
}
/**
 * Returns the format of the text returned by the search engine.
 *
 * This is the format format_text() uses when formatting text fields for display.
 *
 * @return int FORMAT_PLAIN by default
 */
protected function get_text_format() {
    $format = FORMAT_PLAIN;
    return $format;
}
/**
 * Fills the document with data coming from the search engine.
 *
 * Only the defined fields (required, optional and engine specific) are read from
 * $docdata, every value goes through set().
 *
 * @throws \core_search\engine_exception If a field other than a tdate one contains an array
 * @param array $docdata
 * @return void
 */
public function set_data_from_engine($docdata) {
    $definitions = static::$requiredfields + static::$optionalfields + static::$enginefields;

    foreach ($definitions as $fieldname => $definition) {

        // Optional params might not be there.
        if (!isset($docdata[$fieldname])) {
            continue;
        }
        $enginevalue = $docdata[$fieldname];

        if ($definition['type'] === 'tdate') {
            // Time fields may need a preprocessing.
            $this->set($fieldname, static::import_time_from_engine($enginevalue));
            continue;
        }

        // No way we can make this work if there is any multivalue field.
        if (is_array($enginevalue)) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
        }
        $this->set($fieldname, $enginevalue);
    }
}
/**
 * Stores the link to the document.
 *
 * @param \moodle_url $url
 * @return void
 */
public function set_doc_url(\moodle_url $url) {
    $this->docurl = $url;
}
/**
 * Returns the link to the document.
 *
 * @return \moodle_url|null Null if no url has been set
 */
public function get_doc_url() {
    $url = $this->docurl;
    return $url;
}
/**
 * Stores the link to the document context.
 *
 * @param \moodle_url $url
 * @return void
 */
public function set_context_url(\moodle_url $url) {
    $this->contexturl = $url;
}
/**
 * Returns the link to the document context.
 *
 * @return \moodle_url|null Null if no url has been set
 */
public function get_context_url() {
    $url = $this->contexturl;
    return $url;
}
/**
 * Returns the document ready to submit to the search engine.
 *
 * Defaults are applied to the document first. The returned array is a copy of the
 * document data where every defined field has been passed through the engine formatter
 * that matches its type. All required fields must be set; optional and engine specific
 * fields are only formatted when they are set.
 *
 * @throws \coding_exception If a required field is not set
 * @return array
 */
public function export_for_engine() {
    // Set any unset defaults.
    $this->apply_defaults();

    // We don't want to affect the document instance.
    $data = $this->data;

    // Apply specific engine-dependant formats and restrictions.
    foreach (static::$requiredfields as $fieldname => $field) {

        // We also check that we have everything we need.
        if (!isset($data[$fieldname])) {
            throw new \coding_exception('Missing "' . $fieldname . '" field in document with id "' . $this->data['id'] . '"');
        }

        $data[$fieldname] = self::format_value_for_engine($field['type'], $data[$fieldname]);
    }

    // Optional and engine specific fields are formatted only if they are present.
    $fields = static::$optionalfields + static::$enginefields;
    foreach ($fields as $fieldname => $field) {
        if (!isset($data[$fieldname])) {
            continue;
        }

        $data[$fieldname] = self::format_value_for_engine($field['type'], $data[$fieldname]);
    }

    return $data;
}

/**
 * Passes a value through the engine formatter that matches its field type.
 *
 * The formatters are called through static:: so the overrides of the search engine
 * document class are used. Values of any other type are returned untouched.
 *
 * @param string $type The field type ('tdate', 'string', 'text'...)
 * @param mixed $value The value stored in the document
 * @return mixed The value to send to the search engine
 */
private static function format_value_for_engine($type, $value) {
    if ($type === 'tdate') {
        // Overwrite the timestamp with the engine dependant format.
        return static::format_time_for_engine($value);
    }
    if ($type === 'string') {
        // Overwrite the string with the engine dependant format.
        return static::format_string_for_engine($value);
    }
    if ($type === 'text') {
        // Overwrite the text with the engine dependant format.
        return static::format_text_for_engine($value);
    }
    return $value;
}
/**
 * Applies defaults to unset fields. Called by export_for_engine() before exporting the data.
 *
 * Sub-classes of this should make sure to call parent::apply_defaults().
 *
 * @return void
 */
protected function apply_defaults() {
    if (isset($this->data['type'])) {
        return;
    }

    // The default type is TYPE_TEXT.
    $this->data['type'] = manager::TYPE_TEXT;
}
/**
 * Export the document data to be used as a template context.
 *
 * More info than the strictly required is exported as people might be interested in
 * extending the template.
 *
 * Although content is a required field when setting up the document, it accepts '' (empty)
 * values as they may be the result of stripping out HTML. An empty or unset title is
 * replaced by the 'notitle' string.
 *
 * SECURITY NOTE: It is the responsibility of the document to properly escape any text to be displayed.
 * The renderer will output the content without any further cleaning.
 *
 * @throws \coding_exception If areaid, courseid, coursefullname, contextid or modified are not set
 * @param \renderer_base $output The renderer.
 * @return array
 */
public function export_for_template(\renderer_base $output) {
    list($componentname, $areaname) = \core_search\manager::extract_areaid_parts($this->get('areaid'));

    // Formats a text field if it is set, returns null otherwise.
    $formatifset = function($fieldname) {
        if (!$this->is_set($fieldname)) {
            return null;
        }
        return $this->format_text($this->get($fieldname));
    };

    $title = $formatifset('title');
    if ($title === null) {
        $title = '';
    }

    $data = [];
    $data['componentname'] = $componentname;
    $data['areaname'] = $areaname;
    $data['courseurl'] = course_get_url($this->get('courseid'));
    $data['coursefullname'] = format_string($this->get('coursefullname'), true, array('context' => $this->get('contextid')));
    $data['modified'] = userdate($this->get('modified'));
    $data['title'] = ($title !== '') ? $title : get_string('notitle', 'search');
    $data['docurl'] = $this->get_doc_url();
    $data['content'] = $formatifset('content');
    $data['contexturl'] = $this->get_context_url();
    $data['description1'] = $formatifset('description1');
    $data['description2'] = $formatifset('description2');

    // The context id is known to be set at this point, it has already been read above.
    $stringoptions = array('context' => $this->get('contextid'));

    // Now add the names of any attached files.
    $files = $this->get_files();
    if (count($files) > 1) {
        $filenames = array();
        foreach ($files as $attachedfile) {
            $filenames[] = format_string($attachedfile->get_filename(), true, $stringoptions);
        }
        $data['multiplefiles'] = true;
        $data['filenames'] = $filenames;
    } else if (!empty($files)) {
        $onlyfile = reset($files);
        $data['filename'] = format_string($onlyfile->get_filename(), true, $stringoptions);
    }

    if ($this->is_set('userid')) {
        $urlparams = array('id' => $this->get('userid'), 'course' => $this->get('courseid'));
        $data['userurl'] = new \moodle_url('/user/view.php', $urlparams);
        $data['userfullname'] = format_string($this->get('userfullname'), true, $stringoptions);
    }

    return $data;
}
/**
 * Formats a text string coming from the search engine.
 *
 * The text is formatted with the format returned by get_text_format() in the document context:
 * - Search areas are responsible for sending just plain data, the search engine may
 *   append HTML or markdown to it (highlighting for example).
 * - The view is responsible for shortening the text if it is too big.
 *
 * @throws \coding_exception If contextid is not set
 * @param string $text Text to format
 * @return string HTML text to be rendered
 */
protected function format_text($text) {
    $textformat = $this->get_text_format();
    $options = array('context' => $this->get('contextid'));

    return format_text($text, $textformat, $options);
}