Merge branch 'MDL-79830' of https://github.com/paulholden/moodle
[moodle.git] / analytics / classes / dataset_manager.php
blob811bf7cca6ef7c7658df72d10a051192afe46953
1 <?php
2 // This file is part of Moodle - http://moodle.org/
3 //
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
17 /**
18 * Datasets manager.
20 * @package core_analytics
21 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
25 namespace core_analytics;
27 defined('MOODLE_INTERNAL') || die();
29 /**
30 * Datasets manager.
32 * @package core_analytics
33 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
34 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
36 class dataset_manager {
/**
 * File area for labelled datasets.
 */
const LABELLED_FILEAREA = 'labelled';

/**
 * File area for unlabelled datasets.
 */
const UNLABELLED_FILEAREA = 'unlabelled';

/**
 * File area for exported datasets.
 */
const EXPORT_FILEAREA = 'export';

/**
 * Evaluation file name.
 */
const EVALUATION_FILENAME = 'evaluation.csv';

/**
 * The model id.
 *
 * @var int
 */
protected $modelid;

/**
 * Range processor (time splitting method) in use.
 *
 * @var string
 */
protected $timesplittingid;

/**
 * The analysable id; used as part of the stored file path.
 *
 * @var int
 */
protected $analysableid;

/**
 * Whether this is a dataset for evaluation or not.
 *
 * @var bool
 */
protected $evaluation;

/**
 * The dataset filearea. Must be one of the self::*_FILEAREA options.
 *
 * @var string
 */
protected $filearea;
/**
 * Builds a dataset manager bound to a model, an analysable and a time splitting method.
 *
 * @throws \coding_exception If $filearea is not one of the self::*_FILEAREA constants.
 * @param int $modelid
 * @param int $analysableid
 * @param string $timesplittingid
 * @param string $filearea One of the self::*_FILEAREA constants.
 * @param bool $evaluation Whether the dataset is an evaluation dataset.
 */
public function __construct($modelid, $analysableid, $timesplittingid, $filearea, $evaluation = false) {

    $validfileareas = [self::EXPORT_FILEAREA, self::LABELLED_FILEAREA, self::UNLABELLED_FILEAREA];

    // Strict membership test: the value must be identical to one of the known file areas.
    if (!in_array($filearea, $validfileareas, true)) {
        throw new \coding_exception('Invalid provided filearea');
    }

    $this->evaluation = $evaluation;
    $this->filearea = $filearea;
    $this->timesplittingid = $timesplittingid;
    $this->analysableid = $analysableid;
    $this->modelid = $modelid;
}
/**
 * Store the dataset in the internal file system.
 *
 * The rows are written as CSV to a temporary file inside the request directory; that file
 * is then added to the file storage under /analysable/{analysableid}/{timesplittingid}/.
 *
 * @param array $data Dataset rows, each row being an array of CSV fields.
 * @return \stored_file|false The stored file, or false if the temporary file could not be opened.
 */
public function store($data) {

    $fs = get_file_storage();

    $filerecord = [
        'component' => 'analytics',
        'filearea' => $this->filearea,
        'itemid' => $this->modelid,
        'contextid' => \context_system::instance()->id,
        'filepath' => '/analysable/' . $this->analysableid . '/' .
            \core_analytics\analysis::clean_time_splitting_id($this->timesplittingid) . '/',
        'filename' => self::get_filename($this->evaluation)
    ];

    // Delete previous and old (we already checked that previous copies are not recent) evaluation files for this analysable.
    if ($this->evaluation) {
        // The fragment is the test applied to the itemid column. Both values are passed as
        // named placeholders rather than being interpolated into the SQL.
        $select = " = :itemid AND filepath = :filepath";
        $params = [
            'itemid' => $filerecord['itemid'],
            'filepath' => $filerecord['filepath'],
        ];
        $fs->delete_area_files_select($filerecord['contextid'], $filerecord['component'], $filerecord['filearea'],
            $select, $params);
    }

    // Write all this stuff to a tmp file.
    $filepath = make_request_directory() . DIRECTORY_SEPARATOR . $filerecord['filename'];
    $fh = fopen($filepath, 'w+');
    if (!$fh) {
        return false;
    }
    foreach ($data as $line) {
        fputcsv($fh, $line);
    }
    fclose($fh);

    return $fs->create_file_from_pathname($filerecord, $filepath);
}
/**
 * Looks up the evaluation file previously stored for a model and time splitting method.
 *
 * Important to note that this is per modelid + timesplittingid, when dealing with multiple
 * analysables this is the merged file. Do not confuse with self::get_evaluation_analysable_file
 *
 * @param int $modelid
 * @param string $timesplittingid
 * @return \stored_file|bool Whatever the file storage returns for that location; callers in this
 *                           class treat a falsy value as "no previous file".
 */
public static function get_previous_evaluation_file($modelid, $timesplittingid) {
    $storage = get_file_storage();

    $cleanid = \core_analytics\analysis::clean_time_splitting_id($timesplittingid);
    $directory = '/timesplitting/' . $cleanid . '/';

    // Evaluation data is always labelled, hence the labelled file area.
    return $storage->get_file(
        \context_system::instance()->id,
        'analytics',
        self::LABELLED_FILEAREA,
        $modelid,
        $directory,
        self::EVALUATION_FILENAME
    );
}
/**
 * Gets the list of files that couldn't be previously used for training and prediction.
 *
 * Files already recorded in analytics_used_files for this model and action ('trained' when
 * $includetarget is set, 'predicted' otherwise), directories and evaluation files are skipped.
 *
 * @param int $modelid
 * @param bool $includetarget True to look at labelled (training) files, false for unlabelled ones.
 * @param string[] $timesplittingids
 * @return array Pending \stored_file instances grouped by time splitting id. Time splitting
 *               methods without pending files have no entry.
 */
public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
    global $DB;

    $fs = get_file_storage();

    if ($includetarget) {
        $filearea = self::LABELLED_FILEAREA;
        $usedfileaction = 'trained';
    } else {
        $filearea = self::UNLABELLED_FILEAREA;
        $usedfileaction = 'predicted';
    }

    $select = 'modelid = :modelid AND action = :action';
    $params = array('modelid' => $modelid, 'action' => $usedfileaction);
    $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);

    // Turn the id list into a lookup table so each membership test is a hash lookup
    // instead of a scan of the whole list for every file.
    $usedfileids = array_flip($usedfileids);

    // The context is the same for every time splitting method.
    $contextid = \context_system::instance()->id;

    // Very likely that we will only have 1 time splitting method here.
    $filesbytimesplitting = array();
    foreach ($timesplittingids as $timesplittingid) {

        $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
        $files = $fs->get_directory_files($contextid, 'analytics', $filearea, $modelid, $filepath);
        foreach ($files as $file) {

            // Discard evaluation files.
            if ($file->get_filename() === self::EVALUATION_FILENAME) {
                continue;
            }

            // No dirs.
            if ($file->is_directory()) {
                continue;
            }

            // Already used for this action (training or prediction).
            if (isset($usedfileids[$file->get_id()])) {
                continue;
            }

            $filesbytimesplitting[$timesplittingid][] = $file;
        }
    }

    return $filesbytimesplitting;
}
/**
 * Removes the previous evaluation file of this model and time splitting method, if any.
 *
 * @param int $modelid
 * @param string $timesplittingid
 * @return bool True when a file was found and deleted, false when there was nothing to delete.
 */
public static function delete_previous_evaluation_file($modelid, $timesplittingid) {
    $previous = self::get_previous_evaluation_file($modelid, $timesplittingid);

    // Nothing stored yet.
    if (!$previous) {
        return false;
    }

    $previous->delete();
    return true;
}
/**
 * Fetches the evaluation file of a given model + analysable + time splitting combination.
 *
 * @param int $modelid
 * @param int $analysableid
 * @param string $timesplittingid
 * @return \stored_file|bool Whatever the file storage returns for that location.
 */
public static function get_evaluation_analysable_file($modelid, $analysableid, $timesplittingid) {

    $storage = get_file_storage();

    // Evaluation files are always named evaluation.csv and always live in the labelled area.
    $evaluationname = self::get_filename(true);
    $directory = sprintf(
        '/analysable/%s/%s/',
        $analysableid,
        \core_analytics\analysis::clean_time_splitting_id($timesplittingid)
    );

    return $storage->get_file(
        \context_system::instance()->id,
        'analytics',
        self::LABELLED_FILEAREA,
        $modelid,
        $directory,
        $evaluationname
    );
}
/**
 * Merge multiple files into one.
 *
 * Every dataset starts with three header rows: the variable names, the values of those
 * variables for the analysable, and the column names. The merged file keeps the variable
 * names and columns of the last file read, joins the distinct values of each variable
 * with '|' and then appends the data rows of all files.
 *
 * Important! It is the caller responsibility to ensure that the datasets are compatible.
 *
 * @throws \coding_exception If $files is empty.
 * @throws \moodle_exception If the temporary merge file can not be opened for writing.
 * @param array $files \stored_file instances to merge.
 * @param int $modelid
 * @param string $timesplittingid
 * @param string $filearea
 * @param bool $evaluation
 * @return \stored_file
 */
public static function merge_datasets(array $files, $modelid, $timesplittingid, $filearea, $evaluation = false) {

    // Without files there are no headers to write; fail explicitly instead of passing
    // non-array values to fputcsv() below.
    if (empty($files)) {
        throw new \coding_exception('No dataset files provided to merge');
    }

    $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . 'tmpfile.csv';

    // Add headers.
    // We could also do this with a single iteration gathering all files headers and appending them to the beginning of the file
    // once all file contents are merged.
    $varnames = '';
    $columns = '';
    $analysablesvalues = array();
    foreach ($files as $file) {
        $rh = $file->get_content_file_handle();

        // Copy the var names as they are, all files should have the same var names.
        $varnames = fgetcsv($rh);

        $analysablesvalues[] = fgetcsv($rh);

        // Copy the columns as they are, all files should have the same columns.
        $columns = fgetcsv($rh);

        // Only the header rows are needed in this pass; release the handle right away.
        fclose($rh);
    }

    // Merge analysable values skipping the ones that are the same in all analysables.
    $values = array();
    foreach ($analysablesvalues as $analysablevalues) {
        foreach ($analysablevalues as $varkey => $value) {
            // Sha1 to make it unique.
            $values[$varkey][sha1($value)] = $value;
        }
    }
    foreach ($values as $varkey => $varvalues) {
        $values[$varkey] = implode('|', $varvalues);
    }

    // Start writing to the merge file.
    $wh = fopen($tmpfilepath, 'w');
    if (!$wh) {
        throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
    }

    fputcsv($wh, $varnames);
    fputcsv($wh, $values);
    fputcsv($wh, $columns);

    // Iterate through all files and add them to the tmp one. We don't want file contents in memory.
    foreach ($files as $file) {
        $rh = $file->get_content_file_handle();

        // Skip headers.
        fgets($rh);
        fgets($rh);
        fgets($rh);

        // Copy all the following lines. Compare against false explicitly so a final
        // line such as "0" without a trailing newline is not mistaken for end of file.
        while (($line = fgets($rh)) !== false) {
            fwrite($wh, $line);
        }
        fclose($rh);
    }
    fclose($wh);

    $filerecord = [
        'component' => 'analytics',
        'filearea' => $filearea,
        'itemid' => $modelid,
        'contextid' => \context_system::instance()->id,
        'filepath' => '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/',
        'filename' => self::get_filename($evaluation)
    ];

    $fs = get_file_storage();

    return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
}
/**
 * Builds the export file with the training data of a model.
 *
 * All labelled files found for the time splitting method, evaluation files excluded,
 * are merged into a single file stored in the export file area.
 *
 * @param int $modelid
 * @param string $timesplittingid
 * @return \stored_file|false The merged file, false when there is no training data to export.
 */
public static function export_training_data($modelid, $timesplittingid) {

    $storage = get_file_storage();

    $systemcontextid = \context_system::instance()->id;
    $directory = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';

    $candidates = $storage->get_directory_files($systemcontextid, 'analytics', self::LABELLED_FILEAREA, $modelid,
        $directory, true, false);

    // Keep everything but the evaluation files (array keys are preserved).
    $trainingfiles = array_filter($candidates, function($candidate) {
        return $candidate->get_filename() !== self::EVALUATION_FILENAME;
    });

    if (!$trainingfiles) {
        return false;
    }

    return self::merge_datasets($trainingfiles, $modelid, $timesplittingid, self::EXPORT_FILEAREA);
}
/**
 * Returns the dataset file data structured by sampleids using the indicators and target column names.
 *
 * The first two lines of the file (dataset info) are skipped, the third one provides the
 * column names and every following row is returned as column name => value, indexed by
 * the value of the row's first field (the sample id). Empty fields become null and numeric
 * fields become floats.
 *
 * @throws \coding_exception If the file does not belong to the unlabelled file area.
 * @param \stored_file $dataset
 * @return array
 */
public static function get_structured_data(\stored_file $dataset) {

    if ($dataset->get_filearea() !== self::UNLABELLED_FILEAREA) {
        throw new \coding_exception('Sorry, only support for unlabelled data');
    }

    $rh = $dataset->get_content_file_handle();

    // Skip dataset info.
    fgets($rh);
    fgets($rh);

    $calculations = array();

    $headers = fgetcsv($rh);
    // Get rid of the sampleid column name.
    array_shift($headers);

    while (($columns = fgetcsv($rh)) !== false) {

        // fgetcsv() reports a blank line as an array holding a single null field; there is
        // no sample in such a row and it would not match the headers.
        if ($columns === [null]) {
            continue;
        }

        $uniquesampleid = array_shift($columns);

        // Unfortunately fgetcsv does not respect line's var types.
        $calculations[$uniquesampleid] = array_map(function($value) {

            if ($value === '') {
                // We really want them as null because converted to float become 0
                // and we need to treat the values separately.
                return null;
            } else if (is_numeric($value)) {
                return floatval($value);
            }
            return $value;
        }, array_combine($headers, $columns));
    }

    // Release the read handle once the whole file has been consumed.
    fclose($rh);

    return $calculations;
}
/**
 * Removes the files stored for a model.
 *
 * @param int $modelid
 * @return bool Result of the file storage deletion call.
 */
public static function clear_model_files($modelid) {
    $storage = get_file_storage();
    $systemcontextid = \context_system::instance()->id;

    // NOTE(review): false is passed as the file area, presumably so that no area filter is
    // applied and all of the model's areas are covered; confirm against file_storage::delete_area_files().
    return $storage->delete_area_files($systemcontextid, 'analytics', false, $modelid);
}
/**
 * Works out the name of the dataset file.
 *
 * @param bool $evaluation Only a strict boolean true selects the evaluation file name.
 * @return string The fixed evaluation file name, or a timestamp based name ending in .csv.
 */
protected static function get_filename($evaluation) {

    if ($evaluation !== true) {
        // Incremental time; per the original author's note a lock is expected to prevent
        // concurrent callers from clashing on the same name.
        return microtime(true) . '.csv';
    }

    return self::EVALUATION_FILENAME;
}