MDL-51579 course: Bump version to update mobile service
[moodle.git] / lib / filestorage / tgz_extractor.php
blob0fb40ba38a0ec94746ead94f269267a3d33d47d8
1 <?php
2 // This file is part of Moodle - http://moodle.org/
3 //
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
17 /**
18 * Implementation of .tar.gz extractor. Handles extraction of .tar.gz files.
19 * Do not call directly; use methods in tgz_packer.
21 * @see tgz_packer
22 * @package core_files
23 * @copyright 2013 The Open University
24 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
27 defined('MOODLE_INTERNAL') || die();
/**
 * Extracts .tar.gz files (POSIX format).
 *
 * The archive is read as a gzip stream and processed one tar block at a
 * time. Depending on the mode, file entries are either written to locations
 * chosen by a tgz_extractor_handler (extract) or only recorded (list).
 *
 * If the archive contains the special index file
 * (tgz_packer::ARCHIVE_INDEX_FILE) it is used to learn the file count for
 * progress reporting (extract) or to produce the listing without reading
 * the rest of the archive (list).
 */
class tgz_extractor {
    /**
     * @var int When writing data, the system writes blocks of this size.
     */
    const WRITE_BLOCK_SIZE = 65536;

    /**
     * @var int When reading data, the system reads blocks of this size.
     */
    const READ_BLOCK_SIZE = 65536;

    /**
     * @var int If extracting (default).
     */
    const MODE_EXTRACT = 0;

    /**
     * @var int Listing contents.
     */
    const MODE_LIST = 1;

    /**
     * @var int Listing contents; list now complete.
     */
    const MODE_LIST_COMPLETE = 2;

    /**
     * @var stored_file File object for archive.
     */
    protected $storedfile;

    /**
     * @var string OS path for archive.
     */
    protected $ospath;

    /**
     * @var int Number of files (-1 if not known).
     */
    protected $numfiles = -1;

    /**
     * @var int Number of files processed so far.
     */
    protected $donefiles = 0;

    /**
     * @var string Current file path within archive.
     */
    protected $currentarchivepath;

    /**
     * @var string|bool|null Full path to current file, true while the current
     *      file is being skipped, or null when no file is in progress.
     */
    protected $currentfile;

    /**
     * @var int Size of current file in bytes.
     */
    protected $currentfilesize = 0;

    /**
     * @var int Number of bytes of current file already consumed from the archive.
     */
    protected $currentfileprocessed = 0;

    /**
     * @var resource|null File handle to current file (null if none is open).
     */
    protected $currentfp;

    /**
     * @var int Modified time of current file.
     */
    protected $currentmtime;

    /**
     * @var string Buffer containing file data awaiting write.
     */
    protected $filebuffer = '';

    /**
     * @var int Current length of buffer in bytes.
     */
    protected $filebufferlength = 0;

    /**
     * @var array Results array of all files processed.
     */
    protected $results = array();

    /**
     * @var array In list mode, content of the list; outside list mode, null.
     */
    protected $listresults = null;

    /**
     * @var int Whether listing or extracting (one of the MODE_xx constants).
     */
    protected $mode = self::MODE_EXTRACT;

    /**
     * Constructor.
     *
     * @param stored_file|string $archivefile Moodle file or OS path to archive
     */
    public function __construct($archivefile) {
        if ($archivefile instanceof stored_file) {
            $this->storedfile = $archivefile;
        } else {
            $this->ospath = $archivefile;
        }
    }

    /**
     * Extracts the archive.
     *
     * @param tgz_extractor_handler $handler Will be called for extracted files
     * @param file_progress $progress Optional progress reporting
     * @return array Array from archive path => true of processed files
     *      (empty array if nothing was extracted)
     * @throws moodle_exception If there is any error processing the archive
     */
    public function extract(tgz_extractor_handler $handler, file_progress $progress = null) {
        $this->mode = self::MODE_EXTRACT;
        // Start from an empty array so that callers always get an array back,
        // even for an archive that contains no files.
        $this->results = array();
        $this->extract_or_list($handler, $progress);
        $results = $this->results;
        $this->results = array();
        return $results;
    }

    /**
     * Lists files in the archive, either using the index file (if present),
     * or by basically extracting the whole thing if there isn't an index file.
     *
     * @return array Array of file listing results; each entry is an object
     *      with the fields original_pathname, pathname, mtime, is_directory
     *      and size
     * @throws moodle_exception If there is any error processing the archive
     */
    public function list_files() {
        $this->listresults = array();
        $this->mode = self::MODE_LIST;
        $this->extract_or_list();
        $listresults = $this->listresults;
        $this->listresults = null;
        return $listresults;
    }

    /**
     * Extracts or lists the archive depending on $this->mode.
     *
     * @param tgz_extractor_handler $handler Optional handler
     * @param file_progress $progress Optional progress reporting
     * @throws moodle_exception If there is any error processing the archive
     */
    protected function extract_or_list(tgz_extractor_handler $handler = null, file_progress $progress = null) {
        // Open archive.
        if ($this->storedfile) {
            $gz = $this->storedfile->get_content_file_handle(stored_file::FILE_HANDLE_GZOPEN);
            $compressedsize = $this->storedfile->get_filesize();
        } else {
            $gz = gzopen($this->ospath, 'rb');
            // Only ask for the size once we know the file could be opened.
            $compressedsize = $gz ? (int)filesize($this->ospath) : 0;
        }
        if (!$gz) {
            throw new moodle_exception('errorprocessingarchive', '', '', null,
                    'Failed to open gzip file');
        }

        // Estimate number of read-buffers (64KB) in file. Guess that the
        // uncompressed size is 2x compressed size. Add one just to ensure
        // it's non-zero.
        $estimatedbuffers = ($compressedsize * 2 / self::READ_BLOCK_SIZE) + 1;

        // Calculate how much progress to report per buffer read.
        $progressperbuffer = (int)(tgz_packer::PROGRESS_MAX / $estimatedbuffers);

        // Forget anything left over from a previous (possibly failed) run.
        $this->reset_state();

        try {
            // Process archive in 512-byte blocks (but reading 64KB at a time).
            $buffer = '';
            $bufferpos = 0;
            $bufferlength = 0;
            $done = 0;
            $beforeprogress = -1;
            while (true) {
                if ($bufferpos >= $bufferlength) {
                    $buffer = gzread($gz, self::READ_BLOCK_SIZE);
                    $bufferpos = 0;
                    $bufferlength = is_string($buffer) ? strlen($buffer) : 0;
                    if ($bufferlength == 0) {
                        // EOF.
                        break;
                    }

                    // Report progress if enabled.
                    if ($progress) {
                        if ($this->numfiles === -1) {
                            // If we don't know the number of files, do an estimate based
                            // on number of buffers read.
                            $done += $progressperbuffer;
                            if ($done >= tgz_packer::PROGRESS_MAX) {
                                $done = tgz_packer::PROGRESS_MAX - 1;
                            }
                        } else {
                            // Once we know the number of files, use this.
                            if ($beforeprogress === -1) {
                                $beforeprogress = $done;
                            }
                            // Calculate progress as whatever progress we reported
                            // before we knew how many files there were (might be 0)
                            // plus a proportion of the number of files out of the
                            // remaining progress value. The index is untrusted data:
                            // guard against a zero count and against a count that is
                            // smaller than the real number of files.
                            $fraction = 1;
                            if ($this->numfiles > 0) {
                                $fraction = min(1, $this->donefiles / $this->numfiles);
                            }
                            $done = $beforeprogress + (int)($fraction *
                                    (tgz_packer::PROGRESS_MAX - $beforeprogress));
                        }
                        $progress->progress($done, tgz_packer::PROGRESS_MAX);
                    }
                }

                $block = substr($buffer, $bufferpos, tgz_packer::TAR_BLOCK_SIZE);
                if (!is_string($block) || strlen($block) !== tgz_packer::TAR_BLOCK_SIZE) {
                    // A tar stream always consists of whole blocks, so a short
                    // block means the archive was cut off.
                    throw new moodle_exception('errorprocessingarchive', '', '', null,
                            'Archive is truncated (incomplete tar block)');
                }
                if ($this->currentfile) {
                    $this->process_file_block($block, $handler);
                } else {
                    $this->process_header($block, $handler);
                }

                // When listing, if we read an index file, we abort archive processing.
                if ($this->mode === self::MODE_LIST_COMPLETE) {
                    break;
                }

                $bufferpos += tgz_packer::TAR_BLOCK_SIZE;
            }

            if ($this->currentfile) {
                // The stream ended although the header promised more data.
                throw new moodle_exception('errorprocessingarchive', '', '', null,
                        'Unexpected end of archive inside file: ' . $this->currentarchivepath);
            }
        } catch (Exception $e) {
            // Do not leak the output handle or the gzip handle on failure.
            $this->abandon_current_file();
            gzclose($gz);
            throw $e;
        }

        // Close archive and finish.
        gzclose($gz);
    }

    /**
     * Resets all per-run state so that the same object can be used for more
     * than one extract/list call.
     */
    protected function reset_state() {
        $this->numfiles = -1;
        $this->donefiles = 0;
        $this->currentarchivepath = null;
        $this->currentfile = null;
        $this->currentfilesize = 0;
        $this->currentfileprocessed = 0;
        $this->currentfp = null;
        $this->currentmtime = null;
        $this->filebuffer = '';
        $this->filebufferlength = 0;
    }

    /**
     * Releases resources belonging to a file whose processing was interrupted
     * by an error. The handler is not notified. A partially written output
     * file is left in place because its location belongs to the handler; only
     * the temporary copy of the index file is removed.
     */
    protected function abandon_current_file() {
        if (is_resource($this->currentfp)) {
            fclose($this->currentfp);
        }
        if ($this->currentarchivepath === tgz_packer::ARCHIVE_INDEX_FILE &&
                is_string($this->currentfile) && file_exists($this->currentfile)) {
            unlink($this->currentfile);
        }
        $this->currentfp = null;
        $this->currentfile = null;
        $this->currentarchivepath = null;
        $this->filebuffer = '';
        $this->filebufferlength = 0;
    }

    /**
     * Process 512-byte header block.
     *
     * @param string $block Tar block
     * @param tgz_extractor_handler $handler Will be called for extracted files
     * @throws moodle_exception If the block is not a ustar header
     */
    protected function process_header($block, $handler) {
        // If the block consists entirely of nulls, ignore it. (This happens
        // twice at end of archive.)
        if ($block === str_repeat("\0", tgz_packer::TAR_BLOCK_SIZE)) {
            return;
        }

        // Field offsets below follow struct header_posix_ustar:
        // name[100] mode[8] uid[8] gid[8] size[12] mtime[12] checksum[8]
        // typeflag[1] linkname[100] magic[6] version[2] uname[32] gname[32]
        // devmajor[8] devminor[8] prefix[155] pad[12].
        $name = rtrim(substr($block, 0, 100), "\0");

        // Numeric fields are octal text; strip padding (spaces/nulls) so that
        // octdec only ever sees digits.
        $filesize = octdec(trim(substr($block, 124, 11), " \0"));
        $mtime = octdec(trim(substr($block, 136, 11), " \0"));

        $typeflag = substr($block, 156, 1);

        $magic = substr($block, 257, 6);
        if ($magic !== "ustar\0" && $magic !== "ustar ") {
            // There are two checks above; the first is the correct POSIX format
            // and the second is for GNU tar default format.
            throw new moodle_exception('errorprocessingarchive', '', '', null,
                    'Header does not have POSIX ustar magic string');
        }

        $prefix = rtrim(substr($block, 345, 155), "\0");

        $archivepath = ltrim($prefix . '/' . $name, '/');

        // For security, ensure there is no .. folder in the archivepath.
        $archivepath = clean_param($archivepath, PARAM_PATH);

        // Handle file depending on the type.
        switch ($typeflag) {
            case '1' :
            case '2' :
            case '3' :
            case '4' :
            case '6' :
            case '7' :
                // Ignore these special cases.
                break;

            case '5' :
                // Directory.
                if ($this->mode === self::MODE_LIST) {
                    $this->listresults[] = (object)array(
                            'original_pathname' => $archivepath,
                            'pathname' => $archivepath,
                            'mtime' => $mtime,
                            'is_directory' => true,
                            'size' => 0);
                } else if ($handler->tgz_directory($archivepath, $mtime)) {
                    $this->results[$archivepath] = true;
                }
                break;

            default:
                // All other values treated as normal file.
                $this->start_current_file($archivepath, $filesize, $mtime, $handler);
                break;
        }
    }

    /**
     * Processes one 512-byte block of an existing file.
     *
     * @param string $block Data block
     * @param tgz_extractor_handler $handler Will be called for extracted files
     * @throws moodle_exception If the data cannot be written
     */
    protected function process_file_block($block, tgz_extractor_handler $handler = null) {
        // The last block of a file is padded; only take the bytes that
        // belong to the file.
        $blocksize = tgz_packer::TAR_BLOCK_SIZE;
        $remaining = $this->currentfilesize - $this->currentfileprocessed;
        if ($remaining < $blocksize) {
            $blocksize = $remaining;
            $block = substr($block, 0, $blocksize);
        }
        $this->currentfileprocessed += $blocksize;
        $eof = $this->currentfileprocessed >= $this->currentfilesize;

        // Except when skipping the file, buffer the data and write it out
        // once enough has accumulated or the file is complete.
        if ($this->currentfile !== true) {
            $this->filebuffer .= $block;
            $this->filebufferlength += $blocksize;

            if ($this->filebufferlength >= self::WRITE_BLOCK_SIZE || $eof) {
                if ($this->filebufferlength > 0) {
                    $written = fwrite($this->currentfp, $this->filebuffer);
                    // Treat a short write as a failure too.
                    if ($written === false || $written !== strlen($this->filebuffer)) {
                        throw new moodle_exception('errorprocessingarchive', '', '', null,
                                'Failed to write buffer to output file: ' . $this->currentfile);
                    }
                }
                $this->filebuffer = '';
                $this->filebufferlength = 0;
            }
        }

        // If file is finished, close it.
        if ($eof) {
            $this->close_current_file($handler);
        }
    }

    /**
     * Starts processing a file from archive.
     *
     * @param string $archivepath Path inside archive
     * @param int $filesize Size in bytes
     * @param int $mtime File-modified time
     * @param tgz_extractor_handler $handler Will be called for extracted files
     * @throws moodle_exception If the output file cannot be created
     */
    protected function start_current_file($archivepath, $filesize, $mtime,
            tgz_extractor_handler $handler = null) {
        global $CFG;

        $this->currentarchivepath = $archivepath;
        $this->currentmtime = $mtime;
        $this->currentfilesize = $filesize;
        $this->currentfileprocessed = 0;

        if ($archivepath === tgz_packer::ARCHIVE_INDEX_FILE) {
            // For index file, store in temp directory.
            $tempfolder = $CFG->tempdir . '/core_files';
            check_dir_exists($tempfolder);
            $this->currentfile = tempnam($tempfolder, '.index');
            if ($this->currentfile === false) {
                $this->currentfile = null;
                throw new moodle_exception('errorprocessingarchive', '', '', null,
                        'Failed to create temporary file for archive index');
            }
        } else if ($this->mode === self::MODE_LIST) {
            // If listing, add to list.
            $this->listresults[] = (object)array(
                    'original_pathname' => $archivepath,
                    'pathname' => $archivepath,
                    'mtime' => $mtime,
                    'is_directory' => false,
                    'size' => $filesize);

            // Discard file.
            $this->currentfile = true;
        } else {
            // For other files, ask handler for location.
            $this->currentfile = $handler->tgz_start_file($archivepath);
            if ($this->currentfile === null) {
                // This indicates that we are discarding the current file.
                $this->currentfile = true;
            }
        }
        $this->filebuffer = '';
        $this->filebufferlength = 0;

        // Open file.
        if ($this->currentfile !== true) {
            $this->currentfp = fopen($this->currentfile, 'wb');
            if (!$this->currentfp) {
                $this->currentfp = null;
                throw new moodle_exception('errorprocessingarchive', '', '', null,
                        'Failed to open output file: ' . $this->currentfile);
            }
        } else {
            $this->currentfp = null;
        }

        // If it has no size, close it right away.
        if ($filesize == 0) {
            $this->close_current_file($handler);
        }
    }

    /**
     * Closes the current file, calls handler, and sets up data.
     *
     * @param tgz_extractor_handler $handler Will be called for extracted files
     * @throws moodle_exception If there is an error closing it
     */
    protected function close_current_file($handler) {
        if ($this->currentfp !== null) {
            if (!fclose($this->currentfp)) {
                throw new moodle_exception('errorprocessingarchive', '', '', null,
                        'Failed to close output file: ' . $this->currentfile);
            }

            // At this point we should touch the file to set its modified
            // time to $this->currentmtime. However, when extracting to the
            // temp directory, cron will delete files more than a week old,
            // so to avoid problems we leave all files at their current time.
        }

        if ($this->currentarchivepath === tgz_packer::ARCHIVE_INDEX_FILE) {
            if ($this->mode === self::MODE_LIST) {
                // When listing, use the archive index to produce the list.
                // If the index is not usable we carry on and build the list
                // from the archive entries instead.
                $entries = $this->read_index_listing($this->currentfile);
                if ($entries !== null) {
                    foreach ($entries as $entry) {
                        $this->listresults[] = $entry;
                    }
                    $this->mode = self::MODE_LIST_COMPLETE;
                }
            } else {
                // For index file, get number of files. Only the start of the
                // file is needed for this.
                $contents = file_get_contents($this->currentfile, false, null, 0, 128);
                $matches = array();
                if (is_string($contents) && preg_match(
                        '~^' . preg_quote(tgz_packer::ARCHIVE_INDEX_COUNT_PREFIX, '~') . '([0-9]+)~',
                        $contents, $matches)) {
                    $this->numfiles = (int)$matches[1];
                }
            }
            // The index was only a temporary copy.
            unlink($this->currentfile);
        } else {
            // Report to handler and put in results (not for skipped files,
            // which have no open handle).
            if ($this->currentfp !== null) {
                $handler->tgz_end_file($this->currentarchivepath, $this->currentfile);
                $this->results[$this->currentarchivepath] = true;
            }
            $this->donefiles++;
        }

        // No longer have a current file.
        $this->currentfp = null;
        $this->currentfile = null;
        $this->currentarchivepath = null;
    }

    /**
     * Reads an extracted archive index file and converts it to list entries.
     *
     * The first line must start with tgz_packer::ARCHIVE_INDEX_COUNT_PREFIX.
     * Every following non-blank line must have at least four tab-separated
     * fields, read here as: path, type ('d' for directory), size, mtime
     * ('?' if not known).
     *
     * @param string $indexpath OS path of the extracted index file
     * @return array|null List entries, or null if the file is missing, empty
     *      or not a well-formed index
     */
    protected function read_index_listing($indexpath) {
        $lines = file($indexpath);
        if (!$lines) {
            // Unreadable or empty: there is no header, so not a valid index.
            return null;
        }

        $headerpattern = '~^' . preg_quote(tgz_packer::ARCHIVE_INDEX_COUNT_PREFIX, '~') . '~';
        if (!preg_match($headerpattern, $lines[0])) {
            // Not valid, better ignore the file.
            return null;
        }

        $entries = array();
        foreach (array_slice($lines, 1) as $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            // Split on tabs and store in results array.
            $values = explode("\t", $line);
            if (count($values) < 4) {
                // Malformed line; do not trust any of the index.
                return null;
            }
            $entries[] = (object)array(
                    'original_pathname' => $values[0],
                    'pathname' => $values[0],
                    'mtime' => ($values[3] === '?' ? tgz_packer::DEFAULT_TIMESTAMP : (int)$values[3]),
                    'is_directory' => $values[1] === 'd',
                    'size' => (int)$values[2]);
        }
        return $entries;
    }
}
/**
 * Callback interface used by tgz_extractor::extract.
 *
 * For every file entry in the archive the extractor calls tgz_start_file and,
 * unless the handler asked for the file to be skipped, tgz_end_file once the
 * data has been written. The special archive index file is the one exception:
 * it is handled internally and never reported to the handler.
 *
 * For every directory entry in the archive the extractor calls tgz_directory.
 */
interface tgz_extractor_handler {
    /**
     * Called when extraction of a file is about to begin.
     *
     * The handler decides where on disk the data is to be written; this may
     * be a temporary location or the final target, whichever it prefers.
     *
     * Returning null skips the file: no data is written and tgz_end_file is
     * not called for it.
     *
     * @param string $archivepath Path and name of file within archive
     * @return string Location for output file in filesystem, or null to skip file
     */
    public function tgz_start_file($archivepath);

    /**
     * Called after a file has been completely written and closed, so that
     * the handler can process the extracted file if required.
     *
     * @param string $archivepath Path and name of file within archive
     * @param string $realpath Path in filesystem (from tgz_start_file return)
     * @return bool True to continue processing, false to abort archive extract
     *      (note: tgz_extractor does not currently make use of this value)
     */
    public function tgz_end_file($archivepath, $realpath);

    /**
     * Called when a directory entry is found in the archive.
     *
     * The handler can create a corresponding directory if required.
     *
     * @param string $archivepath Path and name of directory within archive
     * @param int $mtime Modified time of directory
     * @return bool True if directory was created, false if skipped
     */
    public function tgz_directory($archivepath, $mtime);
}