2 // This file is part of Moodle - http://moodle.org/
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
18 * Implementation of .tar.gz extractor. Handles extraction of .tar.gz files.
19 * Do not call directly; use methods in tgz_packer.
23 * @copyright 2013 The Open University
24 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
27 defined('MOODLE_INTERNAL') ||
die();
30 * Extracts .tar.gz files (POSIX format).
/**
 * @var int When writing data, the system writes blocks of this size.
 */
const WRITE_BLOCK_SIZE = 65536;

/**
 * @var int When reading data, the system reads blocks of this size.
 */
const READ_BLOCK_SIZE = 65536;

/**
 * @var stored_file File object for archive.
 */
protected $storedfile;

/**
 * @var string OS path for archive.
 */
protected $ospath;

/**
 * @var int Number of files (-1 if not known).
 */
protected $numfiles;

/**
 * @var int Number of files processed so far.
 */
protected $donefiles;

/**
 * @var string Current file path within archive.
 */
protected $currentarchivepath;

/**
 * @var string|bool|null Full path to current file. The methods of this
 *      class also store true here while a file's data is being read but
 *      discarded, and null when no file is in progress.
 */
protected $currentfile;

/**
 * @var int Size of current file in bytes.
 */
protected $currentfilesize;

/**
 * @var int Number of bytes of current file already written into buffer.
 */
protected $currentfileprocessed;

/**
 * @var resource File handle to current file.
 */
protected $currentfp;

/**
 * @var int Modified time of current file.
 */
protected $currentmtime;

/**
 * @var string Buffer containing file data awaiting write.
 */
protected $filebuffer;

/**
 * @var int Current length of buffer in bytes.
 */
protected $filebufferlength;

/**
 * @var array Results array of all files processed.
 */
protected $results;

/**
 * @var array In list mode, content of the list; outside list mode, null.
 */
protected $listresults = null;

/**
 * @var int Whether listing or extracting (one of the MODE_xx constants).
 */
protected $mode = self::MODE_EXTRACT;

/**
 * @var int If extracting (default).
 */
const MODE_EXTRACT = 0;

/**
 * @var int Listing contents.
 */
const MODE_LIST = 1;

/**
 * @var int Listing contents; list now complete.
 */
const MODE_LIST_COMPLETE = 2;
/**
 * Constructor.
 *
 * Remembers the archive either as a Moodle stored_file or as a path on
 * disk; the archive itself is not opened here.
 *
 * @param stored_file|string $archivefile Moodle file or OS path to archive
 */
public function __construct($archivefile) {
    if ($archivefile instanceof stored_file) {
        $this->storedfile = $archivefile;
        return;
    }
    // Anything that is not a stored_file is treated as an OS path.
    $this->ospath = $archivefile;
}
/**
 * Extracts the archive.
 *
 * @param tgz_extractor_handler $handler Will be called for extracted files
 * @param file_progress $progress Optional progress reporting
 * @return array Array from archive path => true of processed files
 * @throws moodle_exception If there is any error processing the archive
 */
public function extract(tgz_extractor_handler $handler, file_progress $progress = null) {
    $this->mode = self::MODE_EXTRACT;

    // Always start from an empty result set. Without this, an archive with
    // no reportable entries would leave the property unset and this method
    // would return null instead of the documented array; leftovers from an
    // earlier, failed run could also leak into the result.
    $this->results = array();

    $this->extract_or_list($handler, $progress);

    $results = $this->results;
    // Do not keep a second copy of the results on the object.
    $this->results = array();
    return $results;
}
/**
 * Extracts or lists the archive depending on $this->mode.
 *
 * The gzip stream is read READ_BLOCK_SIZE bytes at a time and consumed in
 * tar blocks. Each block goes to process_file_block() while a file is in
 * progress, otherwise to process_header(). Processing stops at end of
 * stream, or as soon as list mode reports MODE_LIST_COMPLETE.
 *
 * @param tgz_extractor_handler $handler Optional handler
 * @param file_progress $progress Optional progress reporting
 * @throws moodle_exception If there is any error processing the archive
 */
protected function extract_or_list(tgz_extractor_handler $handler = null, file_progress $progress = null) {
    // Open archive.
    if ($this->storedfile) {
        $gz = $this->storedfile->get_content_file_handle(stored_file::FILE_HANDLE_GZOPEN);
    } else {
        $gz = gzopen($this->ospath, 'rb');
    }
    if (!$gz) {
        throw new moodle_exception('errorprocessingarchive', '', '', null,
                'Failed to open gzip file');
    }

    // Estimate number of read-buffers (64KB) in file. Guess that the
    // uncompressed size is 2x compressed size. Add one so that the
    // estimate is never zero.
    if ($this->storedfile) {
        $compressedsize = $this->storedfile->get_filesize();
    } else {
        $compressedsize = filesize($this->ospath);
    }
    $estimatedbuffers = ($compressedsize * 2 / self::READ_BLOCK_SIZE) + 1;

    // Calculate how much progress to report per buffer read.
    $progressperbuffer = (int)(tgz_packer::PROGRESS_MAX / $estimatedbuffers);

    // Process archive in 512-byte blocks (but reading 64KB at a time).
    $buffer = '';
    $bufferpos = 0;
    $bufferlength = 0;
    $this->numfiles = -1;
    $this->donefiles = 0;
    $done = 0;
    $beforeprogress = -1;

    try {
        while (true) {
            if ($bufferpos == $bufferlength) {
                $buffer = gzread($gz, self::READ_BLOCK_SIZE);
                if ($buffer === false || $buffer === '') {
                    // EOF (or read failure).
                    break;
                }
                $bufferpos = 0;
                $bufferlength = strlen($buffer);

                // Report progress if enabled.
                if ($progress) {
                    if ($this->numfiles > 0) {
                        // Once we know the number of files, use this.
                        if ($beforeprogress === -1) {
                            $beforeprogress = $done;
                        }
                        // Calculate progress as whatever progress we reported
                        // before we knew how many files there were (might be 0)
                        // plus a proportion of the number of files out of the
                        // remaining progress value.
                        $done = $beforeprogress + (int)(($this->donefiles / $this->numfiles) *
                                (tgz_packer::PROGRESS_MAX - $beforeprogress));
                    } else {
                        // If we don't know the number of files (or the index
                        // claims there are none, which must not be used as a
                        // divisor), do an estimate based on number of
                        // buffers read.
                        $done += $progressperbuffer;
                        if ($done >= tgz_packer::PROGRESS_MAX) {
                            $done = tgz_packer::PROGRESS_MAX - 1;
                        }
                    }
                    $progress->progress($done, tgz_packer::PROGRESS_MAX);
                }
            }

            $block = substr($buffer, $bufferpos, tgz_packer::TAR_BLOCK_SIZE);
            if ($this->currentfile) {
                $this->process_file_block($block, $handler);
            } else {
                $this->process_header($block, $handler);
            }

            // When listing, if we read an index file, we abort archive processing.
            if ($this->mode === self::MODE_LIST_COMPLETE) {
                break;
            }

            $bufferpos += tgz_packer::TAR_BLOCK_SIZE;
        }
    } catch (Exception $e) {
        // Do not leak the gzip handle when a block fails to process.
        // (Catch-and-rethrow rather than 'finally' so that the code does
        // not require a newer PHP version than the rest of the file.)
        gzclose($gz);
        throw $e;
    }

    // Close archive and finish.
    gzclose($gz);
}
/**
 * Lists files in the archive, either using the index file (if present),
 * or by basically extracting the whole thing if there isn't an index file.
 *
 * Runs the shared extract_or_list() pass in MODE_LIST with no handler and
 * returns whatever that pass collected in $this->listresults.
 *
 * @return array Array of file listing results (objects with
 *      original_pathname, pathname, mtime, is_directory and size)
 */
public function list_files() {
    // Switch to list mode with an empty list, then scan the archive.
    $this->mode = self::MODE_LIST;
    $this->listresults = array();
    $this->extract_or_list();

    // Hand the list to the caller and return to the "not listing" state.
    $found = $this->listresults;
    $this->listresults = null;
    return $found;
}
/**
 * Process 512-byte header block.
 *
 * Decodes the fields of a POSIX ustar header that this class uses (name,
 * size, mtime, typeflag, magic, prefix) and then either records a
 * directory or starts reading a file.
 *
 * @param string $block Tar block
 * @param tgz_extractor_handler $handler Will be called for extracted files
 * @throws moodle_exception If the block lacks the ustar magic string
 */
protected function process_header($block, $handler) {
    // If the block consists entirely of nulls, ignore it. (This happens
    // twice at end of archive.)
    if ($block === str_repeat("\0", tgz_packer::TAR_BLOCK_SIZE)) {
        return;
    }

    // Verify the magic string (offset 257) before using any field.
    // There are two accepted values; the first is the correct POSIX format
    // and the second is for GNU tar default format.
    $magic = substr($block, 257, 6);
    $isposix = ($magic === "ustar\0");
    $isgnu = ($magic === "ustar ");
    if (!$isposix && !$isgnu) {
        throw new moodle_exception('errorprocessingarchive', '', '', null,
                'Header does not have POSIX ustar magic string');
    }

    // Fields of struct header_posix_ustar that we need:
    // char name[100] at offset 0.
    $name = rtrim(substr($block, 0, 100), "\0");
    // char size[12] at offset 124 (octal digits).
    $filesize = octdec(substr($block, 124, 11));
    // char mtime[12] at offset 136 (octal digits).
    $mtime = octdec(substr($block, 136, 11));
    // char typeflag[1] at offset 156.
    $typeflag = substr($block, 156, 1);
    // char prefix[155] at offset 345.
    $prefix = rtrim(substr($block, 345, 155), "\0");

    $archivepath = ltrim($prefix . '/' . $name, '/');

    // For security, ensure there is no .. folder in the archivepath.
    $archivepath = clean_param($archivepath, PARAM_PATH);

    // Handle file depending on the type.
    if (in_array($typeflag, array('1', '2', '3', '4', '6', '7'), true)) {
        // Ignore these special cases.
        return;
    }

    if ($typeflag === '5') {
        // Directory.
        if ($this->mode === self::MODE_LIST) {
            $this->listresults[] = (object)array(
                    'original_pathname' => $archivepath,
                    'pathname' => $archivepath,
                    'mtime' => $mtime,
                    'is_directory' => true,
                    'size' => 0);
        } else if ($handler->tgz_directory($archivepath, $mtime)) {
            $this->results[$archivepath] = true;
        }
        return;
    }

    // All other values treated as normal file.
    $this->start_current_file($archivepath, $filesize, $mtime, $handler);
}
/**
 * Processes one 512-byte block of an existing file.
 *
 * The useful part of the block is appended to the write buffer. The buffer
 * is flushed to the output file once it reaches WRITE_BLOCK_SIZE or the
 * file is complete; a completed file is then closed.
 *
 * @param string $block Data block
 * @param tgz_extractor_handler $handler Will be called for extracted files
 * @throws moodle_exception If the block is truncated or the write fails
 */
protected function process_file_block($block, tgz_extractor_handler $handler = null) {
    // Work out how much of this block belongs to the file: the final block
    // of a file is padded up to the tar block size.
    $remaining = $this->currentfilesize - $this->currentfileprocessed;
    $blocksize = min(tgz_packer::TAR_BLOCK_SIZE, $remaining);

    // A block shorter than the data we expect means the archive is
    // truncated. Fail here rather than advancing the counters past data
    // that was never read and producing a silently corrupt file.
    if (strlen($block) < $blocksize) {
        throw new moodle_exception('errorprocessingarchive', '', '', null,
                'Unexpected end of archive while reading: ' . $this->currentarchivepath);
    }

    // Write block into buffer.
    if ($blocksize < tgz_packer::TAR_BLOCK_SIZE) {
        // Partial block at end of file.
        $this->filebuffer .= substr($block, 0, $blocksize);
    } else {
        // Full-length block.
        $this->filebuffer .= $block;
    }
    $this->filebufferlength += $blocksize;
    $this->currentfileprocessed += $blocksize;

    // Write block to file if necessary.
    $eof = $this->currentfileprocessed == $this->currentfilesize;
    if ($this->filebufferlength >= self::WRITE_BLOCK_SIZE || $eof) {
        // Except when skipping the file, write it out.
        if ($this->currentfile !== true) {
            // fwrite may report fewer bytes than requested; treat anything
            // other than a complete write as a failure.
            $written = fwrite($this->currentfp, $this->filebuffer);
            if ($written === false || $written !== strlen($this->filebuffer)) {
                throw new moodle_exception('errorprocessingarchive', '', '', null,
                        'Failed to write buffer to output file: ' . $this->currentfile);
            }
        }
        $this->filebuffer = '';
        $this->filebufferlength = 0;
    }

    // If file is finished, close it.
    if ($eof) {
        $this->close_current_file($handler);
    }
}
/**
 * Starts processing a file from archive.
 *
 * Decides where the file's data goes: the archive index is written to a
 * temporary file, files seen in list mode are recorded and discarded, and
 * any other file goes wherever the handler says (null from the handler
 * means discard). $this->currentfile is set to true while discarding.
 *
 * @param string $archivepath Path inside archive
 * @param int $filesize Size in bytes
 * @param int $mtime File-modified time
 * @param tgz_extractor_handler $handler Will be called for extracted files
 * @throws moodle_exception If the output file cannot be created or opened
 */
protected function start_current_file($archivepath, $filesize, $mtime,
        tgz_extractor_handler $handler = null) {
    global $CFG;

    $this->currentarchivepath = $archivepath;
    $this->currentmtime = $mtime;
    $this->currentfilesize = $filesize;
    $this->currentfileprocessed = 0;

    if ($archivepath === tgz_packer::ARCHIVE_INDEX_FILE) {
        // For index file, store in temp directory.
        $tempfolder = $CFG->tempdir . '/core_files';
        check_dir_exists($tempfolder);
        $this->currentfile = tempnam($tempfolder, '.index');
        if ($this->currentfile === false) {
            // Without this check the false value would reach fopen() below,
            // which raises a ValueError on PHP 8 instead of the documented
            // moodle_exception.
            $this->currentfile = null;
            throw new moodle_exception('errorprocessingarchive', '', '', null,
                    'Failed to create temporary file for archive index');
        }
    } else {
        if ($this->mode === self::MODE_LIST) {
            // If listing, add to list.
            $this->listresults[] = (object)array(
                    'original_pathname' => $archivepath,
                    'pathname' => $archivepath,
                    'mtime' => $mtime,
                    'is_directory' => false,
                    'size' => $filesize);

            // Discard file.
            $this->currentfile = true;
        } else {
            // For other files, ask handler for location.
            $this->currentfile = $handler->tgz_start_file($archivepath);
            if ($this->currentfile === null) {
                // This indicates that we are discarding the current file.
                $this->currentfile = true;
            }
        }
    }
    $this->filebuffer = '';
    $this->filebufferlength = 0;

    // Open file.
    if ($this->currentfile !== true) {
        $this->currentfp = fopen($this->currentfile, 'wb');
        if (!$this->currentfp) {
            throw new moodle_exception('errorprocessingarchive', '', '', null,
                    'Failed to open output file: ' . $this->currentfile);
        }
    } else {
        $this->currentfp = null;
    }

    // If it has no size, close it right away.
    if ($filesize == 0) {
        $this->close_current_file($handler);
    }
}
/**
 * Closes the current file, calls handler, and sets up data.
 *
 * For the archive index file the temporary copy is consumed and deleted:
 * in list mode its rows become the list results (and the mode changes to
 * MODE_LIST_COMPLETE), otherwise only the file count is read from its
 * first line. For every other file that was actually written, the handler
 * is notified and the path is added to the results.
 *
 * @param tgz_extractor_handler $handler Will be called for extracted files
 * @throws moodle_exception If there is an error closing it
 */
protected function close_current_file($handler) {
    if ($this->currentfp !== null) {
        if (!fclose($this->currentfp)) {
            throw new moodle_exception('errorprocessingarchive', '', '', null,
                    'Failed to close output file: ' . $this->currentfile);
        }

        // At this point we should touch the file to set its modified
        // time to $this->currentmtime. However, when extracting to the
        // temp directory, cron will delete files more than a week old,
        // so to avoid problems we leave all files at their current time.
    }

    if ($this->currentarchivepath === tgz_packer::ARCHIVE_INDEX_FILE) {
        // Pattern matching the count prefix that starts a valid index.
        $prefixpattern = '~^' . preg_quote(tgz_packer::ARCHIVE_INDEX_COUNT_PREFIX, '~');

        if ($this->mode === self::MODE_LIST) {
            // When listing array, use the archive index to produce the list.
            $index = file($this->currentfile);
            // An unreadable or empty index is treated like an invalid one.
            $ok = !empty($index);
            if ($ok) {
                foreach ($index as $num => $value) {
                    // For first line (header), check it's valid then skip it.
                    if ($num == 0) {
                        if (preg_match($prefixpattern . '~', $value)) {
                            continue;
                        }
                        // Not valid, better ignore the file.
                        $ok = false;
                        break;
                    }

                    // Split on tabs and store in results array. Each row is
                    // used as: path, type flag, size, mtime. Rows without
                    // all four fields (for example a blank line) are skipped.
                    $values = explode("\t", trim($value));
                    if (count($values) < 4) {
                        continue;
                    }
                    $this->listresults[] = (object)array(
                            'original_pathname' => $values[0],
                            'pathname' => $values[0],
                            'mtime' => ($values[3] === '?' ? tgz_packer::DEFAULT_TIMESTAMP : (int)$values[3]),
                            'is_directory' => $values[1] === 'd',
                            'size' => (int)$values[2]);
                }
            }
            if ($ok) {
                $this->mode = self::MODE_LIST_COMPLETE;
            }
            unlink($this->currentfile);
        } else {
            // For index file, get number of files and delete temp file.
            // Only the first 128 bytes are needed. Explicit false/0 are
            // passed because null is deprecated for these non-nullable
            // parameters on PHP 8.1+.
            $contents = file_get_contents($this->currentfile, false, null, 0, 128);
            $matches = array();
            if ($contents !== false &&
                    preg_match($prefixpattern . '([0-9]+)~', $contents, $matches)) {
                $this->numfiles = (int)$matches[1];
            }
            unlink($this->currentfile);
        }
    } else {
        // Report to handler and put in results.
        if ($this->currentfp !== null) {
            $handler->tgz_end_file($this->currentarchivepath, $this->currentfile);
            $this->results[$this->currentarchivepath] = true;
        }
        // Count this file as processed (used for progress reporting).
        $this->donefiles++;
    }

    // No longer have a current file.
    $this->currentfp = null;
    $this->currentfile = null;
    $this->currentarchivepath = null;
}
/**
 * Interface for callback from tgz_extractor::extract.
 *
 * The file functions will be called (in pairs tgz_start_file, tgz_end_file) for
 * each file in the archive. (There is only one exception, the special
 * .ARCHIVE_INDEX file which is not reported to the handler.)
 *
 * The directory function is called whenever the archive contains a directory
 * entry.
 */
interface tgz_extractor_handler {
    /**
     * Called when the system begins to extract a file. At this point, the
     * handler must decide where on disk the extracted file should be located.
     * This can be a temporary location or final target, as preferred.
     *
     * The handler can request for files to be skipped, in which case no data
     * will be written and tgz_end_file will not be called.
     *
     * @param string $archivepath Path and name of file within archive
     * @return string Location for output file in filesystem, or null to skip file
     */
    public function tgz_start_file($archivepath);

    /**
     * Called when the system has finished extracting a file. The handler can
     * now process the extracted file if required.
     *
     * @param string $archivepath Path and name of file within archive
     * @param string $realpath Path in filesystem (from tgz_start_file return)
     * @return bool True to continue processing, false to abort archive extract
     */
    public function tgz_end_file($archivepath, $realpath);

    /**
     * Called when a directory entry is found in the archive.
     *
     * The handler can create a corresponding directory if required.
     *
     * @param string $archivepath Path and name of directory within archive
     * @param int $mtime Modified time of directory
     * @return bool True if directory was created, false if skipped
     */
    public function tgz_directory($archivepath, $mtime);
}