Merge branch 'MDL-27515_m19' of git://github.com/rwijaya/moodle into MOODLE_19_STABLE
[moodle.git] / search / documents / physical_ppt.php
blob875184fa73635e2422126977230463c437c40597
1 <?php
2 /**
3 * Global Search Engine for Moodle
5 * @package search
6 * @category core
7 * @subpackage document_wrappers
8 * @author Valery Fremaux [valery.fremaux@club-internet.fr] > 1.8
9 * @contributor Tatsuva Shirai 20090530
10 * @date 2008/03/31
11 * @license http://www.gnu.org/copyleft/gpl.html GNU Public License
13 * this is a format handler for getting text out of a proprietary binary format
14 * so it can be indexed by Lucene search engine
18 * first implementation is a trivial heuristic based on ppt character stream :
19 * text sequence always starts with a 00 9F 0F 04 sequence followed by a 15 bytes
20 * sequence
21 * This sequence contains an A8 0F, A0 0F or AA 0F code followed by the little-endian encoding of the text buffer size
22 * A8 0F denotes ASCII text (local system single-byte encoding)
23 * A0 0F denotes UTF-16 encoded text
24 * AA 0F are non textual sequences
25 * texts are either in ASCII or UTF-16
26 * text ends on a new sequence start, or on a 00 00 NULL UTF-16 end of stream
28 * based on the preceding rules, here is a small empirical text extractor for PPT
/**
 * Extracts indexable text from a PowerPoint (.ppt) binary file.
 *
 * The file is scanned for text record headers: the bytes 00 9F 0F 04, nine
 * bytes that are skipped, a two-byte record code and a four-byte little-endian
 * payload length. Payloads of records coded A8 0F (single-byte text) and
 * A0 0F (UTF-16 text) are collected; any other record is skipped.
 *
 * @param object $resource resource record; its course and reference fields locate
 *                         the file under $CFG->dataroot when $directfile is empty
 * @param string $directfile optional path, relative to $CFG->dataroot, read instead
 *                           of the resource file
 * @uses $CFG
 * @return string|null the extracted text converted to UTF-8, or null when the
 *                     current user lacks the moodle/site:doanything capability
 */
function get_text_for_indexing_ppt(&$resource, $directfile = ''){
    global $CFG;

    // SECURITY : do not allow non admin execute anything on system !!
    if (!has_capability('moodle/site:doanything', get_context_instance(CONTEXT_SYSTEM))) {
        return null;
    }

    if ($directfile == ''){
        $path = "{$CFG->dataroot}/{$resource->course}/{$resource->reference}";
    } else {
        $path = "{$CFG->dataroot}/{$directfile}";
    }

    $text = file_get_contents($path);
    if ($text === false){
        // unreadable file : there is nothing to extract
        $text = '';
    }

    $fragments = array();
    $size = strlen($text);
    $offset = 0;
    // header signature, 9 skipped bytes, then 2 bytes of record code + 4 bytes of length
    $headerpattern = '/\x00\x9F\x0F\x04.{9}(.{6})/s';

    // scan with a moving offset rather than re-copying the remainder of the file on each record
    while (preg_match($headerpattern, $text, $matches, PREG_OFFSET_CAPTURE, $offset)){
        // "n" reads the code big-endian so it compares with 0xA80F / 0xA00F as written ;
        // "V" reads the length as little-endian whatever the host byte order is
        $unpacked = unpack('ncode/Vlength', $matches[1][0]);
        $sequencecode = $unpacked['code'];

        // the payload starts right after the 6 captured bytes
        $offset = $matches[1][1] + 6;

        if ($sequencecode != 0xA80F && $sequencecode != 0xA00F){
            // not a text record : resume scanning right after this header
            continue;
        }

        // never read past the end of the data on a truncated file or a bogus length
        $length = max(0, min($unpacked['length'], $size - $offset));
        $fragment = (string)substr($text, $offset, $length);
        $offset += $length;

        if ($sequencecode == 0xA00F){
            // unicode encoded sequence : map the curly quote, then drop the NUL bytes
            // NOTE(review): dropping NUL bytes only yields readable text for characters
            // whose UTF-16 high byte is 00 - confirm whether a real conversion is wanted
            $fragment = preg_replace('/\xA0\x00\x19\x20/s', "'", $fragment); // some quotes
            $fragment = preg_replace('/\x00/s', '', $fragment);
        }

        $fragments[] = $fragment;
    }

    $indextext = implode(' ', $fragments);
    // leftover curly quotes become an apostrophe, tabs are removed, carriage returns become newlines
    $indextext = str_replace(array("\x19\x20", "\x09", "\x0D"), array("'", '', "\n"), $indextext);

    if (!empty($CFG->block_search_limit_index_body)){
        // shorten the extracted text, not the raw binary content of the file
        $indextext = shorten_text($indextext, $CFG->block_search_limit_index_body);
    }

    $indextext = mb_convert_encoding($indextext, 'UTF-8', 'auto'); // Shirai 20090530 - MDL19342
    return $indextext;
}