MDL-56012 lib: Update spout to version 2.6.0
[moodle.git] / lib / spout / src / Spout / Reader / XLSX / Helper / SharedStringsHelper.php
blob 0f41e9002535824dc970c82dc3dac441e842d69b
1 <?php
3 namespace Box\Spout\Reader\XLSX\Helper;
5 use Box\Spout\Common\Exception\IOException;
6 use Box\Spout\Reader\Exception\XMLProcessingException;
7 use Box\Spout\Reader\Wrapper\SimpleXMLElement;
8 use Box\Spout\Reader\Wrapper\XMLReader;
9 use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
10 use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
/**
 * Class SharedStringsHelper
 * This class provides helper functions for reading the sharedStrings XML file
 * of an XLSX archive and for looking up the strings it contains.
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class SharedStringsHelper
{
    /** Path of sharedStrings XML file inside the XLSX file */
    const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';

    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string|null Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var CachingStrategyInterface|null The caching strategy chosen by extractSharedStrings() (unset until then) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     */
    public function __construct($filePath, $tempFolder = null)
    {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     * If the archive cannot be opened, this reports false rather than failing.
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        $hasSharedStrings = false;
        $zip = new \ZipArchive();

        // ZipArchive::open() returns true on success and an error code otherwise,
        // hence the strict comparison.
        if ($zip->open($this->filePath) === true) {
            $hasSharedStrings = ($zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH) !== false);
            $zip->close();
        }

        return $hasSharedStrings;
    }

    /**
     * Reads all the shared strings of the workbook and stores them in the chosen cache.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * It is then accessed by the sheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     * Please note that SimpleXML does not provide such a functionality but since it is faster
     * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose.
     *
     * The XML reader is closed on every exit path once it has been opened,
     * including when an exception interrupts the extraction.
     *
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
     */
    public function extractSharedStrings()
    {
        $xmlReader = new XMLReader();
        $sharedStringIndex = 0;
        /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
        $escaper = \Box\Spout\Common\Escaper\XLSX::getInstance();

        $sharedStringsFilePath = $this->getSharedStringsFilePath();
        if ($xmlReader->open($sharedStringsFilePath) === false) {
            throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound('si');

            while ($xmlReader->name === 'si') {
                $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
                $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);

                // removes nodes that should not be read, like the pronunciation of the Kanji characters
                $cleanNode = $this->removeSuperfluousTextNodes($node);

                // find all text nodes "t"; there can be multiple if the cell contains formatting
                $textNodes = $cleanNode->xpath('//ns:t');

                $textValue = '';
                foreach ($textNodes as $nodeIndex => $textNode) {
                    if ($nodeIndex !== 0) {
                        // add a space between each "t" node
                        $textValue .= ' ';
                    }

                    if ($this->shouldPreserveWhitespace($textNode)) {
                        $textValue .= $textNode->__toString();
                    } else {
                        $textValue .= trim($textNode->__toString());
                    }
                }

                $unescapedTextValue = $escaper->unescape($textValue);
                $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);

                $sharedStringIndex++;

                // jump to the next 'si' tag
                $xmlReader->next('si');
            }
        } catch (XMLProcessingException $exception) {
            // Release the reader before reporting the failure, so it is not left open.
            $xmlReader->close();
            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
        } catch (\Exception $exception) {
            // Any other failure (e.g. the IOException raised for an unreadable "si" node)
            // is propagated unchanged, but only after the reader has been released.
            // Catch blocks are used instead of "finally" to avoid requiring PHP 5.5.
            $xmlReader->close();
            throw $exception;
        }

        $this->cachingStrategy->closeCache();

        $xmlReader->close();
    }

    /**
     * @return string The path to the shared strings XML file (zip:// stream path into the XLSX file)
     */
    protected function getSharedStringsFilePath()
    {
        return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @return int|null Number of unique shared strings in the sharedStrings.xml file (NULL if not specified)
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next('sst');

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute('uniqueCount');

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute('count');
        }

        return ($uniqueCount !== null) ? intval($uniqueCount) : null;
    }

    /**
     * Returns the best shared strings caching strategy.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return CachingStrategyFactory::getInstance()
            ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
    }

    /**
     * Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
     * This is to simplify the parsing of the subtree.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement
     * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
     */
    protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
    {
        try {
            return new SimpleXMLElement($xmlReader->readOuterXml());
        } catch (XMLProcessingException $exception) {
            throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}].");
        }
    }

    /**
     * Removes nodes that should not be read, like the pronunciation of the Kanji characters.
     * By keeping them, their text content would be added to the read string.
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node
     */
    protected function removeSuperfluousTextNodes($parentNode)
    {
        $tagsToRemove = [
            'rPh', // Pronunciation of the text
            'pPr', // Paragraph Properties / Previous Paragraph Properties
            'rPr', // Run Properties for the Paragraph Mark / Previous Run Properties for the Paragraph Mark
        ];

        foreach ($tagsToRemove as $tagToRemove) {
            $xpath = '//ns:' . $tagToRemove;
            $parentNode->removeNodesMatchingXPath($xpath);
        }

        return $parentNode;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute('space', 'xml');
        return ($spaceValue === 'preserve');
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * The caching strategy is only set by extractSharedStrings(), so that method must have run first.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing when no caching strategy has been set yet.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy) {
            $this->cachingStrategy->clearCache();
        }
    }
}