1 <?php
declare(strict_types
=1);
15 * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause
17 class Document
extends \DOMDocument
21 use ManipulationTrait
;
24 protected $libxmlOptions = 0;
26 /** @var string|null */
27 protected $documentEncoding = null;
/**
 * Create the document and register the DOMWrap node classes.
 *
 * Each listed base DOM class is mapped to its DOMWrap replacement through
 * registerNodeClass(), in the order given below.
 *
 * @param string $version  Passed through to \DOMDocument::__construct().
 * @param string $encoding Passed through to \DOMDocument::__construct().
 */
public function __construct(string $version = '1.0', string $encoding = 'UTF-8')
{
    parent::__construct($version, $encoding);

    // Base DOM class => DOMWrap class registered in its place.
    $nodeClasses = [
        'DOMText' => 'DOMWrap\\Text',
        'DOMElement' => 'DOMWrap\\Element',
        'DOMComment' => 'DOMWrap\\Comment',
        'DOMDocument' => 'DOMWrap\\Document',
        'DOMDocumentType' => 'DOMWrap\\DocumentType',
        'DOMProcessingInstruction' => 'DOMWrap\\ProcessingInstruction',
    ];

    foreach ($nodeClasses as $baseClass => $extendedClass) {
        $this->registerNodeClass($baseClass, $extendedClass);
    }
}
/**
 * Store the libxml options kept on this document.
 *
 * Multiple values must use bitwise OR.
 * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
 *
 * @link http://php.net/manual/en/libxml.constants.php
 *
 * @param int $libxmlOptions Bitmask of LIBXML_* constants.
 */
public function setLibxmlOptions(int $libxmlOptions): void
{
    $this->libxmlOptions = $libxmlOptions;
}
57 public function document(): ?\DOMDocument
{
/**
 * Build a NodeList from a single-element array holding this document.
 *
 * @return NodeList
 */
public function collection(): NodeList
{
    $nodes = [$this];

    return $this->newNodeList($nodes);
}
71 public function result(NodeList
$nodeList) {
72 if ($nodeList->count()) {
73 return $nodeList->first();
82 public function parent() {
/**
 * Return the NodeList produced by newNodeList() when no nodes are supplied.
 *
 * @return NodeList
 */
public function parents()
{
    $ancestors = $this->newNodeList();

    return $ancestors;
}
/**
 * Call replaceChild() with the given node as the replacement.
 *
 * NOTE(review): the node passed as the one to be replaced is this document
 * itself — confirm that this is the intended target.
 *
 * @param mixed $newNode Replacement node handed to replaceChild().
 *
 * @return self This document.
 */
public function substituteWith($newNode): self
{
    $replaced = $this;
    $this->replaceChild($newNode, $replaced);

    return $this;
}
105 public function _clone() {
/**
 * Return the document's HTML by delegating to getOuterHtml().
 *
 * @return string
 */
public function getHtml(): string
{
    $outerHtml = $this->getOuterHtml();

    return $outerHtml;
}
/**
 * Parse an HTML string into this document.
 *
 * Non-string or blank input is ignored. While parsing, libxml's internal
 * error handling is enabled and, on PHP < 8.0, the external entity loader is
 * disabled. Both settings are put back to their previous values afterwards,
 * including when parsing throws.
 *
 * @param string $html HTML markup to load.
 *
 * @return self This document.
 */
public function setHtml($html): self
{
    if (!is_string($html) || trim($html) === '') {
        return $this;
    }

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
    // only touched on older runtimes.
    $legacyEntityLoader = \PHP_VERSION_ID < 80000;

    $internalErrors = libxml_use_internal_errors(true);
    $disableEntities = $legacyEntityLoader ? libxml_disable_entity_loader(true) : null;

    try {
        $this->composeXmlNode($html);
    } finally {
        // Restore global libxml state even if parsing failed, so an
        // exception here cannot leak settings into unrelated code.
        libxml_use_internal_errors($internalErrors);

        if ($legacyEntityLoader) {
            libxml_disable_entity_loader($disableEntities);
        }
    }

    return $this;
}
/**
 * Load HTML from a string, compensating for libxml's single-root handling.
 *
 * When LIBXML_HTML_NOIMPLIED is present in $options, a temporary <domwrap>
 * element is placed in front of the markup before parsing; afterwards its
 * contents are appended to the document and the element is removed. The
 * same $options value decides both steps, so the temporary element is
 * removed exactly when it was added.
 *
 * @param string $html
 * @param int $options Bitmask of LIBXML_* constants.
 *
 * @return bool Result of \DOMDocument::loadHTML().
 */
public function loadHTML($html, $options = 0): bool
{
    $noImplied = (bool) ($options & LIBXML_HTML_NOIMPLIED);

    // Fix LibXML's crazy-ness RE root nodes
    // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
    // on having one root node. All subsequent nodes are appended to this first node.
    // To counter this we will create a fake element, allow LibXML to 'do its thing'
    // then undo it by taking the contents of the fake element, placing it back into
    // the root and then remove our fake element.
    if ($noImplied) {
        $html = '<domwrap></domwrap>' . $html;
    }

    $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

    $result = parent::loadHTML($html, $options);

    // Do our re-shuffling of nodes.
    if ($noImplied) {
        $wrapper = $this->children()->first();

        // Nothing to unwrap if parsing produced no child node.
        if ($wrapper instanceof \DOMNode) {
            $wrapper->contents()->each(function ($node) {
                $this->appendWith($node);
            });

            $this->removeChild($wrapper);
        }
    }

    return $result;
}
/**
 * Store the encoding associated with this document.
 *
 * The parameter is declared explicitly nullable; relying on a null default
 * to make a typed parameter nullable is deprecated as of PHP 8.4.
 *
 * @param string|null $encoding Encoding name, or null when none is known.
 */
public function setEncoding(?string $encoding = null)
{
    $this->documentEncoding = $encoding;
}
/**
 * Return the encoding stored on this document.
 *
 * @return string|null The stored encoding, or null when none has been set.
 */
public function getEncoding(): ?string
{
    $encoding = $this->documentEncoding;

    return $encoding;
}
186 * @param $html string
188 * @return string|null
190 private function getCharset(string $html): ?
string {
193 if (preg_match('@<meta.*?charset=["\']?([^"\'\s>]+)@im', $html, $matches)) {
194 $charset = mb_strtoupper($matches[1]);
/**
 * Work out the encoding for the given markup and store it on the document.
 *
 * Order of precedence: the encoding already stored on the document, then
 * the value returned by getCharset() for the markup, then 'UTF-8' if
 * mb_detect_encoding() reports exactly that. The outcome (possibly null) is
 * passed to setEncoding().
 *
 * @param string $html Markup to inspect.
 */
private function detectEncoding(string $html)
{
    $charset = $this->getEncoding() ?? $this->getCharset($html);

    // Byte-level detection scans the whole string and its result only
    // matters when no charset is known yet, so run it only in that case.
    if ($charset === null && mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8') {
        $charset = 'UTF-8';
    }

    $this->setEncoding($charset);
}
220 * @param $html string
224 private function convertToUtf8(string $html): string {
225 $charset = $this->getEncoding();
227 if ($charset !== null) {
228 $html = preg_replace('@(charset=["]?)([^"\s]+)([^"]*["]?)@im', '$1UTF-8$3', $html);
229 $mbHasCharset = in_array($charset, array_map('mb_strtoupper', mb_list_encodings()));
232 $html = mb_convert_encoding($html, 'UTF-8', $charset);
234 // Fallback to iconv if available.
235 } elseif (extension_loaded('iconv')) {
236 $htmlIconv = iconv($charset, 'UTF-8', $html);
238 if ($htmlIconv !== false) {
246 if ($charset === null) {
247 $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
256 private function composeXmlNode($html)
258 $this->detectEncoding($html);
260 $html = $this->convertToUtf8($html);
262 $this->loadHTML($html, $this->libxmlOptions
);
264 // Remove <?xml ...> processing instruction.
265 $this->contents()->each(function($node) {
266 if ($node instanceof ProcessingInstruction
&& $node->nodeName
== 'xml') {