Skip to content

Commit 4f290d7

Browse files
minor #31257 [DomCrawler] fix HTML5 parser integration (nicolas-grekas)
This PR was merged into the 4.3 branch.

Discussion
----------

[DomCrawler] fix HTML5 parser integration

| Q             | A
| ------------- | ---
| Branch?       | master
| Bug fix?      | yes
| New feature?  | no
| BC breaks?    | no
| Deprecations? | no
| Tests pass?   | yes
| Fixed tickets | -
| License       | MIT
| Doc PR        | -

Spotted while reviewing #30892

The current logic is context-dependent: by changing the order of calls, you can get different behaviors.

Commits
-------

ba83bda [DomCrawler] fix HTML5 parser integration
2 parents cbbf8b7 + ba83bda commit 4f290d7

File tree

1 file changed

+8
-15
lines changed

1 file changed

+8
-15
lines changed

src/Symfony/Component/DomCrawler/Crawler.php

+8-15
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ public function __construct($node = null, string $uri = null, string $baseHref =
6969
{
7070
$this->uri = $uri;
7171
$this->baseHref = $baseHref ?: $uri;
72+
$this->html5Parser = class_exists(HTML5::class) ? new HTML5(['disable_html_ns' => true]) : null;
7273

7374
$this->add($node);
7475
}
@@ -190,13 +191,7 @@ public function addContent($content, $type = null)
190191
public function addHtmlContent($content, $charset = 'UTF-8')
191192
{
192193
// Use HTML5 parser if the content is HTML5 and the library is available
193-
if (!$this->html5Parser
194-
&& class_exists(HTML5::class)
195-
&& '<!doctype html>' === strtolower(substr(ltrim($content), 0, 15))) {
196-
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
197-
}
198-
199-
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
194+
$dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
200195
$this->addDocument($dom);
201196

202197
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -599,18 +594,16 @@ public function html(/* $default = null */)
599594
throw new \InvalidArgumentException('The current node list is empty.');
600595
}
601596

602-
if (null !== $this->html5Parser) {
603-
$html = '';
604-
foreach ($this->getNode(0)->childNodes as $child) {
605-
$html .= $this->html5Parser->saveHTML($child);
606-
}
597+
$node = $this->getNode(0);
598+
$owner = $node->ownerDocument;
607599

608-
return $html;
600+
if (null !== $this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
601+
$owner = $this->html5Parser;
609602
}
610603

611604
$html = '';
612-
foreach ($this->getNode(0)->childNodes as $child) {
613-
$html .= $child->ownerDocument->saveHTML($child);
605+
foreach ($node->childNodes as $child) {
606+
$html .= $owner->saveHTML($child);
614607
}
615608

616609
return $html;

0 commit comments

Comments
 (0)