Skip to content

[DomCrawler] Use the native HTML5 parser on PHP 8.4 #61475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions UPGRADE-7.4.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ Read more about this in the [Symfony documentation](https://symfony.com/doc/7.4/

If you're upgrading from a version below 7.3, follow the [7.3 upgrade guide](UPGRADE-7.3.md) first.

BrowserKit
----------

* Deprecate `AbstractBrowser::useHtml5Parser()`; Symfony 8 will unconditionally use the native HTML5 parser

Cache
-----

Expand All @@ -29,6 +34,11 @@ DoctrineBridge

* Deprecate `UniqueEntity::getRequiredOptions()` and `UniqueEntity::getDefaultOption()`

DomCrawler
----------

* Disabling HTML5 parsing is deprecated; Symfony 8 will unconditionally use the native HTML5 parser

FrameworkBundle
---------------

Expand Down
7 changes: 7 additions & 0 deletions src/Symfony/Component/BrowserKit/AbstractBrowser.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ abstract class AbstractBrowser
/** @psalm-var TResponse */
protected object $response;
protected Crawler $crawler;
/** @deprecated since Symfony 7.4, to be removed in Symfony 8 */
protected bool $useHtml5Parser = true;
protected bool $insulated = false;
protected ?string $redirect;
Expand Down Expand Up @@ -204,10 +205,16 @@ public function getCrawler(): Crawler
/**
* Sets whether parsing should be done using "masterminds/html5".
*
* @deprecated since Symfony 7.4, Symfony 8 will unconditionally use the native HTML5 parser
*
* @return $this
*/
public function useHtml5Parser(bool $useHtml5Parser): static
{
if (\PHP_VERSION_ID >= 80400) {
trigger_deprecation('symfony/browser-kit', '7.4', 'Method "%s()" is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.', __METHOD__);
}

$this->useHtml5Parser = $useHtml5Parser;

return $this;
Expand Down
1 change: 1 addition & 0 deletions src/Symfony/Component/BrowserKit/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ CHANGELOG

* Add `isFirstPage()` and `isLastPage()` methods to the History class for checking navigation boundaries
* Add PHPUnit constraints: `BrowserHistoryIsOnFirstPage` and `BrowserHistoryIsOnLastPage`
* Deprecate `AbstractBrowser::useHtml5Parser()`; Symfony 8 will unconditionally use the native HTML5 parser

6.4
---
Expand Down
1 change: 1 addition & 0 deletions src/Symfony/Component/BrowserKit/composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
],
"require": {
"php": ">=8.2",
"symfony/deprecation-contracts": "^2.5|^3",
"symfony/dom-crawler": "^6.4|^7.0|^8.0"
},
"require-dev": {
Expand Down
5 changes: 5 additions & 0 deletions src/Symfony/Component/DomCrawler/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
CHANGELOG
=========

7.4
---

* Disabling HTML5 parsing is deprecated; Symfony 8 will unconditionally use the native HTML5 parser

7.0
---

Expand Down
48 changes: 35 additions & 13 deletions src/Symfony/Component/DomCrawler/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,14 @@ public function __construct(
\DOMNodeList|\DOMNode|array|string|null $node = null,
protected ?string $uri = null,
?string $baseHref = null,
bool $useHtml5Parser = true,
private bool $useHtml5Parser = true,
) {
if (\PHP_VERSION_ID >= 80400 && !$useHtml5Parser) {
trigger_deprecation('symfony/dom-crawler', '7.4', 'Disabling HTML5 parsing is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.');
}

$this->baseHref = $baseHref ?: $uri;
$this->html5Parser = $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
$this->html5Parser = \PHP_VERSION_ID < 80400 && $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
$this->cachedNamespaces = new \ArrayObject();

$this->add($node);
Expand Down Expand Up @@ -1081,23 +1085,41 @@ private function supportsEncoding(string $encoding): bool

private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
} else {
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
if (\PHP_VERSION_ID < 80400 || !$this->useHtml5Parser) {
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
} else {
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
}

$internalErrors = libxml_use_internal_errors(true);

$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;

if ('' !== trim($htmlContent)) {
@$dom->loadHTML($htmlContent);
}

libxml_use_internal_errors($internalErrors);

return $dom;
}

$internalErrors = libxml_use_internal_errors(true);
$document = @\Dom\HTMLDocument::createFromString($htmlContent, \Dom\HTML_NO_DEFAULT_NS, $charset);
$htmlContent = $document->saveXml();
$charset = $document->inputEncoding;

$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
$dom->loadXML($htmlContent);

if ('' !== trim($htmlContent)) {
@$dom->loadHTML($htmlContent);
// Register id attributes as ID attributes for getElementById to work
foreach ((new \DOMXPath($dom))->query('//*[@id]') as $element) {
if ($element instanceof \DOMElement) {
$element->setIdAttribute('id', true);
}
}

libxml_use_internal_errors($internalErrors);

return $dom;
}

Expand Down Expand Up @@ -1216,7 +1238,7 @@ private function canParseHtml5String(string $content): bool
return false;
}

if (false === ($pos = stripos($content, '<!doctype html>'))) {
if (false === $pos = stripos($content, '<!doctype html>')) {
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
namespace Symfony\Component\DomCrawler\Tests;

use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\Attributes\Group;
use PHPUnit\Framework\Attributes\IgnoreDeprecations;
use PHPUnit\Framework\Attributes\RequiresPhpExtension;
use PHPUnit\Framework\Error\Notice;
use PHPUnit\Framework\TestCase;
Expand All @@ -20,21 +22,24 @@
use Symfony\Component\DomCrawler\Image;
use Symfony\Component\DomCrawler\Link;

abstract class AbstractCrawlerTestCase extends TestCase
class CrawlerTestCase extends TestCase
{
abstract public static function getDoctype(): string;
/**
 * Returns the doctype declaration that the tests prepend to their HTML fixtures.
 */
public static function getDoctype(): string
{
return '<!DOCTYPE html>';
}

protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null, bool $useHtml5Parser = true)
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null)
{
return new Crawler($node, $uri, $baseHref, $useHtml5Parser);
return new Crawler($node, $uri, $baseHref, \PHP_VERSION_ID >= 80400);
}

public function testConstructor()
{
$crawler = $this->createCrawler();
$this->assertCount(0, $crawler, '__construct() returns an empty crawler');

$doc = new \DOMDocument();
$doc = $this->createDomDocument();
$node = $doc->createElement('test');

$crawler = $this->createCrawler($node);
Expand Down Expand Up @@ -236,7 +241,7 @@ public function testAddNode()

public function testClear()
{
$doc = new \DOMDocument();
$doc = $this->createDomDocument();
$node = $doc->createElement('test');

$crawler = $this->createCrawler($node);
Expand Down Expand Up @@ -407,7 +412,7 @@ public function testInnerText(
public function testHtml()
{
$this->assertEquals('<img alt="Bar">', $this->createTestCrawler()->filterXPath('//a[5]')->html());
$this->assertEquals('<input type="text" value="TextValue" name="TextName"><input type="submit" value="FooValue" name="FooName" id="FooId"><input type="button" value="BarValue" name="BarName" id="BarId"><button value="ButtonValue" name="ButtonName" id="ButtonId"></button>', trim(preg_replace('~>\s+<~', '><', $this->createTestCrawler()->filterXPath('//form[@id="FooFormId"]')->html())));
$this->assertEquals('<input type="text" value="TextValue" name="TextName"><input type="submit" value="FooValue" name="FooName" id="FooId"><input type="button" value="BarValue" name="BarName" id="BarId"><button value="ButtonValue" name="ButtonName" id="ButtonId"><input type="submit" value="FooBarValue" name="FooBarName" form="FooFormId"><input type="text" value="FooTextValue" name="FooTextName" form="FooFormId"><input type="image" alt="ImageAlt" form="FooFormId"></button>', trim(preg_replace('~>\s+<~', '><', $this->createTestCrawler()->filterXPath('//form[@id="FooFormId"]')->html())));

try {
$this->createTestCrawler()->filterXPath('//ol')->html();
Expand All @@ -421,9 +426,9 @@ public function testHtml()

public function testEmojis()
{
$crawler = $this->createCrawler('<body><p>Hey 👋</p></body>');
$crawler = $this->createCrawler('<head></head><body><p>Hey 👋</p></body>');

$this->assertSame('<body><p>Hey 👋</p></body>', $crawler->html());
$this->assertSame('<head></head><body><p>Hey 👋</p></body>', $crawler->html());
}

public function testExtract()
Expand All @@ -448,7 +453,7 @@ public function testFilterXpathComplexQueries()
$this->assertCount(1, $crawler->filterXPath('./body'));
$this->assertCount(1, $crawler->filterXPath('.//body'));
$this->assertCount(6, $crawler->filterXPath('.//input'));
$this->assertCount(4, $crawler->filterXPath('//form')->filterXPath('//button | //input'));
$this->assertCount(7, $crawler->filterXPath('//form')->filterXPath('//button | //input'));
$this->assertCount(1, $crawler->filterXPath('body'));
$this->assertCount(8, $crawler->filterXPath('//button | //input'));
$this->assertCount(1, $crawler->filterXPath('//body'));
Expand Down Expand Up @@ -530,6 +535,16 @@ public function testFilterXPathWithAnUrl()
$this->assertSame('Music', $crawler->text());
}

/**
 * The mixed-case element of the XML fixture must be matched, and reported, with its original casing.
 */
public function testCaseSentivity()
{
    $matches = $this->createTestXmlCrawler()->filterXPath('//*[local-name() = "CaseSensitiveTag"]');

    $this->assertCount(1, $matches);
    $this->assertSame('Some Content', $matches->text());
    $this->assertSame('CaseSensitiveTag', $matches->nodeName());
}

public function testFilterXPathWithFakeRoot()
{
$crawler = $this->createTestCrawler();
Expand Down Expand Up @@ -1290,10 +1305,82 @@ public function testAddHtmlContentUnsupportedCharset()
$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
}

public function createTestCrawler($uri = null)
/**
 * Feeds malformed XML (unclosed <a> elements inside <nav>) to addXmlContent() while libxml
 * internal error handling is switched on, then inspects libxml_get_errors() before clearing
 * the error buffer and restoring the previous error-handling mode.
 *
 * NOTE(review): libxml_get_errors() returns an array, and PHP orders an array as greater than
 * any integer, so the assertGreaterThan(1, ...) call in this method can never fail. Comparing
 * \count(libxml_get_errors()) instead would make the check meaningful; confirm the expected
 * number of errors before changing it.
 */
public function testAddXmlContentWithErrors()
{
$dom = new \DOMDocument();
$dom->loadHTML($this->getDoctype().'
$internalErrors = libxml_use_internal_errors(true);

$crawler = $this->createCrawler();
$crawler->addXmlContent(<<<'EOF'
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
</head>
<body>
<nav><a href="#"><a href="#"></nav>
</body>
</html>
EOF,
'UTF-8'
);

$this->assertGreaterThan(1, libxml_get_errors());

libxml_clear_errors();
libxml_use_internal_errors($internalErrors);
}

#[IgnoreDeprecations]
#[Group('legacy')]
public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
{
    // Markup that triggers a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
    $markup = '<!DOCTYPE html><html><body><h1><p>Foo</p></h1></body></html>';

    $withHtml5Parser = new Crawler(useHtml5Parser: true);
    $withHtml5Parser->add($markup);

    // Explicitly opting out of HTML5 parsing is the legacy path this test covers.
    $withNativeParser = new Crawler(useHtml5Parser: false);
    $withNativeParser->add($markup);

    $this->assertNotEquals(
        $withNativeParser->filterXPath('//h1')->text(),
        $withHtml5Parser->filterXPath('//h1')->text(),
        'Native parser and Html5 parser must be different'
    );
}

public function testAddHtml5()
{
    // Regression check for a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
    $crawler = $this->createCrawler();
    $markup = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
    $crawler->add($markup);

    $headingText = $crawler->filterXPath('//h1')->text();

    $this->assertEquals('Foo', $headingText, '->add() adds nodes from a string');
}

/**
 * Each dataset puts something (BOM, comments, whitespace) in front of the doctype;
 * the <h1> text must still be readable after addHtmlContent().
 */
#[DataProvider('html5Provider')]
public function testHtml5ParserParseContentStartingWithValidHeading(string $content)
{
    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);

    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEquals('Foo', $headingText, '->addHtmlContent() parses valid HTML with comment before doctype');
}

/**
 * Yields the same HTML document preceded by different leading content.
 *
 * @return iterable<string, array{string}>
 */
public static function html5Provider(): iterable
{
    $document = self::getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
    // UTF-8 byte order mark
    $bom = "\xEF\xBB\xBF";

    $prefixes = [
        'BOM first' => $bom,
        'Single comment' => '<!-- comment -->',
        'Multiline comment' => "<!-- \n multiline comment \n -->",
        'Several comments' => '<!--c--> <!--cc-->',
        'Whitespaces' => ' ',
        'All together' => $bom.' <!--c-->',
    ];

    foreach ($prefixes as $label => $prefix) {
        yield $label => [$prefix.$document];
    }
}

protected function createTestCrawler($uri = null)
{
$html = $this->getDoctype().'
<html>
<body>
<a href="foo">Foo</a>
Expand Down Expand Up @@ -1352,9 +1439,9 @@ public function createTestCrawler($uri = null)
</div>
</body>
</html>
');
';

return $this->createCrawler($dom, $uri);
return $this->createCrawler($html, $uri);
}

protected function createTestXmlCrawler($uri = null)
Expand All @@ -1369,6 +1456,7 @@ protected function createTestXmlCrawler($uri = null)
<yt:aspectRatio>widescreen</yt:aspectRatio>
</media:group>
<media:category label="Music" scheme="http://gdata.youtube.com/schemas/2007/categories.cat">Music</media:category>
<CaseSensitiveTag>Some Content</CaseSensitiveTag>
</entry>';

return $this->createCrawler($xml, $uri);
Expand Down
Loading
Loading