Skip to content

Commit bd7fb51

Browse files
[DomCrawler] Use the native HTML5 parser on PHP 8.4
1 parent b223c40 commit bd7fb51

12 files changed

+340
-201
lines changed

UPGRADE-7.4.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ Read more about this in the [Symfony documentation](https://symfony.com/doc/7.4/
88

99
If you're upgrading from a version below 7.3, follow the [7.3 upgrade guide](UPGRADE-7.3.md) first.
1010

11+
BrowserKit
12+
----------
13+
14+
* Deprecate `AbstractBrowser::useHtml5Parser()`; Symfony 8 will unconditionally use the native HTML5 parser
15+
1116
Cache
1217
-----
1318

@@ -29,6 +34,11 @@ DoctrineBridge
2934

3035
* Deprecate `UniqueEntity::getRequiredOptions()` and `UniqueEntity::getDefaultOption()`
3136

37+
DomCrawler
38+
----------
39+
40+
* Disabling HTML5 parsing is deprecated; Symfony 8 will unconditionally use the native HTML5 parser
41+
3242
FrameworkBundle
3343
---------------
3444

src/Symfony/Component/BrowserKit/AbstractBrowser.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ abstract class AbstractBrowser
4646
/** @psalm-var TResponse */
4747
protected object $response;
4848
protected Crawler $crawler;
49+
/** @deprecated since Symfony 7.4, to be removed in Symfony 8 */
4950
protected bool $useHtml5Parser = true;
5051
protected bool $insulated = false;
5152
protected ?string $redirect;
@@ -204,10 +205,16 @@ public function getCrawler(): Crawler
204205
/**
205206
* Sets whether parsing should be done using "masterminds/html5".
206207
*
208+
* @deprecated since Symfony 7.4, Symfony 8 will unconditionally use the native HTML5 parser
209+
*
207210
* @return $this
208211
*/
209212
public function useHtml5Parser(bool $useHtml5Parser): static
210213
{
214+
if (\PHP_VERSION_ID >= 80400) {
215+
trigger_deprecation('symfony/browser-kit', '7.4', 'Method "%s()" is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.', __METHOD__);
216+
}
217+
211218
$this->useHtml5Parser = $useHtml5Parser;
212219

213220
return $this;

src/Symfony/Component/BrowserKit/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ CHANGELOG
66

77
* Add `isFirstPage()` and `isLastPage()` methods to the History class for checking navigation boundaries
88
* Add PHPUnit constraints: `BrowserHistoryIsOnFirstPage` and `BrowserHistoryIsOnLastPage`
9+
* Deprecate `AbstractBrowser::useHtml5Parser()`; Symfony 8 will unconditionally use the native HTML5 parser
910

1011
6.4
1112
---

src/Symfony/Component/BrowserKit/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
],
1818
"require": {
1919
"php": ">=8.2",
20+
"symfony/deprecation-contracts": "^2.5|^3",
2021
"symfony/dom-crawler": "^6.4|^7.0|^8.0"
2122
},
2223
"require-dev": {

src/Symfony/Component/DomCrawler/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
CHANGELOG
22
=========
33

4+
7.4
5+
---
6+
7+
* Disabling HTML5 parsing is deprecated; Symfony 8 will unconditionally use the native HTML5 parser
8+
49
7.0
510
---
611

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,14 @@ public function __construct(
6262
\DOMNodeList|\DOMNode|array|string|null $node = null,
6363
protected ?string $uri = null,
6464
?string $baseHref = null,
65-
bool $useHtml5Parser = true,
65+
private bool $useHtml5Parser = true,
6666
) {
67+
if (\PHP_VERSION_ID >= 80400 && !$useHtml5Parser) {
68+
trigger_deprecation('symfony/dom-crawler', '7.4', 'Disabling HTML5 parsing is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.');
69+
}
70+
6771
$this->baseHref = $baseHref ?: $uri;
68-
$this->html5Parser = $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
72+
$this->html5Parser = \PHP_VERSION_ID < 80400 && $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
6973
$this->cachedNamespaces = new \ArrayObject();
7074

7175
$this->add($node);
@@ -1081,23 +1085,41 @@ private function supportsEncoding(string $encoding): bool
10811085

10821086
/**
 * Parses an HTML string into a \DOMDocument.
 *
 * Two code paths exist:
 *  - on PHP < 8.4, or when HTML5 parsing was disabled in the constructor, the
 *    content is loaded with \DOMDocument::loadHTML() after being prefixed with
 *    an encoding hint (valid UTF-8) or converted to HTML entities (any other case);
 *  - otherwise the content is parsed with \Dom\HTMLDocument, serialized to XML
 *    and re-loaded into a \DOMDocument.
 *
 * @param string $htmlContent The markup to parse
 * @param string $charset     The charset of $htmlContent; on the \Dom\HTMLDocument path it is
 *                            replaced by the parsed document's inputEncoding before the
 *                            \DOMDocument is created
 */
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
10831087
{
1084-
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
1085-
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
1086-
} else {
1087-
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1088+
// Legacy path: kept for PHP < 8.4 and for callers that explicitly disabled HTML5 parsing
if (\PHP_VERSION_ID < 80400 || !$this->useHtml5Parser) {
1089+
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
1090+
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
1091+
} else {
1092+
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1093+
}
1094+
1095+
$internalErrors = libxml_use_internal_errors(true);
1096+
1097+
$dom = new \DOMDocument('1.0', $charset);
1098+
$dom->validateOnParse = true;
1099+
1100+
if ('' !== trim($htmlContent)) {
1101+
@$dom->loadHTML($htmlContent);
1102+
}
1103+
1104+
libxml_use_internal_errors($internalErrors);
1105+
1106+
return $dom;
10881107
}
10891108

1090-
$internalErrors = libxml_use_internal_errors(true);
1109+
// \Dom\HTMLDocument path: parse, then round-trip through XML so that callers still receive a \DOMDocument.
// NOTE(review): HTML_NO_DEFAULT_NS presumably keeps elements out of the XHTML namespace so that
// un-prefixed XPath queries keep matching - confirm against the PHP manual.
// NOTE(review): the results of saveXml() and loadXML() are not checked here - confirm that a failure cannot occur.
$document = @\Dom\HTMLDocument::createFromString($htmlContent, \Dom\HTML_NO_DEFAULT_NS, $charset);
1110+
$htmlContent = $document->saveXml();
1111+
$charset = $document->inputEncoding;
10911112

10921113
$dom = new \DOMDocument('1.0', $charset);
1093-
$dom->validateOnParse = true;
1114+
$dom->loadXML($htmlContent);
10941115

1095-
if ('' !== trim($htmlContent)) {
1096-
@$dom->loadHTML($htmlContent);
1116+
// Register id attributes as ID attributes for getElementById to work
1117+
foreach ((new \DOMXPath($dom))->query('//*[@id]') as $element) {
1118+
if ($element instanceof \DOMElement) {
1119+
$element->setIdAttribute('id', true);
1120+
}
10971121
}
10981122

1099-
libxml_use_internal_errors($internalErrors);
1100-
11011123
return $dom;
11021124
}
11031125

@@ -1216,7 +1238,7 @@ private function canParseHtml5String(string $content): bool
12161238
return false;
12171239
}
12181240

1219-
if (false === ($pos = stripos($content, '<!doctype html>'))) {
1241+
if (false === $pos = stripos($content, '<!doctype html>')) {
12201242
return false;
12211243
}
12221244

src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php renamed to src/Symfony/Component/DomCrawler/Tests/CrawlerTestCase.php

Lines changed: 103 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
namespace Symfony\Component\DomCrawler\Tests;
1313

1414
use PHPUnit\Framework\Attributes\DataProvider;
15+
use PHPUnit\Framework\Attributes\Group;
16+
use PHPUnit\Framework\Attributes\IgnoreDeprecations;
1517
use PHPUnit\Framework\Attributes\RequiresPhpExtension;
1618
use PHPUnit\Framework\Error\Notice;
1719
use PHPUnit\Framework\TestCase;
@@ -20,21 +22,24 @@
2022
use Symfony\Component\DomCrawler\Image;
2123
use Symfony\Component\DomCrawler\Link;
2224

23-
abstract class AbstractCrawlerTestCase extends TestCase
25+
class CrawlerTestCase extends TestCase
2426
{
25-
abstract public static function getDoctype(): string;
27+
/**
 * Returns the doctype that this test case prepends to its HTML fixtures.
 */
public static function getDoctype(): string
28+
{
29+
return '<!DOCTYPE html>';
30+
}
2631

27-
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null, bool $useHtml5Parser = true)
32+
/**
 * Builds the Crawler under test.
 *
 * The fourth constructor argument ($useHtml5Parser) is true only on PHP >= 8.4,
 * so older PHP versions run this test case with HTML5 parsing disabled.
 */
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null)
2833
{
29-
return new Crawler($node, $uri, $baseHref, $useHtml5Parser);
34+
return new Crawler($node, $uri, $baseHref, \PHP_VERSION_ID >= 80400);
3035
}
3136

3237
public function testConstructor()
3338
{
3439
$crawler = $this->createCrawler();
3540
$this->assertCount(0, $crawler, '__construct() returns an empty crawler');
3641

37-
$doc = new \DOMDocument();
42+
$doc = $this->createDomDocument();
3843
$node = $doc->createElement('test');
3944

4045
$crawler = $this->createCrawler($node);
@@ -236,7 +241,7 @@ public function testAddNode()
236241

237242
public function testClear()
238243
{
239-
$doc = new \DOMDocument();
244+
$doc = $this->createDomDocument();
240245
$node = $doc->createElement('test');
241246

242247
$crawler = $this->createCrawler($node);
@@ -407,7 +412,7 @@ public function testInnerText(
407412
public function testHtml()
408413
{
409414
$this->assertEquals('<img alt="Bar">', $this->createTestCrawler()->filterXPath('//a[5]')->html());
410-
$this->assertEquals('<input type="text" value="TextValue" name="TextName"><input type="submit" value="FooValue" name="FooName" id="FooId"><input type="button" value="BarValue" name="BarName" id="BarId"><button value="ButtonValue" name="ButtonName" id="ButtonId"></button>', trim(preg_replace('~>\s+<~', '><', $this->createTestCrawler()->filterXPath('//form[@id="FooFormId"]')->html())));
415+
$this->assertEquals('<input type="text" value="TextValue" name="TextName"><input type="submit" value="FooValue" name="FooName" id="FooId"><input type="button" value="BarValue" name="BarName" id="BarId"><button value="ButtonValue" name="ButtonName" id="ButtonId"><input type="submit" value="FooBarValue" name="FooBarName" form="FooFormId"><input type="text" value="FooTextValue" name="FooTextName" form="FooFormId"><input type="image" alt="ImageAlt" form="FooFormId"></button>', trim(preg_replace('~>\s+<~', '><', $this->createTestCrawler()->filterXPath('//form[@id="FooFormId"]')->html())));
411416

412417
try {
413418
$this->createTestCrawler()->filterXPath('//ol')->html();
@@ -421,9 +426,9 @@ public function testHtml()
421426

422427
/**
 * Checks that markup containing an emoji is returned unchanged by html().
 */
public function testEmojis()
422428
{
424-
$crawler = $this->createCrawler('<body><p>Hey 👋</p></body>');
429+
$crawler = $this->createCrawler('<head></head><body><p>Hey 👋</p></body>');
425430

426-
$this->assertSame('<body><p>Hey 👋</p></body>', $crawler->html());
431+
$this->assertSame('<head></head><body><p>Hey 👋</p></body>', $crawler->html());
427432
}
428433

429434
public function testExtract()
@@ -448,7 +453,7 @@ public function testFilterXpathComplexQueries()
448453
$this->assertCount(1, $crawler->filterXPath('./body'));
449454
$this->assertCount(1, $crawler->filterXPath('.//body'));
450455
$this->assertCount(6, $crawler->filterXPath('.//input'));
451-
$this->assertCount(4, $crawler->filterXPath('//form')->filterXPath('//button | //input'));
456+
$this->assertCount(7, $crawler->filterXPath('//form')->filterXPath('//button | //input'));
452457
$this->assertCount(1, $crawler->filterXPath('body'));
453458
$this->assertCount(8, $crawler->filterXPath('//button | //input'));
454459
$this->assertCount(1, $crawler->filterXPath('//body'));
@@ -530,6 +535,16 @@ public function testFilterXPathWithAnUrl()
530535
$this->assertSame('Music', $crawler->text());
531536
}
532537

538+
/**
 * Checks that a mixed-case element of the XML fixture can be selected by its
 * exact local name and that nodeName() reports it with its original casing.
 *
 * NOTE(review): the method name is misspelled ("Sentivity" instead of "Sensitivity").
 */
public function testCaseSentivity()
539+
{
540+
$crawler = $this->createTestXmlCrawler();
541+
542+
$crawler = $crawler->filterXPath('//*[local-name() = "CaseSensitiveTag"]');
543+
$this->assertCount(1, $crawler);
544+
$this->assertSame('Some Content', $crawler->text());
545+
$this->assertSame('CaseSensitiveTag', $crawler->nodeName());
546+
}
547+
533548
public function testFilterXPathWithFakeRoot()
534549
{
535550
$crawler = $this->createTestCrawler();
@@ -1290,10 +1305,82 @@ public function testAddHtmlContentUnsupportedCharset()
12901305
$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
12911306
}
12921307

1293-
public function createTestCrawler($uri = null)
1308+
/**
 * Checks that adding malformed XML content leaves more than one libxml error
 * available to the caller through libxml_get_errors().
 *
 * The previous libxml internal-errors setting is restored at the end of the test.
 */
public function testAddXmlContentWithErrors()
12941309
{
1295-
$dom = new \DOMDocument();
1296-
$dom->loadHTML($this->getDoctype().'
1310+
$internalErrors = libxml_use_internal_errors(true);
1311+
1312+
$crawler = $this->createCrawler();
1313+
$crawler->addXmlContent(<<<'EOF'
1314+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
1315+
<html>
1316+
<head>
1317+
</head>
1318+
<body>
1319+
<nav><a href="#"><a href="#"></nav>
1320+
</body>
1321+
</html>
1322+
EOF,
1323+
'UTF-8'
1324+
);
1325+
1326+
// Count the errors: comparing the array itself with an int always succeeds, which made the assertion vacuous
$this->assertGreaterThan(1, \count(libxml_get_errors()));
1327+
1328+
libxml_clear_errors();
1329+
libxml_use_internal_errors($internalErrors);
1330+
}
1331+
1332+
/**
 * Checks that a Crawler built with HTML5 parsing enabled and one built with it
 * disabled extract different <h1> text from the same markup.
 *
 * Deprecations are ignored because constructing a Crawler with HTML5 parsing
 * disabled triggers a deprecation on PHP >= 8.4.
 */
#[IgnoreDeprecations]
1333+
#[Group('legacy')]
1334+
public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
1335+
{
1336+
// HTML that triggers a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
1337+
$html = '<!DOCTYPE html><html><body><h1><p>Foo</p></h1></body></html>';
1338+
1339+
$html5Crawler = new Crawler(null, null, null, true);
1340+
$html5Crawler->add($html);
1341+
1342+
$nativeCrawler = new Crawler(null, null, null, false);
1343+
$nativeCrawler->add($html);
1344+
1345+
$this->assertNotEquals($nativeCrawler->filterXPath('//h1')->text(), $html5Crawler->filterXPath('//h1')->text(), 'Native parser and Html5 parser must be different');
1346+
}
1347+
1348+
/**
 * Checks that a <p> nested inside an <h1> is still reported as the text of
 * the <h1> after the markup is added from a string.
 */
public function testAddHtml5()
1349+
{
1350+
// Ensure a bug specific to the DOM extension is fixed (see https://github.com/symfony/symfony/issues/28596)
1351+
$crawler = $this->createCrawler();
1352+
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
1353+
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
1354+
}
1355+
1356+
/**
 * Checks that the <h1> text is still extracted when the doctype is preceded
 * by the prefixes supplied by html5Provider() (BOM, comments, whitespace).
 *
 * @param string $content The full HTML document, including its leading prefix
 */
#[DataProvider('html5Provider')]
1357+
public function testHtml5ParserParseContentStartingWithValidHeading(string $content)
1358+
{
1359+
$crawler = $this->createCrawler();
1360+
$crawler->addHtmlContent($content);
1361+
self::assertEquals(
1362+
'Foo',
1363+
$crawler->filterXPath('//h1')->text(),
1364+
'->addHtmlContent() parses valid HTML with comment before doctype'
1365+
);
1366+
}
1367+
1368+
/**
 * Provides HTML documents whose doctype is preceded by a UTF-8 byte order
 * mark, one or more comments, whitespace, or a combination of these.
 *
 * Each dataset is a single-element array holding the document string.
 */
public static function html5Provider(): iterable
1369+
{
1370+
$html = self::getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
1371+
// The three bytes of the UTF-8 byte order mark
$BOM = \chr(0xEF).\chr(0xBB).\chr(0xBF);
1372+
1373+
yield 'BOM first' => [$BOM.$html];
1374+
yield 'Single comment' => ['<!-- comment -->'.$html];
1375+
yield 'Multiline comment' => ["<!-- \n multiline comment \n -->".$html];
1376+
yield 'Several comments' => ['<!--c--> <!--cc-->'.$html];
1377+
yield 'Whitespaces' => [' '.$html];
1378+
yield 'All together' => [$BOM.' <!--c-->'.$html];
1379+
}
1380+
1381+
protected function createTestCrawler($uri = null)
1382+
{
1383+
$html = $this->getDoctype().'
12971384
<html>
12981385
<body>
12991386
<a href="foo">Foo</a>
@@ -1352,9 +1439,9 @@ public function createTestCrawler($uri = null)
13521439
</div>
13531440
</body>
13541441
</html>
1355-
');
1442+
';
13561443

1357-
return $this->createCrawler($dom, $uri);
1444+
return $this->createCrawler($html, $uri);
13581445
}
13591446

13601447
protected function createTestXmlCrawler($uri = null)
@@ -1369,6 +1456,7 @@ protected function createTestXmlCrawler($uri = null)
13691456
<yt:aspectRatio>widescreen</yt:aspectRatio>
13701457
</media:group>
13711458
<media:category label="Music" scheme="http://gdata.youtube.com/schemas/2007/categories.cat">Music</media:category>
1459+
<CaseSensitiveTag>Some Content</CaseSensitiveTag>
13721460
</entry>';
13731461

13741462
return $this->createCrawler($xml, $uri);

0 commit comments

Comments
 (0)