Skip to content

Commit f963618

Browse files
[DomCrawler] Use the native HTML5 parser on PHP 8.4
1 parent b223c40 commit f963618

File tree

10 files changed

+214
-26
lines changed

10 files changed

+214
-26
lines changed

UPGRADE-7.4.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ Read more about this in the [Symfony documentation](https://symfony.com/doc/7.4/
88

99
If you're upgrading from a version below 7.3, follow the [7.3 upgrade guide](UPGRADE-7.3.md) first.
1010

11+
BrowserKit
12+
----------
13+
14+
* Deprecate `AbstractBrowser::useHtml5Parser()`; Symfony 8 will unconditionally use the native HTML5 parser
15+
1116
Cache
1217
-----
1318

@@ -29,6 +34,11 @@ DoctrineBridge
2934

3035
* Deprecate `UniqueEntity::getRequiredOptions()` and `UniqueEntity::getDefaultOption()`
3136

37+
DomCrawler
38+
----------
39+
40+
* Disabling HTML5 parsing is deprecated; Symfony 8 will unconditionally use the native HTML5 parser
41+
3242
FrameworkBundle
3343
---------------
3444

src/Symfony/Component/BrowserKit/AbstractBrowser.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ abstract class AbstractBrowser
4646
/** @psalm-var TResponse */
4747
protected object $response;
4848
protected Crawler $crawler;
49+
/** @deprecated since Symfony 7.4, to be removed in Symfony 8 */
4950
protected bool $useHtml5Parser = true;
5051
protected bool $insulated = false;
5152
protected ?string $redirect;
@@ -204,10 +205,16 @@ public function getCrawler(): Crawler
204205
/**
205206
* Sets whether parsing should be done using "masterminds/html5".
206207
*
208+
* @deprecated since Symfony 7.4, Symfony 8 will unconditionally use the native HTML5 parser
209+
*
207210
* @return $this
208211
*/
209212
public function useHtml5Parser(bool $useHtml5Parser): static
210213
{
214+
if (\PHP_VERSION_ID >= 80400) {
215+
trigger_deprecation('symfony/browser-kit', '7.4', 'Method "%s()" is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.', __METHOD__);
216+
}
217+
211218
$this->useHtml5Parser = $useHtml5Parser;
212219

213220
return $this;

src/Symfony/Component/BrowserKit/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ CHANGELOG
66

77
* Add `isFirstPage()` and `isLastPage()` methods to the History class for checking navigation boundaries
88
* Add PHPUnit constraints: `BrowserHistoryIsOnFirstPage` and `BrowserHistoryIsOnLastPage`
9+
* Deprecate `AbstractBrowser::useHtml5Parser()`; Symfony 8 will unconditionally use the native HTML5 parser
910

1011
6.4
1112
---

src/Symfony/Component/BrowserKit/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
],
1818
"require": {
1919
"php": ">=8.2",
20+
"symfony/deprecation-contracts": "^2.5|^3",
2021
"symfony/dom-crawler": "^6.4|^7.0|^8.0"
2122
},
2223
"require-dev": {

src/Symfony/Component/DomCrawler/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
CHANGELOG
22
=========
33

4+
7.4
5+
---
6+
7+
* Disabling HTML5 parsing is deprecated; Symfony 8 will unconditionally use the native HTML5 parser
8+
49
7.0
510
---
611

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,12 @@ public function __construct(
6262
\DOMNodeList|\DOMNode|array|string|null $node = null,
6363
protected ?string $uri = null,
6464
?string $baseHref = null,
65-
bool $useHtml5Parser = true,
65+
private bool $useHtml5Parser = true,
6666
) {
67+
if (\PHP_VERSION_ID >= 80400 && !$useHtml5Parser) {
68+
trigger_deprecation('symfony/dom-crawler', '7.4', 'Disabling HTML5 parsing is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.');
69+
}
70+
6771
$this->baseHref = $baseHref ?: $uri;
6872
$this->html5Parser = $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
6973
$this->cachedNamespaces = new \ArrayObject();
@@ -1081,23 +1085,41 @@ private function supportsEncoding(string $encoding): bool
10811085

10821086
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
10831087
{
1084-
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
1085-
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
1086-
} else {
1087-
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1088+
if (\PHP_VERSION_ID < 80400 || !$this->useHtml5Parser) {
1089+
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
1090+
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
1091+
} else {
1092+
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1093+
}
1094+
1095+
$internalErrors = libxml_use_internal_errors(true);
1096+
1097+
$dom = new \DOMDocument('1.0', $charset);
1098+
$dom->validateOnParse = true;
1099+
1100+
if ('' !== trim($htmlContent)) {
1101+
@$dom->loadHTML($htmlContent);
1102+
}
1103+
1104+
libxml_use_internal_errors($internalErrors);
1105+
1106+
return $dom;
10881107
}
10891108

1090-
$internalErrors = libxml_use_internal_errors(true);
1109+
$document = @\Dom\HTMLDocument::createFromString($htmlContent, \Dom\HTML_NO_DEFAULT_NS, $charset);
1110+
$htmlContent = $document->saveXml();
1111+
$charset = $document->inputEncoding;
10911112

10921113
$dom = new \DOMDocument('1.0', $charset);
1093-
$dom->validateOnParse = true;
1114+
$dom->loadXML($htmlContent);
10941115

1095-
if ('' !== trim($htmlContent)) {
1096-
@$dom->loadHTML($htmlContent);
1116+
// Register id attributes as ID attributes for getElementById to work
1117+
foreach ((new \DOMXPath($dom))->query('//*[@id]') as $element) {
1118+
if ($element instanceof \DOMElement) {
1119+
$element->setIdAttribute('id', true);
1120+
}
10971121
}
10981122

1099-
libxml_use_internal_errors($internalErrors);
1100-
11011123
return $dom;
11021124
}
11031125

@@ -1216,7 +1238,7 @@ private function canParseHtml5String(string $content): bool
12161238
return false;
12171239
}
12181240

1219-
if (false === ($pos = stripos($content, '<!doctype html>'))) {
1241+
if (false === $pos = stripos($content, '<!doctype html>')) {
12201242
return false;
12211243
}
12221244

src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,17 @@ abstract class AbstractCrawlerTestCase extends TestCase
2424
{
2525
abstract public static function getDoctype(): string;
2626

27-
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null, bool $useHtml5Parser = true)
27+
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null)
2828
{
29-
return new Crawler($node, $uri, $baseHref, $useHtml5Parser);
29+
return new Crawler($node, $uri, $baseHref, false);
3030
}
3131

3232
public function testConstructor()
3333
{
3434
$crawler = $this->createCrawler();
3535
$this->assertCount(0, $crawler, '__construct() returns an empty crawler');
3636

37-
$doc = new \DOMDocument();
37+
$doc = $this->createDomDocument();
3838
$node = $doc->createElement('test');
3939

4040
$crawler = $this->createCrawler($node);
@@ -236,7 +236,7 @@ public function testAddNode()
236236

237237
public function testClear()
238238
{
239-
$doc = new \DOMDocument();
239+
$doc = $this->createDomDocument();
240240
$node = $doc->createElement('test');
241241

242242
$crawler = $this->createCrawler($node);
@@ -421,9 +421,9 @@ public function testHtml()
421421

422422
public function testEmojis()
423423
{
424-
$crawler = $this->createCrawler('<body><p>Hey 👋</p></body>');
424+
$crawler = $this->createCrawler('<head></head><body><p>Hey 👋</p></body>');
425425

426-
$this->assertSame('<body><p>Hey 👋</p></body>', $crawler->html());
426+
$this->assertSame('<head></head><body><p>Hey 👋</p></body>', $crawler->html());
427427
}
428428

429429
public function testExtract()
@@ -530,6 +530,16 @@ public function testFilterXPathWithAnUrl()
530530
$this->assertSame('Music', $crawler->text());
531531
}
532532

533+
public function testCaseSentivity()
534+
{
535+
$crawler = $this->createTestXmlCrawler();
536+
537+
$crawler = $crawler->filterXPath('//*[local-name() = "CaseSensitiveTag"]');
538+
$this->assertCount(1, $crawler);
539+
$this->assertSame('Some Content', $crawler->text());
540+
$this->assertSame('CaseSensitiveTag', $crawler->nodeName());
541+
}
542+
533543
public function testFilterXPathWithFakeRoot()
534544
{
535545
$crawler = $this->createTestCrawler();
@@ -1292,8 +1302,7 @@ public function testAddHtmlContentUnsupportedCharset()
12921302

12931303
public function createTestCrawler($uri = null)
12941304
{
1295-
$dom = new \DOMDocument();
1296-
$dom->loadHTML($this->getDoctype().'
1305+
$html = $this->getDoctype().'
12971306
<html>
12981307
<body>
12991308
<a href="foo">Foo</a>
@@ -1352,9 +1361,9 @@ public function createTestCrawler($uri = null)
13521361
</div>
13531362
</body>
13541363
</html>
1355-
');
1364+
';
13561365

1357-
return $this->createCrawler($dom, $uri);
1366+
return $this->createCrawler($html, $uri);
13581367
}
13591368

13601369
protected function createTestXmlCrawler($uri = null)
@@ -1369,6 +1378,7 @@ protected function createTestXmlCrawler($uri = null)
13691378
<yt:aspectRatio>widescreen</yt:aspectRatio>
13701379
</media:group>
13711380
<media:category label="Music" scheme="http://gdata.youtube.com/schemas/2007/categories.cat">Music</media:category>
1381+
<CaseSensitiveTag>Some Content</CaseSensitiveTag>
13721382
</entry>';
13731383

13741384
return $this->createCrawler($xml, $uri);

src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php renamed to src/Symfony/Component/DomCrawler/Tests/LegacyHtml5ParserCrawlerTest.php

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,12 @@
1212
namespace Symfony\Component\DomCrawler\Tests;
1313

1414
use PHPUnit\Framework\Attributes\DataProvider;
15+
use PHPUnit\Framework\Attributes\RequiresPhp;
1516
use PHPUnit\Framework\Attributes\TestWith;
17+
use Symfony\Component\DomCrawler\Crawler;
1618

17-
class Html5ParserCrawlerTest extends AbstractCrawlerTestCase
19+
#[RequiresPhp('<8.4')]
20+
class LegacyHtml5ParserCrawlerTest extends AbstractCrawlerTestCase
1821
{
1922
public static function getDoctype(): string
2023
{
@@ -54,10 +57,10 @@ public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
5457
// HTML that triggers a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
5558
$html = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
5659

57-
$html5Crawler = $this->createCrawler(null, null, null, true);
60+
$html5Crawler = $this->createCrawler();
5861
$html5Crawler->add($html);
5962

60-
$nativeCrawler = $this->createCrawler(null, null, null, false);
63+
$nativeCrawler = parent::createCrawler();
6164
$nativeCrawler->add($html);
6265

6366
$this->assertNotEquals($nativeCrawler->filterXPath('//h1')->text(), $html5Crawler->filterXPath('//h1')->text(), 'Native parser and Html5 parser must be different');
@@ -67,7 +70,7 @@ public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
6770
#[TestWith([false])]
6871
public function testHasHtml5Parser(bool $useHtml5Parser)
6972
{
70-
$crawler = $this->createCrawler(null, null, null, $useHtml5Parser);
73+
$crawler = $useHtml5Parser ? $this->createCrawler() : parent::createCrawler();
7174

7275
$r = new \ReflectionProperty($crawler::class, 'html5Parser');
7376
$html5Parser = $r->getValue($crawler);
@@ -99,4 +102,9 @@ public static function invalidHtml5Provider(): iterable
99102
yield 'Text' => ['hello world'.$html];
100103
yield 'Text between comments' => ['<!--c--> test <!--cc-->'.$html];
101104
}
105+
106+
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null)
107+
{
108+
return new Crawler($node, $uri, $baseHref, true);
109+
}
102110
}

0 commit comments

Comments
 (0)