Skip to content

Commit 14e422a

Browse files
[DomCrawler] Always parse according to HTML5 rules thanks to the native DOM parser
1 parent 246ffab commit 14e422a

File tree

9 files changed

+22
-309
lines changed

9 files changed

+22
-309
lines changed

UPGRADE-8.0.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ AssetMapper
1414

1515
* Remove `ImportMapConfigReader::splitPackageNameAndFilePath()`, use `ImportMapEntry::splitPackageNameAndFilePath()` instead
1616

17+
BrowserKit
18+
----------
19+
20+
* Remove `AbstractBrowser::useHtml5Parser()`; the native HTML5 parser is used unconditionally
21+
1722
Cache
1823
-----
1924

@@ -117,6 +122,11 @@ DoctrineBridge
117122
* Remove support for auto-mapping Doctrine entities to controller arguments; use explicit mapping instead
118123
* Make `ProxyCacheWarmer` class `final`
119124

125+
DomCrawler
126+
----------
127+
128+
* Remove argument `$useHtml5Parser` of `Crawler`'s constructor; the native HTML5 parser is used unconditionally
129+
120130
ExpressionLanguage
121131
------------------
122132

src/Symfony/Component/BrowserKit/AbstractBrowser.php

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,6 @@ abstract class AbstractBrowser
4646
/** @psalm-var TResponse */
4747
protected object $response;
4848
protected Crawler $crawler;
49-
/** @deprecated since Symfony 7.4, to be removed in Symfony 8 */
50-
protected bool $useHtml5Parser = true;
5149
protected bool $insulated = false;
5250
protected ?string $redirect;
5351
protected bool $followRedirects = true;
@@ -202,24 +200,6 @@ public function getCrawler(): Crawler
202200
return $this->crawler ?? throw new BadMethodCallException(\sprintf('The "request()" method must be called before "%s()".', __METHOD__));
203201
}
204202

205-
/**
206-
* Sets whether parsing should be done using "masterminds/html5".
207-
*
208-
* @deprecated since Symfony 7.4, Symfony 8 will unconditionally use the native HTML5 parser
209-
*
210-
* @return $this
211-
*/
212-
public function useHtml5Parser(bool $useHtml5Parser): static
213-
{
214-
if (\PHP_VERSION_ID >= 80400) {
215-
trigger_deprecation('symfony/browser-kit', '7.4', 'Method "%s()" is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.', __METHOD__);
216-
}
217-
218-
$this->useHtml5Parser = $useHtml5Parser;
219-
220-
return $this;
221-
}
222-
223203
/**
224204
* Returns the current BrowserKit Response instance.
225205
*/
@@ -507,7 +487,7 @@ protected function createCrawlerFromContent(string $uri, string $content, string
507487
return null;
508488
}
509489

510-
$crawler = new Crawler(null, $uri, null, $this->useHtml5Parser);
490+
$crawler = new Crawler(null, $uri, null);
511491
$crawler->addContent($content, $type);
512492

513493
return $crawler;

src/Symfony/Component/BrowserKit/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
CHANGELOG
22
=========
33

4+
8.0
5+
---
6+
7+
* Remove `AbstractBrowser::useHtml5Parser()`; the native HTML5 parser is used unconditionally
8+
49
7.4
510
---
611

src/Symfony/Component/DomCrawler/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
CHANGELOG
22
=========
33

4+
8.0
5+
---
6+
7+
* Remove argument `$useHtml5Parser` of `Crawler`'s constructor; the native HTML5 parser is used unconditionally
8+
49
7.4
510
---
611

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 1 addition & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
namespace Symfony\Component\DomCrawler;
1313

14-
use Masterminds\HTML5;
1514
use Symfony\Component\CssSelector\CssSelectorConverter;
1615

1716
/**
@@ -53,23 +52,15 @@ class Crawler implements \Countable, \IteratorAggregate
5352
*/
5453
private bool $isHtml = true;
5554

56-
private ?HTML5 $html5Parser = null;
57-
5855
/**
5956
* @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
6057
*/
6158
public function __construct(
6259
\DOMNodeList|\DOMNode|array|string|null $node = null,
6360
protected ?string $uri = null,
6461
?string $baseHref = null,
65-
private bool $useHtml5Parser = true,
6662
) {
67-
if (\PHP_VERSION_ID >= 80400 && !$useHtml5Parser) {
68-
trigger_deprecation('symfony/dom-crawler', '7.4', 'Disabling HTML5 parsing is deprecated. Symfony 8 will unconditionally use the native HTML5 parser.');
69-
}
70-
7163
$this->baseHref = $baseHref ?: $uri;
72-
$this->html5Parser = \PHP_VERSION_ID < 80400 && $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
7364
$this->cachedNamespaces = new \ArrayObject();
7465

7566
$this->add($node);
@@ -175,7 +166,7 @@ public function addContent(string $content, ?string $type = null): void
175166
*/
176167
public function addHtmlContent(string $content, string $charset = 'UTF-8'): void
177168
{
178-
$dom = $this->parseHtmlString($content, $charset);
169+
$dom = $this->parseXhtml($content, $charset);
179170
$this->addDocument($dom);
180171

181172
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -609,10 +600,6 @@ public function html(?string $default = null): string
609600
$node = $this->getNode(0);
610601
$owner = $node->ownerDocument;
611602

612-
if ($this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
613-
$owner = $this->html5Parser;
614-
}
615-
616603
$html = '';
617604
foreach ($node->childNodes as $child) {
618605
$html .= $owner->saveHTML($child);
@@ -630,10 +617,6 @@ public function outerHtml(): string
630617
$node = $this->getNode(0);
631618
$owner = $node->ownerDocument;
632619

633-
if ($this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
634-
$owner = $this->html5Parser;
635-
}
636-
637620
return $owner->saveHTML($node);
638621
}
639622

@@ -1064,48 +1047,8 @@ protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'):
10641047
return $nodes;
10651048
}
10661049

1067-
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
1068-
{
1069-
if (!$this->supportsEncoding($charset)) {
1070-
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1071-
$charset = 'UTF-8';
1072-
}
1073-
1074-
return $this->html5Parser->parse($htmlContent, ['encoding' => $charset]);
1075-
}
1076-
1077-
private function supportsEncoding(string $encoding): bool
1078-
{
1079-
try {
1080-
return '' === @mb_convert_encoding('', $encoding, 'UTF-8');
1081-
} catch (\Throwable $e) {
1082-
return false;
1083-
}
1084-
}
1085-
10861050
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
10871051
{
1088-
if (\PHP_VERSION_ID < 80400 || !$this->useHtml5Parser) {
1089-
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
1090-
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
1091-
} else {
1092-
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1093-
}
1094-
1095-
$internalErrors = libxml_use_internal_errors(true);
1096-
1097-
$dom = new \DOMDocument('1.0', $charset);
1098-
$dom->validateOnParse = true;
1099-
1100-
if ('' !== trim($htmlContent)) {
1101-
@$dom->loadHTML($htmlContent);
1102-
}
1103-
1104-
libxml_use_internal_errors($internalErrors);
1105-
1106-
return $dom;
1107-
}
1108-
11091052
$document = @\Dom\HTMLDocument::createFromString($htmlContent, \Dom\HTML_NO_DEFAULT_NS, $charset);
11101053
$htmlContent = $document->saveXml();
11111054
$charset = $document->inputEncoding;
@@ -1202,7 +1145,6 @@ private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes
12021145
$crawler->document = $this->document;
12031146
$crawler->namespaces = $this->namespaces;
12041147
$crawler->cachedNamespaces = $this->cachedNamespaces;
1205-
$crawler->html5Parser = $this->html5Parser;
12061148

12071149
return $crawler;
12081150
}
@@ -1219,39 +1161,6 @@ private function createCssSelectorConverter(): CssSelectorConverter
12191161
return new CssSelectorConverter($this->isHtml);
12201162
}
12211163

1222-
/**
1223-
* Parse string into DOMDocument object using HTML5 parser if the content is HTML5 and the library is available.
1224-
* Use libxml parser otherwise.
1225-
*/
1226-
private function parseHtmlString(string $content, string $charset): \DOMDocument
1227-
{
1228-
if ($this->canParseHtml5String($content)) {
1229-
return $this->parseHtml5($content, $charset);
1230-
}
1231-
1232-
return $this->parseXhtml($content, $charset);
1233-
}
1234-
1235-
private function canParseHtml5String(string $content): bool
1236-
{
1237-
if (!$this->html5Parser) {
1238-
return false;
1239-
}
1240-
1241-
if (false === $pos = stripos($content, '<!doctype html>')) {
1242-
return false;
1243-
}
1244-
1245-
$header = substr($content, 0, $pos);
1246-
1247-
return '' === $header || $this->isValidHtml5Heading($header);
1248-
}
1249-
1250-
private function isValidHtml5Heading(string $heading): bool
1251-
{
1252-
return 1 === preg_match('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u', $heading);
1253-
}
1254-
12551164
private function normalizeWhitespace(string $string): string
12561165
{
12571166
return trim(preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $string), " \n\r\t\x0C");

src/Symfony/Component/DomCrawler/Tests/CrawlerTestCase.php

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
namespace Symfony\Component\DomCrawler\Tests;
1313

1414
use PHPUnit\Framework\Attributes\DataProvider;
15-
use PHPUnit\Framework\Attributes\Group;
16-
use PHPUnit\Framework\Attributes\IgnoreDeprecations;
1715
use PHPUnit\Framework\Attributes\RequiresPhpExtension;
1816
use PHPUnit\Framework\Error\Notice;
1917
use PHPUnit\Framework\TestCase;
@@ -1329,22 +1327,6 @@ public function testAddXmlContentWithErrors()
13291327
libxml_use_internal_errors($internalErrors);
13301328
}
13311329

1332-
#[IgnoreDeprecations]
1333-
#[Group('legacy')]
1334-
public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
1335-
{
1336-
// Html who create a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
1337-
$html = '<!DOCTYPE html><html><body><h1><p>Foo</p></h1></body></html>';
1338-
1339-
$html5Crawler = new Crawler(null, null, null, true);
1340-
$html5Crawler->add($html);
1341-
1342-
$nativeCrawler = new Crawler(null, null, null, false);
1343-
$nativeCrawler->add($html);
1344-
1345-
$this->assertNotEquals($nativeCrawler->filterXPath('//h1')->text(), $html5Crawler->filterXPath('//h1')->text(), 'Native parser and Html5 parser must be different');
1346-
}
1347-
13481330
public function testAddHtml5()
13491331
{
13501332
// Ensure a bug specific to the DOM extension is fixed (see https://github.com/symfony/symfony/issues/28596)

src/Symfony/Component/DomCrawler/Tests/LegacyHtml5ParserCrawlerTest.php

Lines changed: 0 additions & 79 deletions
This file was deleted.

0 commit comments

Comments
 (0)