11
11
12
12
namespace Symfony \Component \DomCrawler ;
13
13
14
- use Masterminds \HTML5 ;
15
14
use Symfony \Component \CssSelector \CssSelectorConverter ;
16
15
17
16
/**
@@ -53,23 +52,15 @@ class Crawler implements \Countable, \IteratorAggregate
53
52
*/
54
53
private bool $ isHtml = true ;
55
54
56
- private ?HTML5 $ html5Parser = null ;
57
-
58
55
/**
59
56
* @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
60
57
*/
61
58
public function __construct (
62
59
\DOMNodeList |\DOMNode |array |string |null $ node = null ,
63
60
protected ?string $ uri = null ,
64
61
?string $ baseHref = null ,
65
- private bool $ useHtml5Parser = true ,
66
62
) {
67
- if (\PHP_VERSION_ID >= 80400 && !$ useHtml5Parser ) {
68
- trigger_deprecation ('symfony/dom-crawler ' , '7.4 ' , 'Disabling HTML5 parsing is deprecated. Symfony 8 will unconditionally use the native HTML5 parser. ' );
69
- }
70
-
71
63
$ this ->baseHref = $ baseHref ?: $ uri ;
72
- $ this ->html5Parser = \PHP_VERSION_ID < 80400 && $ useHtml5Parser ? new HTML5 (['disable_html_ns ' => true ]) : null ;
73
64
$ this ->cachedNamespaces = new \ArrayObject ();
74
65
75
66
$ this ->add ($ node );
@@ -175,7 +166,7 @@ public function addContent(string $content, ?string $type = null): void
175
166
*/
176
167
public function addHtmlContent (string $ content , string $ charset = 'UTF-8 ' ): void
177
168
{
178
- $ dom = $ this ->parseHtmlString ($ content , $ charset );
169
+ $ dom = $ this ->parseXhtml ($ content , $ charset );
179
170
$ this ->addDocument ($ dom );
180
171
181
172
$ base = $ this ->filterRelativeXPath ('descendant-or-self::base ' )->extract (['href ' ]);
@@ -609,10 +600,6 @@ public function html(?string $default = null): string
609
600
$ node = $ this ->getNode (0 );
610
601
$ owner = $ node ->ownerDocument ;
611
602
612
- if ($ this ->html5Parser && '<!DOCTYPE html> ' === $ owner ->saveXML ($ owner ->childNodes [0 ])) {
613
- $ owner = $ this ->html5Parser ;
614
- }
615
-
616
603
$ html = '' ;
617
604
foreach ($ node ->childNodes as $ child ) {
618
605
$ html .= $ owner ->saveHTML ($ child );
@@ -630,10 +617,6 @@ public function outerHtml(): string
630
617
$ node = $ this ->getNode (0 );
631
618
$ owner = $ node ->ownerDocument ;
632
619
633
- if ($ this ->html5Parser && '<!DOCTYPE html> ' === $ owner ->saveXML ($ owner ->childNodes [0 ])) {
634
- $ owner = $ this ->html5Parser ;
635
- }
636
-
637
620
return $ owner ->saveHTML ($ node );
638
621
}
639
622
@@ -1064,48 +1047,8 @@ protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'):
1064
1047
return $ nodes ;
1065
1048
}
1066
1049
1067
- private function parseHtml5 (string $ htmlContent , string $ charset = 'UTF-8 ' ): \DOMDocument
1068
- {
1069
- if (!$ this ->supportsEncoding ($ charset )) {
1070
- $ htmlContent = $ this ->convertToHtmlEntities ($ htmlContent , $ charset );
1071
- $ charset = 'UTF-8 ' ;
1072
- }
1073
-
1074
- return $ this ->html5Parser ->parse ($ htmlContent , ['encoding ' => $ charset ]);
1075
- }
1076
-
1077
- private function supportsEncoding (string $ encoding ): bool
1078
- {
1079
- try {
1080
- return '' === @mb_convert_encoding ('' , $ encoding , 'UTF-8 ' );
1081
- } catch (\Throwable $ e ) {
1082
- return false ;
1083
- }
1084
- }
1085
-
1086
1050
private function parseXhtml (string $ htmlContent , string $ charset = 'UTF-8 ' ): \DOMDocument
1087
1051
{
1088
- if (\PHP_VERSION_ID < 80400 || !$ this ->useHtml5Parser ) {
1089
- if ('UTF-8 ' === $ charset && preg_match ('//u ' , $ htmlContent )) {
1090
- $ htmlContent = '<?xml encoding="UTF-8"> ' .$ htmlContent ;
1091
- } else {
1092
- $ htmlContent = $ this ->convertToHtmlEntities ($ htmlContent , $ charset );
1093
- }
1094
-
1095
- $ internalErrors = libxml_use_internal_errors (true );
1096
-
1097
- $ dom = new \DOMDocument ('1.0 ' , $ charset );
1098
- $ dom ->validateOnParse = true ;
1099
-
1100
- if ('' !== trim ($ htmlContent )) {
1101
- @$ dom ->loadHTML ($ htmlContent );
1102
- }
1103
-
1104
- libxml_use_internal_errors ($ internalErrors );
1105
-
1106
- return $ dom ;
1107
- }
1108
-
1109
1052
$ document = @\Dom \HTMLDocument::createFromString ($ htmlContent , \Dom \HTML_NO_DEFAULT_NS , $ charset );
1110
1053
$ htmlContent = $ document ->saveXml ();
1111
1054
$ charset = $ document ->inputEncoding ;
@@ -1202,7 +1145,6 @@ private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes
1202
1145
$ crawler ->document = $ this ->document ;
1203
1146
$ crawler ->namespaces = $ this ->namespaces ;
1204
1147
$ crawler ->cachedNamespaces = $ this ->cachedNamespaces ;
1205
- $ crawler ->html5Parser = $ this ->html5Parser ;
1206
1148
1207
1149
return $ crawler ;
1208
1150
}
@@ -1219,39 +1161,6 @@ private function createCssSelectorConverter(): CssSelectorConverter
1219
1161
return new CssSelectorConverter ($ this ->isHtml );
1220
1162
}
1221
1163
1222
- /**
1223
- * Parse string into DOMDocument object using HTML5 parser if the content is HTML5 and the library is available.
1224
- * Use libxml parser otherwise.
1225
- */
1226
- private function parseHtmlString (string $ content , string $ charset ): \DOMDocument
1227
- {
1228
- if ($ this ->canParseHtml5String ($ content )) {
1229
- return $ this ->parseHtml5 ($ content , $ charset );
1230
- }
1231
-
1232
- return $ this ->parseXhtml ($ content , $ charset );
1233
- }
1234
-
1235
- private function canParseHtml5String (string $ content ): bool
1236
- {
1237
- if (!$ this ->html5Parser ) {
1238
- return false ;
1239
- }
1240
-
1241
- if (false === $ pos = stripos ($ content , '<!doctype html> ' )) {
1242
- return false ;
1243
- }
1244
-
1245
- $ header = substr ($ content , 0 , $ pos );
1246
-
1247
- return '' === $ header || $ this ->isValidHtml5Heading ($ header );
1248
- }
1249
-
1250
- private function isValidHtml5Heading (string $ heading ): bool
1251
- {
1252
- return 1 === preg_match ('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u ' , $ heading );
1253
- }
1254
-
1255
1164
private function normalizeWhitespace (string $ string ): string
1256
1165
{
1257
1166
return trim (preg_replace ("/(?:[ \n\r\t\x0C]{2,}+|[ \n\r\t\x0C])/ " , ' ' , $ string ), " \n\r\t\x0C" );
0 commit comments