Skip to content

[DomCrawler] Improve Crawler HTML5 parser need detection #30892

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions src/Symfony/Component/DomCrawler/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,24 +61,15 @@ class Crawler implements \Countable, \IteratorAggregate
private $html5Parser;

/**
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
*/
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
public function __construct($node = null, string $uri = null, string $baseHref = null)
{
$this->uri = $uri;
$this->baseHref = $baseHref ?: $uri;

if ($useHtml5Parser && !class_exists(HTML5::class)) {
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
}

if ($useHtml5Parser ?? class_exists(HTML5::class)) {
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
}

$this->add($node);
}

Expand Down Expand Up @@ -198,6 +189,13 @@ public function addContent($content, $type = null)
*/
public function addHtmlContent($content, $charset = 'UTF-8')
{
// Use HTML5 parser if the content is HTML5 and the library is available
if (!$this->html5Parser
&& class_exists(HTML5::class)
&& '<!doctype html>' === strtolower(substr(ltrim($content), 0, 15))) {
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
}

$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$this->addDocument($dom);

Expand Down Expand Up @@ -1219,6 +1217,7 @@ private function createSubCrawler($nodes)
$crawler->isHtml = $this->isHtml;
$crawler->document = $this->document;
$crawler->namespaces = $this->namespaces;
$crawler->html5Parser = $this->html5Parser;

return $crawler;
}
Expand Down
73 changes: 33 additions & 40 deletions src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,12 @@

abstract class AbstractCrawlerTest extends TestCase
{
/**
* @param mixed $node
* @param string|null $uri
* @param string|null $baseHref
*
* @return Crawler
*/
abstract public function createCrawler($node = null, string $uri = null, string $baseHref = null);
abstract public function getDoctype(): string;

/**
 * Builds the Crawler instance used by the test cases.
 *
 * All three arguments are forwarded unchanged to the Crawler constructor.
 *
 * @param mixed       $node     Value passed as the Crawler's initial node
 * @param string|null $uri      Value passed as the Crawler's URI
 * @param string|null $baseHref Value passed as the Crawler's base href
 *
 * @return Crawler
 */
protected function createCrawler($node = null, string $uri = null, string $baseHref = null)
{
    $crawler = new Crawler($node, $uri, $baseHref);

    return $crawler;
}

public function testConstructor()
{
Expand Down Expand Up @@ -74,7 +72,7 @@ public function testAdd()
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNode');

$crawler = $this->createCrawler();
$crawler->add('<html><body>Foo</body></html>');
$crawler->add($this->getDoctype().'<html><body>Foo</body></html>');
$this->assertEquals('Foo', $crawler->filterXPath('//body')->text(), '->add() adds nodes from a string');
}

Expand All @@ -94,22 +92,21 @@ public function testAddInvalidType()
public function testAddMultipleDocumentNode()
{
$crawler = $this->createTestCrawler();
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');
}

public function testAddHtmlContent()
{
$crawler = $this->createCrawler();
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');

$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addHtmlContent() adds nodes from an HTML string');
}

public function testAddHtmlContentWithBaseTag()
{
$crawler = $this->createCrawler();

$crawler->addHtmlContent('<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');

$this->assertEquals('http://symfony.com', $crawler->filterXPath('//base')->attr('href'), '->addHtmlContent() adds nodes from an HTML string');
$this->assertEquals('http://symfony.com/contact', $crawler->filterXPath('//a')->link()->getUri(), '->addHtmlContent() adds nodes from an HTML string');
Expand All @@ -121,15 +118,15 @@ public function testAddHtmlContentWithBaseTag()
public function testAddHtmlContentCharset()
{
$crawler = $this->createCrawler();
$crawler->addHtmlContent('<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</html>', 'UTF-8');

$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
}

public function testAddHtmlContentInvalidBaseTag()
{
$crawler = $this->createCrawler(null, 'http://symfony.com');
$crawler->addHtmlContent('<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');

$this->assertEquals('http://symfony.com/contact', current($crawler->filterXPath('//a')->links())->getUri(), '->addHtmlContent() correctly handles a non-existent base tag href attribute');
}
Expand All @@ -141,55 +138,55 @@ public function testAddHtmlContentCharsetGbk()
{
$crawler = $this->createCrawler();
// GBK-encoded bytes of <html><p>中文</p></html>
$crawler->addHtmlContent(base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
$crawler->addHtmlContent($this->getDoctype().base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');

$this->assertEquals('中文', $crawler->filterXPath('//p')->text());
}

public function testAddXmlContent()
{
$crawler = $this->createCrawler();
$crawler->addXmlContent('<html><div class="foo"></div></html>', 'UTF-8');
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo"></div></html>', 'UTF-8');

$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string');
}

public function testAddXmlContentCharset()
{
$crawler = $this->createCrawler();
$crawler->addXmlContent('<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');

$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
}

public function testAddContent()
{
$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string with extended content type');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() uses text/html as the default type');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');

$crawler = $this->createCrawler();
$crawler->addContent('foo bar', 'text/plain');
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');

$crawler = $this->createCrawler();
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
}

Expand All @@ -199,7 +196,7 @@ public function testAddContent()
public function testAddContentNonUtf8()
{
$crawler = $this->createCrawler();
$crawler->addContent(iconv('UTF-8', 'SJIS', '<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
$crawler->addContent(iconv('UTF-8', 'SJIS', $this->getDoctype().'<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
}

Expand Down Expand Up @@ -314,7 +311,7 @@ public function testAttr()
public function testMissingAttrValueIsNull()
{
$crawler = $this->createCrawler();
$crawler->addContent('<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
$crawler->addContent($this->getDoctype().'<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
$div = $crawler->filterXPath('//div');

$this->assertEquals('sample value', $div->attr('non-empty-attr'), '->attr() reads non-empty attributes correctly');
Expand Down Expand Up @@ -670,7 +667,6 @@ public function testSelectButton()
public function testSelectButtonWithSingleQuotesInNameAttribute()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="action">
Expand All @@ -683,15 +679,14 @@ public function testSelectButtonWithSingleQuotesInNameAttribute()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);

$this->assertCount(1, $crawler->selectButton('Click \'Here\''));
}

public function testSelectButtonWithDoubleQuotesInNameAttribute()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="action">
Expand All @@ -704,7 +699,7 @@ public function testSelectButtonWithDoubleQuotesInNameAttribute()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);

$this->assertCount(1, $crawler->selectButton('Click "Here"'));
}
Expand Down Expand Up @@ -763,7 +758,6 @@ public function testImage()
public function testSelectLinkAndLinkFiltered()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="action">
Expand All @@ -776,7 +770,7 @@ public function testSelectLinkAndLinkFiltered()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");

$this->assertCount(0, $filtered->selectLink('Login'));
Expand All @@ -793,7 +787,7 @@ public function testSelectLinkAndLinkFiltered()

public function testChaining()
{
$crawler = $this->createCrawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
$crawler = $this->createCrawler($this->getDoctype().'<div name="a"><div name="b"><div name="c"></div></div></div>');

$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
}
Expand Down Expand Up @@ -965,7 +959,6 @@ public function testChildren()
public function testFilteredChildren()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="foo">
Expand All @@ -981,7 +974,7 @@ public function testFilteredChildren()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);
$foo = $crawler->filter('#foo');

$this->assertEquals(3, $foo->children()->count());
Expand Down Expand Up @@ -1018,7 +1011,7 @@ public function testParents()
*/
public function testBaseTag($baseValue, $linkValue, $expectedUri, $currentUri = null, $description = '')
{
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
$this->assertEquals($expectedUri, $crawler->filterXPath('//a')->link()->getUri(), $description);
}

Expand All @@ -1038,7 +1031,7 @@ public function getBaseTagData()
*/
public function testBaseTagWithForm($baseValue, $actionValue, $expectedUri, $currentUri = null, $description = null)
{
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
$this->assertEquals($expectedUri, $crawler->filterXPath('//button')->form()->getUri(), $description);
}

Expand Down Expand Up @@ -1113,7 +1106,7 @@ public function testEvaluateThrowsAnExceptionIfDocumentIsEmpty()
public function testInheritedClassCallChildrenWithoutArgument()
{
$dom = new \DOMDocument();
$dom->loadHTML('
$dom->loadHTML($this->getDoctype().'
<html>
<body>
<a href="foo">Foo</a>
Expand Down Expand Up @@ -1165,15 +1158,15 @@ public function testInheritedClassCallChildrenWithoutArgument()
public function testAddHtmlContentUnsupportedCharset()
{
$crawler = $this->createCrawler();
$crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
$crawler->addHtmlContent($this->getDoctype().file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');

$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
}

public function createTestCrawler($uri = null)
{
$dom = new \DOMDocument();
$dom->loadHTML('
$dom->loadHTML($this->getDoctype().'
<html>
<body>
<a href="foo">Foo</a>
Expand Down
14 changes: 10 additions & 4 deletions src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,18 @@

namespace Symfony\Component\DomCrawler\Tests;

use Symfony\Component\DomCrawler\Crawler;

class Html5ParserCrawlerTest extends AbstractCrawlerTest
{
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
public function getDoctype(): string
{
return '<!DOCTYPE html>';
}

public function testAddHtml5()
{
return new Crawler($node, $uri, $baseHref, true);
// Ensure a bug specific to the DOM extension is fixed (see https://github.com/symfony/symfony/issues/28596)
$crawler = $this->createCrawler();
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,11 @@

namespace Symfony\Component\DomCrawler\Tests;

use Symfony\Component\DomCrawler\Crawler;

class NativeParserCrawlerTest extends AbstractCrawlerTest
{
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
public function getDoctype(): string
{
return new Crawler($node, $uri, $baseHref, false);
return '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
}

public function testAddHtmlContentWithErrors()
Expand All @@ -26,7 +24,7 @@ public function testAddHtmlContentWithErrors()

$crawler = $this->createCrawler();
$crawler->addHtmlContent(<<<'EOF'
<!DOCTYPE html>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
</head>
Expand All @@ -51,7 +49,7 @@ public function testAddXmlContentWithErrors()

$crawler = $this->createCrawler();
$crawler->addXmlContent(<<<'EOF'
<!DOCTYPE html>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
</head>
Expand Down