Skip to content

[DomCrawler] Added auto-discovery and explicit registration of namespaces in filter() and filterXPath() #6650

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 25, 2013
9 changes: 9 additions & 0 deletions src/Symfony/Component/DomCrawler/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
CHANGELOG
=========

2.4.0
-----

* added support for automatic discovery and explicit registration of document
namespaces for `Crawler::filterXPath()` and `Crawler::filter()`
* improved content type guessing in `Crawler::addContent()`
* [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document
namespace

2.3.0
-----

Expand Down
96 changes: 91 additions & 5 deletions src/Symfony/Component/DomCrawler/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@ class Crawler extends \SplObjectStorage
*/
protected $uri;

/**
 * @var string The prefix that XPath and CSS expressions use to refer to the
 *             document's default (unprefixed) namespace
 */
private $defaultNamespacePrefix = 'default';

/**
 * @var array A map of manually registered namespaces (prefix => namespace);
 *            these are consulted before the document is searched for a declaration
 */
private $namespaces = array();

/**
* Constructor.
*
Expand Down Expand Up @@ -92,7 +102,7 @@ public function add($node)
public function addContent($content, $type = null)
{
if (empty($type)) {
$type = 'text/html';
$type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
}

// DOM only for HTML/XML content
Expand Down Expand Up @@ -195,9 +205,7 @@ public function addXmlContent($content, $charset = 'UTF-8')

$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;

// remove the default namespace to make XPath expressions simpler
@$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
@$dom->loadXML($content, LIBXML_NONET);

libxml_use_internal_errors($current);
libxml_disable_entity_loader($disableEntities);
Expand Down Expand Up @@ -579,7 +587,8 @@ public function filterXPath($xpath)
$root->appendChild($document->importNode($node, true));
}

$domxpath = new \DOMXPath($document);
$prefixes = $this->findNamespacePrefixes($xpath);
$domxpath = $this->createDOMXPath($document, $prefixes);

return new static($domxpath->query($xpath), $this->uri);
}
Expand Down Expand Up @@ -709,6 +718,25 @@ public function form(array $values = null, $method = null)
return $form;
}

/**
 * Overrides the prefix that stands for the document's default namespace in
 * XPath and CSS expressions.
 *
 * Until this is called the prefix is "default".
 *
 * @param string $prefix The prefix to use from now on
 */
public function setDefaultNamespacePrefix($prefix)
{
$this->defaultNamespacePrefix = $prefix;
}

/**
 * Explicitly registers a namespace under the given prefix.
 *
 * Registering the same prefix again replaces the previous entry.
 *
 * @param string $prefix    The prefix to be used in XPath and CSS expressions
 * @param string $namespace The namespace the prefix stands for
 */
public function registerNamespace($prefix, $namespace)
{
$this->namespaces[$prefix] = $namespace;
}

/**
* Converts string for XPath expressions.
*
Expand Down Expand Up @@ -792,4 +820,62 @@ protected function sibling($node, $siblingDir = 'nextSibling')

return $nodes;
}

/**
 * Builds a DOMXPath for the document with each given prefix bound to its namespace.
 *
 * @param \DOMDocument $document The document the DOMXPath will query
 * @param array        $prefixes The namespace prefixes to register
 *
 * @return \DOMXPath
 *
 * @throws \InvalidArgumentException When a prefix cannot be resolved to a namespace
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $name) {
        // the namespace is resolved first, so an unknown prefix throws before anything is registered for it
        $xpathEngine->registerNamespace($name, $this->discoverNamespace($xpathEngine, $name));
    }

    return $xpathEngine;
}

/**
 * Resolves a prefix to a namespace, preferring manual registrations over
 * declarations found in the document.
 *
 * @param \DOMXPath $domxpath The DOMXPath used to search the document
 * @param string    $prefix   The prefix to resolve
 *
 * @return string
 *
 * @throws \InvalidArgumentException When no namespace is known for the prefix
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    // the default-namespace prefix is looked up under an empty name
    $declaredName = $prefix;
    if ($this->defaultNamespacePrefix === $prefix) {
        $declaredName = '';
    }

    // ask for one namespace, otherwise we'd get a collection with an item for each node
    $expression = sprintf('(//namespace::*[name()="%s"])[last()]', $declaredName);
    $match = $domxpath->query($expression)->item(0);

    if (!$match) {
        throw new \InvalidArgumentException(sprintf('Could not find a namespace for the prefix: "%s"', $prefix));
    }

    return $match->nodeValue;
}

/**
 * Collects the namespace prefixes used in an XPath expression.
 *
 * String literals are left out of the scan, so text such as
 * "http://example.com" inside a predicate is not mistaken for a prefix.
 *
 * @param string $xpath The XPath expression to inspect
 *
 * @return array The distinct prefixes, in order of first appearance
 */
private function findNamespacePrefixes($xpath)
{
    // XPath 1.0 literals have no escape sequences: a literal is whatever sits
    // between a pair of matching quotes, so it can be blanked out safely
    $withoutLiterals = preg_replace('/"[^"]*"|\'[^\']*\'/', '""', $xpath);

    if (null === $withoutLiterals) {
        // preg_replace() failed; fall back to scanning the raw expression
        $withoutLiterals = $xpath;
    }

    // a prefix is a name followed by a single colon; "::" belongs to an axis
    if (preg_match_all('/(?P<prefix>[a-zA-Z_][a-zA-Z_0-9\-\.]*):[^:]/', $withoutLiterals, $matches)) {
        return array_values(array_unique($matches['prefix']));
    }

    return array();
}
}
90 changes: 90 additions & 0 deletions src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

namespace Symfony\Component\DomCrawler\Tests;

use Symfony\Component\CssSelector\CssSelector;
use Symfony\Component\DomCrawler\Crawler;

class CrawlerTest extends \PHPUnit_Framework_TestCase
Expand Down Expand Up @@ -370,6 +371,55 @@ public function testFilterXPath()
$this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression');
}

public function testFilterXPathWithDefaultNamespace()
{
    // "default" is the stock prefix for the document's default namespace
    $feed = $this->createTestXmlCrawler();
    $ids = $feed->filterXPath('//default:entry/default:id');

    $this->assertCount(1, $ids, '->filterXPath() automatically registers a namespace');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $ids->text());
}

public function testFilterXPathWithCustomDefaultNamespace()
{
    $feed = $this->createTestXmlCrawler();
    // replace the stock "default" prefix with "x" before querying
    $feed->setDefaultNamespacePrefix('x');
    $ids = $feed->filterXPath('//x:entry/x:id');

    $this->assertCount(1, $ids, '->filterXPath() lets to override the default namespace prefix');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $ids->text());
}

public function testFilterXPathWithNamespace()
{
    // the fixture holds two yt:accessControl elements
    $feed = $this->createTestXmlCrawler();
    $accessControls = $feed->filterXPath('//yt:accessControl');

    $this->assertCount(2, $accessControls, '->filterXPath() automatically registers a namespace');
}

public function testFilterXPathWithMultipleNamespaces()
{
    // one expression mixing two different prefixes
    $feed = $this->createTestXmlCrawler();
    $ratio = $feed->filterXPath('//media:group/yt:aspectRatio');

    $this->assertCount(1, $ratio, '->filterXPath() automatically registers multiple namespaces');
    $this->assertSame('widescreen', $ratio->text());
}

/**
* @expectedException \InvalidArgumentException
* @expectedExceptionMessage Could not find a namespace for the prefix: "foo"
*/
public function testFilterXPathWithAnInvalidNamespace()
{
    $feed = $this->createTestXmlCrawler();

    // "foo" is neither declared in the fixture nor registered manually
    $feed->filterXPath('//media:group/foo:aspectRatio');
}

public function testFilterXPathWithManuallyRegisteredNamespace()
{
    $feed = $this->createTestXmlCrawler();
    // "m" is not declared in the fixture; it is bound by hand to the namespace the fixture declares as "media"
    $feed->registerNamespace('m', 'http://search.yahoo.com/mrss/');

    $ratio = $feed->filterXPath('//m:group/yt:aspectRatio');

    $this->assertCount(1, $ratio, '->filterXPath() uses manually registered namespace');
    $this->assertSame('widescreen', $ratio->text());
}

/**
* @covers Symfony\Component\DomCrawler\Crawler::filter
*/
Expand All @@ -384,6 +434,30 @@ public function testFilter()
$this->assertCount(6, $crawler->filter('li'), '->filter() filters the node list with the CSS selector');
}

public function testFilterWithDefaultNamespace()
{
    // CSS counterpart of the default-namespace XPath test: "prefix|name" selectors
    $feed = $this->createTestXmlCrawler();
    $ids = $feed->filter('default|entry default|id');

    $this->assertCount(1, $ids, '->filter() automatically registers namespaces');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $ids->text());
}

public function testFilterWithNamespace()
{
    // NOTE(review): this is a static switch and nothing here turns it back on;
    // confirm it cannot leak into tests that run afterwards
    CssSelector::disableHtmlExtension();

    $feed = $this->createTestXmlCrawler();
    $accessControls = $feed->filter('yt|accessControl');

    $this->assertCount(2, $accessControls, '->filter() automatically registers namespaces');
}

public function testFilterWithMultipleNamespaces()
{
    // NOTE(review): this is a static switch and nothing here turns it back on;
    // confirm it cannot leak into tests that run afterwards
    CssSelector::disableHtmlExtension();

    $feed = $this->createTestXmlCrawler();
    $ratio = $feed->filter('media|group yt|aspectRatio');

    $this->assertCount(1, $ratio, '->filter() automatically registers namespaces');
    $this->assertSame('widescreen', $ratio->text());
}

public function testSelectLink()
{
$crawler = $this->createTestCrawler();
Expand Down Expand Up @@ -656,6 +730,22 @@ public function createTestCrawler($uri = null)
return new Crawler($dom, $uri);
}

/**
 * Creates a crawler over a small Atom entry used by the namespace tests.
 *
 * The XML declares a default namespace plus the "media" and "yt" prefixes.
 *
 * @param string $uri Passed through as the second argument of the Crawler constructor
 *
 * @return Crawler
 */
protected function createTestXmlCrawler($uri = null)
{
$xml = '<?xml version="1.0" encoding="UTF-8"?>
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" xmlns:yt="http://gdata.youtube.com/schemas/2007">
<id>tag:youtube.com,2008:video:kgZRZmEc9j4</id>
<yt:accessControl action="comment" permission="allowed"/>
<yt:accessControl action="videoRespond" permission="moderated"/>
<media:group>
<media:title type="plain">Chordates - CrashCourse Biology #24</media:title>
<yt:aspectRatio>widescreen</yt:aspectRatio>
</media:group>
</entry>';

return new Crawler($xml, $uri);
}

protected function createDomDocument()
{
$dom = new \DOMDocument();
Expand Down