diff --git a/Cargo.lock b/Cargo.lock
index 4148d03a..4f83a3c4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,14 +4,15 @@ version = 3
[[package]]
name = "ahash"
-version = "0.8.3"
+version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
+checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
dependencies = [
"cfg-if",
"getrandom",
"once_cell",
"version_check",
+ "zerocopy",
]
[[package]]
@@ -28,15 +29,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
-version = "2.4.0"
+version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
+checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "byteorder"
-version = "1.4.3"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cfg-if"
@@ -64,7 +65,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -146,9 +147,9 @@ dependencies = [
[[package]]
name = "hashbrown"
-version = "0.14.1"
+version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12"
+checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156"
[[package]]
name = "html5ever"
@@ -166,9 +167,9 @@ dependencies = [
[[package]]
name = "indexmap"
-version = "2.0.2"
+version = "2.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897"
+checksum = "233cf39063f058ea2caae4091bf4a3ef70a653afbc026f5c4a4135d114e3c177"
dependencies = [
"equivalent",
"hashbrown",
@@ -182,15 +183,15 @@ checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
[[package]]
name = "libc"
-version = "0.2.148"
+version = "0.2.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b"
+checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
[[package]]
name = "lock_api"
-version = "0.4.10"
+version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
+checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45"
dependencies = [
"autocfg",
"scopeguard",
@@ -230,9 +231,9 @@ checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "once_cell"
-version = "1.18.0"
+version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "parking_lot"
@@ -246,9 +247,9 @@ dependencies = [
[[package]]
name = "parking_lot_core"
-version = "0.9.8"
+version = "0.9.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
+checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
dependencies = [
"cfg-if",
"libc",
@@ -316,7 +317,7 @@ dependencies = [
"phf_shared 0.11.2",
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -351,9 +352,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro2"
-version = "1.0.67"
+version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328"
+checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
dependencies = [
"unicode-ident",
]
@@ -399,9 +400,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
-version = "0.3.5"
+version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
@@ -414,7 +415,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
-version = "0.18.1"
+version = "0.19.0"
dependencies = [
"ahash",
"cssparser",
@@ -433,7 +434,7 @@ version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
dependencies = [
- "bitflags 2.4.0",
+ "bitflags 2.4.1",
"cssparser",
"derive_more",
"fxhash",
@@ -448,22 +449,22 @@ dependencies = [
[[package]]
name = "serde"
-version = "1.0.188"
+version = "1.0.190"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e"
+checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
-version = "1.0.188"
+version = "1.0.190"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
+checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -532,9 +533,9 @@ dependencies = [
[[package]]
name = "syn"
-version = "2.0.37"
+version = "2.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8"
+checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
dependencies = [
"proc-macro2",
"quote",
@@ -638,3 +639,23 @@ name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
+[[package]]
+name = "zerocopy"
+version = "0.7.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.7.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.38",
+]
diff --git a/Cargo.toml b/Cargo.toml
index 21ad948b..304f68d9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "scraper"
-version = "0.18.1"
+version = "0.19.0"
edition = "2021"
description = "HTML parsing and querying with CSS selectors"
@@ -19,8 +19,8 @@ html5ever = "0.26"
selectors = "0.25.0"
tendril = "0.4.3"
ahash = "0.8"
-indexmap = { version = "2.0.2", optional = true }
-once_cell = "1.0"
+indexmap = { version = "2.2.3", optional = true }
+once_cell = "1.19"
[dependencies.getopts]
version = "0.2.21"
diff --git a/src/element_ref/mod.rs b/src/element_ref/mod.rs
index 0485fca6..5461041f 100644
--- a/src/element_ref/mod.rs
+++ b/src/element_ref/mod.rs
@@ -1,10 +1,13 @@
//! Element references.
+use std::fmt;
+use std::iter::FusedIterator;
use std::ops::Deref;
use ego_tree::iter::{Edge, Traverse};
use ego_tree::NodeRef;
use html5ever::serialize::{serialize, SerializeOpts, TraversalScope};
+use selectors::NthIndexCache;
use crate::node::Element;
use crate::{Node, Selector};
@@ -46,6 +49,7 @@ impl<'a> ElementRef<'a> {
scope: *self,
inner,
selector,
+ nth_index_cache: NthIndexCache::default(),
}
}
@@ -81,6 +85,36 @@ impl<'a> ElementRef<'a> {
inner: self.traverse(),
}
}
+
+ /// Iterate over all child nodes which are elements
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # use scraper::Html;
+ /// let fragment = Html::parse_fragment("foo<span>bar</span><a>baz</a>qux");
+ ///
+ /// let children = fragment.root_element().child_elements().map(|element| element.value().name()).collect::<Vec<_>>();
+ /// assert_eq!(children, ["span", "a"]);
+ /// ```
+ pub fn child_elements(&self) -> impl Iterator- > {
+ self.children().filter_map(ElementRef::wrap)
+ }
+
+ /// Iterate over all descendent nodes which are elements
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # use scraper::Html;
+ /// let fragment = Html::parse_fragment("foo<span><b>bar</b></span><a><i>baz</i></a>qux");
+ ///
+ /// let descendants = fragment.root_element().descendent_elements().map(|element| element.value().name()).collect::<Vec<_>>();
+ /// assert_eq!(descendants, ["html", "span", "b", "a", "i"]);
+ /// ```
+ pub fn descendent_elements(&self) -> impl Iterator
- > {
+ self.descendants().filter_map(ElementRef::wrap)
+ }
}
impl<'a> Deref for ElementRef<'a> {
@@ -91,11 +125,33 @@ impl<'a> Deref for ElementRef<'a> {
}
/// Iterator over descendent elements matching a selector.
-#[derive(Debug, Clone)]
pub struct Select<'a, 'b> {
scope: ElementRef<'a>,
inner: Traverse<'a, Node>,
selector: &'b Selector,
+ nth_index_cache: NthIndexCache,
+}
+
+impl fmt::Debug for Select<'_, '_> {
+ fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fmt.debug_struct("Select")
+ .field("scope", &self.scope)
+ .field("inner", &self.inner)
+ .field("selector", &self.selector)
+ .field("nth_index_cache", &"..")
+ .finish()
+ }
+}
+
+impl Clone for Select<'_, '_> {
+ fn clone(&self) -> Self {
+ Self {
+ scope: self.scope,
+ inner: self.inner.clone(),
+ selector: self.selector,
+ nth_index_cache: NthIndexCache::default(),
+ }
+ }
}
impl<'a, 'b> Iterator for Select<'a, 'b> {
@@ -105,7 +161,11 @@ impl<'a, 'b> Iterator for Select<'a, 'b> {
for edge in &mut self.inner {
if let Edge::Open(node) = edge {
if let Some(element) = ElementRef::wrap(node) {
- if self.selector.matches_with_scope(&element, Some(self.scope)) {
+ if self.selector.matches_with_scope_and_cache(
+ &element,
+ Some(self.scope),
+ &mut self.nth_index_cache,
+ ) {
return Some(element);
}
}
@@ -115,6 +175,8 @@ impl<'a, 'b> Iterator for Select<'a, 'b> {
}
}
+impl FusedIterator for Select<'_, '_> {}
+
/// Iterator over descendent text nodes.
#[derive(Debug, Clone)]
pub struct Text<'a> {
@@ -136,6 +198,8 @@ impl<'a> Iterator for Text<'a> {
}
}
+impl FusedIterator for Text<'_> {}
+
mod element;
mod serializable;
diff --git a/src/html/mod.rs b/src/html/mod.rs
index 26ec5ea0..5178149c 100644
--- a/src/html/mod.rs
+++ b/src/html/mod.rs
@@ -2,13 +2,15 @@
#[cfg(feature = "errors")]
use std::borrow::Cow;
+use std::fmt;
+use std::iter::FusedIterator;
use ego_tree::iter::Nodes;
use ego_tree::Tree;
use html5ever::serialize::SerializeOpts;
use html5ever::tree_builder::QuirksMode;
-use html5ever::QualName;
-use html5ever::{driver, serialize};
+use html5ever::{driver, serialize, QualName};
+use selectors::NthIndexCache;
use tendril::TendrilSink;
use crate::selector::Selector;
@@ -93,6 +95,7 @@ impl Html {
Select {
inner: self.tree.nodes(),
selector,
+ nth_index_cache: NthIndexCache::default(),
}
}
@@ -121,10 +124,30 @@ impl Html {
}
/// Iterator over elements matching a selector.
-#[derive(Debug)]
pub struct Select<'a, 'b> {
inner: Nodes<'a, Node>,
selector: &'b Selector,
+ nth_index_cache: NthIndexCache,
+}
+
+impl fmt::Debug for Select<'_, '_> {
+ fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fmt.debug_struct("Select")
+ .field("inner", &self.inner)
+ .field("selector", &self.selector)
+ .field("nth_index_cache", &"..")
+ .finish()
+ }
+}
+
+impl Clone for Select<'_, '_> {
+ fn clone(&self) -> Self {
+ Self {
+ inner: self.inner.clone(),
+ selector: self.selector,
+ nth_index_cache: NthIndexCache::default(),
+ }
+ }
}
impl<'a, 'b> Iterator for Select<'a, 'b> {
@@ -133,7 +156,13 @@ impl<'a, 'b> Iterator for Select<'a, 'b> {
fn next(&mut self) -> Option> {
for node in self.inner.by_ref() {
if let Some(element) = ElementRef::wrap(node) {
- if element.parent().is_some() && self.selector.matches(&element) {
+ if element.parent().is_some()
+ && self.selector.matches_with_scope_and_cache(
+ &element,
+ None,
+ &mut self.nth_index_cache,
+ )
+ {
return Some(element);
}
}
@@ -152,7 +181,13 @@ impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
fn next_back(&mut self) -> Option {
for node in self.inner.by_ref().rev() {
if let Some(element) = ElementRef::wrap(node) {
- if element.parent().is_some() && self.selector.matches(&element) {
+ if element.parent().is_some()
+ && self.selector.matches_with_scope_and_cache(
+ &element,
+ None,
+ &mut self.nth_index_cache,
+ )
+ {
return Some(element);
}
}
@@ -161,6 +196,8 @@ impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
}
}
+impl FusedIterator for Select<'_, '_> {}
+
mod serializable;
mod tree_sink;
diff --git a/src/lib.rs b/src/lib.rs
index c000283f..7462cb79 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -149,6 +149,7 @@ pub mod element_ref;
pub mod error;
pub mod html;
pub mod node;
+pub mod selectable;
pub mod selector;
#[cfg(feature = "atomic")]
diff --git a/src/selectable.rs b/src/selectable.rs
new file mode 100644
index 00000000..f77a9976
--- /dev/null
+++ b/src/selectable.rs
@@ -0,0 +1,70 @@
+//! Provides the [`Selectable`] trait to abstract over collections of elements
+
+use crate::{
+ element_ref::{self, ElementRef},
+ html::{self, Html},
+ selector::Selector,
+};
+
+/// Trait to abstract over collections of elements to which a [CSS selector][Selector] can be applied
+///
+/// This mainly enables writing helper functions which are generic over [`Html`] and [`ElementRef`], e.g.
+///
+/// ```
+/// use scraper::{selectable::Selectable, selector::Selector};
+///
+/// fn text_of_first_match<'a, S>(selectable: S, selector: &Selector) -> Option<String>
+/// where
+/// S: Selectable<'a>,
+/// {
+/// selectable.select(selector).next().map(|element| element.text().collect())
+/// }
+/// ```
+pub trait Selectable<'a> {
+ /// Iterator over [element references][ElementRef] matching a [CSS selector][Selector]
+ type Select<'b>: Iterator
- >;
+
+ /// Applies the given `selector` to the collection of elements represented by `self`
+ fn select(self, selector: &Selector) -> Self::Select<'_>;
+}
+
+impl<'a> Selectable<'a> for &'a Html {
+ type Select<'b> = html::Select<'a, 'b>;
+
+ fn select(self, selector: &Selector) -> Self::Select<'_> {
+ Html::select(self, selector)
+ }
+}
+
+impl<'a> Selectable<'a> for ElementRef<'a> {
+ type Select<'b> = element_ref::Select<'a, 'b>;
+
+ fn select(self, selector: &Selector) -> Self::Select<'_> {
+ ElementRef::select(&self, selector)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn select_one<'a, S>(selectable: S, selector: &Selector) -> Option>
+ where
+ S: Selectable<'a>,
+ {
+ selectable.select(selector).next()
+ }
+
+ #[test]
+ fn html_and_element_ref_are_selectable() {
+ let fragment = Html::parse_fragment(
+ r#""#,
+ );
+
+ let selector = Selector::parse("select.foo").unwrap();
+ let element = select_one(&fragment, &selector).unwrap();
+
+ let selector = Selector::parse("select.foo option[value='bar']").unwrap();
+ let _element = select_one(element, &selector).unwrap();
+ }
+}
diff --git a/src/selector.rs b/src/selector.rs
index 68f880cc..7ef13f2f 100644
--- a/src/selector.rs
+++ b/src/selector.rs
@@ -8,6 +8,7 @@ use html5ever::{LocalName, Namespace};
use selectors::{
matching,
parser::{self, ParseRelative, SelectorList, SelectorParseErrorKind},
+ NthIndexCache,
};
use crate::error::SelectorErrorKind;
@@ -42,11 +43,22 @@ impl Selector {
/// The optional `scope` argument is used to specify which element has `:scope` pseudo-class.
/// When it is `None`, `:scope` will match the root element.
pub fn matches_with_scope(&self, element: &ElementRef, scope: Option) -> bool {
- let mut nth_index_cache = Default::default();
+ self.matches_with_scope_and_cache(element, scope, &mut NthIndexCache::default())
+ }
+
+ // The `nth_index_cache` must not be used after `self` is dropped
+ // to avoid incorrect results (even though no undefined behaviour is possible)
+ // due to the usage of selector memory addresses as cache keys.
+ pub(crate) fn matches_with_scope_and_cache(
+ &self,
+ element: &ElementRef,
+ scope: Option,
+ nth_index_cache: &mut NthIndexCache,
+ ) -> bool {
let mut context = matching::MatchingContext::new(
matching::MatchingMode::Normal,
None,
- &mut nth_index_cache,
+ nth_index_cache,
matching::QuirksMode::NoQuirks,
matching::NeedsSelectorFlags::No,
matching::IgnoreNthChildForInvalidation::No,