diff --git a/Cargo.lock b/Cargo.lock index 4148d03a..4f83a3c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,14 +4,15 @@ version = 3 [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -28,15 +29,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cfg-if" @@ -64,7 +65,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -146,9 +147,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12" +checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156" [[package]] name = "html5ever" @@ -166,9 +167,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.2" +version = "2.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897" +checksum = "233cf39063f058ea2caae4091bf4a3ef70a653afbc026f5c4a4135d114e3c177" dependencies = [ "equivalent", "hashbrown", @@ -182,15 +183,15 @@ checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "libc" -version = "0.2.148" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -230,9 +231,9 @@ checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "parking_lot" @@ -246,9 +247,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", @@ -316,7 +317,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -351,9 +352,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" -version = 
"1.0.67" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" dependencies = [ "unicode-ident", ] @@ -399,9 +400,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ "bitflags 1.3.2", ] @@ -414,7 +415,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.18.1" +version = "0.19.0" dependencies = [ "ahash", "cssparser", @@ -433,7 +434,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" dependencies = [ - "bitflags 2.4.0", + "bitflags 2.4.1", "cssparser", "derive_more", "fxhash", @@ -448,22 +449,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.188" +version = "1.0.190" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.188" +version = "1.0.190" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -532,9 +533,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.37" +version = "2.0.38" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" dependencies = [ "proc-macro2", "quote", @@ -638,3 +639,23 @@ name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.38", +] diff --git a/Cargo.toml b/Cargo.toml index 21ad948b..304f68d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scraper" -version = "0.18.1" +version = "0.19.0" edition = "2021" description = "HTML parsing and querying with CSS selectors" @@ -19,8 +19,8 @@ html5ever = "0.26" selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8" -indexmap = { version = "2.0.2", optional = true } -once_cell = "1.0" +indexmap = { version = "2.2.3", optional = true } +once_cell = "1.19" [dependencies.getopts] version = "0.2.21" diff --git a/src/element_ref/mod.rs b/src/element_ref/mod.rs index 0485fca6..5461041f 100644 --- a/src/element_ref/mod.rs +++ b/src/element_ref/mod.rs @@ -1,10 +1,13 @@ //! Element references. 
+use std::fmt; +use std::iter::FusedIterator; use std::ops::Deref; use ego_tree::iter::{Edge, Traverse}; use ego_tree::NodeRef; use html5ever::serialize::{serialize, SerializeOpts, TraversalScope}; +use selectors::NthIndexCache; use crate::node::Element; use crate::{Node, Selector}; @@ -46,6 +49,7 @@ impl<'a> ElementRef<'a> { scope: *self, inner, selector, + nth_index_cache: NthIndexCache::default(), } } @@ -81,6 +85,36 @@ impl<'a> ElementRef<'a> { inner: self.traverse(), } } + + /// Iterate over all child nodes which are elements + /// + /// # Example + /// + /// ``` + /// # use scraper::Html; + /// let fragment = Html::parse_fragment("foobarbazqux"); + /// + /// let children = fragment.root_element().child_elements().map(|element| element.value().name()).collect::>(); + /// assert_eq!(children, ["span", "a"]); + /// ``` + pub fn child_elements(&self) -> impl Iterator> { + self.children().filter_map(ElementRef::wrap) + } + + /// Iterate over all descendent nodes which are elements + /// + /// # Example + /// + /// ``` + /// # use scraper::Html; + /// let fragment = Html::parse_fragment("foobarbazqux"); + /// + /// let descendants = fragment.root_element().descendent_elements().map(|element| element.value().name()).collect::>(); + /// assert_eq!(descendants, ["html", "span", "b", "a", "i"]); + /// ``` + pub fn descendent_elements(&self) -> impl Iterator> { + self.descendants().filter_map(ElementRef::wrap) + } } impl<'a> Deref for ElementRef<'a> { @@ -91,11 +125,33 @@ impl<'a> Deref for ElementRef<'a> { } /// Iterator over descendent elements matching a selector. 
-#[derive(Debug, Clone)] pub struct Select<'a, 'b> { scope: ElementRef<'a>, inner: Traverse<'a, Node>, selector: &'b Selector, + nth_index_cache: NthIndexCache, +} + +impl fmt::Debug for Select<'_, '_> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Select") + .field("scope", &self.scope) + .field("inner", &self.inner) + .field("selector", &self.selector) + .field("nth_index_cache", &"..") + .finish() + } +} + +impl Clone for Select<'_, '_> { + fn clone(&self) -> Self { + Self { + scope: self.scope, + inner: self.inner.clone(), + selector: self.selector, + nth_index_cache: NthIndexCache::default(), + } + } } impl<'a, 'b> Iterator for Select<'a, 'b> { @@ -105,7 +161,11 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { for edge in &mut self.inner { if let Edge::Open(node) = edge { if let Some(element) = ElementRef::wrap(node) { - if self.selector.matches_with_scope(&element, Some(self.scope)) { + if self.selector.matches_with_scope_and_cache( + &element, + Some(self.scope), + &mut self.nth_index_cache, + ) { return Some(element); } } @@ -115,6 +175,8 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { } } +impl FusedIterator for Select<'_, '_> {} + /// Iterator over descendent text nodes. 
#[derive(Debug, Clone)] pub struct Text<'a> { @@ -136,6 +198,8 @@ impl<'a> Iterator for Text<'a> { } } +impl FusedIterator for Text<'_> {} + mod element; mod serializable; diff --git a/src/html/mod.rs b/src/html/mod.rs index 26ec5ea0..5178149c 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -2,13 +2,15 @@ #[cfg(feature = "errors")] use std::borrow::Cow; +use std::fmt; +use std::iter::FusedIterator; use ego_tree::iter::Nodes; use ego_tree::Tree; use html5ever::serialize::SerializeOpts; use html5ever::tree_builder::QuirksMode; -use html5ever::QualName; -use html5ever::{driver, serialize}; +use html5ever::{driver, serialize, QualName}; +use selectors::NthIndexCache; use tendril::TendrilSink; use crate::selector::Selector; @@ -93,6 +95,7 @@ impl Html { Select { inner: self.tree.nodes(), selector, + nth_index_cache: NthIndexCache::default(), } } @@ -121,10 +124,30 @@ impl Html { } /// Iterator over elements matching a selector. -#[derive(Debug)] pub struct Select<'a, 'b> { inner: Nodes<'a, Node>, selector: &'b Selector, + nth_index_cache: NthIndexCache, +} + +impl fmt::Debug for Select<'_, '_> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Select") + .field("inner", &self.inner) + .field("selector", &self.selector) + .field("nth_index_cache", &"..") + .finish() + } +} + +impl Clone for Select<'_, '_> { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + selector: self.selector, + nth_index_cache: NthIndexCache::default(), + } + } } impl<'a, 'b> Iterator for Select<'a, 'b> { @@ -133,7 +156,13 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { fn next(&mut self) -> Option> { for node in self.inner.by_ref() { if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { + if element.parent().is_some() + && self.selector.matches_with_scope_and_cache( + &element, + None, + &mut self.nth_index_cache, + ) + { return Some(element); } } @@ -152,7 +181,13 @@ impl<'a, 'b> 
DoubleEndedIterator for Select<'a, 'b> { fn next_back(&mut self) -> Option { for node in self.inner.by_ref().rev() { if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { + if element.parent().is_some() + && self.selector.matches_with_scope_and_cache( + &element, + None, + &mut self.nth_index_cache, + ) + { return Some(element); } } @@ -161,6 +196,8 @@ impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { } } +impl FusedIterator for Select<'_, '_> {} + mod serializable; mod tree_sink; diff --git a/src/lib.rs b/src/lib.rs index c000283f..7462cb79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -149,6 +149,7 @@ pub mod element_ref; pub mod error; pub mod html; pub mod node; +pub mod selectable; pub mod selector; #[cfg(feature = "atomic")] diff --git a/src/selectable.rs b/src/selectable.rs new file mode 100644 index 00000000..f77a9976 --- /dev/null +++ b/src/selectable.rs @@ -0,0 +1,70 @@ +//! Provides the [`Selectable`] trait to abstract over collections of elements + +use crate::{ + element_ref::{self, ElementRef}, + html::{self, Html}, + selector::Selector, +}; + +/// Trait to abstract over collections of elements to which a [CSS selector][Selector] can be applied +/// +/// This mainly enables writing helper functions which are generic over [`Html`] and [`ElementRef`], e.g. 
+/// +/// ``` +/// use scraper::{selectable::Selectable, selector::Selector}; +/// +/// fn text_of_first_match<'a, S>(selectable: S, selector: &Selector) -> Option +/// where +/// S: Selectable<'a>, +/// { +/// selectable.select(selector).next().map(|element| element.text().collect()) +/// } +/// ``` +pub trait Selectable<'a> { + /// Iterator over [element references][ElementRef] matching a [CSS selector][Selector] + type Select<'b>: Iterator>; + + /// Applies the given `selector` to the collection of elements represented by `self` + fn select(self, selector: &Selector) -> Self::Select<'_>; +} + +impl<'a> Selectable<'a> for &'a Html { + type Select<'b> = html::Select<'a, 'b>; + + fn select(self, selector: &Selector) -> Self::Select<'_> { + Html::select(self, selector) + } +} + +impl<'a> Selectable<'a> for ElementRef<'a> { + type Select<'b> = element_ref::Select<'a, 'b>; + + fn select(self, selector: &Selector) -> Self::Select<'_> { + ElementRef::select(&self, selector) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn select_one<'a, S>(selectable: S, selector: &Selector) -> Option> + where + S: Selectable<'a>, + { + selectable.select(selector).next() + } + + #[test] + fn html_and_element_ref_are_selectable() { + let fragment = Html::parse_fragment( + r#""#, + ); + + let selector = Selector::parse("select.foo").unwrap(); + let element = select_one(&fragment, &selector).unwrap(); + + let selector = Selector::parse("select.foo option[value='bar']").unwrap(); + let _element = select_one(element, &selector).unwrap(); + } +} diff --git a/src/selector.rs b/src/selector.rs index 68f880cc..7ef13f2f 100644 --- a/src/selector.rs +++ b/src/selector.rs @@ -8,6 +8,7 @@ use html5ever::{LocalName, Namespace}; use selectors::{ matching, parser::{self, ParseRelative, SelectorList, SelectorParseErrorKind}, + NthIndexCache, }; use crate::error::SelectorErrorKind; @@ -42,11 +43,22 @@ impl Selector { /// The optional `scope` argument is used to specify which element has `:scope` 
pseudo-class. /// When it is `None`, `:scope` will match the root element. pub fn matches_with_scope(&self, element: &ElementRef, scope: Option) -> bool { - let mut nth_index_cache = Default::default(); + self.matches_with_scope_and_cache(element, scope, &mut NthIndexCache::default()) + } + + // The `nth_index_cache` must not be used after `self` is dropped + // to avoid incorrect results (even though no undefined behaviour is possible) + // due to the usage of selector memory addresses as cache keys. + pub(crate) fn matches_with_scope_and_cache( + &self, + element: &ElementRef, + scope: Option, + nth_index_cache: &mut NthIndexCache, + ) -> bool { let mut context = matching::MatchingContext::new( matching::MatchingMode::Normal, None, - &mut nth_index_cache, + nth_index_cache, matching::QuirksMode::NoQuirks, matching::NeedsSelectorFlags::No, matching::IgnoreNthChildForInvalidation::No,