Skip to content
This repository was archived by the owner on Apr 27, 2022. It is now read-only.

Commit f674386

Browse files
committed
Remove RcDom, use FlatDom for recursive format
* Recursive encoder is now iterative instead of recursive, preventing stack overflows.
1 parent 91150be commit f674386

File tree

6 files changed

+178
-80
lines changed

6 files changed

+178
-80
lines changed

lib/html5ever.ex

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,18 +17,18 @@ defmodule Html5ever do
1717
flat_parse_sync(html)
1818
end
1919

20-
defp parse_async(html) do
21-
Html5ever.Native.parse_async(html)
22-
receive do
20+
# Parses `html` by calling the `Html5ever.Native.parse_sync/1` NIF directly and
# unwraps its tagged result tuple into `{:ok, result}` or `{:error, err}`.
defp parse_sync(html) do
21+
# The NIF returns a `{:html5ever_nif_result, status, payload}` tuple; the
# leading tag is stripped here.
case Html5ever.Native.parse_sync(html) do
2322
{:html5ever_nif_result, :ok, result} ->
2423
{:ok, result}
2524
{:html5ever_nif_result, :error, err} ->
2625
{:error, err}
2726
end
2827
end
2928

30-
defp parse_sync(html) do
31-
case Html5ever.Native.parse_sync(html) do
29+
defp parse_async(html) do
30+
:ok = Html5ever.Native.parse_async(html)
31+
receive do
3232
{:html5ever_nif_result, :ok, result} ->
3333
{:ok, result}
3434
{:html5ever_nif_result, :error, err} ->
@@ -46,7 +46,8 @@ defmodule Html5ever do
4646
end
4747

4848
defp flat_parse_async(html) do
49-
case Html5ever.Native.flat_parse_sync(html) do
49+
:ok = Html5ever.Native.flat_parse_async(html)
50+
receive do
5051
{:html5ever_nif_result, :ok, result} ->
5152
{:ok, result}
5253
{:html5ever_nif_result, :error, err} ->

lib/html5ever/native.ex

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,9 +5,10 @@ end
55
defmodule Html5ever.Native do
66
use Rustler, otp_app: :html5ever, crate: "html5ever_nif"
77

8-
def parse_async(_binary), do: err()
98
def parse_sync(_binary), do: err()
9+
def parse_async(_binary), do: err()
1010
def flat_parse_sync(_binary), do: err()
11+
def flat_parse_async(_binary), do: err()
1112

1213
defp err() do
1314
throw NifNotLoadedError

native/html5ever_nif/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,8 @@ crate-type = ["dylib"]
1212
[dependencies]
1313
rustler = "^0.21"
1414

15-
html5ever = "0.22"
16-
markup5ever = "0.7"
15+
html5ever = "0.25"
16+
markup5ever = "0.10"
1717

1818
tendril = "0.4"
1919
lazy_static = "1.4"

native/html5ever_nif/src/flat_dom.rs

Lines changed: 92 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ impl Node {
3131
}
3232
}
3333

34-
#[derive(Debug)]
34+
#[derive(Debug, PartialEq)]
3535
pub enum NodeData{
3636
Document,
3737
DocType {
@@ -77,6 +77,10 @@ impl FlatSink {
7777
sink
7878
}
7979

80+
/// Returns the handle stored in this sink's `root` field.
pub fn root(&self) -> NodeHandle {
81+
self.root
82+
}
83+
8084
pub fn node_mut<'a>(&'a mut self, handle: NodeHandle) -> &'a mut Node {
8185
&mut self.nodes[handle.0]
8286
}
@@ -309,7 +313,7 @@ mod atoms {
309313
}
310314
}
311315

312-
pub fn flat_sink_to_term<'a>(env: Env<'a>, sink: &FlatSink) -> Term<'a> {
316+
pub fn flat_sink_to_flat_term<'a>(env: Env<'a>, sink: &FlatSink) -> Term<'a> {
313317
let nodes = sink.nodes.iter()
314318
.fold(rustler::types::map::map_new(env), |acc, node| {
315319
acc.map_put(node.id.encode(env), node.encode(env)).ok().unwrap()
@@ -319,3 +323,89 @@ pub fn flat_sink_to_term<'a>(env: Env<'a>, sink: &FlatSink) -> Term<'a> {
319323
.map_put(self::atoms::nodes().encode(env), nodes).ok().unwrap()
320324
.map_put(self::atoms::root().encode(env), sink.root.encode(env)).ok().unwrap()
321325
}
326+
327+
/// One frame of the explicit traversal stack used by `flat_sink_to_rec_term`.
///
/// Each frame tracks a single node while its children are being encoded.
struct RecState<'a> {
328+
// Handle of the node this frame belongs to.
node: NodeHandle,
329+
// Index of the next entry in the node's `children` list to visit.
child_n: usize,
330+
331+
// Encoded terms of the children that have already been fully processed.
children: Vec<Term<'a>>,
332+
}
333+
334+
/// Encodes the tree held by `sink` into a nested ("recursive") term.
///
/// The tree is walked depth-first with an explicit stack of `RecState`
/// frames rather than with function recursion. Per the commit message, this
/// is what prevents stack overflows.
///
/// Encoding per node kind:
/// * `Document`: the list of its children's terms; this is the return value.
/// * `DocType`: the tuple `(doctype atom, name, public_id, system_id)`.
/// * `Element`: the tuple `(name, attributes, children)`, where each attribute
///   is a `(name, value)` tuple.
/// * `Text`: its contents.
/// * `Comment`: skipped; nothing is added to the parent.
///
/// # Panics
///
/// Panics if any of the structural assertions below fail, or via
/// `unimplemented!` when a node has any other `NodeData` variant.
///
/// NOTE(review): this appears to assume `sink.root()` is a `Document` node.
/// For any other root kind, the asserts or the `unwrap()` calls on the
/// emptied stack would panic. Confirm against `FlatSink::new`.
pub fn flat_sink_to_rec_term<'a>(env: Env<'a>, sink: &FlatSink) -> Term<'a> {
335+
// Seed the stack with a frame for the root node.
let mut stack: Vec<RecState> = vec![
336+
RecState {
337+
node: sink.root(),
338+
child_n: 0,
339+
children: Vec::new(),
340+
},
341+
];
342+
343+
loop {
344+
// Take the frame of the node currently being processed. It is pushed
// back below if the node still has unvisited children.
let mut top = stack.pop().unwrap();
345+
let top_node = &sink.nodes[top.node.0];
346+
347+
if let Some(child_node) = top_node.children.get(top.child_n) {
348+
// If we find another child, we recurse downwards
349+
350+
let child = RecState {
351+
node: *child_node,
352+
child_n: 0,
353+
children: Vec::new(),
354+
};
355+
// A Document node is only expected at the root, never as a child
// (checked in debug builds only).
debug_assert!(sink.nodes[child_node.0].data != NodeData::Document);
356+
357+
// Advance the parent's cursor before re-pushing it, so that the next
// time it is on top it continues with the following child.
top.child_n += 1;
358+
stack.push(top);
359+
stack.push(child);
360+
continue;
361+
} else {
362+
// If there are no more children, we add the finished node to its parent
363+
// (or we return if we are the root)
364+
365+
let term;
366+
367+
match &top_node.data {
368+
NodeData::Document => {
369+
// This `term` shadows the outer binding; the document encodes as
// the list of its children's terms.
let term = top.children.encode(env);
370+
assert_eq!(stack.len(), 0);
371+
return term;
372+
},
373+
NodeData::DocType { name, public_id, system_id } => {
374+
// A doctype must have a parent frame and no children.
assert!(stack.len() > 0);
375+
assert!(top.children.len() == 0);
376+
377+
term = (
378+
self::atoms::doctype(),
379+
STW(name),
380+
STW(public_id),
381+
STW(system_id),
382+
).encode(env);
383+
},
384+
NodeData::Element { attrs, name, .. } => {
385+
assert!(stack.len() > 0);
386+
387+
// Each attribute becomes a `(name, value)` tuple.
let attribute_terms: Vec<Term<'a>> = attrs.iter()
388+
.map(|a| (QNW(&a.name), STW(&a.value)).encode(env))
389+
.collect();
390+
term = (QNW(name), attribute_terms, top.children).encode(env);
391+
},
392+
NodeData::Text { contents } => {
393+
term = STW(contents).encode(env);
394+
},
395+
// Comments are dropped: the frame has already been popped and nothing
// is appended to the parent.
NodeData::Comment { .. } => continue,
396+
// Any other node kind is not supported by this encoder and panics.
_ => unimplemented!("{:?}", top_node),
397+
}
398+
399+
// The parent frame is now on top of the stack; hand it the finished term.
stack.last_mut().unwrap().children.push(term);
400+
}
401+
}
402+
}
403+
404+
405+
406+
407+
408+
409+
410+
411+

native/html5ever_nif/src/lib.rs

Lines changed: 75 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -6,12 +6,11 @@ use rustler::env::OwnedEnv;
66
use rustler::types::binary::Binary;
77
use rustler::{Decoder, Encoder, Env, Error, NifResult, Term, rustler_export_nifs};
88

9-
use html5ever::rcdom::RcDom;
9+
//use html5ever::rcdom::RcDom;
1010
use tendril::TendrilSink;
1111

1212
mod common;
1313
mod flat_dom;
14-
mod rc_dom;
1514

1615
mod atoms {
1716
rustler::rustler_atoms! {
@@ -56,6 +55,21 @@ lazy_static! {
5655
static ref POOL: scoped_pool::Pool = scoped_pool::Pool::new(4);
5756
}
5857

58+
/// NIF entry point: parses the binary in `args[0]` as an HTML document on the
/// calling thread and returns the nested term representation.
///
/// On success returns the tuple `(html5ever_nif_result, ok, term)`, where
/// `term` is produced by `flat_dom::flat_sink_to_rec_term`.
///
/// # Errors
///
/// Returns the decode error if `args[0]` cannot be decoded as a `Binary`.
///
/// # Panics
///
/// Panics if the binary is not valid UTF-8 (`from_utf8(..).unwrap()`).
fn parse_sync<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
59+
let binary: Binary = args[0].decode()?;
60+
let sink = flat_dom::FlatSink::new();
61+
62+
// TODO: Use Parser.from_bytes instead?
63+
let parser = html5ever::parse_document(sink, Default::default());
64+
// Feed the whole input in one go; `result` is what `parser.one` returns
// and is passed to the term encoder below.
let result = parser.one(std::str::from_utf8(binary.as_slice()).unwrap());
65+
66+
// std::thread::sleep(std::time::Duration::from_millis(10));
67+
68+
let result_term = flat_dom::flat_sink_to_rec_term(env, &result);
69+
70+
Ok((atoms::html5ever_nif_result(), atoms::ok(), result_term).encode(env))
71+
}
72+
5973
fn parse_async<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
6074
let mut owned_env = OwnedEnv::new();
6175

@@ -78,17 +92,17 @@ fn parse_async<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
7892
Err(_) => panic!("argument is not a binary"),
7993
};
8094

81-
let sink = RcDom::default();
95+
let sink = flat_dom::FlatSink::new();
8296

8397
// TODO: Use Parser.from_bytes instead?
8498
let parser = html5ever::parse_document(sink, Default::default());
8599
let result = parser.one(std::str::from_utf8(binary.as_slice()).unwrap());
86100

87-
let result_term = rc_dom::handle_to_term(inner_env, &result.document);
101+
let result_term = flat_dom::flat_sink_to_rec_term(inner_env, &result);
88102
(
89103
atoms::html5ever_nif_result(),
90104
atoms::ok(),
91-
result_term.unwrap(),
105+
result_term,
92106
).encode(inner_env)
93107
}) {
94108
Ok(term) => term,
@@ -111,46 +125,83 @@ fn parse_async<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
111125
Ok(atoms::ok().encode(env))
112126
}
113127

114-
fn parse_sync<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
128+
fn flat_parse_sync<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
115129
let binary: Binary = args[0].decode()?;
116-
let sink = RcDom::default();
130+
let sink = flat_dom::FlatSink::new();
117131

118132
// TODO: Use Parser.from_bytes instead?
119133
let parser = html5ever::parse_document(sink, Default::default());
120134
let result = parser.one(std::str::from_utf8(binary.as_slice()).unwrap());
121135

122136
// std::thread::sleep(std::time::Duration::from_millis(10));
123137

124-
let result_term = rc_dom::handle_to_term(env, &result.document);
138+
let result_term = flat_dom::flat_sink_to_flat_term(env, &result);
125139

126-
Ok((
127-
atoms::html5ever_nif_result(),
128-
atoms::ok(),
129-
result_term.unwrap(),
130-
).encode(env))
140+
Ok((atoms::html5ever_nif_result(), atoms::ok(), result_term).encode(env))
131141
}
132142

133-
fn flat_parse_sync<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
134-
let binary: Binary = args[0].decode()?;
135-
let sink = flat_dom::FlatSink::new();
143+
fn flat_parse_async<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
144+
let mut owned_env = OwnedEnv::new();
136145

137-
// TODO: Use Parser.from_bytes instead?
138-
let parser = html5ever::parse_document(sink, Default::default());
139-
let result = parser.one(std::str::from_utf8(binary.as_slice()).unwrap());
146+
// Copies the term into the inner env. Since this term is normally a large
147+
// binary term, copying it over should be cheap, since the binary will be
148+
// refcounted within the BEAM.
149+
let input_term = owned_env.save(args[0]);
140150

141-
// std::thread::sleep(std::time::Duration::from_millis(10));
151+
let return_pid = env.pid();
142152

143-
let result_term = flat_dom::flat_sink_to_term(env, &result);
153+
// let config = term_to_configs(args[1]);
144154

145-
Ok((atoms::html5ever_nif_result(), atoms::ok(), result_term).encode(env))
155+
POOL.spawn(move || {
156+
owned_env.send_and_clear(&return_pid, |inner_env| {
157+
// This should not really be done in user code. We (Rustler project)
158+
// need to find a better abstraction that eliminates this.
159+
match panic::catch_unwind(|| {
160+
let binary: Binary = match input_term.load(inner_env).decode() {
161+
Ok(inner) => inner,
162+
Err(_) => panic!("argument is not a binary"),
163+
};
164+
165+
let sink = flat_dom::FlatSink::new();
166+
167+
// TODO: Use Parser.from_bytes instead?
168+
let parser = html5ever::parse_document(sink, Default::default());
169+
let result = parser.one(std::str::from_utf8(binary.as_slice()).unwrap());
170+
171+
let result_term = flat_dom::flat_sink_to_flat_term(inner_env, &result);
172+
(
173+
atoms::html5ever_nif_result(),
174+
atoms::ok(),
175+
result_term,
176+
).encode(inner_env)
177+
}) {
178+
Ok(term) => term,
179+
Err(err) => {
180+
// Try to extract a panic reason and return that. If this
181+
// fails, fail generically.
182+
let reason = if let Some(s) = err.downcast_ref::<String>() {
183+
s.encode(inner_env)
184+
} else if let Some(&s) = err.downcast_ref::<&'static str>() {
185+
s.encode(inner_env)
186+
} else {
187+
atoms::nif_panic().encode(inner_env)
188+
};
189+
(atoms::html5ever_nif_result(), atoms::error(), reason).encode(inner_env)
190+
}
191+
}
192+
});
193+
});
194+
195+
Ok(atoms::ok().encode(env))
146196
}
147197

148198
rustler_export_nifs!(
149199
"Elixir.Html5ever.Native",
150200
[
151-
("parse_async", 1, parse_async),
152201
("parse_sync", 1, parse_sync),
153-
("flat_parse_sync", 1, flat_parse_sync)
202+
("parse_async", 1, parse_async),
203+
("flat_parse_sync", 1, flat_parse_sync),
204+
("flat_parse_async", 1, flat_parse_async)
154205
],
155206
Some(on_load)
156207
);

native/html5ever_nif/src/rc_dom.rs

Lines changed: 0 additions & 45 deletions
This file was deleted.

0 commit comments

Comments
 (0)