Skip to content

Commit 29465f0

Browse files
perf(tokenizer): did some performance work (#115)
* feat: 🎸 finish * chore: 🤖 fix clippy * chore: 🤖 clean up
1 parent 3efc40d commit 29465f0

File tree

8 files changed

+117
-98
lines changed

8 files changed

+117
-98
lines changed

crates/recursive-parser/benches/parse.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use recursive_parser::parser::{Parser, Root};
44
const SMALL_CSS_FILE: &str = include_str!("../../../assets/bootstrap-reboot.css");
55
const LARGE_CSS_FILE: &str = include_str!("../../../assets/bootstrap.css");
66

7-
fn parse<'a>(css: &'a str) -> Root {
7+
fn parse(css: &str) -> Root {
88
let parser = Parser::new(css);
99
parser.parse().unwrap()
1010
}

crates/recursive-parser/examples/plugin.rs

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// use mimalloc_rust::*;
22
use recursive_parser::{parser::*, visitor::VisitMut};
3-
use std::{borrow::Cow, io::Write, time::Instant};
3+
use std::fmt::Write;
4+
use std::{borrow::Cow, time::Instant};
45

56
// #[global_allocator]
67
// static GLOBAL_MIMALLOC: GlobalMiMalloc = GlobalMiMalloc;
@@ -12,17 +13,17 @@ fn main() {
1213
}
1314
}";
1415
let bootstrap = include_str!("../../../assets/bootstrap-reboot.css");
15-
let mut start = Instant::now();
16+
let start = Instant::now();
1617
let mut root = Parser::new(bootstrap).parse().unwrap();
1718
println!("parse {:?}", start.elapsed());
1819
// start = Instant::now();
1920
// ReverseProp::default().visit_root(&mut root);
2021
// println!("reverse {:?}", start.elapsed());
2122
let start = Instant::now();
22-
let mut printer = SimplePrettier::new(Vec::with_capacity(bootstrap.len()));
23+
let mut printer = SimplePrettier::new(String::with_capacity(bootstrap.len()));
2324
printer.visit_root(&mut root).unwrap();
2425
println!("stringify {:?}", start.elapsed());
25-
println!("{}", String::from_utf8(printer.writer).unwrap());
26+
println!("{}", printer.writer);
2627
}
2728

2829
#[derive(Default)]
@@ -37,8 +38,8 @@ impl<W: Write> SimplePrettier<W> {
3738
}
3839
}
3940

40-
impl<'a, W: std::io::Write> VisitMut<'a, std::io::Result<()>> for SimplePrettier<W> {
41-
fn visit_root(&mut self, root: &mut Root<'a>) -> std::io::Result<()> {
41+
impl<'a, W: Write> VisitMut<'a, std::fmt::Result> for SimplePrettier<W> {
42+
fn visit_root(&mut self, root: &mut Root<'a>) -> std::fmt::Result {
4243
for child in root.children.iter_mut() {
4344
match child {
4445
RuleOrAtRuleOrDecl::Rule(rule) => {
@@ -55,10 +56,13 @@ impl<'a, W: std::io::Write> VisitMut<'a, std::io::Result<()>> for SimplePrettier
5556
Ok(())
5657
}
5758

58-
fn visit_rule(&mut self, rule: &mut Rule<'a>) -> std::io::Result<()> {
59-
self
60-
.writer
61-
.write(format!("{}{} {}\n", " ".repeat(self.level * 2), rule.selector, "{").as_bytes())?;
59+
fn visit_rule(&mut self, rule: &mut Rule<'a>) -> std::fmt::Result {
60+
writeln!(
61+
self.writer,
62+
"{}{} {{",
63+
" ".repeat(self.level * 2),
64+
rule.selector,
65+
)?;
6266
self.level += 1;
6367
for child in rule.children.iter_mut() {
6468
match child {
@@ -74,18 +78,17 @@ impl<'a, W: std::io::Write> VisitMut<'a, std::io::Result<()>> for SimplePrettier
7478
}
7579
}
7680
self.level -= 1;
77-
write!(self.writer, "{}{}\n", " ".repeat(self.level * 2), "}")?;
81+
writeln!(self.writer, "{}}}", " ".repeat(self.level * 2),)?;
7882
Ok(())
7983
}
8084

81-
fn visit_at_rule(&mut self, at_rule: &mut AtRule<'a>) -> std::io::Result<()> {
82-
write!(
85+
fn visit_at_rule(&mut self, at_rule: &mut AtRule<'a>) -> std::fmt::Result {
86+
writeln!(
8387
self.writer,
84-
"{}@{} {} {}\n",
88+
"{}@{} {} {{",
8589
" ".repeat(self.level * 2),
8690
at_rule.name,
8791
at_rule.params,
88-
"{"
8992
)?;
9093
self.level += 1;
9194
for child in at_rule.children.iter_mut() {
@@ -102,13 +105,13 @@ impl<'a, W: std::io::Write> VisitMut<'a, std::io::Result<()>> for SimplePrettier
102105
}
103106
}
104107
self.level -= 1;
105-
write!(self.writer, "{}{}\n", " ".repeat(self.level * 2), "}")
108+
writeln!(self.writer, "{}}}", " ".repeat(self.level * 2))
106109
}
107110

108-
fn visit_declaration(&mut self, decl: &mut Declaration<'a>) -> std::io::Result<()> {
109-
write!(
111+
fn visit_declaration(&mut self, decl: &mut Declaration<'a>) -> std::fmt::Result {
112+
writeln!(
110113
self.writer,
111-
"{}{} : {};\n",
114+
"{}{} : {};",
112115
" ".repeat(self.level * 2),
113116
decl.prop,
114117
decl.value

crates/recursive-parser/tests/basic.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ mod test_ast {
2626
dbg!(&file_name);
2727
let expected_ast_path = format!(
2828
"./tests/fixtures/{}.ast",
29-
file_name.rsplit_once(".").unwrap().0
29+
file_name.rsplit_once('.').unwrap().0
3030
);
3131
let expected_ast = read_to_string(expected_ast_path)?;
3232
let parser = Parser::new(&file);
@@ -79,7 +79,7 @@ mod test_ast {
7979
let file = read_to_string(format!("./tests/official-cases/{}", file_name))?;
8080
let expected_ast_path = format!(
8181
"./tests/official-cases/{}.ast",
82-
file_name.rsplit_once(".").unwrap().0
82+
file_name.rsplit_once('.').unwrap().0
8383
);
8484
let expected_ast = read_to_string(expected_ast_path)?;
8585
let parser = Parser::new(&file);

crates/rowan-parser/benches/parse.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use rowan_parser::{parser, syntax::SyntaxNode};
44
const SMALL_CSS_FILE: &str = include_str!("../../../assets/bootstrap-reboot.css");
55
const LARGE_CSS_FILE: &str = include_str!("../../../assets/bootstrap.css");
66

7-
fn parse<'a>(css: &'a str) -> SyntaxNode {
7+
fn parse(css: &str) -> SyntaxNode {
88
let parser = parser::Parser::new(css);
99
parser.parse()
1010
}

crates/rowan-parser/examples/remove_space.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ static GLOBAL_MIMALLOC: GlobalMiMalloc = GlobalMiMalloc;
99
fn main() {
1010
let css = "#id { font-size: 12px; }";
1111
let root = Parser::new(css).parse();
12-
let root_mut = root.clone_for_update().clone();
12+
let root_mut = root.clone_for_update();
1313
remove_space_mut(&root_mut);
1414
let mut output = String::with_capacity(0);
1515
remove_space(&root, &mut output, css);
@@ -47,7 +47,7 @@ fn remove_space_mut(node: &SyntaxNode) {
4747
if child.kind() == SyntaxKind::Space {
4848
child.detach();
4949
}
50-
child.as_node().map(|n| remove_space_mut(n));
50+
child.as_node().map(remove_space_mut);
5151
}
5252
}
5353

crates/tokenizer/src/input.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ impl fmt::Display for FilePosition {
2828
}
2929

3030
static DEFAULT_INPUT: Lazy<Input> = Lazy::new(Input::default);
31-
#[derive(Debug, PartialEq, Clone, Default)]
31+
#[derive(Debug, PartialEq, Eq, Clone, Default)]
3232
pub struct Input<'a> {
3333
pub css: &'a str,
3434
// map: PreviousMap,

crates/tokenizer/src/main.rs

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,47 @@
1-
use std::env::current_exe;
2-
use std::fs::read_to_string;
1+
// use std::env::current_exe;
2+
// use std::fs::read_to_string;
33
use std::time::Instant;
4-
use tokenizer::Tokenizer;
5-
4+
use tokenizer::tokenize;
65
fn main() {
7-
let file_list = [
8-
// ("tailwind-components.css", "2.8K"),
9-
// ("bootstrap-reboot.css", "7.4K"),
10-
// ("bootstrap-grid.css", "71K"),
11-
("bootstrap.css", "201K"),
12-
// ("tailwind.css", "3.5M"),
13-
// ("tailwind-dark.css", "5.8M"),
14-
];
6+
let source = include_str!("../../../assets/bootstrap.css");
7+
let start = Instant::now();
8+
for _ in 0..100 {
9+
tokenize(source);
10+
}
11+
println!("{:?}", start.elapsed());
12+
// let vec = vec![
13+
// b'\t', b'\n', b'\r', b' ', b'"', b'#', b'\'', b'(', b')', b'/', b';', b'[', b'\\', b']', b'{',
14+
// b'}',
15+
// ];
16+
// '\t', '\n', '\u{c}', '\r', ' ', '!', '"', '#', '\'', '(', ')', ':', ';', '@', '[', '\\', ']',
17+
// '{', '}', '/',
18+
// ];
19+
// let file_list = [
20+
// // ("tailwind-components.css", "2.8K"),
21+
// // ("bootstrap-reboot.css", "7.4K"),
22+
// // ("bootstrap-grid.css", "71K"),
23+
// ("bootstrap.css", "201K"),
24+
// // ("tailwind.css", "3.5M"),
25+
// // ("tailwind-dark.css", "5.8M"),
26+
// ];
1527

16-
let assets_path = get_assets_path();
28+
// let assets_path = get_assets_path();
1729

18-
for (file, size) in file_list {
19-
let css: String = read_to_string(format!("{}/{}", assets_path, file)).unwrap();
20-
let mut vec = Vec::default();
21-
let start = Instant::now();
22-
let processor = Tokenizer::new(&css, false);
23-
while !processor.end_of_file() {
24-
vec.push(processor.next_token(false));
25-
}
26-
let end = start.elapsed();
27-
println!("rust: tokenizer/{}({}): {:?}", file, size, end);
28-
}
30+
// for (file, size) in file_list {
31+
// let css: String = read_to_string(format!("{}/{}", assets_path, file)).unwrap();
32+
// let mut vec = Vec::default();
33+
// let start = Instant::now();
34+
// let processor = Tokenizer::new(&css, false);
35+
// while !processor.end_of_file() {
36+
// vec.push(processor.next_token(false));
37+
// }
38+
// let end = start.elapsed();
39+
// println!("rust: tokenizer/{}({}): {:?}", file, size, end);
40+
// }
2941
}
3042

31-
fn get_assets_path() -> String {
32-
let mut path = current_exe().unwrap();
33-
path.push("../../../assets");
34-
path.canonicalize().unwrap().to_str().unwrap().to_string()
35-
}
43+
// fn get_assets_path() -> String {
44+
// let mut path = current_exe().unwrap();
45+
// path.push("../../../assets");
46+
// path.canonicalize().unwrap().to_str().unwrap().to_string()
47+
// }

crates/tokenizer/src/tokenizer.rs

Lines changed: 47 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,26 @@ const AT: char = '@';
3030

3131
const MAX_BUFFER: usize = 102400;
3232

33+
const INDEX_OF_WORD_END: [usize; 255] = [
34+
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
35+
1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
36+
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
37+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
38+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
39+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
40+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
41+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
42+
];
43+
const INDEX_OF_AT_END: [usize; 255] = [
44+
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45+
1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
46+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
47+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
48+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
51+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
52+
];
3353
static FINDER_END_OF_COMMENT: Lazy<Finder<'static>> = Lazy::new(|| Finder::new("*/"));
3454

3555
#[derive(Debug, Clone, Eq, PartialEq, Copy)]
@@ -261,9 +281,13 @@ impl<'a> Tokenizer<'a> {
261281
self.pos.replace(next + 1);
262282
}
263283
AT => {
264-
let next = index_of_at_end(self.css, self.position() + 1) - 1;
265-
current_token = Token(TokenType::AtWord, self.position(), next + 1);
266-
self.pos.replace(next + 1);
284+
let next = index_of_at_end(&self.css[self.position() + 1..]);
285+
current_token = Token(
286+
TokenType::AtWord,
287+
self.position(),
288+
next + self.position() + 1,
289+
);
290+
self.pos.replace(next + self.position() + 1);
267291
}
268292
BACKSLASH => {
269293
let mut next = self.position();
@@ -311,11 +335,12 @@ impl<'a> Tokenizer<'a> {
311335
current_token = Token(TokenType::Comment, self.position(), next + 1);
312336
next
313337
} else {
314-
let next = index_of_word_end(self.css, self.position() + 1) - 1;
315-
let content = sub_str(self.css, self.position(), next + 1);
316-
current_token = Token::new(TokenType::Word, self.position(), next + 1);
338+
let position = self.position();
339+
let next = index_of_word_end(&self.css[position + 1..]);
340+
let content = sub_str(self.css, self.position(), next + position + 1);
341+
current_token = Token::new(TokenType::Word, self.position(), next + position + 1);
317342
self.push(content);
318-
next
343+
next + position
319344
},
320345
);
321346
self.pos_plus_one();
@@ -366,11 +391,7 @@ fn sub_str(s: &str, start: usize, end: usize) -> &str {
366391

367392
#[inline]
368393
fn char_code_at(s: &str, n: usize) -> char {
369-
if n >= s.len() {
370-
'\0'
371-
} else {
372-
s.as_bytes()[n] as char
373-
}
394+
*s.as_bytes().get(n).unwrap_or(&b'\0') as char
374395
}
375396

376397
#[inline]
@@ -398,47 +419,30 @@ fn is_bad_bracket(s: &str) -> bool {
398419
}
399420

400421
#[inline]
401-
fn index_of_at_end(s: &str, start: usize) -> usize {
402-
let bytes = s.as_bytes();
403-
let mut i = start;
404-
let len = bytes.len();
405-
406-
while i < len {
407-
match bytes[i] as char {
408-
'\t' | '\n' | '\u{c}' | '\r' | ' ' | '"' | '#' | '\'' | '(' | ')' | '/' | ';' | '['
409-
| '\\' | ']' | '{' | '}' => {
410-
return i;
411-
}
412-
_ => i += 1,
422+
fn index_of_at_end(s: &str) -> usize {
423+
for (i, ch) in s.bytes().enumerate() {
424+
if let 1 = INDEX_OF_AT_END[ch as usize] {
425+
return i;
413426
};
414427
}
415428

416-
i
429+
s.len()
417430
}
418431

419432
#[inline]
420-
fn index_of_word_end(s: &str, start: usize) -> usize {
421-
let bytes = s.as_bytes();
422-
let mut i = start;
423-
let len = bytes.len();
424-
425-
while i < len {
426-
match bytes[i] as char {
427-
'\t' | '\n' | '\u{c}' | '\r' | ' ' | '!' | '"' | '#' | '\'' | '(' | ')' | ':' | ';' | '@'
428-
| '[' | '\\' | ']' | '{' | '}' => {
429-
return i;
430-
}
431-
'/' => {
432-
if bytes[i + 1] as char == '*' {
433+
fn index_of_word_end(s: &str) -> usize {
434+
for (i, ch) in s.bytes().enumerate() {
435+
match INDEX_OF_WORD_END[ch as usize] {
436+
1 => return i,
437+
2 => {
438+
if s.as_bytes().get(i + 1) == Some(&b'*') {
433439
return i;
434-
} else {
435-
i += 1;
436440
}
437441
}
438-
_ => i += 1,
439-
};
442+
_ => continue,
443+
}
440444
}
441-
i
445+
s.len()
442446
}
443447

444448
/// SAFETY: YOU SHOULD NEVER CALL THIS FUNCTION WITH THE PARAM OTHER THAN THESE BELOW.

0 commit comments

Comments (0)