diff --git a/Cargo.lock b/Cargo.lock index c8d0342708..52afbb053f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1428,7 +1428,16 @@ version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d829733185c1ca374f17e52b762f24f535ec625d2cc1f070e34c8a9068f341b" dependencies = [ - "num_enum_derive", + "num_enum_derive 0.5.9", +] + +[[package]] +name = "num_enum" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +dependencies = [ + "num_enum_derive 0.7.2", ] [[package]] @@ -1443,6 +1452,18 @@ dependencies = [ "syn 1.0.107", ] +[[package]] +name = "num_enum_derive" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.32", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -2165,6 +2186,15 @@ dependencies = [ "rustpython-derive", ] +[[package]] +name = "rustpython-sre_engine" +version = "0.6.0" +dependencies = [ + "bitflags 2.4.0", + "num_enum 0.7.2", + "optional", +] + [[package]] name = "rustpython-stdlib" version = "0.3.0" @@ -2200,7 +2230,7 @@ dependencies = [ "num-complex", "num-integer", "num-traits", - "num_enum", + "num_enum 0.7.2", "once_cell", "openssl", "openssl-probe", @@ -2270,7 +2300,7 @@ dependencies = [ "num-integer", "num-traits", "num_cpus", - "num_enum", + "num_enum 0.7.2", "once_cell", "optional", "parking_lot", @@ -2527,7 +2557,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a490c5c46c35dba9a6f5e7ee8e4d67e775eb2d2da0f115750b8d10e1c1ac2d28" dependencies = [ "bitflags 1.3.2", - "num_enum", + "num_enum 0.5.9", "optional", ] diff --git a/Cargo.toml b/Cargo.toml index bfc882fdc5..0f4fb49dc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ include = ["LICENSE", "Cargo.toml", 
"src/**/*.rs"] resolver = "2" members = [ "compiler", "compiler/core", "compiler/codegen", - ".", "common", "derive", "jit", "vm", "pylib", "stdlib", "wasm/lib", "derive-impl", + ".", "common", "derive", "jit", "vm", "vm/sre_engine", "pylib", "stdlib", "wasm/lib", "derive-impl", ] [workspace.dependencies] @@ -27,6 +27,7 @@ rustpython-jit = { path = "jit", version = "0.3.0" } rustpython-vm = { path = "vm", default-features = false, version = "0.3.0" } rustpython-pylib = { path = "pylib", version = "0.3.0" } rustpython-stdlib = { path = "stdlib", default-features = false, version = "0.3.0" } +rustpython-sre_engine = { path = "vm/sre_engine", version = "0.6.0" } rustpython-doc = { git = "https://github.com/RustPython/__doc__", tag = "0.3.0", version = "0.3.0" } rustpython-literal = { git = "https://github.com/RustPython/Parser.git", rev = "29c4728dbedc7e69cc2560b9b34058bbba9b1303" } @@ -64,7 +65,7 @@ malachite-base = "0.4.4" num-complex = "0.4.0" num-integer = "0.1.44" num-traits = "0.2" -num_enum = "0.5.7" +num_enum = "0.7" once_cell = "1.18" parking_lot = "0.12.1" paste = "1.0.7" diff --git a/vm/sre_engine/.gitignore b/vm/sre_engine/.gitignore new file mode 100644 index 0000000000..96ef6c0b94 --- /dev/null +++ b/vm/sre_engine/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/vm/sre_engine/.vscode/launch.json b/vm/sre_engine/.vscode/launch.json new file mode 100644 index 0000000000..5ebfe34f05 --- /dev/null +++ b/vm/sre_engine/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug Unit Test", + "cargo": { + "args": [ + "test", + "--no-run" + ], + "filter": { + "kind": "test" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/vm/sre_engine/Cargo.toml b/vm/sre_engine/Cargo.toml new file mode 100644 index 0000000000..2caa8b73e5 --- /dev/null +++ b/vm/sre_engine/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = 
"rustpython-sre_engine" +version = "0.6.0" +authors = ["Kangzhi Shi ", "RustPython Team"] +description = "A low-level implementation of Python's SRE regex engine" +repository = "https://github.com/RustPython/RustPython" +license = "MIT" +edition = "2021" +keywords = ["regex"] +include = ["LICENSE", "src/**/*.rs"] + +[dependencies] +num_enum = { workspace = true } +bitflags = { workspace = true } +optional = "0.5" diff --git a/vm/sre_engine/LICENSE b/vm/sre_engine/LICENSE new file mode 100644 index 0000000000..7213274e0f --- /dev/null +++ b/vm/sre_engine/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 RustPython Team + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/vm/sre_engine/benches/benches.rs b/vm/sre_engine/benches/benches.rs
new file mode 100644
index 0000000000..e89adab0dd
--- /dev/null
+++ b/vm/sre_engine/benches/benches.rs
@@ -0,0 +1,111 @@
+//! Micro-benchmarks for the SRE matching engine.
+//!
+//! Relies on the unstable `test` crate, so this target needs a nightly toolchain.
+#![feature(test)]
+
+extern crate test;
+use test::Bencher;
+
+// The package is named `rustpython-sre_engine` and its Cargo.toml declares no
+// `[lib] name`, so the library crate is `rustpython_sre_engine`.
+use rustpython_sre_engine::{Request, State, StrDrive};
+
+/// A pre-compiled SRE program, as emitted by `generate_tests.py`.
+struct Pattern {
+    code: &'static [u32],
+}
+
+impl Pattern {
+    /// Builds a request over the whole of `string` plus a fresh matcher state.
+    fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
+        self.state_range(string, 0..usize::MAX)
+    }
+
+    /// Builds a request restricted to `range` plus a fresh matcher state.
+    ///
+    /// `Request::new` clamps both bounds to the subject length, so an
+    /// open-ended range can be written as `start..usize::MAX`.
+    fn state_range<'a, S: StrDrive>(
+        &self,
+        string: S,
+        range: std::ops::Range<usize>,
+    ) -> (Request<'a, S>, State) {
+        let req = Request::new(string, range.start, range.end, self.code, false);
+        let state = State::default();
+        (req, state)
+    }
+}
+
+/// Runs every pattern through `search`, `pymatch` and `pymatch` with
+/// `match_all` set, first on the bare subject and then on a space-padded copy.
+#[bench]
+fn benchmarks(b: &mut Bencher) {
+    // # test common prefix
+    // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p1 = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] };
+    // END GENERATED
+    // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p2 = Pattern { code: &[14, 8, 1, 4, 6, 1, 0, 80, 0, 17, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 17, 1, 1] };
+    // END GENERATED
+    // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p3 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 1] };
+    // END GENERATED
+    // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p4 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 17, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 17, 1, 1] };
+    // END GENERATED
+    // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p5 = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] };
+    // END GENERATED
+    // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p6 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 31, 1, 4294967295, 17, 0, 13, 7, 16, 48, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
+    // END GENERATED
+    // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p7 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 29, 1, 4294967295, 17, 0, 13, 5, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
+    // END GENERATED
+    // pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p8 = Pattern { code: &[14, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
+    // END GENERATED
+    // pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p9 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
+    // END GENERATED
+    // pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p10 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 24, 5, 0, 4294967295, 2, 1, 1] };
+    // END GENERATED
+    // pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping
+    // START GENERATED by generate_tests.py
+    #[rustfmt::skip] let p11 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 1] };
+    // END GENERATED
+
+    let tests = [
+        (p1, "Perl"),
+        (p2, "Perl"),
+        (p3, "Perl"),
+        (p4, "Perl"),
+        (p5, "PythonPython"),
+        (p6, "a5,b7,c9,"),
+        (p7, "a5,b7,c9,"),
+        (p8, "Python"),
+        (p9, "Python"),
+        (p10, "Python"),
+        (p11, "Python"),
+    ];
+
+    // Number of spaces placed on each side of the subject in the padded runs.
+    const PAD: usize = 10_000;
+    // Build the padded haystacks once, outside the timed closure, so the
+    // benchmark measures matching rather than repeated 20 kB allocations.
+    let padded: Vec<String> = tests
+        .iter()
+        .map(|(_, s)| format!("{}{}{}", " ".repeat(PAD), s, " ".repeat(PAD)))
+        .collect();
+
+    b.iter(move || {
+        for ((p, s), s2) in tests.iter().zip(&padded) {
+            // `s` is a `&&str` here; deref it instead of cloning the reference.
+            let s = *s;
+            let s2 = s2.as_str();
+
+            // Bare subject: search, match, full match.
+            let (req, mut state) = p.state(s);
+            assert!(state.search(req));
+            let (req, mut state) = p.state(s);
+            assert!(state.pymatch(&req));
+            let (mut req, mut state) = p.state(s);
+            req.match_all = true;
+            assert!(state.pymatch(&req));
+
+            // Padded subject: search from 0, then match inside the padding window.
+            let (req, mut state) = p.state_range(s2, 0..usize::MAX);
+            assert!(state.search(req));
+            let (req, mut state) = p.state_range(s2, PAD..usize::MAX);
+            assert!(state.pymatch(&req));
+            let (req, mut state) = p.state_range(s2, PAD..PAD + s.len());
+            assert!(state.pymatch(&req));
+            let (mut req, mut state) = p.state_range(s2, PAD..PAD + s.len());
+            req.match_all = true;
+            assert!(state.pymatch(&req));
+        }
+    })
+}
diff --git a/vm/sre_engine/generate_tests.py b/vm/sre_engine/generate_tests.py
new file mode 100644
index 0000000000..8adf043f29
--- /dev/null
+++ b/vm/sre_engine/generate_tests.py
@@ -0,0 +1,47 @@
+import os
+from pathlib import Path
+import re
+import sre_constants
+import sre_compile
+import sre_parse
+import json
+from
itertools import chain
+
+# Anchor every path at this script's directory so it can be run from any cwd.
+ROOT = Path(__file__).resolve().parent
+
+# Read the engine's SRE_MAGIC. Path.read_text() closes the file, unlike a bare open().read().
+m = re.search(r"const SRE_MAGIC: usize = (\d+);", (ROOT / "src" / "constants.rs").read_text())
+if m is None:
+    raise SystemExit("generate_tests.py: SRE_MAGIC not found in src/constants.rs")
+sre_engine_magic = int(m.group(1))
+del m
+
+# Refuse to generate code with an interpreter whose sre_constants.MAGIC
+# differs from the engine's SRE_MAGIC.
+assert sre_constants.MAGIC == sre_engine_magic
+
+class CompiledPattern:
+    """Stand-in for the `re` module when a `re.compile(...)` pattern comment is evaluated."""
+
+    @classmethod
+    def compile(cls, pattern, flags=0):
+        """Parse `pattern` and return an object carrying its raw SRE code list and flags."""
+        # NOTE(review): `flags` is passed to _code() but not to parse(); confirm
+        # whether parse-time flags such as re.VERBOSE are meant to be honoured.
+        p = sre_parse.parse(pattern)
+        code = sre_compile._code(p, flags)
+        self = cls()
+        self.pattern = pattern
+        self.code = code
+        self.flags = re.RegexFlag(flags | p.state.flags)
+        return self
+
+# Expose every RegexFlag member on the stand-in so pattern comments can use the flag names.
+for k, v in re.RegexFlag.__members__.items():
+    setattr(CompiledPattern, k, v)
+
+
+# matches `// pattern {varname} = re.compile(...)`
+pattern_pattern = re.compile(r"^((\s*)\/\/\s*pattern\s+(\w+)\s+=\s+(.+?))$(?:.+?END GENERATED)?", re.M | re.S)
+def replace_compiled(m):
+    """Return the pattern comment followed by a freshly generated `let {varname} = Pattern {...};` block."""
+    line, indent, varname, pattern = m.groups()
+    # eval() runs text taken from the repository's own .rs files; never point
+    # this script at untrusted sources.
+    pattern = eval(pattern, {"re": CompiledPattern})
+    pattern = f"Pattern {{ code: &{json.dumps(pattern.code)} }}"
+    return f'''{line}
+{indent}// START GENERATED by generate_tests.py
+{indent}#[rustfmt::skip] let {varname} = {pattern};
+{indent}// END GENERATED'''
+
+# Rewrite every .rs file under tests/ and benches/ in place.
+with os.scandir(ROOT / "tests") as t, os.scandir(ROOT / "benches") as b:
+    for f in chain(t, b):
+        path = Path(f.path)
+        if path.suffix == ".rs":
+            replaced = pattern_pattern.sub(replace_compiled, path.read_text())
+            path.write_text(replaced)
diff --git a/vm/sre_engine/src/constants.rs b/vm/sre_engine/src/constants.rs
new file mode 100644
index 0000000000..9fe792ce17
--- /dev/null
+++ b/vm/sre_engine/src/constants.rs
@@ -0,0 +1,125 @@
+/*
+ * Secret Labs' Regular Expression Engine
+ *
+ * regular expression matching engine
+ *
+ * NOTE: This file is generated by sre_constants.py. If you need
+ * to change anything in here, edit sre_constants.py and run it.
+ *
+ * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
+ *
+ * See the _sre.c file for information on usage and redistribution.
+ */
+
+use bitflags::bitflags;
+
+/// Version stamp of the SRE code format this engine implements.
+/// `generate_tests.py` parses this exact line and asserts that it equals the
+/// running interpreter's `sre_constants.MAGIC`.
+pub const SRE_MAGIC: usize = 20221023;
+/// Opcodes of the compiled pattern program. The engine converts raw `u32`
+/// code words with `SreOpcode::try_from`.
+#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)]
+#[repr(u32)]
+#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
+pub enum SreOpcode {
+    FAILURE = 0,
+    SUCCESS = 1,
+    ANY = 2,
+    ANY_ALL = 3,
+    ASSERT = 4,
+    ASSERT_NOT = 5,
+    AT = 6,
+    BRANCH = 7,
+    CATEGORY = 8,
+    CHARSET = 9,
+    BIGCHARSET = 10,
+    GROUPREF = 11,
+    GROUPREF_EXISTS = 12,
+    IN = 13,
+    INFO = 14,
+    JUMP = 15,
+    LITERAL = 16,
+    MARK = 17,
+    MAX_UNTIL = 18,
+    MIN_UNTIL = 19,
+    NOT_LITERAL = 20,
+    NEGATE = 21,
+    RANGE = 22,
+    REPEAT = 23,
+    REPEAT_ONE = 24,
+    SUBPATTERN = 25,
+    MIN_REPEAT_ONE = 26,
+    ATOMIC_GROUP = 27,
+    POSSESSIVE_REPEAT = 28,
+    POSSESSIVE_REPEAT_ONE = 29,
+    GROUPREF_IGNORE = 30,
+    IN_IGNORE = 31,
+    LITERAL_IGNORE = 32,
+    NOT_LITERAL_IGNORE = 33,
+    GROUPREF_LOC_IGNORE = 34,
+    IN_LOC_IGNORE = 35,
+    LITERAL_LOC_IGNORE = 36,
+    NOT_LITERAL_LOC_IGNORE = 37,
+    GROUPREF_UNI_IGNORE = 38,
+    IN_UNI_IGNORE = 39,
+    LITERAL_UNI_IGNORE = 40,
+    NOT_LITERAL_UNI_IGNORE = 41,
+    RANGE_UNI_IGNORE = 42,
+}
+/// Operand of the `AT` opcode: the kind of position assertion to test.
+#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)]
+#[repr(u32)]
+#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
+pub enum SreAtCode {
+    BEGINNING = 0,
+    BEGINNING_LINE = 1,
+    BEGINNING_STRING = 2,
+    BOUNDARY = 3,
+    NON_BOUNDARY = 4,
+    END = 5,
+    END_LINE = 6,
+    END_STRING = 7,
+    LOC_BOUNDARY = 8,
+    LOC_NON_BOUNDARY = 9,
+    UNI_BOUNDARY = 10,
+    UNI_NON_BOUNDARY = 11,
+}
+/// Operand of the `CATEGORY` opcode: the character class to test.
+// NOTE(review): unlike the two enums above this one does not derive
+// `PartialEq`/`Eq`; confirm that is intended before comparing values.
+#[derive(num_enum::TryFromPrimitive, Debug)]
+#[repr(u32)]
+#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
+pub enum SreCatCode {
+    DIGIT = 0,
+    NOT_DIGIT = 1,
+    SPACE = 2,
+    NOT_SPACE = 3,
+    WORD = 4,
+    NOT_WORD = 5,
+    LINEBREAK = 6,
+    NOT_LINEBREAK = 7,
+    LOC_WORD = 8,
+    LOC_NOT_WORD = 9,
+    UNI_DIGIT = 10,
+    UNI_NOT_DIGIT = 11,
+    UNI_SPACE = 12,
+    UNI_NOT_SPACE = 13,
+    UNI_WORD = 14,
+    UNI_NOT_WORD = 15,
+    UNI_LINEBREAK = 16,
+    UNI_NOT_LINEBREAK = 17,
+}
+bitflags!
{
+// Pattern-level flag bits.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+    pub struct SreFlag: u16 {
+        const TEMPLATE = 1;
+        const IGNORECASE = 2;
+        const LOCALE = 4;
+        const MULTILINE = 8;
+        const DOTALL = 16;
+        const UNICODE = 32;
+        const VERBOSE = 64;
+        const DEBUG = 128;
+        const ASCII = 256;
+    }
+}
+bitflags! {
+    // Flag word of an INFO block; `State::search` decodes code word 2 of the
+    // block with `from_bits_truncate` and tests PREFIX / LITERAL / CHARSET.
+    pub struct SreInfo: u32 {
+        const PREFIX = 1;
+        const LITERAL = 2;
+        const CHARSET = 4;
+    }
+}
diff --git a/vm/sre_engine/src/engine.rs b/vm/sre_engine/src/engine.rs
new file mode 100644
index 0000000000..fb7d766e29
--- /dev/null
+++ b/vm/sre_engine/src/engine.rs
@@ -0,0 +1,1411 @@
+// good luck to those that follow; here be dragons
+
+use crate::string::{
+    is_digit, is_linebreak, is_loc_word, is_space, is_uni_digit, is_uni_linebreak, is_uni_space,
+    is_uni_word, is_word, lower_ascii, lower_locate, lower_unicode, upper_locate, upper_unicode,
+};
+
+use super::{SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor, MAXREPEAT};
+use optional::Optioned;
+use std::{convert::TryFrom, ptr::null};
+
+/// Immutable description of one match/search call: the subject, the window
+/// `[start, end)` to look at, and the compiled pattern program.
+#[derive(Debug, Clone, Copy)]
+pub struct Request<'a, S> {
+    pub string: S,
+    pub start: usize,
+    pub end: usize,
+    pub pattern_codes: &'a [u32],
+    // presumably "the match must consume the whole window" — confirm against `can_success`
+    pub match_all: bool,
+    // set by `SearchIter::next` after a match that ended where it started
+    pub must_advance: bool,
+}
+
+impl<'a, S: StrDrive> Request<'a, S> {
+    /// Creates a request, clamping `end` to `string.count()` and `start` to
+    /// the clamped `end`. `must_advance` starts out `false`.
+    pub fn new(
+        string: S,
+        start: usize,
+        end: usize,
+        pattern_codes: &'a [u32],
+        match_all: bool,
+    ) -> Self {
+        let end = std::cmp::min(end, string.count());
+        let start = std::cmp::min(start, end);
+
+        Self {
+            string,
+            start,
+            end,
+            pattern_codes,
+            match_all,
+            must_advance: false,
+        }
+    }
+}
+
+/// Capture-group bookkeeping: a flat list of mark positions plus a stack of
+/// saved snapshots used while backtracking.
+// NOTE(review): the element types below (`Vec>`, `Optioned`) have lost their
+// generic arguments in this copy of the patch; `Marks::set` stores
+// `Optioned::some(position)` with `position: usize`.
+#[derive(Debug)]
+pub struct Marks {
+    last_index: isize,
+    marks: Vec>,
+    marks_stack: Vec<(Vec>, isize)>,
+}
+
+impl Default for Marks {
+    /// Empty mark list with `last_index` set to -1.
+    fn default() -> Self {
+        Self {
+            last_index: -1,
+            marks: Vec::new(),
+            marks_stack: Vec::new(),
+        }
+    }
+}
+
+impl Marks {
+    /// Returns the (start, end) marks of group `group_index`, i.e. entries
+    /// `2 * group_index` and `2 * group_index + 1`, or a pair of `none`
+    /// values when those entries do not exist.
+    pub fn get(&self, group_index: usize) -> (Optioned, Optioned) {
+        let marks_index = 2 * group_index;
+        if marks_index + 1 < self.marks.len() {
+            (self.marks[marks_index],
self.marks[marks_index + 1])
+        } else {
+            (Optioned::none(), Optioned::none())
+        }
+    }
+
+    /// Value recorded by the most recent odd-numbered `set` call, or -1 if
+    /// there has been none since construction or `clear`.
+    pub fn last_index(&self) -> isize {
+        self.last_index
+    }
+
+    /// The flat mark list as a slice.
+    pub fn raw(&self) -> &[Optioned] {
+        self.marks.as_slice()
+    }
+
+    /// Stores `position` at `mark_nr`, growing the list with `none` entries
+    /// as needed. An odd `mark_nr` also updates `last_index` to
+    /// `mark_nr / 2 + 1`.
+    fn set(&mut self, mark_nr: usize, position: usize) {
+        if mark_nr & 1 != 0 {
+            self.last_index = mark_nr as isize / 2 + 1;
+        }
+        if mark_nr >= self.marks.len() {
+            self.marks.resize(mark_nr + 1, Optioned::none());
+        }
+        self.marks[mark_nr] = Optioned::some(position);
+    }
+
+    /// Saves a snapshot of the marks and `last_index` on the stack.
+    fn push(&mut self) {
+        self.marks_stack.push((self.marks.clone(), self.last_index));
+    }
+
+    /// Restores the newest snapshot and removes it. Panics if the stack is empty.
+    fn pop(&mut self) {
+        let (marks, last_index) = self.marks_stack.pop().unwrap();
+        self.marks = marks;
+        self.last_index = last_index;
+    }
+
+    /// Restores the newest snapshot but leaves it on the stack. Panics if the
+    /// stack is empty.
+    fn pop_keep(&mut self) {
+        let (marks, last_index) = self.marks_stack.last().unwrap().clone();
+        self.marks = marks;
+        self.last_index = last_index;
+    }
+
+    /// Drops the newest snapshot without restoring it.
+    fn pop_discard(&mut self) {
+        self.marks_stack.pop();
+    }
+
+    /// Resets to the same contents as `Marks::default()`.
+    fn clear(&mut self) {
+        self.last_index = -1;
+        self.marks.clear();
+        self.marks_stack.clear();
+    }
+}
+
+/// Mutable matcher state: where the attempt started, the capture marks, the
+/// current cursor and the stack of active repeat contexts.
+// NOTE(review): `repeat_stack: Vec,` and the `Request` parameters below have
+// lost their generic arguments in this copy of the patch.
+#[derive(Debug, Default)]
+pub struct State {
+    pub start: usize,
+    pub marks: Marks,
+    pub cursor: StringCursor,
+    repeat_stack: Vec,
+}
+
+impl State {
+    /// Clears marks and repeat contexts and moves the cursor to `start`.
+    pub fn reset(&mut self, req: &Request, start: usize) {
+        self.marks.clear();
+        self.repeat_stack.clear();
+        self.start = start;
+        req.string.adjust_cursor(&mut self.cursor, start);
+    }
+
+    /// Attempts a match anchored at `req.start` and returns whether it
+    /// succeeded. Marks and repeat contexts are not cleared here.
+    pub fn pymatch(&mut self, req: &Request) -> bool {
+        self.start = req.start;
+        req.string.adjust_cursor(&mut self.cursor, self.start);
+
+        let ctx = MatchContext {
+            cursor: self.cursor,
+            code_position: 0,
+            toplevel: true,
+            jump: Jump::OpCode,
+            repeat_ctx_id: usize::MAX,
+            count: -1,
+        };
+        _match(req, self, ctx)
+    }
+
+    /// Scans forward from `req.start` for the first position at which the
+    /// pattern matches; returns `false` when there is none.
+    pub fn search(&mut self, mut req: Request) -> bool {
+        self.start = req.start;
+        req.string.adjust_cursor(&mut self.cursor, self.start);
+
+        if req.start > req.end {
+            return false;
+        }
+
+        let mut end = req.end;
+
+        let mut ctx = MatchContext {
+            cursor: self.cursor,
code_position: 0,
+            toplevel: true,
+            jump: Jump::OpCode,
+            repeat_ctx_id: usize::MAX,
+            count: -1,
+        };
+
+        if ctx.peek_code(&req, 0) == SreOpcode::INFO as u32 {
+            /* optimization info block */
+            /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
+            let min = ctx.peek_code(&req, 3) as usize;
+
+            if ctx.remaining_chars(&req) < min {
+                return false;
+            }
+
+            if min > 1 {
+                /* adjust end point (but make sure we leave at least one
+                character in there, so literal search will work) */
+                // no overflow can happen as remaining chars >= min
+                end -= min - 1;
+
+                // adjust ctx position
+                // NOTE(review): inside this branch `end < position`, so
+                // `end - self.cursor.position` would underflow `usize`.
+                // The `remaining_chars >= min` check above looks like it
+                // makes the branch unreachable — confirm, then clamp or remove.
+                if end < ctx.cursor.position {
+                    let skip = end - self.cursor.position;
+                    S::skip(&mut self.cursor, skip);
+                }
+            }
+
+            let flags = SreInfo::from_bits_truncate(ctx.peek_code(&req, 2));
+
+            if flags.contains(SreInfo::PREFIX) {
+                // NOTE(review): both arms read identically here because the
+                // turbofish arguments were stripped from this copy of the
+                // patch; restore them from the original source.
+                if flags.contains(SreInfo::LITERAL) {
+                    return search_info_literal::(&mut req, self, ctx);
+                } else {
+                    return search_info_literal::(&mut req, self, ctx);
+                }
+            } else if flags.contains(SreInfo::CHARSET) {
+                return search_info_charset(&mut req, self, ctx);
+            }
+            // fallback to general search
+            // skip OP INFO
+            ctx.skip_code_from(&req, 1);
+        }
+
+        // First attempt at the original start position.
+        if _match(&req, self, ctx) {
+            return true;
+        }
+
+        // A pattern that begins with AT BEGINNING / BEGINNING_STRING is not
+        // retried at later positions: park the cursor at `req.end` and give up.
+        if ctx.try_peek_code_as::(&req, 0).unwrap() == SreOpcode::AT
+            && (ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING
+                || ctx.try_peek_code_as::(&req, 1).unwrap()
+                    == SreAtCode::BEGINNING_STRING)
+        {
+            self.cursor.position = req.end;
+            self.cursor.ptr = null();
+            // self.reset(&req, req.end);
+            return false;
+        }
+
+        // General scan: advance the start one position at a time, resetting
+        // the state before each attempt.
+        req.must_advance = false;
+        ctx.toplevel = false;
+        while req.start < end {
+            req.start += 1;
+            self.reset(&req, req.start);
+            ctx.cursor = self.cursor;
+
+            if _match(&req, self, ctx) {
+                return true;
+            }
+        }
+        false
+    }
+}
+
+/// Iterator that runs `State::search` repeatedly over one subject. It yields
+/// `()` per match; the match itself is read from `state` after each `next`.
+pub struct SearchIter<'a, S: StrDrive> {
+    pub req: Request<'a, S>,
+    pub state: State,
+}
+
+impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> {
+    type Item = ();
+
+    // NOTE(review): the return type reads `Option` because its generic
+    // argument was stripped from this copy of the patch.
+    fn next(&mut self) -> Option {
+        if
self.req.start > self.req.end {
+            return None;
+        }
+
+        self.state.reset(&self.req, self.req.start);
+        if !self.state.search(self.req) {
+            return None;
+        }
+
+        // Continue from where this match ended; record whether it was empty
+        // (the cursor did not move past the match start).
+        self.req.must_advance = self.state.cursor.position == self.state.start;
+        self.req.start = self.state.cursor.position;
+
+        Some(())
+    }
+}
+
+/// Resume point stored in a `MatchContext`. `_match` dispatches on
+/// `ctx.jump` at the top of its `'context` loop: `OpCode` does nothing there,
+/// and the other variants continue an opcode's handling using `popped_result`.
+#[derive(Debug, Clone, Copy)]
+enum Jump {
+    OpCode,
+    Assert1,
+    AssertNot1,
+    Branch1,
+    Branch2,
+    Repeat1,
+    UntilBacktrace,
+    MaxUntil2,
+    MaxUntil3,
+    MinUntil1,
+    RepeatOne1,
+    RepeatOne2,
+    MinRepeatOne1,
+    MinRepeatOne2,
+    AtomicGroup1,
+    PossessiveRepeat1,
+    PossessiveRepeat2,
+    PossessiveRepeat3,
+    PossessiveRepeat4,
+}
+
+fn _match(req: &Request, state: &mut State, mut ctx: MatchContext) -> bool {
+    let mut context_stack = vec![];
+    let mut popped_result = false;
+
+    // NOTE: 'result loop is not an actual loop but break label
+    #[allow(clippy::never_loop)]
+    'coro: loop {
+        popped_result = 'result: loop {
+            let yielded = 'context: loop {
+                match ctx.jump {
+                    Jump::OpCode => {}
+                    Jump::Assert1 => {
+                        if popped_result {
+                            ctx.skip_code_from(req, 1);
+                        } else {
+                            break 'result false;
+                        }
+                    }
+                    Jump::AssertNot1 => {
+                        if popped_result {
+                            break 'result false;
+                        }
+                        state.marks.pop();
+                        ctx.skip_code_from(req, 1);
+                    }
+                    Jump::Branch1 => {
+                        let branch_offset = ctx.count as usize;
+                        let next_length = ctx.peek_code(req, branch_offset) as isize;
+                        if next_length == 0 {
+                            state.marks.pop_discard();
+                            break 'result false;
+                        }
+                        state.cursor = ctx.cursor;
+                        let next_ctx = ctx.next_offset(branch_offset + 1, Jump::Branch2);
+                        ctx.count += next_length;
+                        break 'context next_ctx;
+                    }
+                    Jump::Branch2 => {
+                        if popped_result {
+                            break 'result true;
+                        }
+                        state.marks.pop_keep();
+                        ctx.jump = Jump::Branch1;
+                        continue 'context;
+                    }
+                    Jump::Repeat1 => {
+                        state.repeat_stack.pop();
+                        break 'result popped_result;
+                    }
+                    Jump::UntilBacktrace => {
+                        if !popped_result {
+                            state.repeat_stack[ctx.repeat_ctx_id].count -= 1;
+                            state.cursor = ctx.cursor;
+                        }
+                        break 'result popped_result;
+ } + Jump::MaxUntil2 => { + let save_last_position = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + + if popped_result { + state.marks.pop_discard(); + break 'result true; + } + + state.marks.pop(); + repeat_ctx.count -= 1; + state.cursor = ctx.cursor; + + /* cannot match more repeated items here. make sure the + tail matches */ + let mut next_ctx = ctx.next_offset(1, Jump::MaxUntil3); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + Jump::MaxUntil3 => { + if !popped_result { + state.cursor = ctx.cursor; + } + break 'result popped_result; + } + Jump::MinUntil1 => { + if popped_result { + break 'result true; + } + ctx.repeat_ctx_id = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + state.cursor = ctx.cursor; + state.marks.pop(); + + // match more until tail matches + if repeat_ctx.count as usize >= repeat_ctx.max_count + && repeat_ctx.max_count != MAXREPEAT + || state.cursor.position == repeat_ctx.last_position + { + repeat_ctx.count -= 1; + break 'result false; + } + + /* zero-width match protection */ + repeat_ctx.last_position = state.cursor.position; + + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + Jump::RepeatOne1 => { + let min_count = ctx.peek_code(req, 2) as isize; + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::LITERAL as u32 { + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. 
+ let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); + while ctx.at_end(req) || ctx.peek_char::() != c { + if ctx.count <= min_count { + state.marks.pop_discard(); + break 'result false; + } + ctx.back_advance_char::(); + ctx.count -= 1; + } + } + + state.cursor = ctx.cursor; + // General case: backtracking + break 'context ctx.next_peek_from(1, req, Jump::RepeatOne2); + } + Jump::RepeatOne2 => { + if popped_result { + break 'result true; + } + + let min_count = ctx.peek_code(req, 2) as isize; + if ctx.count <= min_count { + state.marks.pop_discard(); + break 'result false; + } + + ctx.back_advance_char::(); + ctx.count -= 1; + + state.marks.pop_keep(); + ctx.jump = Jump::RepeatOne1; + continue 'context; + } + Jump::MinRepeatOne1 => { + let max_count = ctx.peek_code(req, 3) as usize; + if max_count == MAXREPEAT || ctx.count as usize <= max_count { + state.cursor = ctx.cursor; + break 'context ctx.next_peek_from(1, req, Jump::MinRepeatOne2); + } else { + state.marks.pop_discard(); + break 'result false; + } + } + Jump::MinRepeatOne2 => { + if popped_result { + break 'result true; + } + + state.cursor = ctx.cursor; + + let mut count_ctx = ctx; + count_ctx.skip_code(4); + if _count(req, state, &mut count_ctx, 1) == 0 { + state.marks.pop_discard(); + break 'result false; + } + + ctx.advance_char::(); + ctx.count += 1; + state.marks.pop_keep(); + ctx.jump = Jump::MinRepeatOne1; + continue 'context; + } + Jump::AtomicGroup1 => { + if popped_result { + ctx.skip_code_from(req, 1); + ctx.cursor = state.cursor; + // dispatch opcode + } else { + state.cursor = ctx.cursor; + break 'result false; + } + } + Jump::PossessiveRepeat1 => { + let min_count = ctx.peek_code(req, 2) as isize; + if ctx.count < min_count { + break 'context ctx.next_offset(4, Jump::PossessiveRepeat2); + } + // zero match protection + ctx.cursor.position = usize::MAX; + ctx.jump = Jump::PossessiveRepeat3; + continue 'context; + } + Jump::PossessiveRepeat2 => { + if popped_result { + ctx.count 
+= 1; + ctx.jump = Jump::PossessiveRepeat1; + continue 'context; + } else { + state.cursor = ctx.cursor; + break 'result false; + } + } + Jump::PossessiveRepeat3 => { + let max_count = ctx.peek_code(req, 3) as usize; + if ((ctx.count as usize) < max_count || max_count == MAXREPEAT) + && ctx.cursor.position != state.cursor.position + { + state.marks.push(); + ctx.cursor = state.cursor; + break 'context ctx.next_offset(4, Jump::PossessiveRepeat4); + } + ctx.cursor = state.cursor; + ctx.skip_code_from(req, 1); + ctx.skip_code(1); + } + Jump::PossessiveRepeat4 => { + if popped_result { + state.marks.pop_discard(); + ctx.count += 1; + ctx.jump = Jump::PossessiveRepeat3; + continue 'context; + } + state.marks.pop(); + state.cursor = ctx.cursor; + ctx.skip_code_from(req, 1); + ctx.skip_code(1); + } + } + ctx.jump = Jump::OpCode; + + loop { + macro_rules! general_op_literal { + ($f:expr) => {{ + #[allow(clippy::redundant_closure_call)] + if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char::()) { + break 'result false; + } + ctx.skip_code(2); + ctx.advance_char::(); + }}; + } + + macro_rules! general_op_in { + ($f:expr) => {{ + #[allow(clippy::redundant_closure_call)] + if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char::()) + { + break 'result false; + } + ctx.skip_code_from(req, 1); + ctx.advance_char::(); + }}; + } + + macro_rules! 
general_op_groupref { + ($f:expr) => {{ + let (group_start, group_end) = + state.marks.get(ctx.peek_code(req, 1) as usize); + let (group_start, group_end) = if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + (group_start.unpack(), group_end.unpack()) + } else { + break 'result false; + }; + + let mut gctx = MatchContext { + cursor: req.string.create_cursor(group_start), + ..ctx + }; + + for _ in group_start..group_end { + #[allow(clippy::redundant_closure_call)] + if ctx.at_end(req) + || $f(ctx.peek_char::()) != $f(gctx.peek_char::()) + { + break 'result false; + } + ctx.advance_char::(); + gctx.advance_char::(); + } + + ctx.skip_code(2); + }}; + } + + if ctx.remaining_codes(req) == 0 { + break 'result false; + } + let opcode = ctx.peek_code(req, 0); + let opcode = SreOpcode::try_from(opcode).unwrap(); + + match opcode { + SreOpcode::FAILURE => break 'result false, + SreOpcode::SUCCESS => { + if ctx.can_success(req) { + state.cursor = ctx.cursor; + break 'result true; + } + break 'result false; + } + SreOpcode::ANY => { + if ctx.at_end(req) || ctx.at_linebreak(req) { + break 'result false; + } + ctx.skip_code(1); + ctx.advance_char::(); + } + SreOpcode::ANY_ALL => { + if ctx.at_end(req) { + break 'result false; + } + ctx.skip_code(1); + ctx.advance_char::(); + } + /* */ + SreOpcode::ASSERT => { + let back = ctx.peek_code(req, 2) as usize; + if ctx.cursor.position < back { + break 'result false; + } + + let mut next_ctx = ctx.next_offset(3, Jump::Assert1); + next_ctx.toplevel = false; + next_ctx.back_skip_char::(back); + state.cursor = next_ctx.cursor; + break 'context next_ctx; + } + /* */ + SreOpcode::ASSERT_NOT => { + let back = ctx.peek_code(req, 2) as usize; + if ctx.cursor.position < back { + ctx.skip_code_from(req, 1); + continue; + } + state.marks.push(); + + let mut next_ctx = ctx.next_offset(3, Jump::AssertNot1); + next_ctx.toplevel = false; + next_ctx.back_skip_char::(back); + state.cursor = 
next_ctx.cursor; + break 'context next_ctx; + } + SreOpcode::AT => { + let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if at(req, &ctx, atcode) { + ctx.skip_code(2); + } else { + break 'result false; + } + } + // <0=skip> code ... + SreOpcode::BRANCH => { + state.marks.push(); + ctx.count = 1; + ctx.jump = Jump::Branch1; + continue 'context; + } + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if ctx.at_end(req) || !category(catcode, ctx.peek_char::()) { + break 'result false; + } + ctx.skip_code(2); + ctx.advance_char::(); + } + SreOpcode::IN => general_op_in!(charset), + SreOpcode::IN_IGNORE => { + general_op_in!(|set, c| charset(set, lower_ascii(c))) + } + SreOpcode::IN_UNI_IGNORE => { + general_op_in!(|set, c| charset(set, lower_unicode(c))) + } + SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore), + SreOpcode::MARK => { + state + .marks + .set(ctx.peek_code(req, 1) as usize, ctx.cursor.position); + ctx.skip_code(2); + } + SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(req, 1), + /* <1=min> <2=max> item tail */ + SreOpcode::REPEAT => { + let repeat_ctx = RepeatContext { + count: -1, + min_count: ctx.peek_code(req, 2) as usize, + max_count: ctx.peek_code(req, 3) as usize, + code_position: ctx.code_position, + last_position: std::usize::MAX, + prev_id: ctx.repeat_ctx_id, + }; + state.repeat_stack.push(repeat_ctx); + let repeat_ctx_id = state.repeat_stack.len() - 1; + state.cursor = ctx.cursor; + let mut next_ctx = ctx.next_peek_from(1, req, Jump::Repeat1); + next_ctx.repeat_ctx_id = repeat_ctx_id; + break 'context next_ctx; + } + SreOpcode::MAX_UNTIL => { + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + state.cursor = ctx.cursor; + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + + if ((repeat_ctx.count as 
usize) < repeat_ctx.max_count + || repeat_ctx.max_count == MAXREPEAT) + && state.cursor.position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + state.marks.push(); + ctx.count = repeat_ctx.last_position as isize; + repeat_ctx.last_position = state.cursor.position; + + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::MaxUntil2); + } + + /* cannot match more repeated items here. make sure the + tail matches */ + let mut next_ctx = ctx.next_offset(1, Jump::MaxUntil3); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + SreOpcode::MIN_UNTIL => { + let repeat_ctx = state.repeat_stack.last_mut().unwrap(); + state.cursor = ctx.cursor; + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + + state.marks.push(); + ctx.count = ctx.repeat_ctx_id as isize; + let mut next_ctx = ctx.next_offset(1, Jump::MinUntil1); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + /* <1=min> <2=max> item tail */ + SreOpcode::REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; + + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + + state.cursor = ctx.cursor; + + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, &mut count_ctx, max_count); + if count < min_count { + break 'result false; + } + ctx.cursor = count_ctx.cursor; + + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { + // tail is empty. 
we're finished + state.cursor = ctx.cursor; + break 'result true; + } + + state.marks.push(); + ctx.count = count as isize; + ctx.jump = Jump::RepeatOne1; + continue 'context; + } + /* <1=min> <2=max> item tail */ + SreOpcode::MIN_REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + + state.cursor = ctx.cursor; + ctx.count = if min_count == 0 { + 0 + } else { + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, &mut count_ctx, min_count); + if count < min_count { + break 'result false; + } + ctx.cursor = count_ctx.cursor; + count as isize + }; + + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { + // tail is empty. we're finished + state.cursor = ctx.cursor; + break 'result true; + } + + state.marks.push(); + ctx.jump = Jump::MinRepeatOne1; + continue 'context; + } + SreOpcode::LITERAL => general_op_literal!(|code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c), + SreOpcode::LITERAL_IGNORE => { + general_op_literal!(|code, c| code == lower_ascii(c)) + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal!(|code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal!(|code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal!(|code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal!(|code, c| !char_loc_ignore(code, c)) + } + SreOpcode::GROUPREF => general_op_groupref!(|x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref!(lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref!(lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref!(lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let 
(group_start, group_end) = + state.marks.get(ctx.peek_code(req, 1) as usize); + if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + ctx.skip_code(3); + } else { + ctx.skip_code_from(req, 2) + } + } + /* pattern tail */ + SreOpcode::ATOMIC_GROUP => { + state.cursor = ctx.cursor; + break 'context ctx.next_offset(2, Jump::AtomicGroup1); + } + /* <1=min> <2=max> pattern + tail */ + SreOpcode::POSSESSIVE_REPEAT => { + state.cursor = ctx.cursor; + ctx.count = 0; + ctx.jump = Jump::PossessiveRepeat1; + continue 'context; + } + /* <1=min> <2=max> item + tail */ + SreOpcode::POSSESSIVE_REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + state.cursor = ctx.cursor; + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, &mut count_ctx, max_count); + if count < min_count { + break 'result false; + } + ctx.cursor = count_ctx.cursor; + ctx.skip_code_from(req, 1); + } + SreOpcode::CHARSET + | SreOpcode::BIGCHARSET + | SreOpcode::NEGATE + | SreOpcode::RANGE + | SreOpcode::RANGE_UNI_IGNORE + | SreOpcode::SUBPATTERN => { + unreachable!("unexpected opcode on main dispatch") + } + } + } + }; + context_stack.push(ctx); + ctx = yielded; + continue 'coro; + }; + if let Some(popped_ctx) = context_stack.pop() { + ctx = popped_ctx; + } else { + break; + } + } + popped_result +} + +fn search_info_literal( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) -> bool { + /* pattern starts with a known prefix */ + /* */ + let len = ctx.peek_code(req, 5) as usize; + let skip = ctx.peek_code(req, 6) as usize; + let prefix = &ctx.pattern(req)[7..7 + len]; + let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; + + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + + req.must_advance = false; + + if len == 1 { + // 
pattern starts with a literal character + let c = prefix[0]; + + while !ctx.at_end(req) { + // find the next matched literal + while ctx.peek_char::() != c { + ctx.advance_char::(); + if ctx.at_end(req) { + return false; + } + } + + req.start = ctx.cursor.position; + state.start = req.start; + state.cursor = ctx.cursor; + S::skip(&mut state.cursor, skip); + + // literal only + if LITERAL { + return true; + } + + let mut next_ctx = ctx; + next_ctx.skip_char::(skip); + + if _match(req, state, next_ctx) { + return true; + } + + ctx.advance_char::(); + state.marks.clear(); + } + } else { + while !ctx.at_end(req) { + let c = prefix[0]; + while ctx.peek_char::() != c { + ctx.advance_char::(); + if ctx.at_end(req) { + return false; + } + } + ctx.advance_char::(); + if ctx.at_end(req) { + return false; + } + + let mut i = 1; + loop { + if ctx.peek_char::() == prefix[i] { + i += 1; + if i != len { + ctx.advance_char::(); + if ctx.at_end(req) { + return false; + } + continue; + } + + req.start = ctx.cursor.position - (len - 1); + state.reset(req, req.start); + S::skip(&mut state.cursor, skip); + // state.start = req.start; + // state.cursor = req.string.create_cursor(req.start + skip); + + // literal only + if LITERAL { + return true; + } + + let mut next_ctx = ctx; + if skip != 0 { + next_ctx.advance_char::(); + } else { + next_ctx.cursor = state.cursor; + } + + if _match(req, state, next_ctx) { + return true; + } + + ctx.advance_char::(); + if ctx.at_end(req) { + return false; + } + state.marks.clear(); + } + + i = overlap[i] as usize; + if i == 0 { + break; + } + } + } + } + false +} + +fn search_info_charset( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) -> bool { + let set = &ctx.pattern(req)[5..]; + + ctx.skip_code_from(req, 1); + + req.must_advance = false; + + loop { + while !ctx.at_end(req) && !charset(set, ctx.peek_char::()) { + ctx.advance_char::(); + } + if ctx.at_end(req) { + return false; + } + + req.start = ctx.cursor.position; + 
state.start = ctx.cursor.position; + state.cursor = ctx.cursor; + + if _match(req, state, ctx) { + return true; + } + + ctx.advance_char::(); + state.marks.clear(); + } +} + +#[derive(Debug, Clone, Copy)] +struct RepeatContext { + count: isize, + min_count: usize, + max_count: usize, + code_position: usize, + last_position: usize, + prev_id: usize, +} + +#[derive(Clone, Copy)] +struct MatchContext { + cursor: StringCursor, + code_position: usize, + toplevel: bool, + jump: Jump, + repeat_ctx_id: usize, + count: isize, +} + +impl MatchContext { + fn pattern<'a, S>(&self, req: &Request<'a, S>) -> &'a [u32] { + &req.pattern_codes[self.code_position..] + } + + fn remaining_codes(&self, req: &Request) -> usize { + req.pattern_codes.len() - self.code_position + } + + fn remaining_chars(&self, req: &Request) -> usize { + req.end - self.cursor.position + } + + fn peek_char(&self) -> u32 { + S::peek(&self.cursor) + } + + fn skip_char(&mut self, skip: usize) { + S::skip(&mut self.cursor, skip); + } + + fn advance_char(&mut self) -> u32 { + S::advance(&mut self.cursor) + } + + fn back_peek_char(&self) -> u32 { + S::back_peek(&self.cursor) + } + + fn back_skip_char(&mut self, skip: usize) { + S::back_skip(&mut self.cursor, skip); + } + + fn back_advance_char(&mut self) -> u32 { + S::back_advance(&mut self.cursor) + } + + fn peek_code(&self, req: &Request, peek: usize) -> u32 { + req.pattern_codes[self.code_position + peek] + } + + fn try_peek_code_as(&self, req: &Request, peek: usize) -> Result + where + T: TryFrom, + { + self.peek_code(req, peek).try_into() + } + + fn skip_code(&mut self, skip: usize) { + self.code_position += skip; + } + + fn skip_code_from(&mut self, req: &Request, peek: usize) { + self.skip_code(self.peek_code(req, peek) as usize + 1); + } + + fn at_beginning(&self) -> bool { + // self.ctx().string_position == self.state().start + self.cursor.position == 0 + } + + fn at_end(&self, req: &Request) -> bool { + self.cursor.position == req.end + } + + fn 
at_linebreak(&self, req: &Request) -> bool { + !self.at_end(req) && is_linebreak(self.peek_char::()) + } + + fn at_boundary bool>( + &self, + req: &Request, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(req) { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char::()); + let this = !self.at_end(req) && word_checker(self.peek_char::()); + this != that + } + + fn at_non_boundary bool>( + &self, + req: &Request, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(req) { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char::()); + let this = !self.at_end(req) && word_checker(self.peek_char::()); + this == that + } + + fn can_success(&self, req: &Request) -> bool { + if !self.toplevel { + return true; + } + if req.match_all && !self.at_end(req) { + return false; + } + if req.must_advance && self.cursor.position == req.start { + return false; + } + true + } + + #[must_use] + fn next_peek_from(&mut self, peek: usize, req: &Request, jump: Jump) -> Self { + self.next_offset(self.peek_code(req, peek) as usize + 1, jump) + } + + #[must_use] + fn next_offset(&mut self, offset: usize, jump: Jump) -> Self { + self.next_at(self.code_position + offset, jump) + } + + #[must_use] + fn next_at(&mut self, code_position: usize, jump: Jump) -> Self { + self.jump = jump; + MatchContext { + code_position, + jump: Jump::OpCode, + count: -1, + ..*self + } + } +} + +fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { + match atcode { + SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char::()), + SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), + SreAtCode::END => { + (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req) + } + SreAtCode::END_LINE => 
ctx.at_linebreak(req) || ctx.at_end(req), + SreAtCode::END_STRING => ctx.at_end(req), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word), + } +} + +fn char_loc_ignore(code: u32, c: u32) -> bool { + code == c || code == lower_locate(c) || code == upper_locate(c) +} + +/// Case-insensitive set membership for the locale-ignore opcodes: returns `true` when +/// either `lower_locate(c)` or `upper_locate(c)` is a member of `set`. +fn charset_loc_ignore(set: &[u32], c: u32) -> bool { + // Test the lowercased form first (as CPython's `charset_loc_ignore` does); testing the + // raw `c` here would miss an uppercase input against a lowercase-only set. + let lo = lower_locate(c); + if charset(set, lo) { + return true; + } + // Only probe the uppercased form when it differs, to avoid a redundant second scan. + let up = upper_locate(c); + up != lo && charset(set, up) +} + +fn category(catcode: SreCatCode, c: u32) -> bool { + match catcode { + SreCatCode::DIGIT => is_digit(c), + SreCatCode::NOT_DIGIT => !is_digit(c), + SreCatCode::SPACE => is_space(c), + SreCatCode::NOT_SPACE => !is_space(c), + SreCatCode::WORD => is_word(c), + SreCatCode::NOT_WORD => !is_word(c), + SreCatCode::LINEBREAK => is_linebreak(c), + SreCatCode::NOT_LINEBREAK => !is_linebreak(c), + SreCatCode::LOC_WORD => is_loc_word(c), + SreCatCode::LOC_NOT_WORD => !is_loc_word(c), + SreCatCode::UNI_DIGIT => is_uni_digit(c), + SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c), + SreCatCode::UNI_SPACE => is_uni_space(c), + SreCatCode::UNI_NOT_SPACE => !is_uni_space(c), + SreCatCode::UNI_WORD => is_uni_word(c), + SreCatCode::UNI_NOT_WORD => !is_uni_word(c), + SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c), + SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c), + } +} + +fn charset(set: &[u32], ch: u32) -> bool { + /* check if character is a member of the given set */ + let mut ok = true; + let mut i = 0; + while i < set.len() { + let opcode = match SreOpcode::try_from(set[i]) { + Ok(code) => code, + Err(_) => { + break; + } + }; + match opcode { + SreOpcode::FAILURE => { + return !ok; + } + SreOpcode::CATEGORY => { + /* */ + let catcode = match SreCatCode::try_from(set[i + 1]) { + Ok(code) => code, + Err(_) => { 
+ break; + } + }; + if category(catcode, ch) { + return ok; + } + i += 2; + } + SreOpcode::CHARSET => { + /* */ + let set = &set[i + 1..]; + if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) { + return ok; + } + i += 1 + 8; + } + SreOpcode::BIGCHARSET => { + /* <256 blockindices> */ + let count = set[i + 1] as usize; + if ch < 0x10000 { + let set = &set[i + 2..]; + let block_index = ch >> 8; + let (_, blockindices, _) = unsafe { set.align_to::() }; + let blocks = &set[64..]; + let block = blockindices[block_index as usize]; + if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] + & (1u32 << (ch & 31)) + != 0 + { + return ok; + } + } + i += 2 + 64 + count * 8; + } + SreOpcode::LITERAL => { + /* */ + if ch == set[i + 1] { + return ok; + } + i += 2; + } + SreOpcode::NEGATE => { + ok = !ok; + i += 1; + } + SreOpcode::RANGE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + SreOpcode::RANGE_UNI_IGNORE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + let ch = upper_unicode(ch); + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + _ => { + break; + } + } + } + /* internal error -- there's not much we can do about it + here, so let's just pretend it didn't match... 
*/ + false +} + +fn _count( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, + max_count: usize, +) -> usize { + let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); + let end = ctx.cursor.position + max_count; + let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); + + match opcode { + SreOpcode::ANY => { + while ctx.cursor.position < end && !ctx.at_linebreak(req) { + ctx.advance_char::(); + } + } + SreOpcode::ANY_ALL => { + ctx.skip_char::(max_count); + } + SreOpcode::IN => { + while ctx.cursor.position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char::()) + { + ctx.advance_char::(); + } + } + SreOpcode::LITERAL => { + general_count_literal(req, ctx, end, |code, c| code == c); + } + SreOpcode::NOT_LITERAL => { + general_count_literal(req, ctx, end, |code, c| code != c); + } + SreOpcode::LITERAL_IGNORE => { + general_count_literal(req, ctx, end, |code, c| code == lower_ascii(c)); + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_count_literal(req, ctx, end, |code, c| code != lower_ascii(c)); + } + SreOpcode::LITERAL_LOC_IGNORE => { + general_count_literal(req, ctx, end, char_loc_ignore); + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c)); + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_count_literal(req, ctx, end, |code, c| code == lower_unicode(c)); + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_count_literal(req, ctx, end, |code, c| code != lower_unicode(c)); + } + _ => { + /* General case */ + ctx.toplevel = false; + ctx.jump = Jump::OpCode; + ctx.repeat_ctx_id = usize::MAX; + ctx.count = -1; + + let mut sub_state = State { + marks: Marks::default(), + repeat_stack: vec![], + ..*state + }; + + while ctx.cursor.position < end && _match(req, &mut sub_state, *ctx) { + ctx.advance_char::(); + } + } + } + + // TODO: return offset + ctx.cursor.position - state.cursor.position +} + +fn general_count_literal bool>( + req: &Request, + ctx: 
&mut MatchContext, + end: usize, + mut f: F, +) { + let ch = ctx.peek_code(req, 1); + while ctx.cursor.position < end && f(ch, ctx.peek_char::()) { + ctx.advance_char::(); + } +} diff --git a/vm/sre_engine/src/lib.rs b/vm/sre_engine/src/lib.rs new file mode 100644 index 0000000000..fd9f367dc6 --- /dev/null +++ b/vm/sre_engine/src/lib.rs @@ -0,0 +1,19 @@ +pub mod constants; +pub mod engine; +pub mod string; + +pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC}; +pub use engine::{Request, SearchIter, State}; +pub use string::{StrDrive, StringCursor}; + +pub const CODESIZE: usize = 4; + +#[cfg(target_pointer_width = "32")] +pub const MAXREPEAT: usize = usize::MAX - 1; +#[cfg(target_pointer_width = "64")] +pub const MAXREPEAT: usize = u32::MAX as usize; + +#[cfg(target_pointer_width = "32")] +pub const MAXGROUPS: usize = MAXREPEAT / 4 / 2; +#[cfg(target_pointer_width = "64")] +pub const MAXGROUPS: usize = MAXREPEAT / 2; diff --git a/vm/sre_engine/src/string.rs b/vm/sre_engine/src/string.rs new file mode 100644 index 0000000000..e3f14ef019 --- /dev/null +++ b/vm/sre_engine/src/string.rs @@ -0,0 +1,398 @@ +#[derive(Debug, Clone, Copy)] +pub struct StringCursor { + pub(crate) ptr: *const u8, + pub position: usize, +} + +impl Default for StringCursor { + fn default() -> Self { + Self { + ptr: std::ptr::null(), + position: 0, + } + } +} + +pub trait StrDrive: Copy { + fn count(&self) -> usize; + fn create_cursor(&self, n: usize) -> StringCursor; + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize); + fn advance(cursor: &mut StringCursor) -> u32; + fn peek(cursor: &StringCursor) -> u32; + fn skip(cursor: &mut StringCursor, n: usize); + fn back_advance(cursor: &mut StringCursor) -> u32; + fn back_peek(cursor: &StringCursor) -> u32; + fn back_skip(cursor: &mut StringCursor, n: usize); +} + +impl<'a> StrDrive for &'a [u8] { + #[inline] + fn count(&self) -> usize { + self.len() + } + + #[inline] + fn create_cursor(&self, n: usize) -> 
StringCursor { + StringCursor { + ptr: self[n..].as_ptr(), + position: n, + } + } + + #[inline] + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { + cursor.position = n; + cursor.ptr = self[n..].as_ptr(); + } + + #[inline] + fn advance(cursor: &mut StringCursor) -> u32 { + cursor.position += 1; + unsafe { cursor.ptr = cursor.ptr.add(1) }; + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn peek(cursor: &StringCursor) -> u32 { + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn skip(cursor: &mut StringCursor, n: usize) { + cursor.position += n; + unsafe { cursor.ptr = cursor.ptr.add(n) }; + } + + #[inline] + fn back_advance(cursor: &mut StringCursor) -> u32 { + cursor.position -= 1; + unsafe { cursor.ptr = cursor.ptr.sub(1) }; + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn back_peek(cursor: &StringCursor) -> u32 { + unsafe { *cursor.ptr.offset(-1) as u32 } + } + + #[inline] + fn back_skip(cursor: &mut StringCursor, n: usize) { + cursor.position -= n; + unsafe { cursor.ptr = cursor.ptr.sub(n) }; + } +} + +impl StrDrive for &str { + #[inline] + fn count(&self) -> usize { + self.chars().count() + } + + #[inline] + fn create_cursor(&self, n: usize) -> StringCursor { + let mut cursor = StringCursor { + ptr: self.as_ptr(), + position: 0, + }; + Self::skip(&mut cursor, n); + cursor + } + + #[inline] + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { + if cursor.ptr.is_null() || cursor.position > n { + *cursor = Self::create_cursor(self, n); + } else if cursor.position < n { + Self::skip(cursor, n - cursor.position); + } + } + + #[inline] + fn advance(cursor: &mut StringCursor) -> u32 { + cursor.position += 1; + unsafe { next_code_point(&mut cursor.ptr) } + } + + #[inline] + fn peek(cursor: &StringCursor) -> u32 { + let mut ptr = cursor.ptr; + unsafe { next_code_point(&mut ptr) } + } + + #[inline] + fn skip(cursor: &mut StringCursor, n: usize) { + cursor.position += n; + for _ in 0..n { + unsafe { next_code_point(&mut cursor.ptr) }; 
+ } + } + + #[inline] + fn back_advance(cursor: &mut StringCursor) -> u32 { + cursor.position -= 1; + unsafe { next_code_point_reverse(&mut cursor.ptr) } + } + + #[inline] + fn back_peek(cursor: &StringCursor) -> u32 { + let mut ptr = cursor.ptr; + unsafe { next_code_point_reverse(&mut ptr) } + } + + #[inline] + fn back_skip(cursor: &mut StringCursor, n: usize) { + cursor.position -= n; + for _ in 0..n { + unsafe { next_code_point_reverse(&mut cursor.ptr) }; + } + } +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[inline] +unsafe fn next_code_point(ptr: &mut *const u8) -> u32 { + // Decode UTF-8 + let x = **ptr; + *ptr = ptr.offset(1); + + if x < 128 { + return x as u32; + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let y = **ptr; + *ptr = ptr.offset(1); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = **ptr; + *ptr = ptr.offset(1); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let w = **ptr; + *ptr = ptr.offset(1); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + ch +} + +/// Reads the last code point out of a byte iterator (assuming a +/// UTF-8-like encoding). 
+/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[inline] +unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 { + // Decode UTF-8 + *ptr = ptr.offset(-1); + let w = match **ptr { + next_byte if next_byte < 128 => return next_byte as u32, + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let z = **ptr; + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let y = **ptr; + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let x = **ptr; + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + ch +} + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. +#[inline] +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7F >> width)) as u32 +} + +/// Returns the value of `ch` updated with continuation byte `byte`. +#[inline] +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the +/// bits `10`). +#[inline] +const fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} + +/// Mask of the value bits of a continuation byte. 
+const CONT_MASK: u8 = 0b0011_1111; + +const fn is_py_ascii_whitespace(b: u8) -> bool { + matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') +} + +#[inline] +pub(crate) fn is_word(ch: u32) -> bool { + ch == '_' as u32 + || u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_space(ch: u32) -> bool { + u8::try_from(ch) + .map(is_py_ascii_whitespace) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_digit(ch: u32) -> bool { + u8::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_loc_alnum(ch: u32) -> bool { + // FIXME: Ignore the locales + u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_loc_word(ch: u32) -> bool { + ch == '_' as u32 || is_loc_alnum(ch) +} +#[inline] +pub(crate) fn is_linebreak(ch: u32) -> bool { + ch == '\n' as u32 +} +#[inline] +pub fn lower_ascii(ch: u32) -> u32 { + u8::try_from(ch) + .map(|x| x.to_ascii_lowercase() as u32) + .unwrap_or(ch) +} +#[inline] +pub(crate) fn lower_locate(ch: u32) -> u32 { + // FIXME: Ignore the locales + lower_ascii(ch) +} +#[inline] +pub(crate) fn upper_locate(ch: u32) -> u32 { + // FIXME: Ignore the locales + u8::try_from(ch) + .map(|x| x.to_ascii_uppercase() as u32) + .unwrap_or(ch) +} +#[inline] +pub(crate) fn is_uni_digit(ch: u32) -> bool { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_uni_space(ch: u32) -> bool { + // TODO: check with cpython + is_space(ch) + || matches!( + ch, + 0x0009 + | 0x000A + | 0x000B + | 0x000C + | 0x000D + | 0x001C + | 0x001D + | 0x001E + | 0x001F + | 0x0020 + | 0x0085 + | 0x00A0 + | 0x1680 + | 0x2000 + | 0x2001 + | 0x2002 + | 0x2003 + | 0x2004 + | 0x2005 + | 0x2006 + | 0x2007 + | 0x2008 + | 0x2009 + | 0x200A + | 0x2028 + | 0x2029 + | 0x202F + | 0x205F + | 0x3000 + ) +} +#[inline] +pub(crate) fn is_uni_linebreak(ch: u32) -> bool { + 
matches!( + ch, + 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 + ) +} +#[inline] +pub(crate) fn is_uni_alnum(ch: u32) -> bool { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.is_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_uni_word(ch: u32) -> bool { + ch == '_' as u32 || is_uni_alnum(ch) +} +#[inline] +pub fn lower_unicode(ch: u32) -> u32 { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.to_lowercase().next().unwrap() as u32) + .unwrap_or(ch) +} +#[inline] +pub fn upper_unicode(ch: u32) -> u32 { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.to_uppercase().next().unwrap() as u32) + .unwrap_or(ch) +} diff --git a/vm/sre_engine/tests/tests.rs b/vm/sre_engine/tests/tests.rs new file mode 100644 index 0000000000..53494c5e3d --- /dev/null +++ b/vm/sre_engine/tests/tests.rs @@ -0,0 +1,181 @@ +use rustpython_sre_engine::{Request, State, StrDrive}; + +struct Pattern { + code: &'static [u32], +} + +impl Pattern { + fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) { + let req = Request::new(string, 0, usize::MAX, self.code, false); + let state = State::default(); + (req, state) + } +} + +#[test] +fn test_2427() { + // pattern lookbehind = re.compile(r'(?x)++x') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 28, 8, 1, 4294967295, 27, 4, 16, 120, 1, 1, 16, 120, 1] }; + // END GENERATED + let (req, mut state) = p.state("xxx"); + assert!(!state.pymatch(&req)); +} + +#[test] +fn test_bug_20998() { + // pattern p = re.compile('[a-c]+', re.I) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 24, 10, 1, 4294967295, 39, 5, 22, 97, 99, 0, 1, 1] }; + // END GENERATED + let (mut req, mut state) = p.state("ABC"); + req.match_all = true; + assert!(state.pymatch(&req)); + assert_eq!(state.cursor.position, 3); +} + +#[test] +fn 
test_bigcharset() { + // pattern p = re.compile('[a-z]*', re.I) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 24, 97, 0, 4294967295, 39, 92, 10, 3, 33685760, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 0, 0, 0, 134217726, 0, 0, 0, 0, 0, 131072, 0, 2147483648, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("x "); + assert!(state.pymatch(&req)); + assert_eq!(state.cursor.position, 1); +} + +#[test] +fn test_search_nonascii() { + // pattern p = re.compile('\xe0+') +}