From c2ee9ca3e02810c76f82bfc32a6643f2b0af0b84 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 30 Sep 2020 21:20:13 -0500 Subject: [PATCH 01/95] WIP - native _sre --- constants.rs | 114 ++++++++++++++++++++++++++++++++++++++++++++++ interp.rs | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 constants.rs create mode 100644 interp.rs diff --git a/constants.rs b/constants.rs new file mode 100644 index 0000000000..f6aeb3182f --- /dev/null +++ b/constants.rs @@ -0,0 +1,114 @@ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by sre_constants.py. If you need + * to change anything in here, edit sre_constants.py and run it. + * + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +use bitflags::bitflags; + +pub const SRE_MAGIC: usize = 20140917; +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreOpcode { + FAILURE = 0, + SUCCESS = 1, + ANY = 2, + ANY_ALL = 3, + ASSERT = 4, + ASSERT_NOT = 5, + AT = 6, + BRANCH = 7, + CALL = 8, + CATEGORY = 9, + CHARSET = 10, + BIGCHARSET = 11, + GROUPREF = 12, + GROUPREF_EXISTS = 13, + GROUPREF_IGNORE = 14, + IN = 15, + IN_IGNORE = 16, + INFO = 17, + JUMP = 18, + LITERAL = 19, + LITERAL_IGNORE = 20, + MARK = 21, + MAX_UNTIL = 22, + MIN_UNTIL = 23, + NOT_LITERAL = 24, + NOT_LITERAL_IGNORE = 25, + NEGATE = 26, + RANGE = 27, + REPEAT = 28, + REPEAT_ONE = 29, + SUBPATTERN = 30, + MIN_REPEAT_ONE = 31, + RANGE_IGNORE = 32, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreAtCode { + BEGINNING = 0, + BEGINNING_LINE = 1, + BEGINNING_STRING = 2, + BOUNDARY = 3, + NON_BOUNDARY = 4, + END = 5, + END_LINE = 6, + END_STRING = 7, + LOC_BOUNDARY = 8, + LOC_NON_BOUNDARY = 9, + UNI_BOUNDARY = 10, + UNI_NON_BOUNDARY = 11, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreCatCode { + DIGIT = 0, + NOT_DIGIT = 1, + SPACE = 2, + NOT_SPACE = 3, + WORD = 4, + NOT_WORD = 5, + LINEBREAK = 6, + NOT_LINEBREAK = 7, + LOC_WORD = 8, + LOC_NOT_WORD = 9, + UNI_DIGIT = 10, + UNI_NOT_DIGIT = 11, + UNI_SPACE = 12, + UNI_NOT_SPACE = 13, + UNI_WORD = 14, + UNI_NOT_WORD = 15, + UNI_LINEBREAK = 16, + UNI_NOT_LINEBREAK = 17, +} +bitflags! { + pub struct SreFlag: u16 { + const TEMPLATE = 1; + const IGNORECASE = 2; + const LOCALE = 4; + const MULTILINE = 8; + const DOTALL = 16; + const UNICODE = 32; + const VERBOSE = 64; + const DEBUG = 128; + const ASCII = 256; + } +} +bitflags! { + pub struct SreInfo: u32 { + const PREFIX = 1; + const LITERAL = 2; + const CHARSET = 4; + } +} diff --git a/interp.rs b/interp.rs new file mode 100644 index 0000000000..7f93a82eb4 --- /dev/null +++ b/interp.rs @@ -0,0 +1,126 @@ +// good luck to those that follow; here be dragons + +use crate::builtins::PyStrRef; + +use super::constants::{SreFlag, SreOpcode}; + +use std::convert::TryFrom; +use std::{iter, slice}; + +pub struct State { + start: usize, + s_pos: usize, + end: usize, + pos: usize, + flags: SreFlag, + marks: Vec, + lastindex: isize, + marks_stack: Vec, + context_stack: Vec, + repeat: Option, + s: PyStrRef, +} + +// struct State1<'a> { +// state: &'a mut State, +// } + +struct MatchContext { + s_pos: usize, + code_pos: usize, +} + +// struct Context<'a> { +// context_stack: &mut Vec, +// } + +impl State { + pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { + let end = std::cmp::min(end, s.char_len()); + Self { + start, + s_pos: start, + end, + pos: start, + flags, + marks: Vec::new(), + lastindex: -1, + marks_stack: Vec::new(), + context_stack: Vec::new(), + repeat: None, + s, + } + } +} + +// struct OpcodeDispatcher { +// executing_contexts: HashMap>, +// } + +pub struct BadSreCode; + +pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { + let mut it = code.iter().copied(); + std::iter::from_fn(move || -> Option> { + let op = it.next()?; + let op = SreOpcode::try_from(op) + .ok() + .and_then(|op| extract_code(op, &mut it)); + Some(op) + }) + .map(|x| x.ok_or(BadSreCode)) +} + +type It<'a> = iter::Copied>; +fn extract_code(op: SreOpcode, it: &mut It) -> Option { + let skip = |it: &mut It| { + let skip = it.next()? as usize; + if skip > it.len() { + None + } else { + Some(skip) + } + }; + match op { + SreOpcode::FAILURE => {} + SreOpcode::SUCCESS => {} + SreOpcode::ANY => {} + SreOpcode::ANY_ALL => {} + SreOpcode::ASSERT => {} + SreOpcode::ASSERT_NOT => {} + SreOpcode::AT => {} + SreOpcode::BRANCH => {} + SreOpcode::CALL => {} + SreOpcode::CATEGORY => {} + SreOpcode::CHARSET => {} + SreOpcode::BIGCHARSET => {} + SreOpcode::GROUPREF => {} + SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::GROUPREF_IGNORE => {} + SreOpcode::IN => {} + SreOpcode::IN_IGNORE => {} + SreOpcode::INFO => { + // let skip = it.next()?; + } + SreOpcode::JUMP => {} + SreOpcode::LITERAL => {} + SreOpcode::LITERAL_IGNORE => {} + SreOpcode::MARK => {} + SreOpcode::MAX_UNTIL => {} + SreOpcode::MIN_UNTIL => {} + SreOpcode::NOT_LITERAL => {} + SreOpcode::NOT_LITERAL_IGNORE => {} + SreOpcode::NEGATE => {} + SreOpcode::RANGE => {} + SreOpcode::REPEAT => {} + SreOpcode::REPEAT_ONE => {} + SreOpcode::SUBPATTERN => {} + SreOpcode::MIN_REPEAT_ONE => {} + SreOpcode::RANGE_IGNORE => {} + } + todo!() +} + +pub enum Op { + Info {}, +} From e1362ead3c8c7462b0027a7ecb4b9bbd23e76b5a Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 1 Dec 2020 17:51:11 +0200 Subject: [PATCH 02/95] WIP structure --- constants.rs | 4 + interp.rs | 467 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 379 insertions(+), 92 deletions(-) diff --git a/constants.rs b/constants.rs index f6aeb3182f..a80534d70b 100644 --- a/constants.rs +++ b/constants.rs @@ -14,6 +14,10 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20140917; +pub const SRE_CODESIZE: usize = 4; +pub const SRE_MAXREPEAT: usize = usize::max_value(); +pub const SRE_MAXGROUPS: usize = usize::max_value() / std::mem::size_of::() / 2; + #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types)] diff --git a/interp.rs b/interp.rs index 7f93a82eb4..43c2a54515 100644 --- a/interp.rs +++ b/interp.rs @@ -1,126 +1,409 @@ // good luck to those that follow; here be dragons +use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT}; use crate::builtins::PyStrRef; - -use super::constants::{SreFlag, SreOpcode}; - +use rustpython_common::borrow::BorrowValue; +use std::collections::HashMap; use std::convert::TryFrom; -use std::{iter, slice}; -pub struct State { +pub struct State<'a> { + // py_string: PyStrRef, + string: &'a str, start: usize, - s_pos: usize, end: usize, - pos: usize, flags: SreFlag, + pattern_codes: Vec, marks: Vec, lastindex: isize, marks_stack: Vec, context_stack: Vec, repeat: Option, - s: PyStrRef, + string_position: usize, } -// struct State1<'a> { -// state: &'a mut State, -// } - -struct MatchContext { - s_pos: usize, - code_pos: usize, -} - -// struct Context<'a> { -// context_stack: &mut Vec, -// } - -impl State { - pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { - let end = std::cmp::min(end, s.char_len()); +impl<'a> State<'a> { + pub(crate) fn new( + // py_string: PyStrRef, + string: &'a str, + start: usize, + end: usize, + flags: SreFlag, + pattern_codes: Vec, + ) -> Self { + // let string = py_string.borrow_value(); Self { + // py_string, + string, start, - s_pos: start, end, - pos: start, flags, - marks: Vec::new(), + pattern_codes, lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat: None, - s, + marks: Vec::new(), + string_position: start, } } + + fn reset(&mut self) { + self.marks.clear(); + self.lastindex = -1; + self.marks_stack.clear(); + self.context_stack.clear(); + self.repeat = None; + } } -// struct OpcodeDispatcher { -// executing_contexts: HashMap>, -// } +pub(crate) fn pymatch(mut state: State) -> bool { + let ctx = MatchContext { + string_position: state.start, + code_position: 0, + has_matched: None, + }; + state.context_stack.push(ctx); -pub struct BadSreCode; + let mut has_matched = None; + loop { + if state.context_stack.is_empty() { + break; + } + let ctx_id = state.context_stack.len() - 1; + let mut drive = MatchContextDrive::drive(ctx_id, state); + let mut dispatcher = OpcodeDispatcher::new(); -pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { - let mut it = code.iter().copied(); - std::iter::from_fn(move || -> Option> { - let op = it.next()?; - let op = SreOpcode::try_from(op) - .ok() - .and_then(|op| extract_code(op, &mut it)); - Some(op) - }) - .map(|x| x.ok_or(BadSreCode)) + has_matched = dispatcher.pymatch(&mut drive); + state = drive.take(); + if has_matched.is_some() { + state.context_stack.pop(); + } + } + has_matched.unwrap_or(false) } -type It<'a> = iter::Copied>; -fn extract_code(op: SreOpcode, it: &mut It) -> Option { - let skip = |it: &mut It| { - let skip = it.next()? as usize; - if skip > it.len() { - None - } else { - Some(skip) +#[derive(Debug, Copy, Clone)] +struct MatchContext { + string_position: usize, + code_position: usize, + has_matched: Option, +} + +struct MatchContextDrive<'a> { + state: State<'a>, + ctx_id: usize, +} + +impl<'a> MatchContextDrive<'a> { + fn id(&self) -> usize { + self.ctx_id + } + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.state.context_stack[self.ctx_id] + } + fn ctx(&self) -> &MatchContext { + &self.state.context_stack[self.ctx_id] + } + fn push_new_context(&mut self, pattern_offset: usize) -> usize { + let ctx = self.ctx(); + let child_ctx = MatchContext { + string_position: ctx.string_position, + code_position: ctx.code_position + pattern_offset, + has_matched: None, + }; + self.state.context_stack.push(child_ctx); + self.state.context_stack.len() - 1 + } + fn drive(ctx_id: usize, state: State<'a>) -> Self { + Self { state, ctx_id } + } + fn take(self) -> State<'a> { + self.state + } + fn str(&self) -> &str { + unsafe { + std::str::from_utf8_unchecked( + &self.state.string.as_bytes()[self.ctx().string_position..], + ) + } + } + fn peek_char(&self) -> char { + self.str().chars().next().unwrap() + } + fn peek_code(&self, peek: usize) -> u32 { + self.state.pattern_codes[self.ctx().code_position + peek] + } + fn skip_char(&mut self, skip_count: usize) { + let skipped = self.str().char_indices().nth(skip_count).unwrap().0; + self.ctx_mut().string_position += skipped; + } + fn skip_code(&mut self, skip_count: usize) { + self.ctx_mut().code_position += skip_count; + } + fn remaining_chars(&self) -> usize { + let end = self.state.end; + end - self.ctx().string_position + self.str().len() + } + fn remaining_codes(&self) -> usize { + self.state.pattern_codes.len() - self.ctx().code_position + } + fn at_beginning(&self) -> bool { + self.ctx().string_position == 0 + } + fn at_end(&self) -> bool { + self.str().is_empty() + } + fn at_linebreak(&self) -> bool { + match self.str().chars().next() { + Some(c) => c == '\n', + None => false, } + } +} + +struct OpcodeDispatcher { + executing_contexts: HashMap>, +} + +macro_rules! once { + ($val:expr) => { + Box::new(OpEmpty {}) }; - match op { - SreOpcode::FAILURE => {} - SreOpcode::SUCCESS => {} - SreOpcode::ANY => {} - SreOpcode::ANY_ALL => {} - SreOpcode::ASSERT => {} - SreOpcode::ASSERT_NOT => {} - SreOpcode::AT => {} - SreOpcode::BRANCH => {} - SreOpcode::CALL => {} - SreOpcode::CATEGORY => {} - SreOpcode::CHARSET => {} - SreOpcode::BIGCHARSET => {} - SreOpcode::GROUPREF => {} - SreOpcode::GROUPREF_EXISTS => {} - SreOpcode::GROUPREF_IGNORE => {} - SreOpcode::IN => {} - SreOpcode::IN_IGNORE => {} - SreOpcode::INFO => { - // let skip = it.next()?; - } - SreOpcode::JUMP => {} - SreOpcode::LITERAL => {} - SreOpcode::LITERAL_IGNORE => {} - SreOpcode::MARK => {} - SreOpcode::MAX_UNTIL => {} - SreOpcode::MIN_UNTIL => {} - SreOpcode::NOT_LITERAL => {} - SreOpcode::NOT_LITERAL_IGNORE => {} - SreOpcode::NEGATE => {} - SreOpcode::RANGE => {} - SreOpcode::REPEAT => {} - SreOpcode::REPEAT_ONE => {} - SreOpcode::SUBPATTERN => {} - SreOpcode::MIN_REPEAT_ONE => {} - SreOpcode::RANGE_IGNORE => {} - } - todo!() -} - -pub enum Op { - Info {}, +} + +trait OpcodeExecutor { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; +} + +struct OpFailure {} +impl OpcodeExecutor for OpFailure { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + drive.ctx_mut().has_matched = Some(false); + None + } +} + +struct OpEmpty {} +impl OpcodeExecutor for OpEmpty { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + None + } +} + +struct OpOnce { + f: Option, +} +impl OpcodeExecutor for OpOnce { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + let f = self.f.take()?; + f(drive); + None + } +} +fn once(f: F) -> Box> { + Box::new(OpOnce { f: Some(f) }) +} + +struct OpMinRepeatOne { + trace_id: usize, + mincount: usize, + maxcount: usize, + count: usize, + child_ctx_id: usize, +} +impl OpcodeExecutor for OpMinRepeatOne { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + match self.trace_id { + 0 => self._0(drive), + _ => unreachable!(), + } + } +} +impl Default for OpMinRepeatOne { + fn default() -> Self { + OpMinRepeatOne { + trace_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpMinRepeatOne { + fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + drive.state.string_position = drive.ctx().string_position; + + self.count = if self.mincount == 0 { + 0 + } else { + let count = count_repetitions(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; + + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + // mark push + self.trace_id = 1; + self._1(drive) + } + fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.trace_id = 2; + return Some(()); + } + + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } + fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count_repetitions(drive, 1) == 0 { + self.trace_id = 3; + return self._3(drive); + } + drive.skip_char(1); + self.count += 1; + // marks pop keep + self.trace_id = 1; + self._1(drive) + } + fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } +} + +impl OpcodeDispatcher { + fn new() -> Self { + Self { + executing_contexts: HashMap::new(), + } + } + // Returns True if the current context matches, False if it doesn't and + // None if matching is not finished, ie must be resumed after child + // contexts have been matched. + fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option { + while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { + let code = drive.peek_code(0); + let opcode = SreOpcode::try_from(code).unwrap(); + self.dispatch(opcode, drive); + // self.drive = self.drive; + } + match drive.ctx().has_matched { + Some(matched) => Some(matched), + None => { + drive.ctx_mut().has_matched = Some(false); + Some(false) + } + } + } + + // Dispatches a context on a given opcode. Returns True if the context + // is done matching, False if it must be resumed when next encountered. + fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { + let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { + Some((_, mut executor)) => executor, + None => self.dispatch_table(opcode, drive), + }; + if let Some(()) = executor.next(drive) { + self.executing_contexts.insert(drive.id(), executor); + false + } else { + true + } + } + + fn dispatch_table( + &mut self, + opcode: SreOpcode, + drive: &mut MatchContextDrive, + ) -> Box { + // move || { + match opcode { + SreOpcode::FAILURE => { + Box::new(OpFailure {}) + } + SreOpcode::SUCCESS => once(|drive| { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + }), + SreOpcode::ANY => once!(true), + SreOpcode::ANY_ALL => once!(true), + SreOpcode::ASSERT => once!(true), + SreOpcode::ASSERT_NOT => once!(true), + SreOpcode::AT => once!(true), + SreOpcode::BRANCH => once!(true), + SreOpcode::CALL => once!(true), + SreOpcode::CATEGORY => once!(true), + SreOpcode::CHARSET => once!(true), + SreOpcode::BIGCHARSET => once!(true), + SreOpcode::GROUPREF => once!(true), + SreOpcode::GROUPREF_EXISTS => once!(true), + SreOpcode::GROUPREF_IGNORE => once!(true), + SreOpcode::IN => once!(true), + SreOpcode::IN_IGNORE => once!(true), + SreOpcode::INFO => once!(true), + SreOpcode::JUMP => once!(true), + SreOpcode::LITERAL => { + if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_char(1); + } + drive.skip_code(2); + once!(true) + } + SreOpcode::LITERAL_IGNORE => once!(true), + SreOpcode::MARK => once!(true), + SreOpcode::MAX_UNTIL => once!(true), + SreOpcode::MIN_UNTIL => once!(true), + SreOpcode::NOT_LITERAL => once!(true), + SreOpcode::NOT_LITERAL_IGNORE => once!(true), + SreOpcode::NEGATE => once!(true), + SreOpcode::RANGE => once!(true), + SreOpcode::REPEAT => once!(true), + SreOpcode::REPEAT_ONE => once!(true), + SreOpcode::SUBPATTERN => once!(true), + SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), + SreOpcode::RANGE_IGNORE => once!(true), + } + } +} + +// Returns the number of repetitions of a single item, starting from the +// current string position. The code pointer is expected to point to a +// REPEAT_ONE operation (with the repeated 4 ahead). +fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize { + let mut count = 0; + let mut real_maxcount = drive.state.end - drive.ctx().string_position; + if maxcount < real_maxcount && maxcount != SRE_MAXREPEAT { + real_maxcount = maxcount; + } + count } From 82922bf0d796f6c79f5799a1b14b9d1ad9c26431 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 23 Dec 2020 09:01:50 +0200 Subject: [PATCH 03/95] upgrade re version; implement helper functions; --- constants.rs | 52 ++-- interp.rs | 717 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 588 insertions(+), 181 deletions(-) diff --git a/constants.rs b/constants.rs index a80534d70b..f5ab92c531 100644 --- a/constants.rs +++ b/constants.rs @@ -13,11 +13,7 @@ use bitflags::bitflags; -pub const SRE_MAGIC: usize = 20140917; -pub const SRE_CODESIZE: usize = 4; -pub const SRE_MAXREPEAT: usize = usize::max_value(); -pub const SRE_MAXGROUPS: usize = usize::max_value() / std::mem::size_of::() / 2; - +pub const SRE_MAGIC: usize = 20171005; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types)] @@ -36,25 +32,33 @@ pub enum SreOpcode { BIGCHARSET = 11, GROUPREF = 12, GROUPREF_EXISTS = 13, - GROUPREF_IGNORE = 14, - IN = 15, - IN_IGNORE = 16, - INFO = 17, - JUMP = 18, - LITERAL = 19, - LITERAL_IGNORE = 20, - MARK = 21, - MAX_UNTIL = 22, - MIN_UNTIL = 23, - NOT_LITERAL = 24, - NOT_LITERAL_IGNORE = 25, - NEGATE = 26, - RANGE = 27, - REPEAT = 28, - REPEAT_ONE = 29, - SUBPATTERN = 30, - MIN_REPEAT_ONE = 31, - RANGE_IGNORE = 32, + IN = 14, + INFO = 15, + JUMP = 16, + LITERAL = 17, + MARK = 18, + MAX_UNTIL = 19, + MIN_UNTIL = 20, + NOT_LITERAL = 21, + NEGATE = 22, + RANGE = 23, + REPEAT = 24, + REPEAT_ONE = 25, + SUBPATTERN = 26, + MIN_REPEAT_ONE = 27, + GROUPREF_IGNORE = 28, + IN_IGNORE = 29, + LITERAL_IGNORE = 30, + NOT_LITERAL_IGNORE = 31, + GROUPREF_LOC_IGNORE = 32, + IN_LOC_IGNORE = 33, + LITERAL_LOC_IGNORE = 34, + NOT_LITERAL_LOC_IGNORE = 35, + GROUPREF_UNI_IGNORE = 36, + IN_UNI_IGNORE = 37, + LITERAL_UNI_IGNORE = 38, + NOT_LITERAL_UNI_IGNORE = 39, + RANGE_UNI_IGNORE = 40, } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] diff --git a/interp.rs b/interp.rs index 43c2a54515..bc6753a5fe 100644 --- a/interp.rs +++ b/interp.rs @@ -1,14 +1,14 @@ // good luck to those that follow; here be dragons -use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT}; -use crate::builtins::PyStrRef; -use rustpython_common::borrow::BorrowValue; +use super::_sre::MAXREPEAT; +use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use std::collections::HashMap; use std::convert::TryFrom; pub struct State<'a> { - // py_string: PyStrRef, string: &'a str, + // chars count + string_len: usize, start: usize, end: usize, flags: SreFlag, @@ -23,17 +23,18 @@ pub struct State<'a> { impl<'a> State<'a> { pub(crate) fn new( - // py_string: PyStrRef, string: &'a str, start: usize, end: usize, flags: SreFlag, pattern_codes: Vec, ) -> Self { - // let string = py_string.borrow_value(); + let string_len = string.chars().count(); + let end = std::cmp::min(end, string_len); + let start = std::cmp::min(start, end); Self { - // py_string, string, + string_len, start, end, flags, @@ -59,6 +60,7 @@ impl<'a> State<'a> { pub(crate) fn pymatch(mut state: State) -> bool { let ctx = MatchContext { string_position: state.start, + string_offset: state.string.char_indices().nth(state.start).unwrap().0, code_position: 0, has_matched: None, }; @@ -85,6 +87,7 @@ pub(crate) fn pymatch(mut state: State) -> bool { #[derive(Debug, Copy, Clone)] struct MatchContext { string_position: usize, + string_offset: usize, code_position: usize, has_matched: Option, } @@ -108,6 +111,7 @@ impl<'a> MatchContextDrive<'a> { let ctx = self.ctx(); let child_ctx = MatchContext { string_position: ctx.string_position, + string_offset: ctx.string_offset, code_position: ctx.code_position + pattern_offset, has_matched: None, }; @@ -122,9 +126,7 @@ impl<'a> MatchContextDrive<'a> { } fn str(&self) -> &str { unsafe { - std::str::from_utf8_unchecked( - &self.state.string.as_bytes()[self.ctx().string_position..], - ) + std::str::from_utf8_unchecked(&self.state.string.as_bytes()[self.ctx().string_offset..]) } } fn peek_char(&self) -> char { @@ -135,61 +137,90 @@ impl<'a> MatchContextDrive<'a> { } fn skip_char(&mut self, skip_count: usize) { let skipped = self.str().char_indices().nth(skip_count).unwrap().0; - self.ctx_mut().string_position += skipped; + self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_offset += skipped; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } fn remaining_chars(&self) -> usize { - let end = self.state.end; - end - self.ctx().string_position + self.str().len() + self.state.end - self.ctx().string_position } fn remaining_codes(&self) -> usize { self.state.pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == 0 + self.ctx().string_position == self.state.start } fn at_end(&self) -> bool { - self.str().is_empty() + self.ctx().string_position == self.state.end } fn at_linebreak(&self) -> bool { - match self.str().chars().next() { - Some(c) => c == '\n', - None => false, + !self.at_end() && is_linebreak(self.peek_char()) + } + fn at_boundary bool>(&self, mut word_checker: F) -> bool { + if self.at_beginning() && self.at_end() { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char()); + let this = !self.at_end() && word_checker(self.peek_char()); + this != that + } + fn back_peek_offset(&self) -> usize { + let bytes = self.state.string.as_bytes(); + let mut offset = self.ctx().string_offset - 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + panic!("not utf-8 code point"); + } + } + } + } + offset + } + fn back_peek_char(&self) -> char { + let bytes = self.state.string.as_bytes(); + let offset = self.back_peek_offset(); + let current_offset = self.ctx().string_offset; + let code = match current_offset - offset { + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), + 3 => u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]), + 4 => u32::from_ne_bytes([ + bytes[offset], + bytes[offset + 1], + bytes[offset + 2], + bytes[offset + 3], + ]), + _ => unreachable!(), + }; + unsafe { std::mem::transmute(code) } + } + fn back_skip_char(&mut self, skip_count: usize) { + self.ctx_mut().string_position -= skip_count; + for _ in 0..skip_count { + self.ctx_mut().string_offset = self.back_peek_offset(); } } -} - -struct OpcodeDispatcher { - executing_contexts: HashMap>, -} - -macro_rules! once { - ($val:expr) => { - Box::new(OpEmpty {}) - }; } trait OpcodeExecutor { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; } -struct OpFailure {} -impl OpcodeExecutor for OpFailure { +struct OpUnimplemented {} +impl OpcodeExecutor for OpUnimplemented { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { drive.ctx_mut().has_matched = Some(false); None } } -struct OpEmpty {} -impl OpcodeExecutor for OpEmpty { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - None - } -} - struct OpOnce { f: Option, } @@ -204,6 +235,10 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } +fn unimplemented() -> Box { + Box::new(OpUnimplemented {}) +} + struct OpMinRepeatOne { trace_id: usize, mincount: usize, @@ -213,10 +248,11 @@ struct OpMinRepeatOne { } impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - match self.trace_id { - 0 => self._0(drive), - _ => unreachable!(), - } + None + // match self.trace_id { + // 0 => self._0(drive), + // _ => unreachable!(), + // } } } impl Default for OpMinRepeatOne { @@ -230,75 +266,78 @@ impl Default for OpMinRepeatOne { } } } -impl OpMinRepeatOne { - fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; +// impl OpMinRepeatOne { +// fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// self.mincount = drive.peek_code(2) as usize; +// self.maxcount = drive.peek_code(3) as usize; - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } +// if drive.remaining_chars() < self.mincount { +// drive.ctx_mut().has_matched = Some(false); +// return None; +// } - drive.state.string_position = drive.ctx().string_position; +// drive.state.string_position = drive.ctx().string_position; - self.count = if self.mincount == 0 { - 0 - } else { - let count = count_repetitions(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; +// self.count = if self.mincount == 0 { +// 0 +// } else { +// let count = count_repetitions(drive, self.mincount); +// if count < self.mincount { +// drive.ctx_mut().has_matched = Some(false); +// return None; +// } +// drive.skip_char(count); +// count +// }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } +// if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { +// drive.state.string_position = drive.ctx().string_position; +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } - // mark push - self.trace_id = 1; - self._1(drive) - } - fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); - self.trace_id = 2; - return Some(()); - } +// // mark push +// self.trace_id = 1; +// self._1(drive) +// } +// fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { +// drive.state.string_position = drive.ctx().string_position; +// self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); +// self.trace_id = 2; +// return Some(()); +// } - // mark discard - drive.ctx_mut().has_matched = Some(false); - None - } - fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count_repetitions(drive, 1) == 0 { - self.trace_id = 3; - return self._3(drive); - } - drive.skip_char(1); - self.count += 1; - // marks pop keep - self.trace_id = 1; - self._1(drive) - } - fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - // mark discard - drive.ctx_mut().has_matched = Some(false); - None - } -} +// // mark discard +// drive.ctx_mut().has_matched = Some(false); +// None +// } +// fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } +// drive.state.string_position = drive.ctx().string_position; +// if count_repetitions(drive, 1) == 0 { +// self.trace_id = 3; +// return self._3(drive); +// } +// drive.skip_char(1); +// self.count += 1; +// // marks pop keep +// self.trace_id = 1; +// self._1(drive) +// } +// fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// // mark discard +// drive.ctx_mut().has_matched = Some(false); +// None +// } +// } +struct OpcodeDispatcher { + executing_contexts: HashMap>, +} impl OpcodeDispatcher { fn new() -> Self { Self { @@ -313,7 +352,6 @@ impl OpcodeDispatcher { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); self.dispatch(opcode, drive); - // self.drive = self.drive; } match drive.ctx().has_matched { Some(matched) => Some(matched), @@ -328,8 +366,8 @@ impl OpcodeDispatcher { // is done matching, False if it must be resumed when next encountered. fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { - Some((_, mut executor)) => executor, - None => self.dispatch_table(opcode, drive), + Some((_, executor)) => executor, + None => self.dispatch_table(opcode), }; if let Some(()) = executor.next(drive) { self.executing_contexts.insert(drive.id(), executor); @@ -339,71 +377,436 @@ impl OpcodeDispatcher { } } - fn dispatch_table( - &mut self, - opcode: SreOpcode, - drive: &mut MatchContextDrive, - ) -> Box { - // move || { + fn dispatch_table(&mut self, opcode: SreOpcode) -> Box { match opcode { - SreOpcode::FAILURE => { - Box::new(OpFailure {}) - } + SreOpcode::FAILURE => once(|drive| { + drive.ctx_mut().has_matched = Some(false); + }), SreOpcode::SUCCESS => once(|drive| { drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); }), - SreOpcode::ANY => once!(true), - SreOpcode::ANY_ALL => once!(true), - SreOpcode::ASSERT => once!(true), - SreOpcode::ASSERT_NOT => once!(true), - SreOpcode::AT => once!(true), - SreOpcode::BRANCH => once!(true), - SreOpcode::CALL => once!(true), - SreOpcode::CATEGORY => once!(true), - SreOpcode::CHARSET => once!(true), - SreOpcode::BIGCHARSET => once!(true), - SreOpcode::GROUPREF => once!(true), - SreOpcode::GROUPREF_EXISTS => once!(true), - SreOpcode::GROUPREF_IGNORE => once!(true), - SreOpcode::IN => once!(true), - SreOpcode::IN_IGNORE => once!(true), - SreOpcode::INFO => once!(true), - SreOpcode::JUMP => once!(true), - SreOpcode::LITERAL => { - if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + SreOpcode::ANY => once(|drive| { + if drive.at_end() || drive.at_linebreak() { drive.ctx_mut().has_matched = Some(false); } else { + drive.skip_code(1); drive.skip_char(1); } + }), + SreOpcode::ANY_ALL => once(|drive| { + if drive.at_end() { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + }), + SreOpcode::ASSERT => Box::new(OpAssert::default()), + SreOpcode::ASSERT_NOT => unimplemented(), + SreOpcode::AT => once(|drive| { + let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); + if !at(drive, atcode) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + } + }), + SreOpcode::BRANCH => unimplemented(), + SreOpcode::CALL => unimplemented(), + SreOpcode::CATEGORY => unimplemented(), + SreOpcode::CHARSET => unimplemented(), + SreOpcode::BIGCHARSET => unimplemented(), + SreOpcode::GROUPREF => unimplemented(), + SreOpcode::GROUPREF_EXISTS => unimplemented(), + SreOpcode::GROUPREF_IGNORE => unimplemented(), + SreOpcode::IN => unimplemented(), + SreOpcode::IN_IGNORE => unimplemented(), + SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { + drive.skip_code(drive.peek_code(1) as usize + 1); + }), + SreOpcode::LITERAL => once(|drive| { + if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } drive.skip_code(2); - once!(true) - } - SreOpcode::LITERAL_IGNORE => once!(true), - SreOpcode::MARK => once!(true), - SreOpcode::MAX_UNTIL => once!(true), - SreOpcode::MIN_UNTIL => once!(true), - SreOpcode::NOT_LITERAL => once!(true), - SreOpcode::NOT_LITERAL_IGNORE => once!(true), - SreOpcode::NEGATE => once!(true), - SreOpcode::RANGE => once!(true), - SreOpcode::REPEAT => once!(true), - SreOpcode::REPEAT_ONE => once!(true), - SreOpcode::SUBPATTERN => once!(true), + drive.skip_char(1); + }), + SreOpcode::LITERAL_IGNORE => once(|drive| { + let code = drive.peek_code(1); + let c = drive.peek_char(); + if drive.at_end() + || (c.to_ascii_lowercase() as u32 != code + && c.to_ascii_uppercase() as u32 != code) + { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::MARK => unimplemented(), + SreOpcode::MAX_UNTIL => unimplemented(), + SreOpcode::MIN_UNTIL => unimplemented(), + SreOpcode::NOT_LITERAL => once(|drive| { + if drive.at_end() || drive.peek_char() as u32 == drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + let code = drive.peek_code(1); + let c = drive.peek_char(); + if drive.at_end() + || (c.to_ascii_lowercase() as u32 == code + || c.to_ascii_uppercase() as u32 == code) + { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::NEGATE => unimplemented(), + SreOpcode::RANGE => unimplemented(), + SreOpcode::REPEAT => unimplemented(), + SreOpcode::REPEAT_ONE => unimplemented(), + SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::RANGE_IGNORE => once!(true), + SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), + SreOpcode::IN_LOC_IGNORE => unimplemented(), + SreOpcode::LITERAL_LOC_IGNORE => unimplemented(), + SreOpcode::NOT_LITERAL_LOC_IGNORE => unimplemented(), + SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), + SreOpcode::IN_UNI_IGNORE => unimplemented(), + SreOpcode::LITERAL_UNI_IGNORE => unimplemented(), + SreOpcode::NOT_LITERAL_UNI_IGNORE => unimplemented(), + SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } + + // Returns the number of repetitions of a single item, starting from the + // current string position. The code pointer is expected to point to a + // REPEAT_ONE operation (with the repeated 4 ahead). + fn count_repetitions(&mut self, drive: &mut MatchContextDrive, maxcount: usize) -> usize { + let mut count = 0; + let mut real_maxcount = drive.remaining_chars(); + if maxcount < real_maxcount && maxcount != MAXREPEAT { + real_maxcount = maxcount; + } + let code_position = drive.ctx().code_position; + let string_position = drive.ctx().string_position; + drive.skip_code(4); + let reset_position = drive.ctx().code_position; + while count < real_maxcount { + drive.ctx_mut().code_position = reset_position; + let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); + self.dispatch(opcode, drive); + if drive.ctx().has_matched == Some(false) { + break; + } + count += 1; + } + drive.ctx_mut().has_matched = None; + drive.ctx_mut().code_position = code_position; + drive.ctx_mut().string_position = string_position; + count + } +} + +fn at(drive: &mut MatchContextDrive, atcode: SreAtCode) -> bool { + match atcode { + SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), + SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), + SreAtCode::BOUNDARY => drive.at_boundary(is_word), + SreAtCode::NON_BOUNDARY => !drive.at_boundary(is_word), + SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), + SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), + SreAtCode::END_STRING => drive.at_end(), + SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => !drive.at_boundary(is_loc_word), + SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => !drive.at_boundary(is_uni_word), + } +} + +fn category(catcode: SreCatCode, c: char) -> bool { + match catcode { + SreCatCode::DIGIT => is_digit(c), + SreCatCode::NOT_DIGIT => !is_digit(c), + SreCatCode::SPACE => is_space(c), + SreCatCode::NOT_SPACE => !is_space(c), + SreCatCode::WORD => is_word(c), + SreCatCode::NOT_WORD => !is_word(c), + SreCatCode::LINEBREAK => is_linebreak(c), + SreCatCode::NOT_LINEBREAK => !is_linebreak(c), + SreCatCode::LOC_WORD => is_loc_word(c), + SreCatCode::LOC_NOT_WORD => !is_loc_word(c), + SreCatCode::UNI_DIGIT => is_uni_digit(c), + SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c), + SreCatCode::UNI_SPACE => is_uni_space(c), + SreCatCode::UNI_NOT_SPACE => !is_uni_space(c), + SreCatCode::UNI_WORD => is_uni_word(c), + SreCatCode::UNI_NOT_WORD => !is_uni_word(c), + SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c), + SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c), + } +} + +fn charset(set: &[u32], c: char) -> bool { + /* check if character is a member of the given set */ + let ch = c as u32; + let mut ok = true; + let mut i = 0; + while i < set.len() { + let opcode = match SreOpcode::try_from(set[i]) { + Ok(code) => code, + Err(_) => { + break; + } + }; + match opcode { + SreOpcode::FAILURE => { + return !ok; + } + SreOpcode::CATEGORY => { + /* */ + let catcode = match SreCatCode::try_from(set[i + 1]) { + Ok(code) => code, + Err(_) => { + break; + } + }; + if category(catcode, c) { + return ok; + } + i += 2; + } + SreOpcode::CHARSET => { + /* */ + if ch < 256 && (set[(ch / 32) as usize] & (1 << (32 - 1))) != 0 { + return ok; + } + i += 8; + } + SreOpcode::BIGCHARSET => { + /* <256 blockindices> */ + let count = set[i + 1]; + if ch < 0x10000 { + let blockindices: &[u8] = unsafe { std::mem::transmute(&set[i + 2..]) }; + let block = blockindices[(ch >> 8) as usize]; + if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] + & (1 << (ch & (32 - 1))) + != 0 + { + return ok; + } + } + i += 2 + 64 + count as usize * 8; + } + SreOpcode::LITERAL => { + /* */ + if ch == set[i + 1] { + return ok; + } + i += 2; + } + SreOpcode::NEGATE => { + ok = !ok; + i += 1; + } + SreOpcode::RANGE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + SreOpcode::RANGE_UNI_IGNORE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + let ch = upper_unicode(c) as u32; + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + _ => { + break; + } + } + } + /* internal error -- there's not much we can do about it + here, so let's just pretend it didn't match... */ + false +} + +fn count(drive: MatchContextDrive, maxcount: usize) -> usize { + let string_position = drive.state.string_position; + let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + + let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); + match opcode { + SreOpcode::FAILURE => {} + SreOpcode::SUCCESS => {} + SreOpcode::ANY => {} + SreOpcode::ANY_ALL => {} + SreOpcode::ASSERT => {} + SreOpcode::ASSERT_NOT => {} + SreOpcode::AT => {} + SreOpcode::BRANCH => {} + SreOpcode::CALL => {} + SreOpcode::CATEGORY => {} + SreOpcode::CHARSET => {} + SreOpcode::BIGCHARSET => {} + SreOpcode::GROUPREF => {} + SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::IN => { + } + SreOpcode::INFO => {} + SreOpcode::JUMP => {} + SreOpcode::LITERAL => {} + SreOpcode::MARK => {} + SreOpcode::MAX_UNTIL => {} + SreOpcode::MIN_UNTIL => {} + SreOpcode::NOT_LITERAL => {} + SreOpcode::NEGATE => {} + SreOpcode::RANGE => {} + SreOpcode::REPEAT => {} + SreOpcode::REPEAT_ONE => {} + SreOpcode::SUBPATTERN => {} + SreOpcode::MIN_REPEAT_ONE => {} + SreOpcode::GROUPREF_IGNORE => {} + SreOpcode::IN_IGNORE => {} + SreOpcode::LITERAL_IGNORE => {} + SreOpcode::NOT_LITERAL_IGNORE => {} + SreOpcode::GROUPREF_LOC_IGNORE => {} + SreOpcode::IN_LOC_IGNORE => {} + SreOpcode::LITERAL_LOC_IGNORE => {} + SreOpcode::NOT_LITERAL_LOC_IGNORE => {} + SreOpcode::GROUPREF_UNI_IGNORE => {} + SreOpcode::IN_UNI_IGNORE => {} + SreOpcode::LITERAL_UNI_IGNORE => {} + SreOpcode::NOT_LITERAL_UNI_IGNORE => {} + SreOpcode::RANGE_UNI_IGNORE => {} + } +} + +fn eq_loc_ignore(code: u32, c: char) -> bool { + code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 } -// Returns the number of repetitions of a single item, starting from the -// current string position. The code pointer is expected to point to a -// REPEAT_ONE operation (with the repeated 4 ahead). -fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize { - let mut count = 0; - let mut real_maxcount = drive.state.end - drive.ctx().string_position; - if maxcount < real_maxcount && maxcount != SRE_MAXREPEAT { - real_maxcount = maxcount; +fn is_word(c: char) -> bool { + c.is_ascii_alphanumeric() || c == '_' +} +fn is_space(c: char) -> bool { + c.is_ascii_whitespace() +} +fn is_digit(c: char) -> bool { + c.is_ascii_digit() +} +fn is_loc_alnum(c: char) -> bool { + // TODO: check with cpython + c.is_alphanumeric() +} +fn is_loc_word(c: char) -> bool { + is_loc_alnum(c) || c == '_' +} +fn is_linebreak(c: char) -> bool { + c == '\n' +} +pub(crate) fn lower_ascii(c: char) -> char { + c.to_ascii_lowercase() +} +fn lower_locate(c: char) -> char { + // TODO: check with cpython + // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase + c.to_lowercase().next().unwrap() +} +fn upper_locate(c: char) -> char { + // TODO: check with cpython + // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase + c.to_uppercase().next().unwrap() +} +fn is_uni_digit(c: char) -> bool { + // TODO: check with cpython + c.is_digit(10) +} +fn is_uni_space(c: char) -> bool { + // TODO: check with cpython + c.is_whitespace() +} +fn is_uni_linebreak(c: char) -> bool { + match c { + '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{001C}' | '\u{001D}' + | '\u{001E}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + _ => false, + } +} +fn is_uni_alnum(c: char) -> bool { + // TODO: check with cpython + c.is_alphanumeric() +} +fn is_uni_word(c: char) -> bool { + is_uni_alnum(c) || c == '_' +} +pub(crate) fn lower_unicode(c: char) -> char { + // TODO: check with cpython + c.to_lowercase().next().unwrap() +} +pub(crate) fn upper_unicode(c: char) -> char { + // TODO: check with cpython + c.to_uppercase().next().unwrap() +} + +fn is_utf8_first_byte(b: u8) -> bool { + // In UTF-8, there are three kinds of byte... + // 0xxxxxxx : ASCII + // 10xxxxxx : 2nd, 3rd or 4th byte of code + // 11xxxxxx : 1st byte of multibyte code + (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) +} + +struct OpAssert { + child_ctx_id: usize, + jump_id: usize, +} +impl Default for OpAssert { + fn default() -> Self { + OpAssert { + child_ctx_id: 0, + jump_id: 0, + } + } +} +impl OpcodeExecutor for OpAssert { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + _ => unreachable!(), + } + } +} +impl OpAssert { + fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position - back; + self.child_ctx_id = drive.push_new_context(3); + self.jump_id = 1; + Some(()) + } + fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { + drive.skip_code(drive.peek_code(1) as usize + 1); + } else { + drive.ctx_mut().has_matched = Some(false); + } + None } - count } From 4e03fb361f731116d392cfdda0820353454fcaac Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 25 Dec 2020 16:31:22 +0200 Subject: [PATCH 04/95] upgrade sre_parse.py; impl marks count --- interp.rs | 633 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 365 insertions(+), 268 deletions(-) diff --git a/interp.rs b/interp.rs index bc6753a5fe..8e0968cbf5 100644 --- a/interp.rs +++ b/interp.rs @@ -5,6 +5,7 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use std::collections::HashMap; use std::convert::TryFrom; +#[derive(Debug)] pub struct State<'a> { string: &'a str, // chars count @@ -13,9 +14,9 @@ pub struct State<'a> { end: usize, flags: SreFlag, pattern_codes: Vec, - marks: Vec, + marks: Vec>, lastindex: isize, - marks_stack: Vec, + marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat: Option, string_position: usize, @@ -55,12 +56,47 @@ impl<'a> State<'a> { self.context_stack.clear(); self.repeat = None; } + + fn set_mark(&mut self, mark_nr: usize, position: usize) { + if mark_nr & 1 != 0 { + self.lastindex = mark_nr as isize / 2 + 1; + } + if mark_nr >= self.marks.len() { + self.marks.resize(mark_nr + 1, None); + } + self.marks[mark_nr] = Some(position); + } + fn get_marks(&self, group_index: usize) -> (Option, Option) { + let marks_index = 2 * group_index; + if marks_index + 1 < self.marks.len() { + (self.marks[marks_index], self.marks[marks_index + 1]) + } else { + (None, None) + } + } + fn marks_push(&mut self) { + self.marks_stack.push(self.marks.clone(), self.lastindex); + } + fn marks_pop(&mut self) { + (self.marks, self.lastindex) = self.marks_stack.pop().unwrap(); + } + fn marks_pop_keep(&mut self) { + (self.marks, self.lastindex) = self.marks_stack.last().unwrap(); + } + fn marks_pop_discard(&mut self) { + self.marks_stack.pop(); + } } pub(crate) fn pymatch(mut state: State) -> bool { let ctx = MatchContext { string_position: state.start, - string_offset: state.string.char_indices().nth(state.start).unwrap().0, + string_offset: state + .string + .char_indices() + .nth(state.start) + .map(|x| x.0) + .unwrap_or(0), code_position: 0, has_matched: None, }; @@ -72,7 +108,7 @@ pub(crate) fn pymatch(mut state: State) -> bool { break; } let ctx_id = state.context_stack.len() - 1; - let mut drive = MatchContextDrive::drive(ctx_id, state); + let mut drive = StackDrive::drive(ctx_id, state); let mut dispatcher = OpcodeDispatcher::new(); has_matched = dispatcher.pymatch(&mut drive); @@ -92,68 +128,52 @@ struct MatchContext { has_matched: Option, } -struct MatchContextDrive<'a> { - state: State<'a>, - ctx_id: usize, -} - -impl<'a> MatchContextDrive<'a> { - fn id(&self) -> usize { - self.ctx_id - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.state.context_stack[self.ctx_id] - } - fn ctx(&self) -> &MatchContext { - &self.state.context_stack[self.ctx_id] - } - fn push_new_context(&mut self, pattern_offset: usize) -> usize { - let ctx = self.ctx(); - let child_ctx = MatchContext { - string_position: ctx.string_position, - string_offset: ctx.string_offset, - code_position: ctx.code_position + pattern_offset, - has_matched: None, - }; - self.state.context_stack.push(child_ctx); - self.state.context_stack.len() - 1 - } - fn drive(ctx_id: usize, state: State<'a>) -> Self { - Self { state, ctx_id } - } - fn take(self) -> State<'a> { - self.state - } +trait MatchContextDrive { + fn ctx_mut(&mut self) -> &mut MatchContext; + fn ctx(&self) -> &MatchContext; + fn state(&self) -> &State; fn str(&self) -> &str { unsafe { - std::str::from_utf8_unchecked(&self.state.string.as_bytes()[self.ctx().string_offset..]) + std::str::from_utf8_unchecked( + &self.state().string.as_bytes()[self.ctx().string_offset..], + ) } } + fn pattern(&self) -> &[u32] { + &self.state().pattern_codes[self.ctx().code_position..] + } fn peek_char(&self) -> char { self.str().chars().next().unwrap() } fn peek_code(&self, peek: usize) -> u32 { - self.state.pattern_codes[self.ctx().code_position + peek] + self.state().pattern_codes[self.ctx().code_position + peek] } fn skip_char(&mut self, skip_count: usize) { - let skipped = self.str().char_indices().nth(skip_count).unwrap().0; - self.ctx_mut().string_position += skip_count; - self.ctx_mut().string_offset += skipped; + match self.str().char_indices().nth(skip_count).map(|x| x.0) { + Some(skipped) => { + self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_offset += skipped; + } + None => { + self.ctx_mut().string_position = self.state().end; + self.ctx_mut().string_offset = self.state().string.len(); // bytes len + } + } } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } fn remaining_chars(&self) -> usize { - self.state.end - self.ctx().string_position + self.state().end - self.ctx().string_position } fn remaining_codes(&self) -> usize { - self.state.pattern_codes.len() - self.ctx().code_position + self.state().pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == self.state.start + self.ctx().string_position == self.state().start } fn at_end(&self) -> bool { - self.ctx().string_position == self.state.end + self.ctx().string_position == self.state().end } fn at_linebreak(&self) -> bool { !self.at_end() && is_linebreak(self.peek_char()) @@ -167,7 +187,7 @@ impl<'a> MatchContextDrive<'a> { this != that } fn back_peek_offset(&self) -> usize { - let bytes = self.state.string.as_bytes(); + let bytes = self.state().string.as_bytes(); let mut offset = self.ctx().string_offset - 1; if !is_utf8_first_byte(bytes[offset]) { offset -= 1; @@ -184,7 +204,7 @@ impl<'a> MatchContextDrive<'a> { offset } fn back_peek_char(&self) -> char { - let bytes = self.state.string.as_bytes(); + let bytes = self.state().string.as_bytes(); let offset = self.back_peek_offset(); let current_offset = self.ctx().string_offset; let code = match current_offset - offset { @@ -209,13 +229,74 @@ impl<'a> MatchContextDrive<'a> { } } +struct StackDrive<'a> { + state: State<'a>, + ctx_id: usize, +} +impl<'a> StackDrive<'a> { + fn id(&self) -> usize { + self.ctx_id + } + fn drive(ctx_id: usize, state: State<'a>) -> Self { + Self { state, ctx_id } + } + fn take(self) -> State<'a> { + self.state + } + fn push_new_context(&mut self, pattern_offset: usize) -> usize { + let ctx = self.ctx(); + let child_ctx = MatchContext { + string_position: ctx.string_position, + string_offset: ctx.string_offset, + code_position: ctx.code_position + pattern_offset, + has_matched: None, + }; + self.state.context_stack.push(child_ctx); + self.state.context_stack.len() - 1 + } +} +impl MatchContextDrive for StackDrive<'_> { + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.state.context_stack[self.ctx_id] + } + fn ctx(&self) -> &MatchContext { + &self.state.context_stack[self.ctx_id] + } + fn state(&self) -> &State { + &self.state + } +} + +struct WrapDrive<'a> { + stack_drive: &'a StackDrive<'a>, + ctx: MatchContext, +} +impl<'a> WrapDrive<'a> { + fn drive(ctx: MatchContext, stack_drive: &'a StackDrive<'a>) -> Self { + Self { stack_drive, ctx } + } +} +impl MatchContextDrive for WrapDrive<'_> { + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.ctx + } + + fn ctx(&self) -> &MatchContext { + &self.ctx + } + + fn state(&self) -> &State { + self.stack_drive.state() + } +} + trait OpcodeExecutor { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; + fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } struct OpUnimplemented {} impl OpcodeExecutor for OpUnimplemented { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { drive.ctx_mut().has_matched = Some(false); None } @@ -224,14 +305,14 @@ impl OpcodeExecutor for OpUnimplemented { struct OpOnce { f: Option, } -impl OpcodeExecutor for OpOnce { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +impl OpcodeExecutor for OpOnce { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { let f = self.f.take()?; f(drive); None } } -fn once(f: F) -> Box> { +fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } @@ -239,102 +320,6 @@ fn unimplemented() -> Box { Box::new(OpUnimplemented {}) } -struct OpMinRepeatOne { - trace_id: usize, - mincount: usize, - maxcount: usize, - count: usize, - child_ctx_id: usize, -} -impl OpcodeExecutor for OpMinRepeatOne { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - None - // match self.trace_id { - // 0 => self._0(drive), - // _ => unreachable!(), - // } - } -} -impl Default for OpMinRepeatOne { - fn default() -> Self { - OpMinRepeatOne { - trace_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - child_ctx_id: 0, - } - } -} -// impl OpMinRepeatOne { -// fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// self.mincount = drive.peek_code(2) as usize; -// self.maxcount = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < self.mincount { -// drive.ctx_mut().has_matched = Some(false); -// return None; -// } - -// drive.state.string_position = drive.ctx().string_position; - -// self.count = if self.mincount == 0 { -// 0 -// } else { -// let count = count_repetitions(drive, self.mincount); -// if count < self.mincount { -// drive.ctx_mut().has_matched = Some(false); -// return None; -// } -// drive.skip_char(count); -// count -// }; - -// if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { -// drive.state.string_position = drive.ctx().string_position; -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } - -// // mark push -// self.trace_id = 1; -// self._1(drive) -// } -// fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { -// drive.state.string_position = drive.ctx().string_position; -// self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); -// self.trace_id = 2; -// return Some(()); -// } - -// // mark discard -// drive.ctx_mut().has_matched = Some(false); -// None -// } -// fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } -// drive.state.string_position = drive.ctx().string_position; -// if count_repetitions(drive, 1) == 0 { -// self.trace_id = 3; -// return self._3(drive); -// } -// drive.skip_char(1); -// self.count += 1; -// // marks pop keep -// self.trace_id = 1; -// self._1(drive) -// } -// fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// // mark discard -// drive.ctx_mut().has_matched = Some(false); -// None -// } -// } - struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -347,7 +332,7 @@ impl OpcodeDispatcher { // Returns True if the current context matches, False if it doesn't and // None if matching is not finished, ie must be resumed after child // contexts have been matched. - fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option { + fn pymatch(&mut self, drive: &mut StackDrive) -> Option { while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); @@ -364,7 +349,7 @@ impl OpcodeDispatcher { // Dispatches a context on a given opcode. Returns True if the context // is done matching, False if it must be resumed when next encountered. - fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { + fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { Some((_, executor)) => executor, None => self.dispatch_table(opcode), @@ -414,58 +399,67 @@ impl OpcodeDispatcher { }), SreOpcode::BRANCH => unimplemented(), SreOpcode::CALL => unimplemented(), - SreOpcode::CATEGORY => unimplemented(), - SreOpcode::CHARSET => unimplemented(), - SreOpcode::BIGCHARSET => unimplemented(), + SreOpcode::CATEGORY => once(|drive| { + let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); + if drive.at_end() || !category(catcode, drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + drive.skip_char(1); + } + }), + SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), SreOpcode::GROUPREF => unimplemented(), SreOpcode::GROUPREF_EXISTS => unimplemented(), SreOpcode::GROUPREF_IGNORE => unimplemented(), - SreOpcode::IN => unimplemented(), - SreOpcode::IN_IGNORE => unimplemented(), + SreOpcode::IN => once(|drive| { + general_op_in(drive, |x| x); + }), + SreOpcode::IN_IGNORE => once(|drive| { + general_op_in(drive, lower_ascii); + }), + SreOpcode::IN_UNI_IGNORE => once(|drive| { + general_op_in(drive, lower_unicode); + }), + SreOpcode::IN_LOC_IGNORE => once(|drive| { + let skip = drive.peek_code(1) as usize; + if drive.at_end() || !charset_loc_ignore(&drive.pattern()[1..], drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(skip + 1); + drive.skip_char(1); + } + }), SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { drive.skip_code(drive.peek_code(1) as usize + 1); }), SreOpcode::LITERAL => once(|drive| { - if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); + general_op_literal(drive, |code, c| code == c as u32); + }), + SreOpcode::NOT_LITERAL => once(|drive| { + general_op_literal(drive, |code, c| code != c as u32); }), SreOpcode::LITERAL_IGNORE => once(|drive| { - let code = drive.peek_code(1); - let c = drive.peek_char(); - if drive.at_end() - || (c.to_ascii_lowercase() as u32 != code - && c.to_ascii_uppercase() as u32 != code) - { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); + general_op_literal(drive, |code, c| code == lower_ascii(c) as u32); + }), + SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code != lower_ascii(c) as u32); + }), + SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code == lower_unicode(c) as u32); + }), + SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); + }), + SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| char_loc_ignore(code, c)); + }), + SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); }), SreOpcode::MARK => unimplemented(), SreOpcode::MAX_UNTIL => unimplemented(), SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::NOT_LITERAL => once(|drive| { - if drive.at_end() || drive.peek_char() as u32 == drive.peek_code(1) { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); - }), - SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { - let code = drive.peek_code(1); - let c = drive.peek_char(); - if drive.at_end() - || (c.to_ascii_lowercase() as u32 == code - || c.to_ascii_uppercase() as u32 == code) - { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); - }), SreOpcode::NEGATE => unimplemented(), SreOpcode::RANGE => unimplemented(), SreOpcode::REPEAT => unimplemented(), @@ -473,47 +467,45 @@ impl OpcodeDispatcher { SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), - SreOpcode::IN_LOC_IGNORE => unimplemented(), - SreOpcode::LITERAL_LOC_IGNORE => unimplemented(), - SreOpcode::NOT_LITERAL_LOC_IGNORE => unimplemented(), SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), - SreOpcode::IN_UNI_IGNORE => unimplemented(), - SreOpcode::LITERAL_UNI_IGNORE => unimplemented(), - SreOpcode::NOT_LITERAL_UNI_IGNORE => unimplemented(), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } +} - // Returns the number of repetitions of a single item, starting from the - // current string position. The code pointer is expected to point to a - // REPEAT_ONE operation (with the repeated 4 ahead). - fn count_repetitions(&mut self, drive: &mut MatchContextDrive, maxcount: usize) -> usize { - let mut count = 0; - let mut real_maxcount = drive.remaining_chars(); - if maxcount < real_maxcount && maxcount != MAXREPEAT { - real_maxcount = maxcount; - } - let code_position = drive.ctx().code_position; - let string_position = drive.ctx().string_position; - drive.skip_code(4); - let reset_position = drive.ctx().code_position; - while count < real_maxcount { - drive.ctx_mut().code_position = reset_position; - let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); - self.dispatch(opcode, drive); - if drive.ctx().has_matched == Some(false) { - break; - } - count += 1; - } - drive.ctx_mut().has_matched = None; - drive.ctx_mut().code_position = code_position; - drive.ctx_mut().string_position = string_position; - count +fn char_loc_ignore(code: u32, c: char) -> bool { + code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +} + +fn charset_loc_ignore(set: &[u32], c: char) -> bool { + let lo = lower_locate(c); + if charset(set, c) { + return true; } + let up = upper_locate(c); + up != lo && charset(set, up) } -fn at(drive: &mut MatchContextDrive, atcode: SreAtCode) -> bool { +fn general_op_literal bool>(drive: &mut StackDrive, f: F) { + if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + drive.skip_char(1); + } +} + +fn general_op_in char>(drive: &mut StackDrive, f: F) { + let skip = drive.peek_code(1) as usize; + if drive.at_end() || !charset(&drive.pattern()[1..], f(drive.peek_char())) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(skip + 1); + drive.skip_char(1); + } +} + +fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), @@ -642,54 +634,67 @@ fn charset(set: &[u32], c: char) -> bool { false } -fn count(drive: MatchContextDrive, maxcount: usize) -> usize { - let string_position = drive.state.string_position; +fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { + let drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let opcode = match SreOpcode::try_from(drive.peek_code(1)) { + Ok(code) => code, + Err(_) => { + panic!("FIXME:COUNT1"); + } + }; - let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); match opcode { - SreOpcode::FAILURE => {} - SreOpcode::SUCCESS => {} - SreOpcode::ANY => {} - SreOpcode::ANY_ALL => {} - SreOpcode::ASSERT => {} - SreOpcode::ASSERT_NOT => {} - SreOpcode::AT => {} - SreOpcode::BRANCH => {} - SreOpcode::CALL => {} - SreOpcode::CATEGORY => {} - SreOpcode::CHARSET => {} - SreOpcode::BIGCHARSET => {} - SreOpcode::GROUPREF => {} - SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::ANY => { + while !drive.at_end() && !drive.at_linebreak() { + drive.skip_char(1); + } + } + SreOpcode::ANY_ALL => { + drive.skip_char(drive.remaining_chars()); + } SreOpcode::IN => { + // TODO: pattern[2 or 1..]? + while !drive.at_end() && charset(&drive.pattern()[2..], drive.peek_char()) { + drive.skip_char(1); + } + } + SreOpcode::LITERAL => { + general_count_literal(drive, |code, c| code == c as u32); + } + SreOpcode::NOT_LITERAL => { + general_count_literal(drive, |code, c| code != c as u32); + } + SreOpcode::LITERAL_IGNORE => { + general_count_literal(drive, |code, c| code == lower_ascii(c) as u32); + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_count_literal(drive, |code, c| code != lower_ascii(c) as u32); + } + SreOpcode::LITERAL_LOC_IGNORE => { + general_count_literal(drive, |code, c| char_loc_ignore(code, c)); + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_count_literal(drive, |code, c| !char_loc_ignore(code, c)); + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_count_literal(drive, |code, c| code == lower_unicode(c) as u32); + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_count_literal(drive, |code, c| code != lower_unicode(c) as u32); } - SreOpcode::INFO => {} - SreOpcode::JUMP => {} - SreOpcode::LITERAL => {} - SreOpcode::MARK => {} - SreOpcode::MAX_UNTIL => {} - SreOpcode::MIN_UNTIL => {} - SreOpcode::NOT_LITERAL => {} - SreOpcode::NEGATE => {} - SreOpcode::RANGE => {} - SreOpcode::REPEAT => {} - SreOpcode::REPEAT_ONE => {} - SreOpcode::SUBPATTERN => {} - SreOpcode::MIN_REPEAT_ONE => {} - SreOpcode::GROUPREF_IGNORE => {} - SreOpcode::IN_IGNORE => {} - SreOpcode::LITERAL_IGNORE => {} - SreOpcode::NOT_LITERAL_IGNORE => {} - SreOpcode::GROUPREF_LOC_IGNORE => {} - SreOpcode::IN_LOC_IGNORE => {} - SreOpcode::LITERAL_LOC_IGNORE => {} - SreOpcode::NOT_LITERAL_LOC_IGNORE => {} - SreOpcode::GROUPREF_UNI_IGNORE => {} - SreOpcode::IN_UNI_IGNORE => {} - SreOpcode::LITERAL_UNI_IGNORE => {} - SreOpcode::NOT_LITERAL_UNI_IGNORE => {} - SreOpcode::RANGE_UNI_IGNORE => {} + _ => { + panic!("TODO: Not Implemented."); + } + } + + drive.ctx().string_position - stack_drive.ctx().string_position +} + +fn general_count_literal bool>(drive: &mut WrapDrive, f: F) { + let ch = drive.peek_code(1); + while !drive.at_end() && f(ch, drive.peek_char()) { + drive.skip_char(1); } } @@ -781,7 +786,7 @@ impl Default for OpAssert { } } impl OpcodeExecutor for OpAssert { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => self._0(drive), 1 => self._1(drive), @@ -790,7 +795,7 @@ impl OpcodeExecutor for OpAssert { } } impl OpAssert { - fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { let back = drive.peek_code(2) as usize; if back > drive.ctx().string_position { drive.ctx_mut().has_matched = Some(false); @@ -801,7 +806,7 @@ impl OpAssert { self.jump_id = 1; Some(()) } - fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { drive.skip_code(drive.peek_code(1) as usize + 1); } else { @@ -810,3 +815,95 @@ impl OpAssert { None } } + +struct OpMinRepeatOne { + jump_id: usize, + mincount: usize, + maxcount: usize, + count: usize, + child_ctx_id: usize, +} +impl OpcodeExecutor for OpMinRepeatOne { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + 2 => self._2(drive), + _ => unreachable!(), + } + } +} +impl Default for OpMinRepeatOne { + fn default() -> Self { + OpMinRepeatOne { + jump_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpMinRepeatOne { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + drive.state.string_position = drive.ctx().string_position; + + self.count = if self.mincount == 0 { + 0 + } else { + let count = count(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; + + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + drive.state.marks_push(); + self.jump_id = 1; + self._1(drive) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + if self.maxcount == MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } + + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { + if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count(drive, 1) == 0 { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(1); + self.count += 1; + drive.state.marks_pop_keep(); + self.jump_id = 1; + self._1(drive) + } +} From 78485fd8df79444283d3cb978654500da0ce0590 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 26 Dec 2020 16:59:30 +0200 Subject: [PATCH 05/95] create _sre.Match --- interp.rs | 68 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/interp.rs b/interp.rs index 8e0968cbf5..71d89cc0e8 100644 --- a/interp.rs +++ b/interp.rs @@ -1,21 +1,24 @@ // good luck to those that follow; here be dragons -use super::_sre::MAXREPEAT; +use rustpython_common::borrow::BorrowValue; + +use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; +use crate::builtins::PyStrRef; use std::collections::HashMap; use std::convert::TryFrom; #[derive(Debug)] -pub struct State<'a> { +pub(crate) struct State<'a> { string: &'a str, // chars count string_len: usize, - start: usize, - end: usize, + pub start: usize, + pub end: usize, flags: SreFlag, - pattern_codes: Vec, + pattern_codes: &'a [u32], marks: Vec>, - lastindex: isize, + pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat: Option, @@ -28,7 +31,7 @@ impl<'a> State<'a> { start: usize, end: usize, flags: SreFlag, - pattern_codes: Vec, + pattern_codes: &'a [u32], ) -> Self { let string_len = string.chars().count(); let end = std::cmp::min(end, string_len); @@ -75,20 +78,36 @@ impl<'a> State<'a> { } } fn marks_push(&mut self) { - self.marks_stack.push(self.marks.clone(), self.lastindex); + self.marks_stack.push((self.marks.clone(), self.lastindex)); } fn marks_pop(&mut self) { - (self.marks, self.lastindex) = self.marks_stack.pop().unwrap(); + let (marks, lastindex) = self.marks_stack.pop().unwrap(); + self.marks = marks; + self.lastindex = lastindex; } fn marks_pop_keep(&mut self) { - (self.marks, self.lastindex) = self.marks_stack.last().unwrap(); + let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); + self.marks = marks; + self.lastindex = lastindex; } fn marks_pop_discard(&mut self) { self.marks_stack.pop(); } } -pub(crate) fn pymatch(mut state: State) -> bool { +pub(crate) fn pymatch( + string: PyStrRef, + start: usize, + end: usize, + pattern: &Pattern, +) -> Option { + let mut state = State::new( + string.borrow_value(), + start, + end, + pattern.flags.clone(), + &pattern.code, + ); let ctx = MatchContext { string_position: state.start, string_offset: state @@ -117,7 +136,12 @@ pub(crate) fn pymatch(mut state: State) -> bool { state.context_stack.pop(); } } - has_matched.unwrap_or(false) + + if has_matched == None || has_matched == Some(false) { + return None; + } + + Some(Match::new(&state, pattern.pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -635,7 +659,7 @@ fn charset(set: &[u32], c: char) -> bool { } fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { - let drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); + let mut drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, @@ -660,28 +684,28 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { } } SreOpcode::LITERAL => { - general_count_literal(drive, |code, c| code == c as u32); + general_count_literal(&mut drive, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(drive, |code, c| code != c as u32); + general_count_literal(&mut drive, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(drive, |code, c| code == lower_ascii(c) as u32); + general_count_literal(&mut drive, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(drive, |code, c| code != lower_ascii(c) as u32); + general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(drive, |code, c| char_loc_ignore(code, c)); + general_count_literal(&mut drive, |code, c| char_loc_ignore(code, c)); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(drive, |code, c| !char_loc_ignore(code, c)); + general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(drive, |code, c| code == lower_unicode(c) as u32); + general_count_literal(&mut drive, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(drive, |code, c| code != lower_unicode(c) as u32); + general_count_literal(&mut drive, |code, c| code != lower_unicode(c) as u32); } _ => { panic!("TODO: Not Implemented."); @@ -691,7 +715,7 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.ctx().string_position - stack_drive.ctx().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, mut f: F) { let ch = drive.peek_code(1); while !drive.at_end() && f(ch, drive.peek_char()) { drive.skip_char(1); From aa0f20b93e86e6b63eadefb1f17958b2d513ccd3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 26 Dec 2020 18:44:33 +0200 Subject: [PATCH 06/95] impl Pattern.fullmatch, Pattern.search --- interp.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/interp.rs b/interp.rs index 71d89cc0e8..f87cc85abc 100644 --- a/interp.rs +++ b/interp.rs @@ -1,10 +1,9 @@ // good luck to those that follow; here be dragons -use rustpython_common::borrow::BorrowValue; - use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use crate::builtins::PyStrRef; +use rustpython_common::borrow::BorrowValue; use std::collections::HashMap; use std::convert::TryFrom; From 312e5b875677bbc57b085ffae98f230e56717d21 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 27 Dec 2020 21:27:06 +0200 Subject: [PATCH 07/95] impl opcode groupref and assert_not --- interp.rs | 143 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 124 insertions(+), 19 deletions(-) diff --git a/interp.rs b/interp.rs index f87cc85abc..88dbb034da 100644 --- a/interp.rs +++ b/interp.rs @@ -109,12 +109,7 @@ pub(crate) fn pymatch( ); let ctx = MatchContext { string_position: state.start, - string_offset: state - .string - .char_indices() - .nth(state.start) - .map(|x| x.0) - .unwrap_or(0), + string_offset: calc_string_offset(state.string, state.start), code_position: 0, has_matched: None, }; @@ -411,7 +406,7 @@ impl OpcodeDispatcher { } }), SreOpcode::ASSERT => Box::new(OpAssert::default()), - SreOpcode::ASSERT_NOT => unimplemented(), + SreOpcode::ASSERT_NOT => Box::new(OpAssertNot::default()), SreOpcode::AT => once(|drive| { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { @@ -432,9 +427,6 @@ impl OpcodeDispatcher { } }), SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), - SreOpcode::GROUPREF => unimplemented(), - SreOpcode::GROUPREF_EXISTS => unimplemented(), - SreOpcode::GROUPREF_IGNORE => unimplemented(), SreOpcode::IN => once(|drive| { general_op_in(drive, |x| x); }), @@ -480,7 +472,12 @@ impl OpcodeDispatcher { SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); }), - SreOpcode::MARK => unimplemented(), + SreOpcode::MARK => once(|drive| { + drive + .state + .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); + drive.skip_code(2); + }), SreOpcode::MAX_UNTIL => unimplemented(), SreOpcode::MIN_UNTIL => unimplemented(), SreOpcode::NEGATE => unimplemented(), @@ -489,13 +486,36 @@ impl OpcodeDispatcher { SreOpcode::REPEAT_ONE => unimplemented(), SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), - SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), + SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), + SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), + SreOpcode::GROUPREF_LOC_IGNORE => { + once(|drive| general_op_groupref(drive, lower_locate)) + } + SreOpcode::GROUPREF_UNI_IGNORE => { + once(|drive| general_op_groupref(drive, lower_unicode)) + } + SreOpcode::GROUPREF_EXISTS => once(|drive| { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + drive.skip_code(3); + } + _ => drive.skip_code(drive.peek_code(2) as usize + 1), + } + }), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } } +fn calc_string_offset(string: &str, position: usize) -> usize { + string + .char_indices() + .nth(position) + .map(|(i, _)| i) + .unwrap_or(0) +} + fn char_loc_ignore(code: u32, c: char) -> bool { code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 } @@ -509,6 +529,40 @@ fn charset_loc_ignore(set: &[u32], c: char) -> bool { up != lo && charset(set, up) } +fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + let (group_start, group_end) = match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => (start, end), + _ => { + drive.ctx_mut().has_matched = Some(false); + return; + } + }; + let mut wdrive = WrapDrive::drive(*drive.ctx(), &drive); + let mut gdrive = WrapDrive::drive( + MatchContext { + string_position: group_start, + // TODO: cache the offset + string_offset: calc_string_offset(drive.state.string, group_start), + ..*drive.ctx() + }, + &drive, + ); + for _ in group_start..group_end { + if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + return; + } + wdrive.skip_char(1); + gdrive.skip_char(1); + } + let position = wdrive.ctx().string_position; + let offset = wdrive.ctx().string_offset; + drive.skip_code(2); + drive.ctx_mut().string_position = position; + drive.ctx_mut().string_offset = offset; +} + fn general_op_literal bool>(drive: &mut StackDrive, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -766,11 +820,19 @@ fn is_uni_space(c: char) -> bool { c.is_whitespace() } fn is_uni_linebreak(c: char) -> bool { - match c { - '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{001C}' | '\u{001D}' - | '\u{001E}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true, - _ => false, - } + matches!( + c, + '\u{000A}' + | '\u{000B}' + | '\u{000C}' + | '\u{000D}' + | '\u{001C}' + | '\u{001D}' + | '\u{001E}' + | '\u{0085}' + | '\u{2028}' + | '\u{2029}' + ) } fn is_uni_alnum(c: char) -> bool { // TODO: check with cpython @@ -802,7 +864,7 @@ struct OpAssert { } impl Default for OpAssert { fn default() -> Self { - OpAssert { + Self { child_ctx_id: 0, jump_id: 0, } @@ -839,6 +901,49 @@ impl OpAssert { } } +struct OpAssertNot { + child_ctx_id: usize, + jump_id: usize, +} +impl Default for OpAssertNot { + fn default() -> Self { + Self { + child_ctx_id: 0, + jump_id: 0, + } + } +} +impl OpcodeExecutor for OpAssertNot { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + _ => unreachable!(), + } + } +} +impl OpAssertNot { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.skip_code(drive.peek_code(1) as usize + 1); + return None; + } + drive.state.string_position = drive.ctx().string_position - back; + self.child_ctx_id = drive.push_new_context(3); + self.jump_id = 1; + Some(()) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(drive.peek_code(1) as usize + 1); + } + None + } +} + struct OpMinRepeatOne { jump_id: usize, mincount: usize, From 0b2c8d1fa256a0b40223f97b2a5de6b2747c3397 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 10:33:28 +0200 Subject: [PATCH 08/95] impl OpMaxUntil --- interp.rs | 133 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 122 insertions(+), 11 deletions(-) diff --git a/interp.rs b/interp.rs index 88dbb034da..85787ae516 100644 --- a/interp.rs +++ b/interp.rs @@ -20,7 +20,7 @@ pub(crate) struct State<'a> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, - repeat: Option, + repeat_stack: Vec, string_position: usize, } @@ -45,7 +45,7 @@ impl<'a> State<'a> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - repeat: None, + repeat_stack: Vec::new(), marks: Vec::new(), string_position: start, } @@ -56,7 +56,7 @@ impl<'a> State<'a> { self.lastindex = -1; self.marks_stack.clear(); self.context_stack.clear(); - self.repeat = None; + self.repeat_stack.clear(); } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -104,7 +104,7 @@ pub(crate) fn pymatch( string.borrow_value(), start, end, - pattern.flags.clone(), + pattern.flags, &pattern.code, ); let ctx = MatchContext { @@ -237,6 +237,7 @@ trait MatchContextDrive { ]), _ => unreachable!(), }; + // TODO: char::from_u32_unchecked is stable from 1.5.0 unsafe { std::mem::transmute(code) } } fn back_skip_char(&mut self, skip_count: usize) { @@ -426,7 +427,6 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), - SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), SreOpcode::IN => once(|drive| { general_op_in(drive, |x| x); }), @@ -467,7 +467,7 @@ impl OpcodeDispatcher { general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); }), SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| char_loc_ignore(code, c)); + general_op_literal(drive, char_loc_ignore); }), SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); @@ -478,9 +478,8 @@ impl OpcodeDispatcher { .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); }), - SreOpcode::MAX_UNTIL => unimplemented(), + SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::NEGATE => unimplemented(), SreOpcode::RANGE => unimplemented(), SreOpcode::REPEAT => unimplemented(), SreOpcode::REPEAT_ONE => unimplemented(), @@ -504,6 +503,10 @@ impl OpcodeDispatcher { } }), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), + _ => { + // TODO + unreachable!("unexpected opcode") + } } } } @@ -661,7 +664,7 @@ fn charset(set: &[u32], c: char) -> bool { /* <256 blockindices> */ let count = set[i + 1]; if ch < 0x10000 { - let blockindices: &[u8] = unsafe { std::mem::transmute(&set[i + 2..]) }; + let (_, blockindices, _) = unsafe { set[i + 2..].align_to::() }; let block = blockindices[(ch >> 8) as usize]; if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1 << (ch & (32 - 1))) @@ -712,7 +715,7 @@ fn charset(set: &[u32], c: char) -> bool { } fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { - let mut drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); + let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, @@ -749,7 +752,7 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, |code, c| char_loc_ignore(code, c)); + general_count_literal(&mut drive, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); @@ -1035,3 +1038,111 @@ impl OpMinRepeatOne { self._1(drive) } } + +#[derive(Debug, Copy, Clone)] +struct RepeatContext { + skip: usize, + mincount: usize, + maxcount: usize, + count: isize, + last_position: isize, +} + +struct OpMaxUntil { + jump_id: usize, + count: isize, + save_last_position: isize, + child_ctx_id: usize, +} +impl Default for OpMaxUntil { + fn default() -> Self { + Self { + jump_id: 0, + count: 0, + save_last_position: -1, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpMaxUntil { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.string_position = drive.ctx().string_position; + let repeat = match drive.state.repeat_stack.last_mut() { + Some(repeat) => repeat, + None => { + todo!("Internal re error: MAX_UNTIL without REPEAT."); + } + }; + self.count = repeat.count + 1; + + if self.count < repeat.mincount as isize { + // not enough matches + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } + + if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) + && (drive.state.string_position as isize != repeat.last_position) + { + // we may have enough matches, if we can match another item, do so + repeat.count = self.count; + self.save_last_position = repeat.last_position; + repeat.last_position = drive.state.string_position as isize; + drive.state.marks_push(); + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } + + self.child_ctx_id = drive.push_new_context(1); + + self.jump_id = 3; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.count = self.count - 1; + } + None + } + 2 => { + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.last_position = drive.state.string_position as isize; + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + repeat.count = self.count - 1; + drive.state.marks_pop(); + drive.state.string_position = drive.ctx().string_position; + + self.child_ctx_id = drive.push_new_context(1); + + self.jump_id = 3; + Some(()) + } + 3 => { + // cannot match more repeated items here. make sure the tail matches + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } else { + drive.state.repeat_stack.pop(); + } + None + } + _ => unreachable!(), + } + } +} From 93c2b8b55513989982156136e6b92a1f07e02c24 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 13:02:43 +0200 Subject: [PATCH 09/95] impl OpBranch --- interp.rs | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index 85787ae516..03cfdfa96a 100644 --- a/interp.rs +++ b/interp.rs @@ -417,7 +417,6 @@ impl OpcodeDispatcher { } }), SreOpcode::BRANCH => unimplemented(), - SreOpcode::CALL => unimplemented(), SreOpcode::CATEGORY => once(|drive| { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { @@ -1146,3 +1145,51 @@ impl OpcodeExecutor for OpMaxUntil { } } } + +struct OpBranch { + jump_id: usize, + child_ctx_id: usize, + current_branch_length: usize, +} +impl Default for OpBranch { + fn default() -> Self { + Self { jump_id: 0, child_ctx_id: 0, current_branch_length: 0 } + } +} +impl OpcodeExecutor for OpBranch { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.marks_push(); + // jump out the head + self.current_branch_length = 1; + self.jump_id = 1; + self.next(drive) + } + 1 => { + drive.skip_code(self.current_branch_length); + self.current_branch_length = drive.peek_code(0) as usize; + if self.current_branch_length == 0 { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop_keep(); + self.jump_id = 1; + Some(()) + } + _ => unreachable!() + } + } +} \ No newline at end of file From 5a4459856ca57886279f0cc4f1abc33c7cf4a397 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 13:56:37 +0200 Subject: [PATCH 10/95] impl OpRepeat --- interp.rs | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/interp.rs b/interp.rs index 03cfdfa96a..fbcbf260e5 100644 --- a/interp.rs +++ b/interp.rs @@ -416,7 +416,7 @@ impl OpcodeDispatcher { drive.skip_code(2); } }), - SreOpcode::BRANCH => unimplemented(), + SreOpcode::BRANCH => Box::new(OpBranch::default()), SreOpcode::CATEGORY => once(|drive| { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { @@ -1146,6 +1146,39 @@ impl OpcodeExecutor for OpMaxUntil { } } +struct OpMinUntil { + jump_id: usize, + count: isize, + child_ctx_id: usize, +} +impl Default for OpMinUntil { + fn default() -> Self { + Self { + jump_id: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpMinUntil { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.string_position = drive.ctx().string_position; + let repeat = match drive.state.repeat_stack.last_mut() { + Some(repeat) => repeat, + None => { + todo!("Internal re error: MAX_UNTIL without REPEAT."); + } + }; + self.count = repeat.count + 1; + None + } + _ => unreachable!(), + } + } +} + struct OpBranch { jump_id: usize, child_ctx_id: usize, @@ -1153,7 +1186,11 @@ struct OpBranch { } impl Default for OpBranch { fn default() -> Self { - Self { jump_id: 0, child_ctx_id: 0, current_branch_length: 0 } + Self { + jump_id: 0, + child_ctx_id: 0, + current_branch_length: 0, + } } } impl OpcodeExecutor for OpBranch { @@ -1189,7 +1226,46 @@ impl OpcodeExecutor for OpBranch { self.jump_id = 1; Some(()) } - _ => unreachable!() + _ => unreachable!(), + } + } +} + +struct OpRepeat { + jump_id: usize, + child_ctx_id: usize, +} +impl Default for OpRepeat { + fn default() -> Self { + Self { + jump_id: 0, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpRepeat { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + let repeat = RepeatContext { + skip: drive.peek_code(1), + mincount: drive.peek_code(2), + maxcount: drive.peek_code(3), + count: -1, + last_position: -1, + }; + drive.state.repeat_stack.push(repeat); + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 1; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + None + } + _ => unreachable!(), } } -} \ No newline at end of file +} From af7901dcb21cba20bcfa8635b3c86d84574a988e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 16:48:30 +0200 Subject: [PATCH 11/95] impl OpMinUntil --- interp.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/interp.rs b/interp.rs index fbcbf260e5..ec6d41eade 100644 --- a/interp.rs +++ b/interp.rs @@ -478,11 +478,9 @@ impl OpcodeDispatcher { drive.skip_code(2); }), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::RANGE => unimplemented(), - SreOpcode::REPEAT => unimplemented(), + SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), + SreOpcode::REPEAT => Box::new(OpRepeat::default()), SreOpcode::REPEAT_ONE => unimplemented(), - SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -501,9 +499,8 @@ impl OpcodeDispatcher { _ => drive.skip_code(drive.peek_code(2) as usize + 1), } }), - SreOpcode::RANGE_UNI_IGNORE => unimplemented(), _ => { - // TODO + // TODO error expcetion unreachable!("unexpected opcode") } } @@ -1172,8 +1169,52 @@ impl OpcodeExecutor for OpMinUntil { } }; self.count = repeat.count + 1; + + if self.count < repeat.mincount as isize { + // not enough matches + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } + + // see if the tail matches + drive.state.marks_push(); + self.child_ctx_id = drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.count = self.count - 1; + } None } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.state.repeat_stack.pop(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + drive.state.marks_pop(); + + // match more until tail matches + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { + drive.ctx_mut().has_matched = Some(false); + return None; + } + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + Some(()) + } _ => unreachable!(), } } @@ -1248,9 +1289,9 @@ impl OpcodeExecutor for OpRepeat { match self.jump_id { 0 => { let repeat = RepeatContext { - skip: drive.peek_code(1), - mincount: drive.peek_code(2), - maxcount: drive.peek_code(3), + skip: drive.peek_code(1) as usize, + mincount: drive.peek_code(2) as usize, + maxcount: drive.peek_code(3) as usize, count: -1, last_position: -1, }; From fa2adaf2ff9bf8acf642ad210480e686848d4061 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 17:53:14 +0200 Subject: [PATCH 12/95] Impl OpRepeatONe --- interp.rs | 132 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 103 insertions(+), 29 deletions(-) diff --git a/interp.rs b/interp.rs index ec6d41eade..64d70216e3 100644 --- a/interp.rs +++ b/interp.rs @@ -313,14 +313,6 @@ trait OpcodeExecutor { fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } -struct OpUnimplemented {} -impl OpcodeExecutor for OpUnimplemented { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - drive.ctx_mut().has_matched = Some(false); - None - } -} - struct OpOnce { f: Option, } @@ -335,10 +327,6 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } -fn unimplemented() -> Box { - Box::new(OpUnimplemented {}) -} - struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -480,7 +468,7 @@ impl OpcodeDispatcher { SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT => Box::new(OpRepeat::default()), - SreOpcode::REPEAT_ONE => unimplemented(), + SreOpcode::REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -500,7 +488,7 @@ impl OpcodeDispatcher { } }), _ => { - // TODO error expcetion + // TODO python expcetion? unreachable!("unexpected opcode") } } @@ -713,6 +701,7 @@ fn charset(set: &[u32], c: char) -> bool { fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let end = drive.ctx().string_position + maxcount; let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, Err(_) => { @@ -722,54 +711,56 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { match opcode { SreOpcode::ANY => { - while !drive.at_end() && !drive.at_linebreak() { + while !drive.ctx().string_position < end && !drive.at_linebreak() { drive.skip_char(1); } } SreOpcode::ANY_ALL => { - drive.skip_char(drive.remaining_chars()); + drive.skip_char(maxcount); } SreOpcode::IN => { // TODO: pattern[2 or 1..]? - while !drive.at_end() && charset(&drive.pattern()[2..], drive.peek_char()) { + while !drive.ctx().string_position < end + && charset(&drive.pattern()[2..], drive.peek_char()) + { drive.skip_char(1); } } SreOpcode::LITERAL => { - general_count_literal(&mut drive, |code, c| code == c as u32); + general_count_literal(&mut drive, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(&mut drive, |code, c| code != c as u32); + general_count_literal(&mut drive, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(&mut drive, |code, c| code == lower_ascii(c) as u32); + general_count_literal(&mut drive, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); + general_count_literal(&mut drive, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, char_loc_ignore); + general_count_literal(&mut drive, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); + general_count_literal(&mut drive, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, |code, c| code == lower_unicode(c) as u32); + general_count_literal(&mut drive, end, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, |code, c| code != lower_unicode(c) as u32); + general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - panic!("TODO: Not Implemented."); + todo!("repeated single character pattern?"); } } - drive.ctx().string_position - stack_drive.ctx().string_position + drive.ctx().string_position - drive.state().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, mut f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { let ch = drive.peek_code(1); - while !drive.at_end() && f(ch, drive.peek_char()) { + while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } @@ -1310,3 +1301,86 @@ impl OpcodeExecutor for OpRepeat { } } } + +struct OpRepeatOne { + jump_id: usize, + child_ctx_id: usize, + mincount: usize, + maxcount: usize, + count: usize, +} +impl Default for OpRepeatOne { + fn default() -> Self { + Self { + jump_id: 0, + child_ctx_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + } + } +} +impl OpcodeExecutor for OpRepeatOne { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position; + self.count = count(drive, self.maxcount); + drive.skip_char(self.count); + if self.count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 { + // tail is empty. we're finished + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + drive.state.marks_push(); + // TODO: + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + self.jump_id = 1; + self.next(drive) + } + 1 => { + // General case: backtracking + if self.count >= self.mincount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } + + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.back_skip_char(1); + self.count -= 1; + drive.state.marks_pop_keep(); + + self.jump_id = 1; + Some(()) + } + _ => unreachable!(), + } + } +} From ae44580371afb3e347f31647368c2a7a8b1a1578 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 30 Dec 2020 14:14:13 +0200 Subject: [PATCH 13/95] general case for count --- interp.rs | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/interp.rs b/interp.rs index 64d70216e3..99574443c0 100644 --- a/interp.rs +++ b/interp.rs @@ -3,6 +3,7 @@ use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use crate::builtins::PyStrRef; +use crate::pyobject::PyRef; use rustpython_common::borrow::BorrowValue; use std::collections::HashMap; use std::convert::TryFrom; @@ -98,7 +99,7 @@ pub(crate) fn pymatch( string: PyStrRef, start: usize, end: usize, - pattern: &Pattern, + pattern: PyRef, ) -> Option { let mut state = State::new( string.borrow_value(), @@ -135,7 +136,7 @@ pub(crate) fn pymatch( return None; } - Some(Match::new(&state, pattern.pattern.clone(), string.clone())) + Some(Match::new(&state, pattern.clone().into_object(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -425,7 +426,7 @@ impl OpcodeDispatcher { }), SreOpcode::IN_LOC_IGNORE => once(|drive| { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset_loc_ignore(&drive.pattern()[1..], drive.peek_char()) { + if drive.at_end() || !charset_loc_ignore(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -561,7 +562,7 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F fn general_op_in char>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset(&drive.pattern()[1..], f(drive.peek_char())) { + if drive.at_end() || !charset(&drive.pattern()[2..], f(drive.peek_char())) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -698,7 +699,28 @@ fn charset(set: &[u32], c: char) -> bool { false } -fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { +fn count(drive: &mut StackDrive, maxcount: usize) -> usize { + let mut count = 0; + let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + + let save_ctx = *drive.ctx(); + drive.skip_code(4); + let reset_position = drive.ctx().code_position; + + let mut dispatcher = OpcodeDispatcher::new(); + while count < maxcount { + drive.ctx_mut().code_position = reset_position; + dispatcher.dispatch(SreOpcode::try_from(drive.peek_code(0)).unwrap(), drive); + if drive.ctx().has_matched == Some(false) { + break; + } + count += 1; + } + *drive.ctx_mut() = save_ctx; + count +} + +fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let end = drive.ctx().string_position + maxcount; From f7287553e9ed42df638190b56ce4474eb7783c38 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 30 Dec 2020 18:04:00 +0200 Subject: [PATCH 14/95] impl re.Match object --- interp.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index 99574443c0..ac0eadb4ae 100644 --- a/interp.rs +++ b/interp.rs @@ -17,12 +17,12 @@ pub(crate) struct State<'a> { pub end: usize, flags: SreFlag, pattern_codes: &'a [u32], - marks: Vec>, + pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat_stack: Vec, - string_position: usize, + pub string_position: usize, } impl<'a> State<'a> { @@ -136,7 +136,7 @@ pub(crate) fn pymatch( return None; } - Some(Match::new(&state, pattern.clone().into_object(), string.clone())) + Some(Match::new(&state, pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] From 04bb80f157128d59e2a5e25261311ee1be8c143e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 31 Dec 2020 11:22:17 +0200 Subject: [PATCH 15/95] impl Match.group --- interp.rs | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/interp.rs b/interp.rs index ac0eadb4ae..f9b0898a31 100644 --- a/interp.rs +++ b/interp.rs @@ -115,6 +115,7 @@ pub(crate) fn pymatch( has_matched: None, }; state.context_stack.push(ctx); + let mut dispatcher = OpcodeDispatcher::new(); let mut has_matched = None; loop { @@ -123,7 +124,6 @@ pub(crate) fn pymatch( } let ctx_id = state.context_stack.len() - 1; let mut drive = StackDrive::drive(ctx_id, state); - let mut dispatcher = OpcodeDispatcher::new(); has_matched = dispatcher.pymatch(&mut drive); state = drive.take(); @@ -132,11 +132,11 @@ pub(crate) fn pymatch( } } - if has_matched == None || has_matched == Some(false) { - return None; + if has_matched != Some(true) { + None + } else { + Some(Match::new(&state, pattern.clone(), string.clone())) } - - Some(Match::new(&state, pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -344,7 +344,9 @@ impl OpcodeDispatcher { while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); - self.dispatch(opcode, drive); + if !self.dispatch(opcode, drive) { + return None; + } } match drive.ctx().has_matched { Some(matched) => Some(matched), @@ -469,7 +471,7 @@ impl OpcodeDispatcher { SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT => Box::new(OpRepeat::default()), - SreOpcode::REPEAT_ONE => Box::new(OpMinRepeatOne::default()), + SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -1329,7 +1331,7 @@ struct OpRepeatOne { child_ctx_id: usize, mincount: usize, maxcount: usize, - count: usize, + count: isize, } impl Default for OpRepeatOne { fn default() -> Self { @@ -1354,9 +1356,9 @@ impl OpcodeExecutor for OpRepeatOne { return None; } drive.state.string_position = drive.ctx().string_position; - self.count = count(drive, self.maxcount); - drive.skip_char(self.count); - if self.count < self.mincount { + self.count = count(drive, self.maxcount) as isize; + drive.skip_char(self.count as usize); + if self.count < self.mincount as isize { drive.ctx_mut().has_matched = Some(false); return None; } @@ -1378,7 +1380,7 @@ impl OpcodeExecutor for OpRepeatOne { } 1 => { // General case: backtracking - if self.count >= self.mincount { + if self.count >= self.mincount as isize { drive.state.string_position = drive.ctx().string_position; self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; From 8c442f599bb67f4a9878cad9e4df180ab52b39b7 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 10:30:05 +0200 Subject: [PATCH 16/95] rework OpMaxUntil; restruct popping context; add tests; --- interp.rs | 660 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 364 insertions(+), 296 deletions(-) diff --git a/interp.rs b/interp.rs index f9b0898a31..cb06ee0b8a 100644 --- a/interp.rs +++ b/interp.rs @@ -23,6 +23,7 @@ pub(crate) struct State<'a> { context_stack: Vec, repeat_stack: Vec, pub string_position: usize, + popped_context: Option, } impl<'a> State<'a> { @@ -49,6 +50,7 @@ impl<'a> State<'a> { repeat_stack: Vec::new(), marks: Vec::new(), string_position: start, + popped_context: None, } } @@ -58,6 +60,7 @@ impl<'a> State<'a> { self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); + self.popped_context = None; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -128,7 +131,7 @@ pub(crate) fn pymatch( has_matched = dispatcher.pymatch(&mut drive); state = drive.take(); if has_matched.is_some() { - state.context_stack.pop(); + state.popped_context = state.context_stack.pop(); } } @@ -151,6 +154,9 @@ trait MatchContextDrive { fn ctx_mut(&mut self) -> &mut MatchContext; fn ctx(&self) -> &MatchContext; fn state(&self) -> &State; + fn repeat_ctx(&self) -> &RepeatContext { + self.state().repeat_stack.last().unwrap() + } fn str(&self) -> &str { unsafe { std::str::from_utf8_unchecked( @@ -181,6 +187,9 @@ trait MatchContextDrive { } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; + if self.ctx().code_position > self.state().pattern_codes.len() { + self.ctx_mut().code_position = self.state().pattern_codes.len(); + } } fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position @@ -263,16 +272,17 @@ impl<'a> StackDrive<'a> { fn take(self) -> State<'a> { self.state } - fn push_new_context(&mut self, pattern_offset: usize) -> usize { + fn push_new_context(&mut self, pattern_offset: usize) { let ctx = self.ctx(); - let child_ctx = MatchContext { - string_position: ctx.string_position, - string_offset: ctx.string_offset, - code_position: ctx.code_position + pattern_offset, - has_matched: None, - }; + let mut child_ctx = MatchContext { ..*ctx }; + child_ctx.code_position += pattern_offset; + if child_ctx.code_position > self.state.pattern_codes.len() { + child_ctx.code_position = self.state.pattern_codes.len(); + } self.state.context_stack.push(child_ctx); - self.state.context_stack.len() - 1 + } + fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { + self.state.repeat_stack.last_mut().unwrap() } } impl MatchContextDrive for StackDrive<'_> { @@ -328,6 +338,39 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } +// F1 F2 are same identical, but workaround for closure +struct OpTwice { + f1: Option, + f2: Option, +} +impl OpcodeExecutor for OpTwice +where + F1: FnOnce(&mut StackDrive), + F2: FnOnce(&mut StackDrive), +{ + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + if let Some(f1) = self.f1.take() { + f1(drive); + Some(()) + } else if let Some(f2) = self.f2.take() { + f2(drive); + None + } else { + unreachable!() + } + } +} +fn twice(f1: F1, f2: F2) -> Box> +where + F1: FnOnce(&mut StackDrive), + F2: FnOnce(&mut StackDrive), +{ + Box::new(OpTwice { + f1: Some(f1), + f2: Some(f2), + }) +} + struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -397,8 +440,44 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), - SreOpcode::ASSERT => Box::new(OpAssert::default()), - SreOpcode::ASSERT_NOT => Box::new(OpAssertNot::default()), + SreOpcode::ASSERT => twice( + |drive| { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.ctx_mut().has_matched = Some(false); + return; + } + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); + }, + |drive| { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.skip_code(drive.peek_code(1) as usize + 1); + } else { + drive.ctx_mut().has_matched = Some(false); + } + }, + ), + SreOpcode::ASSERT_NOT => twice( + |drive| { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.skip_code(drive.peek_code(1) as usize + 1); + return; + } + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); + }, + |drive| { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(drive.peek_code(1) as usize + 1); + } + }, + ), SreOpcode::AT => once(|drive| { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { @@ -468,9 +547,29 @@ impl OpcodeDispatcher { .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); }), + SreOpcode::REPEAT => twice( + // create repeat context. all the hard work is done by the UNTIL + // operator (MAX_UNTIL, MIN_UNTIL) + // <1=min> <2=max> item tail + |drive| { + let repeat = RepeatContext { + count: -1, + code_position: drive.ctx().code_position, + last_position: std::usize::MAX, + }; + drive.state.repeat_stack.push(repeat); + drive.state.string_position = drive.ctx().string_position; + // execute UNTIL operator + drive.push_new_context(drive.peek_code(1) as usize + 1); + }, + |drive| { + drive.state.repeat_stack.pop(); + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + }, + ), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), - SreOpcode::REPEAT => Box::new(OpRepeat::default()), + SreOpcode::MIN_UNTIL => todo!("min until"), SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), @@ -872,90 +971,12 @@ fn is_utf8_first_byte(b: u8) -> bool { (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) } -struct OpAssert { - child_ctx_id: usize, - jump_id: usize, -} -impl Default for OpAssert { - fn default() -> Self { - Self { - child_ctx_id: 0, - jump_id: 0, - } - } -} -impl OpcodeExecutor for OpAssert { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - _ => unreachable!(), - } - } -} -impl OpAssert { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.state.string_position = drive.ctx().string_position - back; - self.child_ctx_id = drive.push_new_context(3); - self.jump_id = 1; - Some(()) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { - drive.skip_code(drive.peek_code(1) as usize + 1); - } else { - drive.ctx_mut().has_matched = Some(false); - } - None - } -} - -struct OpAssertNot { - child_ctx_id: usize, - jump_id: usize, -} -impl Default for OpAssertNot { - fn default() -> Self { - Self { - child_ctx_id: 0, - jump_id: 0, - } - } -} -impl OpcodeExecutor for OpAssertNot { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - _ => unreachable!(), - } - } -} -impl OpAssertNot { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { - drive.skip_code(drive.peek_code(1) as usize + 1); - return None; - } - drive.state.string_position = drive.ctx().string_position - back; - self.child_ctx_id = drive.push_new_context(3); - self.jump_id = 1; - Some(()) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(drive.peek_code(1) as usize + 1); - } - None - } +#[derive(Debug, Copy, Clone)] +struct RepeatContext { + count: isize, + code_position: usize, + // zero-width match protection + last_position: usize, } struct OpMinRepeatOne { @@ -963,7 +984,6 @@ struct OpMinRepeatOne { mincount: usize, maxcount: usize, count: usize, - child_ctx_id: usize, } impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { @@ -982,7 +1002,6 @@ impl Default for OpMinRepeatOne { mincount: 0, maxcount: 0, count: 0, - child_ctx_id: 0, } } } @@ -1023,7 +1042,7 @@ impl OpMinRepeatOne { fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { if self.maxcount == MAXREPEAT || self.count <= self.maxcount { drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; return Some(()); } @@ -1033,7 +1052,8 @@ impl OpMinRepeatOne { None } fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; } @@ -1050,201 +1070,290 @@ impl OpMinRepeatOne { } } -#[derive(Debug, Copy, Clone)] -struct RepeatContext { - skip: usize, - mincount: usize, - maxcount: usize, - count: isize, - last_position: isize, -} - +// Everything is stored in RepeatContext struct OpMaxUntil { jump_id: usize, count: isize, - save_last_position: isize, - child_ctx_id: usize, + save_last_position: usize, } impl Default for OpMaxUntil { fn default() -> Self { - Self { + OpMaxUntil { jump_id: 0, count: 0, - save_last_position: -1, - child_ctx_id: 0, + save_last_position: 0, } } } impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => { - drive.state.string_position = drive.ctx().string_position; - let repeat = match drive.state.repeat_stack.last_mut() { - Some(repeat) => repeat, - None => { - todo!("Internal re error: MAX_UNTIL without REPEAT."); - } - }; - self.count = repeat.count + 1; - - if self.count < repeat.mincount as isize { - // not enough matches - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } - - if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) - && (drive.state.string_position as isize != repeat.last_position) - { - // we may have enough matches, if we can match another item, do so - repeat.count = self.count; - self.save_last_position = repeat.last_position; - repeat.last_position = drive.state.string_position as isize; - drive.state.marks_push(); - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 2; - return Some(()); - } - - self.child_ctx_id = drive.push_new_context(1); - - self.jump_id = 3; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.count = self.count - 1; - } - None - } - 2 => { - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.last_position = drive.state.string_position as isize; - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - repeat.count = self.count - 1; - drive.state.marks_pop(); - drive.state.string_position = drive.ctx().string_position; + 0 => self._0(drive), + 1 => self._1(drive), + 2 => self._2(drive), + 3 => self._3(drive), + 4 => self._4(drive), + _ => unreachable!(), + } + } +} +impl OpMaxUntil { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + let RepeatContext { + count, + code_position, + last_position, + } = *drive.repeat_ctx(); + drive.ctx_mut().code_position = code_position; + let mincount = drive.peek_code(2) as usize; + let maxcount = drive.peek_code(3) as usize; + self.count = count + 1; + + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } - self.child_ctx_id = drive.push_new_context(1); + if ((count as usize) < maxcount || maxcount == MAXREPEAT) + && drive.state.string_position != last_position + { + // we may have enough matches, if we can match another item, do so + drive.repeat_ctx_mut().count = self.count; + drive.state.marks_push(); + // self.save_last_position = last_position; + // drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } - self.jump_id = 3; - Some(()) - } - 3 => { - // cannot match more repeated items here. make sure the tail matches - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - } else { - drive.state.repeat_stack.pop(); - } - None - } - _ => unreachable!(), + self.jump_id = 3; + self.next(drive) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; } + None + } + fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { + // drive.repeat_ctx_mut().last_position = self.save_last_position; + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop(); + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + fn _3(&mut self, drive: &mut StackDrive) -> Option<()> { + // cannot match more repeated items here. make sure the tail matches + drive.skip_code(drive.peek_code(1) as usize + 1); + drive.push_new_context(1); + self.jump_id = 4; + Some(()) + } + fn _4(&mut self, drive: &mut StackDrive) -> Option<()> { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } + None } } +// struct OpMaxUntil { +// jump_id: usize, +// count: isize, +// save_last_position: isize, +// } +// impl Default for OpMaxUntil { +// fn default() -> Self { +// Self { +// jump_id: 0, +// count: 0, +// save_last_position: -1, +// } +// } +// } +// impl OpcodeExecutor for OpMaxUntil { +// fn next(&mut self, drive: &mut StackDrive) -> Option<()> { +// match self.jump_id { +// 0 => { +// drive.state.string_position = drive.ctx().string_position; +// let repeat = match drive.state.repeat_stack.last_mut() { +// Some(repeat) => repeat, +// None => { +// panic!("Internal re error: MAX_UNTIL without REPEAT."); +// } +// }; +// self.count = repeat.count + 1; + +// if self.count < repeat.mincount as isize { +// // not enough matches +// repeat.count = self.count; +// drive.push_new_context(4); +// self.jump_id = 1; +// return Some(()); +// } + +// if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) +// && (drive.state.string_position as isize != repeat.last_position) +// { +// // we may have enough matches, if we can match another item, do so +// repeat.count = self.count; +// self.save_last_position = repeat.last_position; +// repeat.last_position = drive.state.string_position as isize; +// drive.state.marks_push(); +// drive.push_new_context(4); +// self.jump_id = 2; +// return Some(()); +// } + +// drive.push_new_context(1); + +// self.jump_id = 3; +// Some(()) +// } +// 1 => { +// let child_ctx = drive.state.popped_context.unwrap(); +// drive.ctx_mut().has_matched = child_ctx.has_matched; +// if drive.ctx().has_matched != Some(true) { +// drive.state.string_position = drive.ctx().string_position; +// let repeat = drive.state.repeat_stack.last_mut().unwrap(); +// repeat.count = self.count - 1; +// } +// None +// } +// 2 => { +// let repeat = drive.state.repeat_stack.last_mut().unwrap(); +// repeat.last_position = drive.state.string_position as isize; +// let child_ctx = drive.state.popped_context.unwrap(); +// if child_ctx.has_matched == Some(true) { +// drive.state.marks_pop_discard(); +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } +// repeat.count = self.count - 1; +// drive.state.marks_pop(); +// drive.state.string_position = drive.ctx().string_position; + +// drive.push_new_context(1); + +// self.jump_id = 3; +// Some(()) +// } +// 3 => { +// // cannot match more repeated items here. make sure the tail matches +// let child_ctx = drive.state.popped_context.unwrap(); +// drive.ctx_mut().has_matched = child_ctx.has_matched; +// if drive.ctx().has_matched != Some(true) { +// drive.state.string_position = drive.ctx().string_position; +// } else { +// drive.state.repeat_stack.pop(); +// } +// None +// } +// _ => unreachable!(), +// } +// } +// } + struct OpMinUntil { jump_id: usize, count: isize, - child_ctx_id: usize, } impl Default for OpMinUntil { fn default() -> Self { Self { jump_id: 0, count: 0, - child_ctx_id: 0, } } } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - drive.state.string_position = drive.ctx().string_position; - let repeat = match drive.state.repeat_stack.last_mut() { - Some(repeat) => repeat, - None => { - todo!("Internal re error: MAX_UNTIL without REPEAT."); - } - }; - self.count = repeat.count + 1; - - if self.count < repeat.mincount as isize { - // not enough matches - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } - - // see if the tail matches - drive.state.marks_push(); - self.child_ctx_id = drive.push_new_context(1); - self.jump_id = 2; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.count = self.count - 1; - } - None - } - 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - if child_ctx.has_matched == Some(true) { - drive.state.repeat_stack.pop(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.state.marks_pop(); - - // match more until tail matches - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { - drive.ctx_mut().has_matched = Some(false); - return None; - } - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } + None + // match self.jump_id { + // 0 => { + // drive.state.string_position = drive.ctx().string_position; + // let repeat = match drive.state.repeat_stack.last_mut() { + // Some(repeat) => repeat, + // None => { + // todo!("Internal re error: MAX_UNTIL without REPEAT."); + // } + // }; + // self.count = repeat.count + 1; + + // if self.count < repeat.mincount as isize { + // // not enough matches + // repeat.count = self.count; + // drive.push_new_context(4); + // self.jump_id = 1; + // return Some(()); + // } + + // // see if the tail matches + // drive.state.marks_push(); + // drive.push_new_context(1); + // self.jump_id = 2; + // Some(()) + // } + // 1 => { + // let child_ctx = drive.state.popped_context.unwrap(); + // drive.ctx_mut().has_matched = child_ctx.has_matched; + // if drive.ctx().has_matched != Some(true) { + // drive.state.string_position = drive.ctx().string_position; + // let repeat = drive.state.repeat_stack.last_mut().unwrap(); + // repeat.count = self.count - 1; + // } + // None + // } + // 2 => { + // let child_ctx = drive.state.popped_context.unwrap(); + // if child_ctx.has_matched == Some(true) { + // drive.state.repeat_stack.pop(); + // drive.ctx_mut().has_matched = Some(true); + // return None; + // } + // drive.state.string_position = drive.ctx().string_position; + // drive.state.marks_pop(); + + // // match more until tail matches + // let repeat = drive.state.repeat_stack.last_mut().unwrap(); + // if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { + // drive.ctx_mut().has_matched = Some(false); + // return None; + // } + // repeat.count = self.count; + // drive.push_new_context(4); + // self.jump_id = 1; + // Some(()) + // } + // _ => unreachable!(), + // } } } struct OpBranch { jump_id: usize, - child_ctx_id: usize, current_branch_length: usize, } impl Default for OpBranch { fn default() -> Self { Self { jump_id: 0, - child_ctx_id: 0, current_branch_length: 0, } } @@ -1268,12 +1377,12 @@ impl OpcodeExecutor for OpBranch { return None; } drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(1); + drive.push_new_context(1); self.jump_id = 2; Some(()) } 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; @@ -1287,48 +1396,8 @@ impl OpcodeExecutor for OpBranch { } } -struct OpRepeat { - jump_id: usize, - child_ctx_id: usize, -} -impl Default for OpRepeat { - fn default() -> Self { - Self { - jump_id: 0, - child_ctx_id: 0, - } - } -} -impl OpcodeExecutor for OpRepeat { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let repeat = RepeatContext { - skip: drive.peek_code(1) as usize, - mincount: drive.peek_code(2) as usize, - maxcount: drive.peek_code(3) as usize, - count: -1, - last_position: -1, - }; - drive.state.repeat_stack.push(repeat); - drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 1; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - None - } - _ => unreachable!(), - } - } -} - struct OpRepeatOne { jump_id: usize, - child_ctx_id: usize, mincount: usize, maxcount: usize, count: isize, @@ -1337,7 +1406,6 @@ impl Default for OpRepeatOne { fn default() -> Self { Self { jump_id: 0, - child_ctx_id: 0, mincount: 0, maxcount: 0, count: 0, @@ -1382,7 +1450,7 @@ impl OpcodeExecutor for OpRepeatOne { // General case: backtracking if self.count >= self.mincount as isize { drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; return Some(()); } @@ -1392,7 +1460,7 @@ impl OpcodeExecutor for OpRepeatOne { None } 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; From 8fba935bba46ade66107a3c7aabb65e4eae00dcb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 10:54:44 +0200 Subject: [PATCH 17/95] OpMaxUntil zero-width protection --- interp.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index cb06ee0b8a..4a015187e9 100644 --- a/interp.rs +++ b/interp.rs @@ -1107,6 +1107,7 @@ impl OpMaxUntil { drive.ctx_mut().code_position = code_position; let mincount = drive.peek_code(2) as usize; let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; self.count = count + 1; if (self.count as usize) < mincount { @@ -1123,8 +1124,8 @@ impl OpMaxUntil { // we may have enough matches, if we can match another item, do so drive.repeat_ctx_mut().count = self.count; drive.state.marks_push(); - // self.save_last_position = last_position; - // drive.repeat_ctx_mut().last_position = drive.state.string_position; + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; drive.push_new_context(4); self.jump_id = 2; return Some(()); @@ -1143,7 +1144,7 @@ impl OpMaxUntil { None } fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - // drive.repeat_ctx_mut().last_position = self.save_last_position; + drive.repeat_ctx_mut().last_position = self.save_last_position; let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.state.marks_pop_discard(); From af1a53cb0530f27804442f86d3e1e05a4abcacef Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 11:34:40 +0200 Subject: [PATCH 18/95] impl Match.groups() --- interp.rs | 249 ++++++++++++++++-------------------------------------- 1 file changed, 72 insertions(+), 177 deletions(-) diff --git a/interp.rs b/interp.rs index 4a015187e9..ad0790a31b 100644 --- a/interp.rs +++ b/interp.rs @@ -1070,7 +1070,6 @@ impl OpMinRepeatOne { } } -// Everything is stored in RepeatContext struct OpMaxUntil { jump_id: usize, count: isize, @@ -1088,189 +1087,85 @@ impl Default for OpMaxUntil { impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - 2 => self._2(drive), - 3 => self._3(drive), - 4 => self._4(drive), - _ => unreachable!(), - } - } -} -impl OpMaxUntil { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let RepeatContext { - count, - code_position, - last_position, - } = *drive.repeat_ctx(); - drive.ctx_mut().code_position = code_position; - let mincount = drive.peek_code(2) as usize; - let maxcount = drive.peek_code(3) as usize; - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; + 0 => { + let RepeatContext { + count, + code_position, + last_position, + } = *drive.repeat_ctx(); + drive.ctx_mut().code_position = code_position; + let mincount = drive.peek_code(2) as usize; + let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; + self.count = count + 1; - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } - if ((count as usize) < maxcount || maxcount == MAXREPEAT) - && drive.state.string_position != last_position - { - // we may have enough matches, if we can match another item, do so - drive.repeat_ctx_mut().count = self.count; - drive.state.marks_push(); - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context(4); - self.jump_id = 2; - return Some(()); - } + if ((count as usize) < maxcount || maxcount == MAXREPEAT) + && drive.state.string_position != last_position + { + // we may have enough matches, if we can match another item, do so + drive.repeat_ctx_mut().count = self.count; + drive.state.marks_push(); + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } - self.jump_id = 3; - self.next(drive) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - } - None - } - fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - drive.repeat_ctx_mut().last_position = self.save_last_position; - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop(); - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - self.jump_id = 3; - self.next(drive) - } - fn _3(&mut self, drive: &mut StackDrive) -> Option<()> { - // cannot match more repeated items here. make sure the tail matches - drive.skip_code(drive.peek_code(1) as usize + 1); - drive.push_new_context(1); - self.jump_id = 4; - Some(()) - } - fn _4(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + 1 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + } + None + } + 2 => { + drive.repeat_ctx_mut().last_position = self.save_last_position; + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop(); + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + 3 => { + // cannot match more repeated items here. make sure the tail matches + drive.skip_code(drive.peek_code(1) as usize + 1); + drive.push_new_context(1); + self.jump_id = 4; + Some(()) + } + 4 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } + None + } + _ => unreachable!(), } - None } } -// struct OpMaxUntil { -// jump_id: usize, -// count: isize, -// save_last_position: isize, -// } -// impl Default for OpMaxUntil { -// fn default() -> Self { -// Self { -// jump_id: 0, -// count: 0, -// save_last_position: -1, -// } -// } -// } -// impl OpcodeExecutor for OpMaxUntil { -// fn next(&mut self, drive: &mut StackDrive) -> Option<()> { -// match self.jump_id { -// 0 => { -// drive.state.string_position = drive.ctx().string_position; -// let repeat = match drive.state.repeat_stack.last_mut() { -// Some(repeat) => repeat, -// None => { -// panic!("Internal re error: MAX_UNTIL without REPEAT."); -// } -// }; -// self.count = repeat.count + 1; - -// if self.count < repeat.mincount as isize { -// // not enough matches -// repeat.count = self.count; -// drive.push_new_context(4); -// self.jump_id = 1; -// return Some(()); -// } - -// if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) -// && (drive.state.string_position as isize != repeat.last_position) -// { -// // we may have enough matches, if we can match another item, do so -// repeat.count = self.count; -// self.save_last_position = repeat.last_position; -// repeat.last_position = drive.state.string_position as isize; -// drive.state.marks_push(); -// drive.push_new_context(4); -// self.jump_id = 2; -// return Some(()); -// } - -// drive.push_new_context(1); - -// self.jump_id = 3; -// Some(()) -// } -// 1 => { -// let child_ctx = drive.state.popped_context.unwrap(); -// drive.ctx_mut().has_matched = child_ctx.has_matched; -// if drive.ctx().has_matched != Some(true) { -// drive.state.string_position = drive.ctx().string_position; -// let repeat = drive.state.repeat_stack.last_mut().unwrap(); -// repeat.count = self.count - 1; -// } -// None -// } -// 2 => { -// let repeat = drive.state.repeat_stack.last_mut().unwrap(); -// repeat.last_position = drive.state.string_position as isize; -// let child_ctx = drive.state.popped_context.unwrap(); -// if child_ctx.has_matched == Some(true) { -// drive.state.marks_pop_discard(); -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } -// repeat.count = self.count - 1; -// drive.state.marks_pop(); -// drive.state.string_position = drive.ctx().string_position; - -// drive.push_new_context(1); - -// self.jump_id = 3; -// Some(()) -// } -// 3 => { -// // cannot match more repeated items here. make sure the tail matches -// let child_ctx = drive.state.popped_context.unwrap(); -// drive.ctx_mut().has_matched = child_ctx.has_matched; -// if drive.ctx().has_matched != Some(true) { -// drive.state.string_position = drive.ctx().string_position; -// } else { -// drive.state.repeat_stack.pop(); -// } -// None -// } -// _ => unreachable!(), -// } -// } -// } - struct OpMinUntil { jump_id: usize, count: isize, From db84f329816d812349b49fa843b4d0480674aba2 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 13:10:19 +0200 Subject: [PATCH 19/95] fix Opcode::CHARSET --- interp.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index ad0790a31b..289dbc2ec9 100644 --- a/interp.rs +++ b/interp.rs @@ -741,7 +741,8 @@ fn charset(set: &[u32], c: char) -> bool { } SreOpcode::CHARSET => { /* */ - if ch < 256 && (set[(ch / 32) as usize] & (1 << (32 - 1))) != 0 { + let set = &set[1..]; + if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) { return ok; } i += 8; From 36433a9f4d026df404e419d604e67295fa4db758 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 20:13:06 +0200 Subject: [PATCH 20/95] fix Opcode::BIGCHARSET --- interp.rs | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/interp.rs b/interp.rs index 289dbc2ec9..ca5efd9e4f 100644 --- a/interp.rs +++ b/interp.rs @@ -497,22 +497,16 @@ impl OpcodeDispatcher { } }), SreOpcode::IN => once(|drive| { - general_op_in(drive, |x| x); + general_op_in(drive, |set, c| charset(set, c)); }), SreOpcode::IN_IGNORE => once(|drive| { - general_op_in(drive, lower_ascii); + general_op_in(drive, |set, c| charset(set, lower_ascii(c))); }), SreOpcode::IN_UNI_IGNORE => once(|drive| { - general_op_in(drive, lower_unicode); + general_op_in(drive, |set, c| charset(set, lower_unicode(c))); }), SreOpcode::IN_LOC_IGNORE => once(|drive| { - let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset_loc_ignore(&drive.pattern()[2..], drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(skip + 1); - drive.skip_char(1); - } + general_op_in(drive, |set, c| charset_loc_ignore(set, c)); }), SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { drive.skip_code(drive.peek_code(1) as usize + 1); @@ -661,9 +655,9 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F } } -fn general_op_in char>(drive: &mut StackDrive, f: F) { +fn general_op_in bool>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset(&drive.pattern()[2..], f(drive.peek_char())) { + if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -749,18 +743,20 @@ fn charset(set: &[u32], c: char) -> bool { } SreOpcode::BIGCHARSET => { /* <256 blockindices> */ - let count = set[i + 1]; + let count = set[i + 1] as usize; if ch < 0x10000 { - let (_, blockindices, _) = unsafe { set[i + 2..].align_to::() }; - let block = blockindices[(ch >> 8) as usize]; - if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] - & (1 << (ch & (32 - 1))) + let set = &set[2..]; + let block_index = ch >> 8; + let (_, blockindices, _) = unsafe { set.align_to::() }; + let blocks = &set[64..]; + let block = blockindices[block_index as usize]; + if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1u32 << (ch & 31)) != 0 { return ok; } } - i += 2 + 64 + count as usize * 8; + i += 2 + 64 + count * 8; } SreOpcode::LITERAL => { /* */ From f05f6cb44df000ad7cffcd79b6447c97756d15dd Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 3 Jan 2021 19:44:40 +0200 Subject: [PATCH 21/95] impl Pattern.sub --- interp.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index ca5efd9e4f..98844ee9ca 100644 --- a/interp.rs +++ b/interp.rs @@ -142,6 +142,27 @@ pub(crate) fn pymatch( } } +pub(crate) fn search( + string: PyStrRef, + start: usize, + end: usize, + pattern: PyRef, +) -> Option { + // TODO: optimize by op info and skip prefix + let end = std::cmp::min(end, string.char_len()); + for i in start..end { + if let Some(m) = pymatch( + string.clone(), + i, + end, + pattern.clone(), + ) { + return Some(m); + } + } + None +} + #[derive(Debug, Copy, Clone)] struct MatchContext { string_position: usize, @@ -750,7 +771,8 @@ fn charset(set: &[u32], c: char) -> bool { let (_, blockindices, _) = unsafe { set.align_to::() }; let blocks = &set[64..]; let block = blockindices[block_index as usize]; - if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1u32 << (ch & 31)) + if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] + & (1u32 << (ch & 31)) != 0 { return ok; From 817eb66167810a163eb889ce2626418b51bd6afb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 4 Jan 2021 16:01:17 +0200 Subject: [PATCH 22/95] fix OpMinUntil --- interp.rs | 100 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 20 deletions(-) diff --git a/interp.rs b/interp.rs index 98844ee9ca..a3263eddce 100644 --- a/interp.rs +++ b/interp.rs @@ -151,12 +151,7 @@ pub(crate) fn search( // TODO: optimize by op info and skip prefix let end = std::cmp::min(end, string.char_len()); for i in start..end { - if let Some(m) = pymatch( - string.clone(), - i, - end, - pattern.clone(), - ) { + if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { return Some(m); } } @@ -294,12 +289,13 @@ impl<'a> StackDrive<'a> { self.state } fn push_new_context(&mut self, pattern_offset: usize) { - let ctx = self.ctx(); - let mut child_ctx = MatchContext { ..*ctx }; + let mut child_ctx = MatchContext { ..*self.ctx() }; child_ctx.code_position += pattern_offset; - if child_ctx.code_position > self.state.pattern_codes.len() { - child_ctx.code_position = self.state.pattern_codes.len(); - } + self.state.context_stack.push(child_ctx); + } + fn push_new_context_at(&mut self, code_position: usize) { + let mut child_ctx = MatchContext { ..*self.ctx() }; + child_ctx.code_position = code_position; self.state.context_stack.push(child_ctx); } fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { @@ -571,6 +567,8 @@ impl OpcodeDispatcher { count: -1, code_position: drive.ctx().code_position, last_position: std::usize::MAX, + mincount: drive.peek_code(2) as usize, + maxcount: drive.peek_code(3) as usize, }; drive.state.repeat_stack.push(repeat); drive.state.string_position = drive.ctx().string_position; @@ -584,7 +582,7 @@ impl OpcodeDispatcher { }, ), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => todo!("min until"), + SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), @@ -996,6 +994,8 @@ struct RepeatContext { code_position: usize, // zero-width match protection last_position: usize, + mincount: usize, + maxcount: usize, } struct OpMinRepeatOne { @@ -1111,22 +1111,22 @@ impl OpcodeExecutor for OpMaxUntil { count, code_position, last_position, + mincount, + maxcount, } = *drive.repeat_ctx(); - drive.ctx_mut().code_position = code_position; - let mincount = drive.peek_code(2) as usize; - let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; self.count = count + 1; if (self.count as usize) < mincount { // not enough matches drive.repeat_ctx_mut().count = self.count; - drive.push_new_context(4); + drive.push_new_context_at(code_position + 4); self.jump_id = 1; return Some(()); } - if ((count as usize) < maxcount || maxcount == MAXREPEAT) + if ((self.count as usize) < maxcount || maxcount == MAXREPEAT) && drive.state.string_position != last_position { // we may have enough matches, if we can match another item, do so @@ -1134,7 +1134,7 @@ impl OpcodeExecutor for OpMaxUntil { drive.state.marks_push(); self.save_last_position = last_position; drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context(4); + drive.push_new_context_at(code_position + 4); self.jump_id = 2; return Some(()); } @@ -1167,7 +1167,6 @@ impl OpcodeExecutor for OpMaxUntil { } 3 => { // cannot match more repeated items here. make sure the tail matches - drive.skip_code(drive.peek_code(1) as usize + 1); drive.push_new_context(1); self.jump_id = 4; Some(()) @@ -1188,18 +1187,79 @@ impl OpcodeExecutor for OpMaxUntil { struct OpMinUntil { jump_id: usize, count: isize, + save_repeat: Option, } impl Default for OpMinUntil { fn default() -> Self { Self { jump_id: 0, count: 0, + save_repeat: None, } } } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - None + match self.jump_id { + 0 => { + let RepeatContext { + count, + code_position, + last_position: _, + mincount, + maxcount: _, + } = *drive.repeat_ctx(); + drive.state.string_position = drive.ctx().string_position; + self.count = count + 1; + + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context_at(code_position + 4); + self.jump_id = 1; + return Some(()); + } + + // see if the tail matches + drive.state.marks_push(); + self.save_repeat = drive.state.repeat_stack.pop(); + drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 1 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + } + None + } + 2 => { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.repeat_stack.push(self.save_repeat.unwrap()); + drive.state.string_position = drive.ctx().string_position; + drive.state.marks_pop(); + + // match more unital tail matches + let maxcount = drive.repeat_ctx().maxcount; + let code_position = drive.repeat_ctx().code_position; + if self.count as usize >= maxcount && maxcount != MAXREPEAT { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context_at(code_position + 4); + self.jump_id = 1; + Some(()) + } + _ => unreachable!(), + } // match self.jump_id { // 0 => { // drive.state.string_position = drive.ctx().string_position; From 33ef82364516422776beaa498db8d96a167c7b95 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 6 Jan 2021 10:30:57 +0200 Subject: [PATCH 23/95] impl Match.groupdict --- interp.rs | 58 ------------------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/interp.rs b/interp.rs index a3263eddce..c118984b38 100644 --- a/interp.rs +++ b/interp.rs @@ -1260,64 +1260,6 @@ impl OpcodeExecutor for OpMinUntil { } _ => unreachable!(), } - // match self.jump_id { - // 0 => { - // drive.state.string_position = drive.ctx().string_position; - // let repeat = match drive.state.repeat_stack.last_mut() { - // Some(repeat) => repeat, - // None => { - // todo!("Internal re error: MAX_UNTIL without REPEAT."); - // } - // }; - // self.count = repeat.count + 1; - - // if self.count < repeat.mincount as isize { - // // not enough matches - // repeat.count = self.count; - // drive.push_new_context(4); - // self.jump_id = 1; - // return Some(()); - // } - - // // see if the tail matches - // drive.state.marks_push(); - // drive.push_new_context(1); - // self.jump_id = 2; - // Some(()) - // } - // 1 => { - // let child_ctx = drive.state.popped_context.unwrap(); - // drive.ctx_mut().has_matched = child_ctx.has_matched; - // if drive.ctx().has_matched != Some(true) { - // drive.state.string_position = drive.ctx().string_position; - // let repeat = drive.state.repeat_stack.last_mut().unwrap(); - // repeat.count = self.count - 1; - // } - // None - // } - // 2 => { - // let child_ctx = drive.state.popped_context.unwrap(); - // if child_ctx.has_matched == Some(true) { - // drive.state.repeat_stack.pop(); - // drive.ctx_mut().has_matched = Some(true); - // return None; - // } - // drive.state.string_position = drive.ctx().string_position; - // drive.state.marks_pop(); - - // // match more until tail matches - // let repeat = drive.state.repeat_stack.last_mut().unwrap(); - // if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { - // drive.ctx_mut().has_matched = Some(false); - // return None; - // } - // repeat.count = self.count; - // drive.push_new_context(4); - // self.jump_id = 1; - // Some(()) - // } - // _ => unreachable!(), - // } } } From 76c95abbb5ecd91e05f216adefdd437b0f87b79b Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 7 Jan 2021 17:37:18 +0200 Subject: [PATCH 24/95] impl Match.lastgroup --- interp.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index c118984b38..febbfa2437 100644 --- a/interp.rs +++ b/interp.rs @@ -150,7 +150,7 @@ pub(crate) fn search( ) -> Option { // TODO: optimize by op info and skip prefix let end = std::cmp::min(end, string.char_len()); - for i in start..end { + for i in start..end + 1 { if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { return Some(m); } @@ -1382,6 +1382,13 @@ impl OpcodeExecutor for OpRepeatOne { drive.ctx_mut().has_matched = Some(true); return None; } + if self.count <= self.mincount as isize { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + + // TODO: unnesscary double check drive.back_skip_char(1); self.count -= 1; drive.state.marks_pop_keep(); From 13a8b6cc4e38927273774ab96565f2184840f900 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 17 Jan 2021 19:55:20 +0200 Subject: [PATCH 25/95] add bytes support and refactor --- interp.rs | 457 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 266 insertions(+), 191 deletions(-) diff --git a/interp.rs b/interp.rs index febbfa2437..7c9246142a 100644 --- a/interp.rs +++ b/interp.rs @@ -1,18 +1,18 @@ // good luck to those that follow; here be dragons -use super::_sre::{Match, Pattern, MAXREPEAT}; +use super::_sre::MAXREPEAT; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; -use crate::builtins::PyStrRef; -use crate::pyobject::PyRef; -use rustpython_common::borrow::BorrowValue; +use crate::builtins::PyBytes; +use crate::bytesinner::is_py_ascii_whitespace; +use crate::pyobject::{IntoPyObject, PyObjectRef}; +use crate::VirtualMachine; use std::collections::HashMap; use std::convert::TryFrom; +use std::unreachable; #[derive(Debug)] pub(crate) struct State<'a> { - string: &'a str, - // chars count - string_len: usize, + pub string: StrDrive<'a>, pub start: usize, pub end: usize, flags: SreFlag, @@ -24,22 +24,21 @@ pub(crate) struct State<'a> { repeat_stack: Vec, pub string_position: usize, popped_context: Option, + pub has_matched: Option, } impl<'a> State<'a> { pub(crate) fn new( - string: &'a str, + string: StrDrive<'a>, start: usize, end: usize, flags: SreFlag, pattern_codes: &'a [u32], ) -> Self { - let string_len = string.chars().count(); - let end = std::cmp::min(end, string_len); + let end = std::cmp::min(end, string.count()); let start = std::cmp::min(start, end); Self { string, - string_len, start, end, flags, @@ -51,16 +50,19 @@ impl<'a> State<'a> { marks: Vec::new(), string_position: start, popped_context: None, + has_matched: None, } } - fn reset(&mut self) { - self.marks.clear(); + pub fn reset(&mut self) { self.lastindex = -1; self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); + self.marks.clear(); + self.string_position = self.start; self.popped_context = None; + self.has_matched = None; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -96,66 +98,133 @@ impl<'a> State<'a> { fn marks_pop_discard(&mut self) { self.marks_stack.pop(); } + + pub fn pymatch(mut self) -> Self { + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + }; + self.context_stack.push(ctx); + + let mut dispatcher = OpcodeDispatcher::new(); + let mut has_matched = None; + + loop { + if self.context_stack.is_empty() { + break; + } + let ctx_id = self.context_stack.len() - 1; + let mut drive = StackDrive::drive(ctx_id, self); + + has_matched = dispatcher.pymatch(&mut drive); + self = drive.take(); + if has_matched.is_some() { + self.popped_context = self.context_stack.pop(); + } + } + + self.has_matched = has_matched; + self + } + + pub fn search(mut self) -> Self { + // TODO: optimize by op info and skip prefix + loop { + self = self.pymatch(); + + if self.has_matched == Some(true) { + return self; + } + self.start += 1; + if self.start > self.end { + return self; + } + self.reset(); + } + } } -pub(crate) fn pymatch( - string: PyStrRef, - start: usize, - end: usize, - pattern: PyRef, -) -> Option { - let mut state = State::new( - string.borrow_value(), - start, - end, - pattern.flags, - &pattern.code, - ); - let ctx = MatchContext { - string_position: state.start, - string_offset: calc_string_offset(state.string, state.start), - code_position: 0, - has_matched: None, - }; - state.context_stack.push(ctx); - let mut dispatcher = OpcodeDispatcher::new(); +#[derive(Debug, Clone, Copy)] +pub(crate) enum StrDrive<'a> { + Str(&'a str), + Bytes(&'a [u8]), +} +impl<'a> StrDrive<'a> { + fn offset(&self, offset: usize, skip: usize) -> usize { + match *self { + StrDrive::Str(s) => s + .get(offset..) + .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) + .unwrap_or_else(|| s.len()), + StrDrive::Bytes(b) => std::cmp::min(offset + skip, b.len()), + } + } - let mut has_matched = None; - loop { - if state.context_stack.is_empty() { - break; + pub fn count(&self) -> usize { + match *self { + StrDrive::Str(s) => s.chars().count(), + StrDrive::Bytes(b) => b.len(), } - let ctx_id = state.context_stack.len() - 1; - let mut drive = StackDrive::drive(ctx_id, state); + } - has_matched = dispatcher.pymatch(&mut drive); - state = drive.take(); - if has_matched.is_some() { - state.popped_context = state.context_stack.pop(); + fn peek(&self, offset: usize) -> u32 { + match *self { + StrDrive::Str(s) => unsafe { s.get_unchecked(offset..) }.chars().next().unwrap() as u32, + StrDrive::Bytes(b) => b[offset] as u32, } } - if has_matched != Some(true) { - None - } else { - Some(Match::new(&state, pattern.clone(), string.clone())) + fn back_peek(&self, offset: usize) -> u32 { + match *self { + StrDrive::Str(s) => { + let bytes = s.as_bytes(); + let back_offset = utf8_back_peek_offset(bytes, offset); + match offset - back_offset { + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), + 3 => { + u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]) + } + 4 => u32::from_ne_bytes([ + bytes[offset], + bytes[offset + 1], + bytes[offset + 2], + bytes[offset + 3], + ]), + _ => unreachable!(), + } + } + StrDrive::Bytes(b) => b[offset - 1] as u32, + } } -} -pub(crate) fn search( - string: PyStrRef, - start: usize, - end: usize, - pattern: PyRef, -) -> Option { - // TODO: optimize by op info and skip prefix - let end = std::cmp::min(end, string.char_len()); - for i in start..end + 1 { - if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { - return Some(m); + fn back_offset(&self, offset: usize, skip: usize) -> usize { + match *self { + StrDrive::Str(s) => { + let bytes = s.as_bytes(); + let mut back_offset = offset; + for _ in 0..skip { + back_offset = utf8_back_peek_offset(bytes, back_offset); + } + back_offset + } + StrDrive::Bytes(_) => offset - skip, + } + } + + pub fn slice_to_pyobject(&self, start: usize, end: usize, vm: &VirtualMachine) -> PyObjectRef { + match *self { + StrDrive::Str(s) => s + .chars() + .take(end) + .skip(start) + .collect::() + .into_pyobject(vm), + StrDrive::Bytes(b) => PyBytes::from(b[start..end].to_vec()).into_pyobject(vm), } } - None } #[derive(Debug, Copy, Clone)] @@ -173,33 +242,21 @@ trait MatchContextDrive { fn repeat_ctx(&self) -> &RepeatContext { self.state().repeat_stack.last().unwrap() } - fn str(&self) -> &str { - unsafe { - std::str::from_utf8_unchecked( - &self.state().string.as_bytes()[self.ctx().string_offset..], - ) - } - } fn pattern(&self) -> &[u32] { &self.state().pattern_codes[self.ctx().code_position..] } - fn peek_char(&self) -> char { - self.str().chars().next().unwrap() + fn peek_char(&self) -> u32 { + self.state().string.peek(self.ctx().string_offset) } fn peek_code(&self, peek: usize) -> u32 { self.state().pattern_codes[self.ctx().code_position + peek] } fn skip_char(&mut self, skip_count: usize) { - match self.str().char_indices().nth(skip_count).map(|x| x.0) { - Some(skipped) => { - self.ctx_mut().string_position += skip_count; - self.ctx_mut().string_offset += skipped; - } - None => { - self.ctx_mut().string_position = self.state().end; - self.ctx_mut().string_offset = self.state().string.len(); // bytes len - } - } + self.ctx_mut().string_offset = self + .state() + .string + .offset(self.ctx().string_offset, skip_count); + self.ctx_mut().string_position += skip_count; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; @@ -222,7 +279,7 @@ trait MatchContextDrive { fn at_linebreak(&self) -> bool { !self.at_end() && is_linebreak(self.peek_char()) } - fn at_boundary bool>(&self, mut word_checker: F) -> bool { + fn at_boundary bool>(&self, mut word_checker: F) -> bool { if self.at_beginning() && self.at_end() { return false; } @@ -230,47 +287,15 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this != that } - fn back_peek_offset(&self) -> usize { - let bytes = self.state().string.as_bytes(); - let mut offset = self.ctx().string_offset - 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - panic!("not utf-8 code point"); - } - } - } - } - offset - } - fn back_peek_char(&self) -> char { - let bytes = self.state().string.as_bytes(); - let offset = self.back_peek_offset(); - let current_offset = self.ctx().string_offset; - let code = match current_offset - offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), - 3 => u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]), - 4 => u32::from_ne_bytes([ - bytes[offset], - bytes[offset + 1], - bytes[offset + 2], - bytes[offset + 3], - ]), - _ => unreachable!(), - }; - // TODO: char::from_u32_unchecked is stable from 1.5.0 - unsafe { std::mem::transmute(code) } + fn back_peek_char(&self) -> u32 { + self.state().string.back_peek(self.ctx().string_offset) } fn back_skip_char(&mut self, skip_count: usize) { self.ctx_mut().string_position -= skip_count; - for _ in 0..skip_count { - self.ctx_mut().string_offset = self.back_peek_offset(); - } + self.ctx_mut().string_offset = self + .state() + .string + .back_offset(self.ctx().string_offset, skip_count); } } @@ -529,22 +554,22 @@ impl OpcodeDispatcher { drive.skip_code(drive.peek_code(1) as usize + 1); }), SreOpcode::LITERAL => once(|drive| { - general_op_literal(drive, |code, c| code == c as u32); + general_op_literal(drive, |code, c| code == c); }), SreOpcode::NOT_LITERAL => once(|drive| { - general_op_literal(drive, |code, c| code != c as u32); + general_op_literal(drive, |code, c| code != c); }), SreOpcode::LITERAL_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code == lower_ascii(c) as u32); + general_op_literal(drive, |code, c| code == lower_ascii(c)); }), SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code != lower_ascii(c) as u32); + general_op_literal(drive, |code, c| code != lower_ascii(c)); }), SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code == lower_unicode(c) as u32); + general_op_literal(drive, |code, c| code == lower_unicode(c)); }), SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); + general_op_literal(drive, |code, c| code != lower_unicode(c)); }), SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, char_loc_ignore); @@ -610,19 +635,11 @@ impl OpcodeDispatcher { } } -fn calc_string_offset(string: &str, position: usize) -> usize { - string - .char_indices() - .nth(position) - .map(|(i, _)| i) - .unwrap_or(0) -} - -fn char_loc_ignore(code: u32, c: char) -> bool { - code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +fn char_loc_ignore(code: u32, c: u32) -> bool { + code == c || code == lower_locate(c) || code == upper_locate(c) } -fn charset_loc_ignore(set: &[u32], c: char) -> bool { +fn charset_loc_ignore(set: &[u32], c: u32) -> bool { let lo = lower_locate(c); if charset(set, c) { return true; @@ -631,7 +648,7 @@ fn charset_loc_ignore(set: &[u32], c: char) -> bool { up != lo && charset(set, up) } -fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) { +fn general_op_groupref u32>(drive: &mut StackDrive, mut f: F) { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), @@ -645,7 +662,7 @@ fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) MatchContext { string_position: group_start, // TODO: cache the offset - string_offset: calc_string_offset(drive.state.string, group_start), + string_offset: drive.state.string.offset(0, group_start), ..*drive.ctx() }, &drive, @@ -665,7 +682,7 @@ fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) drive.ctx_mut().string_offset = offset; } -fn general_op_literal bool>(drive: &mut StackDrive, f: F) { +fn general_op_literal bool>(drive: &mut StackDrive, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { @@ -674,7 +691,7 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F } } -fn general_op_in bool>(drive: &mut StackDrive, f: F) { +fn general_op_in bool>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -700,7 +717,7 @@ fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { } } -fn category(catcode: SreCatCode, c: char) -> bool { +fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { SreCatCode::DIGIT => is_digit(c), SreCatCode::NOT_DIGIT => !is_digit(c), @@ -723,9 +740,8 @@ fn category(catcode: SreCatCode, c: char) -> bool { } } -fn charset(set: &[u32], c: char) -> bool { +fn charset(set: &[u32], ch: u32) -> bool { /* check if character is a member of the given set */ - let ch = c as u32; let mut ok = true; let mut i = 0; while i < set.len() { @@ -747,7 +763,7 @@ fn charset(set: &[u32], c: char) -> bool { break; } }; - if category(catcode, c) { + if category(catcode, ch) { return ok; } i += 2; @@ -801,7 +817,7 @@ fn charset(set: &[u32], c: char) -> bool { if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } - let ch = upper_unicode(c) as u32; + let ch = upper_unicode(ch); if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } @@ -898,86 +914,128 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.ctx().string_position - drive.state().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { let ch = drive.peek_code(1); while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } -fn eq_loc_ignore(code: u32, c: char) -> bool { - code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +fn eq_loc_ignore(code: u32, ch: u32) -> bool { + code == ch || code == lower_locate(ch) || code == upper_locate(ch) } -fn is_word(c: char) -> bool { - c.is_ascii_alphanumeric() || c == '_' +fn is_word(ch: u32) -> bool { + ch == '_' as u32 + || u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) } -fn is_space(c: char) -> bool { - c.is_ascii_whitespace() +fn is_space(ch: u32) -> bool { + u8::try_from(ch) + .map(is_py_ascii_whitespace) + .unwrap_or(false) } -fn is_digit(c: char) -> bool { - c.is_ascii_digit() +fn is_digit(ch: u32) -> bool { + u8::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) } -fn is_loc_alnum(c: char) -> bool { +fn is_loc_alnum(ch: u32) -> bool { // TODO: check with cpython - c.is_alphanumeric() + u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) } -fn is_loc_word(c: char) -> bool { - is_loc_alnum(c) || c == '_' +fn is_loc_word(ch: u32) -> bool { + ch == '_' as u32 || is_loc_alnum(ch) } -fn is_linebreak(c: char) -> bool { - c == '\n' +fn is_linebreak(ch: u32) -> bool { + ch == '\n' as u32 } -pub(crate) fn lower_ascii(c: char) -> char { - c.to_ascii_lowercase() +pub(crate) fn lower_ascii(ch: u32) -> u32 { + u8::try_from(ch) + .map(|x| x.to_ascii_lowercase() as u32) + .unwrap_or(ch) } -fn lower_locate(c: char) -> char { +fn lower_locate(ch: u32) -> u32 { // TODO: check with cpython // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase - c.to_lowercase().next().unwrap() + lower_ascii(ch) } -fn upper_locate(c: char) -> char { +fn upper_locate(ch: u32) -> u32 { // TODO: check with cpython // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase - c.to_uppercase().next().unwrap() + u8::try_from(ch) + .map(|x| x.to_ascii_uppercase() as u32) + .unwrap_or(ch) } -fn is_uni_digit(c: char) -> bool { +fn is_uni_digit(ch: u32) -> bool { // TODO: check with cpython - c.is_digit(10) + char::try_from(ch).map(|x| x.is_digit(10)).unwrap_or(false) } -fn is_uni_space(c: char) -> bool { +fn is_uni_space(ch: u32) -> bool { // TODO: check with cpython - c.is_whitespace() -} -fn is_uni_linebreak(c: char) -> bool { + is_space(ch) + || matches!( + ch, + 0x0009 + | 0x000A + | 0x000B + | 0x000C + | 0x000D + | 0x001C + | 0x001D + | 0x001E + | 0x001F + | 0x0020 + | 0x0085 + | 0x00A0 + | 0x1680 + | 0x2000 + | 0x2001 + | 0x2002 + | 0x2003 + | 0x2004 + | 0x2005 + | 0x2006 + | 0x2007 + | 0x2008 + | 0x2009 + | 0x200A + | 0x2028 + | 0x2029 + | 0x202F + | 0x205F + | 0x3000 + ) +} +fn is_uni_linebreak(ch: u32) -> bool { matches!( - c, - '\u{000A}' - | '\u{000B}' - | '\u{000C}' - | '\u{000D}' - | '\u{001C}' - | '\u{001D}' - | '\u{001E}' - | '\u{0085}' - | '\u{2028}' - | '\u{2029}' + ch, + 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 ) } -fn is_uni_alnum(c: char) -> bool { +fn is_uni_alnum(ch: u32) -> bool { // TODO: check with cpython - c.is_alphanumeric() + char::try_from(ch) + .map(|x| x.is_alphanumeric()) + .unwrap_or(false) } -fn is_uni_word(c: char) -> bool { - is_uni_alnum(c) || c == '_' +fn is_uni_word(ch: u32) -> bool { + ch == '_' as u32 || is_uni_alnum(ch) } -pub(crate) fn lower_unicode(c: char) -> char { +pub(crate) fn lower_unicode(ch: u32) -> u32 { // TODO: check with cpython - c.to_lowercase().next().unwrap() + char::try_from(ch) + .map(|x| x.to_lowercase().next().unwrap() as u32) + .unwrap_or(ch) } -pub(crate) fn upper_unicode(c: char) -> char { +pub(crate) fn upper_unicode(ch: u32) -> u32 { // TODO: check with cpython - c.to_uppercase().next().unwrap() + char::try_from(ch) + .map(|x| x.to_uppercase().next().unwrap() as u32) + .unwrap_or(ch) } fn is_utf8_first_byte(b: u8) -> bool { @@ -988,6 +1046,23 @@ fn is_utf8_first_byte(b: u8) -> bool { (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) } +fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { + let mut offset = offset - 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + panic!("not utf-8 code point"); + } + } + } + } + offset +} + #[derive(Debug, Copy, Clone)] struct RepeatContext { count: isize, From 97000fc4e046c5f9d7da42357d8b3290e33972db Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 19 Jan 2021 17:14:00 +0200 Subject: [PATCH 26/95] fix multiple bugs; skip crash tests --- interp.rs | 220 ++++++++++++++++++++++-------------------------------- 1 file changed, 88 insertions(+), 132 deletions(-) diff --git a/interp.rs b/interp.rs index 7c9246142a..dfb2e04e47 100644 --- a/interp.rs +++ b/interp.rs @@ -131,18 +131,17 @@ impl<'a> State<'a> { pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix - loop { + while self.start <= self.end { self = self.pymatch(); if self.has_matched == Some(true) { return self; } self.start += 1; - if self.start > self.end { - return self; - } self.reset(); } + + self } } @@ -182,16 +181,19 @@ impl<'a> StrDrive<'a> { let bytes = s.as_bytes(); let back_offset = utf8_back_peek_offset(bytes, offset); match offset - back_offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), - 3 => { - u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]) - } + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_ne_bytes([ + 0, + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], + ]), 4 => u32::from_ne_bytes([ - bytes[offset], - bytes[offset + 1], - bytes[offset + 2], - bytes[offset + 3], + bytes[offset - 4], + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], ]), _ => unreachable!(), } @@ -222,7 +224,10 @@ impl<'a> StrDrive<'a> { .skip(start) .collect::() .into_pyobject(vm), - StrDrive::Bytes(b) => PyBytes::from(b[start..end].to_vec()).into_pyobject(vm), + StrDrive::Bytes(b) => { + PyBytes::from(b.iter().take(end).skip(start).cloned().collect::>()) + .into_pyobject(vm) + } } } } @@ -256,13 +261,11 @@ trait MatchContextDrive { .state() .string .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_position = + std::cmp::min(self.ctx().string_position + skip_count, self.state().end); } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; - if self.ctx().code_position > self.state().pattern_codes.len() { - self.ctx_mut().code_position = self.state().pattern_codes.len(); - } } fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position @@ -314,9 +317,7 @@ impl<'a> StackDrive<'a> { self.state } fn push_new_context(&mut self, pattern_offset: usize) { - let mut child_ctx = MatchContext { ..*self.ctx() }; - child_ctx.code_position += pattern_offset; - self.state.context_stack.push(child_ctx); + self.push_new_context_at(self.ctx().code_position + pattern_offset); } fn push_new_context_at(&mut self, code_position: usize) { let mut child_ctx = MatchContext { ..*self.ctx() }; @@ -352,11 +353,9 @@ impl MatchContextDrive for WrapDrive<'_> { fn ctx_mut(&mut self) -> &mut MatchContext { &mut self.ctx } - fn ctx(&self) -> &MatchContext { &self.ctx } - fn state(&self) -> &State { self.stack_drive.state() } @@ -833,6 +832,7 @@ fn charset(set: &[u32], ch: u32) -> bool { false } +/* General case */ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { let mut count = 0; let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -854,6 +854,8 @@ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { count } +/* TODO: check literal cases should improve the perfermance + fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -924,6 +926,7 @@ fn general_count_literal bool>(drive: &mut WrapDrive, end: fn eq_loc_ignore(code: u32, ch: u32) -> bool { code == ch || code == lower_locate(ch) || code == upper_locate(ch) } +*/ fn is_word(ch: u32) -> bool { ch == '_' as u32 @@ -1073,6 +1076,7 @@ struct RepeatContext { maxcount: usize, } +#[derive(Default)] struct OpMinRepeatOne { jump_id: usize, mincount: usize, @@ -1082,102 +1086,79 @@ struct OpMinRepeatOne { impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - 2 => self._2(drive), - _ => unreachable!(), - } - } -} -impl Default for OpMinRepeatOne { - fn default() -> Self { - OpMinRepeatOne { - jump_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - } - } -} -impl OpMinRepeatOne { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; + 0 => { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } - drive.state.string_position = drive.ctx().string_position; + drive.state.string_position = drive.ctx().string_position; - self.count = if self.mincount == 0 { - 0 - } else { - let count = count(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; + self.count = if self.mincount == 0 { + 0 + } else { + let count = count(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } - drive.state.marks_push(); - self.jump_id = 1; - self._1(drive) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if self.maxcount == MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); - } + drive.state.marks_push(); + self.jump_id = 1; + self.next(drive) + } + 1 => { + if self.maxcount == MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None - } - fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count(drive, 1) == 0 { - drive.ctx_mut().has_matched = Some(false); - return None; + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + 2 => { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count(drive, 1) == 0 { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(1); + self.count += 1; + drive.state.marks_pop_keep(); + self.jump_id = 1; + self.next(drive) + } + _ => unreachable!(), } - drive.skip_char(1); - self.count += 1; - drive.state.marks_pop_keep(); - self.jump_id = 1; - self._1(drive) } } +#[derive(Default)] struct OpMaxUntil { jump_id: usize, count: isize, save_last_position: usize, } -impl Default for OpMaxUntil { - fn default() -> Self { - OpMaxUntil { - jump_id: 0, - count: 0, - save_last_position: 0, - } - } -} impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1259,20 +1240,12 @@ impl OpcodeExecutor for OpMaxUntil { } } +#[derive(Default)] struct OpMinUntil { jump_id: usize, count: isize, save_repeat: Option, } -impl Default for OpMinUntil { - fn default() -> Self { - Self { - jump_id: 0, - count: 0, - save_repeat: None, - } - } -} impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1338,18 +1311,11 @@ impl OpcodeExecutor for OpMinUntil { } } +#[derive(Default)] struct OpBranch { jump_id: usize, current_branch_length: usize, } -impl Default for OpBranch { - fn default() -> Self { - Self { - jump_id: 0, - current_branch_length: 0, - } - } -} impl OpcodeExecutor for OpBranch { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1388,22 +1354,13 @@ impl OpcodeExecutor for OpBranch { } } +#[derive(Default)] struct OpRepeatOne { jump_id: usize, mincount: usize, maxcount: usize, count: isize, } -impl Default for OpRepeatOne { - fn default() -> Self { - Self { - jump_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - } - } -} impl OpcodeExecutor for OpRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1413,7 +1370,6 @@ impl OpcodeExecutor for OpRepeatOne { if drive.remaining_chars() < self.mincount { drive.ctx_mut().has_matched = Some(false); - return None; } drive.state.string_position = drive.ctx().string_position; self.count = count(drive, self.maxcount) as isize; From 84113cba2cd66b83106f4564b03ae8cb999197c3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 20 Jan 2021 16:33:30 +0200 Subject: [PATCH 27/95] fix zero width repeat --- interp.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index dfb2e04e47..8b5a023ffd 100644 --- a/interp.rs +++ b/interp.rs @@ -1245,6 +1245,7 @@ struct OpMinUntil { jump_id: usize, count: isize, save_repeat: Option, + save_last_position: usize, } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { @@ -1280,6 +1281,7 @@ impl OpcodeExecutor for OpMinUntil { drive.ctx_mut().has_matched = child_ctx.has_matched; if drive.ctx().has_matched != Some(true) { drive.repeat_ctx_mut().count = self.count - 1; + drive.repeat_ctx_mut().last_position = self.save_last_position; drive.state.string_position = drive.ctx().string_position; } None @@ -1295,13 +1297,26 @@ impl OpcodeExecutor for OpMinUntil { drive.state.marks_pop(); // match more unital tail matches - let maxcount = drive.repeat_ctx().maxcount; - let code_position = drive.repeat_ctx().code_position; - if self.count as usize >= maxcount && maxcount != MAXREPEAT { + let RepeatContext { + count: _, + code_position, + last_position, + mincount: _, + maxcount, + } = *drive.repeat_ctx(); + + if self.count as usize >= maxcount && maxcount != MAXREPEAT + || drive.state.string_position == last_position + { drive.ctx_mut().has_matched = Some(false); return None; } drive.repeat_ctx_mut().count = self.count; + + /* zero-width match protection */ + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context_at(code_position + 4); self.jump_id = 1; Some(()) From f2311b56fcbc87431eb0e291b68f44bb3658c2c1 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 12:48:32 +0200 Subject: [PATCH 28/95] fix op branch --- interp.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/interp.rs b/interp.rs index 8b5a023ffd..5fa7ca8c0e 100644 --- a/interp.rs +++ b/interp.rs @@ -1329,28 +1329,30 @@ impl OpcodeExecutor for OpMinUntil { #[derive(Default)] struct OpBranch { jump_id: usize, - current_branch_length: usize, + branch_offset: usize, } impl OpcodeExecutor for OpBranch { + // alternation + // <0=skip> code ... fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { drive.state.marks_push(); // jump out the head - self.current_branch_length = 1; + self.branch_offset = 1; self.jump_id = 1; self.next(drive) } 1 => { - drive.skip_code(self.current_branch_length); - self.current_branch_length = drive.peek_code(0) as usize; - if self.current_branch_length == 0 { + let next_branch_length = drive.peek_code(self.branch_offset) as usize; + if next_branch_length == 0 { drive.state.marks_pop_discard(); drive.ctx_mut().has_matched = Some(false); return None; } drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(1); + drive.push_new_context(self.branch_offset + 1); + self.branch_offset += next_branch_length; self.jump_id = 2; Some(()) } From 6a792324b027bea56c39cb04fe072f719fd1efed Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 16:14:23 +0200 Subject: [PATCH 29/95] fix at_beginning --- interp.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index 5fa7ca8c0e..11010c7152 100644 --- a/interp.rs +++ b/interp.rs @@ -274,7 +274,8 @@ trait MatchContextDrive { self.state().pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == self.state().start + // self.ctx().string_position == self.state().start + self.ctx().string_position == 0 } fn at_end(&self) -> bool { self.ctx().string_position == self.state().end From 7f0dad7901751abcc09ac8308742c91627401c81 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 19:24:10 +0200 Subject: [PATCH 30/95] fix back_peek_char --- interp.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interp.rs b/interp.rs index 11010c7152..beb96f1f09 100644 --- a/interp.rs +++ b/interp.rs @@ -181,15 +181,15 @@ impl<'a> StrDrive<'a> { let bytes = s.as_bytes(); let back_offset = utf8_back_peek_offset(bytes, offset); match offset - back_offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_ne_bytes([ + 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_be_bytes([ 0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1], ]), - 4 => u32::from_ne_bytes([ + 4 => u32::from_be_bytes([ bytes[offset - 4], bytes[offset - 3], bytes[offset - 2], From 4416158eceb9dd6931ca25cfc610f50582dd83dc Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 22 Jan 2021 16:40:33 +0200 Subject: [PATCH 31/95] fix multiple bugs; pass tests --- interp.rs | 97 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/interp.rs b/interp.rs index beb96f1f09..71e3ead143 100644 --- a/interp.rs +++ b/interp.rs @@ -25,6 +25,7 @@ pub(crate) struct State<'a> { pub string_position: usize, popped_context: Option, pub has_matched: Option, + pub match_all: bool, } impl<'a> State<'a> { @@ -51,6 +52,7 @@ impl<'a> State<'a> { string_position: start, popped_context: None, has_matched: None, + match_all: false, } } @@ -105,6 +107,7 @@ impl<'a> State<'a> { string_offset: self.string.offset(0, self.start), code_position: 0, has_matched: None, + toplevel: true, }; self.context_stack.push(ctx); @@ -132,6 +135,7 @@ impl<'a> State<'a> { pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix while self.start <= self.end { + self.match_all = false; self = self.pymatch(); if self.has_matched == Some(true) { @@ -232,12 +236,13 @@ impl<'a> StrDrive<'a> { } } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone, Copy)] struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, + toplevel: bool, } trait MatchContextDrive { @@ -463,8 +468,12 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); }), SreOpcode::SUCCESS => once(|drive| { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); + if drive.ctx().toplevel && drive.state.match_all && !drive.at_end() { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + } }), SreOpcode::ANY => once(|drive| { if drive.at_end() || drive.at_linebreak() { @@ -482,15 +491,19 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), + /* assert subpattern */ + /* */ SreOpcode::ASSERT => twice( |drive| { let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { + let passed = drive.ctx().string_position - drive.state.start; + if passed < back { drive.ctx_mut().has_matched = Some(false); return; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); + drive.state.context_stack.last_mut().unwrap().toplevel = false; }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -504,12 +517,14 @@ impl OpcodeDispatcher { SreOpcode::ASSERT_NOT => twice( |drive| { let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { + let passed = drive.ctx().string_position - drive.state.start; + if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); return; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); + drive.state.context_stack.last_mut().unwrap().toplevel = false; }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -770,17 +785,17 @@ fn charset(set: &[u32], ch: u32) -> bool { } SreOpcode::CHARSET => { /* */ - let set = &set[1..]; + let set = &set[i + 1..]; if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) { return ok; } - i += 8; + i += 1 + 8; } SreOpcode::BIGCHARSET => { /* <256 blockindices> */ let count = set[i + 1] as usize; if ch < 0x10000 { - let set = &set[2..]; + let set = &set[i + 2..]; let block_index = ch >> 8; let (_, blockindices, _) = unsafe { set.align_to::() }; let blocks = &set[64..]; @@ -1085,6 +1100,7 @@ struct OpMinRepeatOne { count: usize, } impl OpcodeExecutor for OpMinRepeatOne { + /* <1=min> <2=max> item tail */ fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { @@ -1110,7 +1126,11 @@ impl OpcodeExecutor for OpMinRepeatOne { count }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 + && !(drive.ctx().toplevel && drive.state.match_all && !drive.at_end()) + { + // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); return None; @@ -1377,9 +1397,18 @@ struct OpRepeatOne { jump_id: usize, mincount: usize, maxcount: usize, - count: isize, + count: usize, + following_literal: Option, } impl OpcodeExecutor for OpRepeatOne { + /* match repeated sequence (maximizing regexp) */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. for other cases, + use the MAX_REPEAT operator */ + + /* <1=min> <2=max> item tail */ fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { @@ -1388,17 +1417,21 @@ impl OpcodeExecutor for OpRepeatOne { if drive.remaining_chars() < self.mincount { drive.ctx_mut().has_matched = Some(false); + return None; } + drive.state.string_position = drive.ctx().string_position; - self.count = count(drive, self.maxcount) as isize; - drive.skip_char(self.count as usize); - if self.count < self.mincount as isize { + + self.count = count(drive, self.maxcount); + drive.skip_char(self.count); + if self.count < self.mincount { drive.ctx_mut().has_matched = Some(false); return None; } let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 { + if next_code == SreOpcode::SUCCESS as u32 && drive.at_end() && !drive.ctx().toplevel + { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); @@ -1406,24 +1439,34 @@ impl OpcodeExecutor for OpRepeatOne { } drive.state.marks_push(); - // TODO: + // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. + if next_code == SreOpcode::LITERAL as u32 { + self.following_literal = Some(drive.peek_code(drive.peek_code(1) as usize + 2)) + } + self.jump_id = 1; self.next(drive) } 1 => { - // General case: backtracking - if self.count >= self.mincount as isize { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); + if let Some(c) = self.following_literal { + while drive.at_end() || drive.peek_char() != c { + if self.count <= self.mincount { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.back_skip_char(1); + self.count -= 1; + } } - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None + // General case: backtracking + drive.state.string_position = drive.ctx().string_position; + drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + Some(()) } 2 => { let child_ctx = drive.state.popped_context.unwrap(); @@ -1431,19 +1474,19 @@ impl OpcodeExecutor for OpRepeatOne { drive.ctx_mut().has_matched = Some(true); return None; } - if self.count <= self.mincount as isize { + if self.count <= self.mincount { drive.state.marks_pop_discard(); drive.ctx_mut().has_matched = Some(false); return None; } - // TODO: unnesscary double check drive.back_skip_char(1); self.count -= 1; + drive.state.marks_pop_keep(); self.jump_id = 1; - Some(()) + self.next(drive) } _ => unreachable!(), } From db5bd646b928048c7183f46ede40a114f19a82e4 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 18:54:38 -0600 Subject: [PATCH 32/95] Initial commit to switch to new repo --- .gitignore | 2 ++ Cargo.toml | 7 +++++++ constants.rs => src/constants.rs | 0 interp.rs => src/engine.rs | 0 src/lib.rs | 2 ++ 5 files changed, 11 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml rename constants.rs => src/constants.rs (100%) rename interp.rs => src/engine.rs (100%) create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..96ef6c0b94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..f9f504966b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "sre-engine" +version = "0.1.0" +authors = ["Kangzhi Shi ", "RustPython Team"] +edition = "2018" + +[dependencies] diff --git a/constants.rs b/src/constants.rs similarity index 100% rename from constants.rs rename to src/constants.rs diff --git a/interp.rs b/src/engine.rs similarity index 100% rename from interp.rs rename to src/engine.rs diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000000..f305aa094a --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +pub mod constants; +pub mod engine; From 9c95994dab20fea56004940ba50323ee7e327f81 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 19:18:42 -0600 Subject: [PATCH 33/95] Modify to work outside of rustpython-vm --- Cargo.toml | 2 ++ src/engine.rs | 38 +++++++++++--------------------------- src/lib.rs | 12 ++++++++++++ 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f9f504966b..3a82ba73f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,3 +5,5 @@ authors = ["Kangzhi Shi ", "RustPython Team"] edition = "2018" [dependencies] +num_enum = "0.5" +bitflags = "1.2" diff --git a/src/engine.rs b/src/engine.rs index 71e3ead143..9aff2dcc91 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,17 +1,16 @@ // good luck to those that follow; here be dragons -use super::_sre::MAXREPEAT; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; -use crate::builtins::PyBytes; -use crate::bytesinner::is_py_ascii_whitespace; -use crate::pyobject::{IntoPyObject, PyObjectRef}; -use crate::VirtualMachine; +use super::MAXREPEAT; use std::collections::HashMap; use std::convert::TryFrom; -use std::unreachable; + +const fn is_py_ascii_whitespace(b: u8) -> bool { + matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') +} #[derive(Debug)] -pub(crate) struct State<'a> { +pub struct State<'a> { pub string: StrDrive<'a>, pub start: usize, pub end: usize, @@ -29,7 +28,7 @@ pub(crate) struct State<'a> { } impl<'a> State<'a> { - pub(crate) fn new( + pub fn new( string: StrDrive<'a>, start: usize, end: usize, @@ -150,7 +149,7 @@ impl<'a> State<'a> { } #[derive(Debug, Clone, Copy)] -pub(crate) enum StrDrive<'a> { +pub enum StrDrive<'a> { Str(&'a str), Bytes(&'a [u8]), } @@ -219,21 +218,6 @@ impl<'a> StrDrive<'a> { StrDrive::Bytes(_) => offset - skip, } } - - pub fn slice_to_pyobject(&self, start: usize, end: usize, vm: &VirtualMachine) -> PyObjectRef { - match *self { - StrDrive::Str(s) => s - .chars() - .take(end) - .skip(start) - .collect::() - .into_pyobject(vm), - StrDrive::Bytes(b) => { - PyBytes::from(b.iter().take(end).skip(start).cloned().collect::>()) - .into_pyobject(vm) - } - } - } } #[derive(Debug, Clone, Copy)] @@ -972,7 +956,7 @@ fn is_loc_word(ch: u32) -> bool { fn is_linebreak(ch: u32) -> bool { ch == '\n' as u32 } -pub(crate) fn lower_ascii(ch: u32) -> u32 { +pub fn lower_ascii(ch: u32) -> u32 { u8::try_from(ch) .map(|x| x.to_ascii_lowercase() as u32) .unwrap_or(ch) @@ -1044,13 +1028,13 @@ fn is_uni_alnum(ch: u32) -> bool { fn is_uni_word(ch: u32) -> bool { ch == '_' as u32 || is_uni_alnum(ch) } -pub(crate) fn lower_unicode(ch: u32) -> u32 { +pub fn lower_unicode(ch: u32) -> u32 { // TODO: check with cpython char::try_from(ch) .map(|x| x.to_lowercase().next().unwrap() as u32) .unwrap_or(ch) } -pub(crate) fn upper_unicode(ch: u32) -> u32 { +pub fn upper_unicode(ch: u32) -> u32 { // TODO: check with cpython char::try_from(ch) .map(|x| x.to_uppercase().next().unwrap() as u32) diff --git a/src/lib.rs b/src/lib.rs index f305aa094a..4a3ed1b754 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,14 @@ pub mod constants; pub mod engine; + +pub const CODESIZE: usize = 4; + +#[cfg(target_pointer_width = "32")] +pub const MAXREPEAT: usize = usize::MAX; +#[cfg(target_pointer_width = "64")] +pub const MAXREPEAT: usize = u32::MAX as usize; + +#[cfg(target_pointer_width = "32")] +pub const MAXGROUPS: usize = MAXREPEAT / 4 / 2; +#[cfg(target_pointer_width = "64")] +pub const MAXGROUPS: usize = MAXREPEAT / 2; From 2592067f95f530850db8451fd45982dc3bf161e0 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 19:19:31 -0600 Subject: [PATCH 34/95] Add LICENSE --- Cargo.toml | 1 + LICENSE | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 LICENSE diff --git a/Cargo.toml b/Cargo.toml index 3a82ba73f1..03db7aba4f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ name = "sre-engine" version = "0.1.0" authors = ["Kangzhi Shi ", "RustPython Team"] +license = "MIT" edition = "2018" [dependencies] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..7213274e0f --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 RustPython Team + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 2473c3e49f6ba539549a8bf4f87db9146044fc52 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 1 Feb 2021 20:25:45 +0200 Subject: [PATCH 35/95] add tests; fix OpAssert panic --- src/engine.rs | 15 ++++++++------- src/lib.rs | 2 ++ src/tests.rs | 19 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 src/tests.rs diff --git a/src/engine.rs b/src/engine.rs index 9aff2dcc91..f6a6092f9b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -369,20 +369,18 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } -// F1 F2 are same identical, but workaround for closure struct OpTwice { f1: Option, f2: Option, } impl OpcodeExecutor for OpTwice where - F1: FnOnce(&mut StackDrive), + F1: FnOnce(&mut StackDrive) -> Option<()>, F2: FnOnce(&mut StackDrive), { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { if let Some(f1) = self.f1.take() { - f1(drive); - Some(()) + f1(drive) } else if let Some(f2) = self.f2.take() { f2(drive); None @@ -393,7 +391,7 @@ where } fn twice(f1: F1, f2: F2) -> Box> where - F1: FnOnce(&mut StackDrive), + F1: FnOnce(&mut StackDrive) -> Option<()>, F2: FnOnce(&mut StackDrive), { Box::new(OpTwice { @@ -483,11 +481,12 @@ impl OpcodeDispatcher { let passed = drive.ctx().string_position - drive.state.start; if passed < back { drive.ctx_mut().has_matched = Some(false); - return; + return None; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); drive.state.context_stack.last_mut().unwrap().toplevel = false; + Some(()) }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -504,11 +503,12 @@ impl OpcodeDispatcher { let passed = drive.ctx().string_position - drive.state.start; if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); - return; + return None; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); drive.state.context_stack.last_mut().unwrap().toplevel = false; + Some(()) }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -598,6 +598,7 @@ impl OpcodeDispatcher { drive.state.string_position = drive.ctx().string_position; // execute UNTIL operator drive.push_new_context(drive.peek_code(1) as usize + 1); + Some(()) }, |drive| { drive.state.repeat_stack.pop(); diff --git a/src/lib.rs b/src/lib.rs index 4a3ed1b754..eae3be617d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,7 @@ pub mod constants; pub mod engine; +#[cfg(test)] +mod tests; pub const CODESIZE: usize = 4; diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000000..95922e88a7 --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,19 @@ +use engine::{State, StrDrive}; + +use super::*; + +#[test] +fn test_2427() { + let str_drive = StrDrive::Str("x"); + // r'(? = vec![15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1]; + let mut state = State::new( + str_drive, + 0, + std::usize::MAX, + constants::SreFlag::UNICODE, + &code, + ); + state = state.pymatch(); + assert!(state.has_matched == Some(true)); +} From cc4441b50ffa361c8f52eeecad54a9459c471e9b Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Mon, 1 Feb 2021 14:27:01 -0600 Subject: [PATCH 36/95] Compile regex patterns for tests with a script --- generate_tests.py | 37 +++++++++++++++++++++++++++++++++++++ src/engine.rs | 12 ++++++++++++ src/lib.rs | 2 -- src/tests.rs | 19 ------------------- tests/lookbehind.py | 1 + tests/lookbehind.re | 2 ++ tests/tests.rs | 26 ++++++++++++++++++++++++++ 7 files changed, 78 insertions(+), 21 deletions(-) create mode 100644 generate_tests.py delete mode 100644 src/tests.rs create mode 100644 tests/lookbehind.py create mode 100644 tests/lookbehind.re create mode 100644 tests/tests.rs diff --git a/generate_tests.py b/generate_tests.py new file mode 100644 index 0000000000..49a24792be --- /dev/null +++ b/generate_tests.py @@ -0,0 +1,37 @@ +import os +from pathlib import Path +import re +import sre_constants +import sre_compile +import sre_parse +import json + +m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read()) +sre_engine_magic = int(m.group(1)) +del m + +assert sre_constants.MAGIC == sre_engine_magic + +class CompiledPattern: + @classmethod + def compile(cls, pattern, flags=0): + p = sre_parse.parse(pattern) + code = sre_compile._code(p, flags) + self = cls() + self.pattern = pattern + self.code = code + self.flags = re.RegexFlag(flags | p.state.flags) + return self + +for k, v in re.RegexFlag.__members__.items(): + setattr(CompiledPattern, k, v) + +with os.scandir("tests") as d: + for f in d: + path = Path(f.path) + if path.suffix == ".py": + pattern = eval(path.read_text(), {"re": CompiledPattern}) + path.with_suffix(".re").write_text( + f"// {pattern.pattern!r}, flags={pattern.flags!r}\n" + f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + ) diff --git a/src/engine.rs b/src/engine.rs index f6a6092f9b..a48b799e1b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -153,6 +153,18 @@ pub enum StrDrive<'a> { Str(&'a str), Bytes(&'a [u8]), } + +impl<'a> From<&'a str> for StrDrive<'a> { + fn from(s: &'a str) -> Self { + Self::Str(s) + } +} +impl<'a> From<&'a [u8]> for StrDrive<'a> { + fn from(b: &'a [u8]) -> Self { + Self::Bytes(b) + } +} + impl<'a> StrDrive<'a> { fn offset(&self, offset: usize, skip: usize) -> usize { match *self { diff --git a/src/lib.rs b/src/lib.rs index eae3be617d..4a3ed1b754 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,5 @@ pub mod constants; pub mod engine; -#[cfg(test)] -mod tests; pub const CODESIZE: usize = 4; diff --git a/src/tests.rs b/src/tests.rs deleted file mode 100644 index 95922e88a7..0000000000 --- a/src/tests.rs +++ /dev/null @@ -1,19 +0,0 @@ -use engine::{State, StrDrive}; - -use super::*; - -#[test] -fn test_2427() { - let str_drive = StrDrive::Str("x"); - // r'(? = vec![15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1]; - let mut state = State::new( - str_drive, - 0, - std::usize::MAX, - constants::SreFlag::UNICODE, - &code, - ); - state = state.pymatch(); - assert!(state.has_matched == Some(true)); -} diff --git a/tests/lookbehind.py b/tests/lookbehind.py new file mode 100644 index 0000000000..3da6425959 --- /dev/null +++ b/tests/lookbehind.py @@ -0,0 +1 @@ +re.compile(r'(?( + &self, + string: impl Into>, + range: std::ops::Range, + ) -> engine::State<'a> { + engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + } +} + +#[test] +fn test_2427() { + // r'(? Date: Wed, 3 Feb 2021 13:32:48 +0200 Subject: [PATCH 37/95] fix OpAssert positive lookbehind --- src/engine.rs | 28 ++++++++++++++++++++++++---- tests/positive_lookbehind.py | 1 + tests/positive_lookbehind.re | 2 ++ tests/tests.rs | 9 +++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 tests/positive_lookbehind.py create mode 100644 tests/positive_lookbehind.re diff --git a/src/engine.rs b/src/engine.rs index a48b799e1b..5e0e0f4208 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -490,14 +490,24 @@ impl OpcodeDispatcher { SreOpcode::ASSERT => twice( |drive| { let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position - drive.state.start; + let passed = drive.ctx().string_position; if passed < back { drive.ctx_mut().has_matched = Some(false); return None; } + let back_offset = drive + .state + .string + .back_offset(drive.ctx().string_offset, back); + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); - drive.state.context_stack.last_mut().unwrap().toplevel = false; + let child_ctx = drive.state.context_stack.last_mut().unwrap(); + child_ctx.toplevel = false; + child_ctx.string_position -= back; + child_ctx.string_offset = back_offset; + Some(()) }, |drive| { @@ -512,14 +522,24 @@ impl OpcodeDispatcher { SreOpcode::ASSERT_NOT => twice( |drive| { let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position - drive.state.start; + let passed = drive.ctx().string_position; if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); return None; } + let back_offset = drive + .state + .string + .back_offset(drive.ctx().string_offset, back); + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); - drive.state.context_stack.last_mut().unwrap().toplevel = false; + let child_ctx = drive.state.context_stack.last_mut().unwrap(); + child_ctx.toplevel = false; + child_ctx.string_position -= back; + child_ctx.string_offset = back_offset; + Some(()) }, |drive| { diff --git a/tests/positive_lookbehind.py b/tests/positive_lookbehind.py new file mode 100644 index 0000000000..2a0ab29253 --- /dev/null +++ b/tests/positive_lookbehind.py @@ -0,0 +1 @@ +re.compile(r'(?<=abc)def') \ No newline at end of file diff --git a/tests/positive_lookbehind.re b/tests/positive_lookbehind.re new file mode 100644 index 0000000000..68923b58ee --- /dev/null +++ b/tests/positive_lookbehind.re @@ -0,0 +1,2 @@ +// '(?<=abc)def', flags=re.UNICODE +Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1], flags: SreFlag::from_bits_truncate(32) } \ No newline at end of file diff --git a/tests/tests.rs b/tests/tests.rs index 4db110177d..8a18b5f333 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -24,3 +24,12 @@ fn test_2427() { state = state.pymatch(); assert!(state.has_matched == Some(true)); } + +#[test] +fn test_assert() { + // '(?<=abc)def', flags=re.UNICODE + let pattern = include!("positive_lookbehind.re"); + let mut state = pattern.state("abcdef", 0..usize::MAX); + state = state.search(); + assert!(state.has_matched == Some(true)); +} From 2a43d66e11a4a2245c12b24484b14eb01d7033a1 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Thu, 1 Apr 2021 21:38:14 -0500 Subject: [PATCH 38/95] Fix clippy lint --- src/constants.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index f5ab92c531..f3962b339a 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -16,7 +16,7 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20171005; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreOpcode { FAILURE = 0, SUCCESS = 1, @@ -62,7 +62,7 @@ pub enum SreOpcode { } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreAtCode { BEGINNING = 0, BEGINNING_LINE = 1, @@ -79,7 +79,7 @@ pub enum SreAtCode { } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreCatCode { DIGIT = 0, NOT_DIGIT = 1, From ca1346ee031624b37e6e5d531a1c5dcaa5b284fc Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Thu, 1 Apr 2021 22:05:45 -0500 Subject: [PATCH 39/95] Have generate_tests.py generate Patterns inline in tests.rs --- generate_tests.py | 21 +++++++++++++++------ tests/lookbehind.py | 1 - tests/lookbehind.re | 2 -- tests/positive_lookbehind.py | 1 - tests/positive_lookbehind.re | 2 -- tests/tests.rs | 16 ++++++++++------ 6 files changed, 25 insertions(+), 18 deletions(-) delete mode 100644 tests/lookbehind.py delete mode 100644 tests/lookbehind.re delete mode 100644 tests/positive_lookbehind.py delete mode 100644 tests/positive_lookbehind.re diff --git a/generate_tests.py b/generate_tests.py index 49a24792be..7af1d2f0c2 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -26,12 +26,21 @@ def compile(cls, pattern, flags=0): for k, v in re.RegexFlag.__members__.items(): setattr(CompiledPattern, k, v) + +# matches `// pattern {varname} = re.compile(...)` +pattern_pattern = re.compile(r"^((\s*)\/\/\s*pattern\s+(\w+)\s+=\s+(.+?))$(?:.+?END GENERATED)?", re.M | re.S) +def replace_compiled(m): + line, indent, varname, pattern = m.groups() + pattern = eval(pattern, {"re": CompiledPattern}) + pattern = f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + return f'''{line} +{indent}// START GENERATED by generate_tests.py +{indent}#[rustfmt::skip] let {varname} = {pattern}; +{indent}// END GENERATED''' + with os.scandir("tests") as d: for f in d: path = Path(f.path) - if path.suffix == ".py": - pattern = eval(path.read_text(), {"re": CompiledPattern}) - path.with_suffix(".re").write_text( - f"// {pattern.pattern!r}, flags={pattern.flags!r}\n" - f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" - ) + if path.suffix == ".rs": + replaced = pattern_pattern.sub(replace_compiled, path.read_text()) + path.write_text(replaced) diff --git a/tests/lookbehind.py b/tests/lookbehind.py deleted file mode 100644 index 3da6425959..0000000000 --- a/tests/lookbehind.py +++ /dev/null @@ -1 +0,0 @@ -re.compile(r'(? Date: Mon, 5 Apr 2021 11:10:32 -0500 Subject: [PATCH 40/95] Add more info to Cargo.toml --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 03db7aba4f..8e69ab5235 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,8 +2,11 @@ name = "sre-engine" version = "0.1.0" authors = ["Kangzhi Shi ", "RustPython Team"] +description = "A low-level implementation of Python's SRE regex engine" +repository = "https://github.com/RustPython/sre-engine" license = "MIT" edition = "2018" +keywords = ["regex"] [dependencies] num_enum = "0.5" From 9728dd8699a255686deaedbdd0f23f549e62009f Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 16 Apr 2021 09:35:11 +0200 Subject: [PATCH 41/95] fix test_string_boundaries --- src/engine.rs | 14 +++++++++++--- tests/tests.rs | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5e0e0f4208..0de9f43844 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -292,6 +292,14 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this != that } + fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { + if self.at_beginning() && self.at_end() { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char()); + let this = !self.at_end() && word_checker(self.peek_char()); + this == that + } fn back_peek_char(&self) -> u32 { self.state().string.back_peek(self.ctx().string_offset) } @@ -738,14 +746,14 @@ fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), SreAtCode::BOUNDARY => drive.at_boundary(is_word), - SreAtCode::NON_BOUNDARY => !drive.at_boundary(is_word), + SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), SreAtCode::END_STRING => drive.at_end(), SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => !drive.at_boundary(is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => !drive.at_boundary(is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), } } diff --git a/tests/tests.rs b/tests/tests.rs index f4cd091f0d..690c72861b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -37,3 +37,14 @@ fn test_assert() { state = state.search(); assert!(state.has_matched == Some(true)); } + +#[test] +fn test_string_boundaries() { + // pattern big_b = re.compile(r'\B') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = big_b.state("", 0..usize::MAX); + state = state.search(); + assert!(state.has_matched == None) +} From d2b48fdea2986e8513d63d19ea30859c5a48a66e Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Fri, 16 Apr 2021 10:40:42 -0500 Subject: [PATCH 42/95] Release 0.1.1 sre-engine@0.1.1 Generated by cargo-workspaces --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8e69ab5235..cf830a6053 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.0" +version = "0.1.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 73abbace85aebf063eb8c7ce4815cacd2449f522 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Fri, 16 Apr 2021 10:53:37 -0500 Subject: [PATCH 43/95] Add explicit include for Cargo files --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index cf830a6053..f0cd628f0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ repository = "https://github.com/RustPython/sre-engine" license = "MIT" edition = "2018" keywords = ["regex"] +include = ["LICENSE", "src/**/*.rs"] [dependencies] num_enum = "0.5" From df8453d387c0a4bc6b84131f40703b46000ba7ee Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 20 Apr 2021 10:19:27 +0200 Subject: [PATCH 44/95] fix zerowidth search --- Cargo.toml | 2 +- src/engine.rs | 99 +++++++++++++++++++++++++++++++++++--------------- tests/tests.rs | 18 +++++++-- 3 files changed, 85 insertions(+), 34 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f0cd628f0e..614243eeb1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.1" +version = "0.1.2" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index 0de9f43844..bded52448d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -23,8 +23,9 @@ pub struct State<'a> { repeat_stack: Vec, pub string_position: usize, popped_context: Option, - pub has_matched: Option, + pub has_matched: bool, pub match_all: bool, + pub must_advance: bool, } impl<'a> State<'a> { @@ -50,8 +51,9 @@ impl<'a> State<'a> { marks: Vec::new(), string_position: start, popped_context: None, - has_matched: None, + has_matched: false, match_all: false, + must_advance: false, } } @@ -63,7 +65,7 @@ impl<'a> State<'a> { self.marks.clear(); self.string_position = self.start; self.popped_context = None; - self.has_matched = None; + self.has_matched = false; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -100,17 +102,7 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - pub fn pymatch(mut self) -> Self { - let ctx = MatchContext { - string_position: self.start, - string_offset: self.string.offset(0, self.start), - code_position: 0, - has_matched: None, - toplevel: true, - }; - self.context_stack.push(ctx); - - let mut dispatcher = OpcodeDispatcher::new(); + fn _match(mut self, dispatcher: &mut OpcodeDispatcher) -> Self { let mut has_matched = None; loop { @@ -127,21 +119,58 @@ impl<'a> State<'a> { } } - self.has_matched = has_matched; + self.has_matched = has_matched == Some(true); self } + pub fn pymatch(mut self) -> Self { + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: true, + }; + self.context_stack.push(ctx); + + let mut dispatcher = OpcodeDispatcher::new(); + + self._match(&mut dispatcher) + } + pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix - while self.start <= self.end { - self.match_all = false; - self = self.pymatch(); - if self.has_matched == Some(true) { - return self; - } + if self.start > self.end { + return self; + } + + let mut dispatcher = OpcodeDispatcher::new(); + + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: true, + }; + self.context_stack.push(ctx); + self = self._match(&mut dispatcher); + + self.must_advance = false; + while !self.has_matched && self.start < self.end { self.start += 1; self.reset(); + dispatcher.clear(); + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: false, + }; + self.context_stack.push(ctx); + self = self._match(&mut dispatcher); } self @@ -310,6 +339,18 @@ trait MatchContextDrive { .string .back_offset(self.ctx().string_offset, skip_count); } + fn can_success(&self) -> bool { + if !self.ctx().toplevel { + return true; + } + if self.state().match_all && !self.at_end() { + return false; + } + if self.state().must_advance && self.ctx().string_position == self.state().start { + return false; + } + true + } } struct StackDrive<'a> { @@ -429,6 +470,9 @@ impl OpcodeDispatcher { executing_contexts: HashMap::new(), } } + fn clear(&mut self) { + self.executing_contexts.clear(); + } // Returns True if the current context matches, False if it doesn't and // None if matching is not finished, ie must be resumed after child // contexts have been matched. @@ -470,11 +514,9 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); }), SreOpcode::SUCCESS => once(|drive| { - if drive.ctx().toplevel && drive.state.match_all && !drive.at_end() { - drive.ctx_mut().has_matched = Some(false); - } else { + drive.ctx_mut().has_matched = Some(drive.can_success()); + if drive.ctx().has_matched == Some(true) { drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); } }), SreOpcode::ANY => once(|drive| { @@ -1152,9 +1194,7 @@ impl OpcodeExecutor for OpMinRepeatOne { }; let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 - && !(drive.ctx().toplevel && drive.state.match_all && !drive.at_end()) - { + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); @@ -1455,8 +1495,7 @@ impl OpcodeExecutor for OpRepeatOne { } let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.at_end() && !drive.ctx().toplevel - { + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); diff --git a/tests/tests.rs b/tests/tests.rs index 690c72861b..d76ff3cfb5 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -24,7 +24,7 @@ fn test_2427() { // END GENERATED let mut state = lookbehind.state("x", 0..usize::MAX); state = state.pymatch(); - assert!(state.has_matched == Some(true)); + assert!(state.has_matched); } #[test] @@ -35,7 +35,7 @@ fn test_assert() { // END GENERATED let mut state = positive_lookbehind.state("abcdef", 0..usize::MAX); state = state.search(); - assert!(state.has_matched == Some(true)); + assert!(state.has_matched); } #[test] @@ -46,5 +46,17 @@ fn test_string_boundaries() { // END GENERATED let mut state = big_b.state("", 0..usize::MAX); state = state.search(); - assert!(state.has_matched == None) + assert!(!state.has_matched); +} + +#[test] +fn test_zerowidth() { + // pattern p = re.compile(r'\b|:+') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 7, 5, 6, 10, 16, 12, 10, 25, 6, 1, 4294967295, 17, 58, 1, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("a:", 0..usize::MAX); + state.must_advance = true; + state = state.search(); + assert!(state.string_position == 1); } From a3c3573d67f94d6119a8bb7126f385c38ba438e8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 20 Apr 2021 17:36:32 +0200 Subject: [PATCH 45/95] optimize count --- src/engine.rs | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index bded52448d..5409baf66d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -916,7 +916,7 @@ fn charset(set: &[u32], ch: u32) -> bool { } /* General case */ -fn count(drive: &mut StackDrive, maxcount: usize) -> usize { +fn general_count(drive: &mut StackDrive, maxcount: usize) -> usize { let mut count = 0; let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -937,18 +937,11 @@ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { count } -/* TODO: check literal cases should improve the perfermance - -fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { +fn count(stack_drive: &mut StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let end = drive.ctx().string_position + maxcount; - let opcode = match SreOpcode::try_from(drive.peek_code(1)) { - Ok(code) => code, - Err(_) => { - panic!("FIXME:COUNT1"); - } - }; + let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); match opcode { SreOpcode::ANY => { @@ -960,7 +953,6 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.skip_char(maxcount); } SreOpcode::IN => { - // TODO: pattern[2 or 1..]? while !drive.ctx().string_position < end && charset(&drive.pattern()[2..], drive.peek_char()) { @@ -992,7 +984,7 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - todo!("repeated single character pattern?"); + return general_count(stack_drive, maxcount); } } @@ -1006,11 +998,6 @@ fn general_count_literal bool>(drive: &mut WrapDrive, end: } } -fn eq_loc_ignore(code: u32, ch: u32) -> bool { - code == ch || code == lower_locate(ch) || code == upper_locate(ch) -} -*/ - fn is_word(ch: u32) -> bool { ch == '_' as u32 || u8::try_from(ch) @@ -1028,7 +1015,7 @@ fn is_digit(ch: u32) -> bool { .unwrap_or(false) } fn is_loc_alnum(ch: u32) -> bool { - // TODO: check with cpython + // FIXME: Ignore the locales u8::try_from(ch) .map(|x| x.is_ascii_alphanumeric()) .unwrap_or(false) @@ -1045,13 +1032,11 @@ pub fn lower_ascii(ch: u32) -> u32 { .unwrap_or(ch) } fn lower_locate(ch: u32) -> u32 { - // TODO: check with cpython - // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase + // FIXME: Ignore the locales lower_ascii(ch) } fn upper_locate(ch: u32) -> u32 { - // TODO: check with cpython - // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase + // FIXME: Ignore the locales u8::try_from(ch) .map(|x| x.to_ascii_uppercase() as u32) .unwrap_or(ch) From 5bd6b672d089fa4dc8db1d5d3d8564f6a98dbacd Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 21 Apr 2021 11:09:10 +0200 Subject: [PATCH 46/95] optimize opcode that execute only once --- src/engine.rs | 183 ++++++++++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 81 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5409baf66d..2888b6930c 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -416,20 +416,6 @@ trait OpcodeExecutor { fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } -struct OpOnce { - f: Option, -} -impl OpcodeExecutor for OpOnce { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - let f = self.f.take()?; - f(drive); - None - } -} -fn once(f: F) -> Box> { - Box::new(OpOnce { f: Some(f) }) -} - struct OpTwice { f1: Option, f2: Option, @@ -496,48 +482,58 @@ impl OpcodeDispatcher { // Dispatches a context on a given opcode. Returns True if the context // is done matching, False if it must be resumed when next encountered. fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { - let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { - Some((_, executor)) => executor, - None => self.dispatch_table(opcode), - }; - if let Some(()) = executor.next(drive) { - self.executing_contexts.insert(drive.id(), executor); - false - } else { - true + let executor = self + .executing_contexts + .remove_entry(&drive.id()) + .map(|(_, x)| x) + .or_else(|| self.dispatch_table(opcode, drive)); + if let Some(mut executor) = executor { + if let Some(()) = executor.next(drive) { + self.executing_contexts.insert(drive.id(), executor); + return false; + } } + true } - fn dispatch_table(&mut self, opcode: SreOpcode) -> Box { + fn dispatch_table( + &mut self, + opcode: SreOpcode, + drive: &mut StackDrive, + ) -> Option> { match opcode { - SreOpcode::FAILURE => once(|drive| { + SreOpcode::FAILURE => { drive.ctx_mut().has_matched = Some(false); - }), - SreOpcode::SUCCESS => once(|drive| { + None + } + SreOpcode::SUCCESS => { drive.ctx_mut().has_matched = Some(drive.can_success()); if drive.ctx().has_matched == Some(true) { drive.state.string_position = drive.ctx().string_position; } - }), - SreOpcode::ANY => once(|drive| { + None + } + SreOpcode::ANY => { if drive.at_end() || drive.at_linebreak() { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(1); drive.skip_char(1); } - }), - SreOpcode::ANY_ALL => once(|drive| { + None + } + SreOpcode::ANY_ALL => { if drive.at_end() { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(1); drive.skip_char(1); } - }), + None + } /* assert subpattern */ /* */ - SreOpcode::ASSERT => twice( + SreOpcode::ASSERT => Some(twice( |drive| { let back = drive.peek_code(2) as usize; let passed = drive.ctx().string_position; @@ -568,8 +564,8 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); } }, - ), - SreOpcode::ASSERT_NOT => twice( + )), + SreOpcode::ASSERT_NOT => Some(twice( |drive| { let back = drive.peek_code(2) as usize; let passed = drive.ctx().string_position; @@ -600,17 +596,18 @@ impl OpcodeDispatcher { drive.skip_code(drive.peek_code(1) as usize + 1); } }, - ), - SreOpcode::AT => once(|drive| { + )), + SreOpcode::AT => { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(2); } - }), - SreOpcode::BRANCH => Box::new(OpBranch::default()), - SreOpcode::CATEGORY => once(|drive| { + None + } + SreOpcode::BRANCH => Some(Box::new(OpBranch::default())), + SreOpcode::CATEGORY => { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -618,53 +615,68 @@ impl OpcodeDispatcher { drive.skip_code(2); drive.skip_char(1); } - }), - SreOpcode::IN => once(|drive| { + None + } + SreOpcode::IN => { general_op_in(drive, |set, c| charset(set, c)); - }), - SreOpcode::IN_IGNORE => once(|drive| { + None + } + SreOpcode::IN_IGNORE => { general_op_in(drive, |set, c| charset(set, lower_ascii(c))); - }), - SreOpcode::IN_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::IN_UNI_IGNORE => { general_op_in(drive, |set, c| charset(set, lower_unicode(c))); - }), - SreOpcode::IN_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::IN_LOC_IGNORE => { general_op_in(drive, |set, c| charset_loc_ignore(set, c)); - }), - SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { + None + } + SreOpcode::INFO | SreOpcode::JUMP => { drive.skip_code(drive.peek_code(1) as usize + 1); - }), - SreOpcode::LITERAL => once(|drive| { + None + } + SreOpcode::LITERAL => { general_op_literal(drive, |code, c| code == c); - }), - SreOpcode::NOT_LITERAL => once(|drive| { + None + } + SreOpcode::NOT_LITERAL => { general_op_literal(drive, |code, c| code != c); - }), - SreOpcode::LITERAL_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_IGNORE => { general_op_literal(drive, |code, c| code == lower_ascii(c)); - }), - SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_IGNORE => { general_op_literal(drive, |code, c| code != lower_ascii(c)); - }), - SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_UNI_IGNORE => { general_op_literal(drive, |code, c| code == lower_unicode(c)); - }), - SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { general_op_literal(drive, |code, c| code != lower_unicode(c)); - }), - SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_LOC_IGNORE => { general_op_literal(drive, char_loc_ignore); - }), - SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); - }), - SreOpcode::MARK => once(|drive| { + None + } + SreOpcode::MARK => { drive .state .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); - }), - SreOpcode::REPEAT => twice( + None + } + SreOpcode::REPEAT => Some(twice( // create repeat context. all the hard work is done by the UNTIL // operator (MAX_UNTIL, MIN_UNTIL) // <1=min> <2=max> item tail @@ -687,20 +699,28 @@ impl OpcodeDispatcher { let child_ctx = drive.state.popped_context.unwrap(); drive.ctx_mut().has_matched = child_ctx.has_matched; }, - ), - SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), - SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), - SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), - SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), + )), + SreOpcode::MAX_UNTIL => Some(Box::new(OpMaxUntil::default())), + SreOpcode::MIN_UNTIL => Some(Box::new(OpMinUntil::default())), + SreOpcode::REPEAT_ONE => Some(Box::new(OpRepeatOne::default())), + SreOpcode::MIN_REPEAT_ONE => Some(Box::new(OpMinRepeatOne::default())), + SreOpcode::GROUPREF => { + general_op_groupref(drive, |x| x); + None + } + SreOpcode::GROUPREF_IGNORE => { + general_op_groupref(drive, lower_ascii); + None + } SreOpcode::GROUPREF_LOC_IGNORE => { - once(|drive| general_op_groupref(drive, lower_locate)) + general_op_groupref(drive, lower_locate); + None } SreOpcode::GROUPREF_UNI_IGNORE => { - once(|drive| general_op_groupref(drive, lower_unicode)) + general_op_groupref(drive, lower_unicode); + None } - SreOpcode::GROUPREF_EXISTS => once(|drive| { + SreOpcode::GROUPREF_EXISTS => { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); match (group_start, group_end) { (Some(start), Some(end)) if start <= end => { @@ -708,7 +728,8 @@ impl OpcodeDispatcher { } _ => drive.skip_code(drive.peek_code(2) as usize + 1), } - }), + None + } _ => { // TODO python expcetion? unreachable!("unexpected opcode") From 7324feef89dd692d909c98849e54969617728ecf Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 21 Apr 2021 11:13:58 +0200 Subject: [PATCH 47/95] optimize search cache the string offset --- src/engine.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 2888b6930c..2b85ea3514 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -147,9 +147,11 @@ impl<'a> State<'a> { let mut dispatcher = OpcodeDispatcher::new(); + let mut start_offset = self.string.offset(0, self.start); + let ctx = MatchContext { string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_offset: start_offset, code_position: 0, has_matched: None, toplevel: true, @@ -160,11 +162,12 @@ impl<'a> State<'a> { self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; + start_offset = self.string.offset(start_offset, 1); self.reset(); dispatcher.clear(); let ctx = MatchContext { string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_offset: start_offset, code_position: 0, has_matched: None, toplevel: false, From 86435b8a4b44d0b79109ace6acd66cbfcebaac66 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 22 Apr 2021 17:15:03 +0200 Subject: [PATCH 48/95] add benchmark --- benches/benches.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++ generate_tests.py | 5 +- 2 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 benches/benches.rs diff --git a/benches/benches.rs b/benches/benches.rs new file mode 100644 index 0000000000..b86a592967 --- /dev/null +++ b/benches/benches.rs @@ -0,0 +1,112 @@ +#![feature(test)] + +extern crate test; +use test::Bencher; + +use sre_engine::constants::SreFlag; +use sre_engine::engine; +pub struct Pattern { + pub code: &'static [u32], + pub flags: SreFlag, +} + +impl Pattern { + pub fn state<'a>( + &self, + string: impl Into>, + range: std::ops::Range, + ) -> engine::State<'a> { + engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + } +} +#[bench] +fn benchmarks(b: &mut Bencher) { + // # test common prefix + // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('Python') #, 'Python'), # Simple text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*Python') #, 'Python'), # Bad text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*Python.*') #, 'Python'), # Worse text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + + let tests = [ + (p1, "Perl"), + (p2, "Perl"), + (p3, "Perl"), + (p4, "Perl"), + (p5, "PythonPython"), + (p6, "a5,b7,c9,"), + (p7, "a5,b7,c9,"), + (p8, "Python"), + (p9, "Python"), + (p10, "Python"), + (p11, "Python"), + ]; + + b.iter(move || { + for (p, s) in &tests { + let mut state = p.state(s.clone(), 0..usize::MAX); + state = state.search(); + assert!(state.has_matched); + state = p.state(s.clone(), 0..usize::MAX); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s.clone(), 0..usize::MAX); + state.match_all = true; + state = state.pymatch(); + assert!(state.has_matched); + let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); + state = p.state(s2.as_str(), 0..usize::MAX); + state = state.search(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..usize::MAX); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..10000 + s.len()); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..10000 + s.len()); + state.match_all = true; + state = state.pymatch(); + assert!(state.has_matched); + } + }) +} diff --git a/generate_tests.py b/generate_tests.py index 7af1d2f0c2..b432720cd1 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -5,6 +5,7 @@ import sre_compile import sre_parse import json +from itertools import chain m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read()) sre_engine_magic = int(m.group(1)) @@ -38,8 +39,8 @@ def replace_compiled(m): {indent}#[rustfmt::skip] let {varname} = {pattern}; {indent}// END GENERATED''' -with os.scandir("tests") as d: - for f in d: +with os.scandir("tests") as t, os.scandir("benches") as b: + for f in chain(t, b): path = Path(f.path) if path.suffix == ".rs": replaced = pattern_pattern.sub(replace_compiled, path.read_text()) From 58981a41e99cbcb53002b559ea13c7131b597938 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 22 Apr 2021 19:27:24 +0200 Subject: [PATCH 49/95] optimize; replace hashmap with btreemap --- src/engine.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 2b85ea3514..3837974838 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,7 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use super::MAXREPEAT; -use std::collections::HashMap; +use std::collections::BTreeMap; use std::convert::TryFrom; const fn is_py_ascii_whitespace(b: u8) -> bool { @@ -204,7 +204,7 @@ impl<'a> StrDrive<'a> { .get(offset..) .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) .unwrap_or_else(|| s.len()), - StrDrive::Bytes(b) => std::cmp::min(offset + skip, b.len()), + StrDrive::Bytes(_) => offset + skip, } } @@ -294,8 +294,7 @@ trait MatchContextDrive { .state() .string .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position = - std::cmp::min(self.ctx().string_position + skip_count, self.state().end); + self.ctx_mut().string_position += skip_count; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; @@ -451,12 +450,12 @@ where } struct OpcodeDispatcher { - executing_contexts: HashMap>, + executing_contexts: BTreeMap>, } impl OpcodeDispatcher { fn new() -> Self { Self { - executing_contexts: HashMap::new(), + executing_contexts: BTreeMap::new(), } } fn clear(&mut self) { @@ -487,8 +486,7 @@ impl OpcodeDispatcher { fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { let executor = self .executing_contexts - .remove_entry(&drive.id()) - .map(|(_, x)| x) + .remove(&drive.id()) .or_else(|| self.dispatch_table(opcode, drive)); if let Some(mut executor) = executor { if let Some(()) = executor.next(drive) { From 74ebdaf4e8a50330ea6195333bcb47e086ff3b5e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 11 Jul 2022 21:30:48 +0200 Subject: [PATCH 50/95] fix panic OpMinUntil return before restore repeat --- .vscode/launch.json | 21 +++++++++++++++++++++ src/engine.rs | 10 ++++++---- tests/tests.rs | 11 +++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000..5ebfe34f05 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug Unit Test", + "cargo": { + "args": [ + "test", + "--no-run" + ], + "filter": { + "kind": "test" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/src/engine.rs b/src/engine.rs index 3837974838..d4e036ff32 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -14,7 +14,7 @@ pub struct State<'a> { pub string: StrDrive<'a>, pub start: usize, pub end: usize, - flags: SreFlag, + _flags: SreFlag, pattern_codes: &'a [u32], pub marks: Vec>, pub lastindex: isize, @@ -42,7 +42,7 @@ impl<'a> State<'a> { string, start, end, - flags, + _flags: flags, pattern_codes, lastindex: -1, marks_stack: Vec::new(), @@ -1380,16 +1380,18 @@ impl OpcodeExecutor for OpMinUntil { None } 2 => { + // restore repeat before return + drive.state.repeat_stack.push(self.save_repeat.unwrap()); + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; } - drive.state.repeat_stack.push(self.save_repeat.unwrap()); drive.state.string_position = drive.ctx().string_position; drive.state.marks_pop(); - // match more unital tail matches + // match more until tail matches let RepeatContext { count: _, code_position, diff --git a/tests/tests.rs b/tests/tests.rs index d76ff3cfb5..b430947a9b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -60,3 +60,14 @@ fn test_zerowidth() { state = state.search(); assert!(state.string_position == 1); } + +#[test] +fn test_repeat_context_panic() { + // pattern p = re.compile(r'(?:a*?(xx)??z)*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("axxzaz", 0..usize::MAX); + state = state.pymatch(); + assert!(state.marks == vec![Some(1), Some(3)]); +} \ No newline at end of file From 919e1d7933b62bba0830b5b8dce63c1dfc758056 Mon Sep 17 00:00:00 2001 From: Steve Shi Date: Tue, 26 Jul 2022 20:38:03 +0200 Subject: [PATCH 51/95] Refactor and fix multiple max_until recusion (#10) * wip refactor engine * wip 2 refactor engine * wip 3 refactor engine * wip 3 refactor engine * wip 4 refactor engine * wip 5 refactor engine * refactor seperate Stacks * fix clippy * fix pymatch and search restore _stacks * fix toplevel * fix marks panic * fix double max_until repeat context * clearup * update version to 0.2.0 --- Cargo.toml | 2 +- src/engine.rs | 1700 ++++++++++++++++++++++++------------------------ src/lib.rs | 2 +- tests/tests.rs | 13 +- 4 files changed, 847 insertions(+), 870 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 614243eeb1..6ba3996947 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.2" +version = "0.2.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index d4e036ff32..81903ccfdd 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,6 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use super::MAXREPEAT; -use std::collections::BTreeMap; use std::convert::TryFrom; const fn is_py_ascii_whitespace(b: u8) -> bool { @@ -20,7 +19,7 @@ pub struct State<'a> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, - repeat_stack: Vec, + _stacks: Option>, pub string_position: usize, popped_context: Option, pub has_matched: bool, @@ -44,11 +43,11 @@ impl<'a> State<'a> { end, _flags: flags, pattern_codes, + marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - repeat_stack: Vec::new(), - marks: Vec::new(), + _stacks: Default::default(), string_position: start, popped_context: None, has_matched: false, @@ -59,10 +58,12 @@ impl<'a> State<'a> { pub fn reset(&mut self) { self.lastindex = -1; + self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - self.repeat_stack.clear(); - self.marks.clear(); + if let Some(x) = self._stacks.as_mut() { + x.clear() + }; self.string_position = self.start; self.popped_context = None; self.has_matched = false; @@ -102,51 +103,71 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - fn _match(mut self, dispatcher: &mut OpcodeDispatcher) -> Self { - let mut has_matched = None; + fn _match(mut self, stacks: &mut Stacks) -> Self { + while let Some(ctx) = self.context_stack.pop() { + let mut drive = StateContext { + state: self, + ctx, + next_ctx: None, + }; - loop { - if self.context_stack.is_empty() { - break; + if let Some(handler) = drive.ctx.handler { + handler(&mut drive, stacks); + } else if drive.remaining_codes() > 0 { + let code = drive.peek_code(0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(code, &mut drive, stacks); + } else { + drive.failure(); } - let ctx_id = self.context_stack.len() - 1; - let mut drive = StackDrive::drive(ctx_id, self); - has_matched = dispatcher.pymatch(&mut drive); - self = drive.take(); - if has_matched.is_some() { - self.popped_context = self.context_stack.pop(); + let StateContext { + mut state, + ctx, + next_ctx, + } = drive; + + if ctx.has_matched.is_some() { + state.popped_context = Some(ctx); + } else { + state.context_stack.push(ctx); + if let Some(next_ctx) = next_ctx { + state.context_stack.push(next_ctx); + } } + self = state } - - self.has_matched = has_matched == Some(true); + self.has_matched = self.popped_context.unwrap().has_matched == Some(true); self } pub fn pymatch(mut self) -> Self { + let mut stacks = self._stacks.take().unwrap_or_default(); + let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), code_position: 0, has_matched: None, toplevel: true, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - let mut dispatcher = OpcodeDispatcher::new(); - - self._match(&mut dispatcher) + self = self._match(&mut stacks); + self._stacks = Some(stacks); + self } pub fn search(mut self) -> Self { + let mut stacks = self._stacks.take().unwrap_or_default(); // TODO: optimize by op info and skip prefix if self.start > self.end { return self; } - let mut dispatcher = OpcodeDispatcher::new(); - let mut start_offset = self.string.offset(0, self.start); let ctx = MatchContext { @@ -155,31 +176,664 @@ impl<'a> State<'a> { code_position: 0, has_matched: None, toplevel: true, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut dispatcher); + self = self._match(&mut stacks); self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; start_offset = self.string.offset(start_offset, 1); self.reset(); - dispatcher.clear(); + stacks.clear(); + let ctx = MatchContext { string_position: self.start, string_offset: start_offset, code_position: 0, has_matched: None, toplevel: false, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut dispatcher); + self = self._match(&mut stacks); } + self._stacks = Some(stacks); self } } +fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { + match opcode { + SreOpcode::FAILURE => { + drive.failure(); + } + SreOpcode::SUCCESS => { + drive.ctx.has_matched = Some(drive.can_success()); + if drive.ctx.has_matched == Some(true) { + drive.state.string_position = drive.ctx.string_position; + } + } + SreOpcode::ANY => { + if drive.at_end() || drive.at_linebreak() { + drive.failure(); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + } + SreOpcode::ANY_ALL => { + if drive.at_end() { + drive.failure(); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + } + SreOpcode::ASSERT => op_assert(drive), + SreOpcode::ASSERT_NOT => op_assert_not(drive), + SreOpcode::AT => { + let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); + if at(drive, atcode) { + drive.skip_code(2); + } else { + drive.failure(); + } + } + SreOpcode::BRANCH => op_branch(drive, stacks), + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); + if drive.at_end() || !category(catcode, drive.peek_char()) { + drive.failure(); + } else { + drive.skip_code(2); + drive.skip_char(1); + } + } + SreOpcode::IN => general_op_in(drive, charset), + SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), + SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), + SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), + SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal(drive, |code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal(drive, |code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal(drive, |code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) + } + SreOpcode::MARK => { + drive + .state + .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); + drive.skip_code(2); + } + SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), + SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), + SreOpcode::REPEAT => op_repeat(drive, stacks), + SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), + SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + drive.skip_code(3); + } + _ => drive.skip_code_from(2), + } + } + _ => unreachable!("unexpected opcode"), + } +} + +/* assert subpattern */ +/* */ +fn op_assert(drive: &mut StateContext) { + let back = drive.peek_code(2) as usize; + + if drive.ctx.string_position < back { + return drive.failure(); + } + + let offset = drive + .state + .string + .back_offset(drive.ctx.string_offset, back); + let position = drive.ctx.string_position - back; + + drive.state.string_position = position; + + let next_ctx = drive.next_ctx(3, |drive, _| { + if drive.popped_ctx().has_matched == Some(true) { + drive.ctx.handler = None; + drive.skip_code_from(1); + } else { + drive.failure(); + } + }); + next_ctx.string_position = position; + next_ctx.string_offset = offset; + next_ctx.toplevel = false; +} + +/* assert not subpattern */ +/* */ +fn op_assert_not(drive: &mut StateContext) { + let back = drive.peek_code(2) as usize; + + if drive.ctx.string_position < back { + return drive.skip_code_from(1); + } + + let offset = drive + .state + .string + .back_offset(drive.ctx.string_offset, back); + let position = drive.ctx.string_position - back; + + drive.state.string_position = position; + + let next_ctx = drive.next_ctx(3, |drive, _| { + if drive.popped_ctx().has_matched == Some(true) { + drive.failure(); + } else { + drive.ctx.handler = None; + drive.skip_code_from(1); + } + }); + next_ctx.string_position = position; + next_ctx.string_offset = offset; + next_ctx.toplevel = false; +} + +#[derive(Debug)] +struct BranchContext { + branch_offset: usize, +} + +// alternation +// <0=skip> code ... +fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { + drive.state.marks_push(); + stacks.branch.push(BranchContext { branch_offset: 1 }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let branch_offset = stacks.branch_last().branch_offset; + let next_length = drive.peek_code(branch_offset) as usize; + if next_length == 0 { + drive.state.marks_pop_discard(); + stacks.branch.pop(); + return drive.failure(); + } + + drive.sync_string_position(); + + stacks.branch_last().branch_offset += next_length; + drive.next_ctx(branch_offset + 1, callback); + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.branch.pop(); + return drive.success(); + } + drive.state.marks_pop_keep(); + drive.ctx.handler = Some(create_context) + } +} + +#[derive(Debug, Copy, Clone)] +struct MinRepeatOneContext { + count: usize, + max_count: usize, +} + +/* <1=min> <2=max> item tail */ +fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { + let min_count = drive.peek_code(2) as usize; + let max_count = drive.peek_code(3) as usize; + + if drive.remaining_chars() < min_count { + return drive.failure(); + } + + drive.sync_string_position(); + + let count = if min_count == 0 { + 0 + } else { + let count = count(drive, stacks, min_count); + if count < min_count { + return drive.failure(); + } + drive.skip_char(count); + count + }; + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { + // tail is empty. we're finished + drive.sync_string_position(); + return drive.success(); + } + + drive.state.marks_push(); + stacks + .min_repeat_one + .push(MinRepeatOneContext { count, max_count }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); + + if max_count == MAXREPEAT || count <= max_count { + drive.sync_string_position(); + drive.next_ctx_from(1, callback); + } else { + drive.state.marks_pop_discard(); + stacks.min_repeat_one.pop(); + drive.failure(); + } + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_repeat_one.pop(); + return drive.success(); + } + + drive.sync_string_position(); + + if crate::engine::count(drive, stacks, 1) == 0 { + drive.state.marks_pop_discard(); + stacks.min_repeat_one.pop(); + return drive.failure(); + } + + drive.skip_char(1); + stacks.min_repeat_one_last().count += 1; + drive.state.marks_pop_keep(); + create_context(drive, stacks); + } +} + +#[derive(Debug, Copy, Clone)] +struct RepeatOneContext { + count: usize, + min_count: usize, + following_literal: Option, +} + +/* match repeated sequence (maximizing regexp) */ + +/* this operator only works if the repeated item is +exactly one character wide, and we're not already +collecting backtracking points. for other cases, +use the MAX_REPEAT operator */ + +/* <1=min> <2=max> item tail */ +fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { + let min_count = drive.peek_code(2) as usize; + let max_count = drive.peek_code(3) as usize; + + if drive.remaining_chars() < min_count { + return drive.failure(); + } + + drive.sync_string_position(); + + let count = count(drive, stacks, max_count); + drive.skip_char(count); + if count < min_count { + return drive.failure(); + } + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { + // tail is empty. we're finished + drive.sync_string_position(); + return drive.success(); + } + + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + let following_literal = (next_code == SreOpcode::LITERAL as u32) + .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); + + drive.state.marks_push(); + stacks.repeat_one.push(RepeatOneContext { + count, + min_count, + following_literal, + }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let RepeatOneContext { + mut count, + min_count, + following_literal, + } = *stacks.repeat_one_last(); + + if let Some(c) = following_literal { + while drive.at_end() || drive.peek_char() != c { + if count <= min_count { + drive.state.marks_pop_discard(); + stacks.repeat_one.pop(); + return drive.failure(); + } + drive.back_skip_char(1); + count -= 1; + } + } + stacks.repeat_one_last().count = count; + + drive.sync_string_position(); + + // General case: backtracking + drive.next_ctx_from(1, callback); + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.repeat_one.pop(); + return drive.success(); + } + + let RepeatOneContext { + count, + min_count, + following_literal: _, + } = stacks.repeat_one_last(); + + if count <= min_count { + drive.state.marks_pop_discard(); + stacks.repeat_one.pop(); + return drive.failure(); + } + + drive.back_skip_char(1); + *count -= 1; + + drive.state.marks_pop_keep(); + create_context(drive, stacks); + } +} + +#[derive(Debug, Clone, Copy)] +struct RepeatContext { + count: isize, + min_count: usize, + max_count: usize, + code_position: usize, + last_position: usize, + prev_id: usize, +} + +/* create repeat context. all the hard work is done +by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +/* <1=min> <2=max> item tail */ +fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { + let repeat_ctx = RepeatContext { + count: -1, + min_count: drive.peek_code(2) as usize, + max_count: drive.peek_code(3) as usize, + code_position: drive.ctx.code_position, + last_position: std::usize::MAX, + prev_id: drive.ctx.repeat_ctx_id, + }; + + stacks.repeat.push(repeat_ctx); + + drive.sync_string_position(); + + let next_ctx = drive.next_ctx_from(1, |drive, stacks| { + drive.ctx.has_matched = drive.popped_ctx().has_matched; + stacks.repeat.pop(); + }); + next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; +} + +#[derive(Debug, Clone, Copy)] +struct MinUntilContext { + count: isize, + save_repeat_ctx: Option, + save_last_position: usize, +} + +/* minimizing repeat */ +fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { + let repeat_ctx = stacks.repeat.last_mut().unwrap(); + + drive.sync_string_position(); + + let count = repeat_ctx.count + 1; + + stacks.min_until.push(MinUntilContext { + count, + save_repeat_ctx: None, + save_last_position: repeat_ctx.last_position, + }); + + if (count as usize) < repeat_ctx.min_count { + // not enough matches + repeat_ctx.count = count; + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + return drive.success(); + } + + stacks.repeat_last().count = stacks.min_until_last().count - 1; + drive.sync_string_position(); + stacks.min_until.pop(); + drive.failure(); + }); + return; + } + + drive.state.marks_push(); + + // see if the tail matches + stacks.min_until_last().save_repeat_ctx = stacks.repeat.pop(); + + drive.next_ctx(1, |drive, stacks| { + let MinUntilContext { + count, + save_repeat_ctx, + save_last_position, + } = stacks.min_until_last(); + let count = *count; + + let mut repeat_ctx = save_repeat_ctx.take().unwrap(); + + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + // restore repeat before return + stacks.repeat.push(repeat_ctx); + return drive.success(); + } + + drive.sync_string_position(); + + drive.state.marks_pop(); + + // match more until tail matches + + if count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + || drive.state.string_position == repeat_ctx.last_position + { + stacks.min_until.pop(); + // restore repeat before return + stacks.repeat.push(repeat_ctx); + return drive.failure(); + } + + repeat_ctx.count = count; + /* zero-width match protection */ + *save_last_position = repeat_ctx.last_position; + repeat_ctx.last_position = drive.state.string_position; + + stacks.repeat.push(repeat_ctx); + + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + drive.success(); + } else { + stacks.repeat_last().count = stacks.min_until_last().count - 1; + drive.sync_string_position(); + stacks.min_until.pop(); + drive.failure(); + } + }); + }); +} + +#[derive(Debug, Clone, Copy)] +struct MaxUntilContext { + save_last_position: usize, +} + +/* maximizing repeat */ +fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { + // let repeat_ctx = stacks.repeat.last_mut().unwrap(); + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + + drive.sync_string_position(); + + repeat_ctx.count += 1; + + // let count = repeat_ctx.count + 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + // repeat_ctx.count = count; + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + // stacks.max_until.pop(); + drive.success(); + } else { + // let count = stacks.max_until_last().count; + // stacks.repeat_last().count -= 1; + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; + drive.sync_string_position(); + // stacks.max_until.pop(); + drive.failure(); + } + }); + return; + } + + stacks.max_until.push(MaxUntilContext { + save_last_position: repeat_ctx.last_position, + }); + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) + && drive.state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + repeat_ctx.last_position = drive.state.string_position; + + drive.state.marks_push(); + + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + let save_last_position = stacks.max_until_last().save_last_position; + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + if drive.popped_ctx().has_matched == Some(true) { + drive.state.marks_pop_discard(); + stacks.max_until.pop(); + return drive.success(); + } + drive.state.marks_pop(); + repeat_ctx.count -= 1; + drive.sync_string_position(); + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = drive.next_ctx(1, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + }); + return; + } + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = drive.next_ctx(1, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + + fn tail_callback(drive: &mut StateContext, stacks: &mut Stacks) { + stacks.max_until.pop(); + + if drive.popped_ctx().has_matched == Some(true) { + drive.success(); + } else { + drive.sync_string_position(); + drive.failure(); + } + } +} + +#[derive(Debug, Default)] +struct Stacks { + branch: Vec, + min_repeat_one: Vec, + repeat_one: Vec, + repeat: Vec, + min_until: Vec, + max_until: Vec, +} + +impl Stacks { + fn clear(&mut self) { + self.branch.clear(); + self.min_repeat_one.clear(); + self.repeat_one.clear(); + self.repeat.clear(); + self.min_until.clear(); + self.max_until.clear(); + } + + fn branch_last(&mut self) -> &mut BranchContext { + self.branch.last_mut().unwrap() + } + fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { + self.min_repeat_one.last_mut().unwrap() + } + fn repeat_one_last(&mut self) -> &mut RepeatOneContext { + self.repeat_one.last_mut().unwrap() + } + fn repeat_last(&mut self) -> &mut RepeatContext { + self.repeat.last_mut().unwrap() + } + fn min_until_last(&mut self) -> &mut MinUntilContext { + self.min_until.last_mut().unwrap() + } + fn max_until_last(&mut self) -> &mut MaxUntilContext { + self.max_until.last_mut().unwrap() + } +} + #[derive(Debug, Clone, Copy)] pub enum StrDrive<'a> { Str(&'a str), @@ -203,7 +857,7 @@ impl<'a> StrDrive<'a> { StrDrive::Str(s) => s .get(offset..) .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or_else(|| s.len()), + .unwrap_or(s.len()), StrDrive::Bytes(_) => offset + skip, } } @@ -264,31 +918,63 @@ impl<'a> StrDrive<'a> { } } -#[derive(Debug, Clone, Copy)] +type OpcodeHandler = fn(&mut StateContext, &mut Stacks); + +#[derive(Clone, Copy)] struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, + handler: Option, + repeat_ctx_id: usize, } -trait MatchContextDrive { - fn ctx_mut(&mut self) -> &mut MatchContext; +impl std::fmt::Debug for MatchContext { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MatchContext") + .field("string_position", &self.string_position) + .field("string_offset", &self.string_offset) + .field("code_position", &self.code_position) + .field("has_matched", &self.has_matched) + .field("toplevel", &self.toplevel) + .field("handler", &self.handler.map(|x| x as usize)) + .finish() + } +} + +trait ContextDrive { fn ctx(&self) -> &MatchContext; + fn ctx_mut(&mut self) -> &mut MatchContext; fn state(&self) -> &State; - fn repeat_ctx(&self) -> &RepeatContext { - self.state().repeat_stack.last().unwrap() + + fn popped_ctx(&self) -> &MatchContext { + self.state().popped_context.as_ref().unwrap() } + fn pattern(&self) -> &[u32] { &self.state().pattern_codes[self.ctx().code_position..] } + fn peek_char(&self) -> u32 { self.state().string.peek(self.ctx().string_offset) } fn peek_code(&self, peek: usize) -> u32 { self.state().pattern_codes[self.ctx().code_position + peek] } + + fn back_peek_char(&self) -> u32 { + self.state().string.back_peek(self.ctx().string_offset) + } + fn back_skip_char(&mut self, skip_count: usize) { + self.ctx_mut().string_position -= skip_count; + self.ctx_mut().string_offset = self + .state() + .string + .back_offset(self.ctx().string_offset, skip_count); + } + fn skip_char(&mut self, skip_count: usize) { self.ctx_mut().string_offset = self .state() @@ -299,12 +985,17 @@ trait MatchContextDrive { fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } + fn skip_code_from(&mut self, peek: usize) { + self.skip_code(self.peek_code(peek) as usize + 1); + } + fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position } fn remaining_codes(&self) -> usize { self.state().pattern_codes.len() - self.ctx().code_position } + fn at_beginning(&self) -> bool { // self.ctx().string_position == self.state().start self.ctx().string_position == 0 @@ -331,16 +1022,7 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this == that } - fn back_peek_char(&self) -> u32 { - self.state().string.back_peek(self.ctx().string_offset) - } - fn back_skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_position -= skip_count; - self.ctx_mut().string_offset = self - .state() - .string - .back_offset(self.ctx().string_offset, skip_count); - } + fn can_success(&self) -> bool { if !self.ctx().toplevel { return true; @@ -353,389 +1035,71 @@ trait MatchContextDrive { } true } -} -struct StackDrive<'a> { - state: State<'a>, - ctx_id: usize, -} -impl<'a> StackDrive<'a> { - fn id(&self) -> usize { - self.ctx_id - } - fn drive(ctx_id: usize, state: State<'a>) -> Self { - Self { state, ctx_id } - } - fn take(self) -> State<'a> { - self.state - } - fn push_new_context(&mut self, pattern_offset: usize) { - self.push_new_context_at(self.ctx().code_position + pattern_offset); - } - fn push_new_context_at(&mut self, code_position: usize) { - let mut child_ctx = MatchContext { ..*self.ctx() }; - child_ctx.code_position = code_position; - self.state.context_stack.push(child_ctx); - } - fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { - self.state.repeat_stack.last_mut().unwrap() - } -} -impl MatchContextDrive for StackDrive<'_> { - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.state.context_stack[self.ctx_id] + fn success(&mut self) { + self.ctx_mut().has_matched = Some(true); } - fn ctx(&self) -> &MatchContext { - &self.state.context_stack[self.ctx_id] - } - fn state(&self) -> &State { - &self.state + + fn failure(&mut self) { + self.ctx_mut().has_matched = Some(false); } } -struct WrapDrive<'a> { - stack_drive: &'a StackDrive<'a>, +struct StateContext<'a> { + state: State<'a>, ctx: MatchContext, + next_ctx: Option, } -impl<'a> WrapDrive<'a> { - fn drive(ctx: MatchContext, stack_drive: &'a StackDrive<'a>) -> Self { - Self { stack_drive, ctx } + +impl ContextDrive for StateContext<'_> { + fn ctx(&self) -> &MatchContext { + &self.ctx } -} -impl MatchContextDrive for WrapDrive<'_> { fn ctx_mut(&mut self) -> &mut MatchContext { &mut self.ctx } - fn ctx(&self) -> &MatchContext { - &self.ctx - } fn state(&self) -> &State { - self.stack_drive.state() + &self.state } } -trait OpcodeExecutor { - fn next(&mut self, drive: &mut StackDrive) -> Option<()>; -} +impl StateContext<'_> { + fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx(self.peek_code(peek) as usize + 1, handler) + } + fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx_at(self.ctx.code_position + offset, handler) + } + fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx = Some(MatchContext { + code_position, + has_matched: None, + handler: None, + ..self.ctx + }); + self.ctx.handler = Some(handler); + self.next_ctx.as_mut().unwrap() + } -struct OpTwice { - f1: Option, - f2: Option, -} -impl OpcodeExecutor for OpTwice -where - F1: FnOnce(&mut StackDrive) -> Option<()>, - F2: FnOnce(&mut StackDrive), -{ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - if let Some(f1) = self.f1.take() { - f1(drive) - } else if let Some(f2) = self.f2.take() { - f2(drive); - None - } else { - unreachable!() - } + fn sync_string_position(&mut self) { + self.state.string_position = self.ctx.string_position; } } -fn twice(f1: F1, f2: F2) -> Box> -where - F1: FnOnce(&mut StackDrive) -> Option<()>, - F2: FnOnce(&mut StackDrive), -{ - Box::new(OpTwice { - f1: Some(f1), - f2: Some(f2), - }) -} -struct OpcodeDispatcher { - executing_contexts: BTreeMap>, +struct StateRefContext<'a> { + entity: &'a StateContext<'a>, + ctx: MatchContext, } -impl OpcodeDispatcher { - fn new() -> Self { - Self { - executing_contexts: BTreeMap::new(), - } - } - fn clear(&mut self) { - self.executing_contexts.clear(); - } - // Returns True if the current context matches, False if it doesn't and - // None if matching is not finished, ie must be resumed after child - // contexts have been matched. - fn pymatch(&mut self, drive: &mut StackDrive) -> Option { - while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { - let code = drive.peek_code(0); - let opcode = SreOpcode::try_from(code).unwrap(); - if !self.dispatch(opcode, drive) { - return None; - } - } - match drive.ctx().has_matched { - Some(matched) => Some(matched), - None => { - drive.ctx_mut().has_matched = Some(false); - Some(false) - } - } - } - // Dispatches a context on a given opcode. Returns True if the context - // is done matching, False if it must be resumed when next encountered. - fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { - let executor = self - .executing_contexts - .remove(&drive.id()) - .or_else(|| self.dispatch_table(opcode, drive)); - if let Some(mut executor) = executor { - if let Some(()) = executor.next(drive) { - self.executing_contexts.insert(drive.id(), executor); - return false; - } - } - true +impl ContextDrive for StateRefContext<'_> { + fn ctx(&self) -> &MatchContext { + &self.ctx } - - fn dispatch_table( - &mut self, - opcode: SreOpcode, - drive: &mut StackDrive, - ) -> Option> { - match opcode { - SreOpcode::FAILURE => { - drive.ctx_mut().has_matched = Some(false); - None - } - SreOpcode::SUCCESS => { - drive.ctx_mut().has_matched = Some(drive.can_success()); - if drive.ctx().has_matched == Some(true) { - drive.state.string_position = drive.ctx().string_position; - } - None - } - SreOpcode::ANY => { - if drive.at_end() || drive.at_linebreak() { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - None - } - SreOpcode::ANY_ALL => { - if drive.at_end() { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - None - } - /* assert subpattern */ - /* */ - SreOpcode::ASSERT => Some(twice( - |drive| { - let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position; - if passed < back { - drive.ctx_mut().has_matched = Some(false); - return None; - } - let back_offset = drive - .state - .string - .back_offset(drive.ctx().string_offset, back); - - drive.state.string_position = drive.ctx().string_position - back; - - drive.push_new_context(3); - let child_ctx = drive.state.context_stack.last_mut().unwrap(); - child_ctx.toplevel = false; - child_ctx.string_position -= back; - child_ctx.string_offset = back_offset; - - Some(()) - }, - |drive| { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.skip_code(drive.peek_code(1) as usize + 1); - } else { - drive.ctx_mut().has_matched = Some(false); - } - }, - )), - SreOpcode::ASSERT_NOT => Some(twice( - |drive| { - let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position; - if passed < back { - drive.skip_code(drive.peek_code(1) as usize + 1); - return None; - } - let back_offset = drive - .state - .string - .back_offset(drive.ctx().string_offset, back); - - drive.state.string_position = drive.ctx().string_position - back; - - drive.push_new_context(3); - let child_ctx = drive.state.context_stack.last_mut().unwrap(); - child_ctx.toplevel = false; - child_ctx.string_position -= back; - child_ctx.string_offset = back_offset; - - Some(()) - }, - |drive| { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(drive.peek_code(1) as usize + 1); - } - }, - )), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); - if !at(drive, atcode) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(2); - } - None - } - SreOpcode::BRANCH => Some(Box::new(OpBranch::default())), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); - if drive.at_end() || !category(catcode, drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(2); - drive.skip_char(1); - } - None - } - SreOpcode::IN => { - general_op_in(drive, |set, c| charset(set, c)); - None - } - SreOpcode::IN_IGNORE => { - general_op_in(drive, |set, c| charset(set, lower_ascii(c))); - None - } - SreOpcode::IN_UNI_IGNORE => { - general_op_in(drive, |set, c| charset(set, lower_unicode(c))); - None - } - SreOpcode::IN_LOC_IGNORE => { - general_op_in(drive, |set, c| charset_loc_ignore(set, c)); - None - } - SreOpcode::INFO | SreOpcode::JUMP => { - drive.skip_code(drive.peek_code(1) as usize + 1); - None - } - SreOpcode::LITERAL => { - general_op_literal(drive, |code, c| code == c); - None - } - SreOpcode::NOT_LITERAL => { - general_op_literal(drive, |code, c| code != c); - None - } - SreOpcode::LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code == lower_ascii(c)); - None - } - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code != lower_ascii(c)); - None - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code == lower_unicode(c)); - None - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code != lower_unicode(c)); - None - } - SreOpcode::LITERAL_LOC_IGNORE => { - general_op_literal(drive, char_loc_ignore); - None - } - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); - None - } - SreOpcode::MARK => { - drive - .state - .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); - drive.skip_code(2); - None - } - SreOpcode::REPEAT => Some(twice( - // create repeat context. all the hard work is done by the UNTIL - // operator (MAX_UNTIL, MIN_UNTIL) - // <1=min> <2=max> item tail - |drive| { - let repeat = RepeatContext { - count: -1, - code_position: drive.ctx().code_position, - last_position: std::usize::MAX, - mincount: drive.peek_code(2) as usize, - maxcount: drive.peek_code(3) as usize, - }; - drive.state.repeat_stack.push(repeat); - drive.state.string_position = drive.ctx().string_position; - // execute UNTIL operator - drive.push_new_context(drive.peek_code(1) as usize + 1); - Some(()) - }, - |drive| { - drive.state.repeat_stack.pop(); - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - }, - )), - SreOpcode::MAX_UNTIL => Some(Box::new(OpMaxUntil::default())), - SreOpcode::MIN_UNTIL => Some(Box::new(OpMinUntil::default())), - SreOpcode::REPEAT_ONE => Some(Box::new(OpRepeatOne::default())), - SreOpcode::MIN_REPEAT_ONE => Some(Box::new(OpMinRepeatOne::default())), - SreOpcode::GROUPREF => { - general_op_groupref(drive, |x| x); - None - } - SreOpcode::GROUPREF_IGNORE => { - general_op_groupref(drive, lower_ascii); - None - } - SreOpcode::GROUPREF_LOC_IGNORE => { - general_op_groupref(drive, lower_locate); - None - } - SreOpcode::GROUPREF_UNI_IGNORE => { - general_op_groupref(drive, lower_unicode); - None - } - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - drive.skip_code(3); - } - _ => drive.skip_code(drive.peek_code(2) as usize + 1), - } - None - } - _ => { - // TODO python expcetion? - unreachable!("unexpected opcode") - } - } + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.ctx + } + fn state(&self) -> &State { + &self.entity.state } } @@ -752,60 +1116,63 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -fn general_op_groupref u32>(drive: &mut StackDrive, mut f: F) { +fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), _ => { - drive.ctx_mut().has_matched = Some(false); - return; + return drive.failure(); } }; - let mut wdrive = WrapDrive::drive(*drive.ctx(), &drive); - let mut gdrive = WrapDrive::drive( - MatchContext { + + let mut wdrive = StateRefContext { + entity: drive, + ctx: drive.ctx, + }; + let mut gdrive = StateRefContext { + entity: drive, + ctx: MatchContext { string_position: group_start, // TODO: cache the offset string_offset: drive.state.string.offset(0, group_start), - ..*drive.ctx() + ..drive.ctx }, - &drive, - ); + }; + for _ in group_start..group_end { if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - return; + return drive.failure(); } wdrive.skip_char(1); gdrive.skip_char(1); } - let position = wdrive.ctx().string_position; - let offset = wdrive.ctx().string_offset; + + let position = wdrive.ctx.string_position; + let offset = wdrive.ctx.string_offset; drive.skip_code(2); - drive.ctx_mut().string_position = position; - drive.ctx_mut().string_offset = offset; + drive.ctx.string_position = position; + drive.ctx.string_offset = offset; } -fn general_op_literal bool>(drive: &mut StackDrive, f: F) { +fn general_op_literal bool>(drive: &mut StateContext, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); + drive.failure(); } else { drive.skip_code(2); drive.skip_char(1); } } -fn general_op_in bool>(drive: &mut StackDrive, f: F) { - let skip = drive.peek_code(1) as usize; +fn general_op_in bool>(drive: &mut StateContext, f: F) { if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); + drive.failure(); } else { - drive.skip_code(skip + 1); + drive.skip_code_from(1); drive.skip_char(1); } } -fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { +fn at(drive: &StateContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), @@ -938,84 +1305,91 @@ fn charset(set: &[u32], ch: u32) -> bool { } /* General case */ -fn general_count(drive: &mut StackDrive, maxcount: usize) -> usize { +fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { let mut count = 0; - let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let max_count = std::cmp::min(max_count, drive.remaining_chars()); - let save_ctx = *drive.ctx(); + let save_ctx = drive.ctx; drive.skip_code(4); - let reset_position = drive.ctx().code_position; - - let mut dispatcher = OpcodeDispatcher::new(); - while count < maxcount { - drive.ctx_mut().code_position = reset_position; - dispatcher.dispatch(SreOpcode::try_from(drive.peek_code(0)).unwrap(), drive); - if drive.ctx().has_matched == Some(false) { + let reset_position = drive.ctx.code_position; + + while count < max_count { + drive.ctx.code_position = reset_position; + let code = drive.peek_code(0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(code, drive, stacks); + if drive.ctx.has_matched == Some(false) { break; } count += 1; } - *drive.ctx_mut() = save_ctx; + drive.ctx = save_ctx; count } -fn count(stack_drive: &mut StackDrive, maxcount: usize) -> usize { - let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); - let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); - let end = drive.ctx().string_position + maxcount; +fn count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { + let save_ctx = drive.ctx; + let max_count = std::cmp::min(max_count, drive.remaining_chars()); + let end = drive.ctx.string_position + max_count; let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); match opcode { SreOpcode::ANY => { - while !drive.ctx().string_position < end && !drive.at_linebreak() { + while !drive.ctx.string_position < end && !drive.at_linebreak() { drive.skip_char(1); } } SreOpcode::ANY_ALL => { - drive.skip_char(maxcount); + drive.skip_char(max_count); } SreOpcode::IN => { - while !drive.ctx().string_position < end + while !drive.ctx.string_position < end && charset(&drive.pattern()[2..], drive.peek_char()) { drive.skip_char(1); } } SreOpcode::LITERAL => { - general_count_literal(&mut drive, end, |code, c| code == c as u32); + general_count_literal(drive, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(&mut drive, end, |code, c| code != c as u32); + general_count_literal(drive, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code == lower_ascii(c) as u32); + general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code != lower_ascii(c) as u32); + general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, end, char_loc_ignore); + general_count_literal(drive, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code == lower_unicode(c) as u32); + general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); + general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - return general_count(stack_drive, maxcount); + return general_count(drive, stacks, max_count); } } - drive.ctx().string_position - drive.state().string_position + let count = drive.ctx.string_position - drive.state.string_position; + drive.ctx = save_ctx; + count } -fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { +fn general_count_literal bool>( + drive: &mut StateContext, + end: usize, + mut f: F, +) { let ch = drive.peek_code(1); - while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { + while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } @@ -1065,7 +1439,9 @@ fn upper_locate(ch: u32) -> u32 { } fn is_uni_digit(ch: u32) -> bool { // TODO: check with cpython - char::try_from(ch).map(|x| x.is_digit(10)).unwrap_or(false) + char::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) } fn is_uni_space(ch: u32) -> bool { // TODO: check with cpython @@ -1155,413 +1531,3 @@ fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { } offset } - -#[derive(Debug, Copy, Clone)] -struct RepeatContext { - count: isize, - code_position: usize, - // zero-width match protection - last_position: usize, - mincount: usize, - maxcount: usize, -} - -#[derive(Default)] -struct OpMinRepeatOne { - jump_id: usize, - mincount: usize, - maxcount: usize, - count: usize, -} -impl OpcodeExecutor for OpMinRepeatOne { - /* <1=min> <2=max> item tail */ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; - - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.state.string_position = drive.ctx().string_position; - - self.count = if self.mincount == 0 { - 0 - } else { - let count = count(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } - - drive.state.marks_push(); - self.jump_id = 1; - self.next(drive) - } - 1 => { - if self.maxcount == MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); - } - - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count(drive, 1) == 0 { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(1); - self.count += 1; - drive.state.marks_pop_keep(); - self.jump_id = 1; - self.next(drive) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpMaxUntil { - jump_id: usize, - count: isize, - save_last_position: usize, -} -impl OpcodeExecutor for OpMaxUntil { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let RepeatContext { - count, - code_position, - last_position, - mincount, - maxcount, - } = *drive.repeat_ctx(); - - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; - - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - return Some(()); - } - - if ((self.count as usize) < maxcount || maxcount == MAXREPEAT) - && drive.state.string_position != last_position - { - // we may have enough matches, if we can match another item, do so - drive.repeat_ctx_mut().count = self.count; - drive.state.marks_push(); - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context_at(code_position + 4); - self.jump_id = 2; - return Some(()); - } - - self.jump_id = 3; - self.next(drive) - } - 1 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - } - None - } - 2 => { - drive.repeat_ctx_mut().last_position = self.save_last_position; - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop(); - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - self.jump_id = 3; - self.next(drive) - } - 3 => { - // cannot match more repeated items here. make sure the tail matches - drive.push_new_context(1); - self.jump_id = 4; - Some(()) - } - 4 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - } - None - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpMinUntil { - jump_id: usize, - count: isize, - save_repeat: Option, - save_last_position: usize, -} -impl OpcodeExecutor for OpMinUntil { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let RepeatContext { - count, - code_position, - last_position: _, - mincount, - maxcount: _, - } = *drive.repeat_ctx(); - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; - - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - return Some(()); - } - - // see if the tail matches - drive.state.marks_push(); - self.save_repeat = drive.state.repeat_stack.pop(); - drive.push_new_context(1); - self.jump_id = 2; - Some(()) - } - 1 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.repeat_ctx_mut().last_position = self.save_last_position; - drive.state.string_position = drive.ctx().string_position; - } - None - } - 2 => { - // restore repeat before return - drive.state.repeat_stack.push(self.save_repeat.unwrap()); - - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.state.marks_pop(); - - // match more until tail matches - let RepeatContext { - count: _, - code_position, - last_position, - mincount: _, - maxcount, - } = *drive.repeat_ctx(); - - if self.count as usize >= maxcount && maxcount != MAXREPEAT - || drive.state.string_position == last_position - { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.repeat_ctx_mut().count = self.count; - - /* zero-width match protection */ - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpBranch { - jump_id: usize, - branch_offset: usize, -} -impl OpcodeExecutor for OpBranch { - // alternation - // <0=skip> code ... - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - drive.state.marks_push(); - // jump out the head - self.branch_offset = 1; - self.jump_id = 1; - self.next(drive) - } - 1 => { - let next_branch_length = drive.peek_code(self.branch_offset) as usize; - if next_branch_length == 0 { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(self.branch_offset + 1); - self.branch_offset += next_branch_length; - self.jump_id = 2; - Some(()) - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop_keep(); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpRepeatOne { - jump_id: usize, - mincount: usize, - maxcount: usize, - count: usize, - following_literal: Option, -} -impl OpcodeExecutor for OpRepeatOne { - /* match repeated sequence (maximizing regexp) */ - - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting backtracking points. for other cases, - use the MAX_REPEAT operator */ - - /* <1=min> <2=max> item tail */ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; - - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.state.string_position = drive.ctx().string_position; - - self.count = count(drive, self.maxcount); - drive.skip_char(self.count); - if self.count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } - - drive.state.marks_push(); - - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. - if next_code == SreOpcode::LITERAL as u32 { - self.following_literal = Some(drive.peek_code(drive.peek_code(1) as usize + 2)) - } - - self.jump_id = 1; - self.next(drive) - } - 1 => { - if let Some(c) = self.following_literal { - while drive.at_end() || drive.peek_char() != c { - if self.count <= self.mincount { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.back_skip_char(1); - self.count -= 1; - } - } - - // General case: backtracking - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - Some(()) - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - if self.count <= self.mincount { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.back_skip_char(1); - self.count -= 1; - - drive.state.marks_pop_keep(); - - self.jump_id = 1; - self.next(drive) - } - _ => unreachable!(), - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 4a3ed1b754..c23e807501 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ pub mod engine; pub const CODESIZE: usize = 4; #[cfg(target_pointer_width = "32")] -pub const MAXREPEAT: usize = usize::MAX; +pub const MAXREPEAT: usize = usize::MAX - 1; #[cfg(target_pointer_width = "64")] pub const MAXREPEAT: usize = u32::MAX as usize; diff --git a/tests/tests.rs b/tests/tests.rs index b430947a9b..e8ae487029 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -70,4 +70,15 @@ fn test_repeat_context_panic() { let mut state = p.state("axxzaz", 0..usize::MAX); state = state.pymatch(); assert!(state.marks == vec![Some(1), Some(3)]); -} \ No newline at end of file +} + +#[test] +fn test_double_max_until() { + // pattern p = re.compile(r'((1)?)*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("1111", 0..usize::MAX); + state = state.pymatch(); + assert!(state.string_position == 4); +} From 4007f8276550efb6aa1ada429a3e89c042eae86c Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 27 Jul 2022 21:33:20 +0200 Subject: [PATCH 52/95] optimize max_until and min_until --- src/engine.rs | 96 +++++++++++++++------------------------------------ 1 file changed, 27 insertions(+), 69 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 81903ccfdd..223aa3425c 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -420,7 +420,7 @@ fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { let count = if min_count == 0 { 0 } else { - let count = count(drive, stacks, min_count); + let count = _count(drive, stacks, min_count); if count < min_count { return drive.failure(); } @@ -462,7 +462,7 @@ fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - if crate::engine::count(drive, stacks, 1) == 0 { + if _count(drive, stacks, 1) == 0 { drive.state.marks_pop_discard(); stacks.min_repeat_one.pop(); return drive.failure(); @@ -500,7 +500,7 @@ fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - let count = count(drive, stacks, max_count); + let count = _count(drive, stacks, max_count); drive.skip_char(count); if count < min_count { return drive.failure(); @@ -614,9 +614,7 @@ fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { #[derive(Debug, Clone, Copy)] struct MinUntilContext { - count: isize, - save_repeat_ctx: Option, - save_last_position: usize, + save_repeat_ctx_id: usize, } /* minimizing repeat */ @@ -625,50 +623,35 @@ fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - let count = repeat_ctx.count + 1; - - stacks.min_until.push(MinUntilContext { - count, - save_repeat_ctx: None, - save_last_position: repeat_ctx.last_position, - }); + repeat_ctx.count += 1; - if (count as usize) < repeat_ctx.min_count { + if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - repeat_ctx.count = count; drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); - return drive.success(); + drive.success(); + } else { + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; + drive.sync_string_position(); + drive.failure(); } - - stacks.repeat_last().count = stacks.min_until_last().count - 1; - drive.sync_string_position(); - stacks.min_until.pop(); - drive.failure(); }); return; } drive.state.marks_push(); - // see if the tail matches - stacks.min_until_last().save_repeat_ctx = stacks.repeat.pop(); + stacks.min_until.push(MinUntilContext { + save_repeat_ctx_id: drive.ctx.repeat_ctx_id, + }); - drive.next_ctx(1, |drive, stacks| { - let MinUntilContext { - count, - save_repeat_ctx, - save_last_position, - } = stacks.min_until_last(); - let count = *count; + // see if the tail matches + let next_ctx = drive.next_ctx(1, |drive, stacks| { + drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - let mut repeat_ctx = save_repeat_ctx.take().unwrap(); + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); - // restore repeat before return - stacks.repeat.push(repeat_ctx); return drive.success(); } @@ -678,34 +661,27 @@ fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { // match more until tail matches - if count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT || drive.state.string_position == repeat_ctx.last_position { - stacks.min_until.pop(); - // restore repeat before return - stacks.repeat.push(repeat_ctx); + repeat_ctx.count -= 1; return drive.failure(); } - repeat_ctx.count = count; /* zero-width match protection */ - *save_last_position = repeat_ctx.last_position; repeat_ctx.last_position = drive.state.string_position; - stacks.repeat.push(repeat_ctx); - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); drive.success(); } else { - stacks.repeat_last().count = stacks.min_until_last().count - 1; + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; drive.sync_string_position(); - stacks.min_until.pop(); drive.failure(); } }); }); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; } #[derive(Debug, Clone, Copy)] @@ -715,28 +691,20 @@ struct MaxUntilContext { /* maximizing repeat */ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { - // let repeat_ctx = stacks.repeat.last_mut().unwrap(); let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; drive.sync_string_position(); repeat_ctx.count += 1; - // let count = repeat_ctx.count + 1; - if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // repeat_ctx.count = count; drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - // stacks.max_until.pop(); drive.success(); } else { - // let count = stacks.max_until_last().count; - // stacks.repeat_last().count -= 1; stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; drive.sync_string_position(); - // stacks.max_until.pop(); drive.failure(); } }); @@ -757,14 +725,15 @@ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { drive.state.marks_push(); drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - let save_last_position = stacks.max_until_last().save_last_position; + let save_last_position = stacks.max_until.pop().unwrap().save_last_position; let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; + if drive.popped_ctx().has_matched == Some(true) { drive.state.marks_pop_discard(); - stacks.max_until.pop(); return drive.success(); } + drive.state.marks_pop(); repeat_ctx.count -= 1; drive.sync_string_position(); @@ -782,9 +751,7 @@ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { let next_ctx = drive.next_ctx(1, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - fn tail_callback(drive: &mut StateContext, stacks: &mut Stacks) { - stacks.max_until.pop(); - + fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { if drive.popped_ctx().has_matched == Some(true) { drive.success(); } else { @@ -823,15 +790,6 @@ impl Stacks { fn repeat_one_last(&mut self) -> &mut RepeatOneContext { self.repeat_one.last_mut().unwrap() } - fn repeat_last(&mut self) -> &mut RepeatContext { - self.repeat.last_mut().unwrap() - } - fn min_until_last(&mut self) -> &mut MinUntilContext { - self.min_until.last_mut().unwrap() - } - fn max_until_last(&mut self) -> &mut MaxUntilContext { - self.max_until.last_mut().unwrap() - } } #[derive(Debug, Clone, Copy)] @@ -1327,7 +1285,7 @@ fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize count } -fn count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { let save_ctx = drive.ctx; let max_count = std::cmp::min(max_count, drive.remaining_chars()); let end = drive.ctx.string_position + max_count; From bf57f289bff1ec5633316a924e82fbb5f5ed6eb0 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 27 Jul 2022 21:33:43 +0200 Subject: [PATCH 53/95] update version to 0.2.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6ba3996947..00123d92c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.2.0" +version = "0.2.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 9058f287881af7fc0f004759184a0ff5d0811967 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 28 Jul 2022 22:46:28 +0200 Subject: [PATCH 54/95] refactor trait StrDrive instead enum --- src/engine.rs | 2049 +++++++++++++++++++++++++------------------------ 1 file changed, 1055 insertions(+), 994 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 223aa3425c..8865eb6a39 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -9,8 +9,8 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { } #[derive(Debug)] -pub struct State<'a> { - pub string: StrDrive<'a>, +pub struct State<'a, S: StrDrive> { + pub string: S, pub start: usize, pub end: usize, _flags: SreFlag, @@ -18,18 +18,25 @@ pub struct State<'a> { pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec, - _stacks: Option>, + context_stack: Vec>, + // branch_stack: Vec, + // min_repeat_one_stack: Vec, + // repeat_one_stack: Vec, + // repeat_stack: Vec, + // min_until_stack: Vec, + // max_until_stack: Vec, + // _stacks: Option>, pub string_position: usize, - popped_context: Option, + popped_context: Option>, + next_context: Option>, pub has_matched: bool, pub match_all: bool, pub must_advance: bool, } -impl<'a> State<'a> { +impl<'a, S: StrDrive> State<'a, S> { pub fn new( - string: StrDrive<'a>, + string: S, start: usize, end: usize, flags: SreFlag, @@ -47,9 +54,16 @@ impl<'a> State<'a> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - _stacks: Default::default(), + // branch_stack: Vec::new(), + // min_repeat_one_stack: Vec::new(), + // repeat_one_stack: Vec::new(), + // repeat_stack: Vec::new(), + // min_until_stack: Vec::new(), + // max_until_stack: Vec::new(), + // _stacks: Default::default(), string_position: start, popped_context: None, + next_context: None, has_matched: false, match_all: false, must_advance: false, @@ -61,11 +75,15 @@ impl<'a> State<'a> { self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - if let Some(x) = self._stacks.as_mut() { - x.clear() - }; + // self.branch_stack.clear(); + // self.min_repeat_one_stack.clear(); + // self.repeat_one_stack.clear(); + // self.repeat_stack.clear(); + // self.min_until_stack.clear(); + // self.max_until_stack.clear(); self.string_position = self.start; self.popped_context = None; + self.next_context = None; self.has_matched = false; } @@ -103,47 +121,46 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - fn _match(mut self, stacks: &mut Stacks) -> Self { - while let Some(ctx) = self.context_stack.pop() { - let mut drive = StateContext { - state: self, - ctx, - next_ctx: None, - }; + fn _match(&mut self) { + while let Some(mut ctx) = self.context_stack.pop() { + // let mut drive = StateContext { + // state: self, + // ctx, + // next_ctx: None, + // }; + // let mut state = self; - if let Some(handler) = drive.ctx.handler { - handler(&mut drive, stacks); - } else if drive.remaining_codes() > 0 { - let code = drive.peek_code(0); + if let Some(handler) = ctx.handler { + handler(self, &mut ctx); + } else if ctx.remaining_codes(self) > 0 { + let code = ctx.peek_code(self, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(code, &mut drive, stacks); + self.dispatch(code, &mut ctx); } else { - drive.failure(); + ctx.failure(); } - let StateContext { - mut state, - ctx, - next_ctx, - } = drive; + // let StateContext { + // mut state, + // ctx, + // next_ctx, + // } = drive; if ctx.has_matched.is_some() { - state.popped_context = Some(ctx); + self.popped_context = Some(ctx); } else { - state.context_stack.push(ctx); - if let Some(next_ctx) = next_ctx { - state.context_stack.push(next_ctx); + self.context_stack.push(ctx); + if let Some(next_ctx) = self.next_context.take() { + self.context_stack.push(next_ctx); } } - self = state + // self = state } - self.has_matched = self.popped_context.unwrap().has_matched == Some(true); - self + self.has_matched = self.popped_context.take().unwrap().has_matched == Some(true); + // self } pub fn pymatch(mut self) -> Self { - let mut stacks = self._stacks.take().unwrap_or_default(); - let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), @@ -155,13 +172,11 @@ impl<'a> State<'a> { }; self.context_stack.push(ctx); - self = self._match(&mut stacks); - self._stacks = Some(stacks); + self._match(); self } pub fn search(mut self) -> Self { - let mut stacks = self._stacks.take().unwrap_or_default(); // TODO: optimize by op info and skip prefix if self.start > self.end { @@ -180,14 +195,13 @@ impl<'a> State<'a> { repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut stacks); + self._match(); self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; start_offset = self.string.offset(start_offset, 1); self.reset(); - stacks.clear(); let ctx = MatchContext { string_position: self.start, @@ -199,697 +213,730 @@ impl<'a> State<'a> { repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut stacks); + self._match(); } - self._stacks = Some(stacks); self } -} -fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { - match opcode { - SreOpcode::FAILURE => { - drive.failure(); - } - SreOpcode::SUCCESS => { - drive.ctx.has_matched = Some(drive.can_success()); - if drive.ctx.has_matched == Some(true) { - drive.state.string_position = drive.ctx.string_position; - } - } - SreOpcode::ANY => { - if drive.at_end() || drive.at_linebreak() { - drive.failure(); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - } - SreOpcode::ANY_ALL => { - if drive.at_end() { - drive.failure(); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - } - SreOpcode::ASSERT => op_assert(drive), - SreOpcode::ASSERT_NOT => op_assert_not(drive), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); - if at(drive, atcode) { - drive.skip_code(2); - } else { - drive.failure(); - } - } - SreOpcode::BRANCH => op_branch(drive, stacks), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); - if drive.at_end() || !category(catcode, drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code(2); - drive.skip_char(1); - } - } - SreOpcode::IN => general_op_in(drive, charset), - SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), - SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), - SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), - SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), - SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code != lower_ascii(c)) - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code == lower_unicode(c)) - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code != lower_unicode(c)) - } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) - } - SreOpcode::MARK => { - drive - .state - .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); - drive.skip_code(2); - } - SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), - SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), - SreOpcode::REPEAT => op_repeat(drive, stacks), - SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), - SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - drive.skip_code(3); - } - _ => drive.skip_code_from(2), + fn dispatch(&mut self, opcode: SreOpcode, ctx: &mut MatchContext<'a, S>) { + match opcode { + SreOpcode::FAILURE => { + ctx.has_matched = Some(false); } + SreOpcode::SUCCESS => todo!(), + SreOpcode::ANY => todo!(), + SreOpcode::ANY_ALL => todo!(), + SreOpcode::ASSERT => todo!(), + SreOpcode::ASSERT_NOT => todo!(), + SreOpcode::AT => todo!(), + SreOpcode::BRANCH => todo!(), + SreOpcode::CALL => todo!(), + SreOpcode::CATEGORY => todo!(), + SreOpcode::CHARSET => todo!(), + SreOpcode::BIGCHARSET => todo!(), + SreOpcode::GROUPREF => todo!(), + SreOpcode::GROUPREF_EXISTS => todo!(), + SreOpcode::IN => todo!(), + SreOpcode::INFO => todo!(), + SreOpcode::JUMP => todo!(), + SreOpcode::LITERAL => todo!(), + SreOpcode::MARK => todo!(), + SreOpcode::MAX_UNTIL => todo!(), + SreOpcode::MIN_UNTIL => todo!(), + SreOpcode::NOT_LITERAL => todo!(), + SreOpcode::NEGATE => todo!(), + SreOpcode::RANGE => todo!(), + SreOpcode::REPEAT => todo!(), + SreOpcode::REPEAT_ONE => todo!(), + SreOpcode::SUBPATTERN => todo!(), + SreOpcode::MIN_REPEAT_ONE => todo!(), + SreOpcode::GROUPREF_IGNORE => todo!(), + SreOpcode::IN_IGNORE => todo!(), + SreOpcode::LITERAL_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_IGNORE => todo!(), + SreOpcode::GROUPREF_LOC_IGNORE => todo!(), + SreOpcode::IN_LOC_IGNORE => todo!(), + SreOpcode::LITERAL_LOC_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_LOC_IGNORE => todo!(), + SreOpcode::GROUPREF_UNI_IGNORE => todo!(), + SreOpcode::IN_UNI_IGNORE => todo!(), + SreOpcode::LITERAL_UNI_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_UNI_IGNORE => todo!(), + SreOpcode::RANGE_UNI_IGNORE => todo!(), } - _ => unreachable!("unexpected opcode"), - } -} - -/* assert subpattern */ -/* */ -fn op_assert(drive: &mut StateContext) { - let back = drive.peek_code(2) as usize; - - if drive.ctx.string_position < back { - return drive.failure(); - } - - let offset = drive - .state - .string - .back_offset(drive.ctx.string_offset, back); - let position = drive.ctx.string_position - back; - - drive.state.string_position = position; - - let next_ctx = drive.next_ctx(3, |drive, _| { - if drive.popped_ctx().has_matched == Some(true) { - drive.ctx.handler = None; - drive.skip_code_from(1); - } else { - drive.failure(); - } - }); - next_ctx.string_position = position; - next_ctx.string_offset = offset; - next_ctx.toplevel = false; -} - -/* assert not subpattern */ -/* */ -fn op_assert_not(drive: &mut StateContext) { - let back = drive.peek_code(2) as usize; - - if drive.ctx.string_position < back { - return drive.skip_code_from(1); - } - - let offset = drive - .state - .string - .back_offset(drive.ctx.string_offset, back); - let position = drive.ctx.string_position - back; - - drive.state.string_position = position; - - let next_ctx = drive.next_ctx(3, |drive, _| { - if drive.popped_ctx().has_matched == Some(true) { - drive.failure(); - } else { - drive.ctx.handler = None; - drive.skip_code_from(1); - } - }); - next_ctx.string_position = position; - next_ctx.string_offset = offset; - next_ctx.toplevel = false; -} - -#[derive(Debug)] -struct BranchContext { - branch_offset: usize, -} - -// alternation -// <0=skip> code ... -fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { - drive.state.marks_push(); - stacks.branch.push(BranchContext { branch_offset: 1 }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let branch_offset = stacks.branch_last().branch_offset; - let next_length = drive.peek_code(branch_offset) as usize; - if next_length == 0 { - drive.state.marks_pop_discard(); - stacks.branch.pop(); - return drive.failure(); - } - - drive.sync_string_position(); - - stacks.branch_last().branch_offset += next_length; - drive.next_ctx(branch_offset + 1, callback); - } - - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.branch.pop(); - return drive.success(); - } - drive.state.marks_pop_keep(); - drive.ctx.handler = Some(create_context) - } -} - -#[derive(Debug, Copy, Clone)] -struct MinRepeatOneContext { - count: usize, - max_count: usize, -} - -/* <1=min> <2=max> item tail */ -fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { - let min_count = drive.peek_code(2) as usize; - let max_count = drive.peek_code(3) as usize; - - if drive.remaining_chars() < min_count { - return drive.failure(); - } - - drive.sync_string_position(); - - let count = if min_count == 0 { - 0 - } else { - let count = _count(drive, stacks, min_count); - if count < min_count { - return drive.failure(); - } - drive.skip_char(count); - count - }; - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.sync_string_position(); - return drive.success(); - } - - drive.state.marks_push(); - stacks - .min_repeat_one - .push(MinRepeatOneContext { count, max_count }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); - - if max_count == MAXREPEAT || count <= max_count { - drive.sync_string_position(); - drive.next_ctx_from(1, callback); - } else { - drive.state.marks_pop_discard(); - stacks.min_repeat_one.pop(); - drive.failure(); - } - } - - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.min_repeat_one.pop(); - return drive.success(); - } - - drive.sync_string_position(); - - if _count(drive, stacks, 1) == 0 { - drive.state.marks_pop_discard(); - stacks.min_repeat_one.pop(); - return drive.failure(); - } - - drive.skip_char(1); - stacks.min_repeat_one_last().count += 1; - drive.state.marks_pop_keep(); - create_context(drive, stacks); } } -#[derive(Debug, Copy, Clone)] -struct RepeatOneContext { - count: usize, - min_count: usize, - following_literal: Option, +// fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { +// match opcode { +// SreOpcode::FAILURE => { +// drive.failure(); +// } +// SreOpcode::SUCCESS => { +// drive.ctx.has_matched = Some(drive.can_success()); +// if drive.ctx.has_matched == Some(true) { +// drive.state.string_position = drive.ctx.string_position; +// } +// } +// SreOpcode::ANY => { +// if drive.at_end() || drive.at_linebreak() { +// drive.failure(); +// } else { +// drive.skip_code(1); +// drive.skip_char(1); +// } +// } +// SreOpcode::ANY_ALL => { +// if drive.at_end() { +// drive.failure(); +// } else { +// drive.skip_code(1); +// drive.skip_char(1); +// } +// } +// SreOpcode::ASSERT => op_assert(drive), +// SreOpcode::ASSERT_NOT => op_assert_not(drive), +// SreOpcode::AT => { +// let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); +// if at(drive, atcode) { +// drive.skip_code(2); +// } else { +// drive.failure(); +// } +// } +// SreOpcode::BRANCH => op_branch(drive, stacks), +// SreOpcode::CATEGORY => { +// let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); +// if drive.at_end() || !category(catcode, drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code(2); +// drive.skip_char(1); +// } +// } +// SreOpcode::IN => general_op_in(drive, charset), +// SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), +// SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), +// SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), +// SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), +// SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), +// SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), +// SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), +// SreOpcode::NOT_LITERAL_IGNORE => { +// general_op_literal(drive, |code, c| code != lower_ascii(c)) +// } +// SreOpcode::LITERAL_UNI_IGNORE => { +// general_op_literal(drive, |code, c| code == lower_unicode(c)) +// } +// SreOpcode::NOT_LITERAL_UNI_IGNORE => { +// general_op_literal(drive, |code, c| code != lower_unicode(c)) +// } +// SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), +// SreOpcode::NOT_LITERAL_LOC_IGNORE => { +// general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) +// } +// SreOpcode::MARK => { +// drive +// .state +// .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); +// drive.skip_code(2); +// } +// SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), +// SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), +// SreOpcode::REPEAT => op_repeat(drive, stacks), +// SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), +// SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), +// SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), +// SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), +// SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), +// SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), +// SreOpcode::GROUPREF_EXISTS => { +// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); +// match (group_start, group_end) { +// (Some(start), Some(end)) if start <= end => { +// drive.skip_code(3); +// } +// _ => drive.skip_code_from(2), +// } +// } +// _ => unreachable!("unexpected opcode"), +// } +// } + +// /* assert subpattern */ +// /* */ +// fn op_assert(drive: &mut StateContext) { +// let back = drive.peek_code(2) as usize; + +// if drive.ctx.string_position < back { +// return drive.failure(); +// } + +// let offset = drive +// .state +// .string +// .back_offset(drive.ctx.string_offset, back); +// let position = drive.ctx.string_position - back; + +// drive.state.string_position = position; + +// let next_ctx = drive.next_ctx(3, |drive, _| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.ctx.handler = None; +// drive.skip_code_from(1); +// } else { +// drive.failure(); +// } +// }); +// next_ctx.string_position = position; +// next_ctx.string_offset = offset; +// next_ctx.toplevel = false; +// } + +// /* assert not subpattern */ +// /* */ +// fn op_assert_not(drive: &mut StateContext) { +// let back = drive.peek_code(2) as usize; + +// if drive.ctx.string_position < back { +// return drive.skip_code_from(1); +// } + +// let offset = drive +// .state +// .string +// .back_offset(drive.ctx.string_offset, back); +// let position = drive.ctx.string_position - back; + +// drive.state.string_position = position; + +// let next_ctx = drive.next_ctx(3, |drive, _| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.failure(); +// } else { +// drive.ctx.handler = None; +// drive.skip_code_from(1); +// } +// }); +// next_ctx.string_position = position; +// next_ctx.string_offset = offset; +// next_ctx.toplevel = false; +// } + +// #[derive(Debug)] +// struct BranchContext { +// branch_offset: usize, +// } + +// // alternation +// // <0=skip> code ... +// fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { +// drive.state.marks_push(); +// stacks.branch.push(BranchContext { branch_offset: 1 }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let branch_offset = stacks.branch_last().branch_offset; +// let next_length = drive.peek_code(branch_offset) as usize; +// if next_length == 0 { +// drive.state.marks_pop_discard(); +// stacks.branch.pop(); +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// stacks.branch_last().branch_offset += next_length; +// drive.next_ctx(branch_offset + 1, callback); +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.branch.pop(); +// return drive.success(); +// } +// drive.state.marks_pop_keep(); +// drive.ctx.handler = Some(create_context) +// } +// } + +// #[derive(Debug, Copy, Clone)] +// struct MinRepeatOneContext { +// count: usize, +// max_count: usize, +// } + +// /* <1=min> <2=max> item tail */ +// fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { +// let min_count = drive.peek_code(2) as usize; +// let max_count = drive.peek_code(3) as usize; + +// if drive.remaining_chars() < min_count { +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// let count = if min_count == 0 { +// 0 +// } else { +// let count = _count(drive, stacks, min_count); +// if count < min_count { +// return drive.failure(); +// } +// drive.skip_char(count); +// count +// }; + +// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); +// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { +// // tail is empty. we're finished +// drive.sync_string_position(); +// return drive.success(); +// } + +// drive.state.marks_push(); +// stacks +// .min_repeat_one +// .push(MinRepeatOneContext { count, max_count }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); + +// if max_count == MAXREPEAT || count <= max_count { +// drive.sync_string_position(); +// drive.next_ctx_from(1, callback); +// } else { +// drive.state.marks_pop_discard(); +// stacks.min_repeat_one.pop(); +// drive.failure(); +// } +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.min_repeat_one.pop(); +// return drive.success(); +// } + +// drive.sync_string_position(); + +// if _count(drive, stacks, 1) == 0 { +// drive.state.marks_pop_discard(); +// stacks.min_repeat_one.pop(); +// return drive.failure(); +// } + +// drive.skip_char(1); +// stacks.min_repeat_one_last().count += 1; +// drive.state.marks_pop_keep(); +// create_context(drive, stacks); +// } +// } + +// #[derive(Debug, Copy, Clone)] +// struct RepeatOneContext { +// count: usize, +// min_count: usize, +// following_literal: Option, +// } + +// /* match repeated sequence (maximizing regexp) */ + +// /* this operator only works if the repeated item is +// exactly one character wide, and we're not already +// collecting backtracking points. for other cases, +// use the MAX_REPEAT operator */ + +// /* <1=min> <2=max> item tail */ +// fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { +// let min_count = drive.peek_code(2) as usize; +// let max_count = drive.peek_code(3) as usize; + +// if drive.remaining_chars() < min_count { +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// let count = _count(drive, stacks, max_count); +// drive.skip_char(count); +// if count < min_count { +// return drive.failure(); +// } + +// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); +// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { +// // tail is empty. we're finished +// drive.sync_string_position(); +// return drive.success(); +// } + +// // Special case: Tail starts with a literal. Skip positions where +// // the rest of the pattern cannot possibly match. +// let following_literal = (next_code == SreOpcode::LITERAL as u32) +// .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); + +// drive.state.marks_push(); +// stacks.repeat_one.push(RepeatOneContext { +// count, +// min_count, +// following_literal, +// }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let RepeatOneContext { +// mut count, +// min_count, +// following_literal, +// } = *stacks.repeat_one_last(); + +// if let Some(c) = following_literal { +// while drive.at_end() || drive.peek_char() != c { +// if count <= min_count { +// drive.state.marks_pop_discard(); +// stacks.repeat_one.pop(); +// return drive.failure(); +// } +// drive.back_skip_char(1); +// count -= 1; +// } +// } +// stacks.repeat_one_last().count = count; + +// drive.sync_string_position(); + +// // General case: backtracking +// drive.next_ctx_from(1, callback); +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.repeat_one.pop(); +// return drive.success(); +// } + +// let RepeatOneContext { +// count, +// min_count, +// following_literal: _, +// } = stacks.repeat_one_last(); + +// if count <= min_count { +// drive.state.marks_pop_discard(); +// stacks.repeat_one.pop(); +// return drive.failure(); +// } + +// drive.back_skip_char(1); +// *count -= 1; + +// drive.state.marks_pop_keep(); +// create_context(drive, stacks); +// } +// } + +// #[derive(Debug, Clone, Copy)] +// struct RepeatContext { +// count: isize, +// min_count: usize, +// max_count: usize, +// code_position: usize, +// last_position: usize, +// prev_id: usize, +// } + +// /* create repeat context. all the hard work is done +// by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +// /* <1=min> <2=max> item tail */ +// fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = RepeatContext { +// count: -1, +// min_count: drive.peek_code(2) as usize, +// max_count: drive.peek_code(3) as usize, +// code_position: drive.ctx.code_position, +// last_position: std::usize::MAX, +// prev_id: drive.ctx.repeat_ctx_id, +// }; + +// stacks.repeat.push(repeat_ctx); + +// drive.sync_string_position(); + +// let next_ctx = drive.next_ctx_from(1, |drive, stacks| { +// drive.ctx.has_matched = drive.popped_ctx().has_matched; +// stacks.repeat.pop(); +// }); +// next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; +// } + +// #[derive(Debug, Clone, Copy)] +// struct MinUntilContext { +// save_repeat_ctx_id: usize, +// } + +// /* minimizing repeat */ +// fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = stacks.repeat.last_mut().unwrap(); + +// drive.sync_string_position(); + +// repeat_ctx.count += 1; + +// if (repeat_ctx.count as usize) < repeat_ctx.min_count { +// // not enough matches +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// drive.failure(); +// } +// }); +// return; +// } + +// drive.state.marks_push(); + +// stacks.min_until.push(MinUntilContext { +// save_repeat_ctx_id: drive.ctx.repeat_ctx_id, +// }); + +// // see if the tail matches +// let next_ctx = drive.next_ctx(1, |drive, stacks| { +// drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; + +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + +// if drive.popped_ctx().has_matched == Some(true) { +// return drive.success(); +// } + +// drive.sync_string_position(); + +// drive.state.marks_pop(); + +// // match more until tail matches + +// if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT +// || drive.state.string_position == repeat_ctx.last_position +// { +// repeat_ctx.count -= 1; +// return drive.failure(); +// } + +// /* zero-width match protection */ +// repeat_ctx.last_position = drive.state.string_position; + +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// drive.failure(); +// } +// }); +// }); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +// } + +// #[derive(Debug, Clone, Copy)] +// struct MaxUntilContext { +// save_last_position: usize, +// } + +// /* maximizing repeat */ +// fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + +// drive.sync_string_position(); + +// repeat_ctx.count += 1; + +// if (repeat_ctx.count as usize) < repeat_ctx.min_count { +// // not enough matches +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// drive.failure(); +// } +// }); +// return; +// } + +// stacks.max_until.push(MaxUntilContext { +// save_last_position: repeat_ctx.last_position, +// }); + +// if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) +// && drive.state.string_position != repeat_ctx.last_position +// { +// /* we may have enough matches, but if we can +// match another item, do so */ +// repeat_ctx.last_position = drive.state.string_position; + +// drive.state.marks_push(); + +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// let save_last_position = stacks.max_until.pop().unwrap().save_last_position; +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; +// repeat_ctx.last_position = save_last_position; + +// if drive.popped_ctx().has_matched == Some(true) { +// drive.state.marks_pop_discard(); +// return drive.success(); +// } + +// drive.state.marks_pop(); +// repeat_ctx.count -= 1; +// drive.sync_string_position(); + +// /* cannot match more repeated items here. make sure the +// tail matches */ +// let next_ctx = drive.next_ctx(1, tail_callback); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +// }); +// return; +// } + +// /* cannot match more repeated items here. make sure the +// tail matches */ +// let next_ctx = drive.next_ctx(1, tail_callback); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + +// fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// drive.sync_string_position(); +// drive.failure(); +// } +// } +// } + +// #[derive(Debug, Default)] +// struct Stacks { +// } + +// impl Stacks { +// fn clear(&mut self) { +// self.branch.clear(); +// self.min_repeat_one.clear(); +// self.repeat_one.clear(); +// self.repeat.clear(); +// self.min_until.clear(); +// self.max_until.clear(); +// } + +// fn branch_last(&mut self) -> &mut BranchContext { +// self.branch.last_mut().unwrap() +// } +// fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { +// self.min_repeat_one.last_mut().unwrap() +// } +// fn repeat_one_last(&mut self) -> &mut RepeatOneContext { +// self.repeat_one.last_mut().unwrap() +// } +// } + +pub trait StrDrive { + fn offset(&self, offset: usize, skip: usize) -> usize; + fn count(&self) -> usize; + fn peek(&self, offset: usize) -> u32; + fn back_peek(&self, offset: usize) -> u32; + fn back_offset(&self, offset: usize, skip: usize) -> usize; } -/* match repeated sequence (maximizing regexp) */ - -/* this operator only works if the repeated item is -exactly one character wide, and we're not already -collecting backtracking points. for other cases, -use the MAX_REPEAT operator */ - -/* <1=min> <2=max> item tail */ -fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { - let min_count = drive.peek_code(2) as usize; - let max_count = drive.peek_code(3) as usize; - - if drive.remaining_chars() < min_count { - return drive.failure(); - } - - drive.sync_string_position(); - let count = _count(drive, stacks, max_count); - drive.skip_char(count); - if count < min_count { - return drive.failure(); +impl<'a> StrDrive for &'a str { + fn offset(&self, offset: usize, skip: usize) -> usize { + self.get(offset..) + .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) + .unwrap_or(self.len()) } - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.sync_string_position(); - return drive.success(); + fn count(&self) -> usize { + self.chars().count() } - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. - let following_literal = (next_code == SreOpcode::LITERAL as u32) - .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); - - drive.state.marks_push(); - stacks.repeat_one.push(RepeatOneContext { - count, - min_count, - following_literal, - }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let RepeatOneContext { - mut count, - min_count, - following_literal, - } = *stacks.repeat_one_last(); - - if let Some(c) = following_literal { - while drive.at_end() || drive.peek_char() != c { - if count <= min_count { - drive.state.marks_pop_discard(); - stacks.repeat_one.pop(); - return drive.failure(); - } - drive.back_skip_char(1); - count -= 1; - } - } - stacks.repeat_one_last().count = count; - - drive.sync_string_position(); - - // General case: backtracking - drive.next_ctx_from(1, callback); + fn peek(&self, offset: usize) -> u32 { + unsafe { self.get_unchecked(offset..) } + .chars() + .next() + .unwrap() as u32 } - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.repeat_one.pop(); - return drive.success(); - } - - let RepeatOneContext { - count, - min_count, - following_literal: _, - } = stacks.repeat_one_last(); - - if count <= min_count { - drive.state.marks_pop_discard(); - stacks.repeat_one.pop(); - return drive.failure(); + fn back_peek(&self, offset: usize) -> u32 { + let bytes = self.as_bytes(); + let back_offset = utf8_back_peek_offset(bytes, offset); + match offset - back_offset { + 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_be_bytes([0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1]]), + 4 => u32::from_be_bytes([ + bytes[offset - 4], + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], + ]), + _ => unreachable!(), } - - drive.back_skip_char(1); - *count -= 1; - - drive.state.marks_pop_keep(); - create_context(drive, stacks); } -} - -#[derive(Debug, Clone, Copy)] -struct RepeatContext { - count: isize, - min_count: usize, - max_count: usize, - code_position: usize, - last_position: usize, - prev_id: usize, -} - -/* create repeat context. all the hard work is done -by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -/* <1=min> <2=max> item tail */ -fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = RepeatContext { - count: -1, - min_count: drive.peek_code(2) as usize, - max_count: drive.peek_code(3) as usize, - code_position: drive.ctx.code_position, - last_position: std::usize::MAX, - prev_id: drive.ctx.repeat_ctx_id, - }; - - stacks.repeat.push(repeat_ctx); - - drive.sync_string_position(); - - let next_ctx = drive.next_ctx_from(1, |drive, stacks| { - drive.ctx.has_matched = drive.popped_ctx().has_matched; - stacks.repeat.pop(); - }); - next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; -} - -#[derive(Debug, Clone, Copy)] -struct MinUntilContext { - save_repeat_ctx_id: usize, -} - -/* minimizing repeat */ -fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = stacks.repeat.last_mut().unwrap(); - - drive.sync_string_position(); - - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - return; - } - - drive.state.marks_push(); - - stacks.min_until.push(MinUntilContext { - save_repeat_ctx_id: drive.ctx.repeat_ctx_id, - }); - - // see if the tail matches - let next_ctx = drive.next_ctx(1, |drive, stacks| { - drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - - if drive.popped_ctx().has_matched == Some(true) { - return drive.success(); - } - - drive.sync_string_position(); - - drive.state.marks_pop(); - - // match more until tail matches - - if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT - || drive.state.string_position == repeat_ctx.last_position - { - repeat_ctx.count -= 1; - return drive.failure(); - } - - /* zero-width match protection */ - repeat_ctx.last_position = drive.state.string_position; - - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - }); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -} - -#[derive(Debug, Clone, Copy)] -struct MaxUntilContext { - save_last_position: usize, -} - -/* maximizing repeat */ -fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - - drive.sync_string_position(); - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - return; - } - - stacks.max_until.push(MaxUntilContext { - save_last_position: repeat_ctx.last_position, - }); - - if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) - && drive.state.string_position != repeat_ctx.last_position - { - /* we may have enough matches, but if we can - match another item, do so */ - repeat_ctx.last_position = drive.state.string_position; - - drive.state.marks_push(); - - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - let save_last_position = stacks.max_until.pop().unwrap().save_last_position; - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - repeat_ctx.last_position = save_last_position; - - if drive.popped_ctx().has_matched == Some(true) { - drive.state.marks_pop_discard(); - return drive.success(); - } - - drive.state.marks_pop(); - repeat_ctx.count -= 1; - drive.sync_string_position(); - - /* cannot match more repeated items here. make sure the - tail matches */ - let next_ctx = drive.next_ctx(1, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - }); - return; - } - - /* cannot match more repeated items here. make sure the - tail matches */ - let next_ctx = drive.next_ctx(1, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - - fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - drive.sync_string_position(); - drive.failure(); + fn back_offset(&self, offset: usize, skip: usize) -> usize { + let bytes = self.as_bytes(); + let mut back_offset = offset; + for _ in 0..skip { + back_offset = utf8_back_peek_offset(bytes, back_offset); } + back_offset } } -#[derive(Debug, Default)] -struct Stacks { - branch: Vec, - min_repeat_one: Vec, - repeat_one: Vec, - repeat: Vec, - min_until: Vec, - max_until: Vec, -} - -impl Stacks { - fn clear(&mut self) { - self.branch.clear(); - self.min_repeat_one.clear(); - self.repeat_one.clear(); - self.repeat.clear(); - self.min_until.clear(); - self.max_until.clear(); - } - - fn branch_last(&mut self) -> &mut BranchContext { - self.branch.last_mut().unwrap() - } - fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { - self.min_repeat_one.last_mut().unwrap() - } - fn repeat_one_last(&mut self) -> &mut RepeatOneContext { - self.repeat_one.last_mut().unwrap() - } -} - -#[derive(Debug, Clone, Copy)] -pub enum StrDrive<'a> { - Str(&'a str), - Bytes(&'a [u8]), -} - -impl<'a> From<&'a str> for StrDrive<'a> { - fn from(s: &'a str) -> Self { - Self::Str(s) - } -} -impl<'a> From<&'a [u8]> for StrDrive<'a> { - fn from(b: &'a [u8]) -> Self { - Self::Bytes(b) - } -} - -impl<'a> StrDrive<'a> { +impl<'a> StrDrive for &'a [u8] { fn offset(&self, offset: usize, skip: usize) -> usize { - match *self { - StrDrive::Str(s) => s - .get(offset..) - .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or(s.len()), - StrDrive::Bytes(_) => offset + skip, - } + offset + skip } - pub fn count(&self) -> usize { - match *self { - StrDrive::Str(s) => s.chars().count(), - StrDrive::Bytes(b) => b.len(), - } + fn count(&self) -> usize { + self.len() } fn peek(&self, offset: usize) -> u32 { - match *self { - StrDrive::Str(s) => unsafe { s.get_unchecked(offset..) }.chars().next().unwrap() as u32, - StrDrive::Bytes(b) => b[offset] as u32, - } + self[offset] as u32 } fn back_peek(&self, offset: usize) -> u32 { - match *self { - StrDrive::Str(s) => { - let bytes = s.as_bytes(); - let back_offset = utf8_back_peek_offset(bytes, offset); - match offset - back_offset { - 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_be_bytes([ - 0, - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - 4 => u32::from_be_bytes([ - bytes[offset - 4], - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - _ => unreachable!(), - } - } - StrDrive::Bytes(b) => b[offset - 1] as u32, - } + self[offset - 1] as u32 } fn back_offset(&self, offset: usize, skip: usize) -> usize { - match *self { - StrDrive::Str(s) => { - let bytes = s.as_bytes(); - let mut back_offset = offset; - for _ in 0..skip { - back_offset = utf8_back_peek_offset(bytes, back_offset); - } - back_offset - } - StrDrive::Bytes(_) => offset - skip, - } + offset - skip } } -type OpcodeHandler = fn(&mut StateContext, &mut Stacks); +// type OpcodeHandler = for<'a>fn(&mut StateContext<'a, S>, &mut Stacks); #[derive(Clone, Copy)] -struct MatchContext { +struct MatchContext<'a, S: StrDrive> { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, + handler: Option, &mut Self)>, repeat_ctx_id: usize, } -impl std::fmt::Debug for MatchContext { +impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) @@ -902,164 +949,178 @@ impl std::fmt::Debug for MatchContext { } } -trait ContextDrive { - fn ctx(&self) -> &MatchContext; - fn ctx_mut(&mut self) -> &mut MatchContext; - fn state(&self) -> &State; - - fn popped_ctx(&self) -> &MatchContext { - self.state().popped_context.as_ref().unwrap() +impl<'a, S: StrDrive> MatchContext<'a, S> { + fn remaining_codes(&self, state: &State<'a, S>) -> usize { + state.pattern_codes.len() - self.code_position } - - fn pattern(&self) -> &[u32] { - &self.state().pattern_codes[self.ctx().code_position..] - } - - fn peek_char(&self) -> u32 { - self.state().string.peek(self.ctx().string_offset) - } - fn peek_code(&self, peek: usize) -> u32 { - self.state().pattern_codes[self.ctx().code_position + peek] - } - - fn back_peek_char(&self) -> u32 { - self.state().string.back_peek(self.ctx().string_offset) - } - fn back_skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_position -= skip_count; - self.ctx_mut().string_offset = self - .state() - .string - .back_offset(self.ctx().string_offset, skip_count); - } - - fn skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_offset = self - .state() - .string - .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position += skip_count; - } - fn skip_code(&mut self, skip_count: usize) { - self.ctx_mut().code_position += skip_count; - } - fn skip_code_from(&mut self, peek: usize) { - self.skip_code(self.peek_code(peek) as usize + 1); - } - - fn remaining_chars(&self) -> usize { - self.state().end - self.ctx().string_position - } - fn remaining_codes(&self) -> usize { - self.state().pattern_codes.len() - self.ctx().code_position - } - - fn at_beginning(&self) -> bool { - // self.ctx().string_position == self.state().start - self.ctx().string_position == 0 - } - fn at_end(&self) -> bool { - self.ctx().string_position == self.state().end - } - fn at_linebreak(&self) -> bool { - !self.at_end() && is_linebreak(self.peek_char()) - } - fn at_boundary bool>(&self, mut word_checker: F) -> bool { - if self.at_beginning() && self.at_end() { - return false; - } - let that = !self.at_beginning() && word_checker(self.back_peek_char()); - let this = !self.at_end() && word_checker(self.peek_char()); - this != that - } - fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { - if self.at_beginning() && self.at_end() { - return false; - } - let that = !self.at_beginning() && word_checker(self.back_peek_char()); - let this = !self.at_end() && word_checker(self.peek_char()); - this == that - } - - fn can_success(&self) -> bool { - if !self.ctx().toplevel { - return true; - } - if self.state().match_all && !self.at_end() { - return false; - } - if self.state().must_advance && self.ctx().string_position == self.state().start { - return false; - } - true - } - - fn success(&mut self) { - self.ctx_mut().has_matched = Some(true); + + fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { + state.pattern_codes[self.code_position + peek] } fn failure(&mut self) { - self.ctx_mut().has_matched = Some(false); + self.has_matched = Some(false); } } -struct StateContext<'a> { - state: State<'a>, - ctx: MatchContext, - next_ctx: Option, -} - -impl ContextDrive for StateContext<'_> { - fn ctx(&self) -> &MatchContext { - &self.ctx - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.ctx - } - fn state(&self) -> &State { - &self.state - } -} - -impl StateContext<'_> { - fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx(self.peek_code(peek) as usize + 1, handler) - } - fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx_at(self.ctx.code_position + offset, handler) - } - fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx = Some(MatchContext { - code_position, - has_matched: None, - handler: None, - ..self.ctx - }); - self.ctx.handler = Some(handler); - self.next_ctx.as_mut().unwrap() - } - - fn sync_string_position(&mut self) { - self.state.string_position = self.ctx.string_position; - } -} - -struct StateRefContext<'a> { - entity: &'a StateContext<'a>, - ctx: MatchContext, -} - -impl ContextDrive for StateRefContext<'_> { - fn ctx(&self) -> &MatchContext { - &self.ctx - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.ctx - } - fn state(&self) -> &State { - &self.entity.state - } -} +// trait ContextDrive<'a, T: StrDrive<'a>> { +// fn ctx(&self) -> &MatchContext; +// fn ctx_mut(&mut self) -> &mut MatchContext; +// fn state(&self) -> &State<'a, T>; + +// fn popped_ctx(&self) -> &MatchContext { +// self.state().popped_context.as_ref().unwrap() +// } + +// fn pattern(&self) -> &[u32] { +// &self.state().pattern_codes[self.ctx().code_position..] +// } + +// fn peek_char(&self) -> u32 { +// self.state().string.peek(self.ctx().string_offset) +// } +// fn peek_code(&self, peek: usize) -> u32 { +// self.state().pattern_codes[self.ctx().code_position + peek] +// } + +// fn back_peek_char(&self) -> u32 { +// self.state().string.back_peek(self.ctx().string_offset) +// } +// fn back_skip_char(&mut self, skip_count: usize) { +// self.ctx_mut().string_position -= skip_count; +// self.ctx_mut().string_offset = self +// .state() +// .string +// .back_offset(self.ctx().string_offset, skip_count); +// } + +// fn skip_char(&mut self, skip_count: usize) { +// self.ctx_mut().string_offset = self +// .state() +// .string +// .offset(self.ctx().string_offset, skip_count); +// self.ctx_mut().string_position += skip_count; +// } +// fn skip_code(&mut self, skip_count: usize) { +// self.ctx_mut().code_position += skip_count; +// } +// fn skip_code_from(&mut self, peek: usize) { +// self.skip_code(self.peek_code(peek) as usize + 1); +// } + +// fn remaining_chars(&self) -> usize { +// self.state().end - self.ctx().string_position +// } +// fn remaining_codes(&self) -> usize { +// self.state().pattern_codes.len() - self.ctx().code_position +// } + +// fn at_beginning(&self) -> bool { +// // self.ctx().string_position == self.state().start +// self.ctx().string_position == 0 +// } +// fn at_end(&self) -> bool { +// self.ctx().string_position == self.state().end +// } +// fn at_linebreak(&self) -> bool { +// !self.at_end() && is_linebreak(self.peek_char()) +// } +// fn at_boundary bool>(&self, mut word_checker: F) -> bool { +// if self.at_beginning() && self.at_end() { +// return false; +// } +// let that = !self.at_beginning() && word_checker(self.back_peek_char()); +// let this = !self.at_end() && word_checker(self.peek_char()); +// this != that +// } +// fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { +// if self.at_beginning() && self.at_end() { +// return false; +// } +// let that = !self.at_beginning() && word_checker(self.back_peek_char()); +// let this = !self.at_end() && word_checker(self.peek_char()); +// this == that +// } + +// fn can_success(&self) -> bool { +// if !self.ctx().toplevel { +// return true; +// } +// if self.state().match_all && !self.at_end() { +// return false; +// } +// if self.state().must_advance && self.ctx().string_position == self.state().start { +// return false; +// } +// true +// } + +// fn success(&mut self) { +// self.ctx_mut().has_matched = Some(true); +// } + +// fn failure(&mut self) { +// self.ctx_mut().has_matched = Some(false); +// } +// } + +// struct StateContext<'a, S: StrDrive<'a>> { +// state: State<'a, S>, +// ctx: MatchContext, +// next_ctx: Option, +// } + +// impl<'a, S: StrDrive<'a>> ContextDrive<'a, S> for StateContext<'a, S> { +// fn ctx(&self) -> &MatchContext { +// &self.ctx +// } +// fn ctx_mut(&mut self) -> &mut MatchContext { +// &mut self.ctx +// } +// fn state(&self) -> &State<'a, S> { +// &self.state +// } +// } + +// impl StateContext<'_> { +// fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx(self.peek_code(peek) as usize + 1, handler) +// } +// fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx_at(self.ctx.code_position + offset, handler) +// } +// fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx = Some(MatchContext { +// code_position, +// has_matched: None, +// handler: None, +// ..self.ctx +// }); +// self.ctx.handler = Some(handler); +// self.next_ctx.as_mut().unwrap() +// } + +// fn sync_string_position(&mut self) { +// self.state.string_position = self.ctx.string_position; +// } +// } + +// struct StateRefContext<'a> { +// entity: &'a StateContext<'a>, +// ctx: MatchContext, +// } + +// impl ContextDrive for StateRefContext<'_> { +// fn ctx(&self) -> &MatchContext { +// &self.ctx +// } +// fn ctx_mut(&mut self) -> &mut MatchContext { +// &mut self.ctx +// } +// fn state(&self) -> &State { +// &self.entity.state +// } +// } fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1074,77 +1135,77 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - let (group_start, group_end) = match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => (start, end), - _ => { - return drive.failure(); - } - }; - - let mut wdrive = StateRefContext { - entity: drive, - ctx: drive.ctx, - }; - let mut gdrive = StateRefContext { - entity: drive, - ctx: MatchContext { - string_position: group_start, - // TODO: cache the offset - string_offset: drive.state.string.offset(0, group_start), - ..drive.ctx - }, - }; - - for _ in group_start..group_end { - if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { - return drive.failure(); - } - wdrive.skip_char(1); - gdrive.skip_char(1); - } - - let position = wdrive.ctx.string_position; - let offset = wdrive.ctx.string_offset; - drive.skip_code(2); - drive.ctx.string_position = position; - drive.ctx.string_offset = offset; -} - -fn general_op_literal bool>(drive: &mut StateContext, f: F) { - if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code(2); - drive.skip_char(1); - } -} - -fn general_op_in bool>(drive: &mut StateContext, f: F) { - if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code_from(1); - drive.skip_char(1); - } -} - -fn at(drive: &StateContext, atcode: SreAtCode) -> bool { - match atcode { - SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), - SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), - SreAtCode::BOUNDARY => drive.at_boundary(is_word), - SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), - SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), - SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), - SreAtCode::END_STRING => drive.at_end(), - SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), - SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), - } -} +// fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { +// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); +// let (group_start, group_end) = match (group_start, group_end) { +// (Some(start), Some(end)) if start <= end => (start, end), +// _ => { +// return drive.failure(); +// } +// }; + +// let mut wdrive = StateRefContext { +// entity: drive, +// ctx: drive.ctx, +// }; +// let mut gdrive = StateRefContext { +// entity: drive, +// ctx: MatchContext { +// string_position: group_start, +// // TODO: cache the offset +// string_offset: drive.state.string.offset(0, group_start), +// ..drive.ctx +// }, +// }; + +// for _ in group_start..group_end { +// if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { +// return drive.failure(); +// } +// wdrive.skip_char(1); +// gdrive.skip_char(1); +// } + +// let position = wdrive.ctx.string_position; +// let offset = wdrive.ctx.string_offset; +// drive.skip_code(2); +// drive.ctx.string_position = position; +// drive.ctx.string_offset = offset; +// } + +// fn general_op_literal bool>(drive: &mut StateContext, f: F) { +// if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code(2); +// drive.skip_char(1); +// } +// } + +// fn general_op_in bool>(drive: &mut StateContext, f: F) { +// if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code_from(1); +// drive.skip_char(1); +// } +// } + +// fn at(drive: &StateContext, atcode: SreAtCode) -> bool { +// match atcode { +// SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), +// SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), +// SreAtCode::BOUNDARY => drive.at_boundary(is_word), +// SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), +// SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), +// SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), +// SreAtCode::END_STRING => drive.at_end(), +// SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), +// SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), +// SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), +// SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), +// } +// } fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { @@ -1262,95 +1323,95 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -/* General case */ -fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { - let mut count = 0; - let max_count = std::cmp::min(max_count, drive.remaining_chars()); - - let save_ctx = drive.ctx; - drive.skip_code(4); - let reset_position = drive.ctx.code_position; - - while count < max_count { - drive.ctx.code_position = reset_position; - let code = drive.peek_code(0); - let code = SreOpcode::try_from(code).unwrap(); - dispatch(code, drive, stacks); - if drive.ctx.has_matched == Some(false) { - break; - } - count += 1; - } - drive.ctx = save_ctx; - count -} - -fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { - let save_ctx = drive.ctx; - let max_count = std::cmp::min(max_count, drive.remaining_chars()); - let end = drive.ctx.string_position + max_count; - let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); - - match opcode { - SreOpcode::ANY => { - while !drive.ctx.string_position < end && !drive.at_linebreak() { - drive.skip_char(1); - } - } - SreOpcode::ANY_ALL => { - drive.skip_char(max_count); - } - SreOpcode::IN => { - while !drive.ctx.string_position < end - && charset(&drive.pattern()[2..], drive.peek_char()) - { - drive.skip_char(1); - } - } - SreOpcode::LITERAL => { - general_count_literal(drive, end, |code, c| code == c as u32); - } - SreOpcode::NOT_LITERAL => { - general_count_literal(drive, end, |code, c| code != c as u32); - } - SreOpcode::LITERAL_IGNORE => { - general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); - } - SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); - } - SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(drive, end, char_loc_ignore); - } - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); - } - _ => { - return general_count(drive, stacks, max_count); - } - } - - let count = drive.ctx.string_position - drive.state.string_position; - drive.ctx = save_ctx; - count -} - -fn general_count_literal bool>( - drive: &mut StateContext, - end: usize, - mut f: F, -) { - let ch = drive.peek_code(1); - while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { - drive.skip_char(1); - } -} +// /* General case */ +// fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +// let mut count = 0; +// let max_count = std::cmp::min(max_count, drive.remaining_chars()); + +// let save_ctx = drive.ctx; +// drive.skip_code(4); +// let reset_position = drive.ctx.code_position; + +// while count < max_count { +// drive.ctx.code_position = reset_position; +// let code = drive.peek_code(0); +// let code = SreOpcode::try_from(code).unwrap(); +// dispatch(code, drive, stacks); +// if drive.ctx.has_matched == Some(false) { +// break; +// } +// count += 1; +// } +// drive.ctx = save_ctx; +// count +// } + +// fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +// let save_ctx = drive.ctx; +// let max_count = std::cmp::min(max_count, drive.remaining_chars()); +// let end = drive.ctx.string_position + max_count; +// let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); + +// match opcode { +// SreOpcode::ANY => { +// while !drive.ctx.string_position < end && !drive.at_linebreak() { +// drive.skip_char(1); +// } +// } +// SreOpcode::ANY_ALL => { +// drive.skip_char(max_count); +// } +// SreOpcode::IN => { +// while !drive.ctx.string_position < end +// && charset(&drive.pattern()[2..], drive.peek_char()) +// { +// drive.skip_char(1); +// } +// } +// SreOpcode::LITERAL => { +// general_count_literal(drive, end, |code, c| code == c as u32); +// } +// SreOpcode::NOT_LITERAL => { +// general_count_literal(drive, end, |code, c| code != c as u32); +// } +// SreOpcode::LITERAL_IGNORE => { +// general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); +// } +// SreOpcode::NOT_LITERAL_IGNORE => { +// general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); +// } +// SreOpcode::LITERAL_LOC_IGNORE => { +// general_count_literal(drive, end, char_loc_ignore); +// } +// SreOpcode::NOT_LITERAL_LOC_IGNORE => { +// general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); +// } +// SreOpcode::LITERAL_UNI_IGNORE => { +// general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); +// } +// SreOpcode::NOT_LITERAL_UNI_IGNORE => { +// general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); +// } +// _ => { +// return general_count(drive, stacks, max_count); +// } +// } + +// let count = drive.ctx.string_position - drive.state.string_position; +// drive.ctx = save_ctx; +// count +// } + +// fn general_count_literal bool>( +// drive: &mut StateContext, +// end: usize, +// mut f: F, +// ) { +// let ch = drive.peek_code(1); +// while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { +// drive.skip_char(1); +// } +// } fn is_word(ch: u32) -> bool { ch == '_' as u32 From 34bde45a2c0906a3a3e03dca79a4426b4cf57655 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 1 Aug 2022 22:19:49 +0200 Subject: [PATCH 55/95] pass compile --- benches/benches.rs | 22 +- src/engine.rs | 1802 ++++++++++++++++++++------------------------ tests/tests.rs | 20 +- 3 files changed, 821 insertions(+), 1023 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index b86a592967..d000ceb62e 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -11,12 +11,12 @@ pub struct Pattern { } impl Pattern { - pub fn state<'a>( + pub fn state<'a, S: engine::StrDrive>( &self, - string: impl Into>, + string: S, range: std::ops::Range, - ) -> engine::State<'a> { - engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + ) -> engine::State<'a, S> { + engine::State::new(string, range.start, range.end, self.flags, self.code) } } #[bench] @@ -84,28 +84,28 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { let mut state = p.state(s.clone(), 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); state = p.state(s.clone(), 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s.clone(), 0..usize::MAX); state.match_all = true; - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); state = p.state(s2.as_str(), 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..10000 + s.len()); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..10000 + s.len()); state.match_all = true; - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); } }) diff --git a/src/engine.rs b/src/engine.rs index 8865eb6a39..b0717e1671 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -19,21 +19,45 @@ pub struct State<'a, S: StrDrive> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec>, - // branch_stack: Vec, - // min_repeat_one_stack: Vec, - // repeat_one_stack: Vec, - // repeat_stack: Vec, - // min_until_stack: Vec, - // max_until_stack: Vec, - // _stacks: Option>, + repeat_stack: Vec, pub string_position: usize, - popped_context: Option>, next_context: Option>, + popped_has_matched: bool, pub has_matched: bool, pub match_all: bool, pub must_advance: bool, } +macro_rules! next_ctx { + (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { + next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) + }; + (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { + next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) + }; + (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => { + {$state.next_context.insert(MatchContext { + code_position: $position, + has_matched: None, + handler: Some($handler), + ..*$ctx + })} + }; +} + +macro_rules! mark { + (push, $state:expr) => { + $state + .marks_stack + .push(($state.marks.clone(), $state.lastindex)) + }; + (pop, $state:expr) => { + let (marks, lastindex) = $state.marks_stack.pop().unwrap(); + $state.marks = marks; + $state.lastindex = lastindex; + }; +} + impl<'a, S: StrDrive> State<'a, S> { pub fn new( string: S, @@ -54,16 +78,10 @@ impl<'a, S: StrDrive> State<'a, S> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - // branch_stack: Vec::new(), - // min_repeat_one_stack: Vec::new(), - // repeat_one_stack: Vec::new(), - // repeat_stack: Vec::new(), - // min_until_stack: Vec::new(), - // max_until_stack: Vec::new(), - // _stacks: Default::default(), + repeat_stack: Vec::new(), string_position: start, - popped_context: None, next_context: None, + popped_has_matched: false, has_matched: false, match_all: false, must_advance: false, @@ -75,15 +93,10 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - // self.branch_stack.clear(); - // self.min_repeat_one_stack.clear(); - // self.repeat_one_stack.clear(); - // self.repeat_stack.clear(); - // self.min_until_stack.clear(); - // self.max_until_stack.clear(); + self.repeat_stack.clear(); self.string_position = self.start; - self.popped_context = None; self.next_context = None; + self.popped_has_matched = false; self.has_matched = false; } @@ -104,14 +117,14 @@ impl<'a, S: StrDrive> State<'a, S> { (None, None) } } - fn marks_push(&mut self) { - self.marks_stack.push((self.marks.clone(), self.lastindex)); - } - fn marks_pop(&mut self) { - let (marks, lastindex) = self.marks_stack.pop().unwrap(); - self.marks = marks; - self.lastindex = lastindex; - } + // fn marks_push(&mut self) { + // self.marks_stack.push((self.marks.clone(), self.lastindex)); + // } + // fn marks_pop(&mut self) { + // let (marks, lastindex) = self.marks_stack.pop().unwrap(); + // self.marks = marks; + // self.lastindex = lastindex; + // } fn marks_pop_keep(&mut self) { let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); self.marks = marks; @@ -121,46 +134,31 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self) { + fn _match(&mut self) { while let Some(mut ctx) = self.context_stack.pop() { - // let mut drive = StateContext { - // state: self, - // ctx, - // next_ctx: None, - // }; - // let mut state = self; - - if let Some(handler) = ctx.handler { + if let Some(handler) = ctx.handler.take() { handler(self, &mut ctx); } else if ctx.remaining_codes(self) > 0 { let code = ctx.peek_code(self, 0); let code = SreOpcode::try_from(code).unwrap(); - self.dispatch(code, &mut ctx); + dispatch(self, &mut ctx, code); } else { ctx.failure(); } - // let StateContext { - // mut state, - // ctx, - // next_ctx, - // } = drive; - - if ctx.has_matched.is_some() { - self.popped_context = Some(ctx); + if let Some(has_matched) = ctx.has_matched { + self.popped_has_matched = has_matched; } else { self.context_stack.push(ctx); if let Some(next_ctx) = self.next_context.take() { self.context_stack.push(next_ctx); } } - // self = state } - self.has_matched = self.popped_context.take().unwrap().has_matched == Some(true); - // self + self.has_matched = self.popped_has_matched; } - pub fn pymatch(mut self) -> Self { + pub fn pymatch(&mut self) { let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), @@ -169,18 +167,18 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: true, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); - self } - pub fn search(mut self) -> Self { + pub fn search(&mut self) { // TODO: optimize by op info and skip prefix if self.start > self.end { - return self; + return; } let mut start_offset = self.string.offset(0, self.start); @@ -193,6 +191,7 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: true, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); @@ -211,643 +210,503 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: false, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); } + } +} + +fn dispatch<'a, S: StrDrive>( + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + opcode: SreOpcode, +) { + match opcode { + SreOpcode::FAILURE => { + ctx.failure(); + } + SreOpcode::SUCCESS => { + if ctx.can_success(state) { + state.string_position = ctx.string_position; + ctx.success(); + } else { + ctx.failure(); + } + } + SreOpcode::ANY => { + if ctx.at_end(state) || ctx.at_linebreak(state) { + ctx.failure(); + } else { + ctx.skip_code(1); + ctx.skip_char(state, 1); + } + } + SreOpcode::ANY_ALL => { + if ctx.at_end(state) { + ctx.failure(); + } else { + ctx.skip_code(1); + ctx.skip_char(state, 1); + } + } + /* assert subpattern */ + /* */ + SreOpcode::ASSERT => op_assert(state, ctx), + SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), + SreOpcode::AT => { + let atcode = SreAtCode::try_from(ctx.peek_code(state, 1)).unwrap(); + if at(state, ctx, atcode) { + ctx.skip_code(2); + } else { + ctx.failure(); + } + } + SreOpcode::BRANCH => op_branch(state, ctx), + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(ctx.peek_code(state, 1)).unwrap(); + if ctx.at_end(state) || !category(catcode, ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code(2); + ctx.skip_char(state, 1); + } + } + SreOpcode::IN => general_op_in(state, ctx, charset), + SreOpcode::IN_IGNORE => general_op_in(state, ctx, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN_UNI_IGNORE => { + general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) + } + SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), + SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(state, 1), + SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => { + general_op_literal(state, ctx, |code, c| code == lower_ascii(c)) + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal(state, ctx, |code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal(state, ctx, |code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal(state, ctx, |code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(state, ctx, char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal(state, ctx, |code, c| !char_loc_ignore(code, c)) + } + SreOpcode::MARK => { + state.set_mark(ctx.peek_code(state, 1) as usize, ctx.string_position); + ctx.skip_code(2); + } + SreOpcode::MAX_UNTIL => op_max_until(state, ctx), + SreOpcode::MIN_UNTIL => op_min_until(state, ctx), + SreOpcode::REPEAT => op_repeat(state, ctx), + SreOpcode::REPEAT_ONE => op_repeat_one(state, ctx), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(state, ctx), + SreOpcode::GROUPREF => general_op_groupref(state, ctx, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(state, ctx, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(state, ctx, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(state, ctx, lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + ctx.skip_code(3); + } + _ => ctx.skip_code_from(state, 2), + } + } + _ => unreachable!("unexpected opcode"), + } +} - self +/* assert subpattern */ +/* */ +fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let back = ctx.peek_code(state, 2) as usize; + if ctx.string_position < back { + return ctx.failure(); } - fn dispatch(&mut self, opcode: SreOpcode, ctx: &mut MatchContext<'a, S>) { - match opcode { - SreOpcode::FAILURE => { - ctx.has_matched = Some(false); + // let next_ctx = state.next_ctx(ctx, 3, |state, ctx| { + let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.skip_code_from(state, 1); + } else { + ctx.failure(); + } + }); + next_ctx.back_skip_char(&state.string, back); + state.string_position = next_ctx.string_position; + next_ctx.toplevel = false; +} + +/* assert not subpattern */ +/* */ +fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let back = ctx.peek_code(state, 2) as usize; + + if ctx.string_position < back { + return ctx.skip_code_from(state, 1); + } + + let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + } + }); + next_ctx.back_skip_char(&state.string, back); + state.string_position = next_ctx.string_position; + next_ctx.toplevel = false; +} + +// alternation +// <0=skip> code ... +fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + // state.marks_push(); + mark!(push, state); + + ctx.count = 1; + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let branch_offset = ctx.count as usize; + let next_length = ctx.peek_code(state, branch_offset) as isize; + if next_length == 0 { + state.marks_pop_discard(); + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + ctx.count += next_length; + next_ctx!(offset branch_offset + 1, state, ctx, callback); + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +/* <1=min> <2=max> item tail */ +fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as usize; + // let max_count = ctx.peek_code(state, 3) as usize; + + if ctx.remaining_chars(state) < min_count { + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + ctx.count = if min_count == 0 { + 0 + } else { + let count = _count(state, ctx, min_count); + if count < min_count { + return ctx.failure(); + } + ctx.skip_char(state, count); + count as isize + }; + + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + // tail is empty. we're finished + state.string_position = ctx.string_position; + return ctx.success(); + } + + mark!(push, state); + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let max_count = ctx.peek_code(state, 3) as usize; + + if max_count == MAXREPEAT || ctx.count as usize <= max_count { + state.string_position = ctx.string_position; + next_ctx!(from 1, state, ctx, callback); + } else { + state.marks_pop_discard(); + ctx.failure(); + } + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + + state.string_position = ctx.string_position; + + if _count(state, ctx, 1) == 0 { + state.marks_pop_discard(); + return ctx.failure(); + } + + ctx.skip_char(state, 1); + ctx.count += 1; + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +/* match repeated sequence (maximizing regexp) */ +/* this operator only works if the repeated item is +exactly one character wide, and we're not already +collecting backtracking points. for other cases, +use the MAX_REPEAT operator */ +/* <1=min> <2=max> item tail */ +fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as usize; + let max_count = ctx.peek_code(state, 3) as usize; + + if ctx.remaining_chars(state) < min_count { + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + let count = _count(state, ctx, max_count); + ctx.skip_char(state, count); + if count < min_count { + return ctx.failure(); + } + + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + // tail is empty. we're finished + state.string_position = ctx.string_position; + return ctx.success(); + } + + mark!(push, state); + ctx.count = count as isize; + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as isize; + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::LITERAL as u32 { + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + let c = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 2); + while ctx.at_end(state) || ctx.peek_char(state) != c { + if ctx.count <= min_count { + state.marks_pop_discard(); + return ctx.failure(); + } + ctx.back_skip_char(&state.string, 1); + ctx.count -= 1; + } + } + + state.string_position = ctx.string_position; + + // General case: backtracking + next_ctx!(from 1, state, ctx, callback); + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + + let min_count = ctx.peek_code(state, 2) as isize; + + if ctx.count <= min_count { + state.marks_pop_discard(); + return ctx.failure(); + } + + ctx.back_skip_char(&state.string, 1); + ctx.count -= 1; + + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +#[derive(Debug, Clone, Copy)] +struct RepeatContext { + count: isize, + min_count: usize, + max_count: usize, + code_position: usize, + last_position: usize, + prev_id: usize, +} + +/* create repeat context. all the hard work is done +by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +/* <1=min> <2=max> item tail */ +fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = RepeatContext { + count: -1, + min_count: ctx.peek_code(state, 2) as usize, + max_count: ctx.peek_code(state, 3) as usize, + code_position: ctx.code_position, + last_position: std::usize::MAX, + prev_id: ctx.repeat_ctx_id, + }; + + state.repeat_stack.push(repeat_ctx); + + state.string_position = ctx.string_position; + + let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + ctx.has_matched = Some(state.popped_has_matched); + state.repeat_stack.pop(); + }); + next_ctx.repeat_ctx_id = state.repeat_stack.len() - 1; +} + +/* minimizing repeat */ +fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = state.repeat_stack.last_mut().unwrap(); + + state.string_position = ctx.string_position; + + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + return; + } + + mark!(push, state); + + ctx.count = ctx.repeat_ctx_id as isize; + + // see if the tail matches + let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + if state.popped_has_matched { + return ctx.success(); + } + + ctx.repeat_ctx_id = ctx.count as usize; + + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + + state.string_position = ctx.string_position; + + mark!(pop, state); + + // match more until tail matches + + if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + || state.string_position == repeat_ctx.last_position + { + repeat_ctx.count -= 1; + return ctx.failure(); + } + + /* zero-width match protection */ + repeat_ctx.last_position = state.string_position; + + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + }); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +} + +/* maximizing repeat */ +fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + + state.string_position = ctx.string_position; + + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + return; + } + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) + && state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + mark!(push, state); + + ctx.count = repeat_ctx.last_position as isize; + repeat_ctx.last_position = state.string_position; + + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + let save_last_position = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + + if state.popped_has_matched { + state.marks_pop_discard(); + return ctx.success(); } - SreOpcode::SUCCESS => todo!(), - SreOpcode::ANY => todo!(), - SreOpcode::ANY_ALL => todo!(), - SreOpcode::ASSERT => todo!(), - SreOpcode::ASSERT_NOT => todo!(), - SreOpcode::AT => todo!(), - SreOpcode::BRANCH => todo!(), - SreOpcode::CALL => todo!(), - SreOpcode::CATEGORY => todo!(), - SreOpcode::CHARSET => todo!(), - SreOpcode::BIGCHARSET => todo!(), - SreOpcode::GROUPREF => todo!(), - SreOpcode::GROUPREF_EXISTS => todo!(), - SreOpcode::IN => todo!(), - SreOpcode::INFO => todo!(), - SreOpcode::JUMP => todo!(), - SreOpcode::LITERAL => todo!(), - SreOpcode::MARK => todo!(), - SreOpcode::MAX_UNTIL => todo!(), - SreOpcode::MIN_UNTIL => todo!(), - SreOpcode::NOT_LITERAL => todo!(), - SreOpcode::NEGATE => todo!(), - SreOpcode::RANGE => todo!(), - SreOpcode::REPEAT => todo!(), - SreOpcode::REPEAT_ONE => todo!(), - SreOpcode::SUBPATTERN => todo!(), - SreOpcode::MIN_REPEAT_ONE => todo!(), - SreOpcode::GROUPREF_IGNORE => todo!(), - SreOpcode::IN_IGNORE => todo!(), - SreOpcode::LITERAL_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_IGNORE => todo!(), - SreOpcode::GROUPREF_LOC_IGNORE => todo!(), - SreOpcode::IN_LOC_IGNORE => todo!(), - SreOpcode::LITERAL_LOC_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_LOC_IGNORE => todo!(), - SreOpcode::GROUPREF_UNI_IGNORE => todo!(), - SreOpcode::IN_UNI_IGNORE => todo!(), - SreOpcode::LITERAL_UNI_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_UNI_IGNORE => todo!(), - SreOpcode::RANGE_UNI_IGNORE => todo!(), + + mark!(pop, state); + repeat_ctx.count -= 1; + + state.string_position = ctx.string_position; + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + }); + return; + } + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + + fn tail_callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + ctx.success(); + } else { + state.string_position = ctx.string_position; + ctx.failure(); } } } -// fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { -// match opcode { -// SreOpcode::FAILURE => { -// drive.failure(); -// } -// SreOpcode::SUCCESS => { -// drive.ctx.has_matched = Some(drive.can_success()); -// if drive.ctx.has_matched == Some(true) { -// drive.state.string_position = drive.ctx.string_position; -// } -// } -// SreOpcode::ANY => { -// if drive.at_end() || drive.at_linebreak() { -// drive.failure(); -// } else { -// drive.skip_code(1); -// drive.skip_char(1); -// } -// } -// SreOpcode::ANY_ALL => { -// if drive.at_end() { -// drive.failure(); -// } else { -// drive.skip_code(1); -// drive.skip_char(1); -// } -// } -// SreOpcode::ASSERT => op_assert(drive), -// SreOpcode::ASSERT_NOT => op_assert_not(drive), -// SreOpcode::AT => { -// let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); -// if at(drive, atcode) { -// drive.skip_code(2); -// } else { -// drive.failure(); -// } -// } -// SreOpcode::BRANCH => op_branch(drive, stacks), -// SreOpcode::CATEGORY => { -// let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); -// if drive.at_end() || !category(catcode, drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code(2); -// drive.skip_char(1); -// } -// } -// SreOpcode::IN => general_op_in(drive, charset), -// SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), -// SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), -// SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), -// SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), -// SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), -// SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), -// SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), -// SreOpcode::NOT_LITERAL_IGNORE => { -// general_op_literal(drive, |code, c| code != lower_ascii(c)) -// } -// SreOpcode::LITERAL_UNI_IGNORE => { -// general_op_literal(drive, |code, c| code == lower_unicode(c)) -// } -// SreOpcode::NOT_LITERAL_UNI_IGNORE => { -// general_op_literal(drive, |code, c| code != lower_unicode(c)) -// } -// SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), -// SreOpcode::NOT_LITERAL_LOC_IGNORE => { -// general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) -// } -// SreOpcode::MARK => { -// drive -// .state -// .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); -// drive.skip_code(2); -// } -// SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), -// SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), -// SreOpcode::REPEAT => op_repeat(drive, stacks), -// SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), -// SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), -// SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), -// SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), -// SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), -// SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), -// SreOpcode::GROUPREF_EXISTS => { -// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); -// match (group_start, group_end) { -// (Some(start), Some(end)) if start <= end => { -// drive.skip_code(3); -// } -// _ => drive.skip_code_from(2), -// } -// } -// _ => unreachable!("unexpected opcode"), -// } -// } - -// /* assert subpattern */ -// /* */ -// fn op_assert(drive: &mut StateContext) { -// let back = drive.peek_code(2) as usize; - -// if drive.ctx.string_position < back { -// return drive.failure(); -// } - -// let offset = drive -// .state -// .string -// .back_offset(drive.ctx.string_offset, back); -// let position = drive.ctx.string_position - back; - -// drive.state.string_position = position; - -// let next_ctx = drive.next_ctx(3, |drive, _| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.ctx.handler = None; -// drive.skip_code_from(1); -// } else { -// drive.failure(); -// } -// }); -// next_ctx.string_position = position; -// next_ctx.string_offset = offset; -// next_ctx.toplevel = false; -// } - -// /* assert not subpattern */ -// /* */ -// fn op_assert_not(drive: &mut StateContext) { -// let back = drive.peek_code(2) as usize; - -// if drive.ctx.string_position < back { -// return drive.skip_code_from(1); -// } - -// let offset = drive -// .state -// .string -// .back_offset(drive.ctx.string_offset, back); -// let position = drive.ctx.string_position - back; - -// drive.state.string_position = position; - -// let next_ctx = drive.next_ctx(3, |drive, _| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.failure(); -// } else { -// drive.ctx.handler = None; -// drive.skip_code_from(1); -// } -// }); -// next_ctx.string_position = position; -// next_ctx.string_offset = offset; -// next_ctx.toplevel = false; -// } - -// #[derive(Debug)] -// struct BranchContext { -// branch_offset: usize, -// } - -// // alternation -// // <0=skip> code ... -// fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { -// drive.state.marks_push(); -// stacks.branch.push(BranchContext { branch_offset: 1 }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let branch_offset = stacks.branch_last().branch_offset; -// let next_length = drive.peek_code(branch_offset) as usize; -// if next_length == 0 { -// drive.state.marks_pop_discard(); -// stacks.branch.pop(); -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// stacks.branch_last().branch_offset += next_length; -// drive.next_ctx(branch_offset + 1, callback); -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.branch.pop(); -// return drive.success(); -// } -// drive.state.marks_pop_keep(); -// drive.ctx.handler = Some(create_context) -// } -// } - -// #[derive(Debug, Copy, Clone)] -// struct MinRepeatOneContext { -// count: usize, -// max_count: usize, -// } - -// /* <1=min> <2=max> item tail */ -// fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { -// let min_count = drive.peek_code(2) as usize; -// let max_count = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < min_count { -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// let count = if min_count == 0 { -// 0 -// } else { -// let count = _count(drive, stacks, min_count); -// if count < min_count { -// return drive.failure(); -// } -// drive.skip_char(count); -// count -// }; - -// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); -// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { -// // tail is empty. we're finished -// drive.sync_string_position(); -// return drive.success(); -// } - -// drive.state.marks_push(); -// stacks -// .min_repeat_one -// .push(MinRepeatOneContext { count, max_count }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); - -// if max_count == MAXREPEAT || count <= max_count { -// drive.sync_string_position(); -// drive.next_ctx_from(1, callback); -// } else { -// drive.state.marks_pop_discard(); -// stacks.min_repeat_one.pop(); -// drive.failure(); -// } -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.min_repeat_one.pop(); -// return drive.success(); -// } - -// drive.sync_string_position(); - -// if _count(drive, stacks, 1) == 0 { -// drive.state.marks_pop_discard(); -// stacks.min_repeat_one.pop(); -// return drive.failure(); -// } - -// drive.skip_char(1); -// stacks.min_repeat_one_last().count += 1; -// drive.state.marks_pop_keep(); -// create_context(drive, stacks); -// } -// } - -// #[derive(Debug, Copy, Clone)] -// struct RepeatOneContext { -// count: usize, -// min_count: usize, -// following_literal: Option, -// } - -// /* match repeated sequence (maximizing regexp) */ - -// /* this operator only works if the repeated item is -// exactly one character wide, and we're not already -// collecting backtracking points. for other cases, -// use the MAX_REPEAT operator */ - -// /* <1=min> <2=max> item tail */ -// fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { -// let min_count = drive.peek_code(2) as usize; -// let max_count = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < min_count { -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// let count = _count(drive, stacks, max_count); -// drive.skip_char(count); -// if count < min_count { -// return drive.failure(); -// } - -// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); -// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { -// // tail is empty. we're finished -// drive.sync_string_position(); -// return drive.success(); -// } - -// // Special case: Tail starts with a literal. Skip positions where -// // the rest of the pattern cannot possibly match. -// let following_literal = (next_code == SreOpcode::LITERAL as u32) -// .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); - -// drive.state.marks_push(); -// stacks.repeat_one.push(RepeatOneContext { -// count, -// min_count, -// following_literal, -// }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let RepeatOneContext { -// mut count, -// min_count, -// following_literal, -// } = *stacks.repeat_one_last(); - -// if let Some(c) = following_literal { -// while drive.at_end() || drive.peek_char() != c { -// if count <= min_count { -// drive.state.marks_pop_discard(); -// stacks.repeat_one.pop(); -// return drive.failure(); -// } -// drive.back_skip_char(1); -// count -= 1; -// } -// } -// stacks.repeat_one_last().count = count; - -// drive.sync_string_position(); - -// // General case: backtracking -// drive.next_ctx_from(1, callback); -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.repeat_one.pop(); -// return drive.success(); -// } - -// let RepeatOneContext { -// count, -// min_count, -// following_literal: _, -// } = stacks.repeat_one_last(); - -// if count <= min_count { -// drive.state.marks_pop_discard(); -// stacks.repeat_one.pop(); -// return drive.failure(); -// } - -// drive.back_skip_char(1); -// *count -= 1; - -// drive.state.marks_pop_keep(); -// create_context(drive, stacks); -// } -// } - -// #[derive(Debug, Clone, Copy)] -// struct RepeatContext { -// count: isize, -// min_count: usize, -// max_count: usize, -// code_position: usize, -// last_position: usize, -// prev_id: usize, -// } - -// /* create repeat context. all the hard work is done -// by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -// /* <1=min> <2=max> item tail */ -// fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = RepeatContext { -// count: -1, -// min_count: drive.peek_code(2) as usize, -// max_count: drive.peek_code(3) as usize, -// code_position: drive.ctx.code_position, -// last_position: std::usize::MAX, -// prev_id: drive.ctx.repeat_ctx_id, -// }; - -// stacks.repeat.push(repeat_ctx); - -// drive.sync_string_position(); - -// let next_ctx = drive.next_ctx_from(1, |drive, stacks| { -// drive.ctx.has_matched = drive.popped_ctx().has_matched; -// stacks.repeat.pop(); -// }); -// next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; -// } - -// #[derive(Debug, Clone, Copy)] -// struct MinUntilContext { -// save_repeat_ctx_id: usize, -// } - -// /* minimizing repeat */ -// fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = stacks.repeat.last_mut().unwrap(); - -// drive.sync_string_position(); - -// repeat_ctx.count += 1; - -// if (repeat_ctx.count as usize) < repeat_ctx.min_count { -// // not enough matches -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// drive.failure(); -// } -// }); -// return; -// } - -// drive.state.marks_push(); - -// stacks.min_until.push(MinUntilContext { -// save_repeat_ctx_id: drive.ctx.repeat_ctx_id, -// }); - -// // see if the tail matches -// let next_ctx = drive.next_ctx(1, |drive, stacks| { -// drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - -// if drive.popped_ctx().has_matched == Some(true) { -// return drive.success(); -// } - -// drive.sync_string_position(); - -// drive.state.marks_pop(); - -// // match more until tail matches - -// if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT -// || drive.state.string_position == repeat_ctx.last_position -// { -// repeat_ctx.count -= 1; -// return drive.failure(); -// } - -// /* zero-width match protection */ -// repeat_ctx.last_position = drive.state.string_position; - -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// drive.failure(); -// } -// }); -// }); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -// } - -// #[derive(Debug, Clone, Copy)] -// struct MaxUntilContext { -// save_last_position: usize, -// } - -// /* maximizing repeat */ -// fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - -// drive.sync_string_position(); - -// repeat_ctx.count += 1; - -// if (repeat_ctx.count as usize) < repeat_ctx.min_count { -// // not enough matches -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// drive.failure(); -// } -// }); -// return; -// } - -// stacks.max_until.push(MaxUntilContext { -// save_last_position: repeat_ctx.last_position, -// }); - -// if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) -// && drive.state.string_position != repeat_ctx.last_position -// { -// /* we may have enough matches, but if we can -// match another item, do so */ -// repeat_ctx.last_position = drive.state.string_position; - -// drive.state.marks_push(); - -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// let save_last_position = stacks.max_until.pop().unwrap().save_last_position; -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; -// repeat_ctx.last_position = save_last_position; - -// if drive.popped_ctx().has_matched == Some(true) { -// drive.state.marks_pop_discard(); -// return drive.success(); -// } - -// drive.state.marks_pop(); -// repeat_ctx.count -= 1; -// drive.sync_string_position(); - -// /* cannot match more repeated items here. make sure the -// tail matches */ -// let next_ctx = drive.next_ctx(1, tail_callback); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -// }); -// return; -// } - -// /* cannot match more repeated items here. make sure the -// tail matches */ -// let next_ctx = drive.next_ctx(1, tail_callback); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - -// fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// drive.sync_string_position(); -// drive.failure(); -// } -// } -// } - -// #[derive(Debug, Default)] -// struct Stacks { -// } - -// impl Stacks { -// fn clear(&mut self) { -// self.branch.clear(); -// self.min_repeat_one.clear(); -// self.repeat_one.clear(); -// self.repeat.clear(); -// self.min_until.clear(); -// self.max_until.clear(); -// } - -// fn branch_last(&mut self) -> &mut BranchContext { -// self.branch.last_mut().unwrap() -// } -// fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { -// self.min_repeat_one.last_mut().unwrap() -// } -// fn repeat_one_last(&mut self) -> &mut RepeatOneContext { -// self.repeat_one.last_mut().unwrap() -// } -// } - -pub trait StrDrive { +pub trait StrDrive: Copy { fn offset(&self, offset: usize, skip: usize) -> usize; fn count(&self) -> usize; fn peek(&self, offset: usize) -> u32; @@ -855,7 +714,6 @@ pub trait StrDrive { fn back_offset(&self, offset: usize, skip: usize) -> usize; } - impl<'a> StrDrive for &'a str { fn offset(&self, offset: usize, skip: usize) -> usize { self.get(offset..) @@ -934,6 +792,7 @@ struct MatchContext<'a, S: StrDrive> { toplevel: bool, handler: Option, &mut Self)>, repeat_ctx_id: usize, + count: isize, } impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { @@ -945,182 +804,188 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { .field("has_matched", &self.has_matched) .field("toplevel", &self.toplevel) .field("handler", &self.handler.map(|x| x as usize)) + .field("count", &self.count) .finish() } } impl<'a, S: StrDrive> MatchContext<'a, S> { + fn pattern(&self, state: &State<'a, S>) -> &[u32] { + &state.pattern_codes[self.code_position..] + } + fn remaining_codes(&self, state: &State<'a, S>) -> usize { state.pattern_codes.len() - self.code_position } - + + fn remaining_chars(&self, state: &State<'a, S>) -> usize { + state.end - self.string_position + } + + fn peek_char(&self, state: &State<'a, S>) -> u32 { + state.string.peek(self.string_offset) + } + + fn skip_char(&mut self, state: &State<'a, S>, skip: usize) { + self.string_position += skip; + self.string_offset = state.string.offset(self.string_offset, skip); + } + + fn back_peek_char(&self, state: &State<'a, S>) -> u32 { + state.string.back_peek(self.string_offset) + } + + fn back_skip_char(&mut self, string: &S, skip: usize) { + self.string_position -= skip; + self.string_offset = string.back_offset(self.string_offset, skip); + } + fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { state.pattern_codes[self.code_position + peek] } + fn skip_code(&mut self, skip: usize) { + self.code_position += skip; + } + + fn skip_code_from(&mut self, state: &State<'a, S>, peek: usize) { + self.skip_code(self.peek_code(state, peek) as usize + 1); + } + + fn at_beginning(&self) -> bool { + // self.ctx().string_position == self.state().start + self.string_position == 0 + } + + fn at_end(&self, state: &State<'a, S>) -> bool { + self.string_position == state.end + } + + fn at_linebreak(&self, state: &State<'a, S>) -> bool { + !self.at_end(state) && is_linebreak(self.peek_char(state)) + } + + fn at_boundary bool>( + &self, + state: &State<'a, S>, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(state) { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); + let this = !self.at_end(state) && word_checker(self.peek_char(state)); + this != that + } + + fn at_non_boundary bool>( + &self, + state: &State<'a, S>, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(state) { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); + let this = !self.at_end(state) && word_checker(self.peek_char(state)); + this == that + } + + fn can_success(&self, state: &State<'a, S>) -> bool { + if !self.toplevel { + return true; + } + if state.match_all && !self.at_end(state) { + return false; + } + if state.must_advance && self.string_position == state.start { + return false; + } + true + } + + fn success(&mut self) { + self.has_matched = Some(true); + } + fn failure(&mut self) { self.has_matched = Some(false); } } -// trait ContextDrive<'a, T: StrDrive<'a>> { -// fn ctx(&self) -> &MatchContext; -// fn ctx_mut(&mut self) -> &mut MatchContext; -// fn state(&self) -> &State<'a, T>; - -// fn popped_ctx(&self) -> &MatchContext { -// self.state().popped_context.as_ref().unwrap() -// } - -// fn pattern(&self) -> &[u32] { -// &self.state().pattern_codes[self.ctx().code_position..] -// } - -// fn peek_char(&self) -> u32 { -// self.state().string.peek(self.ctx().string_offset) -// } -// fn peek_code(&self, peek: usize) -> u32 { -// self.state().pattern_codes[self.ctx().code_position + peek] -// } - -// fn back_peek_char(&self) -> u32 { -// self.state().string.back_peek(self.ctx().string_offset) -// } -// fn back_skip_char(&mut self, skip_count: usize) { -// self.ctx_mut().string_position -= skip_count; -// self.ctx_mut().string_offset = self -// .state() -// .string -// .back_offset(self.ctx().string_offset, skip_count); -// } - -// fn skip_char(&mut self, skip_count: usize) { -// self.ctx_mut().string_offset = self -// .state() -// .string -// .offset(self.ctx().string_offset, skip_count); -// self.ctx_mut().string_position += skip_count; -// } -// fn skip_code(&mut self, skip_count: usize) { -// self.ctx_mut().code_position += skip_count; -// } -// fn skip_code_from(&mut self, peek: usize) { -// self.skip_code(self.peek_code(peek) as usize + 1); -// } - -// fn remaining_chars(&self) -> usize { -// self.state().end - self.ctx().string_position -// } -// fn remaining_codes(&self) -> usize { -// self.state().pattern_codes.len() - self.ctx().code_position -// } - -// fn at_beginning(&self) -> bool { -// // self.ctx().string_position == self.state().start -// self.ctx().string_position == 0 -// } -// fn at_end(&self) -> bool { -// self.ctx().string_position == self.state().end -// } -// fn at_linebreak(&self) -> bool { -// !self.at_end() && is_linebreak(self.peek_char()) -// } -// fn at_boundary bool>(&self, mut word_checker: F) -> bool { -// if self.at_beginning() && self.at_end() { -// return false; -// } -// let that = !self.at_beginning() && word_checker(self.back_peek_char()); -// let this = !self.at_end() && word_checker(self.peek_char()); -// this != that -// } -// fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { -// if self.at_beginning() && self.at_end() { -// return false; -// } -// let that = !self.at_beginning() && word_checker(self.back_peek_char()); -// let this = !self.at_end() && word_checker(self.peek_char()); -// this == that -// } - -// fn can_success(&self) -> bool { -// if !self.ctx().toplevel { -// return true; -// } -// if self.state().match_all && !self.at_end() { -// return false; -// } -// if self.state().must_advance && self.ctx().string_position == self.state().start { -// return false; -// } -// true -// } - -// fn success(&mut self) { -// self.ctx_mut().has_matched = Some(true); -// } - -// fn failure(&mut self) { -// self.ctx_mut().has_matched = Some(false); -// } -// } - -// struct StateContext<'a, S: StrDrive<'a>> { -// state: State<'a, S>, -// ctx: MatchContext, -// next_ctx: Option, -// } - -// impl<'a, S: StrDrive<'a>> ContextDrive<'a, S> for StateContext<'a, S> { -// fn ctx(&self) -> &MatchContext { -// &self.ctx -// } -// fn ctx_mut(&mut self) -> &mut MatchContext { -// &mut self.ctx -// } -// fn state(&self) -> &State<'a, S> { -// &self.state -// } -// } - -// impl StateContext<'_> { -// fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx(self.peek_code(peek) as usize + 1, handler) -// } -// fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx_at(self.ctx.code_position + offset, handler) -// } -// fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx = Some(MatchContext { -// code_position, -// has_matched: None, -// handler: None, -// ..self.ctx -// }); -// self.ctx.handler = Some(handler); -// self.next_ctx.as_mut().unwrap() -// } - -// fn sync_string_position(&mut self) { -// self.state.string_position = self.ctx.string_position; -// } -// } - -// struct StateRefContext<'a> { -// entity: &'a StateContext<'a>, -// ctx: MatchContext, -// } - -// impl ContextDrive for StateRefContext<'_> { -// fn ctx(&self) -> &MatchContext { -// &self.ctx -// } -// fn ctx_mut(&mut self) -> &mut MatchContext { -// &mut self.ctx -// } -// fn state(&self) -> &State { -// &self.entity.state -// } -// } +fn at<'a, S: StrDrive>(state: &State<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { + match atcode { + SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(state)), + SreAtCode::BOUNDARY => ctx.at_boundary(state, is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(state, is_word), + SreAtCode::END => { + (ctx.remaining_chars(state) == 1 && ctx.at_linebreak(state)) || ctx.at_end(state) + } + SreAtCode::END_LINE => ctx.at_linebreak(state) || ctx.at_end(state), + SreAtCode::END_STRING => ctx.at_end(state), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(state, is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(state, is_loc_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(state, is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(state, is_uni_word), + } +} + +fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + f: F, +) { + if ctx.at_end(state) || !f(ctx.peek_code(state, 1), ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code(2); + ctx.skip_char(state, 1); + } +} + +fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + f: F, +) { + if ctx.at_end(state) || !f(&ctx.pattern(state)[2..], ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + ctx.skip_char(state, 1); + } +} + +fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + mut f: F, +) { + let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => (start, end), + _ => { + return ctx.failure(); + } + }; + + let mut gctx = MatchContext { + string_position: group_start, + string_offset: state.string.offset(0, group_start), + ..*ctx + }; + + for _ in group_start..group_end { + if ctx.at_end(state) || f(ctx.peek_char(state)) != f(gctx.peek_char(state)) { + return ctx.failure(); + } + ctx.skip_char(state, 1); + gctx.skip_char(state, 1); + } + + ctx.skip_code(2); +} fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1135,78 +1000,6 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -// fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { -// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); -// let (group_start, group_end) = match (group_start, group_end) { -// (Some(start), Some(end)) if start <= end => (start, end), -// _ => { -// return drive.failure(); -// } -// }; - -// let mut wdrive = StateRefContext { -// entity: drive, -// ctx: drive.ctx, -// }; -// let mut gdrive = StateRefContext { -// entity: drive, -// ctx: MatchContext { -// string_position: group_start, -// // TODO: cache the offset -// string_offset: drive.state.string.offset(0, group_start), -// ..drive.ctx -// }, -// }; - -// for _ in group_start..group_end { -// if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { -// return drive.failure(); -// } -// wdrive.skip_char(1); -// gdrive.skip_char(1); -// } - -// let position = wdrive.ctx.string_position; -// let offset = wdrive.ctx.string_offset; -// drive.skip_code(2); -// drive.ctx.string_position = position; -// drive.ctx.string_offset = offset; -// } - -// fn general_op_literal bool>(drive: &mut StateContext, f: F) { -// if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code(2); -// drive.skip_char(1); -// } -// } - -// fn general_op_in bool>(drive: &mut StateContext, f: F) { -// if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code_from(1); -// drive.skip_char(1); -// } -// } - -// fn at(drive: &StateContext, atcode: SreAtCode) -> bool { -// match atcode { -// SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), -// SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), -// SreAtCode::BOUNDARY => drive.at_boundary(is_word), -// SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), -// SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), -// SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), -// SreAtCode::END_STRING => drive.at_end(), -// SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), -// SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), -// SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), -// SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), -// } -// } - fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { SreCatCode::DIGIT => is_digit(c), @@ -1323,95 +1116,100 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -// /* General case */ -// fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { -// let mut count = 0; -// let max_count = std::cmp::min(max_count, drive.remaining_chars()); - -// let save_ctx = drive.ctx; -// drive.skip_code(4); -// let reset_position = drive.ctx.code_position; - -// while count < max_count { -// drive.ctx.code_position = reset_position; -// let code = drive.peek_code(0); -// let code = SreOpcode::try_from(code).unwrap(); -// dispatch(code, drive, stacks); -// if drive.ctx.has_matched == Some(false) { -// break; -// } -// count += 1; -// } -// drive.ctx = save_ctx; -// count -// } - -// fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { -// let save_ctx = drive.ctx; -// let max_count = std::cmp::min(max_count, drive.remaining_chars()); -// let end = drive.ctx.string_position + max_count; -// let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); - -// match opcode { -// SreOpcode::ANY => { -// while !drive.ctx.string_position < end && !drive.at_linebreak() { -// drive.skip_char(1); -// } -// } -// SreOpcode::ANY_ALL => { -// drive.skip_char(max_count); -// } -// SreOpcode::IN => { -// while !drive.ctx.string_position < end -// && charset(&drive.pattern()[2..], drive.peek_char()) -// { -// drive.skip_char(1); -// } -// } -// SreOpcode::LITERAL => { -// general_count_literal(drive, end, |code, c| code == c as u32); -// } -// SreOpcode::NOT_LITERAL => { -// general_count_literal(drive, end, |code, c| code != c as u32); -// } -// SreOpcode::LITERAL_IGNORE => { -// general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); -// } -// SreOpcode::NOT_LITERAL_IGNORE => { -// general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); -// } -// SreOpcode::LITERAL_LOC_IGNORE => { -// general_count_literal(drive, end, char_loc_ignore); -// } -// SreOpcode::NOT_LITERAL_LOC_IGNORE => { -// general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); -// } -// SreOpcode::LITERAL_UNI_IGNORE => { -// general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); -// } -// SreOpcode::NOT_LITERAL_UNI_IGNORE => { -// general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); -// } -// _ => { -// return general_count(drive, stacks, max_count); -// } -// } - -// let count = drive.ctx.string_position - drive.state.string_position; -// drive.ctx = save_ctx; -// count -// } - -// fn general_count_literal bool>( -// drive: &mut StateContext, -// end: usize, -// mut f: F, -// ) { -// let ch = drive.peek_code(1); -// while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { -// drive.skip_char(1); -// } -// } +fn _count<'a, S: StrDrive>( + state: &mut State<'a, S>, + ctx: &MatchContext<'a, S>, + max_count: usize, +) -> usize { + let mut ctx = *ctx; + let max_count = std::cmp::min(max_count, ctx.remaining_chars(state)); + let end = ctx.string_position + max_count; + let opcode = SreOpcode::try_from(ctx.peek_code(state, 0)).unwrap(); + + match opcode { + SreOpcode::ANY => { + while !ctx.string_position < end && !ctx.at_linebreak(state) { + ctx.skip_char(state, 1); + } + } + SreOpcode::ANY_ALL => { + ctx.skip_char(state, max_count); + } + SreOpcode::IN => { + while !ctx.string_position < end + && charset(&ctx.pattern(state)[2..], ctx.peek_char(state)) + { + ctx.skip_char(state, 1); + } + } + SreOpcode::LITERAL => { + general_count_literal(state, &mut ctx, end, |code, c| code == c as u32); + } + SreOpcode::NOT_LITERAL => { + general_count_literal(state, &mut ctx, end, |code, c| code != c as u32); + } + SreOpcode::LITERAL_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code == lower_ascii(c) as u32 + }); + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code != lower_ascii(c) as u32 + }); + } + SreOpcode::LITERAL_LOC_IGNORE => { + general_count_literal(state, &mut ctx, end, char_loc_ignore); + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code == lower_unicode(c) as u32 + }); + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code != lower_unicode(c) as u32 + }); + } + _ => { + /* General case */ + let mut count = 0; + + ctx.skip_code(4); + let reset_position = ctx.code_position; + + while count < max_count { + ctx.code_position = reset_position; + let code = ctx.peek_code(state, 0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(state, &mut ctx, code); + if ctx.has_matched == Some(false) { + break; + } + count += 1; + } + return count; + } + } + + // TODO: return offset + ctx.string_position - state.string_position +} + +fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + end: usize, + mut f: F, +) { + let ch = ctx.peek_code(state, 1); + while !ctx.string_position < end && f(ch, ctx.peek_char(state)) { + ctx.skip_char(state, 1); + } +} fn is_word(ch: u32) -> bool { ch == '_' as u32 diff --git a/tests/tests.rs b/tests/tests.rs index e8ae487029..cc5c4d1f38 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -7,12 +7,12 @@ struct Pattern { } impl Pattern { - fn state<'a>( + fn state<'a, S: engine::StrDrive>( &self, - string: impl Into>, + string: S, range: std::ops::Range, - ) -> engine::State<'a> { - engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + ) -> engine::State<'a, S> { + engine::State::new(string, range.start, range.end, self.flags, self.code) } } @@ -23,7 +23,7 @@ fn test_2427() { #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = lookbehind.state("x", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); } @@ -34,7 +34,7 @@ fn test_assert() { #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = positive_lookbehind.state("abcdef", 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); } @@ -45,7 +45,7 @@ fn test_string_boundaries() { #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = big_b.state("", 0..usize::MAX); - state = state.search(); + state.search(); assert!(!state.has_matched); } @@ -57,7 +57,7 @@ fn test_zerowidth() { // END GENERATED let mut state = p.state("a:", 0..usize::MAX); state.must_advance = true; - state = state.search(); + state.search(); assert!(state.string_position == 1); } @@ -68,7 +68,7 @@ fn test_repeat_context_panic() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = p.state("axxzaz", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.marks == vec![Some(1), Some(3)]); } @@ -79,6 +79,6 @@ fn test_double_max_until() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = p.state("1111", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.string_position == 4); } From 982d8f53f2eea5eb80d37a0d22c807aebd694445 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 2 Aug 2022 21:10:31 +0200 Subject: [PATCH 56/95] fix next_ctx --- src/engine.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index b0717e1671..64043cee53 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -35,14 +35,16 @@ macro_rules! next_ctx { (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) }; - (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => { - {$state.next_context.insert(MatchContext { + (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ + $ctx.handler = Some($handler); + $state.next_context.insert(MatchContext { code_position: $position, has_matched: None, - handler: Some($handler), + handler: None, + count: -1, ..*$ctx - })} - }; + }) + }}; } macro_rules! mark { @@ -678,7 +680,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex return ctx.success(); } - mark!(pop, state); + mark!(pop, state); repeat_ctx.count -= 1; state.string_position = ctx.string_position; From ccae898885496a1390a10fa6fca4cf394445dd35 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 2 Aug 2022 21:24:11 +0200 Subject: [PATCH 57/95] fix next_ctx bug --- src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 64043cee53..4acc172bc6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -33,7 +33,7 @@ macro_rules! next_ctx { next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) }; (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) + next_ctx!(offset $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) }; (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ $ctx.handler = Some($handler); From f3b30443aab82095ed2ce786482309e659f9c107 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:05:10 +0200 Subject: [PATCH 58/95] fix lifetime --- src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 4acc172bc6..810e011c95 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -716,7 +716,7 @@ pub trait StrDrive: Copy { fn back_offset(&self, offset: usize, skip: usize) -> usize; } -impl<'a> StrDrive for &'a str { +impl StrDrive for &str { fn offset(&self, offset: usize, skip: usize) -> usize { self.get(offset..) .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) From ca20b5951d7092cf0ec25198f01d3620a69e61e2 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:08:14 +0200 Subject: [PATCH 59/95] update version to 0.3.0 --- Cargo.toml | 2 +- src/engine.rs | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 00123d92c5..98b632a4dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.2.1" +version = "0.3.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index 810e011c95..1302667d1d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -253,8 +253,6 @@ fn dispatch<'a, S: StrDrive>( ctx.skip_char(state, 1); } } - /* assert subpattern */ - /* */ SreOpcode::ASSERT => op_assert(state, ctx), SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), SreOpcode::AT => { @@ -334,7 +332,6 @@ fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' return ctx.failure(); } - // let next_ctx = state.next_ctx(ctx, 3, |state, ctx| { let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { if state.popped_has_matched { ctx.skip_code_from(state, 1); @@ -371,7 +368,6 @@ fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte // alternation // <0=skip> code ... fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - // state.marks_push(); mark!(push, state); ctx.count = 1; @@ -403,7 +399,6 @@ fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' /* <1=min> <2=max> item tail */ fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { let min_count = ctx.peek_code(state, 2) as usize; - // let max_count = ctx.peek_code(state, 3) as usize; if ctx.remaining_chars(state) < min_count { return ctx.failure(); From a48f5b07c5671b690e262b935b81f0a7a832b566 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:49:46 +0200 Subject: [PATCH 60/95] impl op_info --- src/engine.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 1302667d1d..fcade829f2 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -279,7 +279,15 @@ fn dispatch<'a, S: StrDrive>( general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) } SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), - SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(state, 1), + SreOpcode::INFO => { + let min = ctx.peek_code(state, 3) as usize; + if ctx.remaining_chars(state) < min { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + } + } + SreOpcode::JUMP => ctx.skip_code_from(state, 1), SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), SreOpcode::LITERAL_IGNORE => { From 8b1fcea7ec27aa22f698d485ee203dbe2a552334 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 7 Aug 2022 08:10:52 +0200 Subject: [PATCH 61/95] update version to 0.3.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 98b632a4dc..4b403f2861 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.3.0" +version = "0.3.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From c31462d51b7d3adbf8f121403c4e8b305a9dab6f Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 16:29:51 +0200 Subject: [PATCH 62/95] refactor split State with Request --- src/engine.rs | 669 +++++++++++++++++++++++++++++++------------------- 1 file changed, 420 insertions(+), 249 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index fcade829f2..5e1f1457ec 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; use std::convert::TryFrom; @@ -8,26 +8,37 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } -#[derive(Debug)] -pub struct State<'a, S: StrDrive> { +pub struct Request<'a, S: StrDrive> { pub string: S, pub start: usize, pub end: usize, - _flags: SreFlag, - pattern_codes: &'a [u32], - pub marks: Vec>, - pub lastindex: isize, - marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec>, - repeat_stack: Vec, - pub string_position: usize, - next_context: Option>, - popped_has_matched: bool, - pub has_matched: bool, + pub pattern_codes: &'a [u32], pub match_all: bool, pub must_advance: bool, } +impl<'a, S: StrDrive> Request<'a, S> { + pub fn new( + string: S, + start: usize, + end: usize, + pattern_codes: &'a [u32], + match_all: bool, + ) -> Self { + let end = std::cmp::min(end, string.count()); + let start = std::cmp::min(start, end); + + Self { + string, + start, + end, + pattern_codes, + match_all, + must_advance: false, + } + } +} + macro_rules! next_ctx { (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) @@ -60,43 +71,41 @@ macro_rules! mark { }; } +#[derive(Debug)] +pub struct State<'a, S: StrDrive> { + pub marks: Vec>, + pub lastindex: isize, + marks_stack: Vec<(Vec>, isize)>, + context_stack: Vec>, + repeat_stack: Vec, + pub string_position: usize, + next_context: Option>, + popped_has_matched: bool, + has_matched: bool, +} + impl<'a, S: StrDrive> State<'a, S> { - pub fn new( - string: S, - start: usize, - end: usize, - flags: SreFlag, - pattern_codes: &'a [u32], - ) -> Self { - let end = std::cmp::min(end, string.count()); - let start = std::cmp::min(start, end); + pub fn new(string_position: usize) -> Self { Self { - string, - start, - end, - _flags: flags, - pattern_codes, marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat_stack: Vec::new(), - string_position: start, + string_position, next_context: None, popped_has_matched: false, has_matched: false, - match_all: false, - must_advance: false, } } - pub fn reset(&mut self) { + pub fn reset(&mut self, string_position: usize) { self.lastindex = -1; self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); - self.string_position = self.start; + self.string_position = string_position; self.next_context = None; self.popped_has_matched = false; self.has_matched = false; @@ -136,14 +145,14 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self) { + fn _match(&mut self, req: &mut Request<'a, S>) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { - handler(self, &mut ctx); - } else if ctx.remaining_codes(self) > 0 { - let code = ctx.peek_code(self, 0); + handler(req, self, &mut ctx); + } else if ctx.remaining_codes(req) > 0 { + let code = ctx.peek_code(req, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(self, &mut ctx, code); + dispatch(req, self, &mut ctx, code); } else { ctx.failure(); } @@ -160,10 +169,10 @@ impl<'a, S: StrDrive> State<'a, S> { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self) { + pub fn pymatch(&mut self, req: &mut Request<'a, S>) { let ctx = MatchContext { - string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_position: req.start, + string_offset: req.string.offset(0, req.start), code_position: 0, has_matched: None, toplevel: true, @@ -173,20 +182,22 @@ impl<'a, S: StrDrive> State<'a, S> { }; self.context_stack.push(ctx); - self._match(); + self._match(req); } - pub fn search(&mut self) { + pub fn search(&mut self, req: &mut Request<'a, S>) { // TODO: optimize by op info and skip prefix - - if self.start > self.end { + if req.start > req.end { return; } - let mut start_offset = self.string.offset(0, self.start); + // let start = self.start; + // let end = self.end; - let ctx = MatchContext { - string_position: self.start, + let mut start_offset = req.string.offset(0, req.start); + + let mut ctx = MatchContext { + string_position: req.start, string_offset: start_offset, code_position: 0, has_matched: None, @@ -195,17 +206,26 @@ impl<'a, S: StrDrive> State<'a, S> { repeat_ctx_id: usize::MAX, count: -1, }; + + // if ctx.peek_code(self, 0) == SreOpcode::INFO as u32 { + // search_op_info(self, &mut ctx); + // if let Some(has_matched) = ctx.has_matched { + // self.has_matched = has_matched; + // return; + // } + // } + self.context_stack.push(ctx); - self._match(); + self._match(req); - self.must_advance = false; - while !self.has_matched && self.start < self.end { - self.start += 1; - start_offset = self.string.offset(start_offset, 1); - self.reset(); + req.must_advance = false; + while !self.has_matched && req.start < req.end { + req.start += 1; + start_offset = req.string.offset(start_offset, 1); + self.reset(req.start); let ctx = MatchContext { - string_position: self.start, + string_position: req.start, string_offset: start_offset, code_position: 0, has_matched: None, @@ -215,12 +235,13 @@ impl<'a, S: StrDrive> State<'a, S> { count: -1, }; self.context_stack.push(ctx); - self._match(); + self._match(req); } } } fn dispatch<'a, S: StrDrive>( + req: &Request<'a, S>, state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>, opcode: SreOpcode, @@ -230,7 +251,7 @@ fn dispatch<'a, S: StrDrive>( ctx.failure(); } SreOpcode::SUCCESS => { - if ctx.can_success(state) { + if ctx.can_success(req) { state.string_position = ctx.string_position; ctx.success(); } else { @@ -238,152 +259,224 @@ fn dispatch<'a, S: StrDrive>( } } SreOpcode::ANY => { - if ctx.at_end(state) || ctx.at_linebreak(state) { + if ctx.at_end(req) || ctx.at_linebreak(req) { ctx.failure(); } else { ctx.skip_code(1); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } SreOpcode::ANY_ALL => { - if ctx.at_end(state) { + if ctx.at_end(req) { ctx.failure(); } else { ctx.skip_code(1); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } - SreOpcode::ASSERT => op_assert(state, ctx), - SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), + SreOpcode::ASSERT => op_assert(req, state, ctx), + SreOpcode::ASSERT_NOT => op_assert_not(req, state, ctx), SreOpcode::AT => { - let atcode = SreAtCode::try_from(ctx.peek_code(state, 1)).unwrap(); - if at(state, ctx, atcode) { + let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if at(req, ctx, atcode) { ctx.skip_code(2); } else { ctx.failure(); } } - SreOpcode::BRANCH => op_branch(state, ctx), + SreOpcode::BRANCH => op_branch(req, state, ctx), SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(ctx.peek_code(state, 1)).unwrap(); - if ctx.at_end(state) || !category(catcode, ctx.peek_char(state)) { + let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { ctx.failure(); } else { ctx.skip_code(2); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } - SreOpcode::IN => general_op_in(state, ctx, charset), - SreOpcode::IN_IGNORE => general_op_in(state, ctx, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN => general_op_in(req, ctx, charset), + SreOpcode::IN_IGNORE => general_op_in(req, ctx, |set, c| charset(set, lower_ascii(c))), SreOpcode::IN_UNI_IGNORE => { - general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) + general_op_in(req, ctx, |set, c| charset(set, lower_unicode(c))) } - SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), + SreOpcode::IN_LOC_IGNORE => general_op_in(req, ctx, charset_loc_ignore), SreOpcode::INFO => { - let min = ctx.peek_code(state, 3) as usize; - if ctx.remaining_chars(state) < min { + let min = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min { ctx.failure(); } else { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } } - SreOpcode::JUMP => ctx.skip_code_from(state, 1), - SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => { - general_op_literal(state, ctx, |code, c| code == lower_ascii(c)) - } + SreOpcode::JUMP => ctx.skip_code_from(req, 1), + SreOpcode::LITERAL => general_op_literal(req, ctx, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(req, ctx, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => general_op_literal(req, ctx, |code, c| code == lower_ascii(c)), SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(state, ctx, |code, c| code != lower_ascii(c)) + general_op_literal(req, ctx, |code, c| code != lower_ascii(c)) } SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(state, ctx, |code, c| code == lower_unicode(c)) + general_op_literal(req, ctx, |code, c| code == lower_unicode(c)) } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(state, ctx, |code, c| code != lower_unicode(c)) + general_op_literal(req, ctx, |code, c| code != lower_unicode(c)) } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(state, ctx, char_loc_ignore), + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(req, ctx, char_loc_ignore), SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(state, ctx, |code, c| !char_loc_ignore(code, c)) + general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) } SreOpcode::MARK => { - state.set_mark(ctx.peek_code(state, 1) as usize, ctx.string_position); + state.set_mark(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } SreOpcode::MAX_UNTIL => op_max_until(state, ctx), SreOpcode::MIN_UNTIL => op_min_until(state, ctx), - SreOpcode::REPEAT => op_repeat(state, ctx), - SreOpcode::REPEAT_ONE => op_repeat_one(state, ctx), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(state, ctx), - SreOpcode::GROUPREF => general_op_groupref(state, ctx, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(state, ctx, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(state, ctx, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(state, ctx, lower_unicode), + SreOpcode::REPEAT => op_repeat(req, state, ctx), + SreOpcode::REPEAT_ONE => op_repeat_one(req, state, ctx), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(req, state, ctx), + SreOpcode::GROUPREF => general_op_groupref(req, state, ctx, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(req, state, ctx, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); match (group_start, group_end) { (Some(start), Some(end)) if start <= end => { ctx.skip_code(3); } - _ => ctx.skip_code_from(state, 2), + _ => ctx.skip_code_from(req, 2), } } _ => unreachable!("unexpected opcode"), } } +/* optimization info block */ +/* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ +// fn search_op_info<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +// let min = ctx.peek_code(state, 3) as usize; + +// if ctx.remaining_chars(state) < min { +// return ctx.failure(); +// } + +// if min > 1 { +// /* adjust end point (but make sure we leave at least one +// character in there, so literal search will work) */ +// // no overflow can happen as remaining chars >= min +// state.end -= min - 1; + +// // adjust ctx position +// if state.end < ctx.string_position { +// ctx.string_position = state.end; +// ctx.string_offset = state.string.offset(0, ctx.string_position); +// } +// } + +// let flags = SreInfo::from_bits_truncate(ctx.peek_code(state, 2)); + +// if flags.contains(SreInfo::PREFIX) { +// /* pattern starts with a known prefix */ +// /* */ +// let len = ctx.peek_code(state, 5) as usize; +// let skip = ctx.peek_code(state, 6) as usize; +// let prefix = &ctx.pattern(state)[7..]; +// let overlap = &prefix[len - 1..]; + +// ctx.skip_code_from(state, 1); + +// if len == 1 { +// // pattern starts with a literal character +// let c = prefix[0]; +// let end = state.end; + +// while (!ctx.at_end(state)) { +// // find the next matched literal +// while (ctx.peek_char(state) != c) { +// ctx.skip_char(state, 1); +// if (ctx.at_end(state)) { +// return ctx.failure(); +// } +// } + +// // literal only +// if flags.contains(SreInfo::LITERAL) { +// return ctx.success(); +// } +// } +// } +// } +// } + /* assert subpattern */ /* */ -fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let back = ctx.peek_code(state, 2) as usize; +fn op_assert<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.failure(); } - let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + // let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { + let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } else { ctx.failure(); } }); - next_ctx.back_skip_char(&state.string, back); - state.string_position = next_ctx.string_position; next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; } /* assert not subpattern */ /* */ -fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let back = ctx.peek_code(state, 2) as usize; +fn op_assert_not<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { - return ctx.skip_code_from(state, 1); + return ctx.skip_code_from(req, 1); } - let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { if state.popped_has_matched { ctx.failure(); } else { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } }); - next_ctx.back_skip_char(&state.string, back); - state.string_position = next_ctx.string_position; next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; } // alternation // <0=skip> code ... -fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_branch<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { mark!(push, state); ctx.count = 1; - create_context(state, ctx); + create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { let branch_offset = ctx.count as usize; - let next_length = ctx.peek_code(state, branch_offset) as isize; + let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { state.marks_pop_discard(); return ctx.failure(); @@ -395,20 +488,28 @@ fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' next_ctx!(offset branch_offset + 1, state, ctx, callback); } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as usize; +fn op_min_repeat_one<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let min_count = ctx.peek_code(req, 2) as usize; - if ctx.remaining_chars(state) < min_count { + if ctx.remaining_chars(req) < min_count { return ctx.failure(); } @@ -417,52 +518,61 @@ fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchC ctx.count = if min_count == 0 { 0 } else { - let count = _count(state, ctx, min_count); + let count = _count(req, state, ctx, min_count); if count < min_count { return ctx.failure(); } - ctx.skip_char(state, count); + ctx.skip_char(req, count); count as isize }; - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished state.string_position = ctx.string_position; return ctx.success(); } mark!(push, state); - create_context(state, ctx); + create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let max_count = ctx.peek_code(state, 3) as usize; + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { + let max_count = ctx.peek_code(req, 3) as usize; if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - next_ctx!(from 1, state, ctx, callback); + // next_ctx!(from 1, state, ctx, callback); + ctx.next_from(1, req, state, callback); } else { state.marks_pop_discard(); ctx.failure(); } } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } state.string_position = ctx.string_position; - if _count(state, ctx, 1) == 0 { + if _count(req, state, ctx, 1) == 0 { state.marks_pop_discard(); return ctx.failure(); } - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); ctx.count += 1; state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } @@ -472,24 +582,28 @@ exactly one character wide, and we're not already collecting backtracking points. for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as usize; - let max_count = ctx.peek_code(state, 3) as usize; +fn op_repeat_one<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; - if ctx.remaining_chars(state) < min_count { + if ctx.remaining_chars(req) < min_count { return ctx.failure(); } state.string_position = ctx.string_position; - let count = _count(state, ctx, max_count); - ctx.skip_char(state, count); + let count = _count(req, state, ctx, max_count); + ctx.skip_char(req, count); if count < min_count { return ctx.failure(); } - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished state.string_position = ctx.string_position; return ctx.success(); @@ -497,21 +611,25 @@ fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte mark!(push, state); ctx.count = count as isize; - create_context(state, ctx); - - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as isize; - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + create_context(req, state, ctx); + + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { + let min_count = ctx.peek_code(req, 2) as isize; + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::LITERAL as u32 { // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. - let c = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 2); - while ctx.at_end(state) || ctx.peek_char(state) != c { + let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); + while ctx.at_end(req) || ctx.peek_char(req) != c { if ctx.count <= min_count { state.marks_pop_discard(); return ctx.failure(); } - ctx.back_skip_char(&state.string, 1); + ctx.back_skip_char(req, 1); ctx.count -= 1; } } @@ -519,26 +637,31 @@ fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte state.string_position = ctx.string_position; // General case: backtracking - next_ctx!(from 1, state, ctx, callback); + // next_ctx!(from 1, state, ctx, callback); + ctx.next_from(1, req, state, callback); } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } - let min_count = ctx.peek_code(state, 2) as isize; + let min_count = ctx.peek_code(req, 2) as isize; if ctx.count <= min_count { state.marks_pop_discard(); return ctx.failure(); } - ctx.back_skip_char(&state.string, 1); + ctx.back_skip_char(req, 1); ctx.count -= 1; state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } @@ -555,11 +678,15 @@ struct RepeatContext { /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_repeat<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { let repeat_ctx = RepeatContext { count: -1, - min_count: ctx.peek_code(state, 2) as usize, - max_count: ctx.peek_code(state, 3) as usize, + min_count: ctx.peek_code(req, 2) as usize, + max_count: ctx.peek_code(req, 3) as usize, code_position: ctx.code_position, last_position: std::usize::MAX, prev_id: ctx.repeat_ctx_id, @@ -569,11 +696,14 @@ fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' state.string_position = ctx.string_position; - let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + let repeat_ctx_id = state.repeat_stack.len(); + + // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); - next_ctx.repeat_ctx_id = state.repeat_stack.len() - 1; + next_ctx.repeat_ctx_id = repeat_ctx_id; } /* minimizing repeat */ @@ -586,7 +716,8 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -602,8 +733,11 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = ctx.repeat_ctx_id as isize; + let repeat_ctx_prev_id = repeat_ctx.prev_id; + // see if the tail matches - let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + // let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + let next_ctx = ctx.next_offset(1, state, |req, state, ctx| { if state.popped_has_matched { return ctx.success(); } @@ -628,7 +762,8 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* zero-width match protection */ repeat_ctx.last_position = state.string_position; - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -638,7 +773,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex } }); }); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; } /* maximizing repeat */ @@ -651,7 +786,8 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -673,7 +809,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { let save_last_position = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; @@ -701,7 +837,11 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - fn tail_callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn tail_callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { ctx.success(); } else { @@ -786,8 +926,6 @@ impl<'a> StrDrive for &'a [u8] { } } -// type OpcodeHandler = for<'a>fn(&mut StateContext<'a, S>, &mut Stacks); - #[derive(Clone, Copy)] struct MatchContext<'a, S: StrDrive> { string_position: usize, @@ -795,7 +933,7 @@ struct MatchContext<'a, S: StrDrive> { code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, &mut Self)>, + handler: Option, &mut State<'a, S>, &mut Self)>, repeat_ctx_id: usize, count: isize, } @@ -809,52 +947,53 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { .field("has_matched", &self.has_matched) .field("toplevel", &self.toplevel) .field("handler", &self.handler.map(|x| x as usize)) + .field("repeat_ctx_id", &self.repeat_ctx_id) .field("count", &self.count) .finish() } } impl<'a, S: StrDrive> MatchContext<'a, S> { - fn pattern(&self, state: &State<'a, S>) -> &[u32] { - &state.pattern_codes[self.code_position..] + fn pattern(&self, req: &Request<'a, S>) -> &'a [u32] { + &req.pattern_codes[self.code_position..] } - fn remaining_codes(&self, state: &State<'a, S>) -> usize { - state.pattern_codes.len() - self.code_position + fn remaining_codes(&self, req: &Request<'a, S>) -> usize { + req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, state: &State<'a, S>) -> usize { - state.end - self.string_position + fn remaining_chars(&self, req: &Request<'a, S>) -> usize { + req.end - self.string_position } - fn peek_char(&self, state: &State<'a, S>) -> u32 { - state.string.peek(self.string_offset) + fn peek_char(&self, req: &Request<'a, S>) -> u32 { + req.string.peek(self.string_offset) } - fn skip_char(&mut self, state: &State<'a, S>, skip: usize) { + fn skip_char(&mut self, req: &Request<'a, S>, skip: usize) { self.string_position += skip; - self.string_offset = state.string.offset(self.string_offset, skip); + self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, state: &State<'a, S>) -> u32 { - state.string.back_peek(self.string_offset) + fn back_peek_char(&self, req: &Request<'a, S>) -> u32 { + req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, string: &S, skip: usize) { + fn back_skip_char(&mut self, req: &Request<'a, S>, skip: usize) { self.string_position -= skip; - self.string_offset = string.back_offset(self.string_offset, skip); + self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { - state.pattern_codes[self.code_position + peek] + fn peek_code(&self, req: &Request<'a, S>, peek: usize) -> u32 { + req.pattern_codes[self.code_position + peek] } fn skip_code(&mut self, skip: usize) { self.code_position += skip; } - fn skip_code_from(&mut self, state: &State<'a, S>, peek: usize) { - self.skip_code(self.peek_code(state, peek) as usize + 1); + fn skip_code_from(&mut self, req: &Request<'a, S>, peek: usize) { + self.skip_code(self.peek_code(req, peek) as usize + 1); } fn at_beginning(&self) -> bool { @@ -862,48 +1001,48 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.string_position == 0 } - fn at_end(&self, state: &State<'a, S>) -> bool { - self.string_position == state.end + fn at_end(&self, req: &Request<'a, S>) -> bool { + self.string_position == req.end } - fn at_linebreak(&self, state: &State<'a, S>) -> bool { - !self.at_end(state) && is_linebreak(self.peek_char(state)) + fn at_linebreak(&self, req: &Request<'a, S>) -> bool { + !self.at_end(req) && is_linebreak(self.peek_char(req)) } fn at_boundary bool>( &self, - state: &State<'a, S>, + req: &Request<'a, S>, mut word_checker: F, ) -> bool { - if self.at_beginning() && self.at_end(state) { + if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); - let this = !self.at_end(state) && word_checker(self.peek_char(state)); + let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); + let this = !self.at_end(req) && word_checker(self.peek_char(req)); this != that } fn at_non_boundary bool>( &self, - state: &State<'a, S>, + req: &Request<'a, S>, mut word_checker: F, ) -> bool { - if self.at_beginning() && self.at_end(state) { + if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); - let this = !self.at_end(state) && word_checker(self.peek_char(state)); + let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); + let this = !self.at_end(req) && word_checker(self.peek_char(req)); this == that } - fn can_success(&self, state: &State<'a, S>) -> bool { + fn can_success(&self, req: &Request<'a, S>) -> bool { if !self.toplevel { return true; } - if state.match_all && !self.at_end(state) { + if req.match_all && !self.at_end(req) { return false; } - if state.must_advance && self.string_position == state.start { + if req.must_advance && self.string_position == req.start { return false; } true @@ -916,58 +1055,94 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn failure(&mut self) { self.has_matched = Some(false); } + + fn next_from<'b>( + &mut self, + peek: usize, + req: &Request<'a, S>, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) + } + + fn next_offset<'b>( + &mut self, + offset: usize, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.next_at(self.code_position + offset, state, f) + } + + fn next_at<'b>( + &mut self, + code_position: usize, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.handler = Some(f); + state.next_context.insert(MatchContext { + code_position, + has_matched: None, + handler: None, + count: -1, + ..*self + }) + } } -fn at<'a, S: StrDrive>(state: &State<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { +fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), - SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(state)), - SreAtCode::BOUNDARY => ctx.at_boundary(state, is_word), - SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(state, is_word), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), + SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), SreAtCode::END => { - (ctx.remaining_chars(state) == 1 && ctx.at_linebreak(state)) || ctx.at_end(state) + (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req) } - SreAtCode::END_LINE => ctx.at_linebreak(state) || ctx.at_end(state), - SreAtCode::END_STRING => ctx.at_end(state), - SreAtCode::LOC_BOUNDARY => ctx.at_boundary(state, is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(state, is_loc_word), - SreAtCode::UNI_BOUNDARY => ctx.at_boundary(state, is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(state, is_uni_word), + SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req), + SreAtCode::END_STRING => ctx.at_end(req), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word), } } fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, f: F, ) { - if ctx.at_end(state) || !f(ctx.peek_code(state, 1), ctx.peek_char(state)) { + if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { ctx.failure(); } else { ctx.skip_code(2); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, f: F, ) { - if ctx.at_end(state) || !f(&ctx.pattern(state)[2..], ctx.peek_char(state)) { + if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { ctx.failure(); } else { - ctx.skip_code_from(state, 1); - ctx.skip_char(state, 1); + ctx.skip_code_from(req, 1); + ctx.skip_char(req, 1); } } fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( + req: &Request<'a, S>, state: &State<'a, S>, ctx: &mut MatchContext<'a, S>, mut f: F, ) { - let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), _ => { @@ -977,16 +1152,16 @@ fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( let mut gctx = MatchContext { string_position: group_start, - string_offset: state.string.offset(0, group_start), + string_offset: req.string.offset(0, group_start), ..*ctx }; for _ in group_start..group_end { - if ctx.at_end(state) || f(ctx.peek_char(state)) != f(gctx.peek_char(state)) { + if ctx.at_end(req) || f(ctx.peek_char(req)) != f(gctx.peek_char(req)) { return ctx.failure(); } - ctx.skip_char(state, 1); - gctx.skip_char(state, 1); + ctx.skip_char(req, 1); + gctx.skip_char(req, 1); } ctx.skip_code(2); @@ -1122,60 +1297,56 @@ fn charset(set: &[u32], ch: u32) -> bool { } fn _count<'a, S: StrDrive>( + req: &Request<'a, S>, state: &mut State<'a, S>, ctx: &MatchContext<'a, S>, max_count: usize, ) -> usize { let mut ctx = *ctx; - let max_count = std::cmp::min(max_count, ctx.remaining_chars(state)); + let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); let end = ctx.string_position + max_count; - let opcode = SreOpcode::try_from(ctx.peek_code(state, 0)).unwrap(); + let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); match opcode { SreOpcode::ANY => { - while !ctx.string_position < end && !ctx.at_linebreak(state) { - ctx.skip_char(state, 1); + while !ctx.string_position < end && !ctx.at_linebreak(req) { + ctx.skip_char(req, 1); } } SreOpcode::ANY_ALL => { - ctx.skip_char(state, max_count); + ctx.skip_char(req, max_count); } SreOpcode::IN => { - while !ctx.string_position < end - && charset(&ctx.pattern(state)[2..], ctx.peek_char(state)) + while !ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) { - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } SreOpcode::LITERAL => { - general_count_literal(state, &mut ctx, end, |code, c| code == c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(state, &mut ctx, end, |code, c| code != c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { - code == lower_ascii(c) as u32 - }); + general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { - code != lower_ascii(c) as u32 - }); + general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(state, &mut ctx, end, char_loc_ignore); + general_count_literal(req, &mut ctx, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { + general_count_literal(req, &mut ctx, end, |code, c| { code == lower_unicode(c) as u32 }); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { + general_count_literal(req, &mut ctx, end, |code, c| { code != lower_unicode(c) as u32 }); } @@ -1188,9 +1359,9 @@ fn _count<'a, S: StrDrive>( while count < max_count { ctx.code_position = reset_position; - let code = ctx.peek_code(state, 0); + let code = ctx.peek_code(req, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(state, &mut ctx, code); + dispatch(req, state, &mut ctx, code); if ctx.has_matched == Some(false) { break; } @@ -1205,14 +1376,14 @@ fn _count<'a, S: StrDrive>( } fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, end: usize, mut f: F, ) { - let ch = ctx.peek_code(state, 1); - while !ctx.string_position < end && f(ch, ctx.peek_char(state)) { - ctx.skip_char(state, 1); + let ch = ctx.peek_code(req, 1); + while !ctx.string_position < end && f(ch, ctx.peek_char(req)) { + ctx.skip_char(req, 1); } } From c15387e97289386bbf0891a7ed18367220b59a15 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 16:44:47 +0200 Subject: [PATCH 63/95] refactor tests --- generate_tests.py | 2 +- src/engine.rs | 4 ++-- tests/tests.rs | 47 +++++++++++++++++++++++------------------------ 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/generate_tests.py b/generate_tests.py index b432720cd1..8adf043f29 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -33,7 +33,7 @@ def compile(cls, pattern, flags=0): def replace_compiled(m): line, indent, varname, pattern = m.groups() pattern = eval(pattern, {"re": CompiledPattern}) - pattern = f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + pattern = f"Pattern {{ code: &{json.dumps(pattern.code)} }}" return f'''{line} {indent}// START GENERATED by generate_tests.py {indent}#[rustfmt::skip] let {varname} = {pattern}; diff --git a/src/engine.rs b/src/engine.rs index 5e1f1457ec..b9487a2fd2 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -81,7 +81,7 @@ pub struct State<'a, S: StrDrive> { pub string_position: usize, next_context: Option>, popped_has_matched: bool, - has_matched: bool, + pub has_matched: bool, } impl<'a, S: StrDrive> State<'a, S> { @@ -696,7 +696,7 @@ fn op_repeat<'a, S: StrDrive>( state.string_position = ctx.string_position; - let repeat_ctx_id = state.repeat_stack.len(); + let repeat_ctx_id = state.repeat_stack.len() - 1; // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { diff --git a/tests/tests.rs b/tests/tests.rs index cc5c4d1f38..b4ad09f7be 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1,18 +1,17 @@ -use sre_engine::constants::SreFlag; use sre_engine::engine; struct Pattern { code: &'static [u32], - flags: SreFlag, } impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - range: std::ops::Range, - ) -> engine::State<'a, S> { - engine::State::new(string, range.start, range.end, self.flags, self.code) + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + let req = engine::Request::new(string, 0, usize::MAX, self.code, false); + let state = engine::State::new(0); + (req, state) } } @@ -20,10 +19,10 @@ impl Pattern { fn test_2427() { // pattern lookbehind = re.compile(r'(? Date: Tue, 9 Aug 2022 16:54:41 +0200 Subject: [PATCH 64/95] refactor benches --- benches/benches.rs | 95 +++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index d000ceb62e..e24ea2f972 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -3,68 +3,77 @@ extern crate test; use test::Bencher; -use sre_engine::constants::SreFlag; use sre_engine::engine; -pub struct Pattern { - pub code: &'static [u32], - pub flags: SreFlag, + +struct Pattern { + code: &'static [u32], } impl Pattern { - pub fn state<'a, S: engine::StrDrive>( + fn state<'a, S: engine::StrDrive>( + &self, + string: S, + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + self.state_range(string, 0..usize::MAX) + } + + fn state_range<'a, S: engine::StrDrive>( &self, string: S, range: std::ops::Range, - ) -> engine::State<'a, S> { - engine::State::new(string, range.start, range.end, self.flags, self.code) + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + let req = engine::Request::new(string, range.start, range.end, self.code, false); + let state = engine::State::new(0); + (req, state) } } + #[bench] fn benchmarks(b: &mut Bencher) { // # test common prefix // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; // END GENERATED // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1] }; // END GENERATED - // pattern pn = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation + // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1] }; // END GENERATED - // pattern pn = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation + // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1] }; // END GENERATED - // pattern pn = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference + // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference // START GENERATED by generate_tests.py - #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; // END GENERATED - // pattern pn = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization + // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization // START GENERATED by generate_tests.py - #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; // END GENERATED - // pattern pn = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets + // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets // START GENERATED by generate_tests.py - #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; // END GENERATED - // pattern pn = re.compile('Python') #, 'Python'), # Simple text literal + // pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; // END GENERATED - // pattern pn = re.compile('.*Python') #, 'Python'), # Bad text literal + // pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; // END GENERATED - // pattern pn = re.compile('.*Python.*') #, 'Python'), # Worse text literal + // pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1] }; // END GENERATED - // pattern pn = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping + // pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping // START GENERATED by generate_tests.py - #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1] }; // END GENERATED let tests = [ @@ -83,29 +92,29 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { - let mut state = p.state(s.clone(), 0..usize::MAX); - state.search(); + let (mut req, mut state) = p.state(s.clone()); + state.search(&mut req); assert!(state.has_matched); - state = p.state(s.clone(), 0..usize::MAX); - state.pymatch(); + let (mut req, mut state) = p.state(s.clone()); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s.clone(), 0..usize::MAX); - state.match_all = true; - state.pymatch(); + let (mut req, mut state) = p.state(s.clone()); + req.match_all = true; + state.pymatch(&mut req); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); - state = p.state(s2.as_str(), 0..usize::MAX); - state.search(); + let (mut req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); + state.search(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..usize::MAX); - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..10000 + s.len()); - state.match_all = true; - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + req.match_all = true; + state.pymatch(&mut req); assert!(state.has_matched); } }) From de8973d77a40303693e8e15da70fe63f6f974546 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 17:34:03 +0200 Subject: [PATCH 65/95] simplify lifetime --- benches/benches.rs | 6 +- src/engine.rs | 265 ++++++++++++++++++--------------------------- tests/tests.rs | 4 +- 3 files changed, 110 insertions(+), 165 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index e24ea2f972..8e0e87935a 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -13,7 +13,7 @@ impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { self.state_range(string, 0..usize::MAX) } @@ -21,9 +21,9 @@ impl Pattern { &self, string: S, range: std::ops::Range, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::new(0); + let state = engine::State::new(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index b9487a2fd2..ace75d1a36 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -39,25 +39,6 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -macro_rules! next_ctx { - (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) - }; - (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(offset $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) - }; - (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ - $ctx.handler = Some($handler); - $state.next_context.insert(MatchContext { - code_position: $position, - has_matched: None, - handler: None, - count: -1, - ..*$ctx - }) - }}; -} - macro_rules! mark { (push, $state:expr) => { $state @@ -72,27 +53,27 @@ macro_rules! mark { } #[derive(Debug)] -pub struct State<'a, S: StrDrive> { +pub struct State { pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec>, + context_stack: Vec>, repeat_stack: Vec, pub string_position: usize, - next_context: Option>, + next_context: Option>, popped_has_matched: bool, pub has_matched: bool, } -impl<'a, S: StrDrive> State<'a, S> { - pub fn new(string_position: usize) -> Self { +impl State { + pub fn new() -> Self { Self { marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat_stack: Vec::new(), - string_position, + string_position: 0, next_context: None, popped_has_matched: false, has_matched: false, @@ -145,7 +126,7 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self, req: &mut Request<'a, S>) { + fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { handler(req, self, &mut ctx); @@ -169,7 +150,9 @@ impl<'a, S: StrDrive> State<'a, S> { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, req: &mut Request<'a, S>) { + pub fn pymatch(&mut self, req: &mut Request) { + self.string_position = req.start; + let ctx = MatchContext { string_position: req.start, string_offset: req.string.offset(0, req.start), @@ -185,7 +168,9 @@ impl<'a, S: StrDrive> State<'a, S> { self._match(req); } - pub fn search(&mut self, req: &mut Request<'a, S>) { + pub fn search(&mut self, req: &mut Request) { + self.string_position = req.start; + // TODO: optimize by op info and skip prefix if req.start > req.end { return; @@ -196,7 +181,7 @@ impl<'a, S: StrDrive> State<'a, S> { let mut start_offset = req.string.offset(0, req.start); - let mut ctx = MatchContext { + let ctx = MatchContext { string_position: req.start, string_offset: start_offset, code_position: 0, @@ -240,10 +225,10 @@ impl<'a, S: StrDrive> State<'a, S> { } } -fn dispatch<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn dispatch( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, opcode: SreOpcode, ) { match opcode { @@ -410,11 +395,7 @@ fn dispatch<'a, S: StrDrive>( /* assert subpattern */ /* */ -fn op_assert<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.failure(); @@ -435,18 +416,14 @@ fn op_assert<'a, S: StrDrive>( /* assert not subpattern */ /* */ -fn op_assert_not<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.skip_code_from(req, 1); } - let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { + let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { ctx.failure(); } else { @@ -460,20 +437,16 @@ fn op_assert_not<'a, S: StrDrive>( // alternation // <0=skip> code ... -fn op_branch<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { mark!(push, state); ctx.count = 1; create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; @@ -485,14 +458,10 @@ fn op_branch<'a, S: StrDrive>( state.string_position = ctx.string_position; ctx.count += next_length; - next_ctx!(offset branch_offset + 1, state, ctx, callback); + ctx.next_offset(branch_offset + 1, state, callback); } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -502,10 +471,10 @@ fn op_branch<'a, S: StrDrive>( } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn op_min_repeat_one( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let min_count = ctx.peek_code(req, 2) as usize; @@ -536,10 +505,10 @@ fn op_min_repeat_one<'a, S: StrDrive>( mark!(push, state); create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let max_count = ctx.peek_code(req, 3) as usize; @@ -553,11 +522,7 @@ fn op_min_repeat_one<'a, S: StrDrive>( } } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -582,11 +547,7 @@ exactly one character wide, and we're not already collecting backtracking points. for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as usize; let max_count = ctx.peek_code(req, 3) as usize; @@ -613,10 +574,10 @@ fn op_repeat_one<'a, S: StrDrive>( ctx.count = count as isize; create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let min_count = ctx.peek_code(req, 2) as isize; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); @@ -641,11 +602,7 @@ fn op_repeat_one<'a, S: StrDrive>( ctx.next_from(1, req, state, callback); } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -678,11 +635,7 @@ struct RepeatContext { /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = RepeatContext { count: -1, min_count: ctx.peek_code(req, 2) as usize, @@ -698,8 +651,7 @@ fn op_repeat<'a, S: StrDrive>( let repeat_ctx_id = state.repeat_stack.len() - 1; - // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { - let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { + let next_ctx = ctx.next_from(1, req, state, |_, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); @@ -707,7 +659,7 @@ fn op_repeat<'a, S: StrDrive>( } /* minimizing repeat */ -fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_min_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = state.repeat_stack.last_mut().unwrap(); state.string_position = ctx.string_position; @@ -716,8 +668,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -736,8 +687,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex let repeat_ctx_prev_id = repeat_ctx.prev_id; // see if the tail matches - // let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { - let next_ctx = ctx.next_offset(1, state, |req, state, ctx| { + let next_ctx = ctx.next_offset(1, state, |_, state, ctx| { if state.popped_has_matched { return ctx.success(); } @@ -762,8 +712,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* zero-width match protection */ repeat_ctx.last_position = state.string_position; - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -777,7 +726,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex } /* maximizing repeat */ -fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_max_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; state.string_position = ctx.string_position; @@ -786,8 +735,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -809,7 +757,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { let save_last_position = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; @@ -826,22 +774,21 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* cannot match more repeated items here. make sure the tail matches */ - let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + let repeat_ctx_prev_id = repeat_ctx.prev_id; + let next_ctx = ctx.next_offset(1, state, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; }); return; } /* cannot match more repeated items here. make sure the tail matches */ - let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + // let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + let repeat_ctx_prev_id = repeat_ctx.prev_id; + let next_ctx = ctx.next_offset(1, state, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - fn tail_callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { ctx.success(); } else { @@ -926,19 +873,21 @@ impl<'a> StrDrive for &'a [u8] { } } +type OpFunc = for<'a> fn(&Request<'a, S>, &mut State, &mut MatchContext); + #[derive(Clone, Copy)] -struct MatchContext<'a, S: StrDrive> { +struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, &mut State<'a, S>, &mut Self)>, + handler: Option>, repeat_ctx_id: usize, count: isize, } -impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { +impl<'a, S: StrDrive> std::fmt::Debug for MatchContext { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) @@ -953,38 +902,38 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { } } -impl<'a, S: StrDrive> MatchContext<'a, S> { - fn pattern(&self, req: &Request<'a, S>) -> &'a [u32] { +impl MatchContext { + fn pattern<'a>(&self, req: &Request<'a, S>) -> &'a [u32] { &req.pattern_codes[self.code_position..] } - fn remaining_codes(&self, req: &Request<'a, S>) -> usize { + fn remaining_codes(&self, req: &Request) -> usize { req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, req: &Request<'a, S>) -> usize { + fn remaining_chars(&self, req: &Request) -> usize { req.end - self.string_position } - fn peek_char(&self, req: &Request<'a, S>) -> u32 { + fn peek_char(&self, req: &Request) -> u32 { req.string.peek(self.string_offset) } - fn skip_char(&mut self, req: &Request<'a, S>, skip: usize) { + fn skip_char(&mut self, req: &Request, skip: usize) { self.string_position += skip; self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, req: &Request<'a, S>) -> u32 { + fn back_peek_char(&self, req: &Request) -> u32 { req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, req: &Request<'a, S>, skip: usize) { + fn back_skip_char(&mut self, req: &Request, skip: usize) { self.string_position -= skip; self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, req: &Request<'a, S>, peek: usize) -> u32 { + fn peek_code(&self, req: &Request, peek: usize) -> u32 { req.pattern_codes[self.code_position + peek] } @@ -992,7 +941,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.code_position += skip; } - fn skip_code_from(&mut self, req: &Request<'a, S>, peek: usize) { + fn skip_code_from(&mut self, req: &Request, peek: usize) { self.skip_code(self.peek_code(req, peek) as usize + 1); } @@ -1001,19 +950,15 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.string_position == 0 } - fn at_end(&self, req: &Request<'a, S>) -> bool { + fn at_end(&self, req: &Request) -> bool { self.string_position == req.end } - fn at_linebreak(&self, req: &Request<'a, S>) -> bool { + fn at_linebreak(&self, req: &Request) -> bool { !self.at_end(req) && is_linebreak(self.peek_char(req)) } - fn at_boundary bool>( - &self, - req: &Request<'a, S>, - mut word_checker: F, - ) -> bool { + fn at_boundary bool>(&self, req: &Request, mut word_checker: F) -> bool { if self.at_beginning() && self.at_end(req) { return false; } @@ -1024,7 +969,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn at_non_boundary bool>( &self, - req: &Request<'a, S>, + req: &Request, mut word_checker: F, ) -> bool { if self.at_beginning() && self.at_end(req) { @@ -1035,7 +980,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { this == that } - fn can_success(&self, req: &Request<'a, S>) -> bool { + fn can_success(&self, req: &Request) -> bool { if !self.toplevel { return true; } @@ -1059,9 +1004,9 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_from<'b>( &mut self, peek: usize, - req: &Request<'a, S>, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + req: &Request, + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) } @@ -1069,8 +1014,8 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_offset<'b>( &mut self, offset: usize, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.next_at(self.code_position + offset, state, f) } @@ -1078,8 +1023,8 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_at<'b>( &mut self, code_position: usize, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.handler = Some(f); state.next_context.insert(MatchContext { @@ -1092,7 +1037,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { } } -fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { +fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), @@ -1110,9 +1055,9 @@ fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: } } -fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_literal bool>( + req: &Request, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { @@ -1123,9 +1068,9 @@ fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( } } -fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_in bool>( + req: &Request, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { @@ -1136,10 +1081,10 @@ fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( } } -fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( - req: &Request<'a, S>, - state: &State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_groupref u32>( + req: &Request, + state: &State, + ctx: &mut MatchContext, mut f: F, ) { let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); @@ -1296,10 +1241,10 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -fn _count<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &MatchContext<'a, S>, +fn _count( + req: &Request, + state: &mut State, + ctx: &MatchContext, max_count: usize, ) -> usize { let mut ctx = *ctx; @@ -1375,9 +1320,9 @@ fn _count<'a, S: StrDrive>( ctx.string_position - state.string_position } -fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_count_literal bool>( + req: &Request, + ctx: &mut MatchContext, end: usize, mut f: F, ) { diff --git a/tests/tests.rs b/tests/tests.rs index b4ad09f7be..ead111c74a 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,9 +8,9 @@ impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::new(0); + let state = engine::State::new(); (req, state) } } From c494feb7f776e8e15185710f72d41b603db995d8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 21:34:06 +0200 Subject: [PATCH 66/95] refactor split Marks --- Cargo.toml | 1 + benches/benches.rs | 2 +- src/engine.rs | 242 ++++++++++++++++++++++++++++++--------------- tests/tests.rs | 5 +- 4 files changed, 166 insertions(+), 84 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4b403f2861..8993c1e71d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,3 +12,4 @@ include = ["LICENSE", "src/**/*.rs"] [dependencies] num_enum = "0.5" bitflags = "1.2" +optional = "0.5" diff --git a/benches/benches.rs b/benches/benches.rs index 8e0e87935a..f19b92d64b 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -23,7 +23,7 @@ impl Pattern { range: std::ops::Range, ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::new(); + let state = engine::State::default(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index ace75d1a36..087d64e8cd 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,9 @@ use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; +use optional::Optioned; use std::convert::TryFrom; +use std::ops::Deref; const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') @@ -39,24 +41,98 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -macro_rules! mark { - (push, $state:expr) => { - $state - .marks_stack - .push(($state.marks.clone(), $state.lastindex)) - }; - (pop, $state:expr) => { - let (marks, lastindex) = $state.marks_stack.pop().unwrap(); - $state.marks = marks; - $state.lastindex = lastindex; - }; +// macro_rules! mark { +// (push, $state:expr) => { +// $state +// .marks_stack +// .push(($state.marks.clone(), $state.lastindex)) +// }; +// (pop, $state:expr) => { +// let (marks, lastindex) = $state.marks_stack.pop().unwrap(); +// $state.marks = marks; +// $state.lastindex = lastindex; +// }; +// } + +#[derive(Debug)] +pub struct Marks { + last_index: isize, + marks: Vec>, + marks_stack: Vec<(Vec>, isize)>, +} + +impl Default for Marks { + fn default() -> Self { + Self { + last_index: -1, + marks: Vec::new(), + marks_stack: Vec::new(), + } + } +} + +impl Deref for Marks { + type Target = Vec>; + + fn deref(&self) -> &Self::Target { + &self.marks + } +} + +impl Marks { + pub fn get(&self, group_index: usize) -> (Optioned, Optioned) { + let marks_index = 2 * group_index; + if marks_index + 1 < self.marks.len() { + (self.marks[marks_index], self.marks[marks_index + 1]) + } else { + (Optioned::none(), Optioned::none()) + } + } + + pub fn last_index(&self) -> isize { + self.last_index + } + + fn set(&mut self, mark_nr: usize, position: usize) { + if mark_nr & 1 != 0 { + self.last_index = mark_nr as isize / 2 + 1; + } + if mark_nr >= self.marks.len() { + self.marks.resize(mark_nr + 1, Optioned::none()); + } + self.marks[mark_nr] = Optioned::some(position); + } + + fn push(&mut self) { + self.marks_stack.push((self.marks.clone(), self.last_index)); + } + + fn pop(&mut self) { + let (marks, last_index) = self.marks_stack.pop().unwrap(); + self.marks = marks; + self.last_index = last_index; + } + + fn pop_keep(&mut self) { + let (marks, last_index) = self.marks_stack.last().unwrap().clone(); + self.marks = marks; + self.last_index = last_index; + } + + fn pop_discard(&mut self) { + self.marks_stack.pop(); + } + + fn clear(&mut self) { + self.last_index = -1; + self.marks.clear(); + self.marks_stack.clear(); + } } #[derive(Debug)] pub struct State { - pub marks: Vec>, - pub lastindex: isize, - marks_stack: Vec<(Vec>, isize)>, + pub marks: Marks, context_stack: Vec>, repeat_stack: Vec, pub string_position: usize, @@ -65,25 +141,23 @@ pub struct State { pub has_matched: bool, } -impl State { - pub fn new() -> Self { +impl Default for State { + fn default() -> Self { Self { - marks: Vec::new(), - lastindex: -1, - marks_stack: Vec::new(), - context_stack: Vec::new(), - repeat_stack: Vec::new(), - string_position: 0, - next_context: None, - popped_has_matched: false, - has_matched: false, + marks: Default::default(), + context_stack: Default::default(), + repeat_stack: Default::default(), + string_position: Default::default(), + next_context: Default::default(), + popped_has_matched: Default::default(), + has_matched: Default::default(), } } +} +impl State { pub fn reset(&mut self, string_position: usize) { - self.lastindex = -1; self.marks.clear(); - self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); self.string_position = string_position; @@ -92,23 +166,23 @@ impl State { self.has_matched = false; } - fn set_mark(&mut self, mark_nr: usize, position: usize) { - if mark_nr & 1 != 0 { - self.lastindex = mark_nr as isize / 2 + 1; - } - if mark_nr >= self.marks.len() { - self.marks.resize(mark_nr + 1, None); - } - self.marks[mark_nr] = Some(position); - } - fn get_marks(&self, group_index: usize) -> (Option, Option) { - let marks_index = 2 * group_index; - if marks_index + 1 < self.marks.len() { - (self.marks[marks_index], self.marks[marks_index + 1]) - } else { - (None, None) - } - } + // fn set_mark(&mut self, mark_nr: usize, position: usize) { + // if mark_nr & 1 != 0 { + // self.lastindex = mark_nr as isize / 2 + 1; + // } + // if mark_nr >= self.marks.len() { + // self.marks.resize(mark_nr + 1, None); + // } + // self.marks[mark_nr] = Some(position); + // } + // fn get_marks(&self, group_index: usize) -> (Option, Option) { + // let marks_index = 2 * group_index; + // if marks_index + 1 < self.marks.len() { + // (self.marks[marks_index], self.marks[marks_index + 1]) + // } else { + // (None, None) + // } + // } // fn marks_push(&mut self) { // self.marks_stack.push((self.marks.clone(), self.lastindex)); // } @@ -117,14 +191,14 @@ impl State { // self.marks = marks; // self.lastindex = lastindex; // } - fn marks_pop_keep(&mut self) { - let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); - self.marks = marks; - self.lastindex = lastindex; - } - fn marks_pop_discard(&mut self) { - self.marks_stack.pop(); - } + // fn marks_pop_keep(&mut self) { + // let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); + // self.marks = marks; + // self.lastindex = lastindex; + // } + // fn marks_pop_discard(&mut self) { + // self.marks_stack.pop(); + // } fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { @@ -311,7 +385,9 @@ fn dispatch( general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) } SreOpcode::MARK => { - state.set_mark(ctx.peek_code(req, 1) as usize, ctx.string_position); + state + .marks + .set(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } SreOpcode::MAX_UNTIL => op_max_until(state, ctx), @@ -324,12 +400,14 @@ fn dispatch( SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - ctx.skip_code(3); - } - _ => ctx.skip_code_from(req, 2), + let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); + if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + ctx.skip_code(3); + } else { + ctx.skip_code_from(req, 2) } } _ => unreachable!("unexpected opcode"), @@ -438,7 +516,7 @@ fn op_assert_not(req: &Request, state: &mut State, ctx: &mut // alternation // <0=skip> code ... fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { - mark!(push, state); + state.marks.push(); ctx.count = 1; create_context(req, state, ctx); @@ -451,7 +529,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } @@ -465,7 +543,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc if state.popped_has_matched { return ctx.success(); } - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -502,7 +580,7 @@ fn op_min_repeat_one( return ctx.success(); } - mark!(push, state); + state.marks.push(); create_context(req, state, ctx); fn create_context( @@ -517,7 +595,7 @@ fn op_min_repeat_one( // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } else { - state.marks_pop_discard(); + state.marks.pop_discard(); ctx.failure(); } } @@ -530,13 +608,13 @@ fn op_min_repeat_one( state.string_position = ctx.string_position; if _count(req, state, ctx, 1) == 0 { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.skip_char(req, 1); ctx.count += 1; - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -570,7 +648,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut return ctx.success(); } - mark!(push, state); + state.marks.push(); ctx.count = count as isize; create_context(req, state, ctx); @@ -587,7 +665,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); while ctx.at_end(req) || ctx.peek_char(req) != c { if ctx.count <= min_count { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.back_skip_char(req, 1); @@ -610,14 +688,14 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut let min_count = ctx.peek_code(req, 2) as isize; if ctx.count <= min_count { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.back_skip_char(req, 1); ctx.count -= 1; - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -680,7 +758,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { return; } - mark!(push, state); + state.marks.push(); ctx.count = ctx.repeat_ctx_id as isize; @@ -698,7 +776,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { state.string_position = ctx.string_position; - mark!(pop, state); + state.marks.pop(); // match more until tail matches @@ -752,7 +830,7 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { { /* we may have enough matches, but if we can match another item, do so */ - mark!(push, state); + state.marks.push(); ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; @@ -763,11 +841,11 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { repeat_ctx.last_position = save_last_position; if state.popped_has_matched { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.success(); } - mark!(pop, state); + state.marks.pop(); repeat_ctx.count -= 1; state.string_position = ctx.string_position; @@ -1087,12 +1165,14 @@ fn general_op_groupref u32>( ctx: &mut MatchContext, mut f: F, ) { - let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); - let (group_start, group_end) = match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => (start, end), - _ => { - return ctx.failure(); - } + let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); + let (group_start, group_end) = if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + (group_start.unpack(), group_end.unpack()) + } else { + return ctx.failure(); }; let mut gctx = MatchContext { diff --git a/tests/tests.rs b/tests/tests.rs index ead111c74a..cb11db3483 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -10,7 +10,7 @@ impl Pattern { string: S, ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::new(); + let state = engine::State::default(); (req, state) } } @@ -62,13 +62,14 @@ fn test_zerowidth() { #[test] fn test_repeat_context_panic() { + use optional::Optioned; // pattern p = re.compile(r'(?:a*?(xx)??z)*') // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED let (mut req, mut state) = p.state("axxzaz"); state.pymatch(&mut req); - assert!(state.marks == vec![Some(1), Some(3)]); + assert!(*state.marks == vec![Optioned::some(1), Optioned::some(3)]); } #[test] From 18258000cde2c848198b745e92f74a89d48c7fe8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 22:17:01 +0200 Subject: [PATCH 67/95] clearup --- src/engine.rs | 78 +++++++++++---------------------------------------- 1 file changed, 16 insertions(+), 62 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 087d64e8cd..652ca04c27 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; @@ -41,19 +41,6 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -// macro_rules! mark { -// (push, $state:expr) => { -// $state -// .marks_stack -// .push(($state.marks.clone(), $state.lastindex)) -// }; -// (pop, $state:expr) => { -// let (marks, lastindex) = $state.marks_stack.pop().unwrap(); -// $state.marks = marks; -// $state.lastindex = lastindex; -// }; -// } - #[derive(Debug)] pub struct Marks { last_index: isize, @@ -135,6 +122,7 @@ pub struct State { pub marks: Marks, context_stack: Vec>, repeat_stack: Vec, + pub start: usize, pub string_position: usize, next_context: Option>, popped_has_matched: bool, @@ -144,62 +132,30 @@ pub struct State { impl Default for State { fn default() -> Self { Self { - marks: Default::default(), - context_stack: Default::default(), - repeat_stack: Default::default(), - string_position: Default::default(), - next_context: Default::default(), - popped_has_matched: Default::default(), - has_matched: Default::default(), + marks: Marks::default(), + context_stack: Vec::new(), + repeat_stack: Vec::new(), + start: 0, + string_position: 0, + next_context: None, + popped_has_matched: false, + has_matched: false, } } } impl State { - pub fn reset(&mut self, string_position: usize) { + pub fn reset(&mut self, start: usize) { self.marks.clear(); self.context_stack.clear(); self.repeat_stack.clear(); - self.string_position = string_position; + self.start = start; + self.string_position = start; self.next_context = None; self.popped_has_matched = false; self.has_matched = false; } - // fn set_mark(&mut self, mark_nr: usize, position: usize) { - // if mark_nr & 1 != 0 { - // self.lastindex = mark_nr as isize / 2 + 1; - // } - // if mark_nr >= self.marks.len() { - // self.marks.resize(mark_nr + 1, None); - // } - // self.marks[mark_nr] = Some(position); - // } - // fn get_marks(&self, group_index: usize) -> (Option, Option) { - // let marks_index = 2 * group_index; - // if marks_index + 1 < self.marks.len() { - // (self.marks[marks_index], self.marks[marks_index + 1]) - // } else { - // (None, None) - // } - // } - // fn marks_push(&mut self) { - // self.marks_stack.push((self.marks.clone(), self.lastindex)); - // } - // fn marks_pop(&mut self) { - // let (marks, lastindex) = self.marks_stack.pop().unwrap(); - // self.marks = marks; - // self.lastindex = lastindex; - // } - // fn marks_pop_keep(&mut self) { - // let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); - // self.marks = marks; - // self.lastindex = lastindex; - // } - // fn marks_pop_discard(&mut self) { - // self.marks_stack.pop(); - // } - fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { @@ -225,6 +181,7 @@ impl State { } pub fn pymatch(&mut self, req: &mut Request) { + self.start = req.start; self.string_position = req.start; let ctx = MatchContext { @@ -243,6 +200,7 @@ impl State { } pub fn search(&mut self, req: &mut Request) { + self.start = req.start; self.string_position = req.start; // TODO: optimize by op info and skip prefix @@ -479,7 +437,6 @@ fn op_assert(req: &Request, state: &mut State, ctx: &mut Matc return ctx.failure(); } - // let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { ctx.skip_code_from(req, 1); @@ -592,7 +549,6 @@ fn op_min_repeat_one( if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } else { state.marks.pop_discard(); @@ -676,7 +632,6 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut state.string_position = ctx.string_position; // General case: backtracking - // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } @@ -861,7 +816,6 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { /* cannot match more repeated items here. make sure the tail matches */ - // let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); let repeat_ctx_prev_id = repeat_ctx.prev_id; let next_ctx = ctx.next_offset(1, state, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx_prev_id; @@ -965,7 +919,7 @@ struct MatchContext { count: isize, } -impl<'a, S: StrDrive> std::fmt::Debug for MatchContext { +impl std::fmt::Debug for MatchContext { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) From e42df1d8597bf964fb7f70b606df9e0be696c624 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 22:18:01 +0200 Subject: [PATCH 68/95] update version to 0.4.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8993c1e71d..373166d6db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.3.1" +version = "0.4.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From c4f10edc95ab7dc1f20bed2c9b4bddfc520fa660 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 14 Aug 2022 20:46:20 +0200 Subject: [PATCH 69/95] impl opinfo single literal --- benches/benches.rs | 24 +++---- src/engine.rs | 173 +++++++++++++++++++++++---------------------- tests/tests.rs | 51 +++++++++---- 3 files changed, 139 insertions(+), 109 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index f19b92d64b..604cf91f42 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -92,29 +92,29 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { - let (mut req, mut state) = p.state(s.clone()); - state.search(&mut req); + let (req, mut state) = p.state(s.clone()); + state.search(req); assert!(state.has_matched); - let (mut req, mut state) = p.state(s.clone()); - state.pymatch(&mut req); + let (req, mut state) = p.state(s.clone()); + state.pymatch(req); assert!(state.has_matched); let (mut req, mut state) = p.state(s.clone()); req.match_all = true; - state.pymatch(&mut req); + state.pymatch(req); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); - let (mut req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); - state.search(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); + state.search(req); assert!(state.has_matched); - let (mut req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); - state.pymatch(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); + state.pymatch(req); assert!(state.has_matched); - let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + state.pymatch(req); assert!(state.has_matched); let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); req.match_all = true; - state.pymatch(&mut req); + state.pymatch(req); assert!(state.has_matched); } }) diff --git a/src/engine.rs b/src/engine.rs index 652ca04c27..0d645e046d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,5 +1,7 @@ // good luck to those that follow; here be dragons +use crate::constants::SreInfo; + use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; @@ -10,6 +12,7 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } +#[derive(Debug, Clone, Copy)] pub struct Request<'a, S: StrDrive> { pub string: S, pub start: usize, @@ -180,7 +183,7 @@ impl State { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, req: &mut Request) { + pub fn pymatch(&mut self, mut req: Request) { self.start = req.start; self.string_position = req.start; @@ -196,10 +199,10 @@ impl State { }; self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); } - pub fn search(&mut self, req: &mut Request) { + pub fn search(&mut self, mut req: Request) { self.start = req.start; self.string_position = req.start; @@ -208,12 +211,11 @@ impl State { return; } - // let start = self.start; - // let end = self.end; + let mut end = req.end; let mut start_offset = req.string.offset(0, req.start); - let ctx = MatchContext { + let mut ctx = MatchContext { string_position: req.start, string_offset: start_offset, code_position: 0, @@ -224,35 +226,97 @@ impl State { count: -1, }; - // if ctx.peek_code(self, 0) == SreOpcode::INFO as u32 { - // search_op_info(self, &mut ctx); - // if let Some(has_matched) = ctx.has_matched { - // self.has_matched = has_matched; - // return; - // } - // } + if ctx.peek_code(&req, 0) == SreOpcode::INFO as u32 { + /* optimization info block */ + /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ + let req = &mut req; + let min = ctx.peek_code(req, 3) as usize; + + if ctx.remaining_chars(req) < min { + return; + } + + if min > 1 { + /* adjust end point (but make sure we leave at least one + character in there, so literal search will work) */ + // no overflow can happen as remaining chars >= min + end -= min - 1; + + // adjust ctx position + if end < ctx.string_position { + ctx.string_position = end; + ctx.string_offset = req.string.offset(0, ctx.string_position); + } + } + + let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); + + if flags.contains(SreInfo::PREFIX) { + /* pattern starts with a known prefix */ + /* */ + let len = ctx.peek_code(req, 5) as usize; + let skip = ctx.peek_code(req, 6) as usize; + let prefix = &ctx.pattern(req)[7..]; + let overlap = &prefix[len - 1..]; + + if len == 1 { + // pattern starts with a literal character + ctx.skip_code_from(req, 1); + let c = prefix[0]; + req.must_advance = false; + + while !ctx.at_end(req) { + // find the next matched literal + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + + req.start = ctx.string_position; + self.reset(req.start); + // self.start = ctx.string_position; + self.string_position += skip; + + // literal only + if flags.contains(SreInfo::LITERAL) { + self.has_matched = true; + return; + } + + let mut next_ctx = ctx; + next_ctx.skip_char(req, skip); + next_ctx.skip_code(2 * skip); + + self.context_stack.push(next_ctx); + self._match(req); + + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + } + return; + } + } + } self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); req.must_advance = false; - while !self.has_matched && req.start < req.end { + ctx.toplevel = false; + while !self.has_matched && req.start < end { req.start += 1; start_offset = req.string.offset(start_offset, 1); self.reset(req.start); + ctx.string_position = req.start; + ctx.string_offset = start_offset; - let ctx = MatchContext { - string_position: req.start, - string_offset: start_offset, - code_position: 0, - has_matched: None, - toplevel: false, - handler: None, - repeat_ctx_id: usize::MAX, - count: -1, - }; self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); } } } @@ -372,63 +436,6 @@ fn dispatch( } } -/* optimization info block */ -/* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ -// fn search_op_info<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { -// let min = ctx.peek_code(state, 3) as usize; - -// if ctx.remaining_chars(state) < min { -// return ctx.failure(); -// } - -// if min > 1 { -// /* adjust end point (but make sure we leave at least one -// character in there, so literal search will work) */ -// // no overflow can happen as remaining chars >= min -// state.end -= min - 1; - -// // adjust ctx position -// if state.end < ctx.string_position { -// ctx.string_position = state.end; -// ctx.string_offset = state.string.offset(0, ctx.string_position); -// } -// } - -// let flags = SreInfo::from_bits_truncate(ctx.peek_code(state, 2)); - -// if flags.contains(SreInfo::PREFIX) { -// /* pattern starts with a known prefix */ -// /* */ -// let len = ctx.peek_code(state, 5) as usize; -// let skip = ctx.peek_code(state, 6) as usize; -// let prefix = &ctx.pattern(state)[7..]; -// let overlap = &prefix[len - 1..]; - -// ctx.skip_code_from(state, 1); - -// if len == 1 { -// // pattern starts with a literal character -// let c = prefix[0]; -// let end = state.end; - -// while (!ctx.at_end(state)) { -// // find the next matched literal -// while (ctx.peek_char(state) != c) { -// ctx.skip_char(state, 1); -// if (ctx.at_end(state)) { -// return ctx.failure(); -// } -// } - -// // literal only -// if flags.contains(SreInfo::LITERAL) { -// return ctx.success(); -// } -// } -// } -// } -// } - /* assert subpattern */ /* */ fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { diff --git a/tests/tests.rs b/tests/tests.rs index cb11db3483..31f032f0a4 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -21,8 +21,8 @@ fn test_2427() { // START GENERATED by generate_tests.py #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1] }; // END GENERATED - let (mut req, mut state) = lookbehind.state("x"); - state.pymatch(&mut req); + let (req, mut state) = lookbehind.state("x"); + state.pymatch(req); assert!(state.has_matched); } @@ -32,8 +32,8 @@ fn test_assert() { // START GENERATED by generate_tests.py #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1] }; // END GENERATED - let (mut req, mut state) = positive_lookbehind.state("abcdef"); - state.search(&mut req); + let (req, mut state) = positive_lookbehind.state("abcdef"); + state.search(req); assert!(state.has_matched); } @@ -43,8 +43,8 @@ fn test_string_boundaries() { // START GENERATED by generate_tests.py #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1] }; // END GENERATED - let (mut req, mut state) = big_b.state(""); - state.search(&mut req); + let (req, mut state) = big_b.state(""); + state.search(req); assert!(!state.has_matched); } @@ -56,8 +56,8 @@ fn test_zerowidth() { // END GENERATED let (mut req, mut state) = p.state("a:"); req.must_advance = true; - state.search(&mut req); - assert!(state.string_position == 1); + state.search(req); + assert_eq!(state.string_position, 1); } #[test] @@ -67,9 +67,9 @@ fn test_repeat_context_panic() { // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED - let (mut req, mut state) = p.state("axxzaz"); - state.pymatch(&mut req); - assert!(*state.marks == vec![Optioned::some(1), Optioned::some(3)]); + let (req, mut state) = p.state("axxzaz"); + state.pymatch(req); + assert_eq!(*state.marks, vec![Optioned::some(1), Optioned::some(3)]); } #[test] @@ -78,7 +78,30 @@ fn test_double_max_until() { // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1] }; // END GENERATED - let (mut req, mut state) = p.state("1111"); - state.pymatch(&mut req); - assert!(state.string_position == 4); + let (req, mut state) = p.state("1111"); + state.pymatch(req); + assert_eq!(state.string_position, 4); +} + +#[test] +fn test_info_single() { + // pattern p = re.compile(r'aa*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 1, 4294967295, 1, 1, 97, 0, 17, 97, 25, 6, 0, 4294967295, 17, 97, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("baaaa"); + state.search(req); + assert_eq!(state.start, 1); + assert_eq!(state.string_position, 5); +} + +#[test] +fn test_info_single2() { + // pattern p = re.compile(r'Python|Perl') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; + // END GENERATED + let (req, mut state) = p.state("Perl"); + state.search(req); + assert!(state.has_matched); } From 236631141fa3d2681e9a53eb3550c96219cb0cf7 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 19:55:48 +0200 Subject: [PATCH 70/95] impl opinfo literal --- src/engine.rs | 81 +++++++++++++++++++++++++++++++++++++++++++++----- tests/tests.rs | 22 ++++++++++++++ 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 0d645e046d..53181c5043 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -256,13 +256,17 @@ impl State { /* */ let len = ctx.peek_code(req, 5) as usize; let skip = ctx.peek_code(req, 6) as usize; - let prefix = &ctx.pattern(req)[7..]; - let overlap = &prefix[len - 1..]; + let prefix = &ctx.pattern(req)[7..7 + len]; + let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; if len == 1 { // pattern starts with a literal character - ctx.skip_code_from(req, 1); let c = prefix[0]; + + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + req.must_advance = false; while !ctx.at_end(req) { @@ -275,9 +279,8 @@ impl State { } req.start = ctx.string_position; - self.reset(req.start); - // self.start = ctx.string_position; - self.string_position += skip; + self.start = ctx.string_position; + self.string_position = ctx.string_position + skip; // literal only if flags.contains(SreInfo::LITERAL) { @@ -287,7 +290,6 @@ impl State { let mut next_ctx = ctx; next_ctx.skip_char(req, skip); - next_ctx.skip_code(2 * skip); self.context_stack.push(next_ctx); self._match(req); @@ -297,6 +299,71 @@ impl State { } ctx.skip_char(req, 1); + self.marks.clear(); + } + return; + } else if len > 1 { + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + + req.must_advance = false; + + while !ctx.at_end(req) { + let c = prefix[0]; + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + + let mut i = 1; + loop { + if ctx.peek_char(req) == prefix[i] { + i += 1; + if i != len { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + continue; + } + + req.start = ctx.string_position - (len - 1); + self.start = req.start; + self.string_position = self.start + skip; + + if flags.contains(SreInfo::LITERAL) { + self.has_matched = true; + return; + } + + let mut next_ctx = ctx; + // next_ctx.skip_char(req, 1); + next_ctx.string_position = self.string_position; + next_ctx.string_offset = req.string.offset(0, self.string_position); + self.context_stack.push(next_ctx); + self._match(req); + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + self.marks.clear(); + } + i = overlap[i] as usize; + if i == 0 { + break; + } + } } return; } diff --git a/tests/tests.rs b/tests/tests.rs index 31f032f0a4..21bc89d40c 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -105,3 +105,25 @@ fn test_info_single2() { state.search(req); assert!(state.has_matched); } + +#[test] +fn test_info_literal() { + // pattern p = re.compile(r'ababc+') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 14, 1, 5, 4294967295, 4, 4, 97, 98, 97, 98, 0, 0, 1, 2, 17, 97, 17, 98, 17, 97, 17, 98, 25, 6, 1, 4294967295, 17, 99, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("!ababc"); + state.search(req); + assert!(state.has_matched); +} + +#[test] +fn test_info_literal2() { + // pattern p = re.compile(r'(python)\1') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 112, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 112, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; + // END GENERATED + let (req, mut state) = p.state("pythonpython"); + state.search(req); + assert!(state.has_matched); +} \ No newline at end of file From 646c8ac6578977b847e8f871e1f83fb422b3e39a Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:09:51 +0200 Subject: [PATCH 71/95] impl opinfo charset --- src/engine.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/engine.rs b/src/engine.rs index 53181c5043..fe7ee438e9 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -367,6 +367,31 @@ impl State { } return; } + } else if flags.contains(SreInfo::CHARSET) { + let set = &ctx.pattern(req)[5..]; + ctx.skip_code_from(req, 1); + req.must_advance = false; + loop { + while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { + ctx.skip_char(req, 1); + } + if ctx.at_end(req) { + return; + } + req.start = ctx.string_position; + self.start = ctx.string_position; + self.string_position = ctx.string_position; + + self.context_stack.push(ctx); + self._match(req); + + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + self.marks.clear(); + } } } From 7e7b9734810947155876cc52028d873ba953b5f4 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:36:32 +0200 Subject: [PATCH 72/95] clearup --- src/engine.rs | 301 +++++++++++++++++++++++++++----------------------- 1 file changed, 163 insertions(+), 138 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index fe7ee438e9..7a644f0671 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -252,147 +252,16 @@ impl State { let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); if flags.contains(SreInfo::PREFIX) { - /* pattern starts with a known prefix */ - /* */ - let len = ctx.peek_code(req, 5) as usize; - let skip = ctx.peek_code(req, 6) as usize; - let prefix = &ctx.pattern(req)[7..7 + len]; - let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; - - if len == 1 { - // pattern starts with a literal character - let c = prefix[0]; - - // code_position ready for tail match - ctx.skip_code_from(req, 1); - ctx.skip_code(2 * skip); - - req.must_advance = false; - - while !ctx.at_end(req) { - // find the next matched literal - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - } - - req.start = ctx.string_position; - self.start = ctx.string_position; - self.string_position = ctx.string_position + skip; - - // literal only - if flags.contains(SreInfo::LITERAL) { - self.has_matched = true; - return; - } - - let mut next_ctx = ctx; - next_ctx.skip_char(req, skip); - - self.context_stack.push(next_ctx); - self._match(req); - - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - self.marks.clear(); - } - return; - } else if len > 1 { - // code_position ready for tail match - ctx.skip_code_from(req, 1); - ctx.skip_code(2 * skip); - - req.must_advance = false; - - while !ctx.at_end(req) { - let c = prefix[0]; - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - } - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - - let mut i = 1; - loop { - if ctx.peek_char(req) == prefix[i] { - i += 1; - if i != len { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - continue; - } - - req.start = ctx.string_position - (len - 1); - self.start = req.start; - self.string_position = self.start + skip; - - if flags.contains(SreInfo::LITERAL) { - self.has_matched = true; - return; - } - - let mut next_ctx = ctx; - // next_ctx.skip_char(req, 1); - next_ctx.string_position = self.string_position; - next_ctx.string_offset = req.string.offset(0, self.string_position); - self.context_stack.push(next_ctx); - self._match(req); - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - self.marks.clear(); - } - i = overlap[i] as usize; - if i == 0 { - break; - } - } - } - return; + if flags.contains(SreInfo::LITERAL) { + search_info_literal::(req, self, ctx); + } else { + search_info_literal::(req, self, ctx); } + return; } else if flags.contains(SreInfo::CHARSET) { - let set = &ctx.pattern(req)[5..]; - ctx.skip_code_from(req, 1); - req.must_advance = false; - loop { - while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { - ctx.skip_char(req, 1); - } - if ctx.at_end(req) { - return; - } - req.start = ctx.string_position; - self.start = ctx.string_position; - self.string_position = ctx.string_position; - - self.context_stack.push(ctx); - self._match(req); - - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - self.marks.clear(); - } + return search_info_charset(req, self, ctx); } + // fallback to general search } self.context_stack.push(ctx); @@ -528,6 +397,162 @@ fn dispatch( } } +fn search_info_literal( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) { + /* pattern starts with a known prefix */ + /* */ + let len = ctx.peek_code(req, 5) as usize; + let skip = ctx.peek_code(req, 6) as usize; + let prefix = &ctx.pattern(req)[7..7 + len]; + let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; + + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + + req.must_advance = false; + + if len == 1 { + // pattern starts with a literal character + let c = prefix[0]; + + while !ctx.at_end(req) { + // find the next matched literal + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + + req.start = ctx.string_position; + state.start = ctx.string_position; + state.string_position = ctx.string_position + skip; + + // literal only + if LITERAL { + state.has_matched = true; + return; + } + + let mut next_ctx = ctx; + next_ctx.skip_char(req, skip); + + state.context_stack.push(next_ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + state.marks.clear(); + } + } else { + while !ctx.at_end(req) { + let c = prefix[0]; + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + + let mut i = 1; + loop { + if ctx.peek_char(req) == prefix[i] { + i += 1; + if i != len { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + continue; + } + + req.start = ctx.string_position - (len - 1); + state.start = req.start; + state.string_position = state.start + skip; + + // literal only + if LITERAL { + state.has_matched = true; + return; + } + + let mut next_ctx = ctx; + if skip != 0 { + next_ctx.skip_char(req, 1); + } else { + next_ctx.string_position = state.string_position; + next_ctx.string_offset = req.string.offset(0, state.string_position); + } + + state.context_stack.push(next_ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + state.marks.clear(); + } + + i = overlap[i] as usize; + if i == 0 { + break; + } + } + } + } +} + +fn search_info_charset( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) { + let set = &ctx.pattern(req)[5..]; + + ctx.skip_code_from(req, 1); + + req.must_advance = false; + + loop { + while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { + ctx.skip_char(req, 1); + } + if ctx.at_end(req) { + return; + } + + req.start = ctx.string_position; + state.start = ctx.string_position; + state.string_position = ctx.string_position; + + state.context_stack.push(ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + state.marks.clear(); + } +} + /* assert subpattern */ /* */ fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { From 26a78dbaa4e4e78f1e5d8dcea9c32054ae07fdd3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:36:46 +0200 Subject: [PATCH 73/95] update to 0.4.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 373166d6db..53524f1446 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.4.0" +version = "0.4.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 4e6b27144a407bb4daf341048310cad26570a7b3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 21:30:40 +0200 Subject: [PATCH 74/95] introduce SearchIter --- src/engine.rs | 26 ++++++++++++++++++++++++++ tests/tests.rs | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 7a644f0671..7334516c2f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -282,6 +282,32 @@ impl State { } } +pub struct SearchIter<'a, S: StrDrive> { + pub req: Request<'a, S>, + pub state: State, +} + +impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { + type Item = (); + + fn next(&mut self) -> Option { + if self.req.start > self.req.end { + return None; + } + + self.state.reset(self.req.start); + self.state.search(self.req); + if !self.state.has_matched { + return None; + } + + self.req.must_advance = self.state.string_position == self.state.start; + self.req.start = self.state.string_position; + + Some(()) + } +} + fn dispatch( req: &Request, state: &mut State, diff --git a/tests/tests.rs b/tests/tests.rs index 21bc89d40c..5212226f4e 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -126,4 +126,4 @@ fn test_info_literal2() { let (req, mut state) = p.state("pythonpython"); state.search(req); assert!(state.has_matched); -} \ No newline at end of file +} From 285ba765a70ab243ee0f881604e60b999c5771af Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Sat, 7 Oct 2023 14:40:45 +0900 Subject: [PATCH 75/95] 0.4.2 with dependency update --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 53524f1446..30e403b54c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "sre-engine" -version = "0.4.1" +version = "0.4.2" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" license = "MIT" -edition = "2018" +edition = "2021" keywords = ["regex"] include = ["LICENSE", "src/**/*.rs"] [dependencies] -num_enum = "0.5" -bitflags = "1.2" +num_enum = "0.5.9" +bitflags = "2" optional = "0.5" From a777d22a537bdda61a7df5f4c6af6369de1cadc6 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 8 Dec 2023 22:36:46 +0200 Subject: [PATCH 76/95] fix _count --- src/constants.rs | 1 + src/engine.rs | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index f3962b339a..0d5bb41939 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -101,6 +101,7 @@ pub enum SreCatCode { UNI_NOT_LINEBREAK = 17, } bitflags! { + #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct SreFlag: u16 { const TEMPLATE = 1; const IGNORECASE = 2; diff --git a/src/engine.rs b/src/engine.rs index 7334516c2f..f49560c2c6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -672,7 +672,9 @@ fn op_min_repeat_one( ctx.count = if min_count == 0 { 0 } else { - let count = _count(req, state, ctx, min_count); + let mut next_ctx = *ctx; + next_ctx.skip_code(4); + let count = _count(req, state, next_ctx, min_count); if count < min_count { return ctx.failure(); } @@ -713,7 +715,9 @@ fn op_min_repeat_one( state.string_position = ctx.string_position; - if _count(req, state, ctx, 1) == 0 { + let mut next_ctx = *ctx; + next_ctx.skip_code(4); + if _count(req, state, next_ctx, 1) == 0 { state.marks.pop_discard(); return ctx.failure(); } @@ -741,7 +745,9 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut state.string_position = ctx.string_position; - let count = _count(req, state, ctx, max_count); + let mut next_ctx = *ctx; + next_ctx.skip_code(4); + let count = _count(req, state, next_ctx, max_count); ctx.skip_char(req, count); if count < min_count { return ctx.failure(); @@ -1428,17 +1434,16 @@ fn charset(set: &[u32], ch: u32) -> bool { fn _count( req: &Request, state: &mut State, - ctx: &MatchContext, + mut ctx: MatchContext, max_count: usize, ) -> usize { - let mut ctx = *ctx; let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); let end = ctx.string_position + max_count; let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); match opcode { SreOpcode::ANY => { - while !ctx.string_position < end && !ctx.at_linebreak(req) { + while ctx.string_position < end && !ctx.at_linebreak(req) { ctx.skip_char(req, 1); } } @@ -1446,8 +1451,7 @@ fn _count( ctx.skip_char(req, max_count); } SreOpcode::IN => { - while !ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) - { + while ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) { ctx.skip_char(req, 1); } } @@ -1483,7 +1487,6 @@ fn _count( /* General case */ let mut count = 0; - ctx.skip_code(4); let reset_position = ctx.code_position; while count < max_count { @@ -1511,7 +1514,7 @@ fn general_count_literal bool>( mut f: F, ) { let ch = ctx.peek_code(req, 1); - while !ctx.string_position < end && f(ch, ctx.peek_char(req)) { + while ctx.string_position < end && f(ch, ctx.peek_char(req)) { ctx.skip_char(req, 1); } } From d73cc5f58c94e2589efadf1b21f4b5a811869682 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 8 Dec 2023 22:42:38 +0200 Subject: [PATCH 77/95] update version and dependency --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 30e403b54c..de1d68cf6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.4.2" +version = "0.4.3" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" @@ -10,6 +10,6 @@ keywords = ["regex"] include = ["LICENSE", "src/**/*.rs"] [dependencies] -num_enum = "0.5.9" +num_enum = "0.7" bitflags = "2" optional = "0.5" From 9070e12e0df3ece6949dc3d9d650fa2d37d8eb54 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 10 Dec 2023 20:50:43 +0200 Subject: [PATCH 78/95] refactor _match with nest loop --- benches/benches.rs | 28 +- src/engine.rs | 894 +++++++++++++++++++++++++++++++++------------ tests/tests.rs | 30 +- 3 files changed, 677 insertions(+), 275 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index 604cf91f42..fe470d023c 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -10,10 +10,7 @@ struct Pattern { } impl Pattern { - fn state<'a, S: engine::StrDrive>( - &self, - string: S, - ) -> (engine::Request<'a, S>, engine::State) { + fn state<'a, S: engine::StrDrive>(&self, string: S) -> (engine::Request<'a, S>, engine::State) { self.state_range(string, 0..usize::MAX) } @@ -21,7 +18,7 @@ impl Pattern { &self, string: S, range: std::ops::Range, - ) -> (engine::Request<'a, S>, engine::State) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); let state = engine::State::default(); (req, state) @@ -93,29 +90,22 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { let (req, mut state) = p.state(s.clone()); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); let (req, mut state) = p.state(s.clone()); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let (mut req, mut state) = p.state(s.clone()); req.match_all = true; - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); let (req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); let (req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let (req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); req.match_all = true; - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); } }) } diff --git a/src/engine.rs b/src/engine.rs index f49560c2c6..7474f29013 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -6,14 +6,13 @@ use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; -use std::ops::Deref; const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } #[derive(Debug, Clone, Copy)] -pub struct Request<'a, S: StrDrive> { +pub struct Request<'a, S> { pub string: S, pub start: usize, pub end: usize, @@ -61,14 +60,6 @@ impl Default for Marks { } } -impl Deref for Marks { - type Target = Vec>; - - fn deref(&self) -> &Self::Target { - &self.marks - } -} - impl Marks { pub fn get(&self, group_index: usize) -> (Optioned, Optioned) { let marks_index = 2 * group_index; @@ -83,6 +74,10 @@ impl Marks { self.last_index } + pub fn raw(&self) -> &[Optioned] { + self.marks.as_slice() + } + fn set(&mut self, mark_nr: usize, position: usize) { if mark_nr & 1 != 0 { self.last_index = mark_nr as isize / 2 + 1; @@ -120,70 +115,23 @@ impl Marks { } } -#[derive(Debug)] -pub struct State { - pub marks: Marks, - context_stack: Vec>, - repeat_stack: Vec, +#[derive(Debug, Default)] +pub struct State { pub start: usize, + pub marks: Marks, pub string_position: usize, - next_context: Option>, - popped_has_matched: bool, - pub has_matched: bool, -} - -impl Default for State { - fn default() -> Self { - Self { - marks: Marks::default(), - context_stack: Vec::new(), - repeat_stack: Vec::new(), - start: 0, - string_position: 0, - next_context: None, - popped_has_matched: false, - has_matched: false, - } - } + repeat_stack: Vec, } -impl State { +impl State { pub fn reset(&mut self, start: usize) { self.marks.clear(); - self.context_stack.clear(); self.repeat_stack.clear(); self.start = start; self.string_position = start; - self.next_context = None; - self.popped_has_matched = false; - self.has_matched = false; - } - - fn _match(&mut self, req: &mut Request) { - while let Some(mut ctx) = self.context_stack.pop() { - if let Some(handler) = ctx.handler.take() { - handler(req, self, &mut ctx); - } else if ctx.remaining_codes(req) > 0 { - let code = ctx.peek_code(req, 0); - let code = SreOpcode::try_from(code).unwrap(); - dispatch(req, self, &mut ctx, code); - } else { - ctx.failure(); - } - - if let Some(has_matched) = ctx.has_matched { - self.popped_has_matched = has_matched; - } else { - self.context_stack.push(ctx); - if let Some(next_ctx) = self.next_context.take() { - self.context_stack.push(next_ctx); - } - } - } - self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, mut req: Request) { + pub fn pymatch(&mut self, req: &Request) -> bool { self.start = req.start; self.string_position = req.start; @@ -191,24 +139,20 @@ impl State { string_position: req.start, string_offset: req.string.offset(0, req.start), code_position: 0, - has_matched: None, toplevel: true, - handler: None, + jump: Jump::OpCode, repeat_ctx_id: usize::MAX, count: -1, }; - self.context_stack.push(ctx); - - self._match(&mut req); + _match(&req, self, ctx) } - pub fn search(&mut self, mut req: Request) { + pub fn search(&mut self, mut req: Request) -> bool { self.start = req.start; self.string_position = req.start; - // TODO: optimize by op info and skip prefix if req.start > req.end { - return; + return false; } let mut end = req.end; @@ -219,9 +163,8 @@ impl State { string_position: req.start, string_offset: start_offset, code_position: 0, - has_matched: None, toplevel: true, - handler: None, + jump: Jump::OpCode, repeat_ctx_id: usize::MAX, count: -1, }; @@ -229,11 +172,10 @@ impl State { if ctx.peek_code(&req, 0) == SreOpcode::INFO as u32 { /* optimization info block */ /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ - let req = &mut req; - let min = ctx.peek_code(req, 3) as usize; + let min = ctx.peek_code(&req, 3) as usize; - if ctx.remaining_chars(req) < min { - return; + if ctx.remaining_chars(&req) < min { + return false; } if min > 1 { @@ -249,42 +191,44 @@ impl State { } } - let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); + let flags = SreInfo::from_bits_truncate(ctx.peek_code(&req, 2)); if flags.contains(SreInfo::PREFIX) { if flags.contains(SreInfo::LITERAL) { - search_info_literal::(req, self, ctx); + return search_info_literal::(&mut req, self, ctx); } else { - search_info_literal::(req, self, ctx); + return search_info_literal::(&mut req, self, ctx); } - return; } else if flags.contains(SreInfo::CHARSET) { - return search_info_charset(req, self, ctx); + return search_info_charset(&mut req, self, ctx); } // fallback to general search } - self.context_stack.push(ctx); - self._match(&mut req); + if _match(&req, self, ctx) { + return true; + } req.must_advance = false; ctx.toplevel = false; - while !self.has_matched && req.start < end { + while req.start < end { req.start += 1; start_offset = req.string.offset(start_offset, 1); self.reset(req.start); ctx.string_position = req.start; ctx.string_offset = start_offset; - self.context_stack.push(ctx); - self._match(&mut req); + if _match(&req, self, ctx) { + return true; + } } + false } } pub struct SearchIter<'a, S: StrDrive> { pub req: Request<'a, S>, - pub state: State, + pub state: State, } impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { @@ -296,8 +240,7 @@ impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { } self.state.reset(self.req.start); - self.state.search(self.req); - if !self.state.has_matched { + if !self.state.search(self.req) { return None; } @@ -308,10 +251,537 @@ impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { } } +#[derive(Debug, Clone, Copy)] +enum Jump { + OpCode, + Assert1, + AssertNot1, + Branch1, + Branch2, + Repeat1, + UntilBacktrace, + MaxUntil2, + MaxUntil3, + MinUntil1, + RepeatOne1, + RepeatOne2, + MinRepeatOne1, + MinRepeatOne2, +} + +fn _match(req: &Request, state: &mut State, ctx: MatchContext) -> bool { + let mut context_stack = vec![ctx]; + let mut popped_result = false; + + 'coro: loop { + let Some(mut ctx) = context_stack.pop() else { + break; + }; + + popped_result = 'result: loop { + let yield_ = 'context: loop { + match ctx.jump { + Jump::OpCode => {} + Jump::Assert1 => { + if popped_result { + ctx.skip_code_from(req, 1); + } else { + break 'result false; + } + } + Jump::AssertNot1 => { + if popped_result { + break 'result false; + } + ctx.skip_code_from(req, 1); + } + Jump::Branch1 => { + let branch_offset = ctx.count as usize; + let next_length = ctx.peek_code(req, branch_offset) as isize; + if next_length == 0 { + state.marks.pop_discard(); + break 'result false; + } + state.string_position = ctx.string_position; + let next_ctx = ctx.next_offset(branch_offset + 1, Jump::Branch2); + ctx.count += next_length; + break 'context next_ctx; + } + Jump::Branch2 => { + if popped_result { + break 'result true; + } + state.marks.pop_keep(); + ctx.jump = Jump::Branch1; + continue 'context; + } + Jump::Repeat1 => { + state.repeat_stack.pop(); + break 'result popped_result; + } + Jump::UntilBacktrace => { + if !popped_result { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + } + break 'result popped_result; + } + Jump::MaxUntil2 => { + let save_last_position = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + + if popped_result { + state.marks.pop_discard(); + break 'result true; + } + + state.marks.pop(); + repeat_ctx.count -= 1; + state.string_position = ctx.string_position; + + /* cannot match more repeated items here. make sure the + tail matches */ + let mut next_ctx = ctx.next_offset(1, Jump::MaxUntil3); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + Jump::MaxUntil3 => { + if !popped_result { + state.string_position = ctx.string_position; + } + break 'result popped_result; + } + Jump::MinUntil1 => { + if popped_result { + break 'result true; + } + ctx.repeat_ctx_id = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + state.string_position = ctx.string_position; + state.marks.pop(); + + // match more until tail matches + if repeat_ctx.count as usize >= repeat_ctx.max_count + && repeat_ctx.max_count != MAXREPEAT + || state.string_position == repeat_ctx.last_position + { + repeat_ctx.count -= 1; + break 'result false; + } + + /* zero-width match protection */ + repeat_ctx.last_position = state.string_position; + + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + Jump::RepeatOne1 => { + let min_count = ctx.peek_code(req, 2) as isize; + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::LITERAL as u32 { + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); + while ctx.at_end(req) || ctx.peek_char(req) != c { + if ctx.count <= min_count { + state.marks.pop_discard(); + break 'result false; + } + ctx.back_skip_char(req, 1); + ctx.count -= 1; + } + } + + state.string_position = ctx.string_position; + // General case: backtracking + break 'context ctx.next_peek_from(1, req, Jump::RepeatOne2); + } + Jump::RepeatOne2 => { + if popped_result { + break 'result true; + } + + let min_count = ctx.peek_code(req, 2) as isize; + if ctx.count <= min_count { + state.marks.pop_discard(); + break 'result false; + } + + ctx.back_skip_char(req, 1); + ctx.count -= 1; + + state.marks.pop_keep(); + ctx.jump = Jump::RepeatOne1; + continue 'context; + } + Jump::MinRepeatOne1 => { + let max_count = ctx.peek_code(req, 3) as usize; + if max_count == MAXREPEAT || ctx.count as usize <= max_count { + state.string_position = ctx.string_position; + break 'context ctx.next_peek_from(1, req, Jump::MinRepeatOne2); + } else { + state.marks.pop_discard(); + break 'result false; + } + } + Jump::MinRepeatOne2 => { + if popped_result { + break 'result true; + } + + state.string_position = ctx.string_position; + + let mut count_ctx = ctx; + count_ctx.skip_code(4); + if _count(req, state, count_ctx, 1) == 0 { + state.marks.pop_discard(); + break 'result false; + } + + ctx.skip_char(req, 1); + ctx.count += 1; + state.marks.pop_keep(); + ctx.jump = Jump::MinRepeatOne1; + continue 'context; + } + } + ctx.jump = Jump::OpCode; + + loop { + macro_rules! general_op_literal { + ($f:expr) => {{ + if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char(req)) { + break 'result false; + } + ctx.skip_code(2); + ctx.skip_char(req, 1); + }}; + } + + macro_rules! general_op_in { + ($f:expr) => {{ + if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { + break 'result false; + } + ctx.skip_code_from(req, 1); + ctx.skip_char(req, 1); + }}; + } + + macro_rules! general_op_groupref { + ($f:expr) => {{ + let (group_start, group_end) = + state.marks.get(ctx.peek_code(req, 1) as usize); + let (group_start, group_end) = if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + (group_start.unpack(), group_end.unpack()) + } else { + break 'result false; + }; + + let mut gctx = MatchContext { + string_position: group_start, + string_offset: req.string.offset(0, group_start), + ..ctx + }; + + for _ in group_start..group_end { + if ctx.at_end(req) + || $f(ctx.peek_char(req)) != $f(gctx.peek_char(req)) + { + break 'result false; + } + ctx.skip_char(req, 1); + gctx.skip_char(req, 1); + } + + ctx.skip_code(2); + }}; + } + + if ctx.remaining_codes(req) == 0 { + break 'result false; + } + let opcode = ctx.peek_code(req, 0); + let opcode = SreOpcode::try_from(opcode).unwrap(); + + match opcode { + SreOpcode::FAILURE => break 'result false, + SreOpcode::SUCCESS => { + if ctx.can_success(req) { + state.string_position = ctx.string_position; + break 'result true; + } + break 'result false; + } + SreOpcode::ANY => { + if ctx.at_end(req) || ctx.at_linebreak(req) { + break 'result false; + } + ctx.skip_code(1); + ctx.skip_char(req, 1); + } + SreOpcode::ANY_ALL => { + if ctx.at_end(req) { + break 'result false; + } + ctx.skip_code(1); + ctx.skip_char(req, 1); + } + SreOpcode::ASSERT => { + let back = ctx.peek_code(req, 2) as usize; + if ctx.string_position < back { + break 'result false; + } + + let mut next_ctx = ctx.next_offset(3, Jump::Assert1); + next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; + break 'context next_ctx; + } + SreOpcode::ASSERT_NOT => { + let back = ctx.peek_code(req, 2) as usize; + if ctx.string_position < back { + ctx.skip_code_from(req, 1); + continue; + } + + let mut next_ctx = ctx.next_offset(3, Jump::AssertNot1); + next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; + break 'context next_ctx; + } + SreOpcode::AT => { + let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if at(req, &ctx, atcode) { + ctx.skip_code(2); + } else { + break 'result false; + } + } + SreOpcode::BRANCH => { + state.marks.push(); + ctx.count = 1; + ctx.jump = Jump::Branch1; + continue 'context; + } + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { + break 'result false; + } + ctx.skip_code(2); + ctx.skip_char(req, 1); + } + SreOpcode::IN => general_op_in!(charset), + SreOpcode::IN_IGNORE => { + general_op_in!(|set, c| charset(set, lower_ascii(c))) + } + SreOpcode::IN_UNI_IGNORE => { + general_op_in!(|set, c| charset(set, lower_unicode(c))) + } + SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore), + SreOpcode::INFO => { + let min = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min { + break 'result false; + } + ctx.skip_code_from(req, 1); + } + SreOpcode::MARK => { + state + .marks + .set(ctx.peek_code(req, 1) as usize, ctx.string_position); + ctx.skip_code(2); + } + SreOpcode::JUMP => ctx.skip_code_from(req, 1), + SreOpcode::REPEAT => { + let repeat_ctx = RepeatContext { + count: -1, + min_count: ctx.peek_code(req, 2) as usize, + max_count: ctx.peek_code(req, 3) as usize, + code_position: ctx.code_position, + last_position: std::usize::MAX, + prev_id: ctx.repeat_ctx_id, + }; + state.repeat_stack.push(repeat_ctx); + let repeat_ctx_id = state.repeat_stack.len() - 1; + state.string_position = ctx.string_position; + let mut next_ctx = ctx.next_peek_from(1, req, Jump::Repeat1); + next_ctx.repeat_ctx_id = repeat_ctx_id; + break 'context next_ctx; + } + SreOpcode::MAX_UNTIL => { + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + state.string_position = ctx.string_position; + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count + || repeat_ctx.max_count == MAXREPEAT) + && state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + state.marks.push(); + ctx.count = repeat_ctx.last_position as isize; + repeat_ctx.last_position = state.string_position; + + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::MaxUntil2); + } + + /* cannot match more repeated items here. make sure the + tail matches */ + let mut next_ctx = ctx.next_offset(1, Jump::MaxUntil3); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + SreOpcode::MIN_UNTIL => { + let repeat_ctx = state.repeat_stack.last_mut().unwrap(); + state.string_position = ctx.string_position; + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + + state.marks.push(); + ctx.count = ctx.repeat_ctx_id as isize; + let mut next_ctx = ctx.next_offset(1, Jump::MinUntil1); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + SreOpcode::REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; + + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + + state.string_position = ctx.string_position; + + let mut next_ctx = ctx; + next_ctx.skip_code(4); + let count = _count(req, state, next_ctx, max_count); + ctx.skip_char(req, count); + if count < min_count { + break 'result false; + } + + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { + // tail is empty. we're finished + state.string_position = ctx.string_position; + break 'result true; + } + + state.marks.push(); + ctx.count = count as isize; + ctx.jump = Jump::RepeatOne1; + continue 'context; + } + SreOpcode::MIN_REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + + state.string_position = ctx.string_position; + ctx.count = if min_count == 0 { + 0 + } else { + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, count_ctx, min_count); + if count < min_count { + break 'result false; + } + ctx.skip_char(req, count); + count as isize + }; + + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { + // tail is empty. we're finished + state.string_position = ctx.string_position; + break 'result true; + } + + state.marks.push(); + ctx.jump = Jump::MinRepeatOne1; + continue 'context; + } + SreOpcode::LITERAL => general_op_literal!(|code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c), + SreOpcode::LITERAL_IGNORE => { + general_op_literal!(|code, c| code == lower_ascii(c)) + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal!(|code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal!(|code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal!(|code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal!(|code, c| !char_loc_ignore(code, c)) + } + SreOpcode::GROUPREF => general_op_groupref!(|x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref!(lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref!(lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref!(lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let (group_start, group_end) = + state.marks.get(ctx.peek_code(req, 1) as usize); + if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + ctx.skip_code(3); + } else { + ctx.skip_code_from(req, 2) + } + } + SreOpcode::CALL => todo!(), + SreOpcode::CHARSET => todo!(), + SreOpcode::BIGCHARSET => todo!(), + SreOpcode::NEGATE => todo!(), + SreOpcode::RANGE => todo!(), + SreOpcode::RANGE_UNI_IGNORE => todo!(), + SreOpcode::SUBPATTERN => todo!(), + } + } + }; + context_stack.push(ctx); + context_stack.push(yield_); + continue 'coro; + }; + } + popped_result +} + +/* fn dispatch( req: &Request, - state: &mut State, - ctx: &mut MatchContext, + state: &mut State, + ctx: &mut MatchContext, opcode: SreOpcode, ) { match opcode { @@ -422,12 +892,13 @@ fn dispatch( _ => unreachable!("unexpected opcode"), } } +*/ fn search_info_literal( req: &mut Request, - state: &mut State, - mut ctx: MatchContext, -) { + state: &mut State, + mut ctx: MatchContext, +) -> bool { /* pattern starts with a known prefix */ /* */ let len = ctx.peek_code(req, 5) as usize; @@ -450,7 +921,7 @@ fn search_info_literal( while ctx.peek_char(req) != c { ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } } @@ -460,18 +931,14 @@ fn search_info_literal( // literal only if LITERAL { - state.has_matched = true; - return; + return true; } let mut next_ctx = ctx; next_ctx.skip_char(req, skip); - state.context_stack.push(next_ctx); - state._match(req); - - if state.has_matched { - return; + if _match(req, state, next_ctx) { + return true; } ctx.skip_char(req, 1); @@ -483,12 +950,12 @@ fn search_info_literal( while ctx.peek_char(req) != c { ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } } ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } let mut i = 1; @@ -498,7 +965,7 @@ fn search_info_literal( if i != len { ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } continue; } @@ -509,8 +976,7 @@ fn search_info_literal( // literal only if LITERAL { - state.has_matched = true; - return; + return true; } let mut next_ctx = ctx; @@ -521,16 +987,13 @@ fn search_info_literal( next_ctx.string_offset = req.string.offset(0, state.string_position); } - state.context_stack.push(next_ctx); - state._match(req); - - if state.has_matched { - return; + if _match(req, state, next_ctx) { + return true; } ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } state.marks.clear(); } @@ -542,13 +1005,14 @@ fn search_info_literal( } } } + false } fn search_info_charset( req: &mut Request, - state: &mut State, - mut ctx: MatchContext, -) { + state: &mut State, + mut ctx: MatchContext, +) -> bool { let set = &ctx.pattern(req)[5..]; ctx.skip_code_from(req, 1); @@ -560,18 +1024,15 @@ fn search_info_charset( ctx.skip_char(req, 1); } if ctx.at_end(req) { - return; + return false; } req.start = ctx.string_position; state.start = ctx.string_position; state.string_position = ctx.string_position; - state.context_stack.push(ctx); - state._match(req); - - if state.has_matched { - return; + if _match(req, state, ctx) { + return true; } ctx.skip_char(req, 1); @@ -579,9 +1040,10 @@ fn search_info_charset( } } +/* /* assert subpattern */ /* */ -fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.failure(); @@ -601,7 +1063,7 @@ fn op_assert(req: &Request, state: &mut State, ctx: &mut Matc /* assert not subpattern */ /* */ -fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { @@ -622,17 +1084,13 @@ fn op_assert_not(req: &Request, state: &mut State, ctx: &mut // alternation // <0=skip> code ... -fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { state.marks.push(); ctx.count = 1; create_context(req, state, ctx); - fn create_context( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - ) { + fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { @@ -646,7 +1104,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc ctx.next_offset(branch_offset + 1, state, callback); } - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -656,11 +1114,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, -) { +fn op_min_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as usize; if ctx.remaining_chars(req) < min_count { @@ -692,23 +1146,19 @@ fn op_min_repeat_one( state.marks.push(); create_context(req, state, ctx); - fn create_context( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - ) { + fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { let max_count = ctx.peek_code(req, 3) as usize; if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - ctx.next_from(1, req, state, callback); + ctx.next_peek_from(1, req, state, callback); } else { state.marks.pop_discard(); ctx.failure(); } } - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -735,7 +1185,7 @@ exactly one character wide, and we're not already collecting backtracking points. for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as usize; let max_count = ctx.peek_code(req, 3) as usize; @@ -764,11 +1214,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut ctx.count = count as isize; create_context(req, state, ctx); - fn create_context( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - ) { + fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as isize; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::LITERAL as u32 { @@ -788,10 +1234,10 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut state.string_position = ctx.string_position; // General case: backtracking - ctx.next_from(1, req, state, callback); + ctx.next_peek_from(1, req, state, callback); } - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -810,6 +1256,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut create_context(req, state, ctx); } } +*/ #[derive(Debug, Clone, Copy)] struct RepeatContext { @@ -821,10 +1268,11 @@ struct RepeatContext { prev_id: usize, } +/* /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = RepeatContext { count: -1, min_count: ctx.peek_code(req, 2) as usize, @@ -840,7 +1288,7 @@ fn op_repeat(req: &Request, state: &mut State, ctx: &mut Matc let repeat_ctx_id = state.repeat_stack.len() - 1; - let next_ctx = ctx.next_from(1, req, state, |_, state, ctx| { + let next_ctx = ctx.next_peek_from(1, req, state, |_, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); @@ -848,7 +1296,7 @@ fn op_repeat(req: &Request, state: &mut State, ctx: &mut Matc } /* minimizing repeat */ -fn op_min_until(state: &mut State, ctx: &mut MatchContext) { +fn op_min_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = state.repeat_stack.last_mut().unwrap(); state.string_position = ctx.string_position; @@ -915,7 +1363,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { } /* maximizing repeat */ -fn op_max_until(state: &mut State, ctx: &mut MatchContext) { +fn op_max_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; state.string_position = ctx.string_position; @@ -976,7 +1424,7 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { let next_ctx = ctx.next_offset(1, state, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { + fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { ctx.success(); } else { @@ -985,6 +1433,7 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { } } } +*/ pub trait StrDrive: Copy { fn offset(&self, offset: usize, skip: usize) -> usize; @@ -1061,67 +1510,49 @@ impl<'a> StrDrive for &'a [u8] { } } -type OpFunc = for<'a> fn(&Request<'a, S>, &mut State, &mut MatchContext); - #[derive(Clone, Copy)] -struct MatchContext { +struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, - has_matched: Option, toplevel: bool, - handler: Option>, + jump: Jump, repeat_ctx_id: usize, count: isize, } -impl std::fmt::Debug for MatchContext { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("MatchContext") - .field("string_position", &self.string_position) - .field("string_offset", &self.string_offset) - .field("code_position", &self.code_position) - .field("has_matched", &self.has_matched) - .field("toplevel", &self.toplevel) - .field("handler", &self.handler.map(|x| x as usize)) - .field("repeat_ctx_id", &self.repeat_ctx_id) - .field("count", &self.count) - .finish() - } -} - -impl MatchContext { - fn pattern<'a>(&self, req: &Request<'a, S>) -> &'a [u32] { +impl MatchContext { + fn pattern<'a, S>(&self, req: &Request<'a, S>) -> &'a [u32] { &req.pattern_codes[self.code_position..] } - fn remaining_codes(&self, req: &Request) -> usize { + fn remaining_codes(&self, req: &Request) -> usize { req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, req: &Request) -> usize { + fn remaining_chars(&self, req: &Request) -> usize { req.end - self.string_position } - fn peek_char(&self, req: &Request) -> u32 { + fn peek_char(&self, req: &Request) -> u32 { req.string.peek(self.string_offset) } - fn skip_char(&mut self, req: &Request, skip: usize) { + fn skip_char(&mut self, req: &Request, skip: usize) { self.string_position += skip; self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, req: &Request) -> u32 { + fn back_peek_char(&self, req: &Request) -> u32 { req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, req: &Request, skip: usize) { + fn back_skip_char(&mut self, req: &Request, skip: usize) { self.string_position -= skip; self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, req: &Request, peek: usize) -> u32 { + fn peek_code(&self, req: &Request, peek: usize) -> u32 { req.pattern_codes[self.code_position + peek] } @@ -1129,7 +1560,7 @@ impl MatchContext { self.code_position += skip; } - fn skip_code_from(&mut self, req: &Request, peek: usize) { + fn skip_code_from(&mut self, req: &Request, peek: usize) { self.skip_code(self.peek_code(req, peek) as usize + 1); } @@ -1138,15 +1569,19 @@ impl MatchContext { self.string_position == 0 } - fn at_end(&self, req: &Request) -> bool { + fn at_end(&self, req: &Request) -> bool { self.string_position == req.end } - fn at_linebreak(&self, req: &Request) -> bool { + fn at_linebreak(&self, req: &Request) -> bool { !self.at_end(req) && is_linebreak(self.peek_char(req)) } - fn at_boundary bool>(&self, req: &Request, mut word_checker: F) -> bool { + fn at_boundary bool>( + &self, + req: &Request, + mut word_checker: F, + ) -> bool { if self.at_beginning() && self.at_end(req) { return false; } @@ -1155,7 +1590,7 @@ impl MatchContext { this != that } - fn at_non_boundary bool>( + fn at_non_boundary bool>( &self, req: &Request, mut word_checker: F, @@ -1168,7 +1603,7 @@ impl MatchContext { this == that } - fn can_success(&self, req: &Request) -> bool { + fn can_success(&self, req: &Request) -> bool { if !self.toplevel { return true; } @@ -1181,51 +1616,29 @@ impl MatchContext { true } - fn success(&mut self) { - self.has_matched = Some(true); + #[must_use] + fn next_peek_from(&mut self, peek: usize, req: &Request, jump: Jump) -> Self { + self.next_offset(self.peek_code(req, peek) as usize + 1, jump) } - fn failure(&mut self) { - self.has_matched = Some(false); + #[must_use] + fn next_offset(&mut self, offset: usize, jump: Jump) -> Self { + self.next_at(self.code_position + offset, jump) } - fn next_from<'b>( - &mut self, - peek: usize, - req: &Request, - state: &'b mut State, - f: OpFunc, - ) -> &'b mut Self { - self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) - } - - fn next_offset<'b>( - &mut self, - offset: usize, - state: &'b mut State, - f: OpFunc, - ) -> &'b mut Self { - self.next_at(self.code_position + offset, state, f) - } - - fn next_at<'b>( - &mut self, - code_position: usize, - state: &'b mut State, - f: OpFunc, - ) -> &'b mut Self { - self.handler = Some(f); - state.next_context.insert(MatchContext { + #[must_use] + fn next_at(&mut self, code_position: usize, jump: Jump) -> Self { + self.jump = jump; + MatchContext { code_position, - has_matched: None, - handler: None, + jump: Jump::OpCode, count: -1, ..*self - }) + } } } -fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { +fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), @@ -1243,9 +1656,10 @@ fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) - } } +/* fn general_op_literal bool>( req: &Request, - ctx: &mut MatchContext, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { @@ -1258,7 +1672,7 @@ fn general_op_literal bool>( fn general_op_in bool>( req: &Request, - ctx: &mut MatchContext, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { @@ -1271,8 +1685,8 @@ fn general_op_in bool>( fn general_op_groupref u32>( req: &Request, - state: &State, - ctx: &mut MatchContext, + state: &State, + ctx: &mut MatchContext, mut f: F, ) { let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); @@ -1301,6 +1715,7 @@ fn general_op_groupref u32>( ctx.skip_code(2); } +*/ fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1433,8 +1848,8 @@ fn charset(set: &[u32], ch: u32) -> bool { fn _count( req: &Request, - state: &mut State, - mut ctx: MatchContext, + state: &mut State, + mut ctx: MatchContext, max_count: usize, ) -> usize { let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); @@ -1491,12 +1906,15 @@ fn _count( while count < max_count { ctx.code_position = reset_position; - let code = ctx.peek_code(req, 0); - let code = SreOpcode::try_from(code).unwrap(); - dispatch(req, state, &mut ctx, code); - if ctx.has_matched == Some(false) { + if !_match(req, state, ctx) { break; } + // let code = ctx.peek_code(req, 0); + // let code = SreOpcode::try_from(code).unwrap(); + // dispatch(req, state, &mut ctx, code); + // if ctx.has_matched == Some(false) { + // break; + // } count += 1; } return count; @@ -1509,7 +1927,7 @@ fn _count( fn general_count_literal bool>( req: &Request, - ctx: &mut MatchContext, + ctx: &mut MatchContext, end: usize, mut f: F, ) { diff --git a/tests/tests.rs b/tests/tests.rs index 5212226f4e..4e282a6f97 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,7 +8,7 @@ impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - ) -> (engine::Request<'a, S>, engine::State) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); let state = engine::State::default(); (req, state) @@ -22,8 +22,7 @@ fn test_2427() { #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1] }; // END GENERATED let (req, mut state) = lookbehind.state("x"); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); } #[test] @@ -33,8 +32,7 @@ fn test_assert() { #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1] }; // END GENERATED let (req, mut state) = positive_lookbehind.state("abcdef"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } #[test] @@ -44,8 +42,7 @@ fn test_string_boundaries() { #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1] }; // END GENERATED let (req, mut state) = big_b.state(""); - state.search(req); - assert!(!state.has_matched); + assert!(!state.search(req)); } #[test] @@ -56,7 +53,7 @@ fn test_zerowidth() { // END GENERATED let (mut req, mut state) = p.state("a:"); req.must_advance = true; - state.search(req); + assert!(state.search(req)); assert_eq!(state.string_position, 1); } @@ -68,8 +65,8 @@ fn test_repeat_context_panic() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED let (req, mut state) = p.state("axxzaz"); - state.pymatch(req); - assert_eq!(*state.marks, vec![Optioned::some(1), Optioned::some(3)]); + assert!(state.pymatch(&req)); + assert_eq!(*state.marks.raw(), vec![Optioned::some(1), Optioned::some(3)]); } #[test] @@ -79,7 +76,7 @@ fn test_double_max_until() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1] }; // END GENERATED let (req, mut state) = p.state("1111"); - state.pymatch(req); + assert!(state.pymatch(&req)); assert_eq!(state.string_position, 4); } @@ -90,7 +87,7 @@ fn test_info_single() { #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 1, 4294967295, 1, 1, 97, 0, 17, 97, 25, 6, 0, 4294967295, 17, 97, 1, 1] }; // END GENERATED let (req, mut state) = p.state("baaaa"); - state.search(req); + assert!(state.search(req)); assert_eq!(state.start, 1); assert_eq!(state.string_position, 5); } @@ -102,8 +99,7 @@ fn test_info_single2() { #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; // END GENERATED let (req, mut state) = p.state("Perl"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } #[test] @@ -113,8 +109,7 @@ fn test_info_literal() { #[rustfmt::skip] let p = Pattern { code: &[15, 14, 1, 5, 4294967295, 4, 4, 97, 98, 97, 98, 0, 0, 1, 2, 17, 97, 17, 98, 17, 97, 17, 98, 25, 6, 1, 4294967295, 17, 99, 1, 1] }; // END GENERATED let (req, mut state) = p.state("!ababc"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } #[test] @@ -124,6 +119,5 @@ fn test_info_literal2() { #[rustfmt::skip] let p = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 112, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 112, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; // END GENERATED let (req, mut state) = p.state("pythonpython"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } From 39c0106e873645ff8f212bcfa21c79216e5f7d89 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 11 Dec 2023 20:17:51 +0200 Subject: [PATCH 79/95] update to cpython 3.12 op code --- benches/benches.rs | 22 +++++++------- src/constants.rs | 72 ++++++++++++++++++++++++---------------------- src/engine.rs | 4 ++- tests/tests.rs | 20 ++++++------- 4 files changed, 61 insertions(+), 57 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index fe470d023c..f70138f920 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -30,47 +30,47 @@ fn benchmarks(b: &mut Bencher) { // # test common prefix // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; + #[rustfmt::skip] let p1 = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] }; // END GENERATED // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1] }; + #[rustfmt::skip] let p2 = Pattern { code: &[14, 8, 1, 4, 6, 1, 0, 80, 0, 17, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 17, 1, 1] }; // END GENERATED // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1] }; + #[rustfmt::skip] let p3 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 1] }; // END GENERATED // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1] }; + #[rustfmt::skip] let p4 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 17, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 17, 1, 1] }; // END GENERATED // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference // START GENERATED by generate_tests.py - #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; + #[rustfmt::skip] let p5 = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] }; // END GENERATED // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization // START GENERATED by generate_tests.py - #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; + #[rustfmt::skip] let p6 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 31, 1, 4294967295, 17, 0, 13, 7, 16, 48, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] }; // END GENERATED // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets // START GENERATED by generate_tests.py - #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; + #[rustfmt::skip] let p7 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 29, 1, 4294967295, 17, 0, 13, 5, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] }; // END GENERATED // pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; + #[rustfmt::skip] let p8 = Pattern { code: &[14, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] }; // END GENERATED // pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; + #[rustfmt::skip] let p9 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] }; // END GENERATED // pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1] }; + #[rustfmt::skip] let p10 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 24, 5, 0, 4294967295, 2, 1, 1] }; // END GENERATED // pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping // START GENERATED by generate_tests.py - #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1] }; + #[rustfmt::skip] let p11 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 1] }; // END GENERATED let tests = [ diff --git a/src/constants.rs b/src/constants.rs index 0d5bb41939..dc61c33b2c 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -13,7 +13,7 @@ use bitflags::bitflags; -pub const SRE_MAGIC: usize = 20171005; +pub const SRE_MAGIC: usize = 20221023; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types, clippy::upper_case_acronyms)] @@ -26,39 +26,41 @@ pub enum SreOpcode { ASSERT_NOT = 5, AT = 6, BRANCH = 7, - CALL = 8, - CATEGORY = 9, - CHARSET = 10, - BIGCHARSET = 11, - GROUPREF = 12, - GROUPREF_EXISTS = 13, - IN = 14, - INFO = 15, - JUMP = 16, - LITERAL = 17, - MARK = 18, - MAX_UNTIL = 19, - MIN_UNTIL = 20, - NOT_LITERAL = 21, - NEGATE = 22, - RANGE = 23, - REPEAT = 24, - REPEAT_ONE = 25, - SUBPATTERN = 26, - MIN_REPEAT_ONE = 27, - GROUPREF_IGNORE = 28, - IN_IGNORE = 29, - LITERAL_IGNORE = 30, - NOT_LITERAL_IGNORE = 31, - GROUPREF_LOC_IGNORE = 32, - IN_LOC_IGNORE = 33, - LITERAL_LOC_IGNORE = 34, - NOT_LITERAL_LOC_IGNORE = 35, - GROUPREF_UNI_IGNORE = 36, - IN_UNI_IGNORE = 37, - LITERAL_UNI_IGNORE = 38, - NOT_LITERAL_UNI_IGNORE = 39, - RANGE_UNI_IGNORE = 40, + CATEGORY = 8, + CHARSET = 9, + BIGCHARSET = 10, + GROUPREF = 11, + GROUPREF_EXISTS = 12, + IN = 13, + INFO = 14, + JUMP = 15, + LITERAL = 16, + MARK = 17, + MAX_UNTIL = 18, + MIN_UNTIL = 19, + NOT_LITERAL = 20, + NEGATE = 21, + RANGE = 22, + REPEAT = 23, + REPEAT_ONE = 24, + SUBPATTERN = 25, + MIN_REPEAT_ONE = 26, + ATOMIC_GROUP = 27, + POSSESSIVE_REPEAT = 28, + POSSESSIVE_REPEAT_ONE = 29, + GROUPREF_IGNORE = 30, + IN_IGNORE = 31, + LITERAL_IGNORE = 32, + NOT_LITERAL_IGNORE = 33, + GROUPREF_LOC_IGNORE = 34, + IN_LOC_IGNORE = 35, + LITERAL_LOC_IGNORE = 36, + NOT_LITERAL_LOC_IGNORE = 37, + GROUPREF_UNI_IGNORE = 38, + IN_UNI_IGNORE = 39, + LITERAL_UNI_IGNORE = 40, + NOT_LITERAL_UNI_IGNORE = 41, + RANGE_UNI_IGNORE = 42, } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] @@ -101,7 +103,7 @@ pub enum SreCatCode { UNI_NOT_LINEBREAK = 17, } bitflags! { - #[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct SreFlag: u16 { const TEMPLATE = 1; const IGNORECASE = 2; diff --git a/src/engine.rs b/src/engine.rs index 7474f29013..e44a1f4a09 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -759,13 +759,15 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code_from(req, 2) } } - SreOpcode::CALL => todo!(), SreOpcode::CHARSET => todo!(), SreOpcode::BIGCHARSET => todo!(), SreOpcode::NEGATE => todo!(), SreOpcode::RANGE => todo!(), SreOpcode::RANGE_UNI_IGNORE => todo!(), SreOpcode::SUBPATTERN => todo!(), + SreOpcode::ATOMIC_GROUP => todo!(), + SreOpcode::POSSESSIVE_REPEAT => todo!(), + SreOpcode::POSSESSIVE_REPEAT_ONE => todo!(), } } }; diff --git a/tests/tests.rs b/tests/tests.rs index 4e282a6f97..0a1dc407fc 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -19,7 +19,7 @@ impl Pattern { fn test_2427() { // pattern lookbehind = re.compile(r'(? Date: Mon, 11 Dec 2023 22:28:36 +0200 Subject: [PATCH 80/95] fix _count general case --- src/engine.rs | 18 ++++++++---------- tests/tests.rs | 10 ++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index e44a1f4a09..3a24fe812b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1904,19 +1904,17 @@ fn _count( /* General case */ let mut count = 0; - let reset_position = ctx.code_position; - while count < max_count { - ctx.code_position = reset_position; - if !_match(req, state, ctx) { + let sub_ctx = MatchContext { + toplevel: true, + jump: Jump::OpCode, + repeat_ctx_id: usize::MAX, + count: -1, + ..ctx + }; + if !_match(req, state, sub_ctx) { break; } - // let code = ctx.peek_code(req, 0); - // let code = SreOpcode::try_from(code).unwrap(); - // dispatch(req, state, &mut ctx, code); - // if ctx.has_matched == Some(false) { - // break; - // } count += 1; } return count; diff --git a/tests/tests.rs b/tests/tests.rs index 0a1dc407fc..a452efb740 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -121,3 +121,13 @@ fn test_info_literal2() { let (req, mut state) = p.state("pythonpython"); assert!(state.search(req)); } + +#[test] +fn test_repeat_in_assertions() { + // pattern p = re.compile('^([ab]*?)(?=(b)?)c', re.IGNORECASE) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 6, 0, 17, 0, 26, 10, 0, 4294967295, 39, 5, 22, 97, 98, 0, 1, 17, 1, 4, 14, 0, 23, 9, 0, 1, 17, 2, 40, 98, 17, 3, 18, 1, 40, 99, 1] }; + // END GENERATED + let (req, mut state) = p.state("abc"); + assert!(state.search(req)); +} \ No newline at end of file From 99ed744c57c7aa8e3a0d33bf7893b26d5be86434 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 21:53:30 +0200 Subject: [PATCH 81/95] impl atomic group & possessive repeat --- src/engine.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++++-- tests/tests.rs | 32 +++++++++++++++ 2 files changed, 136 insertions(+), 4 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 3a24fe812b..daed81212f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -267,6 +267,11 @@ enum Jump { RepeatOne2, MinRepeatOne1, MinRepeatOne2, + AtomicGroup1, + PossessiveRepeat1, + PossessiveRepeat2, + PossessiveRepeat3, + PossessiveRepeat4, } fn _match(req: &Request, state: &mut State, ctx: MatchContext) -> bool { @@ -445,6 +450,73 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.jump = Jump::MinRepeatOne1; continue 'context; } + Jump::AtomicGroup1 => { + if popped_result { + ctx.skip_code_from(req, 1); + ctx.string_position = state.string_position; + ctx.string_offset = req.string.offset(0, state.string_position); + // dispatch opcode + } else { + state.string_position = ctx.string_position; + break 'result false; + } + } + Jump::PossessiveRepeat1 => { + let min_count = ctx.peek_code(req, 2) as isize; + if ctx.count < min_count { + break 'context ctx.next_offset(4, Jump::PossessiveRepeat2); + } + // zero match protection + ctx.string_position = usize::MAX; + ctx.jump = Jump::PossessiveRepeat3; + continue 'context; + } + Jump::PossessiveRepeat2 => { + if popped_result { + ctx.count += 1; + ctx.jump = Jump::PossessiveRepeat1; + continue 'context; + } else { + state.string_position = ctx.string_position; + break 'result false; + } + } + Jump::PossessiveRepeat3 => { + let max_count = ctx.peek_code(req, 3) as usize; + if ((ctx.count as usize) < max_count || max_count == MAXREPEAT) + && ctx.string_position != state.string_position + { + state.marks.push(); + ctx.string_position = state.string_position; + ctx.string_offset = req.string.offset(0, state.string_position); + break 'context ctx.next_offset(4, Jump::PossessiveRepeat4); + } + ctx.string_position = state.string_position; + ctx.string_offset = req.string.offset(0, state.string_position); + // popped_result = false; + // ctx.jump = Jump::PossessiveRepeat4; + // continue 'context; + ctx.skip_code_from(req, 1); + ctx.skip_code(1); + // if ctx.remaining_codes(req) > 1 && ctx.toplevel { + // ctx.skip_code(1); + // } + } + Jump::PossessiveRepeat4 => { + if popped_result { + state.marks.pop_discard(); + ctx.count += 1; + ctx.jump = Jump::PossessiveRepeat3; + continue 'context; + } + state.marks.pop(); + state.string_position = ctx.string_position; + ctx.skip_code_from(req, 1); + ctx.skip_code(1); + // if ctx.remaining_codes(req) > 1 && ctx.toplevel { + // ctx.skip_code(1); + // } + } } ctx.jump = Jump::OpCode; @@ -759,15 +831,43 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code_from(req, 2) } } + /* pattern tail */ + SreOpcode::ATOMIC_GROUP => { + state.string_position = ctx.string_position; + break 'context ctx.next_offset(2, Jump::AtomicGroup1); + } + /* <1=min> <2=max> pattern + tail */ + SreOpcode::POSSESSIVE_REPEAT => { + state.string_position = ctx.string_position; + ctx.count = 0; + ctx.jump = Jump::PossessiveRepeat1; + continue 'context; + } + /* <1=min> <2=max> item + tail */ + SreOpcode::POSSESSIVE_REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + state.string_position = ctx.string_position; + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, count_ctx, max_count); + if count < min_count { + break 'result false; + } + ctx.skip_char(req, count); + ctx.skip_code_from(req, 1); + } SreOpcode::CHARSET => todo!(), SreOpcode::BIGCHARSET => todo!(), SreOpcode::NEGATE => todo!(), SreOpcode::RANGE => todo!(), SreOpcode::RANGE_UNI_IGNORE => todo!(), SreOpcode::SUBPATTERN => todo!(), - SreOpcode::ATOMIC_GROUP => todo!(), - SreOpcode::POSSESSIVE_REPEAT => todo!(), - SreOpcode::POSSESSIVE_REPEAT_ONE => todo!(), } } }; @@ -1906,7 +2006,7 @@ fn _count( while count < max_count { let sub_ctx = MatchContext { - toplevel: true, + toplevel: false, jump: Jump::OpCode, repeat_ctx_id: usize::MAX, count: -1, diff --git a/tests/tests.rs b/tests/tests.rs index a452efb740..4c56c42fae 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -130,4 +130,36 @@ fn test_repeat_in_assertions() { // END GENERATED let (req, mut state) = p.state("abc"); assert!(state.search(req)); +} + +#[test] +fn test_possessive_quantifier() { + // pattern p = re.compile('e++a') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 29, 6, 1, 4294967295, 16, 101, 1, 16, 97, 1] }; + // END GENERATED + let (req, mut state) = p.state("eeea"); + assert!(state.pymatch(&req)); +} + +#[test] +fn test_possessive_atomic_group() { + // pattern p = re.compile('(?>x)++x') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 28, 8, 1, 4294967295, 27, 4, 16, 120, 1, 1, 16, 120, 1] }; + // END GENERATED + let (req, mut state) = p.state("xxx"); + assert!(!state.pymatch(&req)); +} + +#[test] +fn test_bug_20998() { + // pattern p = re.compile('[a-c]+', re.I) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 24, 10, 1, 4294967295, 39, 5, 22, 97, 99, 0, 1, 1] }; + // END GENERATED + let (mut req, mut state) = p.state("ABC"); + req.match_all = true; + assert!(state.pymatch(&req)); + assert_eq!(state.string_position, 3); } \ No newline at end of file From 9378497346147500328d6b96e59386caecbfbeab Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 21:59:38 +0200 Subject: [PATCH 82/95] clearup --- src/engine.rs | 592 ++------------------------------------------------ 1 file changed, 14 insertions(+), 578 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index daed81212f..3486c52d9f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -493,14 +493,8 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - } ctx.string_position = state.string_position; ctx.string_offset = req.string.offset(0, state.string_position); - // popped_result = false; - // ctx.jump = Jump::PossessiveRepeat4; - // continue 'context; ctx.skip_code_from(req, 1); ctx.skip_code(1); - // if ctx.remaining_codes(req) > 1 && ctx.toplevel { - // ctx.skip_code(1); - // } } Jump::PossessiveRepeat4 => { if popped_result { @@ -513,9 +507,6 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - state.string_position = ctx.string_position; ctx.skip_code_from(req, 1); ctx.skip_code(1); - // if ctx.remaining_codes(req) > 1 && ctx.toplevel { - // ctx.skip_code(1); - // } } } ctx.jump = Jump::OpCode; @@ -603,6 +594,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code(1); ctx.skip_char(req, 1); } + /* */ SreOpcode::ASSERT => { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { @@ -615,6 +607,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - state.string_position = next_ctx.string_position; break 'context next_ctx; } + /* */ SreOpcode::ASSERT_NOT => { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { @@ -636,6 +629,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - break 'result false; } } + // <0=skip> code ... SreOpcode::BRANCH => { state.marks.push(); ctx.count = 1; @@ -672,6 +666,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code(2); } SreOpcode::JUMP => ctx.skip_code_from(req, 1), + /* <1=min> <2=max> item tail */ SreOpcode::REPEAT => { let repeat_ctx = RepeatContext { count: -1, @@ -736,6 +731,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; break 'context next_ctx; } + /* <1=min> <2=max> item tail */ SreOpcode::REPEAT_ONE => { let min_count = ctx.peek_code(req, 2) as usize; let max_count = ctx.peek_code(req, 3) as usize; @@ -766,6 +762,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.jump = Jump::RepeatOne1; continue 'context; } + /* <1=min> <2=max> item tail */ SreOpcode::MIN_REPEAT_ONE => { let min_count = ctx.peek_code(req, 2) as usize; if ctx.remaining_chars(req) < min_count { @@ -862,12 +859,14 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_char(req, count); ctx.skip_code_from(req, 1); } - SreOpcode::CHARSET => todo!(), - SreOpcode::BIGCHARSET => todo!(), - SreOpcode::NEGATE => todo!(), - SreOpcode::RANGE => todo!(), - SreOpcode::RANGE_UNI_IGNORE => todo!(), - SreOpcode::SUBPATTERN => todo!(), + SreOpcode::CHARSET + | SreOpcode::BIGCHARSET + | SreOpcode::NEGATE + | SreOpcode::RANGE + | SreOpcode::RANGE_UNI_IGNORE + | SreOpcode::SUBPATTERN => { + unreachable!("unexpected opcode on main dispatch") + } } } }; @@ -879,123 +878,6 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - popped_result } -/* -fn dispatch( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - opcode: SreOpcode, -) { - match opcode { - SreOpcode::FAILURE => { - ctx.failure(); - } - SreOpcode::SUCCESS => { - if ctx.can_success(req) { - state.string_position = ctx.string_position; - ctx.success(); - } else { - ctx.failure(); - } - } - SreOpcode::ANY => { - if ctx.at_end(req) || ctx.at_linebreak(req) { - ctx.failure(); - } else { - ctx.skip_code(1); - ctx.skip_char(req, 1); - } - } - SreOpcode::ANY_ALL => { - if ctx.at_end(req) { - ctx.failure(); - } else { - ctx.skip_code(1); - ctx.skip_char(req, 1); - } - } - SreOpcode::ASSERT => op_assert(req, state, ctx), - SreOpcode::ASSERT_NOT => op_assert_not(req, state, ctx), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); - if at(req, ctx, atcode) { - ctx.skip_code(2); - } else { - ctx.failure(); - } - } - SreOpcode::BRANCH => op_branch(req, state, ctx), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); - if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { - ctx.failure(); - } else { - ctx.skip_code(2); - ctx.skip_char(req, 1); - } - } - SreOpcode::IN => general_op_in(req, ctx, charset), - SreOpcode::IN_IGNORE => general_op_in(req, ctx, |set, c| charset(set, lower_ascii(c))), - SreOpcode::IN_UNI_IGNORE => { - general_op_in(req, ctx, |set, c| charset(set, lower_unicode(c))) - } - SreOpcode::IN_LOC_IGNORE => general_op_in(req, ctx, charset_loc_ignore), - SreOpcode::INFO => { - let min = ctx.peek_code(req, 3) as usize; - if ctx.remaining_chars(req) < min { - ctx.failure(); - } else { - ctx.skip_code_from(req, 1); - } - } - SreOpcode::JUMP => ctx.skip_code_from(req, 1), - SreOpcode::LITERAL => general_op_literal(req, ctx, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(req, ctx, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => general_op_literal(req, ctx, |code, c| code == lower_ascii(c)), - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(req, ctx, |code, c| code != lower_ascii(c)) - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(req, ctx, |code, c| code == lower_unicode(c)) - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(req, ctx, |code, c| code != lower_unicode(c)) - } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(req, ctx, char_loc_ignore), - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) - } - SreOpcode::MARK => { - state - .marks - .set(ctx.peek_code(req, 1) as usize, ctx.string_position); - ctx.skip_code(2); - } - SreOpcode::MAX_UNTIL => op_max_until(state, ctx), - SreOpcode::MIN_UNTIL => op_min_until(state, ctx), - SreOpcode::REPEAT => op_repeat(req, state, ctx), - SreOpcode::REPEAT_ONE => op_repeat_one(req, state, ctx), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(req, state, ctx), - SreOpcode::GROUPREF => general_op_groupref(req, state, ctx, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(req, state, ctx, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); - if group_start.is_some() - && group_end.is_some() - && group_start.unpack() <= group_end.unpack() - { - ctx.skip_code(3); - } else { - ctx.skip_code_from(req, 2) - } - } - _ => unreachable!("unexpected opcode"), - } -} -*/ - fn search_info_literal( req: &mut Request, state: &mut State, @@ -1142,224 +1024,6 @@ fn search_info_charset( } } -/* -/* assert subpattern */ -/* */ -fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let back = ctx.peek_code(req, 2) as usize; - if ctx.string_position < back { - return ctx.failure(); - } - - let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { - if state.popped_has_matched { - ctx.skip_code_from(req, 1); - } else { - ctx.failure(); - } - }); - next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; -} - -/* assert not subpattern */ -/* */ -fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let back = ctx.peek_code(req, 2) as usize; - - if ctx.string_position < back { - return ctx.skip_code_from(req, 1); - } - - let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { - if state.popped_has_matched { - ctx.failure(); - } else { - ctx.skip_code_from(req, 1); - } - }); - next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; -} - -// alternation -// <0=skip> code ... -fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { - state.marks.push(); - - ctx.count = 1; - create_context(req, state, ctx); - - fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let branch_offset = ctx.count as usize; - let next_length = ctx.peek_code(req, branch_offset) as isize; - if next_length == 0 { - state.marks.pop_discard(); - return ctx.failure(); - } - - state.string_position = ctx.string_position; - - ctx.count += next_length; - ctx.next_offset(branch_offset + 1, state, callback); - } - - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - return ctx.success(); - } - state.marks.pop_keep(); - create_context(req, state, ctx); - } -} - -/* <1=min> <2=max> item tail */ -fn op_min_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let min_count = ctx.peek_code(req, 2) as usize; - - if ctx.remaining_chars(req) < min_count { - return ctx.failure(); - } - - state.string_position = ctx.string_position; - - ctx.count = if min_count == 0 { - 0 - } else { - let mut next_ctx = *ctx; - next_ctx.skip_code(4); - let count = _count(req, state, next_ctx, min_count); - if count < min_count { - return ctx.failure(); - } - ctx.skip_char(req, count); - count as isize - }; - - let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { - // tail is empty. we're finished - state.string_position = ctx.string_position; - return ctx.success(); - } - - state.marks.push(); - create_context(req, state, ctx); - - fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let max_count = ctx.peek_code(req, 3) as usize; - - if max_count == MAXREPEAT || ctx.count as usize <= max_count { - state.string_position = ctx.string_position; - ctx.next_peek_from(1, req, state, callback); - } else { - state.marks.pop_discard(); - ctx.failure(); - } - } - - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - return ctx.success(); - } - - state.string_position = ctx.string_position; - - let mut next_ctx = *ctx; - next_ctx.skip_code(4); - if _count(req, state, next_ctx, 1) == 0 { - state.marks.pop_discard(); - return ctx.failure(); - } - - ctx.skip_char(req, 1); - ctx.count += 1; - state.marks.pop_keep(); - create_context(req, state, ctx); - } -} - -/* match repeated sequence (maximizing regexp) */ -/* this operator only works if the repeated item is -exactly one character wide, and we're not already -collecting backtracking points. for other cases, -use the MAX_REPEAT operator */ -/* <1=min> <2=max> item tail */ -fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let min_count = ctx.peek_code(req, 2) as usize; - let max_count = ctx.peek_code(req, 3) as usize; - - if ctx.remaining_chars(req) < min_count { - return ctx.failure(); - } - - state.string_position = ctx.string_position; - - let mut next_ctx = *ctx; - next_ctx.skip_code(4); - let count = _count(req, state, next_ctx, max_count); - ctx.skip_char(req, count); - if count < min_count { - return ctx.failure(); - } - - let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { - // tail is empty. we're finished - state.string_position = ctx.string_position; - return ctx.success(); - } - - state.marks.push(); - ctx.count = count as isize; - create_context(req, state, ctx); - - fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let min_count = ctx.peek_code(req, 2) as isize; - let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); - if next_code == SreOpcode::LITERAL as u32 { - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. - let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); - while ctx.at_end(req) || ctx.peek_char(req) != c { - if ctx.count <= min_count { - state.marks.pop_discard(); - return ctx.failure(); - } - ctx.back_skip_char(req, 1); - ctx.count -= 1; - } - } - - state.string_position = ctx.string_position; - - // General case: backtracking - ctx.next_peek_from(1, req, state, callback); - } - - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - return ctx.success(); - } - - let min_count = ctx.peek_code(req, 2) as isize; - - if ctx.count <= min_count { - state.marks.pop_discard(); - return ctx.failure(); - } - - ctx.back_skip_char(req, 1); - ctx.count -= 1; - - state.marks.pop_keep(); - create_context(req, state, ctx); - } -} -*/ - #[derive(Debug, Clone, Copy)] struct RepeatContext { count: isize, @@ -1370,173 +1034,6 @@ struct RepeatContext { prev_id: usize, } -/* -/* create repeat context. all the hard work is done -by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -/* <1=min> <2=max> item tail */ -fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let repeat_ctx = RepeatContext { - count: -1, - min_count: ctx.peek_code(req, 2) as usize, - max_count: ctx.peek_code(req, 3) as usize, - code_position: ctx.code_position, - last_position: std::usize::MAX, - prev_id: ctx.repeat_ctx_id, - }; - - state.repeat_stack.push(repeat_ctx); - - state.string_position = ctx.string_position; - - let repeat_ctx_id = state.repeat_stack.len() - 1; - - let next_ctx = ctx.next_peek_from(1, req, state, |_, state, ctx| { - ctx.has_matched = Some(state.popped_has_matched); - state.repeat_stack.pop(); - }); - next_ctx.repeat_ctx_id = repeat_ctx_id; -} - -/* minimizing repeat */ -fn op_min_until(state: &mut State, ctx: &mut MatchContext) { - let repeat_ctx = state.repeat_stack.last_mut().unwrap(); - - state.string_position = ctx.string_position; - - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - if state.popped_has_matched { - ctx.success(); - } else { - state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; - ctx.failure(); - } - }); - return; - } - - state.marks.push(); - - ctx.count = ctx.repeat_ctx_id as isize; - - let repeat_ctx_prev_id = repeat_ctx.prev_id; - - // see if the tail matches - let next_ctx = ctx.next_offset(1, state, |_, state, ctx| { - if state.popped_has_matched { - return ctx.success(); - } - - ctx.repeat_ctx_id = ctx.count as usize; - - let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - - state.string_position = ctx.string_position; - - state.marks.pop(); - - // match more until tail matches - - if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT - || state.string_position == repeat_ctx.last_position - { - repeat_ctx.count -= 1; - return ctx.failure(); - } - - /* zero-width match protection */ - repeat_ctx.last_position = state.string_position; - - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - if state.popped_has_matched { - ctx.success(); - } else { - state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; - ctx.failure(); - } - }); - }); - next_ctx.repeat_ctx_id = repeat_ctx_prev_id; -} - -/* maximizing repeat */ -fn op_max_until(state: &mut State, ctx: &mut MatchContext) { - let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - - state.string_position = ctx.string_position; - - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - if state.popped_has_matched { - ctx.success(); - } else { - state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; - ctx.failure(); - } - }); - return; - } - - if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) - && state.string_position != repeat_ctx.last_position - { - /* we may have enough matches, but if we can - match another item, do so */ - state.marks.push(); - - ctx.count = repeat_ctx.last_position as isize; - repeat_ctx.last_position = state.string_position; - - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - let save_last_position = ctx.count as usize; - let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - repeat_ctx.last_position = save_last_position; - - if state.popped_has_matched { - state.marks.pop_discard(); - return ctx.success(); - } - - state.marks.pop(); - repeat_ctx.count -= 1; - - state.string_position = ctx.string_position; - - /* cannot match more repeated items here. make sure the - tail matches */ - let repeat_ctx_prev_id = repeat_ctx.prev_id; - let next_ctx = ctx.next_offset(1, state, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - }); - return; - } - - /* cannot match more repeated items here. make sure the - tail matches */ - let repeat_ctx_prev_id = repeat_ctx.prev_id; - let next_ctx = ctx.next_offset(1, state, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - - fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - ctx.success(); - } else { - state.string_position = ctx.string_position; - ctx.failure(); - } - } -} -*/ - pub trait StrDrive: Copy { fn offset(&self, offset: usize, skip: usize) -> usize; fn count(&self) -> usize; @@ -1758,67 +1255,6 @@ fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> b } } -/* -fn general_op_literal bool>( - req: &Request, - ctx: &mut MatchContext, - f: F, -) { - if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { - ctx.failure(); - } else { - ctx.skip_code(2); - ctx.skip_char(req, 1); - } -} - -fn general_op_in bool>( - req: &Request, - ctx: &mut MatchContext, - f: F, -) { - if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { - ctx.failure(); - } else { - ctx.skip_code_from(req, 1); - ctx.skip_char(req, 1); - } -} - -fn general_op_groupref u32>( - req: &Request, - state: &State, - ctx: &mut MatchContext, - mut f: F, -) { - let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); - let (group_start, group_end) = if group_start.is_some() - && group_end.is_some() - && group_start.unpack() <= group_end.unpack() - { - (group_start.unpack(), group_end.unpack()) - } else { - return ctx.failure(); - }; - - let mut gctx = MatchContext { - string_position: group_start, - string_offset: req.string.offset(0, group_start), - ..*ctx - }; - - for _ in group_start..group_end { - if ctx.at_end(req) || f(ctx.peek_char(req)) != f(gctx.peek_char(req)) { - return ctx.failure(); - } - ctx.skip_char(req, 1); - gctx.skip_char(req, 1); - } - - ctx.skip_code(2); -} -*/ - fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) } From 003c45dbffbfefe5a7a47899836d042bddfbb8a9 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 22:00:35 +0200 Subject: [PATCH 83/95] remove unneccesary INFO logic on main dispatch --- src/engine.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 3486c52d9f..5cf79ea147 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -652,20 +652,13 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - general_op_in!(|set, c| charset(set, lower_unicode(c))) } SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore), - SreOpcode::INFO => { - let min = ctx.peek_code(req, 3) as usize; - if ctx.remaining_chars(req) < min { - break 'result false; - } - ctx.skip_code_from(req, 1); - } SreOpcode::MARK => { state .marks .set(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } - SreOpcode::JUMP => ctx.skip_code_from(req, 1), + SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(req, 1), /* <1=min> <2=max> item tail */ SreOpcode::REPEAT => { let repeat_ctx = RepeatContext { From 41bdcfe2212d08c4068595babe3c497cd2a035fb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 22:13:54 +0200 Subject: [PATCH 84/95] bump version to 0.5.0 --- Cargo.toml | 2 +- src/engine.rs | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index de1d68cf6d..b0ec8eab2d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.4.3" +version = "0.5.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index 5cf79ea147..5ee9af7d1b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,8 +1,6 @@ // good luck to those that follow; here be dragons -use crate::constants::SreInfo; - -use super::constants::{SreAtCode, SreCatCode, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreOpcode, SreInfo}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; @@ -284,7 +282,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - }; popped_result = 'result: loop { - let yield_ = 'context: loop { + let yielded = 'context: loop { match ctx.jump { Jump::OpCode => {} Jump::Assert1 => { @@ -864,7 +862,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - } }; context_stack.push(ctx); - context_stack.push(yield_); + context_stack.push(yielded); continue 'coro; }; } From 169368b7f06fd502a5a77fd56cf5b7ec46e3a975 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 22:43:17 +0200 Subject: [PATCH 85/95] fix some clippy --- src/engine.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5ee9af7d1b..b6bf6a6fb6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -142,7 +142,7 @@ impl State { repeat_ctx_id: usize::MAX, count: -1, }; - _match(&req, self, ctx) + _match(req, self, ctx) } pub fn search(&mut self, mut req: Request) -> bool { @@ -1400,16 +1400,16 @@ fn _count( } } SreOpcode::LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code == c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code == c); } SreOpcode::NOT_LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code != c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code != c); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c) as u32); + general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c)); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c) as u32); + general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c)); } SreOpcode::LITERAL_LOC_IGNORE => { general_count_literal(req, &mut ctx, end, char_loc_ignore); @@ -1419,12 +1419,12 @@ fn _count( } SreOpcode::LITERAL_UNI_IGNORE => { general_count_literal(req, &mut ctx, end, |code, c| { - code == lower_unicode(c) as u32 + code == lower_unicode(c) }); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { general_count_literal(req, &mut ctx, end, |code, c| { - code != lower_unicode(c) as u32 + code != lower_unicode(c) }); } _ => { From 454aa4b6544cc53b3443328015ff923103f906c4 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 3 Jan 2024 15:34:01 +0200 Subject: [PATCH 86/95] fix count not advance --- src/engine.rs | 28 ++++++++++++++-------------- tests/tests.rs | 13 ++++++++++++- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index b6bf6a6fb6..86b8d20d8a 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreOpcode, SreInfo}; +use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; @@ -1418,31 +1418,31 @@ fn _count( general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| { - code == lower_unicode(c) - }); + general_count_literal(req, &mut ctx, end, |code, c| code == lower_unicode(c)); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| { - code != lower_unicode(c) - }); + general_count_literal(req, &mut ctx, end, |code, c| code != lower_unicode(c)); } _ => { /* General case */ let mut count = 0; + let mut sub_ctx = MatchContext { + toplevel: false, + jump: Jump::OpCode, + repeat_ctx_id: usize::MAX, + count: -1, + ..ctx + }; + while count < max_count { - let sub_ctx = MatchContext { - toplevel: false, - jump: Jump::OpCode, - repeat_ctx_id: usize::MAX, - count: -1, - ..ctx - }; if !_match(req, state, sub_ctx) { break; } count += 1; + sub_ctx.skip_char(req, 1); + // ctx.string_position = state.string_position; + // ctx.string_offset = req.string.offset(0, state.string_position); } return count; } diff --git a/tests/tests.rs b/tests/tests.rs index 4c56c42fae..efeb2d2838 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -162,4 +162,15 @@ fn test_bug_20998() { req.match_all = true; assert!(state.pymatch(&req)); assert_eq!(state.string_position, 3); -} \ No newline at end of file +} + +#[test] +fn test_bigcharset() { + // pattern p = re.compile('[a-z]*', re.I) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 24, 97, 0, 4294967295, 39, 92, 10, 3, 33685760, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 0, 0, 0, 134217726, 0, 0, 0, 0, 0, 131072, 0, 2147483648, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("x "); + assert!(state.pymatch(&req)); + assert_eq!(state.string_position, 1); +} From 17e1152de63cd9b70e4ad3b061979baa7ee54a35 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 3 Jan 2024 17:17:27 +0200 Subject: [PATCH 87/95] fix assert not mark --- src/engine.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/engine.rs b/src/engine.rs index 86b8d20d8a..964775fd8f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -296,6 +296,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - if popped_result { break 'result false; } + state.marks.pop(); ctx.skip_code_from(req, 1); } Jump::Branch1 => { @@ -612,6 +613,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code_from(req, 1); continue; } + state.marks.push(); let mut next_ctx = ctx.next_offset(3, Jump::AssertNot1); next_ctx.toplevel = false; From 2fe129252fd0d798dc0cdac78737bd641825d48b Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 4 Jan 2024 07:51:35 +0200 Subject: [PATCH 88/95] improve ctx in _match lazy create stack vec --- src/constants.rs | 4 ++-- src/engine.rs | 31 ++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index dc61c33b2c..9fe792ce17 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -14,7 +14,7 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20221023; -#[derive(num_enum::TryFromPrimitive, Debug)] +#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)] #[repr(u32)] #[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreOpcode { @@ -62,7 +62,7 @@ pub enum SreOpcode { NOT_LITERAL_UNI_IGNORE = 41, RANGE_UNI_IGNORE = 42, } -#[derive(num_enum::TryFromPrimitive, Debug)] +#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)] #[repr(u32)] #[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreAtCode { diff --git a/src/engine.rs b/src/engine.rs index 964775fd8f..c23ffb7a2f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -207,6 +207,15 @@ impl State { return true; } + if ctx.try_peek_code_as::(&req, 1).unwrap() == SreOpcode::AT + && (ctx.try_peek_code_as::(&req, 2).unwrap() == SreAtCode::BEGINNING + || ctx.try_peek_code_as::(&req, 2).unwrap() + == SreAtCode::BEGINNING_STRING) + { + self.reset(req.end); + return false; + } + req.must_advance = false; ctx.toplevel = false; while req.start < end { @@ -272,15 +281,11 @@ enum Jump { PossessiveRepeat4, } -fn _match(req: &Request, state: &mut State, ctx: MatchContext) -> bool { - let mut context_stack = vec![ctx]; +fn _match(req: &Request, state: &mut State, mut ctx: MatchContext) -> bool { + let mut context_stack = vec![]; let mut popped_result = false; 'coro: loop { - let Some(mut ctx) = context_stack.pop() else { - break; - }; - popped_result = 'result: loop { let yielded = 'context: loop { match ctx.jump { @@ -864,9 +869,14 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - } }; context_stack.push(ctx); - context_stack.push(yielded); + ctx = yielded; continue 'coro; }; + if let Some(popped_ctx) = context_stack.pop() { + ctx = popped_ctx; + } else { + break; + } } popped_result } @@ -1148,6 +1158,13 @@ impl MatchContext { req.pattern_codes[self.code_position + peek] } + fn try_peek_code_as(&self, req: &Request, peek: usize) -> Result + where + T: TryFrom, + { + self.peek_code(req, peek).try_into() + } + fn skip_code(&mut self, skip: usize) { self.code_position += skip; } From f9b2d10c710d45ebd0cc9788294cd5806cc9e8ac Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Jan 2024 07:23:50 +0200 Subject: [PATCH 89/95] improve search at_beginning --- src/engine.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index c23ffb7a2f..dbb58025fe 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -201,15 +201,17 @@ impl State { return search_info_charset(&mut req, self, ctx); } // fallback to general search + // skip OP INFO + ctx.skip_code_from(&req, 1); } if _match(&req, self, ctx) { return true; } - if ctx.try_peek_code_as::(&req, 1).unwrap() == SreOpcode::AT - && (ctx.try_peek_code_as::(&req, 2).unwrap() == SreAtCode::BEGINNING - || ctx.try_peek_code_as::(&req, 2).unwrap() + if ctx.try_peek_code_as::(&req, 0).unwrap() == SreOpcode::AT + && (ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING + || ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING_STRING) { self.reset(req.end); From 118a00c012810900fe89277cebe6a5f09ff286d1 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 7 Jan 2024 08:42:18 +0200 Subject: [PATCH 90/95] refactor _count general case --- src/engine.rs | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index dbb58025fe..ca44f4994a 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1446,26 +1446,20 @@ fn _count( } _ => { /* General case */ - let mut count = 0; - - let mut sub_ctx = MatchContext { - toplevel: false, - jump: Jump::OpCode, - repeat_ctx_id: usize::MAX, - count: -1, - ..ctx + ctx.toplevel = false; + ctx.jump = Jump::OpCode; + ctx.repeat_ctx_id = usize::MAX; + ctx.count = -1; + + let mut sub_state = State { + marks: Marks::default(), + repeat_stack: vec![], + ..*state }; - while count < max_count { - if !_match(req, state, sub_ctx) { - break; - } - count += 1; - sub_ctx.skip_char(req, 1); - // ctx.string_position = state.string_position; - // ctx.string_offset = req.string.offset(0, state.string_position); + while ctx.string_position < end && _match(req, &mut sub_state, ctx) { + ctx.skip_char(req, 1); } - return count; } } From c93ea30b3b5849edc3ac888a1bfec69f08b537c1 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 13 Jan 2024 16:03:38 +0200 Subject: [PATCH 91/95] improve use StringCursor replace index based position --- Cargo.toml | 2 +- benches/benches.rs | 12 +- src/engine.rs | 511 ++++++++++++++------------------------------- src/lib.rs | 5 + src/string.rs | 381 +++++++++++++++++++++++++++++++++ tests/tests.rs | 31 +-- 6 files changed, 562 insertions(+), 380 deletions(-) create mode 100644 src/string.rs diff --git a/Cargo.toml b/Cargo.toml index b0ec8eab2d..e54f124ac0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.5.0" +version = "0.6.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/benches/benches.rs b/benches/benches.rs index f70138f920..e89adab0dd 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -3,24 +3,24 @@ extern crate test; use test::Bencher; -use sre_engine::engine; +use sre_engine::{Request, State, StrDrive}; struct Pattern { code: &'static [u32], } impl Pattern { - fn state<'a, S: engine::StrDrive>(&self, string: S) -> (engine::Request<'a, S>, engine::State) { + fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) { self.state_range(string, 0..usize::MAX) } - fn state_range<'a, S: engine::StrDrive>( + fn state_range<'a, S: StrDrive>( &self, string: S, range: std::ops::Range, - ) -> (engine::Request<'a, S>, engine::State) { - let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::default(); + ) -> (Request<'a, S>, State) { + let req = Request::new(string, range.start, range.end, self.code, false); + let state = State::default(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index ca44f4994a..97489633d8 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,14 +1,14 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; -use super::MAXREPEAT; +use crate::string::{ + is_digit, is_linebreak, is_loc_word, is_space, is_uni_digit, is_uni_linebreak, is_uni_space, + is_uni_word, is_word, lower_ascii, lower_locate, lower_unicode, upper_locate, upper_unicode, +}; + +use super::{SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor, MAXREPEAT}; use optional::Optioned; use std::convert::TryFrom; -const fn is_py_ascii_whitespace(b: u8) -> bool { - matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') -} - #[derive(Debug, Clone, Copy)] pub struct Request<'a, S> { pub string: S, @@ -117,25 +117,29 @@ impl Marks { pub struct State { pub start: usize, pub marks: Marks, - pub string_position: usize, + pub cursor: StringCursor, repeat_stack: Vec, } impl State { - pub fn reset(&mut self, start: usize) { + pub fn reset(&mut self, req: &Request, start: usize) { self.marks.clear(); self.repeat_stack.clear(); self.start = start; - self.string_position = start; + if self.cursor.ptr.is_null() || self.cursor.position > self.start { + self.cursor = req.string.create_cursor(self.start); + } else if self.cursor.position < self.start { + let skip = self.start - self.cursor.position; + S::skip(&mut self.cursor, skip); + } } pub fn pymatch(&mut self, req: &Request) -> bool { self.start = req.start; - self.string_position = req.start; + self.cursor = req.string.create_cursor(self.start); let ctx = MatchContext { - string_position: req.start, - string_offset: req.string.offset(0, req.start), + cursor: self.cursor, code_position: 0, toplevel: true, jump: Jump::OpCode, @@ -147,7 +151,7 @@ impl State { pub fn search(&mut self, mut req: Request) -> bool { self.start = req.start; - self.string_position = req.start; + self.cursor = req.string.create_cursor(self.start); if req.start > req.end { return false; @@ -155,11 +159,8 @@ impl State { let mut end = req.end; - let mut start_offset = req.string.offset(0, req.start); - let mut ctx = MatchContext { - string_position: req.start, - string_offset: start_offset, + cursor: self.cursor, code_position: 0, toplevel: true, jump: Jump::OpCode, @@ -183,9 +184,9 @@ impl State { end -= min - 1; // adjust ctx position - if end < ctx.string_position { - ctx.string_position = end; - ctx.string_offset = req.string.offset(0, ctx.string_position); + if end < ctx.cursor.position { + let skip = end - self.cursor.position; + S::skip(&mut self.cursor, skip); } } @@ -214,7 +215,7 @@ impl State { || ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING_STRING) { - self.reset(req.end); + self.reset(&req, req.end); return false; } @@ -222,10 +223,8 @@ impl State { ctx.toplevel = false; while req.start < end { req.start += 1; - start_offset = req.string.offset(start_offset, 1); - self.reset(req.start); - ctx.string_position = req.start; - ctx.string_offset = start_offset; + self.reset(&req, req.start); + ctx.cursor = self.cursor; if _match(&req, self, ctx) { return true; @@ -248,13 +247,13 @@ impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { return None; } - self.state.reset(self.req.start); + self.state.reset(&self.req, self.req.start); if !self.state.search(self.req) { return None; } - self.req.must_advance = self.state.string_position == self.state.start; - self.req.start = self.state.string_position; + self.req.must_advance = self.state.cursor.position == self.state.start; + self.req.start = self.state.cursor.position; Some(()) } @@ -313,7 +312,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.marks.pop_discard(); break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let next_ctx = ctx.next_offset(branch_offset + 1, Jump::Branch2); ctx.count += next_length; break 'context next_ctx; @@ -333,7 +332,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex Jump::UntilBacktrace => { if !popped_result { state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; } break 'result popped_result; } @@ -349,7 +348,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.marks.pop(); repeat_ctx.count -= 1; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; /* cannot match more repeated items here. make sure the tail matches */ @@ -359,7 +358,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } Jump::MaxUntil3 => { if !popped_result { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; } break 'result popped_result; } @@ -369,20 +368,20 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } ctx.repeat_ctx_id = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; state.marks.pop(); // match more until tail matches if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT - || state.string_position == repeat_ctx.last_position + || state.cursor.position == repeat_ctx.last_position { repeat_ctx.count -= 1; break 'result false; } /* zero-width match protection */ - repeat_ctx.last_position = state.string_position; + repeat_ctx.last_position = state.cursor.position; break 'context ctx .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); @@ -394,17 +393,17 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); - while ctx.at_end(req) || ctx.peek_char(req) != c { + while ctx.at_end(req) || ctx.peek_char::() != c { if ctx.count <= min_count { state.marks.pop_discard(); break 'result false; } - ctx.back_skip_char(req, 1); + ctx.back_advance_char::(); ctx.count -= 1; } } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; // General case: backtracking break 'context ctx.next_peek_from(1, req, Jump::RepeatOne2); } @@ -419,7 +418,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - ctx.back_skip_char(req, 1); + ctx.back_advance_char::(); ctx.count -= 1; state.marks.pop_keep(); @@ -429,7 +428,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex Jump::MinRepeatOne1 => { let max_count = ctx.peek_code(req, 3) as usize; if max_count == MAXREPEAT || ctx.count as usize <= max_count { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'context ctx.next_peek_from(1, req, Jump::MinRepeatOne2); } else { state.marks.pop_discard(); @@ -441,7 +440,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result true; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut count_ctx = ctx; count_ctx.skip_code(4); @@ -450,7 +449,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - ctx.skip_char(req, 1); + ctx.advance_char::(); ctx.count += 1; state.marks.pop_keep(); ctx.jump = Jump::MinRepeatOne1; @@ -459,11 +458,10 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex Jump::AtomicGroup1 => { if popped_result { ctx.skip_code_from(req, 1); - ctx.string_position = state.string_position; - ctx.string_offset = req.string.offset(0, state.string_position); + ctx.cursor = state.cursor; // dispatch opcode } else { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result false; } } @@ -473,7 +471,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'context ctx.next_offset(4, Jump::PossessiveRepeat2); } // zero match protection - ctx.string_position = usize::MAX; + ctx.cursor.position = usize::MAX; ctx.jump = Jump::PossessiveRepeat3; continue 'context; } @@ -483,22 +481,20 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex ctx.jump = Jump::PossessiveRepeat1; continue 'context; } else { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result false; } } Jump::PossessiveRepeat3 => { let max_count = ctx.peek_code(req, 3) as usize; if ((ctx.count as usize) < max_count || max_count == MAXREPEAT) - && ctx.string_position != state.string_position + && ctx.cursor.position != state.cursor.position { state.marks.push(); - ctx.string_position = state.string_position; - ctx.string_offset = req.string.offset(0, state.string_position); + ctx.cursor = state.cursor; break 'context ctx.next_offset(4, Jump::PossessiveRepeat4); } - ctx.string_position = state.string_position; - ctx.string_offset = req.string.offset(0, state.string_position); + ctx.cursor = state.cursor; ctx.skip_code_from(req, 1); ctx.skip_code(1); } @@ -510,7 +506,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex continue 'context; } state.marks.pop(); - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; ctx.skip_code_from(req, 1); ctx.skip_code(1); } @@ -520,21 +516,22 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex loop { macro_rules! general_op_literal { ($f:expr) => {{ - if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char(req)) { + if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char::()) { break 'result false; } ctx.skip_code(2); - ctx.skip_char(req, 1); + ctx.advance_char::(); }}; } macro_rules! general_op_in { ($f:expr) => {{ - if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { + if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char::()) + { break 'result false; } ctx.skip_code_from(req, 1); - ctx.skip_char(req, 1); + ctx.advance_char::(); }}; } @@ -552,19 +549,18 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex }; let mut gctx = MatchContext { - string_position: group_start, - string_offset: req.string.offset(0, group_start), + cursor: req.string.create_cursor(group_start), ..ctx }; for _ in group_start..group_end { if ctx.at_end(req) - || $f(ctx.peek_char(req)) != $f(gctx.peek_char(req)) + || $f(ctx.peek_char::()) != $f(gctx.peek_char::()) { break 'result false; } - ctx.skip_char(req, 1); - gctx.skip_char(req, 1); + ctx.advance_char::(); + gctx.advance_char::(); } ctx.skip_code(2); @@ -581,7 +577,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex SreOpcode::FAILURE => break 'result false, SreOpcode::SUCCESS => { if ctx.can_success(req) { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result true; } break 'result false; @@ -591,32 +587,32 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } ctx.skip_code(1); - ctx.skip_char(req, 1); + ctx.advance_char::(); } SreOpcode::ANY_ALL => { if ctx.at_end(req) { break 'result false; } ctx.skip_code(1); - ctx.skip_char(req, 1); + ctx.advance_char::(); } /* */ SreOpcode::ASSERT => { let back = ctx.peek_code(req, 2) as usize; - if ctx.string_position < back { + if ctx.cursor.position < back { break 'result false; } let mut next_ctx = ctx.next_offset(3, Jump::Assert1); next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; + next_ctx.back_skip_char::(back); + state.cursor = next_ctx.cursor; break 'context next_ctx; } /* */ SreOpcode::ASSERT_NOT => { let back = ctx.peek_code(req, 2) as usize; - if ctx.string_position < back { + if ctx.cursor.position < back { ctx.skip_code_from(req, 1); continue; } @@ -624,8 +620,8 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let mut next_ctx = ctx.next_offset(3, Jump::AssertNot1); next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; + next_ctx.back_skip_char::(back); + state.cursor = next_ctx.cursor; break 'context next_ctx; } SreOpcode::AT => { @@ -645,11 +641,11 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } SreOpcode::CATEGORY => { let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); - if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { + if ctx.at_end(req) || !category(catcode, ctx.peek_char::()) { break 'result false; } ctx.skip_code(2); - ctx.skip_char(req, 1); + ctx.advance_char::(); } SreOpcode::IN => general_op_in!(charset), SreOpcode::IN_IGNORE => { @@ -662,7 +658,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex SreOpcode::MARK => { state .marks - .set(ctx.peek_code(req, 1) as usize, ctx.string_position); + .set(ctx.peek_code(req, 1) as usize, ctx.cursor.position); ctx.skip_code(2); } SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(req, 1), @@ -678,14 +674,14 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex }; state.repeat_stack.push(repeat_ctx); let repeat_ctx_id = state.repeat_stack.len() - 1; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut next_ctx = ctx.next_peek_from(1, req, Jump::Repeat1); next_ctx.repeat_ctx_id = repeat_ctx_id; break 'context next_ctx; } SreOpcode::MAX_UNTIL => { let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; repeat_ctx.count += 1; if (repeat_ctx.count as usize) < repeat_ctx.min_count { @@ -696,13 +692,13 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) - && state.string_position != repeat_ctx.last_position + && state.cursor.position != repeat_ctx.last_position { /* we may have enough matches, but if we can match another item, do so */ state.marks.push(); ctx.count = repeat_ctx.last_position as isize; - repeat_ctx.last_position = state.string_position; + repeat_ctx.last_position = state.cursor.position; break 'context ctx .next_at(repeat_ctx.code_position + 4, Jump::MaxUntil2); @@ -716,7 +712,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } SreOpcode::MIN_UNTIL => { let repeat_ctx = state.repeat_stack.last_mut().unwrap(); - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; repeat_ctx.count += 1; if (repeat_ctx.count as usize) < repeat_ctx.min_count { @@ -740,12 +736,12 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut next_ctx = ctx; next_ctx.skip_code(4); let count = _count(req, state, next_ctx, max_count); - ctx.skip_char(req, count); + ctx.skip_char::(count); if count < min_count { break 'result false; } @@ -753,7 +749,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result true; } @@ -769,7 +765,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; ctx.count = if min_count == 0 { 0 } else { @@ -779,14 +775,14 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex if count < min_count { break 'result false; } - ctx.skip_char(req, count); + ctx.skip_char::(count); count as isize }; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result true; } @@ -830,13 +826,13 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } /* pattern tail */ SreOpcode::ATOMIC_GROUP => { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'context ctx.next_offset(2, Jump::AtomicGroup1); } /* <1=min> <2=max> pattern tail */ SreOpcode::POSSESSIVE_REPEAT => { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; ctx.count = 0; ctx.jump = Jump::PossessiveRepeat1; continue 'context; @@ -849,14 +845,14 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex if ctx.remaining_chars(req) < min_count { break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut count_ctx = ctx; count_ctx.skip_code(4); let count = _count(req, state, count_ctx, max_count); if count < min_count { break 'result false; } - ctx.skip_char(req, count); + ctx.skip_char::(count); ctx.skip_code_from(req, 1); } SreOpcode::CHARSET @@ -907,16 +903,17 @@ fn search_info_literal( while !ctx.at_end(req) { // find the next matched literal - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); + while ctx.peek_char::() != c { + ctx.advance_char::(); if ctx.at_end(req) { return false; } } - req.start = ctx.string_position; - state.start = ctx.string_position; - state.string_position = ctx.string_position + skip; + req.start = ctx.cursor.position; + state.start = req.start; + state.cursor = ctx.cursor; + S::skip(&mut state.cursor, skip); // literal only if LITERAL { @@ -924,44 +921,46 @@ fn search_info_literal( } let mut next_ctx = ctx; - next_ctx.skip_char(req, skip); + next_ctx.skip_char::(skip); if _match(req, state, next_ctx) { return true; } - ctx.skip_char(req, 1); + ctx.advance_char::(); state.marks.clear(); } } else { while !ctx.at_end(req) { let c = prefix[0]; - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); + while ctx.peek_char::() != c { + ctx.advance_char::(); if ctx.at_end(req) { return false; } } - ctx.skip_char(req, 1); + ctx.advance_char::(); if ctx.at_end(req) { return false; } let mut i = 1; loop { - if ctx.peek_char(req) == prefix[i] { + if ctx.peek_char::() == prefix[i] { i += 1; if i != len { - ctx.skip_char(req, 1); + ctx.advance_char::(); if ctx.at_end(req) { return false; } continue; } - req.start = ctx.string_position - (len - 1); - state.start = req.start; - state.string_position = state.start + skip; + req.start = ctx.cursor.position - (len - 1); + state.reset(req, req.start); + S::skip(&mut state.cursor, skip); + // state.start = req.start; + // state.cursor = req.string.create_cursor(req.start + skip); // literal only if LITERAL { @@ -970,17 +969,16 @@ fn search_info_literal( let mut next_ctx = ctx; if skip != 0 { - next_ctx.skip_char(req, 1); + next_ctx.advance_char::(); } else { - next_ctx.string_position = state.string_position; - next_ctx.string_offset = req.string.offset(0, state.string_position); + next_ctx.cursor = state.cursor; } if _match(req, state, next_ctx) { return true; } - ctx.skip_char(req, 1); + ctx.advance_char::(); if ctx.at_end(req) { return false; } @@ -1009,22 +1007,22 @@ fn search_info_charset( req.must_advance = false; loop { - while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { - ctx.skip_char(req, 1); + while !ctx.at_end(req) && !charset(set, ctx.peek_char::()) { + ctx.advance_char::(); } if ctx.at_end(req) { return false; } - req.start = ctx.string_position; - state.start = ctx.string_position; - state.string_position = ctx.string_position; + req.start = ctx.cursor.position; + state.start = ctx.cursor.position; + state.cursor = ctx.cursor; if _match(req, state, ctx) { return true; } - ctx.skip_char(req, 1); + ctx.advance_char::(); state.marks.clear(); } } @@ -1039,85 +1037,9 @@ struct RepeatContext { prev_id: usize, } -pub trait StrDrive: Copy { - fn offset(&self, offset: usize, skip: usize) -> usize; - fn count(&self) -> usize; - fn peek(&self, offset: usize) -> u32; - fn back_peek(&self, offset: usize) -> u32; - fn back_offset(&self, offset: usize, skip: usize) -> usize; -} - -impl StrDrive for &str { - fn offset(&self, offset: usize, skip: usize) -> usize { - self.get(offset..) - .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or(self.len()) - } - - fn count(&self) -> usize { - self.chars().count() - } - - fn peek(&self, offset: usize) -> u32 { - unsafe { self.get_unchecked(offset..) } - .chars() - .next() - .unwrap() as u32 - } - - fn back_peek(&self, offset: usize) -> u32 { - let bytes = self.as_bytes(); - let back_offset = utf8_back_peek_offset(bytes, offset); - match offset - back_offset { - 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_be_bytes([0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1]]), - 4 => u32::from_be_bytes([ - bytes[offset - 4], - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - _ => unreachable!(), - } - } - - fn back_offset(&self, offset: usize, skip: usize) -> usize { - let bytes = self.as_bytes(); - let mut back_offset = offset; - for _ in 0..skip { - back_offset = utf8_back_peek_offset(bytes, back_offset); - } - back_offset - } -} - -impl<'a> StrDrive for &'a [u8] { - fn offset(&self, offset: usize, skip: usize) -> usize { - offset + skip - } - - fn count(&self) -> usize { - self.len() - } - - fn peek(&self, offset: usize) -> u32 { - self[offset] as u32 - } - - fn back_peek(&self, offset: usize) -> u32 { - self[offset - 1] as u32 - } - - fn back_offset(&self, offset: usize, skip: usize) -> usize { - offset - skip - } -} - #[derive(Clone, Copy)] struct MatchContext { - string_position: usize, - string_offset: usize, + cursor: StringCursor, code_position: usize, toplevel: bool, jump: Jump, @@ -1135,25 +1057,31 @@ impl MatchContext { } fn remaining_chars(&self, req: &Request) -> usize { - req.end - self.string_position + req.end - self.cursor.position + } + + fn peek_char(&self) -> u32 { + S::peek(&self.cursor) + } + + fn skip_char(&mut self, skip: usize) { + S::skip(&mut self.cursor, skip); } - fn peek_char(&self, req: &Request) -> u32 { - req.string.peek(self.string_offset) + fn advance_char(&mut self) -> u32 { + S::advance(&mut self.cursor) } - fn skip_char(&mut self, req: &Request, skip: usize) { - self.string_position += skip; - self.string_offset = req.string.offset(self.string_offset, skip); + fn back_peek_char(&self) -> u32 { + S::back_peek(&self.cursor) } - fn back_peek_char(&self, req: &Request) -> u32 { - req.string.back_peek(self.string_offset) + fn back_skip_char(&mut self, skip: usize) { + S::back_skip(&mut self.cursor, skip); } - fn back_skip_char(&mut self, req: &Request, skip: usize) { - self.string_position -= skip; - self.string_offset = req.string.back_offset(self.string_offset, skip); + fn back_advance_char(&mut self) -> u32 { + S::back_advance(&mut self.cursor) } fn peek_code(&self, req: &Request, peek: usize) -> u32 { @@ -1177,15 +1105,15 @@ impl MatchContext { fn at_beginning(&self) -> bool { // self.ctx().string_position == self.state().start - self.string_position == 0 + self.cursor.position == 0 } fn at_end(&self, req: &Request) -> bool { - self.string_position == req.end + self.cursor.position == req.end } fn at_linebreak(&self, req: &Request) -> bool { - !self.at_end(req) && is_linebreak(self.peek_char(req)) + !self.at_end(req) && is_linebreak(self.peek_char::()) } fn at_boundary bool>( @@ -1196,8 +1124,8 @@ impl MatchContext { if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); - let this = !self.at_end(req) && word_checker(self.peek_char(req)); + let that = !self.at_beginning() && word_checker(self.back_peek_char::()); + let this = !self.at_end(req) && word_checker(self.peek_char::()); this != that } @@ -1209,8 +1137,8 @@ impl MatchContext { if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); - let this = !self.at_end(req) && word_checker(self.peek_char(req)); + let that = !self.at_beginning() && word_checker(self.back_peek_char::()); + let this = !self.at_end(req) && word_checker(self.peek_char::()); this == that } @@ -1221,7 +1149,7 @@ impl MatchContext { if req.match_all && !self.at_end(req) { return false; } - if req.must_advance && self.string_position == req.start { + if req.must_advance && self.cursor.position == req.start { return false; } true @@ -1252,7 +1180,7 @@ impl MatchContext { fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), - SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char::()), SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), SreAtCode::END => { @@ -1403,21 +1331,22 @@ fn _count( max_count: usize, ) -> usize { let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); - let end = ctx.string_position + max_count; + let end = ctx.cursor.position + max_count; let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); match opcode { SreOpcode::ANY => { - while ctx.string_position < end && !ctx.at_linebreak(req) { - ctx.skip_char(req, 1); + while ctx.cursor.position < end && !ctx.at_linebreak(req) { + ctx.advance_char::(); } } SreOpcode::ANY_ALL => { - ctx.skip_char(req, max_count); + ctx.skip_char::(max_count); } SreOpcode::IN => { - while ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) { - ctx.skip_char(req, 1); + while ctx.cursor.position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char::()) + { + ctx.advance_char::(); } } SreOpcode::LITERAL => { @@ -1457,14 +1386,14 @@ fn _count( ..*state }; - while ctx.string_position < end && _match(req, &mut sub_state, ctx) { - ctx.skip_char(req, 1); + while ctx.cursor.position < end && _match(req, &mut sub_state, ctx) { + ctx.advance_char::(); } } } // TODO: return offset - ctx.string_position - state.string_position + ctx.cursor.position - state.cursor.position } fn general_count_literal bool>( @@ -1474,145 +1403,7 @@ fn general_count_literal bool>( mut f: F, ) { let ch = ctx.peek_code(req, 1); - while ctx.string_position < end && f(ch, ctx.peek_char(req)) { - ctx.skip_char(req, 1); - } -} - -fn is_word(ch: u32) -> bool { - ch == '_' as u32 - || u8::try_from(ch) - .map(|x| x.is_ascii_alphanumeric()) - .unwrap_or(false) -} -fn is_space(ch: u32) -> bool { - u8::try_from(ch) - .map(is_py_ascii_whitespace) - .unwrap_or(false) -} -fn is_digit(ch: u32) -> bool { - u8::try_from(ch) - .map(|x| x.is_ascii_digit()) - .unwrap_or(false) -} -fn is_loc_alnum(ch: u32) -> bool { - // FIXME: Ignore the locales - u8::try_from(ch) - .map(|x| x.is_ascii_alphanumeric()) - .unwrap_or(false) -} -fn is_loc_word(ch: u32) -> bool { - ch == '_' as u32 || is_loc_alnum(ch) -} -fn is_linebreak(ch: u32) -> bool { - ch == '\n' as u32 -} -pub fn lower_ascii(ch: u32) -> u32 { - u8::try_from(ch) - .map(|x| x.to_ascii_lowercase() as u32) - .unwrap_or(ch) -} -fn lower_locate(ch: u32) -> u32 { - // FIXME: Ignore the locales - lower_ascii(ch) -} -fn upper_locate(ch: u32) -> u32 { - // FIXME: Ignore the locales - u8::try_from(ch) - .map(|x| x.to_ascii_uppercase() as u32) - .unwrap_or(ch) -} -fn is_uni_digit(ch: u32) -> bool { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.is_ascii_digit()) - .unwrap_or(false) -} -fn is_uni_space(ch: u32) -> bool { - // TODO: check with cpython - is_space(ch) - || matches!( - ch, - 0x0009 - | 0x000A - | 0x000B - | 0x000C - | 0x000D - | 0x001C - | 0x001D - | 0x001E - | 0x001F - | 0x0020 - | 0x0085 - | 0x00A0 - | 0x1680 - | 0x2000 - | 0x2001 - | 0x2002 - | 0x2003 - | 0x2004 - | 0x2005 - | 0x2006 - | 0x2007 - | 0x2008 - | 0x2009 - | 0x200A - | 0x2028 - | 0x2029 - | 0x202F - | 0x205F - | 0x3000 - ) -} -fn is_uni_linebreak(ch: u32) -> bool { - matches!( - ch, - 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 - ) -} -fn is_uni_alnum(ch: u32) -> bool { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.is_alphanumeric()) - .unwrap_or(false) -} -fn is_uni_word(ch: u32) -> bool { - ch == '_' as u32 || is_uni_alnum(ch) -} -pub fn lower_unicode(ch: u32) -> u32 { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.to_lowercase().next().unwrap() as u32) - .unwrap_or(ch) -} -pub fn upper_unicode(ch: u32) -> u32 { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.to_uppercase().next().unwrap() as u32) - .unwrap_or(ch) -} - -fn is_utf8_first_byte(b: u8) -> bool { - // In UTF-8, there are three kinds of byte... - // 0xxxxxxx : ASCII - // 10xxxxxx : 2nd, 3rd or 4th byte of code - // 11xxxxxx : 1st byte of multibyte code - (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) -} - -fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { - let mut offset = offset - 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - panic!("not utf-8 code point"); - } - } - } + while ctx.cursor.position < end && f(ch, ctx.peek_char::()) { + ctx.advance_char::(); } - offset } diff --git a/src/lib.rs b/src/lib.rs index c23e807501..fd9f367dc6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,10 @@ pub mod constants; pub mod engine; +pub mod string; + +pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC}; +pub use engine::{Request, SearchIter, State}; +pub use string::{StrDrive, StringCursor}; pub const CODESIZE: usize = 4; diff --git a/src/string.rs b/src/string.rs new file mode 100644 index 0000000000..464901af83 --- /dev/null +++ b/src/string.rs @@ -0,0 +1,381 @@ +#[derive(Debug, Clone, Copy)] +pub struct StringCursor { + pub(crate) ptr: *const u8, + pub position: usize, +} + +impl Default for StringCursor { + fn default() -> Self { + Self { + ptr: std::ptr::null(), + position: 0, + } + } +} + +pub trait StrDrive: Copy { + fn count(&self) -> usize; + fn create_cursor(&self, n: usize) -> StringCursor; + fn advance(cursor: &mut StringCursor) -> u32; + fn peek(cursor: &StringCursor) -> u32; + fn skip(cursor: &mut StringCursor, n: usize); + fn back_advance(cursor: &mut StringCursor) -> u32; + fn back_peek(cursor: &StringCursor) -> u32; + fn back_skip(cursor: &mut StringCursor, n: usize); +} + +impl<'a> StrDrive for &'a [u8] { + #[inline] + fn count(&self) -> usize { + self.len() + } + + #[inline] + fn create_cursor(&self, n: usize) -> StringCursor { + StringCursor { + ptr: self[n..].as_ptr(), + position: n, + } + } + + #[inline] + fn advance(cursor: &mut StringCursor) -> u32 { + cursor.position += 1; + unsafe { cursor.ptr = cursor.ptr.add(1) }; + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn peek(cursor: &StringCursor) -> u32 { + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn skip(cursor: &mut StringCursor, n: usize) { + cursor.position += n; + unsafe { cursor.ptr = cursor.ptr.add(n) }; + } + + #[inline] + fn back_advance(cursor: &mut StringCursor) -> u32 { + cursor.position -= 1; + unsafe { cursor.ptr = cursor.ptr.sub(1) }; + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn back_peek(cursor: &StringCursor) -> u32 { + unsafe { *cursor.ptr.offset(-1) as u32 } + } + + #[inline] + fn back_skip(cursor: &mut StringCursor, n: usize) { + cursor.position -= n; + unsafe { cursor.ptr = cursor.ptr.sub(n) }; + } +} + +impl StrDrive for &str { + #[inline] + fn count(&self) -> usize { + self.chars().count() + } + + #[inline] + fn create_cursor(&self, n: usize) -> StringCursor { + let mut ptr = self.as_ptr(); + for _ in 0..n { + unsafe { next_code_point(&mut ptr) }; + } + StringCursor { ptr, position: n } + } + + #[inline] + fn advance(cursor: &mut StringCursor) -> u32 { + cursor.position += 1; + unsafe { next_code_point(&mut cursor.ptr) } + } + + #[inline] + fn peek(cursor: &StringCursor) -> u32 { + let mut ptr = cursor.ptr; + unsafe { next_code_point(&mut ptr) } + } + + #[inline] + fn skip(cursor: &mut StringCursor, n: usize) { + cursor.position += n; + for _ in 0..n { + unsafe { next_code_point(&mut cursor.ptr) }; + } + } + + #[inline] + fn back_advance(cursor: &mut StringCursor) -> u32 { + cursor.position -= 1; + unsafe { next_code_point_reverse(&mut cursor.ptr) } + } + + #[inline] + fn back_peek(cursor: &StringCursor) -> u32 { + let mut ptr = cursor.ptr; + unsafe { next_code_point_reverse(&mut ptr) } + } + + #[inline] + fn back_skip(cursor: &mut StringCursor, n: usize) { + cursor.position -= n; + for _ in 0..n { + unsafe { next_code_point_reverse(&mut cursor.ptr) }; + } + } +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[inline] +unsafe fn next_code_point(ptr: &mut *const u8) -> u32 { + // Decode UTF-8 + let x = **ptr; + *ptr = ptr.offset(1); + + if x < 128 { + return x as u32; + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let y = **ptr; + *ptr = ptr.offset(1); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = **ptr; + *ptr = ptr.offset(1); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let w = **ptr; + *ptr = ptr.offset(1); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + ch +} + +/// Reads the last code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[inline] +unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 { + // Decode UTF-8 + *ptr = ptr.offset(-1); + let w = match **ptr { + next_byte if next_byte < 128 => return next_byte as u32, + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let z = **ptr; + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let y = **ptr; + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let x = **ptr; + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + ch +} + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. +#[inline] +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7F >> width)) as u32 +} + +/// Returns the value of `ch` updated with continuation byte `byte`. +#[inline] +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the +/// bits `10`). +#[inline] +const fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} + +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111; + +const fn is_py_ascii_whitespace(b: u8) -> bool { + matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') +} + +#[inline] +pub(crate) fn is_word(ch: u32) -> bool { + ch == '_' as u32 + || u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_space(ch: u32) -> bool { + u8::try_from(ch) + .map(is_py_ascii_whitespace) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_digit(ch: u32) -> bool { + u8::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_loc_alnum(ch: u32) -> bool { + // FIXME: Ignore the locales + u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_loc_word(ch: u32) -> bool { + ch == '_' as u32 || is_loc_alnum(ch) +} +#[inline] +pub(crate) fn is_linebreak(ch: u32) -> bool { + ch == '\n' as u32 +} +#[inline] +pub fn lower_ascii(ch: u32) -> u32 { + u8::try_from(ch) + .map(|x| x.to_ascii_lowercase() as u32) + .unwrap_or(ch) +} +#[inline] +pub(crate) fn lower_locate(ch: u32) -> u32 { + // FIXME: Ignore the locales + lower_ascii(ch) +} +#[inline] +pub(crate) fn upper_locate(ch: u32) -> u32 { + // FIXME: Ignore the locales + u8::try_from(ch) + .map(|x| x.to_ascii_uppercase() as u32) + .unwrap_or(ch) +} +#[inline] +pub(crate) fn is_uni_digit(ch: u32) -> bool { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_uni_space(ch: u32) -> bool { + // TODO: check with cpython + is_space(ch) + || matches!( + ch, + 0x0009 + | 0x000A + | 0x000B + | 0x000C + | 0x000D + | 0x001C + | 0x001D + | 0x001E + | 0x001F + | 0x0020 + | 0x0085 + | 0x00A0 + | 0x1680 + | 0x2000 + | 0x2001 + | 0x2002 + | 0x2003 + | 0x2004 + | 0x2005 + | 0x2006 + | 0x2007 + | 0x2008 + | 0x2009 + | 0x200A + | 0x2028 + | 0x2029 + | 0x202F + | 0x205F + | 0x3000 + ) +} +#[inline] +pub(crate) fn is_uni_linebreak(ch: u32) -> bool { + matches!( + ch, + 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 + ) +} +#[inline] +pub(crate) fn is_uni_alnum(ch: u32) -> bool { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.is_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_uni_word(ch: u32) -> bool { + ch == '_' as u32 || is_uni_alnum(ch) +} +#[inline] +pub fn lower_unicode(ch: u32) -> u32 { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.to_lowercase().next().unwrap() as u32) + .unwrap_or(ch) +} +#[inline] +pub fn upper_unicode(ch: u32) -> u32 { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.to_uppercase().next().unwrap() as u32) + .unwrap_or(ch) +} diff --git a/tests/tests.rs b/tests/tests.rs index efeb2d2838..f589c62e6e 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1,16 +1,13 @@ -use sre_engine::engine; +use sre_engine::{Request, State, StrDrive}; struct Pattern { code: &'static [u32], } impl Pattern { - fn state<'a, S: engine::StrDrive>( - &self, - string: S, - ) -> (engine::Request<'a, S>, engine::State) { - let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::default(); + fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) { + let req = Request::new(string, 0, usize::MAX, self.code, false); + let state = State::default(); (req, state) } } @@ -54,7 +51,7 @@ fn test_zerowidth() { let (mut req, mut state) = p.state("a:"); req.must_advance = true; assert!(state.search(req)); - assert_eq!(state.string_position, 1); + assert_eq!(state.cursor.position, 1); } #[test] @@ -66,7 +63,10 @@ fn test_repeat_context_panic() { // END GENERATED let (req, mut state) = p.state("axxzaz"); assert!(state.pymatch(&req)); - assert_eq!(*state.marks.raw(), vec![Optioned::some(1), Optioned::some(3)]); + assert_eq!( + *state.marks.raw(), + vec![Optioned::some(1), Optioned::some(3)] + ); } #[test] @@ -77,7 +77,7 @@ fn test_double_max_until() { // END GENERATED let (req, mut state) = p.state("1111"); assert!(state.pymatch(&req)); - assert_eq!(state.string_position, 4); + assert_eq!(state.cursor.position, 4); } #[test] @@ -89,7 +89,7 @@ fn test_info_single() { let (req, mut state) = p.state("baaaa"); assert!(state.search(req)); assert_eq!(state.start, 1); - assert_eq!(state.string_position, 5); + assert_eq!(state.cursor.position, 5); } #[test] @@ -161,7 +161,7 @@ fn test_bug_20998() { let (mut req, mut state) = p.state("ABC"); req.match_all = true; assert!(state.pymatch(&req)); - assert_eq!(state.string_position, 3); + assert_eq!(state.cursor.position, 3); } #[test] @@ -172,5 +172,10 @@ fn test_bigcharset() { // END GENERATED let (req, mut state) = p.state("x "); assert!(state.pymatch(&req)); - assert_eq!(state.string_position, 1); + assert_eq!(state.cursor.position, 1); +} + +#[test] +fn test_search_nonascii() { + // pattern p = re.compile('\xe0+') } From 10e51ba68909e9f09860c4a5d727c00a74cb0d7c Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 14 Jan 2024 10:03:01 +0200 Subject: [PATCH 92/95] improve: use adjust_cursor reduce double calc --- src/engine.rs | 17 +++++++---------- src/string.rs | 25 +++++++++++++++++++++---- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 97489633d8..a854f8d898 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -7,7 +7,7 @@ use crate::string::{ use super::{SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor, MAXREPEAT}; use optional::Optioned; -use std::convert::TryFrom; +use std::{convert::TryFrom, ptr::null}; #[derive(Debug, Clone, Copy)] pub struct Request<'a, S> { @@ -126,17 +126,12 @@ impl State { self.marks.clear(); self.repeat_stack.clear(); self.start = start; - if self.cursor.ptr.is_null() || self.cursor.position > self.start { - self.cursor = req.string.create_cursor(self.start); - } else if self.cursor.position < self.start { - let skip = self.start - self.cursor.position; - S::skip(&mut self.cursor, skip); - } + req.string.adjust_cursor(&mut self.cursor, start); } pub fn pymatch(&mut self, req: &Request) -> bool { self.start = req.start; - self.cursor = req.string.create_cursor(self.start); + req.string.adjust_cursor(&mut self.cursor, self.start); let ctx = MatchContext { cursor: self.cursor, @@ -151,7 +146,7 @@ impl State { pub fn search(&mut self, mut req: Request) -> bool { self.start = req.start; - self.cursor = req.string.create_cursor(self.start); + req.string.adjust_cursor(&mut self.cursor, self.start); if req.start > req.end { return false; @@ -215,7 +210,9 @@ impl State { || ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING_STRING) { - self.reset(&req, req.end); + self.cursor.position = req.end; + self.cursor.ptr = null(); + // self.reset(&req, req.end); return false; } diff --git a/src/string.rs b/src/string.rs index 464901af83..1340c37423 100644 --- a/src/string.rs +++ b/src/string.rs @@ -16,6 +16,7 @@ impl Default for StringCursor { pub trait StrDrive: Copy { fn count(&self) -> usize; fn create_cursor(&self, n: usize) -> StringCursor; + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize); fn advance(cursor: &mut StringCursor) -> u32; fn peek(cursor: &StringCursor) -> u32; fn skip(cursor: &mut StringCursor, n: usize); @@ -38,6 +39,12 @@ impl<'a> StrDrive for &'a [u8] { } } + #[inline] + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { + cursor.position = n; + cursor.ptr = self[n..].as_ptr(); + } + #[inline] fn advance(cursor: &mut StringCursor) -> u32 { cursor.position += 1; @@ -83,11 +90,21 @@ impl StrDrive for &str { #[inline] fn create_cursor(&self, n: usize) -> StringCursor { - let mut ptr = self.as_ptr(); - for _ in 0..n { - unsafe { next_code_point(&mut ptr) }; + let mut cursor = StringCursor { + ptr: self.as_ptr(), + position: 0, + }; + Self::skip(&mut cursor, n); + cursor + } + + #[inline] + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { + if cursor.ptr.is_null() || cursor.position > n { + *cursor = Self::create_cursor(&self, n); + } else if cursor.position < n { + Self::skip(cursor, n - cursor.position); } - StringCursor { ptr, position: n } } #[inline] From 21fc2059b70ebd5bf4a7c524c40e7d4347e065dc Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 14 Jan 2024 16:02:05 +0200 Subject: [PATCH 93/95] improve: fix double count on _count --- src/engine.rs | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index a854f8d898..34f00234e5 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -441,7 +441,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let mut count_ctx = ctx; count_ctx.skip_code(4); - if _count(req, state, count_ctx, 1) == 0 { + if _count(req, state, &mut count_ctx, 1) == 0 { state.marks.pop_discard(); break 'result false; } @@ -735,13 +735,13 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.cursor = ctx.cursor; - let mut next_ctx = ctx; - next_ctx.skip_code(4); - let count = _count(req, state, next_ctx, max_count); - ctx.skip_char::(count); + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, &mut count_ctx, max_count); if count < min_count { break 'result false; } + ctx.cursor = count_ctx.cursor; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { @@ -768,11 +768,11 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } else { let mut count_ctx = ctx; count_ctx.skip_code(4); - let count = _count(req, state, count_ctx, min_count); + let count = _count(req, state, &mut count_ctx, min_count); if count < min_count { break 'result false; } - ctx.skip_char::(count); + ctx.cursor = count_ctx.cursor; count as isize }; @@ -845,11 +845,11 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.cursor = ctx.cursor; let mut count_ctx = ctx; count_ctx.skip_code(4); - let count = _count(req, state, count_ctx, max_count); + let count = _count(req, state, &mut count_ctx, max_count); if count < min_count { break 'result false; } - ctx.skip_char::(count); + ctx.cursor = count_ctx.cursor; ctx.skip_code_from(req, 1); } SreOpcode::CHARSET @@ -1324,7 +1324,7 @@ fn charset(set: &[u32], ch: u32) -> bool { fn _count( req: &Request, state: &mut State, - mut ctx: MatchContext, + ctx: &mut MatchContext, max_count: usize, ) -> usize { let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); @@ -1347,28 +1347,28 @@ fn _count( } } SreOpcode::LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code == c); + general_count_literal(req, ctx, end, |code, c| code == c); } SreOpcode::NOT_LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code != c); + general_count_literal(req, ctx, end, |code, c| code != c); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c)); + general_count_literal(req, ctx, end, |code, c| code == lower_ascii(c)); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c)); + general_count_literal(req, ctx, end, |code, c| code != lower_ascii(c)); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(req, &mut ctx, end, char_loc_ignore); + general_count_literal(req, ctx, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code == lower_unicode(c)); + general_count_literal(req, ctx, end, |code, c| code == lower_unicode(c)); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code != lower_unicode(c)); + general_count_literal(req, ctx, end, |code, c| code != lower_unicode(c)); } _ => { /* General case */ @@ -1383,7 +1383,7 @@ fn _count( ..*state }; - while ctx.cursor.position < end && _match(req, &mut sub_state, ctx) { + while ctx.cursor.position < end && _match(req, &mut sub_state, *ctx) { ctx.advance_char::(); } } From 12601d0b44c719183d559fa73d76ab6561255ed9 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Mon, 18 Mar 2024 16:57:28 +0900 Subject: [PATCH 94/95] integrate sre_engine crate to workspace --- Cargo.lock | 38 ++++++++++++++++++++++++++++++++---- Cargo.toml | 5 +++-- vm/sre_engine/Cargo.toml | 8 ++++---- vm/sre_engine/tests/tests.rs | 2 +- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c8d0342708..52afbb053f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1428,7 +1428,16 @@ version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d829733185c1ca374f17e52b762f24f535ec625d2cc1f070e34c8a9068f341b" dependencies = [ - "num_enum_derive", + "num_enum_derive 0.5.9", +] + +[[package]] +name = "num_enum" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +dependencies = [ + "num_enum_derive 0.7.2", ] [[package]] @@ -1443,6 +1452,18 @@ dependencies = [ "syn 1.0.107", ] +[[package]] +name = "num_enum_derive" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.32", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -2165,6 +2186,15 @@ dependencies = [ "rustpython-derive", ] +[[package]] +name = "rustpython-sre_engine" +version = "0.6.0" +dependencies = [ + "bitflags 2.4.0", + "num_enum 0.7.2", + "optional", +] + [[package]] name = "rustpython-stdlib" version = "0.3.0" @@ -2200,7 +2230,7 @@ dependencies = [ "num-complex", "num-integer", "num-traits", - "num_enum", + "num_enum 0.7.2", "once_cell", "openssl", "openssl-probe", @@ -2270,7 +2300,7 @@ dependencies = [ "num-integer", "num-traits", "num_cpus", - "num_enum", + "num_enum 0.7.2", "once_cell", "optional", "parking_lot", @@ -2527,7 +2557,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a490c5c46c35dba9a6f5e7ee8e4d67e775eb2d2da0f115750b8d10e1c1ac2d28" dependencies = [ "bitflags 1.3.2", - "num_enum", + "num_enum 0.5.9", "optional", ] diff --git a/Cargo.toml b/Cargo.toml index bfc882fdc5..0f4fb49dc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ include = ["LICENSE", "Cargo.toml", "src/**/*.rs"] resolver = "2" members = [ "compiler", "compiler/core", "compiler/codegen", - ".", "common", "derive", "jit", "vm", "pylib", "stdlib", "wasm/lib", "derive-impl", + ".", "common", "derive", "jit", "vm", "vm/sre_engine", "pylib", "stdlib", "wasm/lib", "derive-impl", ] [workspace.dependencies] @@ -27,6 +27,7 @@ rustpython-jit = { path = "jit", version = "0.3.0" } rustpython-vm = { path = "vm", default-features = false, version = "0.3.0" } rustpython-pylib = { path = "pylib", version = "0.3.0" } rustpython-stdlib = { path = "stdlib", default-features = false, version = "0.3.0" } +rustpython-sre_engine = { path = "vm/sre_engine", version = "0.6.0" } rustpython-doc = { git = "https://github.com/RustPython/__doc__", tag = "0.3.0", version = "0.3.0" } rustpython-literal = { git = "https://github.com/RustPython/Parser.git", rev = "29c4728dbedc7e69cc2560b9b34058bbba9b1303" } @@ -64,7 +65,7 @@ malachite-base = "0.4.4" num-complex = "0.4.0" num-integer = "0.1.44" num-traits = "0.2" -num_enum = "0.5.7" +num_enum = "0.7" once_cell = "1.18" parking_lot = "0.12.1" paste = "1.0.7" diff --git a/vm/sre_engine/Cargo.toml b/vm/sre_engine/Cargo.toml index e54f124ac0..2caa8b73e5 100644 --- a/vm/sre_engine/Cargo.toml +++ b/vm/sre_engine/Cargo.toml @@ -1,15 +1,15 @@ [package] -name = "sre-engine" +name = "rustpython-sre_engine" version = "0.6.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" -repository = "https://github.com/RustPython/sre-engine" +repository = "https://github.com/RustPython/RustPython" license = "MIT" edition = "2021" keywords = ["regex"] include = ["LICENSE", "src/**/*.rs"] [dependencies] -num_enum = "0.7" -bitflags = "2" +num_enum = { workspace = true } +bitflags = { workspace = true } optional = "0.5" diff --git a/vm/sre_engine/tests/tests.rs b/vm/sre_engine/tests/tests.rs index f589c62e6e..53494c5e3d 100644 --- a/vm/sre_engine/tests/tests.rs +++ b/vm/sre_engine/tests/tests.rs @@ -1,4 +1,4 @@ -use sre_engine::{Request, State, StrDrive}; +use rustpython_sre_engine::{Request, State, StrDrive}; struct Pattern { code: &'static [u32], From 1dd9a2fbe45f7f34f5c505d5e9774603bd34d6bb Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Fri, 22 Mar 2024 11:28:49 +0900 Subject: [PATCH 95/95] suppress clippy warnings --- vm/sre_engine/src/engine.rs | 5 +++++ vm/sre_engine/src/string.rs | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vm/sre_engine/src/engine.rs b/vm/sre_engine/src/engine.rs index 34f00234e5..fb7d766e29 100644 --- a/vm/sre_engine/src/engine.rs +++ b/vm/sre_engine/src/engine.rs @@ -283,6 +283,8 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let mut context_stack = vec![]; let mut popped_result = false; + // NOTE: 'result loop is not an actual loop but break label + #[allow(clippy::never_loop)] 'coro: loop { popped_result = 'result: loop { let yielded = 'context: loop { @@ -513,6 +515,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex loop { macro_rules! general_op_literal { ($f:expr) => {{ + #[allow(clippy::redundant_closure_call)] if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char::()) { break 'result false; } @@ -523,6 +526,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex macro_rules! general_op_in { ($f:expr) => {{ + #[allow(clippy::redundant_closure_call)] if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char::()) { break 'result false; @@ -551,6 +555,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex }; for _ in group_start..group_end { + #[allow(clippy::redundant_closure_call)] if ctx.at_end(req) || $f(ctx.peek_char::()) != $f(gctx.peek_char::()) { diff --git a/vm/sre_engine/src/string.rs b/vm/sre_engine/src/string.rs index 1340c37423..e3f14ef019 100644 --- a/vm/sre_engine/src/string.rs +++ b/vm/sre_engine/src/string.rs @@ -101,7 +101,7 @@ impl StrDrive for &str { #[inline] fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { if cursor.ptr.is_null() || cursor.position > n { - *cursor = Self::create_cursor(&self, n); + *cursor = Self::create_cursor(self, n); } else if cursor.position < n { Self::skip(cursor, n - cursor.position); }